Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/subscriptions/combineSubscriptions.py

Issue 11126024: Speed up filter subscription processing by using a thread pool (Closed)
Patch Set: Created July 18, 2013, 11:22 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 3
4 # This file is part of the Adblock Plus web scripts, 4 # This file is part of the Adblock Plus web scripts,
5 # Copyright (C) 2006-2013 Eyeo GmbH 5 # Copyright (C) 2006-2013 Eyeo GmbH
6 # 6 #
7 # Adblock Plus is free software: you can redistribute it and/or modify 7 # Adblock Plus is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License version 3 as 8 # it under the terms of the GNU General Public License version 3 as
9 # published by the Free Software Foundation. 9 # published by the Free Software Foundation.
10 # 10 #
11 # Adblock Plus is distributed in the hope that it will be useful, 11 # Adblock Plus is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details. 14 # GNU General Public License for more details.
15 # 15 #
16 # You should have received a copy of the GNU General Public License 16 # You should have received a copy of the GNU General Public License
17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
18 18
19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64 19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, multiprocessing, functools
20 from getopt import getopt, GetoptError 20 from getopt import getopt, GetoptError
21 21
22 acceptedExtensions = { 22 acceptedExtensions = {
23 '.txt': True, 23 '.txt': True,
24 } 24 }
25 ignore = { 25 ignore = {
26 'Apache.txt': True, 26 'Apache.txt': True,
27 'CC-BY-SA.txt': True, 27 'CC-BY-SA.txt': True,
28 'GPL.txt': True, 28 'GPL.txt': True,
29 'MPL.txt': True, 29 'MPL.txt': True,
30 } 30 }
31 verbatim = { 31 verbatim = {
32 'COPYING': True, 32 'COPYING': True,
33 } 33 }
34 34
35 def getFiles(sourceDirs):
36 for sourceName, sourceDir in sourceDirs.iteritems():
37 for file in os.listdir(sourceDir):
38 if file in ignore or file[0] == '.' or not os.path.isfile(os.path.join(sourceDir, file)):
39 continue
40
41 if file in verbatim:
42 yield (sourceName, sourceDir, file, 'verbatim')
43 elif not os.path.splitext(file)[1] in acceptedExtensions:
44 continue
45 else:
46 yield (sourceName, sourceDir, file, 'subscription')
47
35 def combineSubscriptions(sourceDirs, targetDir, timeout=30): 48 def combineSubscriptions(sourceDirs, targetDir, timeout=30):
36 global acceptedExtensions, ignore, verbatim 49 global acceptedExtensions, ignore, verbatim
37 50
38 if isinstance(sourceDirs, basestring): 51 if isinstance(sourceDirs, basestring):
39 sourceDirs = {'': sourceDirs} 52 sourceDirs = {'': sourceDirs}
40 53
41 if not os.path.exists(targetDir): 54 if not os.path.exists(targetDir):
42 os.makedirs(targetDir, 0755) 55 os.makedirs(targetDir, 0755)
43 56
44 known = {} 57 known = {}
45 for sourceName, sourceDir in sourceDirs.iteritems(): 58 pool = multiprocessing.Pool()
46 for file in os.listdir(sourceDir): 59 processor = functools.partial(processFile, sourceDirs, targetDir, timeout)
47 if file in ignore or file[0] == '.' or not os.path.isfile(os.path.join(sourceDir, file)): 60 for file, type in pool.imap(processor, getFiles(sourceDirs)):
48 continue 61 known[file] = True
49 if file in verbatim: 62 known[file + '.gz'] = True
50 processVerbatimFile(sourceDir, targetDir, file) 63 if type == "subscription":
51 elif not os.path.splitext(file)[1] in acceptedExtensions: 64 known[os.path.splitext(file)[0] + '.tpl'] = True
52 continue 65 known[os.path.splitext(file)[0] + '.tpl.gz'] = True
53 else:
54 try:
55 processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout)
56 except:
57 print >>sys.stderr, 'Error processing subscription file "%s"' % file
58 traceback.print_exc()
59 print >>sys.stderr
60 known[os.path.splitext(file)[0] + '.tpl'] = True
61 known[os.path.splitext(file)[0] + '.tpl.gz'] = True
62 known[file] = True
63 known[file + '.gz'] = True
64 66
65 for file in os.listdir(targetDir): 67 for file in os.listdir(targetDir):
66 if file[0] == '.': 68 if file[0] == '.':
67 continue 69 continue
68 if not file in known: 70 if not file in known:
69 os.remove(os.path.join(targetDir, file)) 71 os.remove(os.path.join(targetDir, file))
70 72
71 def saveFile(filePath, data): 73 def saveFile(filePath, data):
72 handle = codecs.open(filePath, 'wb', encoding='utf-8') 74 handle = codecs.open(filePath, 'wb', encoding='utf-8')
73 handle.write(data) 75 handle.write(data)
74 handle.close() 76 handle.close()
75 try: 77 try:
76 subprocess.check_output(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=15', filePath + '.gz', filePath]) 78 subprocess.check_output(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=15', filePath + '.gz', filePath])
77 except: 79 except:
78 print >>sys.stderr, 'Failed to compress file %s. Please ensure that p7zip is installed on the system.' % filePath 80 print >>sys.stderr, 'Failed to compress file %s. Please ensure that p7zip is installed on the system.' % filePath
79 81
82 def processFile(sourceDirs, targetDir, timeout, (sourceName, sourceDir, file, type)):
83 if type == "verbatim":
84 processVerbatimFile(sourceDir, targetDir, file)
85 else:
86 try:
87 processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout)
88 except:
89 print >>sys.stderr, 'Error processing subscription file "%s"' % file
90 traceback.print_exc()
91 print >>sys.stderr
92 return (file, type)
93
80 def processVerbatimFile(sourceDir, targetDir, file): 94 def processVerbatimFile(sourceDir, targetDir, file):
81 handle = codecs.open(os.path.join(sourceDir, file), 'rb', encoding='utf-8') 95 handle = codecs.open(os.path.join(sourceDir, file), 'rb', encoding='utf-8')
82 saveFile(os.path.join(targetDir, file), handle.read()) 96 saveFile(os.path.join(targetDir, file), handle.read())
83 handle.close() 97 handle.close()
84 98
85 def processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout): 99 def processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout):
86 sourceDir = sourceDirs[sourceName] 100 sourceDir = sourceDirs[sourceName]
87 filePath = os.path.join(sourceDir, file) 101 filePath = os.path.join(sourceDir, file)
88 handle = codecs.open(filePath, 'rb', encoding='utf-8') 102 handle = codecs.open(filePath, 'rb', encoding='utf-8')
89 lines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines()) 103 lines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines())
(...skipping 223 matching lines...) Expand 10 before | Expand all | Expand 10 after
313 usage() 327 usage()
314 sys.exit() 328 sys.exit()
315 elif option in ('-t', '--timeout'): 329 elif option in ('-t', '--timeout'):
316 timeout = int(value) 330 timeout = int(value)
317 331
318 if os.path.exists(os.path.join(sourceDir, '.hg')): 332 if os.path.exists(os.path.join(sourceDir, '.hg')):
319 # Our source is a Mercurial repository, try updating 333 # Our source is a Mercurial repository, try updating
320 subprocess.check_call(['hg', '-q', '-R', sourceDir, 'pull', '--update']) 334 subprocess.check_call(['hg', '-q', '-R', sourceDir, 'pull', '--update'])
321 335
322 combineSubscriptions(sourceDir, targetDir, timeout) 336 combineSubscriptions(sourceDir, targetDir, timeout)
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld