Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/subscriptions/combineSubscriptions.py

Issue 28037010: Improved generation of filter subscription files (Closed)
Patch Set: Different approach to atomic updates Created Nov. 11, 2013, 2:52 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sitescripts/subscriptions/bin/updateSubscriptionDownloads.py ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
Index: sitescripts/subscriptions/combineSubscriptions.py
===================================================================
--- a/sitescripts/subscriptions/combineSubscriptions.py
+++ b/sitescripts/subscriptions/combineSubscriptions.py
@@ -11,312 +11,314 @@
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
-import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64
+import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile
from getopt import getopt, GetoptError
# File extensions that are treated as filter lists when found in a source.
accepted_extensions = {".txt"}
# Top-level files that must never be copied into the output directory.
ignore = {"Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"}
# Files copied to the output verbatim (plus a compressed companion).
verbatim = {"COPYING"}
-def combineSubscriptions(sourceDirs, targetDir, timeout=30):
- global acceptedExtensions, ignore, verbatim
+def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None):
+ if not os.path.exists(target_dir):
+ os.makedirs(target_dir, 0755)
- if isinstance(sourceDirs, basestring):
- sourceDirs = {'': sourceDirs}
+ def save_file(filename, data):
+ handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False)
+ handle.write(data.encode("utf-8"))
+ handle.close()
- if not os.path.exists(targetDir):
- os.makedirs(targetDir, 0755)
+ try:
+ subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name])
+ except:
+ print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name
- known = {}
- for sourceName, sourceDir in sourceDirs.iteritems():
- for file in os.listdir(sourceDir):
- if file in ignore or file[0] == '.' or not os.path.isfile(os.path.join(sourceDir, file)):
+ path = os.path.join(target_dir, filename)
+ os.rename(handle.name, path)
+ os.rename(handle.name + ".gz", path + ".gz")
+
+ known = set()
+ for source_name, source in sources.iteritems():
+ for filename in source.list_top_level_files():
+ if filename in ignore or filename.startswith("."):
continue
- if file in verbatim:
- processVerbatimFile(sourceDir, targetDir, file)
- elif not os.path.splitext(file)[1] in acceptedExtensions:
+ if filename in verbatim:
+ process_verbatim_file(source, save_file, filename)
+ elif not os.path.splitext(filename)[1] in accepted_extensions:
continue
else:
try:
- processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout)
+ process_subscription_file(source_name, sources, save_file, filename, timeout)
except:
- print >>sys.stderr, 'Error processing subscription file "%s"' % file
+ print >>sys.stderr, 'Error processing subscription file "%s"' % filename
traceback.print_exc()
print >>sys.stderr
- known[os.path.splitext(file)[0] + '.tpl'] = True
- known[os.path.splitext(file)[0] + '.tpl.gz'] = True
- known[file] = True
- known[file + '.gz'] = True
+ known.add(os.path.splitext(filename)[0] + ".tpl")
+ known.add(os.path.splitext(filename)[0] + ".tpl.gz")
+ known.add(filename)
+ known.add(filename + ".gz")
- for file in os.listdir(targetDir):
- if file[0] == '.':
+ for filename in os.listdir(target_dir):
+ if filename.startswith("."):
continue
- if not file in known:
- os.remove(os.path.join(targetDir, file))
+ if not filename in known:
+ os.remove(os.path.join(target_dir, filename))
def process_verbatim_file(source, save_file, filename):
  """Copy a file from *source* to the output unmodified via *save_file*."""
  contents = source.read_file(filename)
  save_file(filename, contents)
def process_subscription_file(source_name, sources, save_file, filename, timeout):
  """Process one filter-list file: resolve includes, emit it plus a .tpl variant.

  The regenerated list gets a fresh "! Version:" comment and an MD5-based
  "! Checksum:" comment; both replace any such comments from the input.
  Raises if the file does not start with a valid [Adblock ...] header.
  """
  source = sources[source_name]
  lines = source.read_file(filename).splitlines()

  header = lines.pop(0) if lines else ""
  if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):
    raise Exception("This is not a valid Adblock Plus subscription file.")

  lines = resolve_includes(source_name, sources, lines, timeout)

  # Drop empty lines and duplicate special comments. Checksum/Version are
  # pre-seeded so stale copies are always removed - new ones are added below.
  seen = set(["checksum", "version"])
  def keep(line):
    if line == "":
      return False
    match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)
    if match is None:
      return True
    key = match.group(1).lower()
    if key in seen:
      return False
    seen.add(key)
    return True
  lines = [line for line in lines if keep(line)]

  write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines)

  lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))

  checksum = hashlib.md5()
  checksum.update("\n".join([header] + lines).encode("utf-8"))
  lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))
  lines.insert(0, header)
  save_file(filename, "\n".join(lines))
def resolve_includes(source_name, sources, lines, timeout, level=0):
  """Recursively expand %include file% directives in a filter list.

  Includes may be HTTP(S) URLs (fetched with up to three attempts) or
  "reponame:path" / "path" references into the known sources. level guards
  against circular includes; at most five nesting levels are allowed.
  Returns the fully expanded list of lines.
  """
  if level > 5:
    raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")

  result = []
  for line in lines:
    match = re.search(r"^\s*%include\s+(.*)%\s*$", line)
    if match:
      filename = match.group(1)
      newlines = None
      if re.match(r"^https?://", filename):
        result.append("! *** Fetched from: %s ***" % filename)

        # Retry transient network failures up to three times, pausing five
        # seconds after each failed attempt; re-raise the last error.
        for i in range(3):
          try:
            request = urllib2.urlopen(filename, None, timeout)
            data = request.read()
            error = None
            break
          except urllib2.URLError, e:
            error = e
            time.sleep(5)
        if error:
          raise error

        # We should really get the charset from the headers rather than assuming
        # that it is UTF-8. However, some of the Google Code mirrors are
        # misconfigured and will return ISO-8859-1 as charset instead of UTF-8.
        newlines = data.decode("utf-8").splitlines()
        # Strip expiration and metadata comments from fetched lists - the
        # combined list carries its own.
        newlines = filter(lambda l: not re.search(r"^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", l, re.M | re.I), newlines)
        newlines = filter(lambda l: not re.search(r"^\s*!\s*(Redirect|Homepage|Title|Version)\s*:", l, re.M | re.I), newlines)
      else:
        result.append("! *** %s ***" % filename)

        # "reponame:path" selects another source; a bare path stays within
        # the current one.
        include_source = source_name
        if ":" in filename:
          include_source, filename = filename.split(":", 1)
        if not include_source in sources:
          raise Exception('Cannot include file from repository "%s", this repository is unknown' % include_source)

        source = sources[include_source]
        newlines = source.read_file(filename).splitlines()
        newlines = resolve_includes(include_source, sources, newlines, timeout, level + 1)

      # Drop the [Adblock ...] header of the included file, if any.
      if len(newlines) and re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", newlines[0], re.I):
        del newlines[0]
      result.extend(newlines)
    else:
      if line.find("%timestamp%") >= 0:
        # Only the top-level file gets a timestamp; in included files the
        # placeholder line is removed entirely.
        if level == 0:
          line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))
        else:
          line = ""
      result.append(line)
  return result
def write_tpl(save_file, filename, lines):
  """Convert an Adblock Plus filter list into MSIE Tracking Protection List
  (TPL) format and write it out via save_file.

  Rules that cannot be represented in TPL are kept as "# ..." comments so
  the conversion is traceable; element hiding rules are dropped entirely.
  """
  result = []
  result.append("msFilterList")
  for line in lines:
    if re.search(r"^\s*!", line):
      # This is a comment. Handle "Expires" comment in a special way, keep the rest.
      match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)
      if match:
        interval = int(match.group(1))
        if match.group(2):
          # Value given in hours - TPL expects days.
          interval = int(interval / 24)
        result.append(": Expires=%i" % interval)
      else:
        # Turn "!" comment markers (and "--!" comment ends) into TPL's "#".
        result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line)))
    elif line.find("#") >= 0:
      # Element hiding rules are not supported in MSIE, drop them
      pass
    else:
      # We have a blocking or exception rule, try to convert it
      origline = line

      is_exception = False
      if line.startswith("@@"):
        is_exception = True
        line = line[2:]

      has_unsupported = False
      requires_script = False
      match = re.search(r"^(.*?)\$(.*)", line)
      if match:
        # This rule has options, check whether any of them are important
        line = match.group(1)
        options = match.group(2).replace("_", "-").lower().split(",")

        # Remove first-party only exceptions, we will allow an ad server everywhere otherwise
        if is_exception and "~third-party" in options:
          has_unsupported = True

        # A number of options are not supported in MSIE but can be safely ignored, remove them
        options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options)

        # Also ignore domain negation of whitelists
        if is_exception:
          options = filter(lambda o: not o.startswith("domain=~"), options)

        unsupported = filter(lambda o: o in ("other", "elemhide"), options)
        if unsupported and len(unsupported) == len(options):
          # The rule only applies to types that are not supported in MSIE
          has_unsupported = True
        elif "donottrack" in options:
          # Do-Not-Track rules have to be removed even if $donottrack is combined with other options
          has_unsupported = True
        elif "script" in options and len(options) == len(unsupported) + 1:
          # Mark rules that only apply to scripts for approximate conversion
          requires_script = True
        elif len(options) > 0:
          # The rule has further options that aren't available in TPLs. For
          # exception rules that aren't specific to a domain we ignore all
          # remaining options to avoid potential false positives. Other rules
          # simply aren't included in the TPL file.
          if is_exception:
            has_unsupported = any([o.startswith("domain=") for o in options])
          else:
            has_unsupported = True

      if has_unsupported:
        # Do not include filters with unsupported options
        result.append("# " + origline)
      else:
        line = line.replace("^", "/")  # Assume that separator placeholders mean slashes

        # Try to extract domain info
        domain = None
        match = re.search(r"^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)", line)
        if match:
          domain = match.group(2)
          line = match.group(4)
        else:
          # No domain info, remove anchors at the rule start
          line = re.sub(r"^\|\|", "http://", line)
          line = re.sub(r"^\|", "", line)
        # Remove anchors at the rule end
        line = re.sub(r"\|$", "", line)
        # Remove unnecessary asterisks at the ends of lines
        line = re.sub(r"\*$", "", line)
        # Emulate $script by appending *.js to the rule
        if requires_script:
          line += "*.js"
        if line.startswith("/*"):
          line = line[2:]
        if domain:
          # Domain-specific rule: "+d"/"-d" prefix selects allow/block.
          line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)
          line = re.sub(r"\s+/$", "", line)
          result.append(line)
        elif is_exception:
          # Exception rules without domains are unsupported
          result.append("# " + origline)
        else:
          result.append("- " + line)
  save_file(filename, "\n".join(result) + "\n")
class FileSource:
  """Provides read access to subscription files stored in a local directory.

  If the directory is a Mercurial working copy it is updated from its
  default remote when the source object is created.
  """
  def __init__(self, dir):
    self._dir = dir
    if os.path.exists(os.path.join(dir, ".hg")):
      # This is a Mercurial repository, try updating
      subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])

  def get_path(self, filename):
    """Map a /-separated relative path onto the local file system."""
    parts = filename.split("/")
    return os.path.join(self._dir, *parts)

  def read_file(self, filename):
    """Return the decoded (UTF-8) contents of a file in the repository."""
    path = self.get_path(filename)
    # Reject paths that escape the repository root (e.g. via "..").
    if os.path.relpath(path, self._dir).startswith("."):
      raise Exception("Attempt to access a file outside the repository")
    handle = codecs.open(path, "rb", encoding="utf-8")
    try:
      return handle.read()
    finally:
      handle.close()

  def list_top_level_files(self):
    """Yield the names of regular files directly inside the repository root."""
    for filename in os.listdir(self._dir):
      if os.path.isfile(os.path.join(self._dir, filename)):
        yield filename
def usage():
- print '''Usage: %s [source_dir] [output_dir]
+ print """Usage: %s source_name=source_dir ... [output_dir]
Options:
-h --help Print this message and exit
-t seconds --timeout=seconds Timeout when fetching remote subscriptions
-''' % os.path.basename(sys.argv[0])
+""" % os.path.basename(sys.argv[0])
-if __name__ == '__main__':
+if __name__ == "__main__":
try:
- opts, args = getopt(sys.argv[1:], 'ht:', ['help', 'timeout='])
+ opts, args = getopt(sys.argv[1:], "ht:", ["help", "timeout="])
except GetoptError, e:
print str(e)
usage()
sys.exit(2)
- sourceDir, targetDir = '.', 'subscriptions'
- if len(args) >= 1:
- sourceDir = args[0]
- if len(args) >= 2:
- targetDir = args[1]
+ target_dir = "subscriptions"
+ sources = {}
+ for arg in args:
+ if "=" in arg:
+ source_name, source_dir = arg.split("=", 1)
+ sources[source_name] = FileSource(source_dir)
+ else:
+ target_dir = arg
+ if not sources:
+ sources[""] = FileSource(".")
timeout = 30
for option, value in opts:
- if option in ('-h', '--help'):
+ if option in ("-h", "--help"):
usage()
sys.exit()
- elif option in ('-t', '--timeout'):
+ elif option in ("-t", "--timeout"):
timeout = int(value)
- if os.path.exists(os.path.join(sourceDir, '.hg')):
- # Our source is a Mercurial repository, try updating
- subprocess.check_call(['hg', '-q', '-R', sourceDir, 'pull', '--update'])
-
- combineSubscriptions(sourceDir, targetDir, timeout)
+ combine_subscriptions(sources, target_dir, timeout)
« no previous file with comments | « sitescripts/subscriptions/bin/updateSubscriptionDownloads.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld