| Index: sitescripts/subscriptions/combineSubscriptions.py |
| =================================================================== |
| --- a/sitescripts/subscriptions/combineSubscriptions.py |
| +++ b/sitescripts/subscriptions/combineSubscriptions.py |
| @@ -14,309 +14,308 @@ |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64 |
| from getopt import getopt, GetoptError |
| -acceptedExtensions = { |
| - '.txt': True, |
| -} |
| -ignore = { |
| - 'Apache.txt': True, |
| - 'CC-BY-SA.txt': True, |
| - 'GPL.txt': True, |
| - 'MPL.txt': True, |
| -} |
| -verbatim = { |
| - 'COPYING': True, |
| -} |
| +accepted_extensions = set([".txt"]) |
| +ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) |
| +verbatim = set(["COPYING"]) |
| -def combineSubscriptions(sourceDirs, targetDir, timeout=30): |
| - global acceptedExtensions, ignore, verbatim |
| +def combine_subscriptions(sources, target_dir, timeout=30): |
| + global accepted_extensions, ignore, verbatim |
|
Sebastian Noack
2013/11/06 15:56:22
The global keyword is unneeded here. I know it was […]
|
| - if isinstance(sourceDirs, basestring): |
| - sourceDirs = {'': sourceDirs} |
| + if not os.path.exists(target_dir): |
| + os.makedirs(target_dir, 0755) |
| - if not os.path.exists(targetDir): |
| - os.makedirs(targetDir, 0755) |
| - |
| - known = {} |
| - for sourceName, sourceDir in sourceDirs.iteritems(): |
| - for file in os.listdir(sourceDir): |
| - if file in ignore or file[0] == '.' or not os.path.isfile(os.path.join(sourceDir, file)): |
| + known = set() |
| + for source_name, source in sources.iteritems(): |
| + for filename in source.list_top_level_files(): |
| + if filename in ignore or filename.startswith("."): |
| continue |
| - if file in verbatim: |
| - processVerbatimFile(sourceDir, targetDir, file) |
| - elif not os.path.splitext(file)[1] in acceptedExtensions: |
| + if filename in verbatim: |
| + process_verbatim_file(source, target_dir, filename) |
| + elif not os.path.splitext(filename)[1] in accepted_extensions: |
| continue |
| else: |
| try: |
| - processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout) |
| + process_subscription_file(source_name, sources, target_dir, filename, timeout) |
| except: |
| - print >>sys.stderr, 'Error processing subscription file "%s"' % file |
| + print >>sys.stderr, 'Error processing subscription file "%s"' % filename |
| traceback.print_exc() |
| print >>sys.stderr |
| - known[os.path.splitext(file)[0] + '.tpl'] = True |
| - known[os.path.splitext(file)[0] + '.tpl.gz'] = True |
| - known[file] = True |
| - known[file + '.gz'] = True |
| + known.add(os.path.splitext(filename)[0] + ".tpl") |
| + known.add(os.path.splitext(filename)[0] + ".tpl.gz") |
| + known.add(filename) |
| + known.add(filename + ".gz") |
| - for file in os.listdir(targetDir): |
| - if file[0] == '.': |
| + for filename in os.listdir(target_dir): |
| + if filename.startswith("."): |
| continue |
| - if not file in known: |
| - os.remove(os.path.join(targetDir, file)) |
| + if not filename in known: |
| + os.remove(os.path.join(target_dir, filename)) |
| -def saveFile(filePath, data): |
| - handle = codecs.open(filePath, 'wb', encoding='utf-8') |
| +def save_file(path, data): |
| + handle = codecs.open(path, "wb", encoding="utf-8") |
| handle.write(data) |
| handle.close() |
| try: |
| - subprocess.check_output(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=5', filePath + '.gz', filePath]) |
| + subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path]) |
| except: |
| - print >>sys.stderr, 'Failed to compress file %s. Please ensure that p7zip is installed on the system.' % filePath |
| + print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path |
| -def processVerbatimFile(sourceDir, targetDir, file): |
| - handle = codecs.open(os.path.join(sourceDir, file), 'rb', encoding='utf-8') |
| - saveFile(os.path.join(targetDir, file), handle.read()) |
| - handle.close() |
| +def process_verbatim_file(source, target_dir, filename): |
| + save_file(os.path.join(target_dir, filename), source.read_file(filename)) |
| -def processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout): |
| - sourceDir = sourceDirs[sourceName] |
| - filePath = os.path.join(sourceDir, file) |
| - handle = codecs.open(filePath, 'rb', encoding='utf-8') |
| - lines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines()) |
| - handle.close() |
| +def process_subscription_file(source_name, sources, target_dir, filename, timeout): |
| + source = sources[source_name] |
| + lines = source.read_file(filename).splitlines() |
| - header = '' |
| + header = "" |
| if len(lines) > 0: |
| - header = lines[0] |
| - del lines[0] |
| - if not re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', header, re.I): |
| - raise Exception('This is not a valid Adblock Plus subscription file.') |
| + header = lines.pop(0) |
| + if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): |
| + raise Exception("This is not a valid Adblock Plus subscription file.") |
| - lines = resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout) |
| - seen = set(['checksum', 'version']) |
| - def checkLine(line): |
| - if line == '': |
| + lines = resolve_includes(source_name, sources, lines, timeout) |
| + seen = set(["checksum", "version"]) |
| + def check_line(line): |
| + if line == "": |
| return False |
| - match = re.search(r'^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:', line, re.M | re.I) |
| + match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I) |
| if not match: |
| return True |
| key = match.group(1).lower() |
| if key in seen: |
| return False |
| seen.add(key) |
| return True |
| - lines = filter(checkLine, lines) |
| + lines = filter(check_line, lines) |
| - writeTPL(os.path.join(targetDir, os.path.splitext(file)[0] + '.tpl'), lines) |
| + write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines) |
| - lines.insert(0, '! Version: %s' % time.strftime('%Y%m%d%H%M', time.gmtime())) |
| + lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) |
| checksum = hashlib.md5() |
| - checksum.update((header + '\n' + '\n'.join(lines)).encode('utf-8')) |
| - lines.insert(0, '! Checksum: %s' % re.sub(r'=', '', base64.b64encode(checksum.digest()))) |
| + checksum.update("\n".join([header] + lines).encode("utf-8")) |
| + lines.insert(0, "! Checksum: %s" % re.sub(r"=", "", base64.b64encode(checksum.digest()))) |
|
Sebastian Noack
2013/11/06 15:56:22
You don't need a regex to strip a given character.
Wladimir Palant
2013/11/08 15:08:07
It's way too late to change the specification of c[…]
|
| lines.insert(0, header) |
| - saveFile(os.path.join(targetDir, file), '\n'.join(lines)) |
| + save_file(os.path.join(target_dir, filename), "\n".join(lines)) |
| -def resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout, level=0): |
| +def resolve_includes(source_name, sources, lines, timeout, level=0): |
| if level > 5: |
| - raise Exception('There are too many nested includes, which is probably the result of a circular reference somewhere.') |
| + raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") |
| result = [] |
| for line in lines: |
| - match = re.search(r'^\s*%include\s+(.*)%\s*$', line) |
| + match = re.search(r"^\s*%include\s+(.*)%\s*$", line) |
| if match: |
| - file = match.group(1) |
| - newLines = None |
| - if re.match(r'^https?://', file): |
| - result.append('! *** Fetched from: %s ***' % file) |
| + filename = match.group(1) |
| + newlines = None |
| + if re.match(r"^https?://", filename): |
| + result.append("! *** Fetched from: %s ***" % filename) |
| for i in range(3): |
| try: |
| - request = urllib2.urlopen(file, None, timeout) |
| + request = urllib2.urlopen(filename, None, timeout) |
| + data = request.read() |
| error = None |
| break |
| except urllib2.URLError, e: |
| error = e |
| time.sleep(5) |
| if error: |
| raise error |
| # We should really get the charset from the headers rather than assuming |
| # that it is UTF-8. However, some of the Google Code mirrors are |
| # misconfigured and will return ISO-8859-1 as charset instead of UTF-8. |
| - newLines = unicode(request.read(), 'utf-8').split('\n') |
| - newLines = map(lambda l: re.sub(r'[\r\n]', '', l), newLines) |
| - newLines = filter(lambda l: not re.search(r'^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', l, re.M | re.I), newLines) |
| - newLines = filter(lambda l: not re.search(r'^\s*!\s*(Redirect|Homepage|Title|Version)\s*:', l, re.M | re.I), newLines) |
| + newlines = data.decode("utf-8").splitlines() |
| + newlines = filter(lambda l: not re.search(r"^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", l, re.M | re.I), newlines) |
| + newlines = filter(lambda l: not re.search(r"^\s*!\s*(Redirect|Homepage|Title|Version)\s*:", l, re.M | re.I), newlines) |
| else: |
| - result.append('! *** %s ***' % file) |
| + result.append("! *** %s ***" % filename) |
| - includeSource = sourceName |
| - if file.find(':') >= 0: |
| - includeSource, file = file.split(':', 1) |
| - if not includeSource in sourceDirs: |
| - raise Exception('Cannot include file from repository "%s", this repository is unknown' % includeSource) |
| + include_source = source_name |
| + if ":" in filename: |
| + include_source, filename = filename.split(":", 1) |
| + if not include_source in sources: |
| + raise Exception('Cannot include file from repository "%s", this repository is unknown' % include_source) |
| - parentDir = sourceDirs[includeSource] |
| - includePath = os.path.join(parentDir, file) |
| - relPath = os.path.relpath(includePath, parentDir) |
| - if len(relPath) == 0 or relPath[0] == '.': |
| - raise Exception('Invalid include "%s", needs to be an HTTP/HTTPS URL or a relative file path' % file) |
| + source = sources[include_source] |
| + newlines = source.read_file(filename).splitlines() |
| + newlines = resolve_includes(include_source, sources, newlines, timeout, level + 1) |
| - handle = codecs.open(includePath, 'rb', encoding='utf-8') |
| - newLines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines()) |
| - newLines = resolveIncludes(includeSource, sourceDirs, includePath, newLines, timeout, level + 1) |
| - handle.close() |
| - |
| - if len(newLines) and re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', newLines[0], re.I): |
| - del newLines[0] |
| - result.extend(newLines) |
| + if len(newlines) and re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", newlines[0], re.I): |
| + del newlines[0] |
| + result.extend(newlines) |
| else: |
| - if line.find('%timestamp%') >= 0: |
| + if line.find("%timestamp%") >= 0: |
| if level == 0: |
| - line = line.replace('%timestamp%', time.strftime('%d %b %Y %H:%M UTC', time.gmtime())) |
| + line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) |
| else: |
| - line = '' |
| + line = "" |
| result.append(line) |
| return result |
| -def writeTPL(filePath, lines): |
| +def write_tpl(path, lines): |
| result = [] |
| - result.append('msFilterList') |
| + result.append("msFilterList") |
| for line in lines: |
| - if re.search(r'^!', line): |
| + if re.search(r"^\s*!", line): |
| # This is a comment. Handle "Expires" comment in a special way, keep the rest. |
| - match = re.search(r'\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', line, re.I) |
| + match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I) |
| if match: |
| interval = int(match.group(1)) |
| if match.group(2): |
| interval = int(interval / 24) |
| - result.append(': Expires=%i' % interval) |
| + result.append(": Expires=%i" % interval) |
| else: |
| - result.append(re.sub(r'!', '#', re.sub(r'--!$', '--#', line))) |
| - elif line.find('#') >= 0: |
| + result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line))) |
| + elif line.find("#") >= 0: |
| # Element hiding rules are not supported in MSIE, drop them |
| pass |
| else: |
| # We have a blocking or exception rule, try to convert it |
| - origLine = line |
| + origline = line |
| - isException = False |
| - if line[0:2] == '@@': |
| - isException = True |
| + isexception = False |
|
Sebastian Noack
2013/11/06 15:56:22
Apparently you don't like underscores, but "is_exc[…]
Wladimir Palant
2013/11/08 15:08:07
As you wish...
|
| + if line.startswith("@@"): |
| + isexception = True |
| line = line[2:] |
| - hasUnsupportedOptions = False |
| - requiresScript = False |
| - match = re.search(r'^(.*?)\$(.*)', line) |
| + has_unsupported = False |
| + requires_script = False |
| + match = re.search(r"^(.*?)\$(.*)", line) |
| if match: |
| # This rule has options, check whether any of them are important |
| line = match.group(1) |
| - options = match.group(2).replace('_', '-').lower().split(',') |
| + options = match.group(2).replace("_", "-").lower().split(",") |
| # Remove first-party only exceptions, we will allow an ad server everywhere otherwise |
| - if isException and '~third-party' in options: |
| - hasUnsupportedOptions = True |
| + if isexception and "~third-party" in options: |
| + has_unsupported = True |
| # A number of options are not supported in MSIE but can be safely ignored, remove them |
| - options = filter(lambda o: not o in ('', 'third-party', '~third-party', 'match-case', '~match-case', '~other', '~donottrack'), options) |
| + options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options) |
| # Also ignore domain negation of whitelists |
| - if isException: |
| - options = filter(lambda o: not o.startswith('domain=~'), options) |
| + if isexception: |
| + options = filter(lambda o: not o.startswith("domain=~"), options) |
| - unsupportedOptions = filter(lambda o: o in ('other', 'elemhide'), options) |
| - if unsupportedOptions and len(unsupportedOptions) == len(options): |
| + unsupported = filter(lambda o: o in ("other", "elemhide"), options) |
| + if unsupported and len(unsupported) == len(options): |
| # The rule only applies to types that are not supported in MSIE |
| - hasUnsupportedOptions = True |
| - elif 'donottrack' in options: |
| + has_unsupported = True |
| + elif "donottrack" in options: |
| # Do-Not-Track rules have to be removed even if $donottrack is combined with other options |
| - hasUnsupportedOptions = True |
| - elif 'script' in options and len(options) == len(unsupportedOptions) + 1: |
| + has_unsupported = True |
| + elif "script" in options and len(options) == len(unsupported) + 1: |
| # Mark rules that only apply to scripts for approximate conversion |
| - requiresScript = True |
| + requires_script = True |
| elif len(options) > 0: |
| # The rule has further options that aren't available in TPLs. For |
| # exception rules that aren't specific to a domain we ignore all |
| # remaining options to avoid potential false positives. Other rules |
| # simply aren't included in the TPL file. |
| - if isException: |
| - hasUnsupportedOptions = any([o.startswith('domain=') for o in options]) |
| + if isexception: |
| + has_unsupported = any([o.startswith("domain=") for o in options]) |
| else: |
| - hasUnsupportedOptions = True |
| + has_unsupported = True |
| - if hasUnsupportedOptions: |
| + if has_unsupported: |
| # Do not include filters with unsupported options |
| - result.append('# ' + origLine) |
| + result.append("# " + origline) |
| else: |
| - line = line.replace('^', '/') # Assume that separator placeholders mean slashes |
| + line = line.replace("^", "/") # Assume that separator placeholders mean slashes |
| # Try to extract domain info |
| domain = None |
| - match = re.search(r'^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)', line) |
| + match = re.search(r"^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)", line) |
| if match: |
| domain = match.group(2) |
| line = match.group(4) |
| else: |
| # No domain info, remove anchors at the rule start |
| - line = re.sub(r'^\|\|', 'http://', line) |
| - line = re.sub(r'^\|', '', line) |
| + line = re.sub(r"^\|\|", "http://", line) |
| + line = re.sub(r"^\|", "", line) |
| # Remove anchors at the rule end |
| - line = re.sub(r'\|$', '', line) |
| + line = re.sub(r"\|$", "", line) |
| # Remove unnecessary asterisks at the ends of lines |
| - line = re.sub(r'\*$', '', line) |
| + line = re.sub(r"\*$", "", line) |
| # Emulate $script by appending *.js to the rule |
| - if requiresScript: |
| - line += '*.js' |
| - if line.startswith('/*'): |
| + if requires_script: |
| + line += "*.js" |
| + if line.startswith("/*"): |
| line = line[2:] |
| if domain: |
| - line = '%sd %s %s' % ('+' if isException else '-', domain, line) |
| - line = re.sub(r'\s+/$', '', line) |
| + line = "%sd %s %s" % ("+" if isexception else "-", domain, line) |
| + line = re.sub(r"\s+/$", "", line) |
| result.append(line) |
| - elif isException: |
| + elif isexception: |
| # Exception rules without domains are unsupported |
| - result.append('# ' + origLine) |
| + result.append("# " + origline) |
| else: |
| - result.append('- ' + line) |
| - saveFile(filePath, '\n'.join(result) + '\n') |
| + result.append("- " + line) |
| + save_file(path, "\n".join(result) + "\n") |
| + |
| +class FileSource: |
| + def __init__(self, dir): |
| + self._dir = dir |
| + if os.path.exists(os.path.join(dir, ".hg")): |
| + # This is a Mercurial repository, try updating |
| + subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) |
| + |
| + def get_path(self, filename): |
| + return os.path.join(self._dir, *filename.split("/")) |
| + |
| + def read_file(self, filename): |
| + path = self.get_path(filename) |
| + if os.path.relpath(path, self._dir).startswith("."): |
| + raise Exception("Attempt to access a file outside the repository") |
| + with codecs.open(path, "rb", encoding="utf-8") as handle: |
| + return handle.read() |
| + |
| + def list_top_level_files(self): |
| + for filename in os.listdir(self._dir): |
| + path = os.path.join(self._dir, filename) |
| + if os.path.isfile(path): |
| + yield filename |
| def usage(): |
| - print '''Usage: %s [source_dir] [output_dir] |
| + print """Usage: %s source_name=source_dir ... [output_dir] |
| Options: |
| -h --help Print this message and exit |
| -t seconds --timeout=seconds Timeout when fetching remote subscriptions |
| -''' % os.path.basename(sys.argv[0]) |
| +""" % os.path.basename(sys.argv[0]) |
| -if __name__ == '__main__': |
| +if __name__ == "__main__": |
| try: |
| - opts, args = getopt(sys.argv[1:], 'ht:', ['help', 'timeout=']) |
| + opts, args = getopt(sys.argv[1:], "ht:", ["help", "timeout="]) |
| except GetoptError, e: |
| print str(e) |
| usage() |
| sys.exit(2) |
| - sourceDir, targetDir = '.', 'subscriptions' |
| - if len(args) >= 1: |
| - sourceDir = args[0] |
| - if len(args) >= 2: |
| - targetDir = args[1] |
| + target_dir = "subscriptions" |
| + sources = {} |
| + for arg in args: |
| + if "=" in arg: |
| + source_name, source_dir = arg.split("=", 1) |
| + sources[source_name] = FileSource(source_dir) |
| + else: |
| + target_dir = arg |
| + if not sources: |
| + sources[""] = FileSource(".") |
| timeout = 30 |
| for option, value in opts: |
| - if option in ('-h', '--help'): |
| + if option in ("-h", "--help"): |
| usage() |
| sys.exit() |
| - elif option in ('-t', '--timeout'): |
| + elif option in ("-t", "--timeout"): |
| timeout = int(value) |
| - if os.path.exists(os.path.join(sourceDir, '.hg')): |
| - # Our source is a Mercurial repository, try updating |
| - subprocess.check_call(['hg', '-q', '-R', sourceDir, 'pull', '--update']) |
| - |
| - combineSubscriptions(sourceDir, targetDir, timeout) |
| + combine_subscriptions(sources, target_dir, timeout) |