| Index: sitescripts/subscriptions/combineSubscriptions.py | 
| =================================================================== | 
| --- a/sitescripts/subscriptions/combineSubscriptions.py | 
| +++ b/sitescripts/subscriptions/combineSubscriptions.py | 
| @@ -11,312 +11,314 @@ | 
| # Adblock Plus is distributed in the hope that it will be useful, | 
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
| # GNU General Public License for more details. | 
| # | 
| # You should have received a copy of the GNU General Public License | 
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 
| -import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64 | 
| +import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile | 
| from getopt import getopt, GetoptError | 
| -acceptedExtensions = { | 
| - '.txt': True, | 
| -} | 
| -ignore = { | 
| - 'Apache.txt': True, | 
| - 'CC-BY-SA.txt': True, | 
| - 'GPL.txt': True, | 
| - 'MPL.txt': True, | 
| -} | 
| -verbatim = { | 
| - 'COPYING': True, | 
| -} | 
| +accepted_extensions = set([".txt"]) | 
| +ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) | 
| +verbatim = set(["COPYING"]) | 
| -def combineSubscriptions(sourceDirs, targetDir, timeout=30): | 
| - global acceptedExtensions, ignore, verbatim | 
| +def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None): | 
| + if not os.path.exists(target_dir): | 
| + os.makedirs(target_dir, 0755) | 
| - if isinstance(sourceDirs, basestring): | 
| - sourceDirs = {'': sourceDirs} | 
| + def save_file(filename, data): | 
| + handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False) | 
| + handle.write(data.encode("utf-8")) | 
| + handle.close() | 
| - if not os.path.exists(targetDir): | 
| - os.makedirs(targetDir, 0755) | 
| + try: | 
| + subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name]) | 
| + except: | 
| + print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name | 
| - known = {} | 
| - for sourceName, sourceDir in sourceDirs.iteritems(): | 
| - for file in os.listdir(sourceDir): | 
| - if file in ignore or file[0] == '.' or not os.path.isfile(os.path.join(sourceDir, file)): | 
| + path = os.path.join(target_dir, filename) | 
| + os.rename(handle.name, path) | 
| + os.rename(handle.name + ".gz", path + ".gz") | 
| + | 
| + known = set() | 
| + for source_name, source in sources.iteritems(): | 
| + for filename in source.list_top_level_files(): | 
| + if filename in ignore or filename.startswith("."): | 
| continue | 
| - if file in verbatim: | 
| - processVerbatimFile(sourceDir, targetDir, file) | 
| - elif not os.path.splitext(file)[1] in acceptedExtensions: | 
| + if filename in verbatim: | 
| + process_verbatim_file(source, save_file, filename) | 
| + elif not os.path.splitext(filename)[1] in accepted_extensions: | 
| continue | 
| else: | 
| try: | 
| - processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout) | 
| + process_subscription_file(source_name, sources, save_file, filename, timeout) | 
| except: | 
| - print >>sys.stderr, 'Error processing subscription file "%s"' % file | 
| + print >>sys.stderr, 'Error processing subscription file "%s"' % filename | 
| traceback.print_exc() | 
| print >>sys.stderr | 
| - known[os.path.splitext(file)[0] + '.tpl'] = True | 
| - known[os.path.splitext(file)[0] + '.tpl.gz'] = True | 
| - known[file] = True | 
| - known[file + '.gz'] = True | 
| + known.add(os.path.splitext(filename)[0] + ".tpl") | 
| + known.add(os.path.splitext(filename)[0] + ".tpl.gz") | 
| + known.add(filename) | 
| + known.add(filename + ".gz") | 
| - for file in os.listdir(targetDir): | 
| - if file[0] == '.': | 
| + for filename in os.listdir(target_dir): | 
| + if filename.startswith("."): | 
| continue | 
| - if not file in known: | 
| - os.remove(os.path.join(targetDir, file)) | 
| + if not filename in known: | 
| + os.remove(os.path.join(target_dir, filename)) | 
| -def saveFile(filePath, data): | 
| - handle = codecs.open(filePath, 'wb', encoding='utf-8') | 
| - handle.write(data) | 
| - handle.close() | 
| - try: | 
| - subprocess.check_output(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=5', filePath + '.gz', filePath]) | 
| - except: | 
| - print >>sys.stderr, 'Failed to compress file %s. Please ensure that p7zip is installed on the system.' % filePath | 
| +def process_verbatim_file(source, save_file, filename): | 
| + save_file(filename, source.read_file(filename)) | 
| -def processVerbatimFile(sourceDir, targetDir, file): | 
| - handle = codecs.open(os.path.join(sourceDir, file), 'rb', encoding='utf-8') | 
| - saveFile(os.path.join(targetDir, file), handle.read()) | 
| - handle.close() | 
| +def process_subscription_file(source_name, sources, save_file, filename, timeout): | 
| + source = sources[source_name] | 
| + lines = source.read_file(filename).splitlines() | 
| -def processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout): | 
| - sourceDir = sourceDirs[sourceName] | 
| - filePath = os.path.join(sourceDir, file) | 
| - handle = codecs.open(filePath, 'rb', encoding='utf-8') | 
| - lines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines()) | 
| - handle.close() | 
| + header = "" | 
| + if len(lines) > 0: | 
| + header = lines.pop(0) | 
| + if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): | 
| + raise Exception("This is not a valid Adblock Plus subscription file.") | 
| - header = '' | 
| - if len(lines) > 0: | 
| - header = lines[0] | 
| - del lines[0] | 
| - if not re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', header, re.I): | 
| - raise Exception('This is not a valid Adblock Plus subscription file.') | 
| - | 
| - lines = resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout) | 
| - seen = set(['checksum', 'version']) | 
| - def checkLine(line): | 
| - if line == '': | 
| + lines = resolve_includes(source_name, sources, lines, timeout) | 
| + seen = set(["checksum", "version"]) | 
| + def check_line(line): | 
| + if line == "": | 
| return False | 
| - match = re.search(r'^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:', line, re.M | re.I) | 
| + match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I) | 
| if not match: | 
| return True | 
| key = match.group(1).lower() | 
| if key in seen: | 
| return False | 
| seen.add(key) | 
| return True | 
| - lines = filter(checkLine, lines) | 
| + lines = filter(check_line, lines) | 
| - writeTPL(os.path.join(targetDir, os.path.splitext(file)[0] + '.tpl'), lines) | 
| + write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines) | 
| - lines.insert(0, '! Version: %s' % time.strftime('%Y%m%d%H%M', time.gmtime())) | 
| + lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) | 
| checksum = hashlib.md5() | 
| - checksum.update((header + '\n' + '\n'.join(lines)).encode('utf-8')) | 
| - lines.insert(0, '! Checksum: %s' % re.sub(r'=', '', base64.b64encode(checksum.digest()))) | 
| + checksum.update("\n".join([header] + lines).encode("utf-8")) | 
| + lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("=")) | 
| lines.insert(0, header) | 
| - saveFile(os.path.join(targetDir, file), '\n'.join(lines)) | 
| + save_file(filename, "\n".join(lines)) | 
| -def resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout, level=0): | 
| +def resolve_includes(source_name, sources, lines, timeout, level=0): | 
| if level > 5: | 
| - raise Exception('There are too many nested includes, which is probably the result of a circular reference somewhere.') | 
| + raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") | 
| result = [] | 
| for line in lines: | 
| - match = re.search(r'^\s*%include\s+(.*)%\s*$', line) | 
| + match = re.search(r"^\s*%include\s+(.*)%\s*$", line) | 
| if match: | 
| - file = match.group(1) | 
| - newLines = None | 
| - if re.match(r'^https?://', file): | 
| - result.append('! *** Fetched from: %s ***' % file) | 
| + filename = match.group(1) | 
| + newlines = None | 
| + if re.match(r"^https?://", filename): | 
| + result.append("! *** Fetched from: %s ***" % filename) | 
| for i in range(3): | 
| try: | 
| - request = urllib2.urlopen(file, None, timeout) | 
| + request = urllib2.urlopen(filename, None, timeout) | 
| + data = request.read() | 
| error = None | 
| break | 
| except urllib2.URLError, e: | 
| error = e | 
| time.sleep(5) | 
| if error: | 
| raise error | 
| # We should really get the charset from the headers rather than assuming | 
| # that it is UTF-8. However, some of the Google Code mirrors are | 
| # misconfigured and will return ISO-8859-1 as charset instead of UTF-8. | 
| - newLines = unicode(request.read(), 'utf-8').split('\n') | 
| - newLines = map(lambda l: re.sub(r'[\r\n]', '', l), newLines) | 
| - newLines = filter(lambda l: not re.search(r'^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', l, re.M | re.I), newLines) | 
| - newLines = filter(lambda l: not re.search(r'^\s*!\s*(Redirect|Homepage|Title|Version)\s*:', l, re.M | re.I), newLines) | 
| + newlines = data.decode("utf-8").splitlines() | 
| + newlines = filter(lambda l: not re.search(r"^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", l, re.M | re.I), newlines) | 
| + newlines = filter(lambda l: not re.search(r"^\s*!\s*(Redirect|Homepage|Title|Version)\s*:", l, re.M | re.I), newlines) | 
| else: | 
| - result.append('! *** %s ***' % file) | 
| + result.append("! *** %s ***" % filename) | 
| - includeSource = sourceName | 
| - if file.find(':') >= 0: | 
| - includeSource, file = file.split(':', 1) | 
| - if not includeSource in sourceDirs: | 
| - raise Exception('Cannot include file from repository "%s", this repository is unknown' % includeSource) | 
| + include_source = source_name | 
| + if ":" in filename: | 
| + include_source, filename = filename.split(":", 1) | 
| + if not include_source in sources: | 
| + raise Exception('Cannot include file from repository "%s", this repository is unknown' % include_source) | 
| - parentDir = sourceDirs[includeSource] | 
| - includePath = os.path.join(parentDir, file) | 
| - relPath = os.path.relpath(includePath, parentDir) | 
| - if len(relPath) == 0 or relPath[0] == '.': | 
| - raise Exception('Invalid include "%s", needs to be an HTTP/HTTPS URL or a relative file path' % file) | 
| + source = sources[include_source] | 
| + newlines = source.read_file(filename).splitlines() | 
| + newlines = resolve_includes(include_source, sources, newlines, timeout, level + 1) | 
| - handle = codecs.open(includePath, 'rb', encoding='utf-8') | 
| - newLines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines()) | 
| - newLines = resolveIncludes(includeSource, sourceDirs, includePath, newLines, timeout, level + 1) | 
| - handle.close() | 
| - | 
| - if len(newLines) and re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', newLines[0], re.I): | 
| - del newLines[0] | 
| - result.extend(newLines) | 
| + if len(newlines) and re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", newlines[0], re.I): | 
| + del newlines[0] | 
| + result.extend(newlines) | 
| else: | 
| - if line.find('%timestamp%') >= 0: | 
| + if line.find("%timestamp%") >= 0: | 
| if level == 0: | 
| - line = line.replace('%timestamp%', time.strftime('%d %b %Y %H:%M UTC', time.gmtime())) | 
| + line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) | 
| else: | 
| - line = '' | 
| + line = "" | 
| result.append(line) | 
| return result | 
| -def writeTPL(filePath, lines): | 
| +def write_tpl(save_file, filename, lines): | 
| result = [] | 
| - result.append('msFilterList') | 
| + result.append("msFilterList") | 
| for line in lines: | 
| - if re.search(r'^!', line): | 
| + if re.search(r"^\s*!", line): | 
| # This is a comment. Handle "Expires" comment in a special way, keep the rest. | 
| - match = re.search(r'\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', line, re.I) | 
| + match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I) | 
| if match: | 
| interval = int(match.group(1)) | 
| if match.group(2): | 
| interval = int(interval / 24) | 
| - result.append(': Expires=%i' % interval) | 
| + result.append(": Expires=%i" % interval) | 
| else: | 
| - result.append(re.sub(r'!', '#', re.sub(r'--!$', '--#', line))) | 
| - elif line.find('#') >= 0: | 
| + result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line))) | 
| + elif line.find("#") >= 0: | 
| # Element hiding rules are not supported in MSIE, drop them | 
| pass | 
| else: | 
| # We have a blocking or exception rule, try to convert it | 
| - origLine = line | 
| + origline = line | 
| - isException = False | 
| - if line[0:2] == '@@': | 
| - isException = True | 
| + is_exception = False | 
| + if line.startswith("@@"): | 
| + is_exception = True | 
| line = line[2:] | 
| - hasUnsupportedOptions = False | 
| - requiresScript = False | 
| - match = re.search(r'^(.*?)\$(.*)', line) | 
| + has_unsupported = False | 
| + requires_script = False | 
| + match = re.search(r"^(.*?)\$(.*)", line) | 
| if match: | 
| # This rule has options, check whether any of them are important | 
| line = match.group(1) | 
| - options = match.group(2).replace('_', '-').lower().split(',') | 
| + options = match.group(2).replace("_", "-").lower().split(",") | 
| # Remove first-party only exceptions, we will allow an ad server everywhere otherwise | 
| - if isException and '~third-party' in options: | 
| - hasUnsupportedOptions = True | 
| + if is_exception and "~third-party" in options: | 
| + has_unsupported = True | 
| # A number of options are not supported in MSIE but can be safely ignored, remove them | 
| - options = filter(lambda o: not o in ('', 'third-party', '~third-party', 'match-case', '~match-case', '~other', '~donottrack'), options) | 
| + options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options) | 
| # Also ignore domain negation of whitelists | 
| - if isException: | 
| - options = filter(lambda o: not o.startswith('domain=~'), options) | 
| + if is_exception: | 
| + options = filter(lambda o: not o.startswith("domain=~"), options) | 
| - unsupportedOptions = filter(lambda o: o in ('other', 'elemhide'), options) | 
| - if unsupportedOptions and len(unsupportedOptions) == len(options): | 
| + unsupported = filter(lambda o: o in ("other", "elemhide"), options) | 
| + if unsupported and len(unsupported) == len(options): | 
| # The rule only applies to types that are not supported in MSIE | 
| - hasUnsupportedOptions = True | 
| - elif 'donottrack' in options: | 
| + has_unsupported = True | 
| + elif "donottrack" in options: | 
| # Do-Not-Track rules have to be removed even if $donottrack is combined with other options | 
| - hasUnsupportedOptions = True | 
| - elif 'script' in options and len(options) == len(unsupportedOptions) + 1: | 
| + has_unsupported = True | 
| + elif "script" in options and len(options) == len(unsupported) + 1: | 
| # Mark rules that only apply to scripts for approximate conversion | 
| - requiresScript = True | 
| + requires_script = True | 
| elif len(options) > 0: | 
| # The rule has further options that aren't available in TPLs. For | 
| # exception rules that aren't specific to a domain we ignore all | 
| # remaining options to avoid potential false positives. Other rules | 
| # simply aren't included in the TPL file. | 
| - if isException: | 
| - hasUnsupportedOptions = any([o.startswith('domain=') for o in options]) | 
| + if is_exception: | 
| + has_unsupported = any([o.startswith("domain=") for o in options]) | 
| else: | 
| - hasUnsupportedOptions = True | 
| + has_unsupported = True | 
| - if hasUnsupportedOptions: | 
| + if has_unsupported: | 
| # Do not include filters with unsupported options | 
| - result.append('# ' + origLine) | 
| + result.append("# " + origline) | 
| else: | 
| - line = line.replace('^', '/') # Assume that separator placeholders mean slashes | 
| + line = line.replace("^", "/") # Assume that separator placeholders mean slashes | 
| # Try to extract domain info | 
| domain = None | 
| - match = re.search(r'^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)', line) | 
| + match = re.search(r"^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)", line) | 
| if match: | 
| domain = match.group(2) | 
| line = match.group(4) | 
| else: | 
| # No domain info, remove anchors at the rule start | 
| - line = re.sub(r'^\|\|', 'http://', line) | 
| - line = re.sub(r'^\|', '', line) | 
| + line = re.sub(r"^\|\|", "http://", line) | 
| + line = re.sub(r"^\|", "", line) | 
| # Remove anchors at the rule end | 
| - line = re.sub(r'\|$', '', line) | 
| + line = re.sub(r"\|$", "", line) | 
| # Remove unnecessary asterisks at the ends of lines | 
| - line = re.sub(r'\*$', '', line) | 
| + line = re.sub(r"\*$", "", line) | 
| # Emulate $script by appending *.js to the rule | 
| - if requiresScript: | 
| - line += '*.js' | 
| - if line.startswith('/*'): | 
| + if requires_script: | 
| + line += "*.js" | 
| + if line.startswith("/*"): | 
| line = line[2:] | 
| if domain: | 
| - line = '%sd %s %s' % ('+' if isException else '-', domain, line) | 
| - line = re.sub(r'\s+/$', '', line) | 
| + line = "%sd %s %s" % ("+" if is_exception else "-", domain, line) | 
| + line = re.sub(r"\s+/$", "", line) | 
| result.append(line) | 
| - elif isException: | 
| + elif is_exception: | 
| # Exception rules without domains are unsupported | 
| - result.append('# ' + origLine) | 
| + result.append("# " + origline) | 
| else: | 
| - result.append('- ' + line) | 
| - saveFile(filePath, '\n'.join(result) + '\n') | 
| + result.append("- " + line) | 
| + save_file(filename, "\n".join(result) + "\n") | 
| + | 
| +class FileSource: | 
| + def __init__(self, dir): | 
| + self._dir = dir | 
| + if os.path.exists(os.path.join(dir, ".hg")): | 
| + # This is a Mercurial repository, try updating | 
| + subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) | 
| + | 
| + def get_path(self, filename): | 
| + return os.path.join(self._dir, *filename.split("/")) | 
| + | 
| + def read_file(self, filename): | 
| + path = self.get_path(filename) | 
| + if os.path.relpath(path, self._dir).startswith("."): | 
| + raise Exception("Attempt to access a file outside the repository") | 
| + with codecs.open(path, "rb", encoding="utf-8") as handle: | 
| + return handle.read() | 
| + | 
| + def list_top_level_files(self): | 
| + for filename in os.listdir(self._dir): | 
| + path = os.path.join(self._dir, filename) | 
| + if os.path.isfile(path): | 
| + yield filename | 
| def usage(): | 
| - print '''Usage: %s [source_dir] [output_dir] | 
| + print """Usage: %s source_name=source_dir ... [output_dir] | 
| Options: | 
| -h --help Print this message and exit | 
| -t seconds --timeout=seconds Timeout when fetching remote subscriptions | 
| -''' % os.path.basename(sys.argv[0]) | 
| +""" % os.path.basename(sys.argv[0]) | 
| -if __name__ == '__main__': | 
| +if __name__ == "__main__": | 
| try: | 
| - opts, args = getopt(sys.argv[1:], 'ht:', ['help', 'timeout=']) | 
| + opts, args = getopt(sys.argv[1:], "ht:", ["help", "timeout="]) | 
| except GetoptError, e: | 
| print str(e) | 
| usage() | 
| sys.exit(2) | 
| - sourceDir, targetDir = '.', 'subscriptions' | 
| - if len(args) >= 1: | 
| - sourceDir = args[0] | 
| - if len(args) >= 2: | 
| - targetDir = args[1] | 
| + target_dir = "subscriptions" | 
| + sources = {} | 
| + for arg in args: | 
| + if "=" in arg: | 
| + source_name, source_dir = arg.split("=", 1) | 
| + sources[source_name] = FileSource(source_dir) | 
| + else: | 
| + target_dir = arg | 
| + if not sources: | 
| + sources[""] = FileSource(".") | 
| timeout = 30 | 
| for option, value in opts: | 
| - if option in ('-h', '--help'): | 
| + if option in ("-h", "--help"): | 
| usage() | 
| sys.exit() | 
| - elif option in ('-t', '--timeout'): | 
| + elif option in ("-t", "--timeout"): | 
| timeout = int(value) | 
| - if os.path.exists(os.path.join(sourceDir, '.hg')): | 
| - # Our source is a Mercurial repository, try updating | 
| - subprocess.check_call(['hg', '-q', '-R', sourceDir, 'pull', '--update']) | 
| - | 
| - combineSubscriptions(sourceDir, targetDir, timeout) | 
| + combine_subscriptions(sources, target_dir, timeout) |