sitescripts/subscriptions/combineSubscriptions.py - Issue 28037010: Improved generation of filter subscription files

Unified Diff: sitescripts/subscriptions/combineSubscriptions.py

Issue 28037010: Improved generation of filter subscription files (Closed)

Patch Set: Fixed review comments (created Nov. 8, 2013, 3:05 p.m.)

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: sitescripts/subscriptions/combineSubscriptions.py

===================================================================

--- a/sitescripts/subscriptions/combineSubscriptions.py

+++ b/sitescripts/subscriptions/combineSubscriptions.py

@@ -14,309 +14,306 @@

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64

from getopt import getopt, GetoptError

-acceptedExtensions = {

- '.txt': True,

-}

-ignore = {

- 'Apache.txt': True,

- 'CC-BY-SA.txt': True,

- 'GPL.txt': True,

- 'MPL.txt': True,

-}

-verbatim = {

- 'COPYING': True,

-}

+accepted_extensions = set([".txt"])

+ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])

+verbatim = set(["COPYING"])

-def combineSubscriptions(sourceDirs, targetDir, timeout=30):

- global acceptedExtensions, ignore, verbatim

+def combine_subscriptions(sources, target_dir, timeout=30):

+ if not os.path.exists(target_dir):

+ os.makedirs(target_dir, 0755)

- if isinstance(sourceDirs, basestring):

- sourceDirs = {'': sourceDirs}

- if not os.path.exists(targetDir):

- os.makedirs(targetDir, 0755)

- known = {}

- for sourceName, sourceDir in sourceDirs.iteritems():

- for file in os.listdir(sourceDir):

- if file in ignore or file[0] == '.' or not os.path.isfile(os.path.join(sourceDir, file)):

+ known = set()

+ for source_name, source in sources.iteritems():

+ for filename in source.list_top_level_files():

+ if filename in ignore or filename.startswith("."):

continue

- if file in verbatim:

- processVerbatimFile(sourceDir, targetDir, file)

- elif not os.path.splitext(file)[1] in acceptedExtensions:

+ if filename in verbatim:

+ process_verbatim_file(source, target_dir, filename)

+ elif not os.path.splitext(filename)[1] in accepted_extensions:

continue

else:

try:

- processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout)

+ process_subscription_file(source_name, sources, target_dir, filename, timeout)

except:

- print >>sys.stderr, 'Error processing subscription file "%s"' % file

+ print >>sys.stderr, 'Error processing subscription file "%s"' % filename

traceback.print_exc()

print >>sys.stderr

- known[os.path.splitext(file)[0] + '.tpl'] = True

- known[os.path.splitext(file)[0] + '.tpl.gz'] = True

- known[file] = True

- known[file + '.gz'] = True

+ known.add(os.path.splitext(filename)[0] + ".tpl")

+ known.add(os.path.splitext(filename)[0] + ".tpl.gz")

+ known.add(filename)

+ known.add(filename + ".gz")

- for file in os.listdir(targetDir):

- if file[0] == '.':

+ for filename in os.listdir(target_dir):

+ if filename.startswith("."):

continue

- if not file in known:

- os.remove(os.path.join(targetDir, file))

+ if not filename in known:

+ os.remove(os.path.join(target_dir, filename))

-def saveFile(filePath, data):

- handle = codecs.open(filePath, 'wb', encoding='utf-8')

+def save_file(path, data):

+ handle = codecs.open(path, "wb", encoding="utf-8")

handle.write(data)

handle.close()

try:

- subprocess.check_output(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=5', filePath + '.gz', filePath])

+ subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path])

except:

- print >>sys.stderr, 'Failed to compress file %s. Please ensure that p7zip is installed on the system.' % filePath

+ print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path

-def processVerbatimFile(sourceDir, targetDir, file):

- handle = codecs.open(os.path.join(sourceDir, file), 'rb', encoding='utf-8')

- saveFile(os.path.join(targetDir, file), handle.read())

- handle.close()

+def process_verbatim_file(source, target_dir, filename):

+ save_file(os.path.join(target_dir, filename), source.read_file(filename))

-def processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout):

- sourceDir = sourceDirs[sourceName]

- filePath = os.path.join(sourceDir, file)

- handle = codecs.open(filePath, 'rb', encoding='utf-8')

- lines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines())

- handle.close()

+def process_subscription_file(source_name, sources, target_dir, filename, timeout):

+ source = sources[source_name]

+ lines = source.read_file(filename).splitlines()

- header = ''

+ header = ""

if len(lines) > 0:

- header = lines[0]

- del lines[0]

- if not re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', header, re.I):

- raise Exception('This is not a valid Adblock Plus subscription file.')

+ header = lines.pop(0)

+ if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):

+ raise Exception("This is not a valid Adblock Plus subscription file.")

- lines = resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout)

- seen = set(['checksum', 'version'])

- def checkLine(line):

- if line == '':

+ lines = resolve_includes(source_name, sources, lines, timeout)

+ seen = set(["checksum", "version"])

+ def check_line(line):

+ if line == "":

return False

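- match = re.search(r'^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:', line, re.M | re.I)

+ match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)

if not match: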

return True

key = match.group(1).lower()

if key in seen:

return False

seen.add(key)

return True

- lines = filter(checkLine, lines)

+ lines = filter(check_line, lines)

- writeTPL(os.path.join(targetDir, os.path.splitext(file)[0] + '.tpl'), lines)

+ write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines)

- lines.insert(0, '! Version: %s' % time.strftime('%Y%m%d%H%M', time.gmtime()))

+ lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))

checksum = hashlib.md5()

- checksum.update((header + '\n' + '\n'.join(lines)).encode('utf-8'))

- lines.insert(0, '! Checksum: %s' % re.sub(r'=', '', base64.b64encode(checksum.digest())))

+ checksum.update("\n".join([header] + lines).encode("utf-8"))

+ lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))

lines.insert(0, header)

- saveFile(os.path.join(targetDir, file), '\n'.join(lines))

+ save_file(os.path.join(target_dir, filename), "\n".join(lines))

-def resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout, level=0):

+def resolve_includes(source_name, sources, lines, timeout, level=0):

if level > 5:

- raise Exception('There are too many nested includes, which is probably the result of a circular reference somewhere.')

+ raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")

result = []

for line in lines:

- match = re.search(r'^\s*%include\s+(.*)%\s*$', line)

+ match = re.search(r"^\s*%include\s+(.*)%\s*$", line)

if match:

- file = match.group(1)

- newLines = None

- if re.match(r'^https?://', file):

- result.append('! *** Fetched from: %s ***' % file)

+ filename = match.group(1)

+ newlines = None

+ if re.match(r"^https?://", filename):

+ result.append("! *** Fetched from: %s ***" % filename)

for i in range(3):

try:

- request = urllib2.urlopen(file, None, timeout)

+ request = urllib2.urlopen(filename, None, timeout)

+ data = request.read()

error = None

break

except urllib2.URLError, e:

error = e

time.sleep(5)

if error:

raise error

# We should really get the charset from the headers rather than assuming

# that it is UTF-8. However, some of the Google Code mirrors are

# misconfigured and will return ISO-8859-1 as charset instead of UTF-8.

- newLines = unicode(request.read(), 'utf-8').split('\n')

- newLines = map(lambda l: re.sub(r'[\r\n]', '', l), newLines)

- newLines = filter(lambda l: not re.search(r'^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', l, re.M | re.I), newLines)

- newLines = filter(lambda l: not re.search(r'^\s*!\s*(Redirect|Homepage|Title|Version)\s*:', l, re.M | re.I), newLines)

+ newlines = data.decode("utf-8").splitlines()

+ newlines = filter(lambda l: not re.search(r"^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", l, re.M | re.I), newlines)

+ newlines = filter(lambda l: not re.search(r"^\s*!\s*(Redirect|Homepage|Title|Version)\s*:", l, re.M | re.I), newlines)

else:

- result.append('! *** %s ***' % file)

+ result.append("! *** %s ***" % filename)

- includeSource = sourceName

- if file.find(':') >= 0:

- includeSource, file = file.split(':', 1)

- if not includeSource in sourceDirs:

- raise Exception('Cannot include file from repository "%s", this repository is unknown' % includeSource)

+ include_source = source_name

+ if ":" in filename:

+ include_source, filename = filename.split(":", 1)

+ if not include_source in sources:

+ raise Exception('Cannot include file from repository "%s", this repository is unknown' % include_source)

- parentDir = sourceDirs[includeSource]

- includePath = os.path.join(parentDir, file)

- relPath = os.path.relpath(includePath, parentDir)

- if len(relPath) == 0 or relPath[0] == '.':

- raise Exception('Invalid include "%s", needs to be an HTTP/HTTPS URL or a relative file path' % file)

+ source = sources[include_source]

+ newlines = source.read_file(filename).splitlines()

+ newlines = resolve_includes(include_source, sources, newlines, timeout, level + 1)

- handle = codecs.open(includePath, 'rb', encoding='utf-8')

- newLines = map(lambda l: re.sub(r'[\r\n]', '', l), handle.readlines())

- newLines = resolveIncludes(includeSource, sourceDirs, includePath, newLines, timeout, level + 1)

- handle.close()

- if len(newLines) and re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', newLines[0], re.I):

- del newLines[0]

- result.extend(newLines)

+ if len(newlines) and re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", newlines[0], re.I):

+ del newlines[0]

+ result.extend(newlines)

else:

- if line.find('%timestamp%') >= 0:

+ if line.find("%timestamp%") >= 0:

if level == 0:

- line = line.replace('%timestamp%', time.strftime('%d %b %Y %H:%M UTC', time.gmtime()))

+ line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))

else:

- line = ''

+ line = ""

result.append(line)

return result

-def writeTPL(filePath, lines):

+def write_tpl(path, lines):

result = []

- result.append('msFilterList')

+ result.append("msFilterList")

for line in lines:

- if re.search(r'^!', line):

+ if re.search(r"^\s*!", line):

# This is a comment. Handle "Expires" comment in a special way, keep the rest.

- match = re.search(r'\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', line, re.I)

+ match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)

if match:

interval = int(match.group(1))

if match.group(2):

interval = int(interval / 24)

- result.append(': Expires=%i' % interval)

+ result.append(": Expires=%i" % interval)

else:

- result.append(re.sub(r'!', '#', re.sub(r'--!$', '--#', line)))

- elif line.find('#') >= 0:

+ result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line)))

+ elif line.find("#") >= 0:

# Element hiding rules are not supported in MSIE, drop them

pass

else:

# We have a blocking or exception rule, try to convert it

- origLine = line

+ origline = line

- isException = False

- if line[0:2] == '@@':

- isException = True

+ is_exception = False

+ if line.startswith("@@"):

+ is_exception = True

line = line[2:]

- hasUnsupportedOptions = False

- requiresScript = False

- match = re.search(r'^(.*?)\$(.*)', line)

+ has_unsupported = False

+ requires_script = False

+ match = re.search(r"^(.*?)\$(.*)", line)

if match:

# This rule has options, check whether any of them are important

line = match.group(1)

- options = match.group(2).replace('_', '-').lower().split(',')

+ options = match.group(2).replace("_", "-").lower().split(",")

# Remove first-party only exceptions, we will allow an ad server everywhere otherwise

- if isException and '~third-party' in options:

- hasUnsupportedOptions = True

+ if is_exception and "~third-party" in options:

+ has_unsupported = True

# A number of options are not supported in MSIE but can be safely ignored, remove them

- options = filter(lambda o: not o in ('', 'third-party', '~third-party', 'match-case', '~match-case', '~other', '~donottrack'), options)

+ options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options)

# Also ignore domain negation of whitelists

- if isException:

- options = filter(lambda o: not o.startswith('domain=~'), options)

+ if is_exception:

+ options = filter(lambda o: not o.startswith("domain=~"), options)

- unsupportedOptions = filter(lambda o: o in ('other', 'elemhide'), options)

- if unsupportedOptions and len(unsupportedOptions) == len(options):

+ unsupported = filter(lambda o: o in ("other", "elemhide"), options)

+ if unsupported and len(unsupported) == len(options):

# The rule only applies to types that are not supported in MSIE

- hasUnsupportedOptions = True

- elif 'donottrack' in options:

+ has_unsupported = True

+ elif "donottrack" in options:

# Do-Not-Track rules have to be removed even if $donottrack is combined with other options

- hasUnsupportedOptions = True

- elif 'script' in options and len(options) == len(unsupportedOptions) + 1:

+ has_unsupported = True

+ elif "script" in options and len(options) == len(unsupported) + 1:

# Mark rules that only apply to scripts for approximate conversion

- requiresScript = True

+ requires_script = True

elif len(options) > 0:

# The rule has further options that aren't available in TPLs. For

# exception rules that aren't specific to a domain we ignore all

# remaining options to avoid potential false positives. Other rules

# simply aren't included in the TPL file.

- if isException:

- hasUnsupportedOptions = any([o.startswith('domain=') for o in options])

+ if is_exception:

+ has_unsupported = any([o.startswith("domain=") for o in options])

else:

- hasUnsupportedOptions = True

+ has_unsupported = True

- if hasUnsupportedOptions:

+ if has_unsupported:

# Do not include filters with unsupported options

- result.append('# ' + origLine)

+ result.append("# " + origline)

else:

- line = line.replace('^', '/') # Assume that separator placeholders mean slashes

+ line = line.replace("^", "/") # Assume that separator placeholders mean slashes

# Try to extract domain info

domain = None

- match = re.search(r'^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)', line)

+ match = re.search(r"^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)", line)

if match:

domain = match.group(2)

line = match.group(4)

else:

# No domain info, remove anchors at the rule start

- line = re.sub(r'^\|\|', 'http://', line)

- line = re.sub(r'^\|', '', line)

+ line = re.sub(r"^\|\|", "http://", line)

+ line = re.sub(r"^\|", "", line)

# Remove anchors at the rule end

- line = re.sub(r'\|$', '', line)

+ line = re.sub(r"\|$", "", line)

# Remove unnecessary asterisks at the ends of lines

- line = re.sub(r'\*$', '', line)

+ line = re.sub(r"\*$", "", line)

# Emulate $script by appending *.js to the rule

- if requiresScript:

- line += '*.js'

- if line.startswith('/*'):

+ if requires_script:

+ line += "*.js"

+ if line.startswith("/*"):

line = line[2:]

if domain:

- line = '%sd %s %s' % ('+' if isException else '-', domain, line)

- line = re.sub(r'\s+/$', '', line)

+ line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)

+ line = re.sub(r"\s+/$", "", line)

result.append(line)

- elif isException:

+ elif is_exception:

# Exception rules without domains are unsupported

- result.append('# ' + origLine)

+ result.append("# " + origline)

else:

- result.append('- ' + line)

- saveFile(filePath, '\n'.join(result) + '\n')

+ result.append("- " + line)

+ save_file(path, "\n".join(result) + "\n")

+class FileSource:

+ def __init__(self, dir):

+ self._dir = dir

+ if os.path.exists(os.path.join(dir, ".hg")):

+ # This is a Mercurial repository, try updating

+ subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])

+ def get_path(self, filename):

+ return os.path.join(self._dir, *filename.split("/"))

+ def read_file(self, filename):

+ path = self.get_path(filename)

+ if os.path.relpath(path, self._dir).startswith("."):

+ raise Exception("Attempt to access a file outside the repository")

+ with codecs.open(path, "rb", encoding="utf-8") as handle:

+ return handle.read()

+ def list_top_level_files(self):

+ for filename in os.listdir(self._dir):

+ path = os.path.join(self._dir, filename)

+ if os.path.isfile(path):

+ yield filename

def usage():

- print '''Usage: %s [source_dir] [output_dir]

+ print """Usage: %s source_name=source_dir ... [output_dir]

Options:

-h --help Print this message and exit

-t seconds --timeout=seconds Timeout when fetching remote subscriptions

-''' % os.path.basename(sys.argv[0])

+""" % os.path.basename(sys.argv[0])

-if __name__ == '__main__':

+if __name__ == "__main__":

try:

- opts, args = getopt(sys.argv[1:], 'ht:', ['help', 'timeout='])

+ opts, args = getopt(sys.argv[1:], "ht:", ["help", "timeout="])

except GetoptError, e:

print str(e)

usage()

sys.exit(2)

- sourceDir, targetDir = '.', 'subscriptions'

- if len(args) >= 1:

- sourceDir = args[0]

- if len(args) >= 2:

- targetDir = args[1]

+ target_dir = "subscriptions"

+ sources = {}

+ for arg in args:

+ if "=" in arg:

+ source_name, source_dir = arg.split("=", 1)

+ sources[source_name] = FileSource(source_dir)

+ else:

+ target_dir = arg

+ if not sources:

+ sources[""] = FileSource(".")

timeout = 30

for option, value in opts:

- if option in ('-h', '--help'):

+ if option in ("-h", "--help"):

usage()

sys.exit()

- elif option in ('-t', '--timeout'):

+ elif option in ("-t", "--timeout"):

timeout = int(value)

- if os.path.exists(os.path.join(sourceDir, '.hg')):

- # Our source is a Mercurial repository, try updating

- subprocess.check_call(['hg', '-q', '-R', sourceDir, 'pull', '--update'])

- combineSubscriptions(sourceDir, targetDir, timeout)

+ combine_subscriptions(sources, target_dir, timeout)

« no previous file with comments | « sitescripts/subscriptions/bin/updateSubscriptionDownloads.py ('k') | no next file » | no next file with comments »