Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/subscriptions/combineSubscriptions.py

Issue 28037010: Improved generation of filter subscription files (Closed)
Left Patch Set: Fixed review comments Created Nov. 8, 2013, 3:05 p.m.
Right Patch Set: Different approach to atomic updates Created Nov. 11, 2013, 2:52 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « sitescripts/subscriptions/bin/updateSubscriptionDownloads.py ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 3
4 # This file is part of the Adblock Plus web scripts, 4 # This file is part of the Adblock Plus web scripts,
5 # Copyright (C) 2006-2013 Eyeo GmbH 5 # Copyright (C) 2006-2013 Eyeo GmbH
6 # 6 #
7 # Adblock Plus is free software: you can redistribute it and/or modify 7 # Adblock Plus is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License version 3 as 8 # it under the terms of the GNU General Public License version 3 as
9 # published by the Free Software Foundation. 9 # published by the Free Software Foundation.
10 # 10 #
11 # Adblock Plus is distributed in the hope that it will be useful, 11 # Adblock Plus is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details. 14 # GNU General Public License for more details.
15 # 15 #
16 # You should have received a copy of the GNU General Public License 16 # You should have received a copy of the GNU General Public License
17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
18 18
19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64 19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile
20 from getopt import getopt, GetoptError 20 from getopt import getopt, GetoptError
21 21
22 accepted_extensions = set([".txt"]) 22 accepted_extensions = set([".txt"])
23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) 23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])
24 verbatim = set(["COPYING"]) 24 verbatim = set(["COPYING"])
25 25
26 def combine_subscriptions(sources, target_dir, timeout=30): 26 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None):
27 if not os.path.exists(target_dir): 27 if not os.path.exists(target_dir):
28 os.makedirs(target_dir, 0755) 28 os.makedirs(target_dir, 0755)
29
30 def save_file(filename, data):
31 handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False)
32 handle.write(data.encode("utf-8"))
33 handle.close()
34
35 try:
36 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name])
37 except:
38 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name
39
40 path = os.path.join(target_dir, filename)
41 os.rename(handle.name, path)
42 os.rename(handle.name + ".gz", path + ".gz")
29 43
30 known = set() 44 known = set()
31 for source_name, source in sources.iteritems(): 45 for source_name, source in sources.iteritems():
32 for filename in source.list_top_level_files(): 46 for filename in source.list_top_level_files():
33 if filename in ignore or filename.startswith("."): 47 if filename in ignore or filename.startswith("."):
34 continue 48 continue
35 if filename in verbatim: 49 if filename in verbatim:
36 process_verbatim_file(source, target_dir, filename) 50 process_verbatim_file(source, save_file, filename)
37 elif not os.path.splitext(filename)[1] in accepted_extensions: 51 elif not os.path.splitext(filename)[1] in accepted_extensions:
38 continue 52 continue
39 else: 53 else:
40 try: 54 try:
41 process_subscription_file(source_name, sources, target_dir, filename, timeout) 55 process_subscription_file(source_name, sources, save_file, filename, timeout)
42 except: 56 except:
43 print >>sys.stderr, 'Error processing subscription file "%s"' % filename 57 print >>sys.stderr, 'Error processing subscription file "%s"' % filename
44 traceback.print_exc() 58 traceback.print_exc()
45 print >>sys.stderr 59 print >>sys.stderr
46 known.add(os.path.splitext(filename)[0] + ".tpl") 60 known.add(os.path.splitext(filename)[0] + ".tpl")
47 known.add(os.path.splitext(filename)[0] + ".tpl.gz") 61 known.add(os.path.splitext(filename)[0] + ".tpl.gz")
48 known.add(filename) 62 known.add(filename)
49 known.add(filename + ".gz") 63 known.add(filename + ".gz")
50 64
51 for filename in os.listdir(target_dir): 65 for filename in os.listdir(target_dir):
52 if filename.startswith("."): 66 if filename.startswith("."):
53 continue 67 continue
54 if not filename in known: 68 if not filename in known:
55 os.remove(os.path.join(target_dir, filename)) 69 os.remove(os.path.join(target_dir, filename))
56 70
57 def save_file(path, data): 71 def process_verbatim_file(source, save_file, filename):
58 handle = codecs.open(path, "wb", encoding="utf-8") 72 save_file(filename, source.read_file(filename))
59 handle.write(data) 73
60 handle.close() 74 def process_subscription_file(source_name, sources, save_file, filename, timeout):
61 try:
62 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path])
63 except:
64 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path
65
66 def process_verbatim_file(source, target_dir, filename):
67 save_file(os.path.join(target_dir, filename), source.read_file(filename))
68
69 def process_subscription_file(source_name, sources, target_dir, filename, timeout):
70 source = sources[source_name] 75 source = sources[source_name]
71 lines = source.read_file(filename).splitlines() 76 lines = source.read_file(filename).splitlines()
72 77
73 header = "" 78 header = ""
74 if len(lines) > 0: 79 if len(lines) > 0:
75 header = lines.pop(0) 80 header = lines.pop(0)
76 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): 81 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):
77 raise Exception("This is not a valid Adblock Plus subscription file.") 82 raise Exception("This is not a valid Adblock Plus subscription file.")
78 83
79 lines = resolve_includes(source_name, sources, lines, timeout) 84 lines = resolve_includes(source_name, sources, lines, timeout)
80 seen = set(["checksum", "version"]) 85 seen = set(["checksum", "version"])
81 def check_line(line): 86 def check_line(line):
82 if line == "": 87 if line == "":
83 return False 88 return False
84 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I) 89 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)
85 if not match: 90 if not match:
86 return True 91 return True
87 key = match.group(1).lower() 92 key = match.group(1).lower()
88 if key in seen: 93 if key in seen:
89 return False 94 return False
90 seen.add(key) 95 seen.add(key)
91 return True 96 return True
92 lines = filter(check_line, lines) 97 lines = filter(check_line, lines)
93 98
94 write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines) 99 write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines)
95 100
96 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) 101 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))
97 102
98 checksum = hashlib.md5() 103 checksum = hashlib.md5()
99 checksum.update("\n".join([header] + lines).encode("utf-8")) 104 checksum.update("\n".join([header] + lines).encode("utf-8"))
100 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("=")) 105 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))
101 lines.insert(0, header) 106 lines.insert(0, header)
102 save_file(os.path.join(target_dir, filename), "\n".join(lines)) 107 save_file(filename, "\n".join(lines))
103 108
104 def resolve_includes(source_name, sources, lines, timeout, level=0): 109 def resolve_includes(source_name, sources, lines, timeout, level=0):
105 if level > 5: 110 if level > 5:
106 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") 111 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")
107 112
108 result = [] 113 result = []
109 for line in lines: 114 for line in lines:
110 match = re.search(r"^\s*%include\s+(.*)%\s*$", line) 115 match = re.search(r"^\s*%include\s+(.*)%\s*$", line)
111 if match: 116 if match:
112 filename = match.group(1) 117 filename = match.group(1)
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
150 result.extend(newlines) 155 result.extend(newlines)
151 else: 156 else:
152 if line.find("%timestamp%") >= 0: 157 if line.find("%timestamp%") >= 0:
153 if level == 0: 158 if level == 0:
154 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) 159 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))
155 else: 160 else:
156 line = "" 161 line = ""
157 result.append(line) 162 result.append(line)
158 return result 163 return result
159 164
160 def write_tpl(path, lines): 165 def write_tpl(save_file, filename, lines):
161 result = [] 166 result = []
162 result.append("msFilterList") 167 result.append("msFilterList")
163 for line in lines: 168 for line in lines:
164 if re.search(r"^\s*!", line): 169 if re.search(r"^\s*!", line):
165 # This is a comment. Handle "Expires" comment in a special way, keep the rest. 170 # This is a comment. Handle "Expires" comment in a special way, keep the rest.
166 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I) 171 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)
167 if match: 172 if match:
168 interval = int(match.group(1)) 173 interval = int(match.group(1))
169 if match.group(2): 174 if match.group(2):
170 interval = int(interval / 24) 175 interval = int(interval / 24)
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
249 line = line[2:] 254 line = line[2:]
250 if domain: 255 if domain:
251 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line) 256 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)
252 line = re.sub(r"\s+/$", "", line) 257 line = re.sub(r"\s+/$", "", line)
253 result.append(line) 258 result.append(line)
254 elif is_exception: 259 elif is_exception:
255 # Exception rules without domains are unsupported 260 # Exception rules without domains are unsupported
256 result.append("# " + origline) 261 result.append("# " + origline)
257 else: 262 else:
258 result.append("- " + line) 263 result.append("- " + line)
259 save_file(path, "\n".join(result) + "\n") 264 save_file(filename, "\n".join(result) + "\n")
260 265
261 class FileSource: 266 class FileSource:
262 def __init__(self, dir): 267 def __init__(self, dir):
263 self._dir = dir 268 self._dir = dir
264 if os.path.exists(os.path.join(dir, ".hg")): 269 if os.path.exists(os.path.join(dir, ".hg")):
265 # This is a Mercurial repository, try updating 270 # This is a Mercurial repository, try updating
266 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) 271 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])
267 272
268 def get_path(self, filename): 273 def get_path(self, filename):
269 return os.path.join(self._dir, *filename.split("/")) 274 return os.path.join(self._dir, *filename.split("/"))
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
310 315
311 timeout = 30 316 timeout = 30
312 for option, value in opts: 317 for option, value in opts:
313 if option in ("-h", "--help"): 318 if option in ("-h", "--help"):
314 usage() 319 usage()
315 sys.exit() 320 sys.exit()
316 elif option in ("-t", "--timeout"): 321 elif option in ("-t", "--timeout"):
317 timeout = int(value) 322 timeout = int(value)
318 323
319 combine_subscriptions(sources, target_dir, timeout) 324 combine_subscriptions(sources, target_dir, timeout)
LEFTRIGHT

Powered by Google App Engine
This is Rietveld