Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/subscriptions/combineSubscriptions.py

Issue 28037010: Improved generation of filter subscription files (Closed)
Left Patch Set: Created Nov. 6, 2013, 2:27 p.m.
Right Patch Set: Different approach to atomic updates Created Nov. 11, 2013, 2:52 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « sitescripts/subscriptions/bin/updateSubscriptionDownloads.py ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 3
4 # This file is part of the Adblock Plus web scripts, 4 # This file is part of the Adblock Plus web scripts,
5 # Copyright (C) 2006-2013 Eyeo GmbH 5 # Copyright (C) 2006-2013 Eyeo GmbH
6 # 6 #
7 # Adblock Plus is free software: you can redistribute it and/or modify 7 # Adblock Plus is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License version 3 as 8 # it under the terms of the GNU General Public License version 3 as
9 # published by the Free Software Foundation. 9 # published by the Free Software Foundation.
10 # 10 #
11 # Adblock Plus is distributed in the hope that it will be useful, 11 # Adblock Plus is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details. 14 # GNU General Public License for more details.
15 # 15 #
16 # You should have received a copy of the GNU General Public License 16 # You should have received a copy of the GNU General Public License
17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
18 18
19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64 19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile
20 from getopt import getopt, GetoptError 20 from getopt import getopt, GetoptError
21 21
22 accepted_extensions = set([".txt"]) 22 accepted_extensions = set([".txt"])
23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) 23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])
24 verbatim = set(["COPYING"]) 24 verbatim = set(["COPYING"])
25 25
26 def combine_subscriptions(sources, target_dir, timeout=30): 26 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None):
27 global accepted_extensions, ignore, verbatim
Sebastian Noack 2013/11/06 15:56:22 The global keyword is unneeded here. I know it was
28
29 if not os.path.exists(target_dir): 27 if not os.path.exists(target_dir):
30 os.makedirs(target_dir, 0755) 28 os.makedirs(target_dir, 0755)
29
30 def save_file(filename, data):
31 handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False)
32 handle.write(data.encode("utf-8"))
33 handle.close()
34
35 try:
36 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name])
37 except:
38 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name
39
40 path = os.path.join(target_dir, filename)
41 os.rename(handle.name, path)
42 os.rename(handle.name + ".gz", path + ".gz")
31 43
32 known = set() 44 known = set()
33 for source_name, source in sources.iteritems(): 45 for source_name, source in sources.iteritems():
34 for filename in source.list_top_level_files(): 46 for filename in source.list_top_level_files():
35 if filename in ignore or filename.startswith("."): 47 if filename in ignore or filename.startswith("."):
36 continue 48 continue
37 if filename in verbatim: 49 if filename in verbatim:
38 process_verbatim_file(source, target_dir, filename) 50 process_verbatim_file(source, save_file, filename)
39 elif not os.path.splitext(filename)[1] in accepted_extensions: 51 elif not os.path.splitext(filename)[1] in accepted_extensions:
40 continue 52 continue
41 else: 53 else:
42 try: 54 try:
43 process_subscription_file(source_name, sources, target_dir, filename, timeout) 55 process_subscription_file(source_name, sources, save_file, filename, timeout)
44 except: 56 except:
45 print >>sys.stderr, 'Error processing subscription file "%s"' % filename 57 print >>sys.stderr, 'Error processing subscription file "%s"' % filename
46 traceback.print_exc() 58 traceback.print_exc()
47 print >>sys.stderr 59 print >>sys.stderr
48 known.add(os.path.splitext(filename)[0] + ".tpl") 60 known.add(os.path.splitext(filename)[0] + ".tpl")
49 known.add(os.path.splitext(filename)[0] + ".tpl.gz") 61 known.add(os.path.splitext(filename)[0] + ".tpl.gz")
50 known.add(filename) 62 known.add(filename)
51 known.add(filename + ".gz") 63 known.add(filename + ".gz")
52 64
53 for filename in os.listdir(target_dir): 65 for filename in os.listdir(target_dir):
54 if filename.startswith("."): 66 if filename.startswith("."):
55 continue 67 continue
56 if not filename in known: 68 if not filename in known:
57 os.remove(os.path.join(target_dir, filename)) 69 os.remove(os.path.join(target_dir, filename))
58 70
59 def save_file(path, data): 71 def process_verbatim_file(source, save_file, filename):
60 handle = codecs.open(path, "wb", encoding="utf-8") 72 save_file(filename, source.read_file(filename))
61 handle.write(data) 73
62 handle.close() 74 def process_subscription_file(source_name, sources, save_file, filename, timeout):
63 try:
64 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path])
65 except:
66 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path
67
68 def process_verbatim_file(source, target_dir, filename):
69 save_file(os.path.join(target_dir, filename), source.read_file(filename))
70
71 def process_subscription_file(source_name, sources, target_dir, filename, timeout):
72 source = sources[source_name] 75 source = sources[source_name]
73 lines = source.read_file(filename).splitlines() 76 lines = source.read_file(filename).splitlines()
74 77
75 header = "" 78 header = ""
76 if len(lines) > 0: 79 if len(lines) > 0:
77 header = lines.pop(0) 80 header = lines.pop(0)
78 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): 81 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):
79 raise Exception("This is not a valid Adblock Plus subscription file.") 82 raise Exception("This is not a valid Adblock Plus subscription file.")
80 83
81 lines = resolve_includes(source_name, sources, lines, timeout) 84 lines = resolve_includes(source_name, sources, lines, timeout)
82 seen = set(["checksum", "version"]) 85 seen = set(["checksum", "version"])
83 def check_line(line): 86 def check_line(line):
84 if line == "": 87 if line == "":
85 return False 88 return False
86 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I) 89 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)
87 if not match: 90 if not match:
88 return True 91 return True
89 key = match.group(1).lower() 92 key = match.group(1).lower()
90 if key in seen: 93 if key in seen:
91 return False 94 return False
92 seen.add(key) 95 seen.add(key)
93 return True 96 return True
94 lines = filter(check_line, lines) 97 lines = filter(check_line, lines)
95 98
96 write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines) 99 write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines)
97 100
98 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) 101 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))
99 102
100 checksum = hashlib.md5() 103 checksum = hashlib.md5()
101 checksum.update("\n".join([header] + lines).encode("utf-8")) 104 checksum.update("\n".join([header] + lines).encode("utf-8"))
102 lines.insert(0, "! Checksum: %s" % re.sub(r"=", "", base64.b64encode(checksum.digest()))) 105 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))
Sebastian Noack 2013/11/06 15:56:22 You don't need a regex to strip a given character.
Wladimir Palant 2013/11/08 15:08:07 It's way too late to change the specification of c
103 lines.insert(0, header) 106 lines.insert(0, header)
104 save_file(os.path.join(target_dir, filename), "\n".join(lines)) 107 save_file(filename, "\n".join(lines))
105 108
106 def resolve_includes(source_name, sources, lines, timeout, level=0): 109 def resolve_includes(source_name, sources, lines, timeout, level=0):
107 if level > 5: 110 if level > 5:
108 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") 111 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")
109 112
110 result = [] 113 result = []
111 for line in lines: 114 for line in lines:
112 match = re.search(r"^\s*%include\s+(.*)%\s*$", line) 115 match = re.search(r"^\s*%include\s+(.*)%\s*$", line)
113 if match: 116 if match:
114 filename = match.group(1) 117 filename = match.group(1)
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
152 result.extend(newlines) 155 result.extend(newlines)
153 else: 156 else:
154 if line.find("%timestamp%") >= 0: 157 if line.find("%timestamp%") >= 0:
155 if level == 0: 158 if level == 0:
156 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) 159 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))
157 else: 160 else:
158 line = "" 161 line = ""
159 result.append(line) 162 result.append(line)
160 return result 163 return result
161 164
162 def write_tpl(path, lines): 165 def write_tpl(save_file, filename, lines):
163 result = [] 166 result = []
164 result.append("msFilterList") 167 result.append("msFilterList")
165 for line in lines: 168 for line in lines:
166 if re.search(r"^\s*!", line): 169 if re.search(r"^\s*!", line):
167 # This is a comment. Handle "Expires" comment in a special way, keep the rest. 170 # This is a comment. Handle "Expires" comment in a special way, keep the rest.
168 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I) 171 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)
169 if match: 172 if match:
170 interval = int(match.group(1)) 173 interval = int(match.group(1))
171 if match.group(2): 174 if match.group(2):
172 interval = int(interval / 24) 175 interval = int(interval / 24)
173 result.append(": Expires=%i" % interval) 176 result.append(": Expires=%i" % interval)
174 else: 177 else:
175 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line))) 178 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line)))
176 elif line.find("#") >= 0: 179 elif line.find("#") >= 0:
177 # Element hiding rules are not supported in MSIE, drop them 180 # Element hiding rules are not supported in MSIE, drop them
178 pass 181 pass
179 else: 182 else:
180 # We have a blocking or exception rule, try to convert it 183 # We have a blocking or exception rule, try to convert it
181 origline = line 184 origline = line
182 185
183 isexception = False 186 is_exception = False
Sebastian Noack 2013/11/06 15:56:22 Apparently you don't like underscores, but "is_exc
Wladimir Palant 2013/11/08 15:08:07 As you wish...
184 if line.startswith("@@"): 187 if line.startswith("@@"):
185 isexception = True 188 is_exception = True
186 line = line[2:] 189 line = line[2:]
187 190
188 has_unsupported = False 191 has_unsupported = False
189 requires_script = False 192 requires_script = False
190 match = re.search(r"^(.*?)\$(.*)", line) 193 match = re.search(r"^(.*?)\$(.*)", line)
191 if match: 194 if match:
192 # This rule has options, check whether any of them are important 195 # This rule has options, check whether any of them are important
193 line = match.group(1) 196 line = match.group(1)
194 options = match.group(2).replace("_", "-").lower().split(",") 197 options = match.group(2).replace("_", "-").lower().split(",")
195 198
196 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise 199 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise
197 if isexception and "~third-party" in options: 200 if is_exception and "~third-party" in options:
198 has_unsupported = True 201 has_unsupported = True
199 202
200 # A number of options are not supported in MSIE but can be safely ignored, remove them 203 # A number of options are not supported in MSIE but can be safely ignored, remove them
201 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options) 204 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options)
202 205
203 # Also ignore domain negation of whitelists 206 # Also ignore domain negation of whitelists
204 if isexception: 207 if is_exception:
205 options = filter(lambda o: not o.startswith("domain=~"), options) 208 options = filter(lambda o: not o.startswith("domain=~"), options)
206 209
207 unsupported = filter(lambda o: o in ("other", "elemhide"), options) 210 unsupported = filter(lambda o: o in ("other", "elemhide"), options)
208 if unsupported and len(unsupported) == len(options): 211 if unsupported and len(unsupported) == len(options):
209 # The rule only applies to types that are not supported in MSIE 212 # The rule only applies to types that are not supported in MSIE
210 has_unsupported = True 213 has_unsupported = True
211 elif "donottrack" in options: 214 elif "donottrack" in options:
212 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options 215 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options
213 has_unsupported = True 216 has_unsupported = True
214 elif "script" in options and len(options) == len(unsupported) + 1: 217 elif "script" in options and len(options) == len(unsupported) + 1:
215 # Mark rules that only apply to scripts for approximate conversion 218 # Mark rules that only apply to scripts for approximate conversion
216 requires_script = True 219 requires_script = True
217 elif len(options) > 0: 220 elif len(options) > 0:
218 # The rule has further options that aren't available in TPLs. For 221 # The rule has further options that aren't available in TPLs. For
219 # exception rules that aren't specific to a domain we ignore all 222 # exception rules that aren't specific to a domain we ignore all
220 # remaining options to avoid potential false positives. Other rules 223 # remaining options to avoid potential false positives. Other rules
221 # simply aren't included in the TPL file. 224 # simply aren't included in the TPL file.
222 if isexception: 225 if is_exception:
223 has_unsupported = any([o.startswith("domain=") for o in options]) 226 has_unsupported = any([o.startswith("domain=") for o in options])
224 else: 227 else:
225 has_unsupported = True 228 has_unsupported = True
226 229
227 if has_unsupported: 230 if has_unsupported:
228 # Do not include filters with unsupported options 231 # Do not include filters with unsupported options
229 result.append("# " + origline) 232 result.append("# " + origline)
230 else: 233 else:
231 line = line.replace("^", "/") # Assume that separator placeholders mean slashes 234 line = line.replace("^", "/") # Assume that separator placeholders mean slashes
232 235
(...skipping 10 matching lines...) Expand all
243 # Remove anchors at the rule end 246 # Remove anchors at the rule end
244 line = re.sub(r"\|$", "", line) 247 line = re.sub(r"\|$", "", line)
245 # Remove unnecessary asterisks at the ends of lines 248 # Remove unnecessary asterisks at the ends of lines
246 line = re.sub(r"\*$", "", line) 249 line = re.sub(r"\*$", "", line)
247 # Emulate $script by appending *.js to the rule 250 # Emulate $script by appending *.js to the rule
248 if requires_script: 251 if requires_script:
249 line += "*.js" 252 line += "*.js"
250 if line.startswith("/*"): 253 if line.startswith("/*"):
251 line = line[2:] 254 line = line[2:]
252 if domain: 255 if domain:
253 line = "%sd %s %s" % ("+" if isexception else "-", domain, line) 256 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)
254 line = re.sub(r"\s+/$", "", line) 257 line = re.sub(r"\s+/$", "", line)
255 result.append(line) 258 result.append(line)
256 elif isexception: 259 elif is_exception:
257 # Exception rules without domains are unsupported 260 # Exception rules without domains are unsupported
258 result.append("# " + origline) 261 result.append("# " + origline)
259 else: 262 else:
260 result.append("- " + line) 263 result.append("- " + line)
261 save_file(path, "\n".join(result) + "\n") 264 save_file(filename, "\n".join(result) + "\n")
262 265
263 class FileSource: 266 class FileSource:
264 def __init__(self, dir): 267 def __init__(self, dir):
265 self._dir = dir 268 self._dir = dir
266 if os.path.exists(os.path.join(dir, ".hg")): 269 if os.path.exists(os.path.join(dir, ".hg")):
267 # This is a Mercurial repository, try updating 270 # This is a Mercurial repository, try updating
268 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) 271 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])
269 272
270 def get_path(self, filename): 273 def get_path(self, filename):
271 return os.path.join(self._dir, *filename.split("/")) 274 return os.path.join(self._dir, *filename.split("/"))
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
312 315
313 timeout = 30 316 timeout = 30
314 for option, value in opts: 317 for option, value in opts:
315 if option in ("-h", "--help"): 318 if option in ("-h", "--help"):
316 usage() 319 usage()
317 sys.exit() 320 sys.exit()
318 elif option in ("-t", "--timeout"): 321 elif option in ("-t", "--timeout"):
319 timeout = int(value) 322 timeout = int(value)
320 323
321 combine_subscriptions(sources, target_dir, timeout) 324 combine_subscriptions(sources, target_dir, timeout)
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld