Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding: utf-8 | 2 # coding: utf-8 |
3 | 3 |
4 # This file is part of the Adblock Plus web scripts, | 4 # This file is part of the Adblock Plus web scripts, |
5 # Copyright (C) 2006-2013 Eyeo GmbH | 5 # Copyright (C) 2006-2013 Eyeo GmbH |
6 # | 6 # |
7 # Adblock Plus is free software: you can redistribute it and/or modify | 7 # Adblock Plus is free software: you can redistribute it and/or modify |
8 # it under the terms of the GNU General Public License version 3 as | 8 # it under the terms of the GNU General Public License version 3 as |
9 # published by the Free Software Foundation. | 9 # published by the Free Software Foundation. |
10 # | 10 # |
11 # Adblock Plus is distributed in the hope that it will be useful, | 11 # Adblock Plus is distributed in the hope that it will be useful, |
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 # GNU General Public License for more details. | 14 # GNU General Public License for more details. |
15 # | 15 # |
16 # You should have received a copy of the GNU General Public License | 16 # You should have received a copy of the GNU General Public License |
17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
18 | 18 |
19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64 | 19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile |
20 from getopt import getopt, GetoptError | 20 from getopt import getopt, GetoptError |
21 | 21 |
22 accepted_extensions = set([".txt"]) | 22 accepted_extensions = set([".txt"]) |
23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) | 23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) |
24 verbatim = set(["COPYING"]) | 24 verbatim = set(["COPYING"]) |
25 | 25 |
26 def combine_subscriptions(sources, target_dir, timeout=30): | 26 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None): |
27 global accepted_extensions, ignore, verbatim | |
Sebastian Noack
2013/11/06 15:56:22
The global keyword is unneeded here. I know it was
| |
28 | |
29 if not os.path.exists(target_dir): | 27 if not os.path.exists(target_dir): |
30 os.makedirs(target_dir, 0755) | 28 os.makedirs(target_dir, 0755) |
29 | |
30 def save_file(filename, data): | |
31 handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False) | |
32 handle.write(data.encode("utf-8")) | |
33 handle.close() | |
34 | |
35 try: | |
36 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name]) | |
37 except: | |
38 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name | |
39 | |
40 path = os.path.join(target_dir, filename) | |
41 os.rename(handle.name, path) | |
42 os.rename(handle.name + ".gz", path + ".gz") | |
31 | 43 |
32 known = set() | 44 known = set() |
33 for source_name, source in sources.iteritems(): | 45 for source_name, source in sources.iteritems(): |
34 for filename in source.list_top_level_files(): | 46 for filename in source.list_top_level_files(): |
35 if filename in ignore or filename.startswith("."): | 47 if filename in ignore or filename.startswith("."): |
36 continue | 48 continue |
37 if filename in verbatim: | 49 if filename in verbatim: |
38 process_verbatim_file(source, target_dir, filename) | 50 process_verbatim_file(source, save_file, filename) |
39 elif not os.path.splitext(filename)[1] in accepted_extensions: | 51 elif not os.path.splitext(filename)[1] in accepted_extensions: |
40 continue | 52 continue |
41 else: | 53 else: |
42 try: | 54 try: |
43 process_subscription_file(source_name, sources, target_dir, filename, timeout) | 55 process_subscription_file(source_name, sources, save_file, filename, timeout) |
44 except: | 56 except: |
45 print >>sys.stderr, 'Error processing subscription file "%s"' % filename | 57 print >>sys.stderr, 'Error processing subscription file "%s"' % filename |
46 traceback.print_exc() | 58 traceback.print_exc() |
47 print >>sys.stderr | 59 print >>sys.stderr |
48 known.add(os.path.splitext(filename)[0] + ".tpl") | 60 known.add(os.path.splitext(filename)[0] + ".tpl") |
49 known.add(os.path.splitext(filename)[0] + ".tpl.gz") | 61 known.add(os.path.splitext(filename)[0] + ".tpl.gz") |
50 known.add(filename) | 62 known.add(filename) |
51 known.add(filename + ".gz") | 63 known.add(filename + ".gz") |
52 | 64 |
53 for filename in os.listdir(target_dir): | 65 for filename in os.listdir(target_dir): |
54 if filename.startswith("."): | 66 if filename.startswith("."): |
55 continue | 67 continue |
56 if not filename in known: | 68 if not filename in known: |
57 os.remove(os.path.join(target_dir, filename)) | 69 os.remove(os.path.join(target_dir, filename)) |
58 | 70 |
59 def save_file(path, data): | 71 def process_verbatim_file(source, save_file, filename): |
60 handle = codecs.open(path, "wb", encoding="utf-8") | 72 save_file(filename, source.read_file(filename)) |
61 handle.write(data) | 73 |
62 handle.close() | 74 def process_subscription_file(source_name, sources, save_file, filename, timeout): |
63 try: | |
64 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path]) | |
65 except: | |
66 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path | |
67 | |
68 def process_verbatim_file(source, target_dir, filename): | |
69 save_file(os.path.join(target_dir, filename), source.read_file(filename)) | |
70 | |
71 def process_subscription_file(source_name, sources, target_dir, filename, timeout): | |
72 source = sources[source_name] | 75 source = sources[source_name] |
73 lines = source.read_file(filename).splitlines() | 76 lines = source.read_file(filename).splitlines() |
74 | 77 |
75 header = "" | 78 header = "" |
76 if len(lines) > 0: | 79 if len(lines) > 0: |
77 header = lines.pop(0) | 80 header = lines.pop(0) |
78 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): | 81 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): |
79 raise Exception("This is not a valid Adblock Plus subscription file.") | 82 raise Exception("This is not a valid Adblock Plus subscription file.") |
80 | 83 |
81 lines = resolve_includes(source_name, sources, lines, timeout) | 84 lines = resolve_includes(source_name, sources, lines, timeout) |
82 seen = set(["checksum", "version"]) | 85 seen = set(["checksum", "version"]) |
83 def check_line(line): | 86 def check_line(line): |
84 if line == "": | 87 if line == "": |
85 return False | 88 return False |
86 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I) | 89 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I) |
87 if not match: | 90 if not match: |
88 return True | 91 return True |
89 key = match.group(1).lower() | 92 key = match.group(1).lower() |
90 if key in seen: | 93 if key in seen: |
91 return False | 94 return False |
92 seen.add(key) | 95 seen.add(key) |
93 return True | 96 return True |
94 lines = filter(check_line, lines) | 97 lines = filter(check_line, lines) |
95 | 98 |
96 write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines) | 99 write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines) |
97 | 100 |
98 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) | 101 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) |
99 | 102 |
100 checksum = hashlib.md5() | 103 checksum = hashlib.md5() |
101 checksum.update("\n".join([header] + lines).encode("utf-8")) | 104 checksum.update("\n".join([header] + lines).encode("utf-8")) |
102 lines.insert(0, "! Checksum: %s" % re.sub(r"=", "", base64.b64encode(checksum.digest()))) | 105 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("=")) |
Sebastian Noack
2013/11/06 15:56:22
You don't need a regex to strip a given character.
Wladimir Palant
2013/11/08 15:08:07
It's way too late to change the specification of c
| |
103 lines.insert(0, header) | 106 lines.insert(0, header) |
104 save_file(os.path.join(target_dir, filename), "\n".join(lines)) | 107 save_file(filename, "\n".join(lines)) |
105 | 108 |
106 def resolve_includes(source_name, sources, lines, timeout, level=0): | 109 def resolve_includes(source_name, sources, lines, timeout, level=0): |
107 if level > 5: | 110 if level > 5: |
108 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") | 111 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") |
109 | 112 |
110 result = [] | 113 result = [] |
111 for line in lines: | 114 for line in lines: |
112 match = re.search(r"^\s*%include\s+(.*)%\s*$", line) | 115 match = re.search(r"^\s*%include\s+(.*)%\s*$", line) |
113 if match: | 116 if match: |
114 filename = match.group(1) | 117 filename = match.group(1) |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
152 result.extend(newlines) | 155 result.extend(newlines) |
153 else: | 156 else: |
154 if line.find("%timestamp%") >= 0: | 157 if line.find("%timestamp%") >= 0: |
155 if level == 0: | 158 if level == 0: |
156 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) | 159 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) |
157 else: | 160 else: |
158 line = "" | 161 line = "" |
159 result.append(line) | 162 result.append(line) |
160 return result | 163 return result |
161 | 164 |
162 def write_tpl(path, lines): | 165 def write_tpl(save_file, filename, lines): |
163 result = [] | 166 result = [] |
164 result.append("msFilterList") | 167 result.append("msFilterList") |
165 for line in lines: | 168 for line in lines: |
166 if re.search(r"^\s*!", line): | 169 if re.search(r"^\s*!", line): |
167 # This is a comment. Handle "Expires" comment in a special way, keep the rest. | 170 # This is a comment. Handle "Expires" comment in a special way, keep the rest. |
168 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I) | 171 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I) |
169 if match: | 172 if match: |
170 interval = int(match.group(1)) | 173 interval = int(match.group(1)) |
171 if match.group(2): | 174 if match.group(2): |
172 interval = int(interval / 24) | 175 interval = int(interval / 24) |
173 result.append(": Expires=%i" % interval) | 176 result.append(": Expires=%i" % interval) |
174 else: | 177 else: |
175 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line))) | 178 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line))) |
176 elif line.find("#") >= 0: | 179 elif line.find("#") >= 0: |
177 # Element hiding rules are not supported in MSIE, drop them | 180 # Element hiding rules are not supported in MSIE, drop them |
178 pass | 181 pass |
179 else: | 182 else: |
180 # We have a blocking or exception rule, try to convert it | 183 # We have a blocking or exception rule, try to convert it |
181 origline = line | 184 origline = line |
182 | 185 |
183 isexception = False | 186 is_exception = False |
Sebastian Noack
2013/11/06 15:56:22
Apparently you don't like underscores, but "is_exc
Wladimir Palant
2013/11/08 15:08:07
As you wish...
| |
184 if line.startswith("@@"): | 187 if line.startswith("@@"): |
185 isexception = True | 188 is_exception = True |
186 line = line[2:] | 189 line = line[2:] |
187 | 190 |
188 has_unsupported = False | 191 has_unsupported = False |
189 requires_script = False | 192 requires_script = False |
190 match = re.search(r"^(.*?)\$(.*)", line) | 193 match = re.search(r"^(.*?)\$(.*)", line) |
191 if match: | 194 if match: |
192 # This rule has options, check whether any of them are important | 195 # This rule has options, check whether any of them are important |
193 line = match.group(1) | 196 line = match.group(1) |
194 options = match.group(2).replace("_", "-").lower().split(",") | 197 options = match.group(2).replace("_", "-").lower().split(",") |
195 | 198 |
196 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise | 199 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise |
197 if isexception and "~third-party" in options: | 200 if is_exception and "~third-party" in options: |
198 has_unsupported = True | 201 has_unsupported = True |
199 | 202 |
200 # A number of options are not supported in MSIE but can be safely ignored, remove them | 203 # A number of options are not supported in MSIE but can be safely ignored, remove them |
201 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options) | 204 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options) |
202 | 205 |
203 # Also ignore domain negation of whitelists | 206 # Also ignore domain negation of whitelists |
204 if isexception: | 207 if is_exception: |
205 options = filter(lambda o: not o.startswith("domain=~"), options) | 208 options = filter(lambda o: not o.startswith("domain=~"), options) |
206 | 209 |
207 unsupported = filter(lambda o: o in ("other", "elemhide"), options) | 210 unsupported = filter(lambda o: o in ("other", "elemhide"), options) |
208 if unsupported and len(unsupported) == len(options): | 211 if unsupported and len(unsupported) == len(options): |
209 # The rule only applies to types that are not supported in MSIE | 212 # The rule only applies to types that are not supported in MSIE |
210 has_unsupported = True | 213 has_unsupported = True |
211 elif "donottrack" in options: | 214 elif "donottrack" in options: |
212 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options | 215 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options |
213 has_unsupported = True | 216 has_unsupported = True |
214 elif "script" in options and len(options) == len(unsupported) + 1: | 217 elif "script" in options and len(options) == len(unsupported) + 1: |
215 # Mark rules that only apply to scripts for approximate conversion | 218 # Mark rules that only apply to scripts for approximate conversion |
216 requires_script = True | 219 requires_script = True |
217 elif len(options) > 0: | 220 elif len(options) > 0: |
218 # The rule has further options that aren't available in TPLs. For | 221 # The rule has further options that aren't available in TPLs. For |
219 # exception rules that aren't specific to a domain we ignore all | 222 # exception rules that aren't specific to a domain we ignore all |
220 # remaining options to avoid potential false positives. Other rules | 223 # remaining options to avoid potential false positives. Other rules |
221 # simply aren't included in the TPL file. | 224 # simply aren't included in the TPL file. |
222 if isexception: | 225 if is_exception: |
223 has_unsupported = any([o.startswith("domain=") for o in options]) | 226 has_unsupported = any([o.startswith("domain=") for o in options]) |
224 else: | 227 else: |
225 has_unsupported = True | 228 has_unsupported = True |
226 | 229 |
227 if has_unsupported: | 230 if has_unsupported: |
228 # Do not include filters with unsupported options | 231 # Do not include filters with unsupported options |
229 result.append("# " + origline) | 232 result.append("# " + origline) |
230 else: | 233 else: |
231 line = line.replace("^", "/") # Assume that separator placeholders mean slashes | 234 line = line.replace("^", "/") # Assume that separator placeholders mean slashes |
232 | 235 |
(...skipping 10 matching lines...) Expand all Loading... | |
243 # Remove anchors at the rule end | 246 # Remove anchors at the rule end |
244 line = re.sub(r"\|$", "", line) | 247 line = re.sub(r"\|$", "", line) |
245 # Remove unnecessary asterisks at the ends of lines | 248 # Remove unnecessary asterisks at the ends of lines |
246 line = re.sub(r"\*$", "", line) | 249 line = re.sub(r"\*$", "", line) |
247 # Emulate $script by appending *.js to the rule | 250 # Emulate $script by appending *.js to the rule |
248 if requires_script: | 251 if requires_script: |
249 line += "*.js" | 252 line += "*.js" |
250 if line.startswith("/*"): | 253 if line.startswith("/*"): |
251 line = line[2:] | 254 line = line[2:] |
252 if domain: | 255 if domain: |
253 line = "%sd %s %s" % ("+" if isexception else "-", domain, line) | 256 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line) |
254 line = re.sub(r"\s+/$", "", line) | 257 line = re.sub(r"\s+/$", "", line) |
255 result.append(line) | 258 result.append(line) |
256 elif isexception: | 259 elif is_exception: |
257 # Exception rules without domains are unsupported | 260 # Exception rules without domains are unsupported |
258 result.append("# " + origline) | 261 result.append("# " + origline) |
259 else: | 262 else: |
260 result.append("- " + line) | 263 result.append("- " + line) |
261 save_file(path, "\n".join(result) + "\n") | 264 save_file(filename, "\n".join(result) + "\n") |
262 | 265 |
263 class FileSource: | 266 class FileSource: |
264 def __init__(self, dir): | 267 def __init__(self, dir): |
265 self._dir = dir | 268 self._dir = dir |
266 if os.path.exists(os.path.join(dir, ".hg")): | 269 if os.path.exists(os.path.join(dir, ".hg")): |
267 # This is a Mercurial repository, try updating | 270 # This is a Mercurial repository, try updating |
268 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) | 271 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) |
269 | 272 |
270 def get_path(self, filename): | 273 def get_path(self, filename): |
271 return os.path.join(self._dir, *filename.split("/")) | 274 return os.path.join(self._dir, *filename.split("/")) |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
312 | 315 |
313 timeout = 30 | 316 timeout = 30 |
314 for option, value in opts: | 317 for option, value in opts: |
315 if option in ("-h", "--help"): | 318 if option in ("-h", "--help"): |
316 usage() | 319 usage() |
317 sys.exit() | 320 sys.exit() |
318 elif option in ("-t", "--timeout"): | 321 elif option in ("-t", "--timeout"): |
319 timeout = int(value) | 322 timeout = int(value) |
320 | 323 |
321 combine_subscriptions(sources, target_dir, timeout) | 324 combine_subscriptions(sources, target_dir, timeout) |
LEFT | RIGHT |