Rietveld Code Review Tool

Side by Side Diff: sitescripts/subscriptions/combineSubscriptions.py

Issue 29345242: Noissue - Adapt quotes for compliance with our coding style in sitescripts (Closed)
Patch Set: Fixed raw string (created May 30, 2016, 8:47 a.m.)
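The change under review is mechanical: string literals that need no escaping switch from double quotes to single quotes, for compliance with the sitescripts coding style. A representative before/after pair taken from the diff below (the comments are added here for illustration):

    accepted_extensions = set([".txt"])    # before
    accepted_extensions = set(['.txt'])    # after

Behaviour is unchanged; only the quoting style differs.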
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2016 Eyeo GmbH 4 # Copyright (C) 2006-2016 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
(...skipping 10 matching lines...)
21 import subprocess 21 import subprocess
22 import urllib2 22 import urllib2
23 import time 23 import time
24 import traceback 24 import traceback
25 import codecs 25 import codecs
26 import hashlib 26 import hashlib
27 import base64 27 import base64
28 import tempfile 28 import tempfile
29 from getopt import getopt, GetoptError 29 from getopt import getopt, GetoptError
30 30
31 accepted_extensions = set([".txt"]) 31 accepted_extensions = set(['.txt'])
32 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"]) 32 ignore = set(['Apache.txt', 'CC-BY-SA.txt', 'GPL.txt', 'MPL.txt'])
33 verbatim = set(["COPYING"]) 33 verbatim = set(['COPYING'])
34 34
35 35
36 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None): 36 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None):
37 if not os.path.exists(target_dir): 37 if not os.path.exists(target_dir):
38 os.makedirs(target_dir, 0755) 38 os.makedirs(target_dir, 0755)
39 39
40 def save_file(filename, data): 40 def save_file(filename, data):
41 handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False) 41 handle = tempfile.NamedTemporaryFile(mode='wb', dir=tempdir, delete=False)
42 handle.write(data.encode("utf-8")) 42 handle.write(data.encode('utf-8'))
43 handle.close() 43 handle.close()
44 44
45 if hasattr(os, "chmod"): 45 if hasattr(os, 'chmod'):
46 os.chmod(handle.name, 0644) 46 os.chmod(handle.name, 0644)
47 47
48 try: 48 try:
49 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name]) 49 subprocess.check_output(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=5', handle.name + '.gz', handle.name])
50 except: 50 except:
51 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name 51 print >>sys.stderr, 'Failed to compress file %s. Please ensure that p7zip is installed on the system.' % handle.name
52 52
53 path = os.path.join(target_dir, filename) 53 path = os.path.join(target_dir, filename)
54 os.rename(handle.name, path) 54 os.rename(handle.name, path)
55 os.rename(handle.name + ".gz", path + ".gz") 55 os.rename(handle.name + '.gz', path + '.gz')
56 56
57 known = set() 57 known = set()
58 for source_name, source in sources.iteritems(): 58 for source_name, source in sources.iteritems():
59 for filename in source.list_top_level_files(): 59 for filename in source.list_top_level_files():
60 if filename in ignore or filename.startswith("."): 60 if filename in ignore or filename.startswith('.'):
61 continue 61 continue
62 if filename in verbatim: 62 if filename in verbatim:
63 process_verbatim_file(source, save_file, filename) 63 process_verbatim_file(source, save_file, filename)
64 elif not os.path.splitext(filename)[1] in accepted_extensions: 64 elif not os.path.splitext(filename)[1] in accepted_extensions:
65 continue 65 continue
66 else: 66 else:
67 try: 67 try:
68 process_subscription_file(source_name, sources, save_file, filename, timeout) 68 process_subscription_file(source_name, sources, save_file, filename, timeout)
69 except: 69 except:
70 print >>sys.stderr, 'Error processing subscription file "%s"' % filename 70 print >>sys.stderr, 'Error processing subscription file "%s"' % filename
71 traceback.print_exc() 71 traceback.print_exc()
72 print >>sys.stderr 72 print >>sys.stderr
73 known.add(os.path.splitext(filename)[0] + ".tpl") 73 known.add(os.path.splitext(filename)[0] + '.tpl')
74 known.add(os.path.splitext(filename)[0] + ".tpl.gz") 74 known.add(os.path.splitext(filename)[0] + '.tpl.gz')
75 known.add(filename) 75 known.add(filename)
76 known.add(filename + ".gz") 76 known.add(filename + '.gz')
77 77
78 for filename in os.listdir(target_dir): 78 for filename in os.listdir(target_dir):
79 if filename.startswith("."): 79 if filename.startswith('.'):
80 continue 80 continue
81 if not filename in known: 81 if not filename in known:
82 os.remove(os.path.join(target_dir, filename)) 82 os.remove(os.path.join(target_dir, filename))
83 83
84 84
85 def process_verbatim_file(source, save_file, filename): 85 def process_verbatim_file(source, save_file, filename):
86 save_file(filename, source.read_file(filename)) 86 save_file(filename, source.read_file(filename))
87 87
88 88
89 def process_subscription_file(source_name, sources, save_file, filename, timeout): 89 def process_subscription_file(source_name, sources, save_file, filename, timeout):
90 source = sources[source_name] 90 source = sources[source_name]
91 lines = source.read_file(filename).splitlines() 91 lines = source.read_file(filename).splitlines()
92 92
93 header = "" 93 header = ''
94 if len(lines) > 0: 94 if len(lines) > 0:
95 header = lines.pop(0) 95 header = lines.pop(0)
96 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I): 96 if not re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', header, re.I):
97 raise Exception("This is not a valid Adblock Plus subscription file.") 97 raise Exception('This is not a valid Adblock Plus subscription file.')
98 98
99 lines = resolve_includes(source_name, sources, lines, timeout) 99 lines = resolve_includes(source_name, sources, lines, timeout)
100 seen = set(["checksum", "version"]) 100 seen = set(['checksum', 'version'])
101 101
102 def check_line(line): 102 def check_line(line):
103 if line == "": 103 if line == '':
104 return False 104 return False
105 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version|Expires)\s*:", line, re.M | re.I) 105 match = re.search(r'^\s*!\s*(Redirect|Homepage|Title|Checksum|Version|Expires)\s*:', line, re.M | re.I)
106 if not match: 106 if not match:
107 return True 107 return True
108 key = match.group(1).lower() 108 key = match.group(1).lower()
109 if key in seen: 109 if key in seen:
110 return False 110 return False
111 seen.add(key) 111 seen.add(key)
112 return True 112 return True
113 lines = filter(check_line, lines) 113 lines = filter(check_line, lines)
114 114
115 write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines) 115 write_tpl(save_file, os.path.splitext(filename)[0] + '.tpl', lines)
116 116
117 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime())) 117 lines.insert(0, '! Version: %s' % time.strftime('%Y%m%d%H%M', time.gmtime()))
118 118
119 checksum = hashlib.md5() 119 checksum = hashlib.md5()
120 checksum.update("\n".join([header] + lines).encode("utf-8")) 120 checksum.update('\n'.join([header] + lines).encode('utf-8'))
121 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("=")) 121 lines.insert(0, '! Checksum: %s' % base64.b64encode(checksum.digest()).rstrip('='))
122 lines.insert(0, header) 122 lines.insert(0, header)
123 save_file(filename, "\n".join(lines)) 123 save_file(filename, '\n'.join(lines))
124 124
125 125
126 def resolve_includes(source_name, sources, lines, timeout, level=0): 126 def resolve_includes(source_name, sources, lines, timeout, level=0):
127 if level > 5: 127 if level > 5:
128 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.") 128 raise Exception('There are too many nested includes, which is probably the result of a circular reference somewhere.')
129 129
130 result = [] 130 result = []
131 for line in lines: 131 for line in lines:
132 match = re.search(r"^\s*%include\s+(.*)%\s*$", line) 132 match = re.search(r'^\s*%include\s+(.*)%\s*$', line)
133 if match: 133 if match:
134 filename = match.group(1) 134 filename = match.group(1)
135 newlines = None 135 newlines = None
136 if re.match(r"^https?://", filename): 136 if re.match(r'^https?://', filename):
137 result.append("! *** Fetched from: %s ***" % filename) 137 result.append('! *** Fetched from: %s ***' % filename)
138 138
139 for i in range(3): 139 for i in range(3):
140 try: 140 try:
141 request = urllib2.urlopen(filename, None, timeout) 141 request = urllib2.urlopen(filename, None, timeout)
142 data = request.read() 142 data = request.read()
143 error = None 143 error = None
144 break 144 break
145 except urllib2.URLError, e: 145 except urllib2.URLError, e:
146 error = e 146 error = e
147 time.sleep(5) 147 time.sleep(5)
148 if error: 148 if error:
149 raise error 149 raise error
150 150
151 # We should really get the charset from the headers rather than assuming 151 # We should really get the charset from the headers rather than assuming
152 # that it is UTF-8. However, some of the Google Code mirrors are 152 # that it is UTF-8. However, some of the Google Code mirrors are
153 # misconfigured and will return ISO-8859-1 as charset instead of UTF-8. 153 # misconfigured and will return ISO-8859-1 as charset instead of UTF-8.
154 newlines = data.decode("utf-8").splitlines() 154 newlines = data.decode('utf-8').splitlines()
155 newlines = filter(lambda l: not re.search(r"^\s*!\s*(Redirect|Homepage|Title|Version|Expires)\s*:", l, re.M | re.I), newlines) 155 newlines = filter(lambda l: not re.search(r'^\s*!\s*(Redirect|Homepage|Title|Version|Expires)\s*:', l, re.M | re.I), newlines)
156 else: 156 else:
157 result.append("! *** %s ***" % filename) 157 result.append('! *** %s ***' % filename)
158 158
159 include_source = source_name 159 include_source = source_name
160 if ":" in filename: 160 if ':' in filename:
161 include_source, filename = filename.split(":", 1) 161 include_source, filename = filename.split(':', 1)
162 if not include_source in sources: 162 if not include_source in sources:
163 raise Exception('Cannot include file from repository "%s", this repository is unknown' % include_source) 163 raise Exception('Cannot include file from repository "%s", this repository is unknown' % include_source)
164 164
165 source = sources[include_source] 165 source = sources[include_source]
166 newlines = source.read_file(filename).splitlines() 166 newlines = source.read_file(filename).splitlines()
167 newlines = resolve_includes(include_source, sources, newlines, timeout, level + 1) 167 newlines = resolve_includes(include_source, sources, newlines, timeout, level + 1)
168 168
169 if len(newlines) and re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", newlines[0], re.I): 169 if len(newlines) and re.search(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', newlines[0], re.I):
170 del newlines[0] 170 del newlines[0]
171 result.extend(newlines) 171 result.extend(newlines)
172 else: 172 else:
173 if line.find("%timestamp%") >= 0: 173 if line.find('%timestamp%') >= 0:
174 if level == 0: 174 if level == 0:
175 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime())) 175 line = line.replace('%timestamp%', time.strftime('%d %b %Y %H:%M UTC', time.gmtime()))
176 else: 176 else:
177 line = "" 177 line = ''
178 result.append(line) 178 result.append(line)
179 return result 179 return result
180 180
181 181
182 def write_tpl(save_file, filename, lines): 182 def write_tpl(save_file, filename, lines):
183 result = [] 183 result = []
184 result.append("msFilterList") 184 result.append('msFilterList')
185 for line in lines: 185 for line in lines:
186 if re.search(r"^\s*!", line): 186 if re.search(r'^\s*!', line):
187 # This is a comment. Handle "Expires" comment in a special way, keep the rest. 187 # This is a comment. Handle "Expires" comment in a special way, keep the rest.
188 match = re.search(r"^\s*!\s*Expires\s*:\s*(\d+)\s*(h)?", line, re.I) 188 match = re.search(r'^\s*!\s*Expires\s*:\s*(\d+)\s*(h)?', line, re.I)
189 if match: 189 if match:
190 interval = int(match.group(1)) 190 interval = int(match.group(1))
191 if match.group(2): 191 if match.group(2):
192 interval = int(interval / 24) 192 interval = int(interval / 24)
193 result.append(": Expires=%i" % interval) 193 result.append(': Expires=%i' % interval)
194 else: 194 else:
195 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line))) 195 result.append(re.sub(r'^\s*!', '#', re.sub(r'--!$', '--#', line)))
196 elif line.find("#") >= 0: 196 elif line.find('#') >= 0:
197 # Element hiding rules are not supported in MSIE, drop them 197 # Element hiding rules are not supported in MSIE, drop them
198 pass 198 pass
199 else: 199 else:
200 # We have a blocking or exception rule, try to convert it 200 # We have a blocking or exception rule, try to convert it
201 origline = line 201 origline = line
202 202
203 is_exception = False 203 is_exception = False
204 if line.startswith("@@"): 204 if line.startswith('@@'):
205 is_exception = True 205 is_exception = True
206 line = line[2:] 206 line = line[2:]
207 207
208 has_unsupported = False 208 has_unsupported = False
209 requires_script = False 209 requires_script = False
210 match = re.search(r"^(.*?)\$(.*)", line) 210 match = re.search(r'^(.*?)\$(.*)', line)
211 if match: 211 if match:
212 # This rule has options, check whether any of them are important 212 # This rule has options, check whether any of them are important
213 line = match.group(1) 213 line = match.group(1)
214 options = match.group(2).replace("_", "-").lower().split(",") 214 options = match.group(2).replace('_', '-').lower().split(',')
215 215
216 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise 216 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise
217 if is_exception and "~third-party" in options: 217 if is_exception and '~third-party' in options:
218 has_unsupported = True 218 has_unsupported = True
219 219
220 # A number of options are not supported in MSIE but can be safely ignored, remove them 220 # A number of options are not supported in MSIE but can be safely ignored, remove them
221 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options) 221 options = filter(lambda o: not o in ('', 'third-party', '~third-party', 'match-case', '~match-case', '~other', '~donottrack'), options)
222 222
223 # Also ignore domain negation of whitelists 223 # Also ignore domain negation of whitelists
224 if is_exception: 224 if is_exception:
225 options = filter(lambda o: not o.startswith("domain=~"), options) 225 options = filter(lambda o: not o.startswith('domain=~'), options)
226 226
227 unsupported = filter(lambda o: o in ("other", "elemhide"), options) 227 unsupported = filter(lambda o: o in ('other', 'elemhide'), options)
228 if unsupported and len(unsupported) == len(options): 228 if unsupported and len(unsupported) == len(options):
229 # The rule only applies to types that are not supported in MSIE 229 # The rule only applies to types that are not supported in MSIE
230 has_unsupported = True 230 has_unsupported = True
231 elif "donottrack" in options: 231 elif 'donottrack' in options:
232 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options 232 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options
233 has_unsupported = True 233 has_unsupported = True
234 elif "script" in options and len(options) == len(unsupported) + 1: 234 elif 'script' in options and len(options) == len(unsupported) + 1:
235 # Mark rules that only apply to scripts for approximate conversion 235 # Mark rules that only apply to scripts for approximate conversion
236 requires_script = True 236 requires_script = True
237 elif len(options) > 0: 237 elif len(options) > 0:
238 # The rule has further options that aren't available in TPLs. For 238 # The rule has further options that aren't available in TPLs. For
239 # exception rules that aren't specific to a domain we ignore all 239 # exception rules that aren't specific to a domain we ignore all
240 # remaining options to avoid potential false positives. Other rules 240 # remaining options to avoid potential false positives. Other rules
241 # simply aren't included in the TPL file. 241 # simply aren't included in the TPL file.
242 if is_exception: 242 if is_exception:
243 has_unsupported = any([o.startswith("domain=") for o in options]) 243 has_unsupported = any([o.startswith('domain=') for o in options])
244 else: 244 else:
245 has_unsupported = True 245 has_unsupported = True
246 246
247 if has_unsupported: 247 if has_unsupported:
248 # Do not include filters with unsupported options 248 # Do not include filters with unsupported options
249 result.append("# " + origline) 249 result.append('# ' + origline)
250 else: 250 else:
251 line = line.replace("^", "/") # Assume that separator placeholders mean slashes 251 line = line.replace('^', '/') # Assume that separator placeholders mean slashes
252 252
253 # Try to extract domain info 253 # Try to extract domain info
254 domain = None 254 domain = None
255 match = re.search(r"^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)", line) 255 match = re.search(r'^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)', line)
256 if match: 256 if match:
257 domain = match.group(2) 257 domain = match.group(2)
258 line = match.group(4) 258 line = match.group(4)
259 else: 259 else:
260 # No domain info, remove anchors at the rule start 260 # No domain info, remove anchors at the rule start
261 line = re.sub(r"^\|\|", "http://", line) 261 line = re.sub(r'^\|\|', 'http://', line)
262 line = re.sub(r"^\|", "", line) 262 line = re.sub(r'^\|', '', line)
263 # Remove anchors at the rule end 263 # Remove anchors at the rule end
264 line = re.sub(r"\|$", "", line) 264 line = re.sub(r'\|$', '', line)
265 # Remove unnecessary asterisks at the ends of lines 265 # Remove unnecessary asterisks at the ends of lines
266 line = re.sub(r"\*$", "", line) 266 line = re.sub(r'\*$', '', line)
267 # Emulate $script by appending *.js to the rule 267 # Emulate $script by appending *.js to the rule
268 if requires_script: 268 if requires_script:
269 line += "*.js" 269 line += '*.js'
270 if line.startswith("/*"): 270 if line.startswith('/*'):
271 line = line[2:] 271 line = line[2:]
272 if domain: 272 if domain:
273 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line) 273 line = '%sd %s %s' % ('+' if is_exception else '-', domain, line)
274 line = re.sub(r"\s+/$", "", line) 274 line = re.sub(r'\s+/$', '', line)
275 result.append(line) 275 result.append(line)
276 elif is_exception: 276 elif is_exception:
277 # Exception rules without domains are unsupported 277 # Exception rules without domains are unsupported
278 result.append("# " + origline) 278 result.append('# ' + origline)
279 else: 279 else:
280 result.append("- " + line) 280 result.append('- ' + line)
281 save_file(filename, "\n".join(result) + "\n") 281 save_file(filename, '\n'.join(result) + '\n')
282 282
283 283
284 class FileSource: 284 class FileSource:
285 def __init__(self, dir): 285 def __init__(self, dir):
286 self._dir = dir 286 self._dir = dir
287 if os.path.exists(os.path.join(dir, ".hg")): 287 if os.path.exists(os.path.join(dir, '.hg')):
288 # This is a Mercurial repository, try updating 288 # This is a Mercurial repository, try updating
289 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"]) 289 subprocess.call(['hg', '-q', '-R', dir, 'pull', '--update'])
290 290
291 def get_path(self, filename): 291 def get_path(self, filename):
292 return os.path.join(self._dir, *filename.split("/")) 292 return os.path.join(self._dir, *filename.split('/'))
293 293
294 def read_file(self, filename): 294 def read_file(self, filename):
295 path = self.get_path(filename) 295 path = self.get_path(filename)
296 if os.path.relpath(path, self._dir).startswith("."): 296 if os.path.relpath(path, self._dir).startswith('.'):
297 raise Exception("Attempt to access a file outside the repository") 297 raise Exception('Attempt to access a file outside the repository')
298 with codecs.open(path, "rb", encoding="utf-8") as handle: 298 with codecs.open(path, 'rb', encoding='utf-8') as handle:
299 return handle.read() 299 return handle.read()
300 300
301 def list_top_level_files(self): 301 def list_top_level_files(self):
302 for filename in os.listdir(self._dir): 302 for filename in os.listdir(self._dir):
303 path = os.path.join(self._dir, filename) 303 path = os.path.join(self._dir, filename)
304 if os.path.isfile(path): 304 if os.path.isfile(path):
305 yield filename 305 yield filename
306 306
307 307
308 def usage(): 308 def usage():
309 print """Usage: %s source_name=source_dir ... [output_dir] 309 print '''Usage: %s source_name=source_dir ... [output_dir]
310 310
311 Options: 311 Options:
312 -h --help Print this message and exit 312 -h --help Print this message and exit
313 -t seconds --timeout=seconds Timeout when fetching remote subscriptions 313 -t seconds --timeout=seconds Timeout when fetching remote subscriptions
314 """ % os.path.basename(sys.argv[0]) 314 ''' % os.path.basename(sys.argv[0])
315 315
316 if __name__ == "__main__": 316 if __name__ == '__main__':
317 try: 317 try:
318 opts, args = getopt(sys.argv[1:], "ht:", ["help", "timeout="]) 318 opts, args = getopt(sys.argv[1:], 'ht:', ['help', 'timeout='])
319 except GetoptError, e: 319 except GetoptError, e:
320 print str(e) 320 print str(e)
321 usage() 321 usage()
322 sys.exit(2) 322 sys.exit(2)
323 323
324 target_dir = "subscriptions" 324 target_dir = 'subscriptions'
325 sources = {} 325 sources = {}
326 for arg in args: 326 for arg in args:
327 if "=" in arg: 327 if '=' in arg:
328 source_name, source_dir = arg.split("=", 1) 328 source_name, source_dir = arg.split('=', 1)
329 sources[source_name] = FileSource(source_dir) 329 sources[source_name] = FileSource(source_dir)
330 else: 330 else:
331 target_dir = arg 331 target_dir = arg
332 if not sources: 332 if not sources:
333 sources[""] = FileSource(".") 333 sources[''] = FileSource('.')
334 334
335 timeout = 30 335 timeout = 30
336 for option, value in opts: 336 for option, value in opts:
337 if option in ("-h", "--help"): 337 if option in ('-h', '--help'):
338 usage() 338 usage()
339 sys.exit() 339 sys.exit()
340 elif option in ("-t", "--timeout"): 340 elif option in ('-t', '--timeout'):
341 timeout = int(value) 341 timeout = int(value)
342 342
343 combine_subscriptions(sources, target_dir, timeout) 343 combine_subscriptions(sources, target_dir, timeout)
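For reference, a minimal sketch of driving the module programmatically, mirroring what the __main__ block above does after option parsing. The import path is inferred from the file location, and the source name and directory are hypothetical examples:

    # Hypothetical example: combine the filter lists from one local checkout
    # into ./subscriptions, with a 30-second timeout for remote %include fetches.
    from sitescripts.subscriptions.combineSubscriptions import (
        FileSource, combine_subscriptions)

    sources = {'easylist': FileSource('/path/to/easylist')}  # hypothetical path
    combine_subscriptions(sources, 'subscriptions', timeout=30)

The command-line equivalent, per usage(), would be: combineSubscriptions.py -t 30 easylist=/path/to/easylist subscriptions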