sitescripts/subscriptions/combineSubscriptions.py - Issue 28037010: Improved generation of filter subscription files

Delta Between Two Patch Sets: sitescripts/subscriptions/combineSubscriptions.py

Issue 28037010: Improved generation of filter subscription files (Closed)

Left Patch Set: Created Nov. 6, 2013, 2:27 p.m.

Right Patch Set: Different approach to atomic updates Created Nov. 11, 2013, 2:52 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # coding: utf-8	2 # coding: utf-8

3	3

4 # This file is part of the Adblock Plus web scripts,	4 # This file is part of the Adblock Plus web scripts,

5 # Copyright (C) 2006-2013 Eyeo GmbH	5 # Copyright (C) 2006-2013 Eyeo GmbH

6 #	6 #

7 # Adblock Plus is free software: you can redistribute it and/or modify	7 # Adblock Plus is free software: you can redistribute it and/or modify

8 # it under the terms of the GNU General Public License version 3 as	8 # it under the terms of the GNU General Public License version 3 as

9 # published by the Free Software Foundation.	9 # published by the Free Software Foundation.

10 #	10 #

11 # Adblock Plus is distributed in the hope that it will be useful,	11 # Adblock Plus is distributed in the hope that it will be useful,

12 # but WITHOUT ANY WARRANTY; without even the implied warranty of	12 # but WITHOUT ANY WARRANTY; without even the implied warranty of

13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 # GNU General Public License for more details.	14 # GNU General Public License for more details.

15 #	15 #

16 # You should have received a copy of the GNU General Public License	16 # You should have received a copy of the GNU General Public License

17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

18	18

19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64	19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile

20 from getopt import getopt, GetoptError	20 from getopt import getopt, GetoptError

21	21

22 accepted_extensions = set([".txt"])	22 accepted_extensions = set([".txt"])

23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])	23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])

24 verbatim = set(["COPYING"])	24 verbatim = set(["COPYING"])

25	25

26 def combine_subscriptions(sources, target_dir, timeout=30):	26 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None):

27 global accepted_extensions, ignore, verbatim
Sebastian Noack 2013/11/06 15:56:22 The global keyword is unneeded here. I know it was there before, but feel free to remove it. The global keyword makes the assignments below assign the variable in the global scope instead of the local scope. However, it doesn't affect reading of variables.
28

29 if not os.path.exists(target_dir):	27 if not os.path.exists(target_dir):

30 os.makedirs(target_dir, 0755)	28 os.makedirs(target_dir, 0755)

	29

	30 def save_file(filename, data):

	31 handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False)

	32 handle.write(data.encode("utf-8"))

	33 handle.close()

	34

	35 try:

	36 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name])

	37 except:

	38 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name

	39

	40 path = os.path.join(target_dir, filename)

	41 os.rename(handle.name, path)

	42 os.rename(handle.name + ".gz", path + ".gz")

31	43

32 known = set()	44 known = set()

33 for source_name, source in sources.iteritems():	45 for source_name, source in sources.iteritems():

34 for filename in source.list_top_level_files():	46 for filename in source.list_top_level_files():

35 if filename in ignore or filename.startswith("."):	47 if filename in ignore or filename.startswith("."):

36 continue	48 continue

37 if filename in verbatim:	49 if filename in verbatim:

38 process_verbatim_file(source, target_dir, filename)	50 process_verbatim_file(source, save_file, filename)

39 elif not os.path.splitext(filename)[1] in accepted_extensions:	51 elif not os.path.splitext(filename)[1] in accepted_extensions:

40 continue	52 continue

41 else:	53 else:

42 try:	54 try:

43 process_subscription_file(source_name, sources, target_dir, filename, timeout)	55 process_subscription_file(source_name, sources, save_file, filename, timeout)

44 except:	56 except:

45 print >>sys.stderr, 'Error processing subscription file "%s"' % filename	57 print >>sys.stderr, 'Error processing subscription file "%s"' % filename

46 traceback.print_exc()	58 traceback.print_exc()

47 print >>sys.stderr	59 print >>sys.stderr

48 known.add(os.path.splitext(filename)[0] + ".tpl")	60 known.add(os.path.splitext(filename)[0] + ".tpl")

49 known.add(os.path.splitext(filename)[0] + ".tpl.gz")	61 known.add(os.path.splitext(filename)[0] + ".tpl.gz")

50 known.add(filename)	62 known.add(filename)

51 known.add(filename + ".gz")	63 known.add(filename + ".gz")

52	64

53 for filename in os.listdir(target_dir):	65 for filename in os.listdir(target_dir):

54 if filename.startswith("."):	66 if filename.startswith("."):

55 continue	67 continue

56 if not filename in known:	68 if not filename in known:

57 os.remove(os.path.join(target_dir, filename))	69 os.remove(os.path.join(target_dir, filename))

58	70

59 def save_file(path, data):	71 def process_verbatim_file(source, save_file, filename):

60 handle = codecs.open(path, "wb", encoding="utf-8")	72 save_file(filename, source.read_file(filename))

61 handle.write(data)	73

62 handle.close()	74 def process_subscription_file(source_name, sources, save_file, filename, timeout):

63 try:

64 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path])

65 except:

66 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path

67

68 def process_verbatim_file(source, target_dir, filename):

69 save_file(os.path.join(target_dir, filename), source.read_file(filename))

70

71 def process_subscription_file(source_name, sources, target_dir, filename, timeout):

72 source = sources[source_name]	75 source = sources[source_name]

73 lines = source.read_file(filename).splitlines()	76 lines = source.read_file(filename).splitlines()

74	77

75 header = ""	78 header = ""

76 if len(lines) > 0:	79 if len(lines) > 0:

77 header = lines.pop(0)	80 header = lines.pop(0)

78 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):	81 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):

79 raise Exception("This is not a valid Adblock Plus subscription file.")	82 raise Exception("This is not a valid Adblock Plus subscription file.")

80	83

81 lines = resolve_includes(source_name, sources, lines, timeout)	84 lines = resolve_includes(source_name, sources, lines, timeout)

82 seen = set(["checksum", "version"])	85 seen = set(["checksum", "version"])

83 def check_line(line):	86 def check_line(line):

84 if line == "":	87 if line == "":

85 return False	88 return False

86 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)	89 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)

87 if not match:	90 if not match:

88 return True	91 return True

89 key = match.group(1).lower()	92 key = match.group(1).lower()

90 if key in seen:	93 if key in seen:

91 return False	94 return False

92 seen.add(key)	95 seen.add(key)

93 return True	96 return True

94 lines = filter(check_line, lines)	97 lines = filter(check_line, lines)

95	98

96 write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines)	99 write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines)

97	100

98 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))	101 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))

99	102

100 checksum = hashlib.md5()	103 checksum = hashlib.md5()

101 checksum.update("\n".join([header] + lines).encode("utf-8"))	104 checksum.update("\n".join([header] + lines).encode("utf-8"))

102 lines.insert(0, "! Checksum: %s" % re.sub(r"=", "", base64.b64encode(checksum.digest())))	105 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))
Sebastian Noack 2013/11/06 15:56:22 You don't need a regex to strip a given character. You could just use .replace("=", "") or, since the equals character is only used as padding at the end by base64, you could also use .rstrip('='), which IMHO makes the intention of the code even more obvious. However, I've never seen base64 used to bring an MD5 digest into a printable form. Usually you just convert it to hexadecimal, which can be done as easily as checksum.hexdigest() in Python. But I don't know if changing that now would break something. Wladimir Palant 2013/11/08 15:08:07 On 2013/11/06 15:56:22, sebastian wrote: > However, I've never seen base64 used to bring an MD5 digest into a printable > form. Usually you just convert it to hexadecimal, which can be done as easily > as checksum.hexdigest() in Python. But I don't know if changing that now would > break something. It's way too late to change the specification of checksums. It's the way it is because Mozilla APIs produce base64 - and we mimic this everywhere else.
103 lines.insert(0, header)	106 lines.insert(0, header)

104 save_file(os.path.join(target_dir, filename), "\n".join(lines))	107 save_file(filename, "\n".join(lines))

105	108

106 def resolve_includes(source_name, sources, lines, timeout, level=0):	109 def resolve_includes(source_name, sources, lines, timeout, level=0):

107 if level > 5:	110 if level > 5:

108 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")	111 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")

109	112

110 result = []	113 result = []

111 for line in lines:	114 for line in lines:

112 match = re.search(r"^\s*%include\s+(.*)%\s*$", line)	115 match = re.search(r"^\s*%include\s+(.*)%\s*$", line)

113 if match:	116 if match:

114 filename = match.group(1)	117 filename = match.group(1)

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
152 result.extend(newlines)	155 result.extend(newlines)

153 else:	156 else:

154 if line.find("%timestamp%") >= 0:	157 if line.find("%timestamp%") >= 0:

155 if level == 0:	158 if level == 0:

156 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))	159 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))

157 else:	160 else:

158 line = ""	161 line = ""

159 result.append(line)	162 result.append(line)

160 return result	163 return result

161	164

162 def write_tpl(path, lines):	165 def write_tpl(save_file, filename, lines):

163 result = []	166 result = []

164 result.append("msFilterList")	167 result.append("msFilterList")

165 for line in lines:	168 for line in lines:

166 if re.search(r"^\s*!", line):	169 if re.search(r"^\s*!", line):

167 # This is a comment. Handle "Expires" comment in a special way, keep the rest.	170 # This is a comment. Handle "Expires" comment in a special way, keep the rest.

168 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)	171 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)

169 if match:	172 if match:

170 interval = int(match.group(1))	173 interval = int(match.group(1))

171 if match.group(2):	174 if match.group(2):

172 interval = int(interval / 24)	175 interval = int(interval / 24)

173 result.append(": Expires=%i" % interval)	176 result.append(": Expires=%i" % interval)

174 else:	177 else:

175 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line)))	178 result.append(re.sub(r"^\s*!", "#", re.sub(r"--!$", "--#", line)))

176 elif line.find("#") >= 0:	179 elif line.find("#") >= 0:

177 # Element hiding rules are not supported in MSIE, drop them	180 # Element hiding rules are not supported in MSIE, drop them

178 pass	181 pass

179 else:	182 else:

180 # We have a blocking or exception rule, try to convert it	183 # We have a blocking or exception rule, try to convert it

181 origline = line	184 origline = line

182	185

183 isexception = False	186 is_exception = False
Sebastian Noack 2013/11/06 15:56:22 Apparently you don't like underscores, but "is_exception" would be way more readable than "isexception". ;) Wladimir Palant 2013/11/08 15:08:07 On 2013/11/06 15:56:22, sebastian wrote: > Apparently you don't like underscores, but "is_exception" would be way more > readable than "isexception". ;) As you wish...
184 if line.startswith("@@"):	187 if line.startswith("@@"):

185 isexception = True	188 is_exception = True

186 line = line[2:]	189 line = line[2:]

187	190

188 has_unsupported = False	191 has_unsupported = False

189 requires_script = False	192 requires_script = False

190 match = re.search(r"^(.*?)\$(.*)", line)	193 match = re.search(r"^(.*?)\$(.*)", line)

191 if match:	194 if match:

192 # This rule has options, check whether any of them are important	195 # This rule has options, check whether any of them are important

193 line = match.group(1)	196 line = match.group(1)

194 options = match.group(2).replace("_", "-").lower().split(",")	197 options = match.group(2).replace("_", "-").lower().split(",")

195	198

196 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise	199 # Remove first-party only exceptions, we will allow an ad server everywhere otherwise

197 if isexception and "~third-party" in options:	200 if is_exception and "~third-party" in options:

198 has_unsupported = True	201 has_unsupported = True

199	202

200 # A number of options are not supported in MSIE but can be safely ignored, remove them	203 # A number of options are not supported in MSIE but can be safely ignored, remove them

201 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options)	204 options = filter(lambda o: not o in ("", "third-party", "~third-party", "match-case", "~match-case", "~other", "~donottrack"), options)

202	205

203 # Also ignore domain negation of whitelists	206 # Also ignore domain negation of whitelists

204 if isexception:	207 if is_exception:

205 options = filter(lambda o: not o.startswith("domain=~"), options)	208 options = filter(lambda o: not o.startswith("domain=~"), options)

206	209

207 unsupported = filter(lambda o: o in ("other", "elemhide"), options)	210 unsupported = filter(lambda o: o in ("other", "elemhide"), options)

208 if unsupported and len(unsupported) == len(options):	211 if unsupported and len(unsupported) == len(options):

209 # The rule only applies to types that are not supported in MSIE	212 # The rule only applies to types that are not supported in MSIE

210 has_unsupported = True	213 has_unsupported = True

211 elif "donottrack" in options:	214 elif "donottrack" in options:

212 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options	215 # Do-Not-Track rules have to be removed even if $donottrack is combined with other options

213 has_unsupported = True	216 has_unsupported = True

214 elif "script" in options and len(options) == len(unsupported) + 1:	217 elif "script" in options and len(options) == len(unsupported) + 1:

215 # Mark rules that only apply to scripts for approximate conversion	218 # Mark rules that only apply to scripts for approximate conversion

216 requires_script = True	219 requires_script = True

217 elif len(options) > 0:	220 elif len(options) > 0:

218 # The rule has further options that aren't available in TPLs. For	221 # The rule has further options that aren't available in TPLs. For

219 # exception rules that aren't specific to a domain we ignore all	222 # exception rules that aren't specific to a domain we ignore all

220 # remaining options to avoid potential false positives. Other rules	223 # remaining options to avoid potential false positives. Other rules

221 # simply aren't included in the TPL file.	224 # simply aren't included in the TPL file.

222 if isexception:	225 if is_exception:

223 has_unsupported = any([o.startswith("domain=") for o in options])	226 has_unsupported = any([o.startswith("domain=") for o in options])

224 else:	227 else:

225 has_unsupported = True	228 has_unsupported = True

226	229

227 if has_unsupported:	230 if has_unsupported:

228 # Do not include filters with unsupported options	231 # Do not include filters with unsupported options

229 result.append("# " + origline)	232 result.append("# " + origline)

230 else:	233 else:

231 line = line.replace("^", "/") # Assume that separator placeholders mean slashes	234 line = line.replace("^", "/") # Assume that separator placeholders mean slashes

232	235

(...skipping 10 matching lines...) Expand all Loading...
243 # Remove anchors at the rule end	246 # Remove anchors at the rule end

244 line = re.sub(r"\|$", "", line)	247 line = re.sub(r"\|$", "", line)

245 # Remove unnecessary asterisks at the ends of lines	248 # Remove unnecessary asterisks at the ends of lines

246 line = re.sub(r"\*$", "", line)	249 line = re.sub(r"\*$", "", line)

247 # Emulate $script by appending *.js to the rule	250 # Emulate $script by appending *.js to the rule

248 if requires_script:	251 if requires_script:

249 line += "*.js"	252 line += "*.js"

250 if line.startswith("/*"):	253 if line.startswith("/*"):

251 line = line[2:]	254 line = line[2:]

252 if domain:	255 if domain:

253 line = "%sd %s %s" % ("+" if isexception else "-", domain, line)	256 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)

254 line = re.sub(r"\s+/$", "", line)	257 line = re.sub(r"\s+/$", "", line)

255 result.append(line)	258 result.append(line)

256 elif isexception:	259 elif is_exception:

257 # Exception rules without domains are unsupported	260 # Exception rules without domains are unsupported

258 result.append("# " + origline)	261 result.append("# " + origline)

259 else:	262 else:

260 result.append("- " + line)	263 result.append("- " + line)

261 save_file(path, "\n".join(result) + "\n")	264 save_file(filename, "\n".join(result) + "\n")

262	265

263 class FileSource:	266 class FileSource:

264 def __init__(self, dir):	267 def __init__(self, dir):

265 self._dir = dir	268 self._dir = dir

266 if os.path.exists(os.path.join(dir, ".hg")):	269 if os.path.exists(os.path.join(dir, ".hg")):

267 # This is a Mercurial repository, try updating	270 # This is a Mercurial repository, try updating

268 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])	271 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])

269	272

270 def get_path(self, filename):	273 def get_path(self, filename):

271 return os.path.join(self._dir, *filename.split("/"))	274 return os.path.join(self._dir, *filename.split("/"))

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
312	315

313 timeout = 30	316 timeout = 30

314 for option, value in opts:	317 for option, value in opts:

315 if option in ("-h", "--help"):	318 if option in ("-h", "--help"):

316 usage()	319 usage()

317 sys.exit()	320 sys.exit()

318 elif option in ("-t", "--timeout"):	321 elif option in ("-t", "--timeout"):

319 timeout = int(value)	322 timeout = int(value)

320	323

321 combine_subscriptions(sources, target_dir, timeout)	324 combine_subscriptions(sources, target_dir, timeout)

LEFT	RIGHT