sitescripts/subscriptions/combineSubscriptions.py - Issue 28037010: Improved generation of filter subscription files

Delta Between Two Patch Sets: sitescripts/subscriptions/combineSubscriptions.py

Issue 28037010: Improved generation of filter subscription files (Closed)

Left Patch Set: Fixed review comments Created Nov. 8, 2013, 3:05 p.m.

Right Patch Set: Different approach to atomic updates Created Nov. 11, 2013, 2:52 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # coding: utf-8	2 # coding: utf-8

3	3

4 # This file is part of the Adblock Plus web scripts,	4 # This file is part of the Adblock Plus web scripts,

5 # Copyright (C) 2006-2013 Eyeo GmbH	5 # Copyright (C) 2006-2013 Eyeo GmbH

6 #	6 #

7 # Adblock Plus is free software: you can redistribute it and/or modify	7 # Adblock Plus is free software: you can redistribute it and/or modify

8 # it under the terms of the GNU General Public License version 3 as	8 # it under the terms of the GNU General Public License version 3 as

9 # published by the Free Software Foundation.	9 # published by the Free Software Foundation.

10 #	10 #

11 # Adblock Plus is distributed in the hope that it will be useful,	11 # Adblock Plus is distributed in the hope that it will be useful,

12 # but WITHOUT ANY WARRANTY; without even the implied warranty of	12 # but WITHOUT ANY WARRANTY; without even the implied warranty of

13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 # GNU General Public License for more details.	14 # GNU General Public License for more details.

15 #	15 #

16 # You should have received a copy of the GNU General Public License	16 # You should have received a copy of the GNU General Public License

17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

18	18

19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64	19 import sys, os, re, subprocess, urllib2, time, traceback, codecs, hashlib, base64, tempfile

20 from getopt import getopt, GetoptError	20 from getopt import getopt, GetoptError

21	21

22 accepted_extensions = set([".txt"])	22 accepted_extensions = set([".txt"])

23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])	23 ignore = set(["Apache.txt", "CC-BY-SA.txt", "GPL.txt", "MPL.txt"])

24 verbatim = set(["COPYING"])	24 verbatim = set(["COPYING"])

25	25

26 def combine_subscriptions(sources, target_dir, timeout=30):	26 def combine_subscriptions(sources, target_dir, timeout=30, tempdir=None):

27 if not os.path.exists(target_dir):	27 if not os.path.exists(target_dir):

28 os.makedirs(target_dir, 0755)	28 os.makedirs(target_dir, 0755)

	29

	30 def save_file(filename, data):

	31 handle = tempfile.NamedTemporaryFile(mode="wb", dir=tempdir, delete=False)

	32 handle.write(data.encode("utf-8"))

	33 handle.close()

	34

	35 try:

	36 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", handle.name + ".gz", handle.name])

	37 except:

	38 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % handle.name

	39

	40 path = os.path.join(target_dir, filename)

	41 os.rename(handle.name, path)

	42 os.rename(handle.name + ".gz", path + ".gz")

29	43

30 known = set()	44 known = set()

31 for source_name, source in sources.iteritems():	45 for source_name, source in sources.iteritems():

32 for filename in source.list_top_level_files():	46 for filename in source.list_top_level_files():

33 if filename in ignore or filename.startswith("."):	47 if filename in ignore or filename.startswith("."):

34 continue	48 continue

35 if filename in verbatim:	49 if filename in verbatim:

36 process_verbatim_file(source, target_dir, filename)	50 process_verbatim_file(source, save_file, filename)

37 elif not os.path.splitext(filename)[1] in accepted_extensions:	51 elif not os.path.splitext(filename)[1] in accepted_extensions:

38 continue	52 continue

39 else:	53 else:

40 try:	54 try:

41 process_subscription_file(source_name, sources, target_dir, filename, timeout)	55 process_subscription_file(source_name, sources, save_file, filename, timeout)

42 except:	56 except:

43 print >>sys.stderr, 'Error processing subscription file "%s"' % filename	57 print >>sys.stderr, 'Error processing subscription file "%s"' % filename

44 traceback.print_exc()	58 traceback.print_exc()

45 print >>sys.stderr	59 print >>sys.stderr

46 known.add(os.path.splitext(filename)[0] + ".tpl")	60 known.add(os.path.splitext(filename)[0] + ".tpl")

47 known.add(os.path.splitext(filename)[0] + ".tpl.gz")	61 known.add(os.path.splitext(filename)[0] + ".tpl.gz")

48 known.add(filename)	62 known.add(filename)

49 known.add(filename + ".gz")	63 known.add(filename + ".gz")

50	64

51 for filename in os.listdir(target_dir):	65 for filename in os.listdir(target_dir):

52 if filename.startswith("."):	66 if filename.startswith("."):

53 continue	67 continue

54 if not filename in known:	68 if not filename in known:

55 os.remove(os.path.join(target_dir, filename))	69 os.remove(os.path.join(target_dir, filename))

56	70

57 def save_file(path, data):	71 def process_verbatim_file(source, save_file, filename):

58 handle = codecs.open(path, "wb", encoding="utf-8")	72 save_file(filename, source.read_file(filename))

59 handle.write(data)	73

60 handle.close()	74 def process_subscription_file(source_name, sources, save_file, filename, timeout):

61 try:

62 subprocess.check_output(["7za", "a", "-tgzip", "-mx=9", "-bd", "-mpass=5", path + ".gz", path])

63 except:

64 print >>sys.stderr, "Failed to compress file %s. Please ensure that p7zip is installed on the system." % path

65

66 def process_verbatim_file(source, target_dir, filename):

67 save_file(os.path.join(target_dir, filename), source.read_file(filename))

68

69 def process_subscription_file(source_name, sources, target_dir, filename, timeout):

70 source = sources[source_name]	75 source = sources[source_name]

71 lines = source.read_file(filename).splitlines()	76 lines = source.read_file(filename).splitlines()

72	77

73 header = ""	78 header = ""

74 if len(lines) > 0:	79 if len(lines) > 0:

75 header = lines.pop(0)	80 header = lines.pop(0)

76 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):	81 if not re.search(r"\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]", header, re.I):

77 raise Exception("This is not a valid Adblock Plus subscription file.")	82 raise Exception("This is not a valid Adblock Plus subscription file.")

78	83

79 lines = resolve_includes(source_name, sources, lines, timeout)	84 lines = resolve_includes(source_name, sources, lines, timeout)

80 seen = set(["checksum", "version"])	85 seen = set(["checksum", "version"])

81 def check_line(line):	86 def check_line(line):

82 if line == "":	87 if line == "":

83 return False	88 return False

84 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)	89 match = re.search(r"^\s*!\s*(Redirect|Homepage|Title|Checksum|Version)\s*:", line, re.M | re.I)

85 if not match:	90 if not match:

86 return True	91 return True

87 key = match.group(1).lower()	92 key = match.group(1).lower()

88 if key in seen:	93 if key in seen:

89 return False	94 return False

90 seen.add(key)	95 seen.add(key)

91 return True	96 return True

92 lines = filter(check_line, lines)	97 lines = filter(check_line, lines)

93	98

94 write_tpl(os.path.join(target_dir, os.path.splitext(filename)[0] + ".tpl"), lines)	99 write_tpl(save_file, os.path.splitext(filename)[0] + ".tpl", lines)

95	100

96 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))	101 lines.insert(0, "! Version: %s" % time.strftime("%Y%m%d%H%M", time.gmtime()))

97	102

98 checksum = hashlib.md5()	103 checksum = hashlib.md5()

99 checksum.update("\n".join([header] + lines).encode("utf-8"))	104 checksum.update("\n".join([header] + lines).encode("utf-8"))

100 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))	105 lines.insert(0, "! Checksum: %s" % base64.b64encode(checksum.digest()).rstrip("="))

101 lines.insert(0, header)	106 lines.insert(0, header)

102 save_file(os.path.join(target_dir, filename), "\n".join(lines))	107 save_file(filename, "\n".join(lines))

103	108

104 def resolve_includes(source_name, sources, lines, timeout, level=0):	109 def resolve_includes(source_name, sources, lines, timeout, level=0):

105 if level > 5:	110 if level > 5:

106 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")	111 raise Exception("There are too many nested includes, which is probably the result of a circular reference somewhere.")

107	112

108 result = []	113 result = []

109 for line in lines:	114 for line in lines:

110 match = re.search(r"^\s*%include\s+(.*)%\s*$", line)	115 match = re.search(r"^\s*%include\s+(.*)%\s*$", line)

111 if match:	116 if match:

112 filename = match.group(1)	117 filename = match.group(1)

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
150 result.extend(newlines)	155 result.extend(newlines)

151 else:	156 else:

152 if line.find("%timestamp%") >= 0:	157 if line.find("%timestamp%") >= 0:

153 if level == 0:	158 if level == 0:

154 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))	159 line = line.replace("%timestamp%", time.strftime("%d %b %Y %H:%M UTC", time.gmtime()))

155 else:	160 else:

156 line = ""	161 line = ""

157 result.append(line)	162 result.append(line)

158 return result	163 return result

159	164

160 def write_tpl(path, lines):	165 def write_tpl(save_file, filename, lines):

161 result = []	166 result = []

162 result.append("msFilterList")	167 result.append("msFilterList")

163 for line in lines:	168 for line in lines:

164 if re.search(r"^\s*!", line):	169 if re.search(r"^\s*!", line):

165 # This is a comment. Handle "Expires" comment in a special way, keep the rest.	170 # This is a comment. Handle "Expires" comment in a special way, keep the rest.

166 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)	171 match = re.search(r"\bExpires\s*(?::|after)\s*(\d+)\s*(h)?", line, re.I)

167 if match:	172 if match:

168 interval = int(match.group(1))	173 interval = int(match.group(1))

169 if match.group(2):	174 if match.group(2):

170 interval = int(interval / 24)	175 interval = int(interval / 24)

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
249 line = line[2:]	254 line = line[2:]

250 if domain:	255 if domain:

251 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)	256 line = "%sd %s %s" % ("+" if is_exception else "-", domain, line)

252 line = re.sub(r"\s+/$", "", line)	257 line = re.sub(r"\s+/$", "", line)

253 result.append(line)	258 result.append(line)

254 elif is_exception:	259 elif is_exception:

255 # Exception rules without domains are unsupported	260 # Exception rules without domains are unsupported

256 result.append("# " + origline)	261 result.append("# " + origline)

257 else:	262 else:

258 result.append("- " + line)	263 result.append("- " + line)

259 save_file(path, "\n".join(result) + "\n")	264 save_file(filename, "\n".join(result) + "\n")

260	265

261 class FileSource:	266 class FileSource:

262 def __init__(self, dir):	267 def __init__(self, dir):

263 self._dir = dir	268 self._dir = dir

264 if os.path.exists(os.path.join(dir, ".hg")):	269 if os.path.exists(os.path.join(dir, ".hg")):

265 # This is a Mercurial repository, try updating	270 # This is a Mercurial repository, try updating

266 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])	271 subprocess.call(["hg", "-q", "-R", dir, "pull", "--update"])

267	272

268 def get_path(self, filename):	273 def get_path(self, filename):

269 return os.path.join(self._dir, *filename.split("/"))	274 return os.path.join(self._dir, *filename.split("/"))

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
310	315

311 timeout = 30	316 timeout = 30

312 for option, value in opts:	317 for option, value in opts:

313 if option in ("-h", "--help"):	318 if option in ("-h", "--help"):

314 usage()	319 usage()

315 sys.exit()	320 sys.exit()

316 elif option in ("-t", "--timeout"):	321 elif option in ("-t", "--timeout"):

317 timeout = int(value)	322 timeout = int(value)

318	323

319 combine_subscriptions(sources, target_dir, timeout)	324 combine_subscriptions(sources, target_dir, timeout)

LEFT	RIGHT