Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/content_blocker_lists/bin/generate_lists.py

Issue 29331148: Issue 3176 - Add metadata to content blocker lists (Closed)
Left Patch Set: Created Nov. 27, 2015, 4:22 p.m.
Right Patch Set: Improved regexp — Created Dec. 8, 2015, 3:31 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « .sitescripts.example ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 3
4 # This file is part of Adblock Plus <https://adblockplus.org/>, 4 # This file is part of Adblock Plus <https://adblockplus.org/>,
5 # Copyright (C) 2006-2015 Eyeo GmbH 5 # Copyright (C) 2006-2015 Eyeo GmbH
6 # 6 #
7 # Adblock Plus is free software: you can redistribute it and/or modify 7 # Adblock Plus is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License version 3 as 8 # it under the terms of the GNU General Public License version 3 as
9 # published by the Free Software Foundation. 9 # published by the Free Software Foundation.
10 # 10 #
11 # Adblock Plus is distributed in the hope that it will be useful, 11 # Adblock Plus is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details. 14 # GNU General Public License for more details.
15 # 15 #
16 # You should have received a copy of the GNU General Public License 16 # You should have received a copy of the GNU General Public License
17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
18 18
19 from collections import OrderedDict
19 from contextlib import closing 20 from contextlib import closing
20 from datetime import datetime
21 import json 21 import json
22 import os 22 import os
23 from StringIO import StringIO
24 import subprocess 23 import subprocess
24 import threading
25 import time
25 import re 26 import re
26 import urllib2 27 import urllib2
27 28
28 from sitescripts.utils import get_config 29 from sitescripts.utils import get_config
29 30
30 config = dict(get_config().items("content_blocker_lists")) 31 config = dict(get_config().items("content_blocker_lists"))
31 32
32 def update_abp2blocklist(): 33 def update_abp2blocklist():
33 with open(os.devnull, "w") as devnull: 34 with open(os.devnull, "w") as devnull:
34 abp2blocklist_path = config["abp2blocklist_path"] 35 abp2blocklist_path = config["abp2blocklist_path"]
35 if os.path.isdir(abp2blocklist_path): 36 if os.path.isdir(abp2blocklist_path):
36 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path), 37 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path),
37 stdout=devnull) 38 stdout=devnull)
38 else: 39 else:
39 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"], 40 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"],
40 abp2blocklist_path), stdout=devnull) 41 abp2blocklist_path), stdout=devnull)
41 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path, 42 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path,
42 stdout=devnull) 43 stdout=devnull)
43 44
44 def download_filter_list(url): 45 def download_filter_list(url):
45 filter_list = {}
46 with closing(urllib2.urlopen(url)) as response: 46 with closing(urllib2.urlopen(url)) as response:
47 filter_list["body"] = response.read() 47 body = response.read()
48 filter_list["header"] = parse_filter_list_header(filter_list["body"]) 48 version = re.search(r"^(?:[^[!])|^!\s*Version:\s*(.+)$",
49 filter_list["header"]["url"] = url 49 body, re.MULTILINE).group(1)
50 return filter_list 50 return body, url, version
51 51
52 def parse_filter_list_header(filter_list): 52 def generate_metadata(filter_lists, expires):
53 field_re = re.compile(r"^!\s*([^:]+):\s*(.+)$") 53 metadata = OrderedDict((
54 with closing(StringIO(filter_list)) as stream: 54 ("version", time.strftime("%Y%m%d%H%M", time.gmtime())),
Sebastian Noack 2015/11/30 13:55:49 Never mind closing a StringIO. It doesn't do anyth
kzar 2015/11/30 15:13:11 We need the Version field, but otherwise Done.
Sebastian Noack 2015/11/30 15:49:19 Well, you set the version field based on the curre
kzar 2015/11/30 17:06:00 That's the version for the block list, in the sour
Felix Dahlke 2015/12/01 08:32:56 You still won't have to parse the header of the fi
kzar 2015/12/01 12:13:38 I don't see that header present? curl -I https:/
Felix Dahlke 2015/12/01 14:04:19 Ouch, big mixup on my end, we only have this for n
55 header = {} 55 ("expires", expires),
56 next(stream) 56 ("sources", [])
57 for line in stream: 57 ))
58 match = field_re.search(line) 58 for body, url, version in filter_lists:
59 if match: 59 metadata["sources"].append({"url": url, "version": version})
60 header[match.group(1)] = match.group(2)
61 else:
Sebastian Noack 2015/11/30 13:55:49 Nit: If you negate the logic you don't need an els
62 break
63 return header
64
65 def generate_metadata(filter_lists, expires="4 days"):
kzar 2015/11/27 16:28:11 It is unclear where the expires value for content
Sebastian Noack 2015/11/30 13:55:49 The expiration interval should be configured in si
kzar 2015/11/30 15:13:11 Done.
66 metadata = {
67 "sources": [],
68 "version": datetime.utcnow().strftime("%Y%m%d%H%M"),
69 "expires": expires
70 }
71 for filter_list in filter_lists:
72 metadata["sources"].append({ k.lower(): filter_list["header"][k]
73 for k in ["url", "Version"]})
74 return metadata 60 return metadata
75 61
76 def write_block_list(filter_lists, path): 62 def pipe_in(process, filter_lists):
77 metadata = generate_metadata(filter_lists) 63 try:
kzar 2015/11/27 16:28:11 I'm doing it this way to avoid having to load the
Sebastian Noack 2015/11/30 13:55:49 We don't have to care too much about memory consum
kzar 2015/11/30 15:13:11 Done.
78 header = json.dumps(metadata, indent=2).rsplit("}", 1)[0].rstrip() 64 for body, _, _ in filter_lists:
79 header += ',\n "rules": ' 65 print >>process.stdin, body
66 finally:
67 process.stdin.close()
68
69 def write_block_list(filter_lists, path, expires):
70 block_list = generate_metadata(filter_lists, expires)
71 process = subprocess.Popen(("node", "abp2blocklist.js"),
72 cwd=config["abp2blocklist_path"],
73 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
74 threading.Thread(target=pipe_in, args=(process, filter_lists)).start()
75 block_list["rules"] = json.load(process.stdout)
76 if process.wait():
77 raise Exception("abp2blocklist returned %s" % process.returncode)
78
80 with open(path, "wb") as destination_file: 79 with open(path, "wb") as destination_file:
81 destination_file.write(header) 80 json.dump(block_list, destination_file, indent=2, separators=(",", ": "))
82 destination_file.flush()
Sebastian Noack 2015/11/30 13:55:49 Any particular reason you flush the file here?
83 process = subprocess.Popen(("node", "abp2blocklist.js"),
84 cwd=config["abp2blocklist_path"],
85 stdin=subprocess.PIPE,
86 stdout=destination_file)
87 try:
88 for filter_list in filter_lists:
89 print >>process.stdin, filter_list["body"]
90 finally:
91 process.stdin.close()
92 process.wait()
93 print >>destination_file, "}"
94
95 if process.returncode:
96 raise Exception("abp2blocklist returned %s" % process.returncode)
97 81
98 if __name__ == "__main__": 82 if __name__ == "__main__":
99 update_abp2blocklist() 83 update_abp2blocklist()
100 84
101 easylist = download_filter_list(config["easylist_url"]) 85 easylist = download_filter_list(config["easylist_url"])
102 exceptionrules = download_filter_list(config["exceptionrules_url"]) 86 exceptionrules = download_filter_list(config["exceptionrules_url"])
103 87
104 write_block_list([easylist], config["easylist_content_blocker_path"]) 88 write_block_list([easylist],
89 config["easylist_content_blocker_path"],
90 config["easylist_content_blocker_expires"])
105 write_block_list([easylist, exceptionrules], 91 write_block_list([easylist, exceptionrules],
106 config["combined_content_blocker_path"]) 92 config["combined_content_blocker_path"],
93 config["combined_content_blocker_expires"])
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld