Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/content_blocker_lists/bin/generate_lists.py

Issue 29331148: Issue 3176 - Add metadata to content blocker lists (Closed)
Left Patch Set: Addressed further feedback Created Nov. 30, 2015, 5:05 p.m.
Right Patch Set: Improved regexp Created Dec. 8, 2015, 3:31 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « .sitescripts.example ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 3
4 # This file is part of Adblock Plus <https://adblockplus.org/>, 4 # This file is part of Adblock Plus <https://adblockplus.org/>,
5 # Copyright (C) 2006-2015 Eyeo GmbH 5 # Copyright (C) 2006-2015 Eyeo GmbH
6 # 6 #
7 # Adblock Plus is free software: you can redistribute it and/or modify 7 # Adblock Plus is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License version 3 as 8 # it under the terms of the GNU General Public License version 3 as
9 # published by the Free Software Foundation. 9 # published by the Free Software Foundation.
10 # 10 #
11 # Adblock Plus is distributed in the hope that it will be useful, 11 # Adblock Plus is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details. 14 # GNU General Public License for more details.
15 # 15 #
16 # You should have received a copy of the GNU General Public License 16 # You should have received a copy of the GNU General Public License
17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
18 18
19 from collections import OrderedDict 19 from collections import OrderedDict
20 from contextlib import closing 20 from contextlib import closing
21 from datetime import datetime
22 import json 21 import json
23 import os 22 import os
24 import subprocess 23 import subprocess
25 import thread 24 import threading
25 import time
26 import re 26 import re
27 import urllib2 27 import urllib2
28 28
29 from sitescripts.utils import get_config 29 from sitescripts.utils import get_config
30 30
31 config = dict(get_config().items("content_blocker_lists")) 31 config = dict(get_config().items("content_blocker_lists"))
32 32
33 def update_abp2blocklist(): 33 def update_abp2blocklist():
Felix Dahlke 2015/12/01 08:43:09 These functions were prefixed with an underscore f
Sebastian Noack 2015/12/01 10:18:39 Well, one could argue that this is less a module (
kzar 2015/12/01 12:13:38 I would prefer to leave them off, they don't reall
Felix Dahlke 2015/12/01 14:04:19 I would argue that this is still a module - this s
Sebastian Noack 2015/12/07 12:38:24 Well, technically every piece of code is part of a
Felix Dahlke 2015/12/08 06:51:24 Wladimir's newer code in Sitescripts also uses tho
34 with open(os.devnull, "w") as devnull: 34 with open(os.devnull, "w") as devnull:
35 abp2blocklist_path = config["abp2blocklist_path"] 35 abp2blocklist_path = config["abp2blocklist_path"]
36 if os.path.isdir(abp2blocklist_path): 36 if os.path.isdir(abp2blocklist_path):
37 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path), 37 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path),
38 stdout=devnull) 38 stdout=devnull)
39 else: 39 else:
40 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"], 40 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"],
41 abp2blocklist_path), stdout=devnull) 41 abp2blocklist_path), stdout=devnull)
42 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path, 42 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path,
43 stdout=devnull) 43 stdout=devnull)
44 44
45 def download_filter_list(url): 45 def download_filter_list(url):
46 filter_list = {}
47 with closing(urllib2.urlopen(url)) as response: 46 with closing(urllib2.urlopen(url)) as response:
48 filter_list["body"] = response.read() 47 body = response.read()
49 filter_list["header"] = parse_filter_list_header(filter_list["body"]) 48 version = re.search(r"^(?:[^[!])|^!\s*Version:\s*(.+)$",
50 filter_list["header"]["url"] = url 49 body, re.MULTILINE).group(1)
51 return filter_list 50 return body, url, version
Sebastian Noack 2015/12/01 11:01:48 I think it would be simpler if you just return a t
kzar 2015/12/01 12:13:38 Done.
52
53 def parse_filter_list_header(filter_list):
54 body_start = re.search(r"^[^![]", filter_list, re.MULTILINE).start()
55 field_re = re.compile(r"^!\s*([^:\s]+):\s*(.+)$", re.MULTILINE)
56 return { match.group(1): match.group(2)
57 for match in field_re.finditer(filter_list, 0, body_start) }
58 51
59 def generate_metadata(filter_lists, expires): 52 def generate_metadata(filter_lists, expires):
60 metadata = OrderedDict(( 53 metadata = OrderedDict((
61 ("version", datetime.utcnow().strftime("%Y%m%d%H%M")), 54 ("version", time.strftime("%Y%m%d%H%M", time.gmtime())),
Felix Dahlke 2015/12/01 08:43:08 FWIW, we're using `time.strftime("%Y%m%d%H%M", tim
Sebastian Noack 2015/12/01 10:18:39 I tend to agree, creating a datetime object is unn
kzar 2015/12/01 12:13:39 Done.
62 ("expires", expires), 55 ("expires", expires),
63 ("sources", []) 56 ("sources", [])
64 )) 57 ))
65 for filter_list in filter_lists: 58 for body, url, version in filter_lists:
66 metadata["sources"].append({ k.lower(): filter_list["header"][k] 59 metadata["sources"].append({"url": url, "version": version})
67 for k in ["url", "Version"]})
Felix Dahlke 2015/12/01 08:43:08 Nit: Sebastian convinced me a while ago that tuple
Sebastian Noack 2015/12/01 10:18:39 Well, frankly, I don't think that it matters in th
Felix Dahlke 2015/12/01 10:26:29 Wouldn't insist either.
Sebastian Noack 2015/12/01 10:38:09 For reference, I found a quite interesting answer
kzar 2015/12/01 12:13:39 Acknowledged.
68 return metadata 60 return metadata
61
62 def pipe_in(process, filter_lists):
63 try:
64 for body, _, _ in filter_lists:
65 print >>process.stdin, body
66 finally:
67 process.stdin.close()
69 68
70 def write_block_list(filter_lists, path, expires): 69 def write_block_list(filter_lists, path, expires):
71 block_list = generate_metadata(filter_lists, expires) 70 block_list = generate_metadata(filter_lists, expires)
72 process = subprocess.Popen(("node", "abp2blocklist.js"), 71 process = subprocess.Popen(("node", "abp2blocklist.js"),
73 cwd=config["abp2blocklist_path"], 72 cwd=config["abp2blocklist_path"],
74 stdin=subprocess.PIPE, stdout=subprocess.PIPE) 73 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
75 def pipe_in(process): 74 threading.Thread(target=pipe_in, args=(process, filter_lists)).start()
Sebastian Noack 2015/12/01 10:47:14 Nit: This is inconsistent. You pass in the process
kzar 2015/12/01 12:13:38 Done.
76 try:
77 for filter_list in filter_lists:
78 print >>process.stdin, filter_list["body"]
79 finally:
80 process.stdin.close()
81 process.wait()
82
83 thread.start_new_thread(pipe_in, (process,))
Sebastian Noack 2015/12/01 10:47:14 Please use the high-level threading module instead
kzar 2015/12/01 12:13:38 Done.
84 block_list["rules"] = json.load(process.stdout) 75 block_list["rules"] = json.load(process.stdout)
85 76 if process.wait():
86 if process.returncode:
87 raise Exception("abp2blocklist returned %s" % process.returncode) 77 raise Exception("abp2blocklist returned %s" % process.returncode)
88 78
89 with open(path, "wb") as destination_file: 79 with open(path, "wb") as destination_file:
90 json.dump(block_list, destination_file, indent=2, separators=(",", ": ")) 80 json.dump(block_list, destination_file, indent=2, separators=(",", ": "))
91 81
92 if __name__ == "__main__": 82 if __name__ == "__main__":
93 update_abp2blocklist() 83 update_abp2blocklist()
94 84
95 easylist = download_filter_list(config["easylist_url"]) 85 easylist = download_filter_list(config["easylist_url"])
96 exceptionrules = download_filter_list(config["exceptionrules_url"]) 86 exceptionrules = download_filter_list(config["exceptionrules_url"])
97 87
98 write_block_list([easylist], 88 write_block_list([easylist],
Felix Dahlke 2015/12/01 08:43:08 Nit: "block list" is highly ambiguous, we often us
Sebastian Noack 2015/12/01 10:18:39 Well, after all, the program called here is also c
Felix Dahlke 2015/12/01 10:26:29 Good point, let's really leave it alone.
99 config["easylist_content_blocker_path"], 89 config["easylist_content_blocker_path"],
100 config["easylist_content_blocker_expires"]) 90 config["easylist_content_blocker_expires"])
101 write_block_list([easylist, exceptionrules], 91 write_block_list([easylist, exceptionrules],
102 config["combined_content_blocker_path"], 92 config["combined_content_blocker_path"],
103 config["combined_content_blocker_expires"]) 93 config["combined_content_blocker_expires"])
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld