sitescripts/content_blocker_lists/bin/generate_lists.py - Issue 29331148: Issue 3176 - Add metadata to content blocker lists

Delta Between Two Patch Sets: sitescripts/content_blocker_lists/bin/generate_lists.py

Issue 29331148: Issue 3176 - Add metadata to content blocker lists (Closed)

Left Patch Set: Created Nov. 27, 2015, 4:22 p.m.

Right Patch Set: Improved regexp Created Dec. 8, 2015, 3:31 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # coding: utf-8	2 # coding: utf-8

3	3

4 # This file is part of Adblock Plus <https://adblockplus.org/>,	4 # This file is part of Adblock Plus <https://adblockplus.org/>,

5 # Copyright (C) 2006-2015 Eyeo GmbH	5 # Copyright (C) 2006-2015 Eyeo GmbH

6 #	6 #

7 # Adblock Plus is free software: you can redistribute it and/or modify	7 # Adblock Plus is free software: you can redistribute it and/or modify

8 # it under the terms of the GNU General Public License version 3 as	8 # it under the terms of the GNU General Public License version 3 as

9 # published by the Free Software Foundation.	9 # published by the Free Software Foundation.

10 #	10 #

11 # Adblock Plus is distributed in the hope that it will be useful,	11 # Adblock Plus is distributed in the hope that it will be useful,

12 # but WITHOUT ANY WARRANTY; without even the implied warranty of	12 # but WITHOUT ANY WARRANTY; without even the implied warranty of

13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 # GNU General Public License for more details.	14 # GNU General Public License for more details.

15 #	15 #

16 # You should have received a copy of the GNU General Public License	16 # You should have received a copy of the GNU General Public License

17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

18	18

	19 from collections import OrderedDict

19 from contextlib import closing	20 from contextlib import closing

20 from datetime import datetime

21 import json	21 import json

22 import os	22 import os

23 from StringIO import StringIO

24 import subprocess	23 import subprocess

	24 import threading

	25 import time

25 import re	26 import re

26 import urllib2	27 import urllib2

27	28

28 from sitescripts.utils import get_config	29 from sitescripts.utils import get_config

29	30

30 config = dict(get_config().items("content_blocker_lists"))	31 config = dict(get_config().items("content_blocker_lists"))

31	32

32 def update_abp2blocklist():	33 def update_abp2blocklist():

33 with open(os.devnull, "w") as devnull:	34 with open(os.devnull, "w") as devnull:

34 abp2blocklist_path = config["abp2blocklist_path"]	35 abp2blocklist_path = config["abp2blocklist_path"]

35 if os.path.isdir(abp2blocklist_path):	36 if os.path.isdir(abp2blocklist_path):

36 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path),	37 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path),

37 stdout=devnull)	38 stdout=devnull)

38 else:	39 else:

39 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"],	40 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"],

40 abp2blocklist_path), stdout=devnull)	41 abp2blocklist_path), stdout=devnull)

41 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path,	42 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path,

42 stdout=devnull)	43 stdout=devnull)

43	44

44 def download_filter_list(url):	45 def download_filter_list(url):

45 filter_list = {}

46 with closing(urllib2.urlopen(url)) as response:	46 with closing(urllib2.urlopen(url)) as response:

47 filter_list["body"] = response.read()	47 body = response.read()

48 filter_list["header"] = parse_filter_list_header(filter_list["body"])	48 version = re.search(r"^(?:[^[!])\|^!\sVersion:\s(.+)$",

49 filter_list["header"]["url"] = url	49 body, re.MULTILINE).group(1)

50 return filter_list	50 return body, url, version

51	51

52 def parse_filter_list_header(filter_list):	52 def generate_metadata(filter_lists, expires):

53 field_re = re.compile(r"^!\s([^:]+):\s(.+)$")	53 metadata = OrderedDict((

54 with closing(StringIO(filter_list)) as stream:	54 ("version", time.strftime("%Y%m%d%H%M", time.gmtime())),
Sebastian Noack 2015/11/30 13:55:49
Never mind closing a StringIO. It doesn't do anything that wouldn't happen anyway. There are no resources associated with a StringIO that need to be released. The close method merely exists for compatibility.

Wait... why do you use a StringIO in the first place? How about simply using a multi-line regexp?

But wait again... why do you parse the filter list headers in the first place? We don't need any of this data in the generated block list.

kzar 2015/11/30 15:13:11
We need the Version field, but otherwise Done.

Sebastian Noack 2015/11/30 15:49:19
Well, you set the version field based on the current timestamp, so there is no need to parse the source filter lists. Or am I missing something?

kzar 2015/11/30 17:06:00
That's the version for the block list; in the sources section we keep a record of the version of each filter list.

Felix Dahlke 2015/12/01 08:32:56
You still won't have to parse the header of the files; you can read the version from the "ABP-Notification-Version" HTTP header in the response.

kzar 2015/12/01 12:13:38
I don't see that header present?

curl -I https://easylist-downloads.adblockplus.org/easylist_noadult.txt

HTTP/1.1 200 OK
Server: nginx
Date: Tue, 01 Dec 2015 11:24:23 GMT
Content-Type: text/plain; charset=utf-8
Content-Length: 1532799
Last-Modified: Tue, 01 Dec 2015 11:20:46 GMT
Connection: close
ETag: "565d828e-17637f"
Strict-Transport-Security: max-age=31536000
Accept-Ranges: bytes

Felix Dahlke 2015/12/01 14:04:19
Ouch, big mixup on my end, we only have this for notifications at this point... Yes, I'm afraid there's no other way than to parse the file header then.
55 header = {}	55 ("expires", expires),

56 next(stream)	56 ("sources", [])

57 for line in stream:	57 ))

58 match = field_re.search(line)	58 for body, url, version in filter_lists:

59 if match:	59 metadata["sources"].append({"url": url, "version": version})

60 header[match.group(1)] = match.group(2)

61 else:
Sebastian Noack 2015/11/30 13:55:49
Nit: If you negate the logic you don't need an else-block:

    if not match:
        break
    header[match.group(1)] = match.group(2)
62 break

63 return header

64

65 def generate_metadata(filter_lists, expires="4 days"):
kzar 2015/11/27 16:28:11
It is unclear where the expires value for content blocking lists is supposed to come from. I've started a discussion about that in the issue, but in the meantime I've just hard-coded it to "4 days".

Sebastian Noack 2015/11/30 13:55:49
The expiration interval should be configured in sitescripts.ini.

kzar 2015/11/30 15:13:11
Done.
66 metadata = {

67 "sources": [],

68 "version": datetime.utcnow().strftime("%Y%m%d%H%M"),

69 "expires": expires

70 }

71 for filter_list in filter_lists:

72 metadata["sources"].append({ k.lower(): filter_list["header"][k]

73 for k in ["url", "Version"]})

74 return metadata	60 return metadata

75	61

76 def write_block_list(filter_lists, path):	62 def pipe_in(process, filter_lists):

77 metadata = generate_metadata(filter_lists)	63 try:
kzar 2015/11/27 16:28:11
I'm doing it this way to avoid having to load the block list into memory, parse it as JSON again and then mutate it to add the metadata. I realise it's kind of ugly though; I'm open to either approach.

Sebastian Noack 2015/11/30 13:55:49
We don't have to care too much about memory consumption here. I would be interested in how much the difference is though, to get an idea of whether it's even worth considering a more complex solution.

kzar 2015/11/30 15:13:11
Done.
78 header = json.dumps(metadata, indent=2).rsplit("}", 1)[0].rstrip()	64 for body, _, _ in filter_lists:

79 header += ',\n "rules": '	65 print >>process.stdin, body

	66 finally:

	67 process.stdin.close()

	68

	69 def write_block_list(filter_lists, path, expires):

	70 block_list = generate_metadata(filter_lists, expires)

	71 process = subprocess.Popen(("node", "abp2blocklist.js"),

	72 cwd=config["abp2blocklist_path"],

	73 stdin=subprocess.PIPE, stdout=subprocess.PIPE)

	74 threading.Thread(target=pipe_in, args=(process, filter_lists)).start()

	75 block_list["rules"] = json.load(process.stdout)

	76 if process.wait():

	77 raise Exception("abp2blocklist returned %s" % process.returncode)

	78

80 with open(path, "wb") as destination_file:	79 with open(path, "wb") as destination_file:

81 destination_file.write(header)	80 json.dump(block_list, destination_file, indent=2, separators=(",", ": "))

82 destination_file.flush()
Sebastian Noack 2015/11/30 13:55:49
Any particular reason you flush the file here?
83 process = subprocess.Popen(("node", "abp2blocklist.js"),

84 cwd=config["abp2blocklist_path"],

85 stdin=subprocess.PIPE,

86 stdout=destination_file)

87 try:

88 for filter_list in filter_lists:

89 print >>process.stdin, filter_list["body"]

90 finally:

91 process.stdin.close()

92 process.wait()

93 print >>destination_file, "}"

94

95 if process.returncode:

96 raise Exception("abp2blocklist returned %s" % process.returncode)

97	81

98 if __name__ == "__main__":	82 if __name__ == "__main__":

99 update_abp2blocklist()	83 update_abp2blocklist()

100	84

101 easylist = download_filter_list(config["easylist_url"])	85 easylist = download_filter_list(config["easylist_url"])

102 exceptionrules = download_filter_list(config["exceptionrules_url"])	86 exceptionrules = download_filter_list(config["exceptionrules_url"])

103	87

104 write_block_list([easylist], config["easylist_content_blocker_path"])	88 write_block_list([easylist],

	89 config["easylist_content_blocker_path"],

	90 config["easylist_content_blocker_expires"])

105 write_block_list([easylist, exceptionrules],	91 write_block_list([easylist, exceptionrules],

106 config["combined_content_blocker_path"])	92 config["combined_content_blocker_path"],

	93 config["combined_content_blocker_expires"])

LEFT	RIGHT