sitescripts/content_blocker_lists/bin/generate_lists.py - Issue 29331148: Issue 3176 - Add metadata to content blocker lists

Side by Side Diff: sitescripts/content_blocker_lists/bin/generate_lists.py

Issue 29331148: Issue 3176 - Add metadata to content blocker lists (Closed)

Patch Set: Created Nov. 27, 2015, 4:22 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # coding: utf-8	2 # coding: utf-8

3	3

4 # This file is part of Adblock Plus <https://adblockplus.org/>,	4 # This file is part of Adblock Plus <https://adblockplus.org/>,

5 # Copyright (C) 2006-2015 Eyeo GmbH	5 # Copyright (C) 2006-2015 Eyeo GmbH

6 #	6 #

7 # Adblock Plus is free software: you can redistribute it and/or modify	7 # Adblock Plus is free software: you can redistribute it and/or modify

8 # it under the terms of the GNU General Public License version 3 as	8 # it under the terms of the GNU General Public License version 3 as

9 # published by the Free Software Foundation.	9 # published by the Free Software Foundation.

10 #	10 #

11 # Adblock Plus is distributed in the hope that it will be useful,	11 # Adblock Plus is distributed in the hope that it will be useful,

12 # but WITHOUT ANY WARRANTY; without even the implied warranty of	12 # but WITHOUT ANY WARRANTY; without even the implied warranty of

13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 # GNU General Public License for more details.	14 # GNU General Public License for more details.

15 #	15 #

16 # You should have received a copy of the GNU General Public License	16 # You should have received a copy of the GNU General Public License

17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	17 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

18	18

	19 from contextlib import closing

	20 from datetime import datetime

	21 import json

19 import os	22 import os

	23 from StringIO import StringIO

20 import subprocess	24 import subprocess

	25 import re

21 import urllib2	26 import urllib2

22	27

23 from sitescripts.utils import get_config	28 from sitescripts.utils import get_config

24	29

25 def _update_abp2blocklist():	30 config = dict(get_config().items("content_blocker_lists"))

	31

	32 def update_abp2blocklist():

26 with open(os.devnull, "w") as devnull:	33 with open(os.devnull, "w") as devnull:

27 config = get_config()	34 abp2blocklist_path = config["abp2blocklist_path"]

28 abp2blocklist_path = config.get("content_blocker_lists",

29 "abp2blocklist_path")

30 if os.path.isdir(abp2blocklist_path):	35 if os.path.isdir(abp2blocklist_path):

31 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path),	36 subprocess.check_call(("hg", "pull", "-u", "-R", abp2blocklist_path),

32 stdout=devnull)	37 stdout=devnull)

33 else:	38 else:

34 abp2blocklist_url = config.get("content_blocker_lists",	39 subprocess.check_call(("hg", "clone", config["abp2blocklist_url"],

35 "abp2blocklist_url")

36 subprocess.check_call(("hg", "clone", abp2blocklist_url,

37 abp2blocklist_path), stdout=devnull)	40 abp2blocklist_path), stdout=devnull)

38 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path,	41 subprocess.check_call(("npm", "install"), cwd=abp2blocklist_path,

39 stdout=devnull)	42 stdout=devnull)

40	43

41 def _download(url_key):	44 def download_filter_list(url):

42 url = get_config().get("content_blocker_lists", url_key)	45 filter_list = {}

43 response = urllib2.urlopen(url)	46 with closing(urllib2.urlopen(url)) as response:

44 try:	47 filter_list["body"] = response.read()

45 return response.read()	48 filter_list["header"] = parse_filter_list_header(filter_list["body"])

46 finally:	49 filter_list["header"]["url"] = url

47 response.close()	50 return filter_list

48	51

49 def _convert_filter_list(sources, destination_path_key):	52 def parse_filter_list_header(filter_list):

50 config = get_config()	53 field_re = re.compile(r"^!\s([^:]+):\s(.+)$")

51 destination_path = config.get("content_blocker_lists", destination_path_key)	54 with closing(StringIO(filter_list)) as stream:
	Sebastian Noack 2015/11/30 13:55:49 Never mind closing a StringIO. It doesn't do anything that wouldn't happen anyway. There are no resources that need to be released associated with a StringIO. The close method merely exists for compatibility. Wait... why do you use a StringIO in the first place? How about simply using a multi-line regexp? But wait again... why do you parse the filter list headers in the first place? We don't need any of this data in the generated block list.
	kzar 2015/11/30 15:13:11 We need the Version field, but otherwise Done.
	Sebastian Noack 2015/11/30 15:49:19 Well, you set the version field based on the current timestamp. So there is no need to parse the source filter lists. Or am I missing something?
	kzar 2015/11/30 17:06:00 That's the version for the block list; in the sources section we keep a record of the version of each filter list.
	Felix Dahlke 2015/12/01 08:32:56 You still won't have to parse the header of the files; you can read the version from the "ABP-Notification-Version" HTTP header in the response.
	kzar 2015/12/01 12:13:38 I don't see that header present?
	    curl -I https://easylist-downloads.adblockplus.org/easylist_noadult.txt
	    HTTP/1.1 200 OK
	    Server: nginx
	    Date: Tue, 01 Dec 2015 11:24:23 GMT
	    Content-Type: text/plain; charset=utf-8
	    Content-Length: 1532799
	    Last-Modified: Tue, 01 Dec 2015 11:20:46 GMT
	    Connection: close
	    ETag: "565d828e-17637f"
	    Strict-Transport-Security: max-age=31536000
	    Accept-Ranges: bytes
	Felix Dahlke 2015/12/01 14:04:19 Ouch, big mixup on my end, we only have this for notifications at this point... Yes, I'm afraid there's no other way than to parse the file header then.
52 with open(destination_path, "wb") as destination_file:	55 header = {}

53 abp2blocklist_path = config.get("content_blocker_lists",	56 next(stream)

54 "abp2blocklist_path")	57 for line in stream:

	58 match = field_re.search(line)

	59 if match:

	60 header[match.group(1)] = match.group(2)

	61 else:
	Sebastian Noack 2015/11/30 13:55:49 Nit: If you negate the logic you don't need an else-block: `if not match: break`, followed by `header[match.group(1)] = match.group(2)`.
	62 break

	63 return header

	64

	65 def generate_metadata(filter_lists, expires="4 days"):
	kzar 2015/11/27 16:28:11 It is unclear where the expires value for content blocking lists is supposed to come from. I've started a discussion about that in the issue, but in the meantime I've just hard-coded it to "4 days".
	Sebastian Noack 2015/11/30 13:55:49 The expiration interval should be configured in sitescripts.ini.
	kzar 2015/11/30 15:13:11 Done.
	66 metadata = {

	67 "sources": [],

	68 "version": datetime.utcnow().strftime("%Y%m%d%H%M"),

	69 "expires": expires

	70 }

	71 for filter_list in filter_lists:

	72 metadata["sources"].append({ k.lower(): filter_list["header"][k]

	73 for k in ["url", "Version"]})

	74 return metadata

	75

	76 def write_block_list(filter_lists, path):

	77 metadata = generate_metadata(filter_lists)
	kzar 2015/11/27 16:28:11 I'm doing it this way to avoid having to load the block list into memory, parse it as JSON again and then mutate it to add the metadata. I realise it's kind of ugly though; I'm open to either approach.
	Sebastian Noack 2015/11/30 13:55:49 We don't have to care too much about memory consumption here. I would be interested in how much the difference is though, to get an idea whether it's even worth considering a more complex solution.
	kzar 2015/11/30 15:13:11 Done.
	78 header = json.dumps(metadata, indent=2).rsplit("}", 1)[0].rstrip()

	79 header += ',\n "rules": '

	80 with open(path, "wb") as destination_file:

	81 destination_file.write(header)

	82 destination_file.flush()
	Sebastian Noack 2015/11/30 13:55:49 Any particular reason you flush the file here?
55 process = subprocess.Popen(("node", "abp2blocklist.js"),	83 process = subprocess.Popen(("node", "abp2blocklist.js"),

56 cwd=abp2blocklist_path, stdin=subprocess.PIPE,	84 cwd=config["abp2blocklist_path"],

	85 stdin=subprocess.PIPE,

57 stdout=destination_file)	86 stdout=destination_file)

58 try:	87 try:

59 for source in sources:	88 for filter_list in filter_lists:

60 print >>process.stdin, source	89 print >>process.stdin, filter_list["body"]

61 finally:	90 finally:

62 process.stdin.close()	91 process.stdin.close()

63 process.wait()	92 process.wait()

	93 print >>destination_file, "}"

64	94

65 if process.returncode:	95 if process.returncode:

66 raise Exception("abp2blocklist returned %s" % process.returncode)	96 raise Exception("abp2blocklist returned %s" % process.returncode)

67	97

68 if __name__ == "__main__":	98 if __name__ == "__main__":

69 _update_abp2blocklist()	99 update_abp2blocklist()

70	100

71 easylist = _download("easylist_url")	101 easylist = download_filter_list(config["easylist_url"])

72 exceptionrules = _download("exceptionrules_url")	102 exceptionrules = download_filter_list(config["exceptionrules_url"])

73	103

74 _convert_filter_list([easylist], "easylist_content_blocker_path")	104 write_block_list([easylist], config["easylist_content_blocker_path"])

75 _convert_filter_list([easylist, exceptionrules],	105 write_block_list([easylist, exceptionrules],

76 "combined_content_blocker_path")	106 config["combined_content_blocker_path"])

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »