abp/filters/renderer.py - Issue 29845767: Issue 6685 - Offer incremental filter list downloads

Delta Between Two Patch Sets: abp/filters/renderer.py

Issue 29845767: Issue 6685 - Offer incremental filter list downloads (Closed) Base URL: https://hg.adblockplus.org/python-abp/

Left Patch Set: Use namedtuple filter list objects instead of strings Created Aug. 9, 2018, 7:26 p.m.

Right Patch Set: Address comments on PS8 Created Aug. 30, 2018, 5:37 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 """Combine filter list fragments to produce filter lists."""	16 """Combine filter list fragments to produce filter lists."""

17	17

18 from __future__ import unicode_literals	18 from __future__ import unicode_literals

19	19

20 import base64

21 import hashlib

22 import itertools	20 import itertools

23 import logging	21 import logging

24 import time	22 import time

25	23

26 from .parser import parse_filterlist, Comment, Metadata	24 from .parser import parse_filterlist, Comment, Metadata

27 from .sources import NotFound	25 from .sources import NotFound

28	26

29 __all__ = ['IncludeError', 'MissingHeader', 'render_filterlist']	27 __all__ = ['IncludeError', 'MissingHeader', 'render_filterlist', 'render_diff']
Vasily Kuznetsov 2018/08/17 10:30:54 render_diff() should probably become a public export here.
rhowell 2018/08/20 18:21:27 Done.
30	28

31 _logger = logging.getLogger(__name__)	29 _logger = logging.getLogger(__name__)

32	30

33	31

34 class IncludeError(Exception):	32 class IncludeError(Exception):

35 """Error in processing include instruction."""	33 """Error in processing include instruction."""

36	34

37 def __init__(self, error, stack):	35 def __init__(self, error, stack):

38 stack_str = ' from '.join(map("'{}'".format, reversed(stack)))	36 stack_str = ' from '.join(map("'{}'".format, reversed(stack)))

39 if stack_str:	37 if stack_str:

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
119 """Insert metadata comment with version (a.k.a. date)."""	117 """Insert metadata comment with version (a.k.a. date)."""

120 first_line, rest = _first_and_rest(lines)	118 first_line, rest = _first_and_rest(lines)

121 version = Metadata('Version', time.strftime('%Y%m%d%H%M', time.gmtime()))	119 version = Metadata('Version', time.strftime('%Y%m%d%H%M', time.gmtime()))

122 return itertools.chain([first_line, version], rest)	120 return itertools.chain([first_line, version], rest)

123	121

124	122

125 def _remove_duplicates(lines):	123 def _remove_duplicates(lines):

126 """Remove duplicate metadata and headers."""	124 """Remove duplicate metadata and headers."""

127 # Always remove checksum -- a checksum coming from a fragment	125 # Always remove checksum -- a checksum coming from a fragment

128 # will not match for the rendered list.	126 # will not match for the rendered list.

129 seen = {'Checksum'}	127 seen = {'checksum'}

130 for i, line in enumerate(lines):	128 for i, line in enumerate(lines):

131 if line.type == 'metadata':	129 if line.type == 'metadata':

132 if line.key not in seen:	130 key = line.key.lower()

133 seen.add(line.key)	131 if key not in seen:

	132 seen.add(key)

134 yield line	133 yield line

135 elif line.type == 'header':	134 elif line.type == 'header':

136 if i == 0:	135 if i == 0:

137 yield line	136 yield line

138 else:	137 else:

139 yield line	138 yield line

140

141

142 def _insert_checksum(lines):

143 """Add checksum to the filter list.

144

145 See https://adblockplus.org/filters#special-comments for description

146 of the checksum algorithm.

147 """

148 md5sum = hashlib.md5()

149

150 for line in lines:

151 if line.type != 'emptyline':

152 md5sum.update(line.to_string().encode('utf-8') + b'\n')

153 yield line

154

155 checksum = base64.b64encode(md5sum.digest()).rstrip(b'=')

156 yield Metadata('Checksum', checksum.decode('utf-8'))

157	139

158	140

159 def _validate(lines):	141 def _validate(lines):

160 """Validate the final list."""	142 """Validate the final list."""

161 first_line, rest = _first_and_rest(lines)	143 first_line, rest = _first_and_rest(lines)

162 if first_line.type != 'header':	144 if first_line.type != 'header':

163 raise MissingHeader('No header found at the beginning of the input.')	145 raise MissingHeader('No header found at the beginning of the input.')

164 return itertools.chain([first_line], rest)	146 return itertools.chain([first_line], rest)

165	147

166	148

(...skipping 22 matching lines...) Expand all Loading...
189 When any of the fragments contain lines that can't be parsed.	171 When any of the fragments contain lines that can't be parsed.

190 MissingHeader	172 MissingHeader

191 If the top level fragment doesn't start with a valid header. This would	173 If the top level fragment doesn't start with a valid header. This would

192 lead to rendering an invalid filter list, so we immediately abort.	174 lead to rendering an invalid filter list, so we immediately abort.

193	175

194 """	176 """

195 _logger.info('Rendering: %s', name)	177 _logger.info('Rendering: %s', name)

196 lines, default_source = _get_and_parse_fragment(name, sources, top_source)	178 lines, default_source = _get_and_parse_fragment(name, sources, top_source)

197 lines = _process_includes(sources, default_source, [name], lines)	179 lines = _process_includes(sources, default_source, [name], lines)

198 for proc in [_process_timestamps, _insert_version, _remove_duplicates,	180 for proc in [_process_timestamps, _insert_version, _remove_duplicates,

199 _insert_checksum, _validate]:	181 _validate]:

200 lines = proc(lines)	182 lines = proc(lines)

201 return lines	183 return lines

202	184

203	185

	186 def _split_list_for_diff(list_in):

	187 """Split a filter list into metadata, keys, and rules."""
	Vasily Kuznetsov 2018/08/31 12:11:02 Nit: strictly speaking this function now returns metadata and rules (the keys are just part of metadata). The docstring could be updated.
	rhowell 2018/08/31 21:21:02 Ah, good point! I'll update that.
	188 metadata = {}

	189 rules = set()

	190 for line in parse_filterlist(list_in):

	191 if line.type == 'metadata':

	192 metadata[line.key.lower()] = line

	193 elif line.type == 'filter':

	194 rules.add(line.to_string())

	195 return metadata, rules

	196

	197

204 def render_diff(base, latest):	198 def render_diff(base, latest):
Vasily Kuznetsov 2018/08/17 10:30:54 I think it makes more sense to make `base`, `latest` and the return value iterables of strings. This is what parse_filterlist() expects and this is what you get when you open a file. This is also the API that other functions dealing with filterlists take and return. It would also reduce the amount of code here eliminating the calls to splitlines() and some others. It would also be good to document the API of this function in the same way as the other functions are documented.
rhowell 2018/08/20 18:21:27 Done.
205 """Return a diff between two filter lists."""	199 """Return a diff between two filter lists.

206 # Collect the special comments	200

207 diff = ['[Adblock Plus Diff]\n']	201 Parameters

208 latest_fl, latest_md, latest_keys = (set() for i in range(3))	202 ----------

209 base_fl, base_md, base_keys = (set() for i in range(3))	203 base : iterator of str

210	204 The base (old) list that we want to update to latest.

211 for line in parse_filterlist(latest.splitlines()):	205 lastest : iterator of str

212 if line.type == 'metadata' and 'Checksum' not in line.to_string():	206 The latest (most recent) list that we want to update to.
Vasily Kuznetsov 2018/08/17 10:30:53 Note: If this lands after the checksum patch, the 'Checksum' exception should be removed. If this lands first, it should be removed by that patch.
rhowell 2018/08/20 18:21:27 Are we guaranteed that no filter lists will be encountered that have a checksum? This line and line 220 make sure that, if we ever see a checksum in either `base` or `latest`, it will not make it into the diff. But, if we will never see a checksum, then this check is unnecessary.
Vasily Kuznetsov 2018/08/21 14:59:59 Hm. No, for now we're not guaranteed that there's no Checksum, so you're right, we should keep this.
Sebastian Noack 2018/08/21 19:42:45 Isn't python-abp stripping checksums when generating the filter lists now, and therefore isn't it in fact guaranteed that filter lists delivered by our servers at least won't include any checksum? Even if not, any reason we'd have to strip them here? Clients that support diffs, will ignore checksums anyway.
Sebastian Noack 2018/08/22 16:03:29 Moving the discussion from IRC over here:
16:34:47 <•vasily> snoack: the second one I agree with you. For the first one, at the moment, I think we have other code producing filter lists in production, so they do have checksums. But we might convert all of this to python-abp (perhaps together with deployment of the incremental updates) so you would also be right then... however, we haven't yet done it.
16:36:03 <•vasily> snoack: in general seems like the checksums will be ignored by diff-aware clients, so yeah, it can be removed.
17:24:01 <snoack> vasily: How about ignoring checksum metadata in the parser (rather than when rendering the full list and/or diff)? That way checksums would be stripped no matter what, without any additional complexity.
213 latest_md.add(line.to_string())	207

214 latest_keys.add(line.key)	208 Returns

215 elif line.type == 'filter':	209 -------

216 latest_fl.add(line.to_string())	210 iterable of str

217	211 A diff between two lists (https://issues.adblockplus.org/ticket/6685)

218 # Get the diff between the rest of the lines	212

219 for line in parse_filterlist(base.splitlines()):	213 """
Vasily Kuznetsov 2018/08/17 10:30:53 It would be good to move this repeated piece of code into a small internal function. This will also make render_diff() more compact and easy to follow. Maybe something like: base_fl, base_md, base_keys = _split_list_for_diff(base)
rhowell 2018/08/20 18:21:27 Done.
220 if line.type == 'metadata' and 'Checksum' not in line.to_string():	214 latest_metadata, latest_rules = _split_list_for_diff(latest)

221 base_md.add(line.to_string())	215 base_metadata, base_rules = _split_list_for_diff(base)

222 base_keys.add(line.key)	216

223 elif line.type == 'filter':	217 yield '[Adblock Plus Diff]'

224 base_fl.add(line.to_string())	218 for key, latest in latest_metadata.items():

225 new_md = latest_md - base_md	219 base = base_metadata.get(key)

226 removed_keys = base_keys - latest_keys	220 if not base or base.value != latest.value:

227 add_fl = latest_fl - base_fl	221 yield latest.to_string()

228 remove_fl = base_fl - latest_fl	222 for key in set(base_metadata) - set(latest_metadata):

229 for item in new_md:	223 yield '! {}:'.format(base_metadata[key].key)

230 diff.append('{}\n'.format(item))	224 for rule in base_rules - latest_rules:
Vasily Kuznetsov 2018/08/17 10:30:53 If we return an iterable of strings from this function we can just yield in this and the following loops.
rhowell 2018/08/20 18:21:27 Done.
231 for key in removed_keys:	225 yield '- {}'.format(rule)

232 # If a special comment has been removed, enter it with a blank value	226 for rule in latest_rules - base_rules:

233 # so the client will set it back to the default value	227 yield '+ {}'.format(rule)

234 diff.append('! {}:\n'.format(key))

235 for item in add_fl:

236 diff.append('+ {}\n'.format(item))

237 for item in remove_fl:

238 diff.append('- {}\n'.format(item))

239 return ''.join(diff)

LEFT	RIGHT