abp/filters/renderer.py - Issue 29845767: Issue 6685 - Offer incremental filter list downloads

Side by Side Diff: abp/filters/renderer.py

Issue 29845767: Issue 6685 - Offer incremental filter list downloads (Closed) Base URL: https://hg.adblockplus.org/python-abp/

Patch Set: Remove metadata_keys, yield deletions first Created Aug. 27, 2018, 10:04 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 """Combine filter list fragments to produce filter lists."""	16 """Combine filter list fragments to produce filter lists."""

17	17

18 from __future__ import unicode_literals	18 from __future__ import unicode_literals

19	19

20 import itertools	20 import itertools

21 import logging	21 import logging

22 import time	22 import time

23	23

24 from .parser import parse_filterlist, Comment, Metadata	24 from .parser import parse_filterlist, Comment, Metadata

25 from .sources import NotFound	25 from .sources import NotFound

26	26

27 __all__ = ['IncludeError', 'MissingHeader', 'render_filterlist']	27 __all__ = ['IncludeError', 'MissingHeader', 'render_filterlist', 'render_diff']

28	28

29 _logger = logging.getLogger(__name__)	29 _logger = logging.getLogger(__name__)

30	30

31	31

32 class IncludeError(Exception):	32 class IncludeError(Exception):

33 """Error in processing include instruction."""	33 """Error in processing include instruction."""

34	34

35 def __init__(self, error, stack):	35 def __init__(self, error, stack):

36 stack_str = ' from '.join(map("'{}'".format, reversed(stack)))	36 stack_str = ' from '.join(map("'{}'".format, reversed(stack)))

37 if stack_str:	37 if stack_str:

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
117 """Insert metadata comment with version (a.k.a. date)."""	117 """Insert metadata comment with version (a.k.a. date)."""

118 first_line, rest = _first_and_rest(lines)	118 first_line, rest = _first_and_rest(lines)

119 version = Metadata('Version', time.strftime('%Y%m%d%H%M', time.gmtime()))	119 version = Metadata('Version', time.strftime('%Y%m%d%H%M', time.gmtime()))

120 return itertools.chain([first_line, version], rest)	120 return itertools.chain([first_line, version], rest)

121	121

122	122

123 def _remove_duplicates(lines):	123 def _remove_duplicates(lines):

124 """Remove duplicate metadata and headers."""	124 """Remove duplicate metadata and headers."""

125 # Always remove checksum -- a checksum coming from a fragment	125 # Always remove checksum -- a checksum coming from a fragment

126 # will not match for the rendered list.	126 # will not match for the rendered list.

127 seen = {'Checksum'}	127 seen = {'Checksum'}
	Sebastian Noack 2018/08/28 19:52:18
	Since we parse metadata with arbitrary keys now, we have to account for keys with inconsistent capitalization, here:
	    seen = {'checksum'}
	    for i, line in enumerate(lines):
	        if line.type == 'metadata':
	            key = line.key.lower()
	            if key not in seen:
	                seen.add(key)
	                yield line
	rhowell 2018/08/29 21:43:34
	On 2018/08/28 19:52:18, Sebastian Noack wrote:
	> Since we parse metadata with arbitrary keys now, we have to account for keys
	> with inconsistent capitalization, here:
	>
	> seen = {'checksum'}
	> for i, line in enumerate(lines):
	>     if line.type == 'metadata':
	>         key = line.key.lower()
	>         if key not in seen:
	>             seen.add(key)
	>             yield line
	Done.
128 for i, line in enumerate(lines):	128 for i, line in enumerate(lines):

129 if line.type == 'metadata':	129 if line.type == 'metadata':

130 if line.key not in seen:	130 if line.key not in seen:

131 seen.add(line.key)	131 seen.add(line.key)

132 yield line	132 yield line

133 elif line.type == 'header':	133 elif line.type == 'header':

134 if i == 0:	134 if i == 0:

135 yield line	135 yield line

136 else:	136 else:

137 yield line	137 yield line

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
173 lead to rendering an invalid filter list, so we immediately abort.	173 lead to rendering an invalid filter list, so we immediately abort.

174	174

175 """	175 """

176 _logger.info('Rendering: %s', name)	176 _logger.info('Rendering: %s', name)

177 lines, default_source = _get_and_parse_fragment(name, sources, top_source)	177 lines, default_source = _get_and_parse_fragment(name, sources, top_source)

178 lines = _process_includes(sources, default_source, [name], lines)	178 lines = _process_includes(sources, default_source, [name], lines)

179 for proc in [_process_timestamps, _insert_version, _remove_duplicates,	179 for proc in [_process_timestamps, _insert_version, _remove_duplicates,

180 _validate]:	180 _validate]:

181 lines = proc(lines)	181 lines = proc(lines)

182 return lines	182 return lines

	183

	184

	185 def _split_list_for_diff(list_in):

	186 filterlist, metadata, keys = set(), set(), set()

	187 for line in parse_filterlist(list_in):

	188 if line.type == 'metadata' and 'Checksum' not in line.to_string():
	Sebastian Noack 2018/08/28 19:52:18
	I think it has been agreed on that this isn't necessary, as any diff-aware client ignores checksums anyway (besides most filter list if not all wouldn't have a checksum anymore anyway).
	rhowell 2018/08/29 21:43:35
	On 2018/08/28 19:52:18, Sebastian Noack wrote:
	> I think it has been agreed on that this isn't necessary, as any diff-aware
	> client ignores checksums anyway (besides most filter list if not all wouldn't
	> have a checksum anymore anyway).
	Done.
	189 metadata.add(line.to_string())

	190 keys.add(line.key)
	Sebastian Noack 2018/08/28 19:52:18
	This code has to be changed as well, to handle case-insensitive keys:
	    def _split_list_for_diff(list_in):
	        metadata = {}
	        for line in parse_filterlist(list_in):
	            if line.type == 'metadata':
	                metadata[line.key.lower()] = line
	        ...

	    def render_diff(base, latest):
	        ...
	        for key, line in latest_md.items():
	            other = base_md.get(key)
	            if not other or other.value != line.value:
	                yield line.to_string()
	        for key in set(base_md) - set(latest_md):
	            yield '! {}:'.format(base_md[key].key)
	        ...
	rhowell 2018/08/29 21:43:34
	On 2018/08/28 19:52:18, Sebastian Noack wrote:
	> This code has to be changed as well, to handle case-insensitive keys:
	>
	> def _split_list_for_diff(list_in):
	>     metadata = {}
	>     for line in parse_filterlist(list_in):
	>         if line.type == 'metadata':
	>             metadata[line.key.lower()] = line
	>     ...
	>
	> def render_diff(base, latest):
	>     ...
	>     for key, line in latest_md.items():
	>         other = base_md.get(key)
	>         if not other or other.value != line.value:
	>             yield line.to_string()
	>     for key in set(base_md) - set(latest_md):
	>         yield '! {}:'.format(base_md[key].key)
	>     ...
	Any reason to use a dict instead of a set? I guess one advantage would be: if a metadata item is removed in the new list, we would convert it to lower-case and give it a blank value. If that's a concern (converting it to lower case), then I can use dicts instead of sets, as you suggested.
	Sebastian Noack 2018/08/29 21:56:37
	On 2018/08/29 21:43:34, rhowell wrote:
	> On 2018/08/28 19:52:18, Sebastian Noack wrote:
	> > This code has to be changed as well, to handle case-insensitive keys:
	> >
	> > def _split_list_for_diff(list_in):
	> >     metadata = {}
	> >     for line in parse_filterlist(list_in):
	> >         if line.type == 'metadata':
	> >             metadata[line.key.lower()] = line
	> >     ...
	> >
	> > def render_diff(base, latest):
	> >     ...
	> >     for key, line in latest_md.items():
	> >         other = base_md.get(key)
	> >         if not other or other.value != line.value:
	> >             yield line.to_string()
	> >     for key in set(base_md) - set(latest_md):
	> >         yield '! {}:'.format(base_md[key].key)
	> >     ...
	>
	> Any reason to use a dict instead of a set? I guess one advantage would be: if a
	> metadata item is removed in the new list, we would convert it to lower-case and
	> give it a blank value. If that's a concern (converting it to lower case), then I
	> can use dicts instead of sets, as you suggested.
	The algorithm I suggest here, is the (probably) simplest way to treat keys case-insensitive while preserving the original capitalization. With a set alone you cannot associate the normalized keys with the original.
	rhowell 2018/08/29 22:52:23
	On 2018/08/29 21:56:37, Sebastian Noack wrote:
	> On 2018/08/29 21:43:34, rhowell wrote:
	> > On 2018/08/28 19:52:18, Sebastian Noack wrote:
	> > > This code has to be changed as well, to handle case-insensitive keys:
	> > >
	> > > def _split_list_for_diff(list_in):
	> > >     metadata = {}
	> > >     for line in parse_filterlist(list_in):
	> > >         if line.type == 'metadata':
	> > >             metadata[line.key.lower()] = line
	> > >     ...
	> > >
	> > > def render_diff(base, latest):
	> > >     ...
	> > >     for key, line in latest_md.items():
	> > >         other = base_md.get(key)
	> > >         if not other or other.value != line.value:
	> > >             yield line.to_string()
	> > >     for key in set(base_md) - set(latest_md):
	> > >         yield '! {}:'.format(base_md[key].key)
	> > >     ...
	> >
	> > Any reason to use a dict instead of a set? I guess one advantage would be: if a
	> > metadata item is removed in the new list, we would convert it to lower-case and
	> > give it a blank value. If that's a concern (converting it to lower case), then I
	> > can use dicts instead of sets, as you suggested.
	>
	> The algorithm I suggest here, is the (probably) simplest way to treat keys
	> case-insensitive while preserving the original capitalization. With a set alone
	> you cannot associate the normalized keys with the original.
	Done.
	191 elif line.type == 'filter':

	192 filterlist.add(line.to_string())

	193 return filterlist, metadata, keys

	194

	195

	196 def render_diff(base, latest):

	197 """Return a diff between two filter lists.

	198

	199 Parameters

	200 ----------

	201 base : iterator of str

	202 The base (old) list that we want to update to latest.

	203 latest : iterator of str

	204 The latest (most recent) list that we want to update to.

	205

	206 Returns

	207 -------

	208 iterable of str

	209 A diff between two lists (https://issues.adblockplus.org/ticket/6685)

	210

	211 """

	212 latest_fl, latest_md, latest_keys = _split_list_for_diff(latest)

	213 base_fl, base_md, base_keys = _split_list_for_diff(base)

	214

	215 new_md = latest_md - base_md

	216 removed_keys = base_keys - latest_keys

	217 add_fl = latest_fl - base_fl

	218 remove_fl = base_fl - latest_fl

	219

	220 yield '[Adblock Plus Diff]'

	221 for item in new_md:

	222 yield item

	223 for key in removed_keys:

	224 # If a special comment has been removed, enter it with a blank value

	225 # so the client will set it back to the default value

	226 yield '! {}:'.format(key)

	227 for item in remove_fl:

	228 yield '- {}'.format(item)

	229 for item in add_fl:

	230 yield '+ {}'.format(item)

OLD	NEW

« no previous file with comments | « abp/filters/parser.py ('k') | tests/test_differ.py » ('j') | tests/test_parser.py » ('J')