abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Patch Set: Created June 14, 2017, 5:32 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-2017 eyeo GmbH	2 # Copyright (C) 2006-2017 eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 from __future__ import unicode_literals	16 from __future__ import unicode_literals

17	17

18 import re	18 import re

19 from collections import namedtuple	19 from collections import namedtuple

20	20

21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError']	21 __all__ = ['parse_filterlist', 'parse_line', 'parse_filter']

22	22

23	23

24 class ParseError(Exception):	24 class ParseError(Exception):

25 """Exception thrown by the parser when it encounters invalid input.	25 """Internal exception used by the parser to signal invalid input."""
	mathias 2017/07/26 20:37:15 Removing the custom __init__ function looks like an unrelated change to me. Don't you think it should be part of a separate patch-set?
26

27 :param error: Description of the error.

28 :param text: The text which was being parsed when an error occurred.

29 """

30

31 def __init__(self, error, text):

32 Exception.__init__(self, '{} in "{}"'.format(error, text))

33 self.text = text

34 self.error = error

35	26

36	27

37 def line_type(name, field_names, format_string):	28 def line_type(name, field_names, format_string):

38 """Define a line type.	29 """Define a line type.

39	30

40 :param name: The name of the line type to define.	31 :param name: The name of the line type to define.

41 :param field_names: A sequence of field names or one space-separated	32 :param field_names: A sequence of field names or one space-separated

42 string that contains all field names.	33 string that contains all field names.

	34 :param format_string: A format specifier for converting this line type
	mathias 2017/07/26 20:37:15 Fixing the missing format_string parameter documentation looks like an unrelated change as well. So what about this one?
	35 back to string representation.

43 :returns: Class created with `namedtuple` that has `.type` set to	36 :returns: Class created with `namedtuple` that has `.type` set to

44 lowercased `name` and supports conversion back to string with	37 lowercased `name` and supports conversion back to string with

45 `.to_string()` method.	38 `.to_string()` method.

46 """	39 """

47 lt = namedtuple(name, field_names)	40 lt = namedtuple(name, field_names)

48 lt.type = name.lower()	41 lt.type = name.lower()

49 lt.to_string = lambda self: format_string.format(self)	42 lt.to_string = lambda self: format_string.format(self)

50 return lt	43 return lt

51	44

52	45

	46 InvalidLine = line_type('Invalid', 'text error', '{.text}')

53 Header = line_type('Header', 'version', '[{.version}]')	47 Header = line_type('Header', 'version', '[{.version}]')

54 EmptyLine = line_type('EmptyLine', '', '')	48 EmptyLine = line_type('EmptyLine', '', '')

55 Comment = line_type('Comment', 'text', '! {.text}')	49 Comment = line_type('Comment', 'text', '! {.text}')

56 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}')	50 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}')

57 Filter = line_type('Filter', 'expression', '{.expression}')

58 Include = line_type('Include', 'target', '%include {0.target}%')	51 Include = line_type('Include', 'target', '%include {0.target}%')

	52 Filter = line_type('Filter', 'text selector action options', '{.text}')

59	53

60	54

61 METADATA_REGEXP = re.compile(r'!\s(\w+)\s:\s(.)')	55 METADATA_REGEXP = re.compile(r'!\s(\w+)\s:\s(.)')

62 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',	56 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',

63 'Version'}	57 'Version'}

64 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	58 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

65 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)	59 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)

	60 BFILTER_OPTIONS_REGEXP = re.compile(

	61 r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$'

	62 )

	63 HFILTER_REGEXP = re.compile(

	64 r'^([^\/\\\|\@"!]?)#(\@)?(?:([\w\-]+\|\*)'

	65 r'((?:$[\w\-]+(?:[$^]?=[^\($"])?\))*)\|#([^{}]+))$'

	66 )

	67

	68 # Types of resources to block (based on adblockpluscore/lib/filterClasses.js).

	69 TYPES = {

	70 'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',

	71 'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',

	72 'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',

	73 }

	74

	75 # Special types used for whitelisting.

	76 TYPES_WHITELIST = {

	77 'document', 'elemhide', 'generichide', 'genericblock',

	78 }

	79

	80 # By default blocking filters apply to everything except whitelist-only types

	81 # and popups (based on adblockpluscore/lib/filterClasses.js).

	82 TYPES_DEFAULT = {t for t in TYPES if t not in TYPES_WHITELIST}

	83

	84 # Type options that are synonyms for other types.

	85 TYPE_SYNONYMS = {

	86 'xbl': 'other',

	87 'dtd': 'other',

	88 'background': 'image',

	89 }

66	90

67	91

68 def _parse_comment(text):	92 def _parse_comment(text):

69 match = METADATA_REGEXP.match(text)	93 match = METADATA_REGEXP.match(text)

70 if match and match.group(1) in METADATA_KEYS:	94 if match and match.group(1) in METADATA_KEYS:

71 return Metadata(match.group(1), match.group(2))	95 return Metadata(match.group(1), match.group(2))

72 return Comment(text[1:].strip())	96 return Comment(text[1:].strip())

73	97

74	98

75 def _parse_header(text):	99 def _parse_header(text):

76 match = HEADER_REGEXP.match(text)	100 match = HEADER_REGEXP.match(text)

77 if not match:	101 if not match:

78 raise ParseError('Malformed header', text)	102 raise ParseError('Malformed header')
	mathias 2017/07/26 20:37:15 Please explain why you don't include the malformed text in the error message here (and the similar functions below). I was under the impression that such kind of information is highly appreciated because Python exception trace output does not include invocation parameters, hence reproduction without it is often quite tricky. Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote: > Please explain why you don't include the malformed text in the error message > here (and the similar functions below). I was under the impression that such > kind of information is highly appreciated because Python exception trace output > does not include invocation parameters, hence reproduction without it is often > quite tricky. My reasoning was that you can get to this place in two ways: 1. by calling `parse_line(stuff)` directly: in this case the caller should expect to get the exception (it's documented in the docstring of `parse_line`) and handle it more or less in the same scope, where they know what was passed to `parse_line`. 2. by calling `parse_filterlist(list_of_stuff)`: in this case the exception gets caught inside of `parse_filterlist` and the result is an `InvalidLine` object, that contains the original line. Looking at it now, it seems to me that the assumption that the user will handle the exception before they lose track of what was passed to `parse_line`, that I made in considering case 1, is too optimistic. I will change the exception to be more user-friendly. This should also resolve your concern about the disappearing `__init__` in line 25.
79 return Header(match.group(1))	103 return Header(match.group(1))

80	104

81	105

82 def _parse_instruction(text):	106 def _parse_instruction(text):

83 match = INCLUDE_REGEXP.match(text)	107 match = INCLUDE_REGEXP.match(text)

84 if not match:	108 if not match:

85 raise ParseError('Unrecognized instruction', text)	109 raise ParseError('Unrecognized instruction')

86 return Include(match.group(1))	110 return Include(match.group(1))

87	111

88	112

	113 def _separate_domains(domains):

	114 options = {}

	115 for d in domains:

	116 if d.startswith('~'):

	117 options.setdefault('domains-exclude', []).append(d.lstrip('~'))

	118 else:

	119 options.setdefault('domains-include', []).append(d)

	120 if 'domains-include' in options:

	121 options['domains-none'] = True

	122 return options

	123

	124

	125 def _separate_types(types):

	126 """Convert a list of `(type, on_off)` tuples to options:

	127

	128 - types-none: True if we start with nothing included, absent if we start

	129 with TYPES_DEFAULT included.

	130 - types-include: List of additional included types.

	131 - types-exclude: List of excluded types.

	132 """

	133 if not types:

	134 return {}

	135

	136 if types[0][1]: # If the first type is ON, we start with nothing...

	137 types_default = set()

	138 options = {'types-none': True}

	139 else: # ...otherwise we start with default type set.

	140 types_default = TYPES_DEFAULT

	141 options = {}

	142

	143 # Include/exclude any deviations from default.

	144 for name, value in dict(types).items():

	145 if value and name not in types_default:

	146 options.setdefault('types-include', []).append(name)

	147 if not value and name in types_default:

	148 options.setdefault('types-exclude', []).append(name)

	149

	150 return options

	151

	152

	153 def _parse_hiding_filter(text, match):

	154 if match.group(5):

	155 selector = {'type': 'css', 'value': match.group(5)}

	156 else:

	157 selector = {

	158 'type': 'abp-simple',

	159 'value': match.group(3) + match.group(4),

	160 }

	161 action = 'show' if match.group(2) else 'hide'

	162 options = _separate_domains(list(filter(None, match.group(1).split(','))))

	163 return Filter(text, selector, action, options)

	164

	165

	166 def _parse_filter_options(options):

	167 # Based on RegExpFilter.fromText in lib/filterClasses.js

	168 # in adblockpluscore.

	169 parsed_options = {}

	170 type_options = []

	171

	172 for option in options.split(','):

	173 if '=' in option:

	174 name, value = option.split('=', 1)

	175 elif option.startswith('~'):

	176 name, value = option[1:], False

	177 else:

	178 name, value = option, True

	179

	180 if name in TYPE_SYNONYMS:

	181 name = TYPE_SYNONYMS[name]

	182 if name in TYPES:

	183 type_options.append((name, value))

	184 elif name == 'domain':

	185 parsed_options.update(_separate_domains(value.split('\|')))

	186 elif name == 'sitekey':

	187 parsed_options['sitekeys'] = value.split('\|')

	188 else:

	189 parsed_options[name] = value

	190

	191 parsed_options.update(_separate_types(type_options))

	192 return parsed_options

	193

	194

	195 def _parse_blocking_filter(text):

	196 # Based on RegExpFilter.fromText in lib/filterClasses.js

	197 # in adblockpluscore.

	198 action = 'block'

	199 options = {}

	200 selector = text

	201

	202 if selector.startswith('@@'):

	203 action = 'allow'
	mathias 2017/07/26 20:37:15 I think we should have symbols like BFILTER_ACTION_ALLOW and BFILTER_ACTION_BLOCK for actions associated with blocking filters, analogous to HFILTER_ACTION_HIDE and HFILTER_ACTION_SHOW for element hiding filters above. Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote: > I think we should have symbols like BFILTER_ACTION_ALLOW and > BFILTER_ACTION_BLOCK for actions associated with blocking filters, analogous to > HFILTER_ACTION_HIDE and HFILTER_ACTION_SHOW for element hiding filters above. Probably not these exact names for the constants, but in general I agree, constants are better than magic strings.
	204 selector = selector[2:]

	205

	206 if '$' in selector:

	207 opt_match = BFILTER_OPTIONS_REGEXP.search(selector)

	208 if opt_match:

	209 selector = selector[:opt_match.start(0)]

	210 options = _parse_filter_options(opt_match.group(1))

	211

	212 if (len(selector) > 1 and

	213 selector.startswith('/') and selector.endswith('/')):

	214 selector = {'type': 'url-regexp', 'value': selector[1:-1]}
	mathias 2017/07/26 20:37:15 I also think we should have symbols like SELECTOR_TYPE_REGEXP and SELECTOR_TYPE_PATTERN. And be it just to have a place to document them like # http://link/to/explanation or an actual definition or something, or for IDEs to recognize them as symbols, or plain old helping humans with the association by creating an official item. Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote: > I also think we should have symbols like SELECTOR_TYPE_REGEXP and > SELECTOR_TYPE_PATTERN. And be it just to have a place to document them like # > http://link/to/explanation or an actual definition or something, or for IDEs to > recognize them as symbols, or plain old helping humans with the association by > creating an official item. Acknowledged.
	215 else:

	216 selector = {'type': 'url-pattern', 'value': selector}

	217

	218 return Filter(text, selector, action, options)

	219

	220

	221 def parse_filter(text):

	222 """Parse one filter.

	223

	224 :param text: Text representation of a filter.

	225 :returns: filter object.

	226 """

	227 match = HFILTER_REGEXP.match(text) if '#' in text else False
	mathias 2017/07/26 20:37:15 Call me old-fashioned but I seriously dislike changing the type of a variable (especially when the former and new value types quack quite differently). You could just use None instead of False here, so it'll be match or no match, not match or untruth. Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote: > Call me old-fashioned but I seriously dislike changing the type of a variable > (especially when the former and new value types quack quite differently). You > could just use None instead of False here, so it'll be match or no match, not > match or untruth. Completely agree about changing the type of the variable. Here, as far as we're concerned (the variable will only be used in the following `if`, if it's not a proper match), the quacking is the same, but the following code could potentially change, leading to subtle bugs. Thanks for catching this, I will fix it.
	228 if match:

	229 return _parse_hiding_filter(text, match)

	230 return _parse_blocking_filter(text)

	231

	232

89 def parse_line(line_text):	233 def parse_line(line_text):

90 """Parse one line of a filter list.	234 """Parse one line of a filter list.

91	235

92 :param line_text: Line of a filter list (must be a unicode string).	236 :param line_text: Line of a filter list (must be a unicode string).

93 :returns: Parsed line object (see `line_type`).	237 :returns: Parsed line object (see `line_type`).

94 :raises ParseError: If the line can't be successfully parsed.	238 :raises ParseError: If the line can't be successfully parsed.

95 """	239 """

96 content = line_text.strip()	240 content = line_text.strip()

97	241

98 if content == '':	242 if content == '':

99 line = EmptyLine()	243 line = EmptyLine()

100 elif content.startswith('!'):	244 elif content.startswith('!'):

101 line = _parse_comment(content)	245 line = _parse_comment(content)

102 elif content.startswith('%') and content.endswith('%'):	246 elif content.startswith('%') and content.endswith('%'):

103 line = _parse_instruction(content)	247 line = _parse_instruction(content)

104 elif content.startswith('[') and content.endswith(']'):	248 elif content.startswith('[') and content.endswith(']'):

105 line = _parse_header(content)	249 line = _parse_header(content)

106 else:	250 else:

107 line = Filter(content)	251 line = parse_filter(content)

108	252

109 assert line.to_string().replace(' ', '') == content.replace(' ', '')	253 assert line.to_string().replace(' ', '') == content.replace(' ', '')

110 return line	254 return line

111	255

112	256

113 def parse_filterlist(lines):	257 def parse_filterlist(lines):

114 """Parse filter list from an iterable.	258 """Parse filter list from an iterable.

115	259

116 :param lines: List of strings or file or other iterable.	260 :param lines: List of strings or file or other iterable.

117 :returns: Iterator over parsed lines.	261 :returns: Iterator over parsed lines.

118 :raises ParseError: Can be thrown during iteration for invalid lines.	262 :raises ParseError: Can be thrown during iteration for invalid lines.

119 """	263 """

120 for line in lines:	264 for line in lines:

121 yield parse_line(line)	265 try:

	266 yield parse_line(line)

	267 except ParseError as pe:

	268 yield InvalidLine(line.strip(), str(pe))

OLD	NEW

« no previous file with comments | « no previous file | setup.py » ('j') | setup.py » ('J')