abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Delta Between Two Patch Sets: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Left Patch Set: Created June 14, 2017, 5:32 p.m.

Right Patch Set: Rebase to 1f5d7ead9bff Created Oct. 24, 2017, 3:58 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-2017 eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 from __future__ import unicode_literals	16 from __future__ import unicode_literals

17	17

18 import re	18 import re

19 from collections import namedtuple	19 from collections import namedtuple

20	20

21 __all__ = ['parse_filterlist', 'parse_line', 'parse_filter']	21 __all__ = [

	22 'FILTER_ACTION',

	23 'FILTER_OPTION',

	24 'ParseError',

	25 'SELECTOR_TYPE',

	26 'parse_filterlist',

	27 'parse_line',

	28 ]

22	29

23	30

24 class ParseError(Exception):	31 class ParseError(Exception):

25 """Internal exception used by the parser to signal invalid input."""	32 """Exception thrown by the parser when it encounters invalid input.
mathias 2017/07/26 20:37:15 Removing the custom __init__ function looks like an unrelated change to me. Don't you think it should be part of a separate patch-set?
26	33

27	34 :param error: Description of the error.

28 def line_type(name, field_names, format_string):	35 :param text: The text which was being parsed when an error occurred.

	36 """

	37

	38 def __init__(self, error, text):

	39 Exception.__init__(self, '{} in "{}"'.format(error, text))

	40 self.text = text

	41 self.error = error

	42

	43

	44 # Constants related to filters (see https://adblockplus.org/filters).

	45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).

	46 """Selector types"""

	47 URL_PATTERN = 'url-pattern' # Normal URL patterns.

	48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.

	49 CSS = 'css' # CSS selectors for hiding filters.

	50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).

	51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.

	52

	53

	54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).

	55 """Filter actions"""

	56 BLOCK = 'block' # Block the request.

	57 ALLOW = 'allow' # Allow the request (whitelist).

	58 HIDE = 'hide' # Hide selected element(s).

	59 SHOW = 'show' # Show selected element(s) (whitelist).

	60

	61

	62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).

	63 """Filter options"""

	64 # Resource types.

	65 OTHER = 'other'

	66 SCRIPT = 'script'

	67 IMAGE = 'image'

	68 STYLESHEET = 'stylesheet'

	69 OBJECT = 'object'

	70 SUBDOCUMENT = 'subdocument'

	71 DOCUMENT = 'document'

	72 WEBSOCKET = 'websocket'

	73 WEBRTC = 'webrtc'

	74 PING = 'ping'

	75 XMLHTTPREQUEST = 'xmlhttprequest'

	76 OBJECT_SUBREQUEST = 'object-subrequest'

	77 MEDIA = 'media'

	78 FONT = 'font'

	79 POPUP = 'popup'

	80 GENERICBLOCK = 'genericblock'

	81 ELEMHIDE = 'elemhide'

	82 GENERICHIDE = 'generichide'

	83

	84 # Deprecated resource types.

	85 BACKGROUND = 'background'

	86 XBL = 'xbl'

	87 DTD = 'dtd'

	88

	89 # Other options.

	90 MATCH_CASE = 'match-case'

	91 DOMAIN = 'domain'

	92 THIRD_PARTY = 'third-party'

	93 COLLAPSE = 'collapse'

	94 SITEKEY = 'sitekey'

	95 DONOTTRACK = 'donottrack'

	96

	97

	98 def _line_type(name, field_names, format_string):

29 """Define a line type.	99 """Define a line type.

30	100

31 :param name: The name of the line type to define.	101 :param name: The name of the line type to define.

32 :param field_names: A sequence of field names or one space-separated	102 :param field_names: A sequence of field names or one space-separated

33 string that contains all field names.	103 string that contains all field names.

34 :param format_string: A format specifier for converting this line type	104 :param format_string: A format specifier for converting this line type
mathias 2017/07/26 20:37:15 Fixing the missing format_string parameter documentation looks like an unrelated change as well. So what about this one?
35 back to string representation.	105 back to string representation.

36 :returns: Class created with `namedtuple` that has `.type` set to	106 :returns: Class created with `namedtuple` that has `.type` set to

37 lowercased `name` and supports conversion back to string with	107 lowercased `name` and supports conversion back to string with

38 `.to_string()` method.	108 `.to_string()` method.

39 """	109 """

40 lt = namedtuple(name, field_names)	110 lt = namedtuple(name, field_names)

41 lt.type = name.lower()	111 lt.type = name.lower()

42 lt.to_string = lambda self: format_string.format(self)	112 lt.to_string = lambda self: format_string.format(self)

43 return lt	113 return lt

44	114

45	115

46 InvalidLine = line_type('Invalid', 'text error', '{.text}')	116 Header = _line_type('Header', 'version', '[{.version}]')

47 Header = line_type('Header', 'version', '[{.version}]')	117 EmptyLine = _line_type('EmptyLine', '', '')

48 EmptyLine = line_type('EmptyLine', '', '')	118 Comment = _line_type('Comment', 'text', '! {.text}')

49 Comment = line_type('Comment', 'text', '! {.text}')	119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

50 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}')	120 Filter = _line_type('Filter', 'text selector action options', '{.text}')

51 Include = line_type('Include', 'target', '%include {0.target}%')	121 Include = _line_type('Include', 'target', '%include {0.target}%')

52 Filter = line_type('Filter', 'text selector action options', '{.text}')

53	122

54	123

55 METADATA_REGEXP = re.compile(r'!\s(\w+)\s:\s(.)')	124 METADATA_REGEXP = re.compile(r'!\s(\w+)\s:\s(.)')

56 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',	125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',

57 'Version'}	126 'Version'}

58 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

59 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)	128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)

60 BFILTER_OPTIONS_REGEXP = re.compile(	129 HIDING_FILTER_REGEXP = re.compile(r'^([^/\|@"!]?)#([@?])?#(.+)$')

61 r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$'	130 FILTER_OPTIONS_REGEXP = re.compile(

	131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'

62 )	132 )

63 HFILTER_REGEXP = re.compile(

64 r'^([^\/\\\|\@"!]?)#(\@)?(?:([\w\-]+\|\*)'

65 r'((?:$[\w\-]+(?:[$^]?=[^\($"])?\))*)\|#([^{}]+))$'

66 )

67

68 # Types of resources to block (based on adblockpluscore/lib/filterClasses.js).

69 TYPES = {

70 'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',

71 'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',

72 'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',

73 }

74

75 # Special types used for whitelisting.

76 TYPES_WHITELIST = {

77 'document', 'elemhide', 'generichide', 'genericblock',

78 }

79

80 # By default blocking filters apply to everything except whitelist-only types

81 # and popups (based on adblockpluscore/lib/filterClasses.js).

82 TYPES_DEFAULT = {t for t in TYPES if t not in TYPES_WHITELIST}

83

84 # Type options that are synonyms for other types.

85 TYPE_SYNONYMS = {

86 'xbl': 'other',

87 'dtd': 'other',

88 'background': 'image',

89 }

90	133

91	134

92 def _parse_comment(text):	135 def _parse_comment(text):

93 match = METADATA_REGEXP.match(text)	136 match = METADATA_REGEXP.match(text)

94 if match and match.group(1) in METADATA_KEYS:	137 if match and match.group(1) in METADATA_KEYS:

95 return Metadata(match.group(1), match.group(2))	138 return Metadata(match.group(1), match.group(2))

96 return Comment(text[1:].strip())	139 return Comment(text[1:].strip())

97	140

98	141

99 def _parse_header(text):	142 def _parse_header(text):

100 match = HEADER_REGEXP.match(text)	143 match = HEADER_REGEXP.match(text)

101 if not match:	144 if not match:

102 raise ParseError('Malformed header')	145 raise ParseError('Malformed header', text)
mathias 2017/07/26 20:37:15 Please explain why you don't include the malformed text in the error message here (and in the similar functions below). I was under the impression that this kind of information is highly appreciated because Python exception trace output does not include invocation parameters, hence reproduction without it is often quite tricky.
Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote:
> Please explain why you don't include the malformed text in the error message
> here (and in the similar functions below). I was under the impression that this
> kind of information is highly appreciated because Python exception trace output
> does not include invocation parameters, hence reproduction without it is often
> quite tricky.
My reasoning was that you can get to this place in two ways:
1. By calling `parse_line(stuff)` directly: in this case the caller should expect to get the exception (it's documented in the docstring of `parse_line`) and handle it more or less in the same scope, where they know what was passed to `parse_line`.
2. By calling `parse_filterlist(list_of_stuff)`: in this case the exception gets caught inside of `parse_filterlist` and the result is an `InvalidLine` object that contains the original line.
Looking at it now, it seems to me that the assumption I made in considering case 1 (that the user will handle the exception before they lose track of what was passed to `parse_line`) is too optimistic. I will change the exception to be more user-friendly. This should also resolve your concern about the disappearing `__init__` in line 25.
103 return Header(match.group(1))	146 return Header(match.group(1))

104	147

105	148

106 def _parse_instruction(text):	149 def _parse_instruction(text):

107 match = INCLUDE_REGEXP.match(text)	150 match = INCLUDE_REGEXP.match(text)

108 if not match:	151 if not match:

109 raise ParseError('Unrecognized instruction')	152 raise ParseError('Unrecognized instruction', text)

110 return Include(match.group(1))	153 return Include(match.group(1))

111	154

112	155

113 def _separate_domains(domains):	156 def _parse_option(option):

114 options = {}	157 if '=' in option:

115 for d in domains:	158 return option.split('=', 1)

116 if d.startswith('~'):	159 if option.startswith('~'):

117 options.setdefault('domains-exclude', []).append(d.lstrip('~'))	160 return option[1:], False

118 else:	161 return option, True

119 options.setdefault('domains-include', []).append(d)	162

120 if 'domains-include' in options:	163

121 options['domains-none'] = True	164 def _parse_filter_option(option):

122 return options	165 name, value = _parse_option(option)

123	166

124	167 # Handle special cases of multivalued options.

125 def _separate_types(types):	168 if name == FILTER_OPTION.DOMAIN:

126 """Convert a list of `(type, on_off)` tuples to options:	169 value = [_parse_option(o) for o in value.split('\|')]

127	170 elif name == FILTER_OPTION.SITEKEY:

128 - types-none: True if we start with nothing included, absent if we start	171 value = value.split('\|')

129 with TYPES_DEFAULT included.	172

130 - types-include: List of additional included types.	173 return name, value

131 - types-exclude: List of excluded types.

132 """

133 if not types:

134 return {}

135

136 if types[0][1]: # If the first type is ON, we start with nothing...

137 types_default = set()

138 options = {'types-none': True}

139 else: # ...otherwise we start with default type set.

140 types_default = TYPES_DEFAULT

141 options = {}

142

143 # Include/exclude any deviations from default.

144 for name, value in dict(types).items():

145 if value and name not in types_default:

146 options.setdefault('types-include', []).append(name)

147 if not value and name in types_default:

148 options.setdefault('types-exclude', []).append(name)

149

150 return options

151

152

153 def _parse_hiding_filter(text, match):

154 if match.group(5):

155 selector = {'type': 'css', 'value': match.group(5)}

156 else:

157 selector = {

158 'type': 'abp-simple',

159 'value': match.group(3) + match.group(4),

160 }

161 action = 'show' if match.group(2) else 'hide'

162 options = _separate_domains(list(filter(None, match.group(1).split(','))))

163 return Filter(text, selector, action, options)

164	174

165	175

166 def _parse_filter_options(options):	176 def _parse_filter_options(options):

167 # Based on RegExpFilter.fromText in lib/filterClasses.js	177 return [_parse_filter_option(o) for o in options.split(',')]

168 # in adblockpluscore.

169 parsed_options = {}

170 type_options = []

171

172 for option in options.split(','):

173 if '=' in option:

174 name, value = option.split('=', 1)

175 elif option.startswith('~'):

176 name, value = option[1:], False

177 else:

178 name, value = option, True

179

180 if name in TYPE_SYNONYMS:

181 name = TYPE_SYNONYMS[name]

182 if name in TYPES:

183 type_options.append((name, value))

184 elif name == 'domain':

185 parsed_options.update(_separate_domains(value.split('\|')))

186 elif name == 'sitekey':

187 parsed_options['sitekeys'] = value.split('\|')

188 else:

189 parsed_options[name] = value

190

191 parsed_options.update(_separate_types(type_options))

192 return parsed_options

193	178

194	179

195 def _parse_blocking_filter(text):	180 def _parse_blocking_filter(text):

196 # Based on RegExpFilter.fromText in lib/filterClasses.js	181 # Based on RegExpFilter.fromText in lib/filterClasses.js

197 # in adblockpluscore.	182 # in https://hg.adblockplus.org/adblockpluscore.

198 action = 'block'	183 action = FILTER_ACTION.BLOCK

199 options = {}	184 options = []

200 selector = text	185 selector = text

201	186

202 if selector.startswith('@@'):	187 if selector.startswith('@@'):

203 action = 'allow'	188 action = FILTER_ACTION.ALLOW
mathias 2017/07/26 20:37:15 I think we should have symbols like BFILTER_ACTION_ALLOW and BFILTER_ACTION_BLOCK for actions associated with blocking filters, analogous to HFILTER_ACTION_HIDE and HFILTER_ACTION_SHOW for element hiding filters above.
Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote:
> I think we should have symbols like BFILTER_ACTION_ALLOW and
> BFILTER_ACTION_BLOCK for actions associated with blocking filters, analogous to
> HFILTER_ACTION_HIDE and HFILTER_ACTION_SHOW for element hiding filters above.
Probably not these exact names for the constants, but in general I agree: constants are better than magic strings.
204 selector = selector[2:]	189 selector = selector[2:]

205	190

206 if '$' in selector:	191 if '$' in selector:

207 opt_match = BFILTER_OPTIONS_REGEXP.search(selector)	192 opt_match = FILTER_OPTIONS_REGEXP.search(selector)

208 if opt_match:	193 if opt_match:

209 selector = selector[:opt_match.start(0)]	194 selector = selector[:opt_match.start(0)]

210 options = _parse_filter_options(opt_match.group(1))	195 options = _parse_filter_options(opt_match.group(1))

211	196

212 if (len(selector) > 1 and	197 if (len(selector) > 1 and

213 selector.startswith('/') and selector.endswith('/')):	198 selector.startswith('/') and selector.endswith('/')):

214 selector = {'type': 'url-regexp', 'value': selector[1:-1]}	199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}
mathias 2017/07/26 20:37:15 I also think we should have symbols like SELECTOR_TYPE_REGEXP and SELECTOR_TYPE_PATTERN, be it just to have a place to document them (like `# http://link/to/explanation`, or an actual definition or something), or for IDEs to recognize them as symbols, or plain old helping humans with the association by creating an official item.
Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote:
> I also think we should have symbols like SELECTOR_TYPE_REGEXP and
> SELECTOR_TYPE_PATTERN, be it just to have a place to document them (like
> `# http://link/to/explanation`, or an actual definition or something), or for
> IDEs to recognize them as symbols, or plain old helping humans with the
> association by creating an official item.
Acknowledged.
215 else:	200 else:

216 selector = {'type': 'url-pattern', 'value': selector}	201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}

	202

	203 return Filter(text, selector, action, options)

	204

	205

	206 def _parse_hiding_filter(text, domain, type_flag, selector_value):

	207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}

	208 action = FILTER_ACTION.HIDE

	209 options = []

	210

	211 if type_flag == '@':

	212 action = FILTER_ACTION.SHOW

	213 elif type_flag == '?':

	214 selector['type'] = SELECTOR_TYPE.XCSS

	215

	216 if domain:

	217 domains = [_parse_option(d) for d in domain.split(',')]

	218 options.append((FILTER_OPTION.DOMAIN, domains))

217	219

218 return Filter(text, selector, action, options)	220 return Filter(text, selector, action, options)

219	221

220	222

221 def parse_filter(text):	223 def parse_filter(text):

222 """Parse one filter.	224 """Parse one filter.

223	225

224 :param text: Text representation of a filter.	226 :param text: Text representation of a filter.

225 :returns: filter object.	227 :returns: Filter object.

226 """	228 """

227 match = HFILTER_REGEXP.match(text) if '#' in text else False	229 if '#' in text:
mathias 2017/07/26 20:37:15 Call me old-fashioned but I seriously dislike changing the type of a variable (especially when the former and new value types quack quite differently). You could just use None instead of False here, so it'll be match or no match, not match or untruth.
Vasily Kuznetsov 2017/07/27 11:05:02 On 2017/07/26 20:37:15, mathias wrote:
> Call me old-fashioned but I seriously dislike changing the type of a variable
> (especially when the former and new value types quack quite differently). You
> could just use None instead of False here, so it'll be match or no match, not
> match or untruth.
Completely agree about changing the type of the variable. Here, as far as we're concerned (the variable will only be used in the following `if`, if it's not a proper match), the quacking is the same, but the following code could potentially change, leading to subtle bugs. Thanks for catching this, I will fix it.
228 if match:	230 match = HIDING_FILTER_REGEXP.search(text)

229 return _parse_hiding_filter(text, match)	231 if match:

	232 return _parse_hiding_filter(text, *match.groups())

230 return _parse_blocking_filter(text)	233 return _parse_blocking_filter(text)

231	234

232	235

233 def parse_line(line_text):	236 def parse_line(line_text):

234 """Parse one line of a filter list.	237 """Parse one line of a filter list.

235	238

236 :param line_text: Line of a filter list (must be a unicode string).	239 :param line_text: Line of a filter list (must be a unicode string).

237 :returns: Parsed line object (see `line_type`).	240 :returns: Parsed line object (see `_line_type`).

238 :raises ParseError: If the line can't be successfully parsed.	241 :raises ParseError: If the line can't be successfully parsed.

239 """	242 """

240 content = line_text.strip()	243 content = line_text.strip()

241	244

242 if content == '':	245 if content == '':

243 line = EmptyLine()	246 line = EmptyLine()

244 elif content.startswith('!'):	247 elif content.startswith('!'):

245 line = _parse_comment(content)	248 line = _parse_comment(content)

246 elif content.startswith('%') and content.endswith('%'):	249 elif content.startswith('%') and content.endswith('%'):

247 line = _parse_instruction(content)	250 line = _parse_instruction(content)

248 elif content.startswith('[') and content.endswith(']'):	251 elif content.startswith('[') and content.endswith(']'):

249 line = _parse_header(content)	252 line = _parse_header(content)

250 else:	253 else:

251 line = parse_filter(content)	254 line = parse_filter(content)

252	255

253 assert line.to_string().replace(' ', '') == content.replace(' ', '')	256 assert line.to_string().replace(' ', '') == content.replace(' ', '')

254 return line	257 return line

255	258

256	259

257 def parse_filterlist(lines):	260 def parse_filterlist(lines):

258 """Parse filter list from an iterable.	261 """Parse filter list from an iterable.

259	262

260 :param lines: List of strings or file or other iterable.	263 :param lines: List of strings or file or other iterable.

261 :returns: Iterator over parsed lines.	264 :returns: Iterator over parsed lines.

262 :raises ParseError: Can be thrown during iteration for invalid lines.	265 :raises ParseError: Can be thrown during iteration for invalid lines.

263 """	266 """

264 for line in lines:	267 for line in lines:

265 try:	268 yield parse_line(line)

266 yield parse_line(line)

267 except ParseError as pe:

268 yield InvalidLine(line.strip(), str(pe))

LEFT	RIGHT