Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Left Patch Set: Created June 14, 2017, 5:32 p.m.
Right Patch Set: Rebase to 1f5d7ead9bff Created Oct. 24, 2017, 3:58 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | tests/test_parser.py » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-2017 eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import re 18 import re
19 from collections import namedtuple 19 from collections import namedtuple
20 20
21 __all__ = ['parse_filterlist', 'parse_line', 'parse_filter'] 21 __all__ = [
22 'FILTER_ACTION',
23 'FILTER_OPTION',
24 'ParseError',
25 'SELECTOR_TYPE',
26 'parse_filterlist',
27 'parse_line',
28 ]
22 29
23 30
24 class ParseError(Exception): 31 class ParseError(Exception):
25 """Internal exception used by the parser to signal invalid input.""" 32 """Exception thrown by the parser when it encounters invalid input.
mathias 2017/07/26 20:37:15 Removing the custom __init__ function looks like a…
26 33
27 34 :param error: Description of the error.
28 def line_type(name, field_names, format_string): 35 :param text: The text which was being parsed when an error occurred.
36 """
37
38 def __init__(self, error, text):
39 Exception.__init__(self, '{} in "{}"'.format(error, text))
40 self.text = text
41 self.error = error
42
43
44 # Constants related to filters (see https://adblockplus.org/filters).
45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).
46 """Selector types"""
47 URL_PATTERN = 'url-pattern' # Normal URL patterns.
48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.
49 CSS = 'css' # CSS selectors for hiding filters.
50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).
51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.
52
53
54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).
55 """Filter actions"""
56 BLOCK = 'block' # Block the request.
57 ALLOW = 'allow' # Allow the request (whitelist).
58 HIDE = 'hide' # Hide selected element(s).
59 SHOW = 'show' # Show selected element(s) (whitelist).
60
61
62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).
63 """Filter options"""
64 # Resource types.
65 OTHER = 'other'
66 SCRIPT = 'script'
67 IMAGE = 'image'
68 STYLESHEET = 'stylesheet'
69 OBJECT = 'object'
70 SUBDOCUMENT = 'subdocument'
71 DOCUMENT = 'document'
72 WEBSOCKET = 'websocket'
73 WEBRTC = 'webrtc'
74 PING = 'ping'
75 XMLHTTPREQUEST = 'xmlhttprequest'
76 OBJECT_SUBREQUEST = 'object-subrequest'
77 MEDIA = 'media'
78 FONT = 'font'
79 POPUP = 'popup'
80 GENERICBLOCK = 'genericblock'
81 ELEMHIDE = 'elemhide'
82 GENERICHIDE = 'generichide'
83
84 # Deprecated resource types.
85 BACKGROUND = 'background'
86 XBL = 'xbl'
87 DTD = 'dtd'
88
89 # Other options.
90 MATCH_CASE = 'match-case'
91 DOMAIN = 'domain'
92 THIRD_PARTY = 'third-party'
93 COLLAPSE = 'collapse'
94 SITEKEY = 'sitekey'
95 DONOTTRACK = 'donottrack'
96
97
98 def _line_type(name, field_names, format_string):
29 """Define a line type. 99 """Define a line type.
30 100
31 :param name: The name of the line type to define. 101 :param name: The name of the line type to define.
32 :param field_names: A sequence of field names or one space-separated 102 :param field_names: A sequence of field names or one space-separated
33 string that contains all field names. 103 string that contains all field names.
34 :param format_string: A format specifier for converting this line type 104 :param format_string: A format specifier for converting this line type
mathias 2017/07/26 20:37:15 Fixing the missing format_string parameter documen…
35 back to string representation. 105 back to string representation.
36 :returns: Class created with `namedtuple` that has `.type` set to 106 :returns: Class created with `namedtuple` that has `.type` set to
37 lowercased `name` and supports conversion back to string with 107 lowercased `name` and supports conversion back to string with
38 `.to_string()` method. 108 `.to_string()` method.
39 """ 109 """
40 lt = namedtuple(name, field_names) 110 lt = namedtuple(name, field_names)
41 lt.type = name.lower() 111 lt.type = name.lower()
42 lt.to_string = lambda self: format_string.format(self) 112 lt.to_string = lambda self: format_string.format(self)
43 return lt 113 return lt
44 114
45 115
46 InvalidLine = line_type('Invalid', 'text error', '{.text}') 116 Header = _line_type('Header', 'version', '[{.version}]')
47 Header = line_type('Header', 'version', '[{.version}]') 117 EmptyLine = _line_type('EmptyLine', '', '')
48 EmptyLine = line_type('EmptyLine', '', '') 118 Comment = _line_type('Comment', 'text', '! {.text}')
49 Comment = line_type('Comment', 'text', '! {.text}') 119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
50 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}') 120 Filter = _line_type('Filter', 'text selector action options', '{.text}')
51 Include = line_type('Include', 'target', '%include {0.target}%') 121 Include = _line_type('Include', 'target', '%include {0.target}%')
52 Filter = line_type('Filter', 'text selector action options', '{.text}')
53 122
54 123
55 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') 124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
56 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', 125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
57 'Version'} 126 'Version'}
58 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
59 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
60 BFILTER_OPTIONS_REGEXP = re.compile( 129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
61 r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$' 130 FILTER_OPTIONS_REGEXP = re.compile(
131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
62 ) 132 )
63 HFILTER_REGEXP = re.compile(
64 r'^([^\/\*\|\@"!]*?)#(\@)?(?:([\w\-]+|\*)'
65 r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)|#([^{}]+))$'
66 )
67
68 # Types of resources to block (based on adblockpluscore/lib/filterClasses.js).
69 TYPES = {
70 'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',
71 'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',
72 'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',
73 }
74
75 # Special types used for whitelisting.
76 TYPES_WHITELIST = {
77 'document', 'elemhide', 'generichide', 'genericblock',
78 }
79
80 # By default blocking filters apply to everything except whitelist-only types
81 # and popups (based on adblockpluscore/lib/filterClasses.js).
82 TYPES_DEFAULT = {t for t in TYPES if t not in TYPES_WHITELIST}
83
84 # Type options that are synonyms for other types.
85 TYPE_SYNONYMS = {
86 'xbl': 'other',
87 'dtd': 'other',
88 'background': 'image',
89 }
90 133
91 134
92 def _parse_comment(text): 135 def _parse_comment(text):
93 match = METADATA_REGEXP.match(text) 136 match = METADATA_REGEXP.match(text)
94 if match and match.group(1) in METADATA_KEYS: 137 if match and match.group(1) in METADATA_KEYS:
95 return Metadata(match.group(1), match.group(2)) 138 return Metadata(match.group(1), match.group(2))
96 return Comment(text[1:].strip()) 139 return Comment(text[1:].strip())
97 140
98 141
99 def _parse_header(text): 142 def _parse_header(text):
100 match = HEADER_REGEXP.match(text) 143 match = HEADER_REGEXP.match(text)
101 if not match: 144 if not match:
102 raise ParseError('Malformed header') 145 raise ParseError('Malformed header', text)
mathias 2017/07/26 20:37:15 Please explain why you don't include the malformed…
Vasily Kuznetsov 2017/07/27 11:05:02 My reasoning was that you can get to this place in…
103 return Header(match.group(1)) 146 return Header(match.group(1))
104 147
105 148
106 def _parse_instruction(text): 149 def _parse_instruction(text):
107 match = INCLUDE_REGEXP.match(text) 150 match = INCLUDE_REGEXP.match(text)
108 if not match: 151 if not match:
109 raise ParseError('Unrecognized instruction') 152 raise ParseError('Unrecognized instruction', text)
110 return Include(match.group(1)) 153 return Include(match.group(1))
111 154
112 155
113 def _separate_domains(domains): 156 def _parse_option(option):
114 options = {} 157 if '=' in option:
115 for d in domains: 158 return option.split('=', 1)
116 if d.startswith('~'): 159 if option.startswith('~'):
117 options.setdefault('domains-exclude', []).append(d.lstrip('~')) 160 return option[1:], False
118 else: 161 return option, True
119 options.setdefault('domains-include', []).append(d) 162
120 if 'domains-include' in options: 163
121 options['domains-none'] = True 164 def _parse_filter_option(option):
122 return options 165 name, value = _parse_option(option)
123 166
124 167 # Handle special cases of multivalued options.
125 def _separate_types(types): 168 if name == FILTER_OPTION.DOMAIN:
126 """Convert a list of `(type, on_off)` tuples to options: 169 value = [_parse_option(o) for o in value.split('|')]
127 170 elif name == FILTER_OPTION.SITEKEY:
128 - types-none: True if we start with nothing included, absent if we start 171 value = value.split('|')
129 with TYPES_DEFAULT included. 172
130 - types-include: List of additional included types. 173 return name, value
131 - types-exclude: List of excluded types.
132 """
133 if not types:
134 return {}
135
136 if types[0][1]: # If the first type is ON, we start with nothing...
137 types_default = set()
138 options = {'types-none': True}
139 else: # ...otherwise we start with default type set.
140 types_default = TYPES_DEFAULT
141 options = {}
142
143 # Include/exclude any deviations from default.
144 for name, value in dict(types).items():
145 if value and name not in types_default:
146 options.setdefault('types-include', []).append(name)
147 if not value and name in types_default:
148 options.setdefault('types-exclude', []).append(name)
149
150 return options
151
152
153 def _parse_hiding_filter(text, match):
154 if match.group(5):
155 selector = {'type': 'css', 'value': match.group(5)}
156 else:
157 selector = {
158 'type': 'abp-simple',
159 'value': match.group(3) + match.group(4),
160 }
161 action = 'show' if match.group(2) else 'hide'
162 options = _separate_domains(list(filter(None, match.group(1).split(','))))
163 return Filter(text, selector, action, options)
164 174
165 175
166 def _parse_filter_options(options): 176 def _parse_filter_options(options):
167 # Based on RegExpFilter.fromText in lib/filterClasses.js 177 return [_parse_filter_option(o) for o in options.split(',')]
168 # in adblockpluscore.
169 parsed_options = {}
170 type_options = []
171
172 for option in options.split(','):
173 if '=' in option:
174 name, value = option.split('=', 1)
175 elif option.startswith('~'):
176 name, value = option[1:], False
177 else:
178 name, value = option, True
179
180 if name in TYPE_SYNONYMS:
181 name = TYPE_SYNONYMS[name]
182 if name in TYPES:
183 type_options.append((name, value))
184 elif name == 'domain':
185 parsed_options.update(_separate_domains(value.split('|')))
186 elif name == 'sitekey':
187 parsed_options['sitekeys'] = value.split('|')
188 else:
189 parsed_options[name] = value
190
191 parsed_options.update(_separate_types(type_options))
192 return parsed_options
193 178
194 179
195 def _parse_blocking_filter(text): 180 def _parse_blocking_filter(text):
196 # Based on RegExpFilter.fromText in lib/filterClasses.js 181 # Based on RegExpFilter.fromText in lib/filterClasses.js
197 # in adblockpluscore. 182 # in https://hg.adblockplus.org/adblockpluscore.
198 action = 'block' 183 action = FILTER_ACTION.BLOCK
199 options = {} 184 options = []
200 selector = text 185 selector = text
201 186
202 if selector.startswith('@@'): 187 if selector.startswith('@@'):
203 action = 'allow' 188 action = FILTER_ACTION.ALLOW
mathias 2017/07/26 20:37:15 I think we should have symbols like BFILTER_ACTION…
Vasily Kuznetsov 2017/07/27 11:05:02 Probably not these exact names for the constants,…
204 selector = selector[2:] 189 selector = selector[2:]
205 190
206 if '$' in selector: 191 if '$' in selector:
207 opt_match = BFILTER_OPTIONS_REGEXP.search(selector) 192 opt_match = FILTER_OPTIONS_REGEXP.search(selector)
208 if opt_match: 193 if opt_match:
209 selector = selector[:opt_match.start(0)] 194 selector = selector[:opt_match.start(0)]
210 options = _parse_filter_options(opt_match.group(1)) 195 options = _parse_filter_options(opt_match.group(1))
211 196
212 if (len(selector) > 1 and 197 if (len(selector) > 1 and
213 selector.startswith('/') and selector.endswith('/')): 198 selector.startswith('/') and selector.endswith('/')):
214 selector = {'type': 'url-regexp', 'value': selector[1:-1]} 199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}
mathias 2017/07/26 20:37:15 I also think we should have symbols like SELECTOR_…
Vasily Kuznetsov 2017/07/27 11:05:02 Acknowledged.
215 else: 200 else:
216 selector = {'type': 'url-pattern', 'value': selector} 201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}
202
203 return Filter(text, selector, action, options)
204
205
206 def _parse_hiding_filter(text, domain, type_flag, selector_value):
207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}
208 action = FILTER_ACTION.HIDE
209 options = []
210
211 if type_flag == '@':
212 action = FILTER_ACTION.SHOW
213 elif type_flag == '?':
214 selector['type'] = SELECTOR_TYPE.XCSS
215
216 if domain:
217 domains = [_parse_option(d) for d in domain.split(',')]
218 options.append((FILTER_OPTION.DOMAIN, domains))
217 219
218 return Filter(text, selector, action, options) 220 return Filter(text, selector, action, options)
219 221
220 222
221 def parse_filter(text): 223 def parse_filter(text):
222 """Parse one filter. 224 """Parse one filter.
223 225
224 :param text: Text representation of a filter. 226 :param text: Text representation of a filter.
225 :returns: filter object. 227 :returns: Filter object.
226 """ 228 """
227 match = HFILTER_REGEXP.match(text) if '#' in text else False 229 if '#' in text:
mathias 2017/07/26 20:37:15 Call me old-fashioned but I seriously dislike chan…
Vasily Kuznetsov 2017/07/27 11:05:02 Completely agree about changing the type of the va…
228 if match: 230 match = HIDING_FILTER_REGEXP.search(text)
229 return _parse_hiding_filter(text, match) 231 if match:
232 return _parse_hiding_filter(text, *match.groups())
230 return _parse_blocking_filter(text) 233 return _parse_blocking_filter(text)
231 234
232 235
233 def parse_line(line_text): 236 def parse_line(line_text):
234 """Parse one line of a filter list. 237 """Parse one line of a filter list.
235 238
236 :param line_text: Line of a filter list (must be a unicode string). 239 :param line_text: Line of a filter list (must be a unicode string).
237 :returns: Parsed line object (see `line_type`). 240 :returns: Parsed line object (see `_line_type`).
238 :raises ParseError: If the line can't be successfully parsed. 241 :raises ParseError: If the line can't be successfully parsed.
239 """ 242 """
240 content = line_text.strip() 243 content = line_text.strip()
241 244
242 if content == '': 245 if content == '':
243 line = EmptyLine() 246 line = EmptyLine()
244 elif content.startswith('!'): 247 elif content.startswith('!'):
245 line = _parse_comment(content) 248 line = _parse_comment(content)
246 elif content.startswith('%') and content.endswith('%'): 249 elif content.startswith('%') and content.endswith('%'):
247 line = _parse_instruction(content) 250 line = _parse_instruction(content)
248 elif content.startswith('[') and content.endswith(']'): 251 elif content.startswith('[') and content.endswith(']'):
249 line = _parse_header(content) 252 line = _parse_header(content)
250 else: 253 else:
251 line = parse_filter(content) 254 line = parse_filter(content)
252 255
253 assert line.to_string().replace(' ', '') == content.replace(' ', '') 256 assert line.to_string().replace(' ', '') == content.replace(' ', '')
254 return line 257 return line
255 258
256 259
257 def parse_filterlist(lines): 260 def parse_filterlist(lines):
258 """Parse filter list from an iterable. 261 """Parse filter list from an iterable.
259 262
260 :param lines: List of strings or file or other iterable. 263 :param lines: List of strings or file or other iterable.
261 :returns: Iterator over parsed lines. 264 :returns: Iterator over parsed lines.
262 :raises ParseError: Can be thrown during iteration for invalid lines. 265 :raises ParseError: Can be thrown during iteration for invalid lines.
263 """ 266 """
264 for line in lines: 267 for line in lines:
265 try: 268 yield parse_line(line)
266 yield parse_line(line)
267 except ParseError as pe:
268 yield InvalidLine(line.strip(), str(pe))
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld