Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Patch Set: Address review comments on patch set 2 Created July 28, 2017, 6:52 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-2017 eyeo GmbH 2 # Copyright (C) 2006-2017 eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import re 18 import re
19 from collections import namedtuple 19 from collections import namedtuple
20 20
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError',
22 'SELECTOR_TYPE', 'FILTER_ACTION', 'FILTER_OPTION']
22 23
23 24
class ParseError(Exception):
    """Raised by the parser when it is given input it cannot parse.

    The exception message has the form ``<error> in "<text>"``.

    :param error: Short description of what went wrong.
    :param text: The text that was being parsed when the error occurred.
    """

    def __init__(self, error, text):
        message = '{} in "{}"'.format(error, text)
        super(ParseError, self).__init__(message)
        # Keep both parts available so callers can inspect them separately.
        self.error = error
        self.text = text
35 36
36 37
38 # Constants related to filters (see https://adblockplus.org/filters).
class SELECTOR_TYPE:  # flake8: noqa (This class is an enumeration constant).
    """Selector types.

    These strings are stored under the ``'type'`` key of the selector
    dict attached to every parsed ``Filter``.
    """
    URL_PATTERN = 'url-pattern'  # Normal URL patterns.
    URL_REGEXP = 'url-regexp'    # Regular expressions for URLs (/.../).
    CSS = 'css'                  # CSS selectors for hiding filters.
    XCSS = 'extended-css'        # Extended CSS selectors (to emulate CSS4).
    ABP_SIMPLE = 'abp-simple'    # Simplified element hiding syntax.
46
47
class FILTER_ACTION:  # flake8: noqa (This class is an enumeration constant).
    """Filter actions.

    One of these strings is stored in the ``action`` field of every
    parsed ``Filter``.
    """
    BLOCK = 'block'  # Block the request.
    ALLOW = 'allow'  # Allow the request (whitelist, filters with '@@').
    HIDE = 'hide'    # Hide selected element(s).
    SHOW = 'show'    # Show selected element(s) (whitelist, '#@#' filters).
54
55
class FILTER_OPTION:  # flake8: noqa (This class is an enumeration constant).
    """Filter options.

    Names of the options that may follow ``$`` in a blocking filter.
    Every attribute of this class whose name does not start with ``__``
    is collected into ``ALL_OPTIONS``, so only option-name constants
    should be defined here.
    """
    # Resource types.
    OTHER = 'other'
    SCRIPT = 'script'
    IMAGE = 'image'
    STYLESHEET = 'stylesheet'
    OBJECT = 'object'
    SUBDOCUMENT = 'subdocument'
    DOCUMENT = 'document'
    WEBSOCKET = 'websocket'
    WEBRTC = 'webrtc'
    PING = 'ping'
    XMLHTTPREQUEST = 'xmlhttprequest'
    OBJECT_SUBREQUEST = 'object-subrequest'
    MEDIA = 'media'
    FONT = 'font'
    POPUP = 'popup'
    GENERICBLOCK = 'genericblock'
    ELEMHIDE = 'elemhide'
    GENERICHIDE = 'generichide'

    # Deprecated resource types.
    BACKGROUND = 'background'
    XBL = 'xbl'
    DTD = 'dtd'

    # Other options.
    MATCH_CASE = 'match-case'
    DOMAIN = 'domain'            # Value is a '|'-separated domain list.
    THIRD_PARTY = 'third-party'
    COLLAPSE = 'collapse'
    SITEKEY = 'sitekey'          # Value is a '|'-separated key list.
    DONOTTRACK = 'donottrack'
90
91
# Set of every option name declared on FILTER_OPTION. Attributes whose names
# start with a double underscore (e.g. __module__, __doc__) are skipped.
ALL_OPTIONS = set(
    value
    for attr, value in vars(FILTER_OPTION).items()
    if not attr.startswith('__')
)
94
95
37 def _line_type(name, field_names, format_string): 96 def _line_type(name, field_names, format_string):
38 """Define a line type. 97 """Define a line type.
39 98
40 :param name: The name of the line type to define. 99 :param name: The name of the line type to define.
41 :param field_names: A sequence of field names or one space-separated 100 :param field_names: A sequence of field names or one space-separated
42 string that contains all field names. 101 string that contains all field names.
43 :param format_string: A format specifier for converting this line type 102 :param format_string: A format specifier for converting this line type
44 back to string representation. 103 back to string representation.
45 :returns: Class created with `namedtuple` that has `.type` set to 104 :returns: Class created with `namedtuple` that has `.type` set to
46 lowercased `name` and supports conversion back to string with 105 lowercased `name` and supports conversion back to string with
47 `.to_string()` method. 106 `.to_string()` method.
48 """ 107 """
49 lt = namedtuple(name, field_names) 108 lt = namedtuple(name, field_names)
50 lt.type = name.lower() 109 lt.type = name.lower()
51 lt.to_string = lambda self: format_string.format(self) 110 lt.to_string = lambda self: format_string.format(self)
52 return lt 111 return lt
53 112
54 113
# Line types produced by the parser. Each one is a namedtuple class built by
# `_line_type`; the last argument is the template used by `.to_string()`.
Header = _line_type('Header', 'version', '[{.version}]')
EmptyLine = _line_type('EmptyLine', '', '')
Comment = _line_type('Comment', 'text', '! {.text}')
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
# `text` is the original filter text (used for `.to_string()`); `selector`,
# `action` and `options` hold the parsed form of the filter.
Filter = _line_type('Filter', 'text selector action options', '{.text}')
Include = _line_type('Include', 'target', '%include {0.target}%')
61 120
62 121
# '!' followed by a `key: value` pair; group 1 is the key, group 2 the value.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
# Only comments whose key is listed here are treated as metadata.
METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
                 'Version'}
# '%include <target>%'; group 1 is the target.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# '[Adblock]' or '[Adblock Plus <version>]', case-insensitive; group 1 is the
# text between the brackets.
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
# Element hiding filter: group 1 is the (possibly empty) text before the
# first '#', group 2 the optional '@' or '?' flag between the two '#'
# characters, group 3 the selector.
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
# Trailing '$opt1,opt2,...' part of a blocking filter; each option may be
# prefixed with '~' and may carry '=value'. Group 1 is the text after '$'.
FILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
)
68 131
69 132
def _parse_comment(text):
    """Parse a line starting with ``!`` into `Metadata` or `Comment`.

    A line is metadata only when it has the ``key: value`` shape and the
    key is one of `METADATA_KEYS`; anything else is a plain comment whose
    text is the line without the leading ``!`` and surrounding whitespace.
    """
    found = METADATA_REGEXP.match(text)
    if found is None or found.group(1) not in METADATA_KEYS:
        return Comment(text[1:].strip())
    key, value = found.groups()
    return Metadata(key, value)
75 138
76 139
def _parse_header(text):
    """Parse a ``[...]`` line into a `Header`.

    :raises ParseError: If the text does not match `HEADER_REGEXP`.
    """
    found = HEADER_REGEXP.match(text)
    if found:
        return Header(found.group(1))
    raise ParseError('Malformed header', text)
82 145
83 146
def _parse_instruction(text):
    """Parse a ``%...%`` line into an `Include`.

    :raises ParseError: If the text is not an ``%include ...%``
                        instruction.
    """
    found = INCLUDE_REGEXP.match(text)
    if found:
        return Include(found.group(1))
    raise ParseError('Unrecognized instruction', text)
89 152
90 153
154 def _parse_option(option):
155 if '=' in option:
156 return option.split('=', 1)
157 if option.startswith('~'):
158 return option[1:], False
159 return option, True
160
161
def _parse_filter_option(option):
    """Parse a single filter option into a ``(name, value)`` pair.

    :param option: Text of one option, e.g. ``'~image'`` or
                   ``'domain=a.com|~b.com'``.
    :returns: ``(name, value)``. The value is ``True``/``False`` for flag
              options and a string for ``name=value`` options, except
              that ``domain`` yields a list of ``(domain, bool)`` pairs
              and ``sitekey`` yields a list of strings.
    :raises ParseError: If the option name is not in `ALL_OPTIONS`, or if
                        ``domain``/``sitekey`` is given without a value.
    """
    name, value = _parse_option(option)

    # NOTE(review): in the code review a reviewer questioned whether this
    # part of the code should validate option names at all, and the author
    # agreed; the check is kept here to preserve current behaviour.
    if name not in ALL_OPTIONS:
        raise ParseError('Unrecognized option', name)

    # Handle special cases of multivalued options.
    if name in (FILTER_OPTION.DOMAIN, FILTER_OPTION.SITEKEY):
        # Without '=value' the value is a bool and cannot be split; report
        # that as a parse error instead of failing with AttributeError.
        if isinstance(value, bool):
            raise ParseError('Missing value for option', option)
        if name == FILTER_OPTION.DOMAIN:
            value = [_parse_option(o) for o in value.split('|')]
        else:
            value = value.split('|')

    return name, value
175
176
def _parse_filter_options(options, separator=','):
    """Parse the option list of a blocking filter.

    :param options: Text of all options (without the leading ``$``).
    :param separator: String that separates individual options.
    :returns: List of ``(name, value)`` pairs, one per option, in order.
    :raises ParseError: If any option fails to parse.
    """
    # NOTE(review): a reviewer asked why `separator` is a parameter; the
    # author called it a left-over from an earlier version.
    return list(map(_parse_filter_option, options.split(separator)))
179
180
def _parse_blocking_filter(text):
    """Parse a blocking or exception (``@@``) filter.

    :param text: Text of the filter.
    :returns: `Filter` whose action is ``BLOCK`` or ``ALLOW`` and whose
              selector is a URL pattern or URL regexp.
    :raises ParseError: If one of the filter options fails to parse.
    """
    # Based on RegExpFilter.fromText in lib/filterClasses.js
    # in https://hg.adblockplus.org/adblockpluscore.
    is_exception = text.startswith('@@')
    pattern = text[2:] if is_exception else text
    action = FILTER_ACTION.ALLOW if is_exception else FILTER_ACTION.BLOCK

    # Options are only recognised as a well-formed '$...' suffix; a '$'
    # that is not part of such a suffix stays in the pattern.
    options = []
    found = FILTER_OPTIONS_REGEXP.search(pattern) if '$' in pattern else None
    if found:
        options = _parse_filter_options(found.group(1))
        pattern = pattern[:found.start(0)]

    # '/.../' is a regular expression; a lone '/' is a plain pattern.
    is_regexp = (len(pattern) > 1 and
                 pattern.startswith('/') and pattern.endswith('/'))
    if is_regexp:
        selector_type, value = SELECTOR_TYPE.URL_REGEXP, pattern[1:-1]
    else:
        selector_type, value = SELECTOR_TYPE.URL_PATTERN, pattern

    selector = {'type': selector_type, 'value': value}
    return Filter(text, selector, action, options)
205
206
def _parse_hiding_filter(text, domain, type_flag, selector_value):
    """Build a `Filter` for an element hiding filter.

    :param text: Full text of the filter.
    :param domain: Comma-separated domain list found before the first
                   ``#`` (may be empty).
    :param type_flag: ``'@'`` for an exception, ``'?'`` for an extended
                      CSS selector, otherwise ``None``.
    :param selector_value: The selector text.
    :returns: `Filter` with action ``HIDE`` or ``SHOW``.
    """
    if type_flag == '?':
        selector_type = SELECTOR_TYPE.XCSS
    else:
        selector_type = SELECTOR_TYPE.CSS
    action = FILTER_ACTION.SHOW if type_flag == '@' else FILTER_ACTION.HIDE
    selector = {'type': selector_type, 'value': selector_value}

    options = []
    if domain:
        # Domains are stored as a `domain` option, the same way as for
        # blocking filters; '~' in front of a domain negates it.
        domain_list = [_parse_option(entry) for entry in domain.split(',')]
        options.append((FILTER_OPTION.DOMAIN, domain_list))

    return Filter(text, selector, action, options)
222
223
def parse_filter(text):
    """Parse one filter.

    Text that matches the element hiding syntax (``##``, ``#@#`` or
    ``#?#``) is parsed as a hiding filter; everything else is parsed as a
    blocking filter.

    :param text: Text representation of a filter.
    :returns: Filter object.
    """
    found = HIDING_FILTER_REGEXP.search(text) if '#' in text else None
    if found is None:
        return _parse_blocking_filter(text)
    domain, type_flag, selector_value = found.groups()
    return _parse_hiding_filter(text, domain, type_flag, selector_value)
235
236
def parse_line(line_text):
    """Parse one line of a filter list.

    Surrounding whitespace is ignored. The kind of line is decided by its
    first (and for instructions and headers also last) character; any
    line that is not empty, a comment, an instruction or a header is
    parsed as a filter.

    :param line_text: Line of a filter list (must be a unicode string).
    :returns: Parsed line object (see `_line_type`).
    :raises ParseError: If the line can't be successfully parsed.
    """
    content = line_text.strip()
    first, last = content[:1], content[-1:]

    if not content:
        line = EmptyLine()
    elif first == '!':
        line = _parse_comment(content)
    elif first == '%' and last == '%':
        line = _parse_instruction(content)
    elif first == '[' and last == ']':
        line = _parse_header(content)
    else:
        line = parse_filter(content)

    # Sanity check: the parsed line must convert back to the input, spaces
    # aside. NOTE(review): this is an `assert`, so it is skipped when
    # Python runs with -O -- confirm that is acceptable.
    round_trip = line.to_string().replace(' ', '')
    assert round_trip == content.replace(' ', '')
    return line
113 259
114 260
def parse_filterlist(lines):
    """Parse filter list from an iterable.

    Lines are parsed lazily, one at a time, as the result is iterated.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines.
    :raises ParseError: Can be thrown during iteration for invalid lines.
    """
    for raw_line in lines:
        parsed = parse_line(raw_line)
        yield parsed
OLD | NEW
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')

Powered by Google App Engine
This is Rietveld