Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Patch Set: Address review comments on patch set 3 Created Aug. 2, 2017, 4:15 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-2017 eyeo GmbH 2 # Copyright (C) 2006-2017 eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import re 18 import re
19 from collections import namedtuple 19 from collections import namedtuple
20 20
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] 21 __all__ = [
[Review comment — Vasily Kuznetsov, 2017/08/02 16:21:18] I updated my formatting rules to include "prefer o… [comment truncated in the collapsed view]
22 'FILTER_ACTION',
23 'FILTER_OPTION',
24 'ParseError',
25 'SELECTOR_TYPE',
26 'parse_filterlist',
27 'parse_line',
28 ]
22 29
23 30
class ParseError(Exception):
    """Exception thrown by the parser when it encounters invalid input.

    :param error: Description of the error.
    :param text: The text which was being parsed when an error occurred.
    """

    def __init__(self, error, text):
        # The exception message combines both pieces; each one also stays
        # available separately as an attribute.
        Exception.__init__(self, '{} in "{}"'.format(error, text))
        self.text = text
        self.error = error
35 42
36 43
# Constants related to filters (see https://adblockplus.org/filters).
class SELECTOR_TYPE:  # flake8: noqa (This class is an enumeration constant).
    """Selector types.

    Plain string constants used as the ``'type'`` entry of a filter's
    selector dictionary.
    """

    URL_PATTERN = 'url-pattern'  # Normal URL patterns.
    URL_REGEXP = 'url-regexp'    # Regular expressions for URLs.
    CSS = 'css'                  # CSS selectors for hiding filters.
    XCSS = 'extended-css'        # Extended CSS selectors (to emulate CSS4).
    ABP_SIMPLE = 'abp-simple'    # Simplified element hiding syntax.
52
53
class FILTER_ACTION:  # flake8: noqa (This class is an enumeration constant).
    """Filter actions.

    Plain string constants naming what a filter does when it matches.
    """

    BLOCK = 'block'  # Block the request.
    ALLOW = 'allow'  # Allow the request (whitelist).
    HIDE = 'hide'    # Hide selected element(s).
    SHOW = 'show'    # Show selected element(s) (whitelist).
60
61
class FILTER_OPTION:  # flake8: noqa (This class is an enumeration constant).
    """Filter options.

    Plain string constants for the option names that may follow ``$`` in
    a filter.
    """

    # Resource types.
    OTHER = 'other'
    SCRIPT = 'script'
    IMAGE = 'image'
    STYLESHEET = 'stylesheet'
    OBJECT = 'object'
    SUBDOCUMENT = 'subdocument'
    DOCUMENT = 'document'
    WEBSOCKET = 'websocket'
    WEBRTC = 'webrtc'
    PING = 'ping'
    XMLHTTPREQUEST = 'xmlhttprequest'
    OBJECT_SUBREQUEST = 'object-subrequest'
    MEDIA = 'media'
    FONT = 'font'
    POPUP = 'popup'
    GENERICBLOCK = 'genericblock'
    ELEMHIDE = 'elemhide'
    GENERICHIDE = 'generichide'

    # Deprecated resource types.
    BACKGROUND = 'background'
    XBL = 'xbl'
    DTD = 'dtd'

    # Other options.
    MATCH_CASE = 'match-case'
    DOMAIN = 'domain'
    THIRD_PARTY = 'third-party'
    COLLAPSE = 'collapse'
    SITEKEY = 'sitekey'
    DONOTTRACK = 'donottrack'
96
97
def _line_type(name, field_names, format_string):
    """Define a line type.

    :param name: The name of the line type to define.
    :param field_names: A sequence of field names or one space-separated
                        string that contains all field names.
    :param format_string: A format specifier for converting this line type
                          back to string representation.
    :returns: Class created with `namedtuple` that has `.type` set to
              lowercased `name` and supports conversion back to string with
              `.to_string()` method.
    """
    def to_string(self):
        """Convert this line back to its string representation."""
        # The whole tuple is passed as the single positional argument, so
        # format specifiers address fields as `{.field}` / `{0.field}`.
        return format_string.format(self)

    line_class = namedtuple(name, field_names)
    line_class.type = name.lower()
    line_class.to_string = to_string
    return line_class
53 114
54 115
# Line types produced by the parser. Each one is a namedtuple class with a
# `.type` attribute and a `.to_string()` method (see `_line_type`).
Header = _line_type('Header', 'version', '[{.version}]')
EmptyLine = _line_type('EmptyLine', '', '')
Comment = _line_type('Comment', 'text', '! {.text}')
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
# A filter keeps its original text (used for `.to_string()`) next to the
# parsed selector, action and options.
Filter = _line_type('Filter', 'text selector action options', '{.text}')
Include = _line_type('Include', 'target', '%include {0.target}%')
61 122
62 123
# `! Key: value` comment lines; only keys listed in METADATA_KEYS are treated
# as metadata.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
                 'Version'}
# `%include <target>%` instructions.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# `[Adblock]` or `[Adblock Plus <version>]` headers (case-insensitive).
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
# Element hiding filters: `<domains>##<selector>`, with an optional `@` or
# `?` flag between the two `#` characters.
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
# Trailing `$option[,option...]` part of a blocking filter.
FILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
)
68 133
69 134
def _parse_comment(text):
    """Parse a comment line.

    :param text: Comment line (`parse_line` passes text starting with `!`).
    :returns: `Metadata` if the line matches `METADATA_REGEXP` and its key
              is one of `METADATA_KEYS`, otherwise `Comment` holding the
              text after the first character, stripped of whitespace.
    """
    match = METADATA_REGEXP.match(text)
    if match and match.group(1) in METADATA_KEYS:
        return Metadata(match.group(1), match.group(2))
    # Not recognised metadata: drop the leading marker and keep the rest.
    return Comment(text[1:].strip())
75 140
76 141
def _parse_header(text):
    """Parse a filter list header line.

    :param text: Header line (`parse_line` passes text enclosed in `[]`).
    :returns: `Header` holding the text captured by `HEADER_REGEXP`.
    :raises ParseError: If the text doesn't match `HEADER_REGEXP`.
    """
    match = HEADER_REGEXP.match(text)
    if not match:
        raise ParseError('Malformed header', text)
    return Header(match.group(1))
82 147
83 148
def _parse_instruction(text):
    """Parse a special instruction line.

    :param text: Instruction line (`parse_line` passes text enclosed in
                 `%` characters).
    :returns: `Include` holding the include target.
    :raises ParseError: If the text doesn't match `INCLUDE_REGEXP`.
    """
    match = INCLUDE_REGEXP.match(text)
    if not match:
        raise ParseError('Unrecognized instruction', text)
    return Include(match.group(1))
89 154
90 155
def _parse_option(option):
    """Split one option into a `(name, value)` tuple.

    :param option: Text of a single option, e.g. `name=value`, `name` or
                   `~name`.
    :returns: `(name, value)` tuple. For `name=value` the value is the text
              after the first `=`; for `~name` it is `False`; for a bare
              `name` it is `True`.
    """
    name, separator, value = option.partition('=')
    if separator:
        # Always return a tuple so that all three forms have the same shape.
        return name, value
    if option.startswith('~'):
        return option[1:], False
    return option, True
162
163
def _parse_filter_option(option):
    """Parse one option of a blocking filter.

    :param option: Text of a single option (without the separating commas).
    :returns: `(name, value)` tuple. For `domain` the value is a list of
              `(domain, included)` pairs, for `sitekey` a list of strings;
              for any other option it is whatever `_parse_option` returns.
    :raises ParseError: If `domain` or `sitekey` is given without a value.
    """
    name, value = _parse_option(option)

    # Handle special cases of multivalued options.
    if name in (FILTER_OPTION.DOMAIN, FILTER_OPTION.SITEKEY):
        # Without `=...` the value is a bare boolean flag and there is
        # nothing to split; report it as a parse error instead of failing
        # with an AttributeError on `bool.split`.
        if isinstance(value, bool):
            raise ParseError('Missing option value', option)
        if name == FILTER_OPTION.DOMAIN:
            value = [_parse_option(o) for o in value.split('|')]
        else:
            value = value.split('|')

    return name, value
174
175
def _parse_filter_options(options):
    """Parse the comma-separated option list of a blocking filter.

    :param options: Text of the options (the part after `$`).
    :returns: List of `(name, value)` tuples, one per option, in order.
    """
    return [_parse_filter_option(o) for o in options.split(',')]
178
179
def _parse_blocking_filter(text):
    """Parse a blocking or whitelisting (`@@`) filter.

    :param text: Text representation of the filter.
    :returns: `Filter` with `text` as given, a selector dictionary with
              `'type'` and `'value'` keys, the action and a list of parsed
              options.
    """
    # Based on RegExpFilter.fromText in lib/filterClasses.js
    # in https://hg.adblockplus.org/adblockpluscore.
    action = FILTER_ACTION.BLOCK
    options = []
    selector = text

    if selector.startswith('@@'):
        action = FILTER_ACTION.ALLOW
        selector = selector[2:]

    # Cheap substring test first; the regular expression only runs when an
    # options part is possible at all.
    if '$' in selector:
        opt_match = FILTER_OPTIONS_REGEXP.search(selector)
        if opt_match:
            selector = selector[:opt_match.start(0)]
            options = _parse_filter_options(opt_match.group(1))

    # `/.../` denotes a regular expression; the length check keeps a lone
    # `/` from being treated as an empty one.
    if (len(selector) > 1 and
            selector.startswith('/') and selector.endswith('/')):
        selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}
    else:
        selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}

    return Filter(text, selector, action, options)
204
205
def _parse_hiding_filter(text, domain, type_flag, selector_value):
    """Build a `Filter` for an element hiding filter.

    The last three arguments are the groups of `HIDING_FILTER_REGEXP`.

    :param text: Text representation of the filter.
    :param domain: Comma-separated domains before the first `#` (may be
                   empty).
    :param type_flag: `'@'` for an exception rule, `'?'` for an extended
                      CSS selector, `None` otherwise.
    :param selector_value: The selector after the second `#`.
    :returns: `Filter` with action `HIDE` or `SHOW`, a selector of type
              `CSS` or `XCSS` and, if domains were given, one `domain`
              option listing `(domain, included)` pairs.
    """
    selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}
    action = FILTER_ACTION.HIDE
    options = []

    if type_flag == '@':
        action = FILTER_ACTION.SHOW
    elif type_flag == '?':
        selector['type'] = SELECTOR_TYPE.XCSS

    if domain:
        domains = [_parse_option(d) for d in domain.split(',')]
        options.append((FILTER_OPTION.DOMAIN, domains))

    return Filter(text, selector, action, options)
221
222
def parse_filter(text):
    """Parse one filter.

    :param text: Text representation of a filter.
    :returns: Filter object.
    """
    # Only text containing `#` can be a hiding filter; anything that does
    # not match the hiding syntax is parsed as a blocking filter.
    if '#' in text:
        match = HIDING_FILTER_REGEXP.search(text)
        if match:
            return _parse_hiding_filter(text, *match.groups())
    return _parse_blocking_filter(text)
234
235
def parse_line(line_text):
    """Parse one line of a filter list.

    :param line_text: Line of a filter list (must be a unicode string).
    :returns: Parsed line object (see `_line_type`).
    :raises ParseError: If the line can't be successfully parsed.
    """
    content = line_text.strip()

    if content == '':
        line = EmptyLine()
    elif content.startswith('!'):
        line = _parse_comment(content)
    elif content.startswith('%') and content.endswith('%'):
        line = _parse_instruction(content)
    elif content.startswith('[') and content.endswith(']'):
        line = _parse_header(content)
    else:
        line = parse_filter(content)

    # Sanity check: converting the parsed line back to a string must
    # reproduce the input (differences in spaces are ignored).
    assert line.to_string().replace(' ', '') == content.replace(' ', '')
    return line
113 258
114 259
def parse_filterlist(lines):
    """Parse filter list from an iterable.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines.
    :raises ParseError: Can be thrown during iteration for invalid lines.
    """
    # Generator: each line is parsed only when the caller asks for it, so
    # errors surface during iteration rather than at call time.
    for line in lines:
        yield parse_line(line)
OLD | NEW
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld