Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Left Patch Set: remove all interpretation and keep only parsing, add support for element hiding emulation filters, … Created July 27, 2017, 7:16 p.m.
Right Patch Set: Rebase to 1f5d7ead9bff Created Oct. 24, 2017, 3:58 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | tests/test_parser.py » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-2017 eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import re 18 import re
19 from collections import namedtuple 19 from collections import namedtuple
20 20
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError', 'ST', 'FA'] 21 __all__ = [
22 'FILTER_ACTION',
23 'FILTER_OPTION',
24 'ParseError',
25 'SELECTOR_TYPE',
26 'parse_filterlist',
27 'parse_line',
28 ]
22 29
23 30
24 class ParseError(Exception): 31 class ParseError(Exception):
25 """Exception thrown by the parser when it encounters invalid input. 32 """Exception thrown by the parser when it encounters invalid input.
26 33
27 :param error: Description of the error. 34 :param error: Description of the error.
28 :param text: The text which was being parsed when an error occurred. 35 :param text: The text which was being parsed when an error occurred.
29 """ 36 """
30 37
31 def __init__(self, error, text): 38 def __init__(self, error, text):
32 Exception.__init__(self, '{} in "{}"'.format(error, text)) 39 Exception.__init__(self, '{} in "{}"'.format(error, text))
33 self.text = text 40 self.text = text
34 self.error = error 41 self.error = error
35 42
36 43
37 # Constants related to filters (see https://adblockplus.org/filters). 44 # Constants related to filters (see https://adblockplus.org/filters).
38 class ST: 45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).
mathias 2017/07/28 16:43:29 Why abbreviate here (ST) and below (FA)?
Vasily Kuznetsov 2017/07/28 17:38:10 To be completely honest, the reason is kind of stu
Vasily Kuznetsov 2017/07/28 18:57:49 Done.
39 """Selector types""" 46 """Selector types"""
40 URL_PATTERN = 'url-pattern' # Normal URL patterns. 47 URL_PATTERN = 'url-pattern' # Normal URL patterns.
41 URL_REGEXP = 'url-regexp' # Regular expressions for URLs. 48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.
42 CSS = 'css' # CSS selectors for hiding filters. 49 CSS = 'css' # CSS selectors for hiding filters.
43 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). 50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).
44 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. 51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.
45 52
46 53
47 class FA: 54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).
48 """Filter actions""" 55 """Filter actions"""
49 BLOCK = 'block' # Block the request. 56 BLOCK = 'block' # Block the request.
50 ALLOW = 'allow' # Allow the request (whitelist). 57 ALLOW = 'allow' # Allow the request (whitelist).
51 HIDE = 'hide' # Hide selected element(s). 58 HIDE = 'hide' # Hide selected element(s).
52 SHOW = 'show' # Show selected element(s) (whitelist). 59 SHOW = 'show' # Show selected element(s) (whitelist).
60
61
62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).
63 """Filter options"""
64 # Resource types.
65 OTHER = 'other'
66 SCRIPT = 'script'
67 IMAGE = 'image'
68 STYLESHEET = 'stylesheet'
69 OBJECT = 'object'
70 SUBDOCUMENT = 'subdocument'
71 DOCUMENT = 'document'
72 WEBSOCKET = 'websocket'
73 WEBRTC = 'webrtc'
74 PING = 'ping'
75 XMLHTTPREQUEST = 'xmlhttprequest'
76 OBJECT_SUBREQUEST = 'object-subrequest'
77 MEDIA = 'media'
78 FONT = 'font'
79 POPUP = 'popup'
80 GENERICBLOCK = 'genericblock'
81 ELEMHIDE = 'elemhide'
82 GENERICHIDE = 'generichide'
83
84 # Deprecated resource types.
85 BACKGROUND = 'background'
86 XBL = 'xbl'
87 DTD = 'dtd'
88
89 # Other options.
90 MATCH_CASE = 'match-case'
91 DOMAIN = 'domain'
92 THIRD_PARTY = 'third-party'
93 COLLAPSE = 'collapse'
94 SITEKEY = 'sitekey'
95 DONOTTRACK = 'donottrack'
53 96
54 97
55 def _line_type(name, field_names, format_string): 98 def _line_type(name, field_names, format_string):
56 """Define a line type. 99 """Define a line type.
57 100
58 :param name: The name of the line type to define. 101 :param name: The name of the line type to define.
59 :param field_names: A sequence of field names or one space-separated 102 :param field_names: A sequence of field names or one space-separated
60 string that contains all field names. 103 string that contains all field names.
61 :param format_string: A format specifier for converting this line type 104 :param format_string: A format specifier for converting this line type
62 back to string representation. 105 back to string representation.
(...skipping 13 matching lines...) Expand all
76 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') 119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
77 Filter = _line_type('Filter', 'text selector action options', '{.text}') 120 Filter = _line_type('Filter', 'text selector action options', '{.text}')
78 Include = _line_type('Include', 'target', '%include {0.target}%') 121 Include = _line_type('Include', 'target', '%include {0.target}%')
79 122
80 123
81 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') 124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
82 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', 125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
83 'Version'} 126 'Version'}
84 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
85 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
86 HFILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') 129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
mathias 2017/07/28 16:43:29 Why abbreviating? What's wrong about HIDING_FILTER
Vasily Kuznetsov 2017/07/28 17:38:10 It's shorter this way. But I don't feel very stron
Vasily Kuznetsov 2017/07/28 18:57:49 Done.
87 BFILTER_REGEXP_REGEXP = re.compile( 130 FILTER_OPTIONS_REGEXP = re.compile(
mathias 2017/07/28 16:43:30 I was wondering about the *_REGEXP_REGEXP name, bu
Vasily Kuznetsov 2017/07/28 17:38:10 It's the regular expression for blocking filters w
Vasily Kuznetsov 2017/07/28 18:57:49 Done.
88 r'^(@@)?\/.*\/(?:\$~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)?$'
89 )
90 BFILTER_OPTIONS_REGEXP = re.compile(
91 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' 131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
92 ) 132 )
93 133
94 134
95 def _parse_comment(text): 135 def _parse_comment(text):
96 match = METADATA_REGEXP.match(text) 136 match = METADATA_REGEXP.match(text)
97 if match and match.group(1) in METADATA_KEYS: 137 if match and match.group(1) in METADATA_KEYS:
98 return Metadata(match.group(1), match.group(2)) 138 return Metadata(match.group(1), match.group(2))
99 return Comment(text[1:].strip()) 139 return Comment(text[1:].strip())
100 140
101 141
102 def _parse_header(text): 142 def _parse_header(text):
103 match = HEADER_REGEXP.match(text) 143 match = HEADER_REGEXP.match(text)
104 if not match: 144 if not match:
105 raise ParseError('Malformed header', text) 145 raise ParseError('Malformed header', text)
106 return Header(match.group(1)) 146 return Header(match.group(1))
107 147
108 148
109 def _parse_instruction(text): 149 def _parse_instruction(text):
110 match = INCLUDE_REGEXP.match(text) 150 match = INCLUDE_REGEXP.match(text)
111 if not match: 151 if not match:
112 raise ParseError('Unrecognized instruction', text) 152 raise ParseError('Unrecognized instruction', text)
113 return Include(match.group(1)) 153 return Include(match.group(1))
114 154
115 155
116 def _parse_option(option): 156 def _parse_option(option):
117 if '=' in option: 157 if '=' in option:
118 name, value = option.split('=', 1) 158 return option.split('=', 1)
119 elif option.startswith('~'): 159 if option.startswith('~'):
120 name, value = option[1:], False 160 return option[1:], False
121 else: 161 return option, True
122 name, value = option, True 162
163
164 def _parse_filter_option(option):
165 name, value = _parse_option(option)
123 166
124 # Handle special cases of multivalued options. 167 # Handle special cases of multivalued options.
125 if name == 'domain': 168 if name == FILTER_OPTION.DOMAIN:
mathias 2017/07/28 16:43:30 Wouldn't it make sense to enumerate recognized OPT
Vasily Kuznetsov 2017/07/28 17:38:10 Yeah, probably makes sense to make some kind of en
Vasily Kuznetsov 2017/07/28 18:57:49 Done.
126 name, value = 'domains', _parse_options(value, '|') 169 value = [_parse_option(o) for o in value.split('|')]
mathias 2017/07/28 16:43:30 Why using a different / plural key for the parsed
Vasily Kuznetsov 2017/07/28 17:38:10 Because semantically it's a list, always, so calli
Vasily Kuznetsov 2017/07/28 18:57:49 Done.
127 elif name == 'sitekey': 170 elif name == FILTER_OPTION.SITEKEY:
128 name, value = 'sitekeys', value.split('|') 171 value = value.split('|')
129 172
130 return name, value 173 return name, value
131 174
132 175
133 def _parse_options(options, separator=','): 176 def _parse_filter_options(options):
134 return [_parse_option(o) for o in options.split(separator)] 177 return [_parse_filter_option(o) for o in options.split(',')]
135 178
136 179
137 def _parse_blocking_filter(text): 180 def _parse_blocking_filter(text):
138 # Based on RegExpFilter.fromText in lib/filterClasses.js 181 # Based on RegExpFilter.fromText in lib/filterClasses.js
139 # in https://hg.adblockplus.org/adblockpluscore. 182 # in https://hg.adblockplus.org/adblockpluscore.
140 action = FA.BLOCK 183 action = FILTER_ACTION.BLOCK
141 options = [] 184 options = []
142 selector = text 185 selector = text
143 186
144 if selector.startswith('@@'): 187 if selector.startswith('@@'):
145 action = FA.ALLOW 188 action = FILTER_ACTION.ALLOW
146 selector = selector[2:] 189 selector = selector[2:]
147 190
148 if '$' in selector: 191 if '$' in selector:
149 opt_match = BFILTER_OPTIONS_REGEXP.search(selector) 192 opt_match = FILTER_OPTIONS_REGEXP.search(selector)
150 if opt_match: 193 if opt_match:
151 selector = selector[:opt_match.start(0)] 194 selector = selector[:opt_match.start(0)]
152 options = _parse_options(opt_match.group(1)) 195 options = _parse_filter_options(opt_match.group(1))
153 196
154 if (len(selector) > 1 and 197 if (len(selector) > 1 and
155 selector.startswith('/') and selector.endswith('/')): 198 selector.startswith('/') and selector.endswith('/')):
156 selector = {'type': ST.URL_REGEXP, 'value': selector[1:-1]} 199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}
157 else: 200 else:
158 selector = {'type': ST.URL_PATTERN, 'value': selector} 201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}
159 202
160 return Filter(text, selector, action, options) 203 return Filter(text, selector, action, options)
161 204
162 205
163 def _parse_hiding_filter(text, domains, type_flag, selector_value): 206 def _parse_hiding_filter(text, domain, type_flag, selector_value):
164 selector = {'type': ST.CSS, 'value': selector_value} 207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}
165 action = FA.HIDE 208 action = FILTER_ACTION.HIDE
166 options = [] 209 options = []
167 210
168 if type_flag == '@': 211 if type_flag == '@':
169 action = FA.SHOW 212 action = FILTER_ACTION.SHOW
170 elif type_flag == '?': 213 elif type_flag == '?':
171 selector['type'] = ST.XCSS 214 selector['type'] = SELECTOR_TYPE.XCSS
172 215
173 if domains: 216 if domain:
174 options.append(('domains', _parse_options(domains))) 217 domains = [_parse_option(d) for d in domain.split(',')]
218 options.append((FILTER_OPTION.DOMAIN, domains))
175 219
176 return Filter(text, selector, action, options) 220 return Filter(text, selector, action, options)
177 221
178 222
179 def parse_filter(text): 223 def parse_filter(text):
180 """Parse one filter. 224 """Parse one filter.
181 225
182 :param text: Text representation of a filter. 226 :param text: Text representation of a filter.
183 :returns: filter object. 227 :returns: Filter object.
mathias 2017/07/28 16:43:29 I think this should be upper-case "Filter".
Vasily Kuznetsov 2017/07/28 17:38:10 Yes.
184 """ 228 """
185 if '#' in text: 229 if '#' in text:
186 match = HFILTER_REGEXP.search(text) 230 match = HIDING_FILTER_REGEXP.search(text)
187 if match: 231 if match:
188 return _parse_hiding_filter(text, *match.groups()) 232 return _parse_hiding_filter(text, *match.groups())
189 return _parse_blocking_filter(text) 233 return _parse_blocking_filter(text)
190 234
191 235
192 def parse_line(line_text): 236 def parse_line(line_text):
193 """Parse one line of a filter list. 237 """Parse one line of a filter list.
194 238
195 :param line_text: Line of a filter list (must be a unicode string). 239 :param line_text: Line of a filter list (must be a unicode string).
196 :returns: Parsed line object (see `_line_type`). 240 :returns: Parsed line object (see `_line_type`).
(...skipping 18 matching lines...) Expand all
215 259
216 def parse_filterlist(lines): 260 def parse_filterlist(lines):
217 """Parse filter list from an iterable. 261 """Parse filter list from an iterable.
218 262
219 :param lines: List of strings or file or other iterable. 263 :param lines: List of strings or file or other iterable.
220 :returns: Iterator over parsed lines. 264 :returns: Iterator over parsed lines.
221 :raises ParseError: Can be thrown during iteration for invalid lines. 265 :raises ParseError: Can be thrown during iteration for invalid lines.
222 """ 266 """
223 for line in lines: 267 for line in lines:
224 yield parse_line(line) 268 yield parse_line(line)
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld