Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Patch Set: Created June 14, 2017, 5:32 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | setup.py » ('j') | setup.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
OLD | NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-2017 eyeo GmbH 2 # Copyright (C) 2006-2017 eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import re 18 import re
19 from collections import namedtuple 19 from collections import namedtuple
20 20
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] 21 __all__ = ['parse_filterlist', 'parse_line', 'parse_filter']
22 22
23 23
24 class ParseError(Exception): 24 class ParseError(Exception):
25 """Exception thrown by the parser when it encounters invalid input. 25 """Internal exception used by the parser to signal invalid input."""
mathias 2017/07/26 20:37:15 Removing the custom __init__ function looks like a
26
27 :param error: Description of the error.
28 :param text: The text which was being parsed when an error occurred.
29 """
30
31 def __init__(self, error, text):
32 Exception.__init__(self, '{} in "{}"'.format(error, text))
33 self.text = text
34 self.error = error
35 26
36 27
37 def line_type(name, field_names, format_string): 28 def line_type(name, field_names, format_string):
38 """Define a line type. 29 """Define a line type.
39 30
40 :param name: The name of the line type to define. 31 :param name: The name of the line type to define.
41 :param field_names: A sequence of field names or one space-separated 32 :param field_names: A sequence of field names or one space-separated
42 string that contains all field names. 33 string that contains all field names.
34 :param format_string: A format specifier for converting this line type
mathias 2017/07/26 20:37:15 Fixing the missing format_string parameter documen
35 back to string representation.
43 :returns: Class created with `namedtuple` that has `.type` set to 36 :returns: Class created with `namedtuple` that has `.type` set to
44 lowercased `name` and supports conversion back to string with 37 lowercased `name` and supports conversion back to string with
45 `.to_string()` method. 38 `.to_string()` method.
46 """ 39 """
47 lt = namedtuple(name, field_names) 40 lt = namedtuple(name, field_names)
48 lt.type = name.lower() 41 lt.type = name.lower()
49 lt.to_string = lambda self: format_string.format(self) 42 lt.to_string = lambda self: format_string.format(self)
50 return lt 43 return lt
51 44
52 45
46 InvalidLine = line_type('Invalid', 'text error', '{.text}')
53 Header = line_type('Header', 'version', '[{.version}]') 47 Header = line_type('Header', 'version', '[{.version}]')
54 EmptyLine = line_type('EmptyLine', '', '') 48 EmptyLine = line_type('EmptyLine', '', '')
55 Comment = line_type('Comment', 'text', '! {.text}') 49 Comment = line_type('Comment', 'text', '! {.text}')
56 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}') 50 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}')
57 Filter = line_type('Filter', 'expression', '{.expression}')
58 Include = line_type('Include', 'target', '%include {0.target}%') 51 Include = line_type('Include', 'target', '%include {0.target}%')
52 Filter = line_type('Filter', 'text selector action options', '{.text}')
59 53
60 54
61 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') 55 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
62 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', 56 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
63 'Version'} 57 'Version'}
64 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 58 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
65 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 59 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
60 BFILTER_OPTIONS_REGEXP = re.compile(
61 r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$'
62 )
63 HFILTER_REGEXP = re.compile(
64 r'^([^\/\*\|\@"!]*?)#(\@)?(?:([\w\-]+|\*)'
65 r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)|#([^{}]+))$'
66 )
67
68 # Types of resources to block (based on adblockpluscore/lib/filterClasses.js).
69 TYPES = {
70 'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',
71 'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',
72 'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',
73 }
74
75 # Special types used for whitelisting.
76 TYPES_WHITELIST = {
77 'document', 'elemhide', 'generichide', 'genericblock',
78 }
79
80 # By default blocking filters apply to everything except whitelist-only types
81 # and popups (based on adblockpluscore/lib/filterClasses.js).
82 TYPES_DEFAULT = {t for t in TYPES if t not in TYPES_WHITELIST}
83
84 # Type options that are synonyms for other types.
85 TYPE_SYNONYMS = {
86 'xbl': 'other',
87 'dtd': 'other',
88 'background': 'image',
89 }
66 90
67 91
68 def _parse_comment(text): 92 def _parse_comment(text):
69 match = METADATA_REGEXP.match(text) 93 match = METADATA_REGEXP.match(text)
70 if match and match.group(1) in METADATA_KEYS: 94 if match and match.group(1) in METADATA_KEYS:
71 return Metadata(match.group(1), match.group(2)) 95 return Metadata(match.group(1), match.group(2))
72 return Comment(text[1:].strip()) 96 return Comment(text[1:].strip())
73 97
74 98
75 def _parse_header(text): 99 def _parse_header(text):
76 match = HEADER_REGEXP.match(text) 100 match = HEADER_REGEXP.match(text)
77 if not match: 101 if not match:
78 raise ParseError('Malformed header', text) 102 raise ParseError('Malformed header')
mathias 2017/07/26 20:37:15 Please explain why you don't include the malformed
Vasily Kuznetsov 2017/07/27 11:05:02 My reasoning was that you can get to this place in
79 return Header(match.group(1)) 103 return Header(match.group(1))
80 104
81 105
82 def _parse_instruction(text): 106 def _parse_instruction(text):
83 match = INCLUDE_REGEXP.match(text) 107 match = INCLUDE_REGEXP.match(text)
84 if not match: 108 if not match:
85 raise ParseError('Unrecognized instruction', text) 109 raise ParseError('Unrecognized instruction')
86 return Include(match.group(1)) 110 return Include(match.group(1))
87 111
88 112
113 def _separate_domains(domains):
114 options = {}
115 for d in domains:
116 if d.startswith('~'):
117 options.setdefault('domains-exclude', []).append(d.lstrip('~'))
118 else:
119 options.setdefault('domains-include', []).append(d)
120 if 'domains-include' in options:
121 options['domains-none'] = True
122 return options
123
124
125 def _separate_types(types):
126 """Convert a list of `(type, on_off)` tuples to options:
127
128 - types-none: True if we start with nothing included, absent if we start
129 with TYPES_DEFAULT included.
130 - types-include: List of additional included types.
131 - types-exclude: List of excluded types.
132 """
133 if not types:
134 return {}
135
136 if types[0][1]: # If the first type is ON, we start with nothing...
137 types_default = set()
138 options = {'types-none': True}
139 else: # ...otherwise we start with default type set.
140 types_default = TYPES_DEFAULT
141 options = {}
142
143 # Include/exclude any deviations from default.
144 for name, value in dict(types).items():
145 if value and name not in types_default:
146 options.setdefault('types-include', []).append(name)
147 if not value and name in types_default:
148 options.setdefault('types-exclude', []).append(name)
149
150 return options
151
152
153 def _parse_hiding_filter(text, match):
154 if match.group(5):
155 selector = {'type': 'css', 'value': match.group(5)}
156 else:
157 selector = {
158 'type': 'abp-simple',
159 'value': match.group(3) + match.group(4),
160 }
161 action = 'show' if match.group(2) else 'hide'
162 options = _separate_domains(list(filter(None, match.group(1).split(','))))
163 return Filter(text, selector, action, options)
164
165
166 def _parse_filter_options(options):
167 # Based on RegExpFilter.fromText in lib/filterClasses.js
168 # in adblockpluscore.
169 parsed_options = {}
170 type_options = []
171
172 for option in options.split(','):
173 if '=' in option:
174 name, value = option.split('=', 1)
175 elif option.startswith('~'):
176 name, value = option[1:], False
177 else:
178 name, value = option, True
179
180 if name in TYPE_SYNONYMS:
181 name = TYPE_SYNONYMS[name]
182 if name in TYPES:
183 type_options.append((name, value))
184 elif name == 'domain':
185 parsed_options.update(_separate_domains(value.split('|')))
186 elif name == 'sitekey':
187 parsed_options['sitekeys'] = value.split('|')
188 else:
189 parsed_options[name] = value
190
191 parsed_options.update(_separate_types(type_options))
192 return parsed_options
193
194
195 def _parse_blocking_filter(text):
196 # Based on RegExpFilter.fromText in lib/filterClasses.js
197 # in adblockpluscore.
198 action = 'block'
199 options = {}
200 selector = text
201
202 if selector.startswith('@@'):
203 action = 'allow'
mathias 2017/07/26 20:37:15 I think we should have symbols like BFILTER_ACTION
Vasily Kuznetsov 2017/07/27 11:05:02 Probably not these exact names for the constants,
204 selector = selector[2:]
205
206 if '$' in selector:
207 opt_match = BFILTER_OPTIONS_REGEXP.search(selector)
208 if opt_match:
209 selector = selector[:opt_match.start(0)]
210 options = _parse_filter_options(opt_match.group(1))
211
212 if (len(selector) > 1 and
213 selector.startswith('/') and selector.endswith('/')):
214 selector = {'type': 'url-regexp', 'value': selector[1:-1]}
mathias 2017/07/26 20:37:15 I also think we should have symbols like SELECTOR_
Vasily Kuznetsov 2017/07/27 11:05:02 Acknowledged.
215 else:
216 selector = {'type': 'url-pattern', 'value': selector}
217
218 return Filter(text, selector, action, options)
219
220
221 def parse_filter(text):
222 """Parse one filter.
223
224 :param text: Text representation of a filter.
225 :returns: filter object.
226 """
227 match = HFILTER_REGEXP.match(text) if '#' in text else False
mathias 2017/07/26 20:37:15 Call me old-fashioned but I seriously dislike chan
Vasily Kuznetsov 2017/07/27 11:05:02 Completely agree about changing the type of the va
228 if match:
229 return _parse_hiding_filter(text, match)
230 return _parse_blocking_filter(text)
231
232
89 def parse_line(line_text): 233 def parse_line(line_text):
90 """Parse one line of a filter list. 234 """Parse one line of a filter list.
91 235
92 :param line_text: Line of a filter list (must be a unicode string). 236 :param line_text: Line of a filter list (must be a unicode string).
93 :returns: Parsed line object (see `line_type`). 237 :returns: Parsed line object (see `line_type`).
94 :raises ParseError: If the line can't be successfully parsed. 238 :raises ParseError: If the line can't be successfully parsed.
95 """ 239 """
96 content = line_text.strip() 240 content = line_text.strip()
97 241
98 if content == '': 242 if content == '':
99 line = EmptyLine() 243 line = EmptyLine()
100 elif content.startswith('!'): 244 elif content.startswith('!'):
101 line = _parse_comment(content) 245 line = _parse_comment(content)
102 elif content.startswith('%') and content.endswith('%'): 246 elif content.startswith('%') and content.endswith('%'):
103 line = _parse_instruction(content) 247 line = _parse_instruction(content)
104 elif content.startswith('[') and content.endswith(']'): 248 elif content.startswith('[') and content.endswith(']'):
105 line = _parse_header(content) 249 line = _parse_header(content)
106 else: 250 else:
107 line = Filter(content) 251 line = parse_filter(content)
108 252
109 assert line.to_string().replace(' ', '') == content.replace(' ', '') 253 assert line.to_string().replace(' ', '') == content.replace(' ', '')
110 return line 254 return line
111 255
112 256
113 def parse_filterlist(lines): 257 def parse_filterlist(lines):
114 """Parse filter list from an iterable. 258 """Parse filter list from an iterable.
115 259
116 :param lines: List of strings or file or other iterable. 260 :param lines: List of strings or file or other iterable.
117 :returns: Iterator over parsed lines. 261 :returns: Iterator over parsed lines.
118 :raises ParseError: Can be thrown during iteration for invalid lines. 262 :raises ParseError: Can be thrown during iteration for invalid lines.
119 """ 263 """
120 for line in lines: 264 for line in lines:
121 yield parse_line(line) 265 try:
266 yield parse_line(line)
267 except ParseError as pe:
268 yield InvalidLine(line.strip(), str(pe))
OLD | NEW
« no previous file with comments | « no previous file | setup.py » ('j') | setup.py » ('J')

Powered by Google App Engine
This is Rietveld