Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
12 # | 12 # |
13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
15 | 15 |
16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
17 | 17 |
18 import re | 18 import re |
19 from collections import namedtuple | 19 from collections import namedtuple |
20 | 20 |
21 __all__ = ['parse_filterlist', 'parse_line', 'parse_filter'] | 21 __all__ = [ |
22 'FILTER_ACTION', | |
23 'FILTER_OPTION', | |
24 'ParseError', | |
25 'SELECTOR_TYPE', | |
26 'parse_filterlist', | |
27 'parse_line', | |
28 ] | |
22 | 29 |
23 | 30 |
24 class ParseError(Exception): | 31 class ParseError(Exception): |
25 """Internal exception used by the parser to signal invalid input.""" | 32 """Exception thrown by the parser when it encounters invalid input. |
mathias
2017/07/26 20:37:15
Removing the custom __init__ function looks like a…
| |
26 | 33 |
27 | 34 :param error: Description of the error. |
28 def line_type(name, field_names, format_string): | 35 :param text: The text which was being parsed when an error occurred. |
36 """ | |
37 | |
38 def __init__(self, error, text): | |
39 Exception.__init__(self, '{} in "{}"'.format(error, text)) | |
40 self.text = text | |
41 self.error = error | |
42 | |
43 | |
44 # Constants related to filters (see https://adblockplus.org/filters). | |
45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant). | |
46 """Selector types""" | |
47 URL_PATTERN = 'url-pattern' # Normal URL patterns. | |
48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs. | |
49 CSS = 'css' # CSS selectors for hiding filters. | |
50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). | |
51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. | |
52 | |
53 | |
54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant). | |
55 """Filter actions""" | |
56 BLOCK = 'block' # Block the request. | |
57 ALLOW = 'allow' # Allow the request (whitelist). | |
58 HIDE = 'hide' # Hide selected element(s). | |
59 SHOW = 'show' # Show selected element(s) (whitelist). | |
60 | |
61 | |
62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant). | |
63 """Filter options""" | |
64 # Resource types. | |
65 OTHER = 'other' | |
66 SCRIPT = 'script' | |
67 IMAGE = 'image' | |
68 STYLESHEET = 'stylesheet' | |
69 OBJECT = 'object' | |
70 SUBDOCUMENT = 'subdocument' | |
71 DOCUMENT = 'document' | |
72 WEBSOCKET = 'websocket' | |
73 WEBRTC = 'webrtc' | |
74 PING = 'ping' | |
75 XMLHTTPREQUEST = 'xmlhttprequest' | |
76 OBJECT_SUBREQUEST = 'object-subrequest' | |
77 MEDIA = 'media' | |
78 FONT = 'font' | |
79 POPUP = 'popup' | |
80 GENERICBLOCK = 'genericblock' | |
81 ELEMHIDE = 'elemhide' | |
82 GENERICHIDE = 'generichide' | |
83 | |
84 # Deprecated resource types. | |
85 BACKGROUND = 'background' | |
86 XBL = 'xbl' | |
87 DTD = 'dtd' | |
88 | |
89 # Other options. | |
90 MATCH_CASE = 'match-case' | |
91 DOMAIN = 'domain' | |
92 THIRD_PARTY = 'third-party' | |
93 COLLAPSE = 'collapse' | |
94 SITEKEY = 'sitekey' | |
95 DONOTTRACK = 'donottrack' | |
96 | |
97 | |
98 def _line_type(name, field_names, format_string): | |
29 """Define a line type. | 99 """Define a line type. |
30 | 100 |
31 :param name: The name of the line type to define. | 101 :param name: The name of the line type to define. |
32 :param field_names: A sequence of field names or one space-separated | 102 :param field_names: A sequence of field names or one space-separated |
33 string that contains all field names. | 103 string that contains all field names. |
34 :param format_string: A format specifier for converting this line type | 104 :param format_string: A format specifier for converting this line type |
mathias
2017/07/26 20:37:15
Fixing the missing format_string parameter documentation…
| |
35 back to string representation. | 105 back to string representation. |
36 :returns: Class created with `namedtuple` that has `.type` set to | 106 :returns: Class created with `namedtuple` that has `.type` set to |
37 lowercased `name` and supports conversion back to string with | 107 lowercased `name` and supports conversion back to string with |
38 `.to_string()` method. | 108 `.to_string()` method. |
39 """ | 109 """ |
40 lt = namedtuple(name, field_names) | 110 lt = namedtuple(name, field_names) |
41 lt.type = name.lower() | 111 lt.type = name.lower() |
42 lt.to_string = lambda self: format_string.format(self) | 112 lt.to_string = lambda self: format_string.format(self) |
43 return lt | 113 return lt |
44 | 114 |
45 | 115 |
46 InvalidLine = line_type('Invalid', 'text error', '{.text}') | 116 Header = _line_type('Header', 'version', '[{.version}]') |
47 Header = line_type('Header', 'version', '[{.version}]') | 117 EmptyLine = _line_type('EmptyLine', '', '') |
48 EmptyLine = line_type('EmptyLine', '', '') | 118 Comment = _line_type('Comment', 'text', '! {.text}') |
49 Comment = line_type('Comment', 'text', '! {.text}') | 119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
50 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 120 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
51 Include = line_type('Include', 'target', '%include {0.target}%') | 121 Include = _line_type('Include', 'target', '%include {0.target}%') |
52 Filter = line_type('Filter', 'text selector action options', '{.text}') | |
53 | 122 |
54 | 123 |
55 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
56 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
57 'Version'} | 126 'Version'} |
58 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
59 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
60 BFILTER_OPTIONS_REGEXP = re.compile( | 129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
61 r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$' | 130 FILTER_OPTIONS_REGEXP = re.compile( |
131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' | |
62 ) | 132 ) |
63 HFILTER_REGEXP = re.compile( | |
64 r'^([^\/\*\|\@"!]*?)#(\@)?(?:([\w\-]+|\*)' | |
65 r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)|#([^{}]+))$' | |
66 ) | |
67 | |
68 # Types of resources to block (based on adblockpluscore/lib/filterClasses.js). | |
69 TYPES = { | |
70 'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media', | |
71 'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup', | |
72 'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide', | |
73 } | |
74 | |
75 # Special types used for whitelisting. | |
76 TYPES_WHITELIST = { | |
77 'document', 'elemhide', 'generichide', 'genericblock', | |
78 } | |
79 | |
80 # By default blocking filters apply to everything except whitelist-only types | |
81 # and popups (based on adblockpluscore/lib/filterClasses.js). | |
82 TYPES_DEFAULT = {t for t in TYPES if t not in TYPES_WHITELIST} | |
83 | |
84 # Type options that are synonyms for other types. | |
85 TYPE_SYNONYMS = { | |
86 'xbl': 'other', | |
87 'dtd': 'other', | |
88 'background': 'image', | |
89 } | |
90 | 133 |
91 | 134 |
92 def _parse_comment(text): | 135 def _parse_comment(text): |
93 match = METADATA_REGEXP.match(text) | 136 match = METADATA_REGEXP.match(text) |
94 if match and match.group(1) in METADATA_KEYS: | 137 if match and match.group(1) in METADATA_KEYS: |
95 return Metadata(match.group(1), match.group(2)) | 138 return Metadata(match.group(1), match.group(2)) |
96 return Comment(text[1:].strip()) | 139 return Comment(text[1:].strip()) |
97 | 140 |
98 | 141 |
99 def _parse_header(text): | 142 def _parse_header(text): |
100 match = HEADER_REGEXP.match(text) | 143 match = HEADER_REGEXP.match(text) |
101 if not match: | 144 if not match: |
102 raise ParseError('Malformed header') | 145 raise ParseError('Malformed header', text) |
mathias
2017/07/26 20:37:15
Please explain why you don't include the malformed…
Vasily Kuznetsov
2017/07/27 11:05:02
My reasoning was that you can get to this place in…
| |
103 return Header(match.group(1)) | 146 return Header(match.group(1)) |
104 | 147 |
105 | 148 |
106 def _parse_instruction(text): | 149 def _parse_instruction(text): |
107 match = INCLUDE_REGEXP.match(text) | 150 match = INCLUDE_REGEXP.match(text) |
108 if not match: | 151 if not match: |
109 raise ParseError('Unrecognized instruction') | 152 raise ParseError('Unrecognized instruction', text) |
110 return Include(match.group(1)) | 153 return Include(match.group(1)) |
111 | 154 |
112 | 155 |
113 def _separate_domains(domains): | 156 def _parse_option(option): |
114 options = {} | 157 if '=' in option: |
115 for d in domains: | 158 return option.split('=', 1) |
116 if d.startswith('~'): | 159 if option.startswith('~'): |
117 options.setdefault('domains-exclude', []).append(d.lstrip('~')) | 160 return option[1:], False |
118 else: | 161 return option, True |
119 options.setdefault('domains-include', []).append(d) | 162 |
120 if 'domains-include' in options: | 163 |
121 options['domains-none'] = True | 164 def _parse_filter_option(option): |
122 return options | 165 name, value = _parse_option(option) |
123 | 166 |
124 | 167 # Handle special cases of multivalued options. |
125 def _separate_types(types): | 168 if name == FILTER_OPTION.DOMAIN: |
126 """Convert a list of `(type, on_off)` tuples to options: | 169 value = [_parse_option(o) for o in value.split('|')] |
127 | 170 elif name == FILTER_OPTION.SITEKEY: |
128 - types-none: True if we start with nothing included, absent if we start | 171 value = value.split('|') |
129 with TYPES_DEFAULT included. | 172 |
130 - types-include: List of additional included types. | 173 return name, value |
131 - types-exclude: List of excluded types. | |
132 """ | |
133 if not types: | |
134 return {} | |
135 | |
136 if types[0][1]: # If the first type is ON, we start with nothing... | |
137 types_default = set() | |
138 options = {'types-none': True} | |
139 else: # ...otherwise we start with default type set. | |
140 types_default = TYPES_DEFAULT | |
141 options = {} | |
142 | |
143 # Include/exclude any deviations from default. | |
144 for name, value in dict(types).items(): | |
145 if value and name not in types_default: | |
146 options.setdefault('types-include', []).append(name) | |
147 if not value and name in types_default: | |
148 options.setdefault('types-exclude', []).append(name) | |
149 | |
150 return options | |
151 | |
152 | |
153 def _parse_hiding_filter(text, match): | |
154 if match.group(5): | |
155 selector = {'type': 'css', 'value': match.group(5)} | |
156 else: | |
157 selector = { | |
158 'type': 'abp-simple', | |
159 'value': match.group(3) + match.group(4), | |
160 } | |
161 action = 'show' if match.group(2) else 'hide' | |
162 options = _separate_domains(list(filter(None, match.group(1).split(',')))) | |
163 return Filter(text, selector, action, options) | |
164 | 174 |
165 | 175 |
166 def _parse_filter_options(options): | 176 def _parse_filter_options(options): |
167 # Based on RegExpFilter.fromText in lib/filterClasses.js | 177 return [_parse_filter_option(o) for o in options.split(',')] |
168 # in adblockpluscore. | |
169 parsed_options = {} | |
170 type_options = [] | |
171 | |
172 for option in options.split(','): | |
173 if '=' in option: | |
174 name, value = option.split('=', 1) | |
175 elif option.startswith('~'): | |
176 name, value = option[1:], False | |
177 else: | |
178 name, value = option, True | |
179 | |
180 if name in TYPE_SYNONYMS: | |
181 name = TYPE_SYNONYMS[name] | |
182 if name in TYPES: | |
183 type_options.append((name, value)) | |
184 elif name == 'domain': | |
185 parsed_options.update(_separate_domains(value.split('|'))) | |
186 elif name == 'sitekey': | |
187 parsed_options['sitekeys'] = value.split('|') | |
188 else: | |
189 parsed_options[name] = value | |
190 | |
191 parsed_options.update(_separate_types(type_options)) | |
192 return parsed_options | |
193 | 178 |
194 | 179 |
195 def _parse_blocking_filter(text): | 180 def _parse_blocking_filter(text): |
196 # Based on RegExpFilter.fromText in lib/filterClasses.js | 181 # Based on RegExpFilter.fromText in lib/filterClasses.js |
197 # in adblockpluscore. | 182 # in https://hg.adblockplus.org/adblockpluscore. |
198 action = 'block' | 183 action = FILTER_ACTION.BLOCK |
199 options = {} | 184 options = [] |
200 selector = text | 185 selector = text |
201 | 186 |
202 if selector.startswith('@@'): | 187 if selector.startswith('@@'): |
203 action = 'allow' | 188 action = FILTER_ACTION.ALLOW |
mathias
2017/07/26 20:37:15
I think we should have symbols like BFILTER_ACTION…
Vasily Kuznetsov
2017/07/27 11:05:02
Probably not these exact names for the constants,
| |
204 selector = selector[2:] | 189 selector = selector[2:] |
205 | 190 |
206 if '$' in selector: | 191 if '$' in selector: |
207 opt_match = BFILTER_OPTIONS_REGEXP.search(selector) | 192 opt_match = FILTER_OPTIONS_REGEXP.search(selector) |
208 if opt_match: | 193 if opt_match: |
209 selector = selector[:opt_match.start(0)] | 194 selector = selector[:opt_match.start(0)] |
210 options = _parse_filter_options(opt_match.group(1)) | 195 options = _parse_filter_options(opt_match.group(1)) |
211 | 196 |
212 if (len(selector) > 1 and | 197 if (len(selector) > 1 and |
213 selector.startswith('/') and selector.endswith('/')): | 198 selector.startswith('/') and selector.endswith('/')): |
214 selector = {'type': 'url-regexp', 'value': selector[1:-1]} | 199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]} |
mathias
2017/07/26 20:37:15
I also think we should have symbols like SELECTOR_…
Vasily Kuznetsov
2017/07/27 11:05:02
Acknowledged.
| |
215 else: | 200 else: |
216 selector = {'type': 'url-pattern', 'value': selector} | 201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector} |
202 | |
203 return Filter(text, selector, action, options) | |
204 | |
205 | |
206 def _parse_hiding_filter(text, domain, type_flag, selector_value): | |
207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value} | |
208 action = FILTER_ACTION.HIDE | |
209 options = [] | |
210 | |
211 if type_flag == '@': | |
212 action = FILTER_ACTION.SHOW | |
213 elif type_flag == '?': | |
214 selector['type'] = SELECTOR_TYPE.XCSS | |
215 | |
216 if domain: | |
217 domains = [_parse_option(d) for d in domain.split(',')] | |
218 options.append((FILTER_OPTION.DOMAIN, domains)) | |
217 | 219 |
218 return Filter(text, selector, action, options) | 220 return Filter(text, selector, action, options) |
219 | 221 |
220 | 222 |
221 def parse_filter(text): | 223 def parse_filter(text): |
222 """Parse one filter. | 224 """Parse one filter. |
223 | 225 |
224 :param text: Text representation of a filter. | 226 :param text: Text representation of a filter. |
225 :returns: filter object. | 227 :returns: Filter object. |
226 """ | 228 """ |
227 match = HFILTER_REGEXP.match(text) if '#' in text else False | 229 if '#' in text: |
mathias
2017/07/26 20:37:15
Call me old-fashioned but I seriously dislike chan…
Vasily Kuznetsov
2017/07/27 11:05:02
Completely agree about changing the type of the va…
| |
228 if match: | 230 match = HIDING_FILTER_REGEXP.search(text) |
229 return _parse_hiding_filter(text, match) | 231 if match: |
232 return _parse_hiding_filter(text, *match.groups()) | |
230 return _parse_blocking_filter(text) | 233 return _parse_blocking_filter(text) |
231 | 234 |
232 | 235 |
233 def parse_line(line_text): | 236 def parse_line(line_text): |
234 """Parse one line of a filter list. | 237 """Parse one line of a filter list. |
235 | 238 |
236 :param line_text: Line of a filter list (must be a unicode string). | 239 :param line_text: Line of a filter list (must be a unicode string). |
237 :returns: Parsed line object (see `line_type`). | 240 :returns: Parsed line object (see `_line_type`). |
238 :raises ParseError: If the line can't be successfully parsed. | 241 :raises ParseError: If the line can't be successfully parsed. |
239 """ | 242 """ |
240 content = line_text.strip() | 243 content = line_text.strip() |
241 | 244 |
242 if content == '': | 245 if content == '': |
243 line = EmptyLine() | 246 line = EmptyLine() |
244 elif content.startswith('!'): | 247 elif content.startswith('!'): |
245 line = _parse_comment(content) | 248 line = _parse_comment(content) |
246 elif content.startswith('%') and content.endswith('%'): | 249 elif content.startswith('%') and content.endswith('%'): |
247 line = _parse_instruction(content) | 250 line = _parse_instruction(content) |
248 elif content.startswith('[') and content.endswith(']'): | 251 elif content.startswith('[') and content.endswith(']'): |
249 line = _parse_header(content) | 252 line = _parse_header(content) |
250 else: | 253 else: |
251 line = parse_filter(content) | 254 line = parse_filter(content) |
252 | 255 |
253 assert line.to_string().replace(' ', '') == content.replace(' ', '') | 256 assert line.to_string().replace(' ', '') == content.replace(' ', '') |
254 return line | 257 return line |
255 | 258 |
256 | 259 |
257 def parse_filterlist(lines): | 260 def parse_filterlist(lines): |
258 """Parse filter list from an iterable. | 261 """Parse filter list from an iterable. |
259 | 262 |
260 :param lines: List of strings or file or other iterable. | 263 :param lines: List of strings or file or other iterable. |
261 :returns: Iterator over parsed lines. | 264 :returns: Iterator over parsed lines. |
262 :raises ParseError: Can be thrown during iteration for invalid lines. | 265 :raises ParseError: Can be thrown during iteration for invalid lines. |
263 """ | 266 """ |
264 for line in lines: | 267 for line in lines: |
265 try: | 268 yield parse_line(line) |
266 yield parse_line(line) | |
267 except ParseError as pe: | |
268 yield InvalidLine(line.strip(), str(pe)) | |
LEFT | RIGHT |