Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
12 # | 12 # |
13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
15 | 15 |
16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
17 | 17 |
18 import re | 18 import re |
19 from collections import namedtuple | 19 from collections import namedtuple |
20 | 20 |
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError', 'ST', 'FA'] | 21 __all__ = [ |
22 'FILTER_ACTION', | |
23 'FILTER_OPTION', | |
24 'ParseError', | |
25 'SELECTOR_TYPE', | |
26 'parse_filterlist', | |
27 'parse_line', | |
28 ] | |
22 | 29 |
23 | 30 |
24 class ParseError(Exception): | 31 class ParseError(Exception): |
25 """Exception thrown by the parser when it encounters invalid input. | 32 """Exception thrown by the parser when it encounters invalid input. |
26 | 33 |
27 :param error: Description of the error. | 34 :param error: Description of the error. |
28 :param text: The text which was being parsed when an error occurred. | 35 :param text: The text which was being parsed when an error occurred. |
29 """ | 36 """ |
30 | 37 |
31 def __init__(self, error, text): | 38 def __init__(self, error, text): |
32 Exception.__init__(self, '{} in "{}"'.format(error, text)) | 39 Exception.__init__(self, '{} in "{}"'.format(error, text)) |
33 self.text = text | 40 self.text = text |
34 self.error = error | 41 self.error = error |
35 | 42 |
36 | 43 |
37 # Constants related to filters (see https://adblockplus.org/filters). | 44 # Constants related to filters (see https://adblockplus.org/filters). |
38 class ST: | 45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant). |
mathias
2017/07/28 16:43:29
Why abbreviate here (ST) and below (FA)?
Vasily Kuznetsov
2017/07/28 17:38:10
To be completely honest, the reason is kind of stu
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
39 """Selector types""" | 46 """Selector types""" |
40 URL_PATTERN = 'url-pattern' # Normal URL patterns. | 47 URL_PATTERN = 'url-pattern' # Normal URL patterns. |
41 URL_REGEXP = 'url-regexp' # Regular expressions for URLs. | 48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs. |
42 CSS = 'css' # CSS selectors for hiding filters. | 49 CSS = 'css' # CSS selectors for hiding filters. |
43 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). | 50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). |
44 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. | 51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. |
45 | 52 |
46 | 53 |
47 class FA: | 54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant). |
48 """Filter actions""" | 55 """Filter actions""" |
49 BLOCK = 'block' # Block the request. | 56 BLOCK = 'block' # Block the request. |
50 ALLOW = 'allow' # Allow the request (whitelist). | 57 ALLOW = 'allow' # Allow the request (whitelist). |
51 HIDE = 'hide' # Hide selected element(s). | 58 HIDE = 'hide' # Hide selected element(s). |
52 SHOW = 'show' # Show selected element(s) (whitelist). | 59 SHOW = 'show' # Show selected element(s) (whitelist). |
60 | |
61 | |
62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant). | |
63 """Filter options""" | |
64 # Resource types. | |
65 OTHER = 'other' | |
66 SCRIPT = 'script' | |
67 IMAGE = 'image' | |
68 STYLESHEET = 'stylesheet' | |
69 OBJECT = 'object' | |
70 SUBDOCUMENT = 'subdocument' | |
71 DOCUMENT = 'document' | |
72 WEBSOCKET = 'websocket' | |
73 WEBRTC = 'webrtc' | |
74 PING = 'ping' | |
75 XMLHTTPREQUEST = 'xmlhttprequest' | |
76 OBJECT_SUBREQUEST = 'object-subrequest' | |
77 MEDIA = 'media' | |
78 FONT = 'font' | |
79 POPUP = 'popup' | |
80 GENERICBLOCK = 'genericblock' | |
81 ELEMHIDE = 'elemhide' | |
82 GENERICHIDE = 'generichide' | |
83 | |
84 # Deprecated resource types. | |
85 BACKGROUND = 'background' | |
86 XBL = 'xbl' | |
87 DTD = 'dtd' | |
88 | |
89 # Other options. | |
90 MATCH_CASE = 'match-case' | |
91 DOMAIN = 'domain' | |
92 THIRD_PARTY = 'third-party' | |
93 COLLAPSE = 'collapse' | |
94 SITEKEY = 'sitekey' | |
95 DONOTTRACK = 'donottrack' | |
53 | 96 |
54 | 97 |
55 def _line_type(name, field_names, format_string): | 98 def _line_type(name, field_names, format_string): |
56 """Define a line type. | 99 """Define a line type. |
57 | 100 |
58 :param name: The name of the line type to define. | 101 :param name: The name of the line type to define. |
59 :param field_names: A sequence of field names or one space-separated | 102 :param field_names: A sequence of field names or one space-separated |
60 string that contains all field names. | 103 string that contains all field names. |
61 :param format_string: A format specifier for converting this line type | 104 :param format_string: A format specifier for converting this line type |
62 back to string representation. | 105 back to string representation. |
(...skipping 13 matching lines...) Expand all Loading... | |
76 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
77 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 120 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
78 Include = _line_type('Include', 'target', '%include {0.target}%') | 121 Include = _line_type('Include', 'target', '%include {0.target}%') |
79 | 122 |
80 | 123 |
81 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
82 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
83 'Version'} | 126 'Version'} |
84 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
85 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
86 HFILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
mathias
2017/07/28 16:43:29
Why abbreviate? What's wrong with HIDING_FILTER
Vasily Kuznetsov
2017/07/28 17:38:10
It's shorter this way. But I don't feel very stron
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
87 BFILTER_REGEXP_REGEXP = re.compile( | 130 FILTER_OPTIONS_REGEXP = re.compile( |
mathias
2017/07/28 16:43:30
I was wondering about the *_REGEXP_REGEXP name, bu
Vasily Kuznetsov
2017/07/28 17:38:10
It's the regular expression for blocking filters w
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
88 r'^(@@)?\/.*\/(?:\$~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)?$' | |
89 ) | |
90 BFILTER_OPTIONS_REGEXP = re.compile( | |
91 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' | 131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' |
92 ) | 132 ) |
93 | 133 |
94 | 134 |
95 def _parse_comment(text): | 135 def _parse_comment(text): |
96 match = METADATA_REGEXP.match(text) | 136 match = METADATA_REGEXP.match(text) |
97 if match and match.group(1) in METADATA_KEYS: | 137 if match and match.group(1) in METADATA_KEYS: |
98 return Metadata(match.group(1), match.group(2)) | 138 return Metadata(match.group(1), match.group(2)) |
99 return Comment(text[1:].strip()) | 139 return Comment(text[1:].strip()) |
100 | 140 |
101 | 141 |
102 def _parse_header(text): | 142 def _parse_header(text): |
103 match = HEADER_REGEXP.match(text) | 143 match = HEADER_REGEXP.match(text) |
104 if not match: | 144 if not match: |
105 raise ParseError('Malformed header', text) | 145 raise ParseError('Malformed header', text) |
106 return Header(match.group(1)) | 146 return Header(match.group(1)) |
107 | 147 |
108 | 148 |
109 def _parse_instruction(text): | 149 def _parse_instruction(text): |
110 match = INCLUDE_REGEXP.match(text) | 150 match = INCLUDE_REGEXP.match(text) |
111 if not match: | 151 if not match: |
112 raise ParseError('Unrecognized instruction', text) | 152 raise ParseError('Unrecognized instruction', text) |
113 return Include(match.group(1)) | 153 return Include(match.group(1)) |
114 | 154 |
115 | 155 |
116 def _parse_option(option): | 156 def _parse_option(option): |
117 if '=' in option: | 157 if '=' in option: |
118 name, value = option.split('=', 1) | 158 return option.split('=', 1) |
119 elif option.startswith('~'): | 159 if option.startswith('~'): |
120 name, value = option[1:], False | 160 return option[1:], False |
121 else: | 161 return option, True |
122 name, value = option, True | 162 |
163 | |
164 def _parse_filter_option(option): | |
165 name, value = _parse_option(option) | |
123 | 166 |
124 # Handle special cases of multivalued options. | 167 # Handle special cases of multivalued options. |
125 if name == 'domain': | 168 if name == FILTER_OPTION.DOMAIN: |
mathias
2017/07/28 16:43:30
Wouldn't it make sense to enumerate recognized OPT
Vasily Kuznetsov
2017/07/28 17:38:10
Yeah, probably makes sense to make some kind of en
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
126 name, value = 'domains', _parse_options(value, '|') | 169 value = [_parse_option(o) for o in value.split('|')] |
mathias
2017/07/28 16:43:30
Why use a different / plural key for the parsed
Vasily Kuznetsov
2017/07/28 17:38:10
Because semantically it's a list, always, so calli
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
127 elif name == 'sitekey': | 170 elif name == FILTER_OPTION.SITEKEY: |
128 name, value = 'sitekeys', value.split('|') | 171 value = value.split('|') |
129 | 172 |
130 return name, value | 173 return name, value |
131 | 174 |
132 | 175 |
133 def _parse_options(options, separator=','): | 176 def _parse_filter_options(options): |
134 return [_parse_option(o) for o in options.split(separator)] | 177 return [_parse_filter_option(o) for o in options.split(',')] |
135 | 178 |
136 | 179 |
137 def _parse_blocking_filter(text): | 180 def _parse_blocking_filter(text): |
138 # Based on RegExpFilter.fromText in lib/filterClasses.js | 181 # Based on RegExpFilter.fromText in lib/filterClasses.js |
139 # in https://hg.adblockplus.org/adblockpluscore. | 182 # in https://hg.adblockplus.org/adblockpluscore. |
140 action = FA.BLOCK | 183 action = FILTER_ACTION.BLOCK |
141 options = [] | 184 options = [] |
142 selector = text | 185 selector = text |
143 | 186 |
144 if selector.startswith('@@'): | 187 if selector.startswith('@@'): |
145 action = FA.ALLOW | 188 action = FILTER_ACTION.ALLOW |
146 selector = selector[2:] | 189 selector = selector[2:] |
147 | 190 |
148 if '$' in selector: | 191 if '$' in selector: |
149 opt_match = BFILTER_OPTIONS_REGEXP.search(selector) | 192 opt_match = FILTER_OPTIONS_REGEXP.search(selector) |
150 if opt_match: | 193 if opt_match: |
151 selector = selector[:opt_match.start(0)] | 194 selector = selector[:opt_match.start(0)] |
152 options = _parse_options(opt_match.group(1)) | 195 options = _parse_filter_options(opt_match.group(1)) |
153 | 196 |
154 if (len(selector) > 1 and | 197 if (len(selector) > 1 and |
155 selector.startswith('/') and selector.endswith('/')): | 198 selector.startswith('/') and selector.endswith('/')): |
156 selector = {'type': ST.URL_REGEXP, 'value': selector[1:-1]} | 199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]} |
157 else: | 200 else: |
158 selector = {'type': ST.URL_PATTERN, 'value': selector} | 201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector} |
159 | 202 |
160 return Filter(text, selector, action, options) | 203 return Filter(text, selector, action, options) |
161 | 204 |
162 | 205 |
163 def _parse_hiding_filter(text, domains, type_flag, selector_value): | 206 def _parse_hiding_filter(text, domain, type_flag, selector_value): |
164 selector = {'type': ST.CSS, 'value': selector_value} | 207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value} |
165 action = FA.HIDE | 208 action = FILTER_ACTION.HIDE |
166 options = [] | 209 options = [] |
167 | 210 |
168 if type_flag == '@': | 211 if type_flag == '@': |
169 action = FA.SHOW | 212 action = FILTER_ACTION.SHOW |
170 elif type_flag == '?': | 213 elif type_flag == '?': |
171 selector['type'] = ST.XCSS | 214 selector['type'] = SELECTOR_TYPE.XCSS |
172 | 215 |
173 if domains: | 216 if domain: |
174 options.append(('domains', _parse_options(domains))) | 217 domains = [_parse_option(d) for d in domain.split(',')] |
218 options.append((FILTER_OPTION.DOMAIN, domains)) | |
175 | 219 |
176 return Filter(text, selector, action, options) | 220 return Filter(text, selector, action, options) |
177 | 221 |
178 | 222 |
179 def parse_filter(text): | 223 def parse_filter(text): |
180 """Parse one filter. | 224 """Parse one filter. |
181 | 225 |
182 :param text: Text representation of a filter. | 226 :param text: Text representation of a filter. |
183 :returns: filter object. | 227 :returns: Filter object. |
mathias
2017/07/28 16:43:29
I think this should be upper-case "Filter".
Vasily Kuznetsov
2017/07/28 17:38:10
Yes.
| |
184 """ | 228 """ |
185 if '#' in text: | 229 if '#' in text: |
186 match = HFILTER_REGEXP.search(text) | 230 match = HIDING_FILTER_REGEXP.search(text) |
187 if match: | 231 if match: |
188 return _parse_hiding_filter(text, *match.groups()) | 232 return _parse_hiding_filter(text, *match.groups()) |
189 return _parse_blocking_filter(text) | 233 return _parse_blocking_filter(text) |
190 | 234 |
191 | 235 |
192 def parse_line(line_text): | 236 def parse_line(line_text): |
193 """Parse one line of a filter list. | 237 """Parse one line of a filter list. |
194 | 238 |
195 :param line_text: Line of a filter list (must be a unicode string). | 239 :param line_text: Line of a filter list (must be a unicode string). |
196 :returns: Parsed line object (see `_line_type`). | 240 :returns: Parsed line object (see `_line_type`). |
(...skipping 18 matching lines...) Expand all Loading... | |
215 | 259 |
216 def parse_filterlist(lines): | 260 def parse_filterlist(lines): |
217 """Parse filter list from an iterable. | 261 """Parse filter list from an iterable. |
218 | 262 |
219 :param lines: List of strings or file or other iterable. | 263 :param lines: List of strings or file or other iterable. |
220 :returns: Iterator over parsed lines. | 264 :returns: Iterator over parsed lines. |
221 :raises ParseError: Can be thrown during iteration for invalid lines. | 265 :raises ParseError: Can be thrown during iteration for invalid lines. |
222 """ | 266 """ |
223 for line in lines: | 267 for line in lines: |
224 yield parse_line(line) | 268 yield parse_line(line) |
LEFT | RIGHT |