Left: | ||
Right: |
OLD | NEW |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-2017 eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
12 # | 12 # |
13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
15 | 15 |
16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
17 | 17 |
18 import re | 18 import re |
19 from collections import namedtuple | 19 from collections import namedtuple |
20 | 20 |
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] | 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError', 'ST', 'FA'] |
22 | 22 |
23 | 23 |
class ParseError(Exception):
    """Error raised by the parser when it is given invalid input.

    The exception message has the form ``<error> in "<text>"``; both parts
    are also kept as attributes.

    :param error: Description of the error.
    :param text: The text which was being parsed when an error occurred.
    """

    def __init__(self, error, text):
        message = '{} in "{}"'.format(error, text)
        super(ParseError, self).__init__(message)
        self.error = error
        self.text = text
35 | 35 |
36 | 36 |
37 # Constants related to filters (see https://adblockplus.org/filters). | |
class ST:
    """Selector types.

    These are the values stored under the ``type`` key of a filter's
    selector dictionary.
    """

    # Normal URL patterns.
    URL_PATTERN = 'url-pattern'
    # Regular expressions for URLs.
    URL_REGEXP = 'url-regexp'
    # CSS selectors for hiding filters.
    CSS = 'css'
    # Extended CSS selectors (to emulate CSS4).
    XCSS = 'extended-css'
    # Simplified element hiding syntax.
    ABP_SIMPLE = 'abp-simple'
45 | |
46 | |
class FA:
    """Filter actions.

    These are the values stored in the ``action`` field of a parsed filter.
    """

    # Block the request.
    BLOCK = 'block'
    # Allow the request (whitelist).
    ALLOW = 'allow'
    # Hide selected element(s).
    HIDE = 'hide'
    # Show selected element(s) (whitelist).
    SHOW = 'show'
53 | |
54 | |
37 def _line_type(name, field_names, format_string): | 55 def _line_type(name, field_names, format_string): |
38 """Define a line type. | 56 """Define a line type. |
39 | 57 |
40 :param name: The name of the line type to define. | 58 :param name: The name of the line type to define. |
41 :param field_names: A sequence of field names or one space-separated | 59 :param field_names: A sequence of field names or one space-separated |
42 string that contains all field names. | 60 string that contains all field names. |
43 :param format_string: A format specifier for converting this line type | 61 :param format_string: A format specifier for converting this line type |
44 back to string representation. | 62 back to string representation. |
45 :returns: Class created with `namedtuple` that has `.type` set to | 63 :returns: Class created with `namedtuple` that has `.type` set to |
46 lowercased `name` and supports conversion back to string with | 64 lowercased `name` and supports conversion back to string with |
47 `.to_string()` method. | 65 `.to_string()` method. |
48 """ | 66 """ |
49 lt = namedtuple(name, field_names) | 67 lt = namedtuple(name, field_names) |
50 lt.type = name.lower() | 68 lt.type = name.lower() |
51 lt.to_string = lambda self: format_string.format(self) | 69 lt.to_string = lambda self: format_string.format(self) |
52 return lt | 70 return lt |
53 | 71 |
54 | 72 |
# Line types produced by the parser. The last argument of each definition is
# the format used by `.to_string()` to turn the line back into text.

# `[version]` header line.
Header = _line_type('Header', 'version', '[{.version}]')
# Line with no content.
EmptyLine = _line_type('EmptyLine', '', '')
# `! text` comment line.
Comment = _line_type('Comment', 'text', '! {.text}')
# `! key: value` comment line carrying a metadata key.
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
# Filter line: keeps the original text next to its parsed parts.
Filter = _line_type('Filter', 'text selector action options', '{.text}')
# `%include target%` instruction line.
Include = _line_type('Include', 'target', '%include {0.target}%')
61 | 79 |
62 | 80 |
# `! Key: value` comments; group 1 is the key, group 2 the value.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
# Keys for which a matching comment is treated as metadata.
METADATA_KEYS = {
    'Homepage',
    'Title',
    'Expires',
    'Checksum',
    'Redirect',
    'Version',
}
# `%include target%` instructions; group 1 is the target.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# `[Adblock ...]` headers (case-insensitive); group 1 is the bracket content.
HEADER_REGEXP = re.compile(
    r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]',
    flags=re.IGNORECASE,
)
# Hiding filters; groups are (domains, type flag `@` or `?`, selector).
HFILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
# Blocking filters whose pattern is a `/.../` regular expression, with an
# optional `@@` prefix (group 1) and an optional trailing `$options` part.
BFILTER_REGEXP_REGEXP = re.compile(
    r'^(@@)?\/.*\/(?:\$~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)?$'
)
# Trailing `$option,option=value,...` part of a blocking filter; group 1 is
# the comma-separated option list without the leading `$`.
BFILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
)
68 | 93 |
69 | 94 |
def _parse_comment(text):
    """Parse a line that starts with ``!``.

    :param text: Stripped line text, including the leading ``!``.
    :returns: `Metadata` when the comment has the ``key: value`` shape and
              the key is one of `METADATA_KEYS`, otherwise `Comment` holding
              the stripped text after the ``!``.
    """
    found = METADATA_REGEXP.match(text)
    if found:
        key, value = found.groups()
        if key in METADATA_KEYS:
            return Metadata(key, value)
    return Comment(text[1:].strip())
75 | 100 |
76 | 101 |
def _parse_header(text):
    """Parse a ``[Adblock ...]`` header line.

    :param text: Stripped line text, including the surrounding brackets.
    :returns: `Header` whose version is the text captured between the
              brackets.
    :raises ParseError: If the text is not a well-formed header, including
                        when anything follows the closing bracket of the
                        header.
    """
    match = HEADER_REGEXP.match(text)
    # `match()` only anchors at the start, so also require that the header
    # spans the whole text; otherwise input such as "[Adblock]junk]" would
    # be accepted and the trailing part silently dropped.
    if not match or match.end() != len(text):
        raise ParseError('Malformed header', text)
    return Header(match.group(1))
82 | 107 |
83 | 108 |
def _parse_instruction(text):
    """Parse a ``%...%`` instruction line.

    Only ``%include target%`` is recognized.

    :param text: Stripped line text, including the surrounding ``%`` signs.
    :returns: `Include` with the target captured from the instruction.
    :raises ParseError: If the text is not an include instruction.
    """
    found = INCLUDE_REGEXP.match(text)
    if found is None:
        raise ParseError('Unrecognized instruction', text)
    target = found.group(1)
    return Include(target)
89 | 114 |
90 | 115 |
116 def _parse_option(option): | |
117 if '=' in option: | |
118 name, value = option.split('=', 1) | |
119 elif option.startswith('~'): | |
120 name, value = option[1:], False | |
121 else: | |
122 name, value = option, True | |
123 | |
124 # Handle special cases of multivalued options. | |
125 if name == 'domain': | |
mathias
2017/07/28 16:43:30
Wouldn't it make sense to enumerate recognized OPT
Vasily Kuznetsov
2017/07/28 17:38:10
Yeah, probably makes sense to make some kind of en
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
126 name, value = 'domains', _parse_options(value, '|') | |
mathias
2017/07/28 16:43:30
Why using a different / plural key for the parsed
Vasily Kuznetsov
2017/07/28 17:38:10
Because semantically it's a list, always, so calli
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
| |
127 elif name == 'sitekey': | |
128 name, value = 'sitekeys', value.split('|') | |
129 | |
130 return name, value | |
131 | |
132 | |
def _parse_options(options, separator=','):
    """Split an option string and parse every item with `_parse_option`.

    :param options: Text containing options joined by `separator`.
    :param separator: String that separates the options.
    :returns: List of ``(name, value)`` tuples, in the original order.
    """
    return list(map(_parse_option, options.split(separator)))
135 | |
136 | |
def _parse_blocking_filter(text):
    """Parse a blocking or whitelisting (``@@``) filter.

    :param text: Text representation of the filter.
    :returns: `Filter` whose selector is a URL regexp (pattern wrapped in
              slashes) or a URL pattern, whose action is `FA.ALLOW` for
              ``@@`` filters and `FA.BLOCK` otherwise, and whose options are
              parsed from the trailing ``$...`` part, if there is one.
    """
    # Based on RegExpFilter.fromText in lib/filterClasses.js
    # in https://hg.adblockplus.org/adblockpluscore.
    if text.startswith('@@'):
        action, pattern = FA.ALLOW, text[2:]
    else:
        action, pattern = FA.BLOCK, text

    # Only run the options regexp when there is a `$` that could start them.
    found = BFILTER_OPTIONS_REGEXP.search(pattern) if '$' in pattern else None
    if found:
        options = _parse_options(found.group(1))
        pattern = pattern[:found.start(0)]
    else:
        options = []

    is_regexp = (len(pattern) > 1 and
                 pattern.startswith('/') and pattern.endswith('/'))
    if is_regexp:
        selector = {'type': ST.URL_REGEXP, 'value': pattern[1:-1]}
    else:
        selector = {'type': ST.URL_PATTERN, 'value': pattern}

    return Filter(text, selector, action, options)
161 | |
162 | |
def _parse_hiding_filter(text, domains, type_flag, selector_value):
    """Build a `Filter` for an element hiding filter.

    :param text: Full text of the filter.
    :param domains: Comma-separated domain list in front of the first ``#``
                    (may be empty).
    :param type_flag: Character between the two ``#`` signs: ``@`` selects
                      the `FA.SHOW` action, ``?`` selects the extended CSS
                      selector type; any other value gives a plain CSS
                      hiding filter.
    :param selector_value: Selector text after the second ``#``.
    :returns: `Filter` whose options hold a ``domains`` entry when `domains`
              is not empty.
    """
    selector_type = ST.XCSS if type_flag == '?' else ST.CSS
    action = FA.SHOW if type_flag == '@' else FA.HIDE
    options = [('domains', _parse_options(domains))] if domains else []
    selector = {'type': selector_type, 'value': selector_value}
    return Filter(text, selector, action, options)
177 | |
178 | |
def parse_filter(text):
    """Parse one filter.

    Text that matches the hiding filter syntax is parsed as a hiding filter;
    everything else is parsed as a blocking filter.

    :param text: Text representation of a filter.
    :returns: `Filter` object.
    """
    # The regexp can only match when the text contains a `#`.
    match = HFILTER_REGEXP.search(text) if '#' in text else None
    if match is None:
        return _parse_blocking_filter(text)
    domains, type_flag, selector_value = match.groups()
    return _parse_hiding_filter(text, domains, type_flag, selector_value)
190 | |
191 | |
def parse_line(line_text):
    """Parse one line of a filter list.

    The line is stripped and dispatched on its shape: empty lines, ``!``
    comments (including metadata), ``%...%`` instructions, ``[...]`` headers
    and, for everything else, filters.

    :param line_text: Line of a filter list (must be a unicode string).
    :returns: Parsed line object (see `_line_type`).
    :raises ParseError: If the line can't be successfully parsed, or if the
                        parsed line does not convert back to the original
                        text (ignoring whitespace).
    """
    content = line_text.strip()

    if content == '':
        line = EmptyLine()
    elif content.startswith('!'):
        line = _parse_comment(content)
    elif content.startswith('%') and content.endswith('%'):
        line = _parse_instruction(content)
    elif content.startswith('[') and content.endswith(']'):
        line = _parse_header(content)
    else:
        line = parse_filter(content)

    # Round-trip check. It is an explicit raise rather than an `assert` so
    # that it still runs under `python -O` and reports the documented
    # exception type. All whitespace is ignored, not only spaces, because
    # the parsers above also accept tabs (e.g. "!\tcomment").
    if ''.join(line.to_string().split()) != ''.join(content.split()):
        raise ParseError('Parsed line does not match its source text',
                         content)
    return line
113 | 214 |
114 | 215 |
def parse_filterlist(lines):
    """Parse filter list from an iterable.

    Lines are parsed lazily, one at a time, as the result is iterated.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines.
    :raises ParseError: Can be thrown during iteration for invalid lines.
    """
    for raw_line in lines:
        parsed = parse_line(raw_line)
        yield parsed
OLD | NEW |