OLD | NEW |
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
12 # | 12 # |
13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
15 | 15 |
16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
17 | 17 |
18 import re | 18 import re |
19 from collections import namedtuple | 19 from collections import namedtuple |
20 | 20 |
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] | 21 __all__ = [ |
| 22 'FILTER_ACTION', |
| 23 'FILTER_OPTION', |
| 24 'ParseError', |
| 25 'SELECTOR_TYPE', |
| 26 'parse_filterlist', |
| 27 'parse_line', |
| 28 ] |
22 | 29 |
23 | 30 |
class ParseError(Exception):
    """Error raised by the parser for input it cannot handle.

    The exception message has the form ``<error> in "<text>"``; both parts
    are also available separately as attributes.

    :param error: Description of the error.
    :param text: The text which was being parsed when the error occurred.
    """

    def __init__(self, error, text):
        message = '{} in "{}"'.format(error, text)
        super(ParseError, self).__init__(message)
        self.error = error
        self.text = text
35 | 42 |
36 | 43 |
| 44 # Constants related to filters (see https://adblockplus.org/filters). |
# The upper-case class name is deliberate: the class is only a namespace for
# constants. Use the per-line `noqa` form so that linting is suppressed for
# this line only and not for the whole file.
class SELECTOR_TYPE:  # noqa: N801
    """Selector types (values of the `type` key of a filter selector)."""

    URL_PATTERN = 'url-pattern'  # Normal URL patterns.
    URL_REGEXP = 'url-regexp'    # Regular expressions for URLs.
    CSS = 'css'                  # CSS selectors for hiding filters.
    XCSS = 'extended-css'        # Extended CSS selectors (to emulate CSS4).
    ABP_SIMPLE = 'abp-simple'    # Simplified element hiding syntax.
| 52 |
| 53 |
# The upper-case class name is deliberate: the class is only a namespace for
# constants. Use the per-line `noqa` form so that linting is suppressed for
# this line only and not for the whole file.
class FILTER_ACTION:  # noqa: N801
    """Filter actions (what a filter does when it matches)."""

    BLOCK = 'block'  # Block the request.
    ALLOW = 'allow'  # Allow the request (whitelist).
    HIDE = 'hide'    # Hide selected element(s).
    SHOW = 'show'    # Show selected element(s) (whitelist).
| 60 |
| 61 |
# The upper-case class name is deliberate: the class is only a namespace for
# constants. Use the per-line `noqa` form so that linting is suppressed for
# this line only and not for the whole file.
class FILTER_OPTION:  # noqa: N801
    """Filter options (names that can follow `$` in a filter)."""

    # Resource types.
    OTHER = 'other'
    SCRIPT = 'script'
    IMAGE = 'image'
    STYLESHEET = 'stylesheet'
    OBJECT = 'object'
    SUBDOCUMENT = 'subdocument'
    DOCUMENT = 'document'
    WEBSOCKET = 'websocket'
    WEBRTC = 'webrtc'
    PING = 'ping'
    XMLHTTPREQUEST = 'xmlhttprequest'
    OBJECT_SUBREQUEST = 'object-subrequest'
    MEDIA = 'media'
    FONT = 'font'
    POPUP = 'popup'
    GENERICBLOCK = 'genericblock'
    ELEMHIDE = 'elemhide'
    GENERICHIDE = 'generichide'

    # Deprecated resource types.
    BACKGROUND = 'background'
    XBL = 'xbl'
    DTD = 'dtd'

    # Other options.
    MATCH_CASE = 'match-case'
    DOMAIN = 'domain'
    THIRD_PARTY = 'third-party'
    COLLAPSE = 'collapse'
    SITEKEY = 'sitekey'
    DONOTTRACK = 'donottrack'
| 96 |
| 97 |
def _line_type(name, field_names, format_string):
    """Create a class representing one kind of filter list line.

    :param name: The name of the line type to define.
    :param field_names: A sequence of field names or one space-separated
                        string that contains all field names.
    :param format_string: A format specifier for converting this line type
                          back to string representation. It is applied via
                          `str.format` with the line object as the only
                          positional argument.
    :returns: Class created with `namedtuple` that has `.type` set to
              lowercased `name` and supports conversion back to string with
              `.to_string()` method.
    """
    def to_string(self):
        return format_string.format(self)

    line_class = namedtuple(name, field_names)
    line_class.to_string = to_string
    line_class.type = name.lower()
    return line_class
53 | 114 |
54 | 115 |
# Line types returned by the parser. Each one is a namedtuple class with a
# lowercase `.type` name and a `.to_string()` method (see `_line_type`).
# `Filter` keeps the original filter text in `.text` (which is also what
# `.to_string()` returns) next to the parsed selector, action and options.
Header = _line_type('Header', 'version', '[{.version}]')
EmptyLine = _line_type('EmptyLine', '', '')
Comment = _line_type('Comment', 'text', '! {.text}')
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
Filter = _line_type('Filter', 'text selector action options', '{.text}')
Include = _line_type('Include', 'target', '%include {0.target}%')
61 | 122 |
62 | 123 |
# Comment of the form `! key: value`; groups are (key, value).
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
# Only comments whose key is listed here are parsed as metadata.
METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
                 'Version'}
# Include instruction `%include <target>%`; the group is the target.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# Filter list header such as `[Adblock Plus 2.0]` (case-insensitive); the
# group is the text between the brackets. Note that the pattern is not
# anchored at the end.
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
# Element hiding filter; groups are (domains, type flag `@` or `?` if
# present, selector).
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
# Trailing `$option,option=value,~option` part of a blocking filter; the
# group is the comma-separated list of options without the leading `$`.
FILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
)
68 | 133 |
69 | 134 |
def _parse_comment(text):
    """Parse a line that starts with `!`.

    :param text: Stripped line text, including the leading `!`.
    :returns: `Metadata` if the line has the form `! key: value` and the key
              is one of `METADATA_KEYS`, otherwise `Comment`.
    """
    found = METADATA_REGEXP.match(text)
    if found is not None:
        key, value = found.group(1, 2)
        if key in METADATA_KEYS:
            return Metadata(key, value)
    # Everything else is a plain comment: drop the `!` and outer whitespace.
    return Comment(text[1:].strip())
75 | 140 |
76 | 141 |
def _parse_header(text):
    """Parse a filter list header such as `[Adblock Plus 2.0]`.

    :param text: Stripped line text, including the square brackets.
    :returns: `Header` with the text between the brackets as `version`.
    :raises ParseError: If the text is not a well-formed header.
    """
    match = HEADER_REGEXP.match(text)
    # HEADER_REGEXP is not anchored at the end, so a prefix match alone would
    # accept lines like `[Adblock] junk]`. Require the whole text to match.
    if not match or match.end() != len(text):
        raise ParseError('Malformed header', text)
    return Header(match.group(1))
82 | 147 |
83 | 148 |
def _parse_instruction(text):
    """Parse an instruction line (`%...%`).

    The only instruction recognized is `%include <target>%`.

    :param text: Stripped line text, including the enclosing `%` signs.
    :returns: `Include` with the include target.
    :raises ParseError: If the instruction is not recognized.
    """
    found = INCLUDE_REGEXP.match(text)
    if found is None:
        raise ParseError('Unrecognized instruction', text)
    target = found.group(1)
    return Include(target)
89 | 154 |
90 | 155 |
def _parse_option(option):
    """Split one option into a `(name, value)` pair.

    :param option: Option text: `name=value`, `name` or `~name`.
    :returns: A tuple `(name, value)`. For `name=value` the value is the text
              after the first `=`; for `name` it is `True` and for `~name`
              it is `False`.
    """
    name, separator, value = option.partition('=')
    if separator:
        # Always return a tuple so that all three forms have the same type.
        return name, value
    if option.startswith('~'):
        return option[1:], False
    return option, True
| 162 |
| 163 |
def _parse_filter_option(option):
    """Parse one option of a blocking filter.

    :param option: Option text: `name=value`, `name` or `~name`.
    :returns: A tuple `(name, value)`. For `domain` the value is a list of
              `(domain, bool)` pairs (`False` for domains prefixed with
              `~`), for `sitekey` it is a list of keys; other options are
              returned as produced by `_parse_option`.
    :raises ParseError: If `domain` or `sitekey` is given without a value.
    """
    name, value = _parse_option(option)

    # Handle special cases of multivalued options.
    if name in (FILTER_OPTION.DOMAIN, FILTER_OPTION.SITEKEY):
        # `domain`, `~domain` etc. (no `=value`) produce a boolean value that
        # cannot be split; report it as a parse error instead of crashing.
        if isinstance(value, bool):
            raise ParseError('Option requires a value', option)
        if name == FILTER_OPTION.DOMAIN:
            value = [_parse_option(o) for o in value.split('|')]
        else:
            value = value.split('|')

    return name, value
| 174 |
| 175 |
def _parse_filter_options(options):
    """Parse a comma-separated list of blocking filter options.

    :param options: Options text without the leading `$`.
    :returns: List of `(name, value)` pairs, one per option, in order.
    """
    parsed = []
    for raw_option in options.split(','):
        parsed.append(_parse_filter_option(raw_option))
    return parsed
| 178 |
| 179 |
def _parse_blocking_filter(text):
    """Parse a blocking or exception (`@@`) filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js
    in https://hg.adblockplus.org/adblockpluscore.

    :param text: Text representation of the filter.
    :returns: `Filter` whose selector is a URL pattern or a URL regular
              expression and whose action is block or allow.
    """
    # A leading `@@` turns the filter into an exception rule.
    if text.startswith('@@'):
        action, pattern = FILTER_ACTION.ALLOW, text[2:]
    else:
        action, pattern = FILTER_ACTION.BLOCK, text

    # Split off the trailing `$options` part, if there is a valid one.
    options = []
    found = FILTER_OPTIONS_REGEXP.search(pattern) if '$' in pattern else None
    if found:
        options = _parse_filter_options(found.group(1))
        pattern = pattern[:found.start(0)]

    # `/.../` (at least two characters long) denotes a regular expression.
    is_regexp = (len(pattern) > 1 and
                 pattern.startswith('/') and pattern.endswith('/'))
    if is_regexp:
        selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': pattern[1:-1]}
    else:
        selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': pattern}

    return Filter(text, selector, action, options)
| 204 |
| 205 |
def _parse_hiding_filter(text, domain, type_flag, selector_value):
    """Build a `Filter` for an element hiding rule.

    :param text: Full text of the filter.
    :param domain: Comma-separated domains before the `#`, may be empty.
    :param type_flag: `'@'` for an exception rule, `'?'` for an extended CSS
                      selector or `None` for a regular hiding rule.
    :param selector_value: The selector that follows the separator.
    :returns: `Filter` with a CSS or extended CSS selector and hide or show
              action; a `domain` option is present if domains were given.
    """
    if type_flag == '?':
        selector_type = SELECTOR_TYPE.XCSS
    else:
        selector_type = SELECTOR_TYPE.CSS
    selector = {'type': selector_type, 'value': selector_value}

    if type_flag == '@':
        action = FILTER_ACTION.SHOW
    else:
        action = FILTER_ACTION.HIDE

    options = []
    if domain:
        parsed_domains = []
        for entry in domain.split(','):
            parsed_domains.append(_parse_option(entry))
        options.append((FILTER_OPTION.DOMAIN, parsed_domains))

    return Filter(text, selector, action, options)
| 221 |
| 222 |
def parse_filter(text):
    """Parse one filter.

    Text that matches the element hiding syntax is parsed as a hiding
    filter, everything else as a blocking filter.

    :param text: Text representation of a filter.
    :returns: Filter object.
    """
    # Only try the hiding filter pattern if the text can possibly match it.
    found = HIDING_FILTER_REGEXP.search(text) if '#' in text else None
    if not found:
        return _parse_blocking_filter(text)
    domain, type_flag, selector_value = found.groups()
    return _parse_hiding_filter(text, domain, type_flag, selector_value)
| 234 |
| 235 |
def parse_line(line_text):
    """Parse one line of a filter list.

    :param line_text: Line of a filter list (must be a unicode string).
    :returns: Parsed line object (see `_line_type`).
    :raises ParseError: If the line can't be successfully parsed.
    """
    content = line_text.strip()

    if not content:
        line = EmptyLine()
    elif content.startswith('!'):
        line = _parse_comment(content)
    elif content.startswith('%') and content.endswith('%'):
        line = _parse_instruction(content)
    elif content.startswith('[') and content.endswith(']'):
        line = _parse_header(content)
    else:
        line = parse_filter(content)

    # Sanity check: converting the parsed line back to text must reproduce
    # the input up to whitespace. This is an explicit check rather than an
    # `assert` so that it also runs under `python -O` and reports failures as
    # the documented ParseError. All whitespace is ignored (not only spaces)
    # so that e.g. a tab after `!` or `%include` does not fail the check.
    parsed_text = ''.join(line.to_string().split())
    source_text = ''.join(content.split())
    if parsed_text != source_text:
        raise ParseError('Line could not be parsed unambiguously', content)
    return line
113 | 258 |
114 | 259 |
def parse_filterlist(lines):
    """Lazily parse a filter list, one line at a time.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines.
    :raises ParseError: Can be thrown during iteration for invalid lines.
    """
    for line_text in lines:
        parsed_line = parse_line(line_text)
        yield parsed_line
OLD | NEW |