| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
| 2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-2017 eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
| 12 # | 12 # |
| 13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
| 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| 15 | 15 |
| 16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
| 17 | 17 |
| 18 import re | 18 import re |
| 19 from collections import namedtuple | 19 from collections import namedtuple |
| 20 | 20 |
| 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] | 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError', |
| 22 'SELECTOR_TYPE', 'FILTER_ACTION', 'FILTER_OPTION'] | |
| 22 | 23 |
| 23 | 24 |
| 24 class ParseError(Exception): | 25 class ParseError(Exception): |
| 25 """Exception thrown by the parser when it encounters invalid input. | 26 """Exception thrown by the parser when it encounters invalid input. |
| 26 | 27 |
| 27 :param error: Description of the error. | 28 :param error: Description of the error. |
| 28 :param text: The text which was being parsed when an error occurred. | 29 :param text: The text which was being parsed when an error occurred. |
| 29 """ | 30 """ |
| 30 | 31 |
| 31 def __init__(self, error, text): | 32 def __init__(self, error, text): |
| 32 Exception.__init__(self, '{} in "{}"'.format(error, text)) | 33 Exception.__init__(self, '{} in "{}"'.format(error, text)) |
| 33 self.text = text | 34 self.text = text |
| 34 self.error = error | 35 self.error = error |
| 35 | 36 |
| 36 | 37 |
| 38 # Constants related to filters (see https://adblockplus.org/filters). | |
| 39 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant). | |
| 40 """Selector types""" | |
| 41 URL_PATTERN = 'url-pattern' # Normal URL patterns. | |
| 42 URL_REGEXP = 'url-regexp' # Regular expressions for URLs. | |
| 43 CSS = 'css' # CSS selectors for hiding filters. | |
| 44 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). | |
| 45 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. | |
| 46 | |
| 47 | |
| 48 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant). | |
| 49 """Filter actions""" | |
| 50 BLOCK = 'block' # Block the request. | |
| 51 ALLOW = 'allow' # Allow the request (whitelist). | |
| 52 HIDE = 'hide' # Hide selected element(s). | |
| 53 SHOW = 'show' # Show selected element(s) (whitelist). | |
| 54 | |
| 55 | |
| 56 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant). | |
| 57 """Filter options""" | |
| 58 # Resource types. | |
| 59 OTHER = 'other' | |
| 60 SCRIPT = 'script' | |
| 61 IMAGE = 'image' | |
| 62 STYLESHEET = 'stylesheet' | |
| 63 OBJECT = 'object' | |
| 64 SUBDOCUMENT = 'subdocument' | |
| 65 DOCUMENT = 'document' | |
| 66 WEBSOCKET = 'websocket' | |
| 67 WEBRTC = 'webrtc' | |
| 68 PING = 'ping' | |
| 69 XMLHTTPREQUEST = 'xmlhttprequest' | |
| 70 OBJECT_SUBREQUEST = 'object-subrequest' | |
| 71 MEDIA = 'media' | |
| 72 FONT = 'font' | |
| 73 POPUP = 'popup' | |
| 74 GENERICBLOCK = 'genericblock' | |
| 75 ELEMHIDE = 'elemhide' | |
| 76 GENERICHIDE = 'generichide' | |
| 77 | |
| 78 # Deprecated resource types. | |
| 79 BACKGROUND = 'background' | |
| 80 XBL = 'xbl' | |
| 81 DTD = 'dtd' | |
| 82 | |
| 83 # Other options. | |
| 84 MATCH_CASE = 'match-case' | |
| 85 DOMAIN = 'domain' | |
| 86 THIRD_PARTY = 'third-party' | |
| 87 COLLAPSE = 'collapse' | |
| 88 SITEKEY = 'sitekey' | |
| 89 DONOTTRACK = 'donottrack' | |
| 90 | |
| 91 | |
| 92 ALL_OPTIONS = {opt for name, opt in vars(FILTER_OPTION).items() | |
| 93 if not name.startswith('__')} | |
| 94 | |
| 95 | |
| 37 def _line_type(name, field_names, format_string): | 96 def _line_type(name, field_names, format_string): |
| 38 """Define a line type. | 97 """Define a line type. |
| 39 | 98 |
| 40 :param name: The name of the line type to define. | 99 :param name: The name of the line type to define. |
| 41 :param field_names: A sequence of field names or one space-separated | 100 :param field_names: A sequence of field names or one space-separated |
| 42 string that contains all field names. | 101 string that contains all field names. |
| 43 :param format_string: A format specifier for converting this line type | 102 :param format_string: A format specifier for converting this line type |
| 44 back to string representation. | 103 back to string representation. |
| 45 :returns: Class created with `namedtuple` that has `.type` set to | 104 :returns: Class created with `namedtuple` that has `.type` set to |
| 46 lowercased `name` and supports conversion back to string with | 105 lowercased `name` and supports conversion back to string with |
| 47 `.to_string()` method. | 106 `.to_string()` method. |
| 48 """ | 107 """ |
| 49 lt = namedtuple(name, field_names) | 108 lt = namedtuple(name, field_names) |
| 50 lt.type = name.lower() | 109 lt.type = name.lower() |
| 51 lt.to_string = lambda self: format_string.format(self) | 110 lt.to_string = lambda self: format_string.format(self) |
| 52 return lt | 111 return lt |
| 53 | 112 |
| 54 | 113 |
| 55 Header = _line_type('Header', 'version', '[{.version}]') | 114 Header = _line_type('Header', 'version', '[{.version}]') |
| 56 EmptyLine = _line_type('EmptyLine', '', '') | 115 EmptyLine = _line_type('EmptyLine', '', '') |
| 57 Comment = _line_type('Comment', 'text', '! {.text}') | 116 Comment = _line_type('Comment', 'text', '! {.text}') |
| 58 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 117 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| 59 Filter = _line_type('Filter', 'expression', '{.expression}') | 118 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
| 60 Include = _line_type('Include', 'target', '%include {0.target}%') | 119 Include = _line_type('Include', 'target', '%include {0.target}%') |
| 61 | 120 |
| 62 | 121 |
| 63 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 122 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
| 64 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 123 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
| 65 'Version'} | 124 'Version'} |
| 66 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 125 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| 67 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 126 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
| 127 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | |
| 128 FILTER_OPTIONS_REGEXP = re.compile( | |
| 129 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' | |
| 130 ) | |
| 68 | 131 |
| 69 | 132 |
| 70 def _parse_comment(text): | 133 def _parse_comment(text): |
| 71 match = METADATA_REGEXP.match(text) | 134 match = METADATA_REGEXP.match(text) |
| 72 if match and match.group(1) in METADATA_KEYS: | 135 if match and match.group(1) in METADATA_KEYS: |
| 73 return Metadata(match.group(1), match.group(2)) | 136 return Metadata(match.group(1), match.group(2)) |
| 74 return Comment(text[1:].strip()) | 137 return Comment(text[1:].strip()) |
| 75 | 138 |
| 76 | 139 |
| 77 def _parse_header(text): | 140 def _parse_header(text): |
| 78 match = HEADER_REGEXP.match(text) | 141 match = HEADER_REGEXP.match(text) |
| 79 if not match: | 142 if not match: |
| 80 raise ParseError('Malformed header', text) | 143 raise ParseError('Malformed header', text) |
| 81 return Header(match.group(1)) | 144 return Header(match.group(1)) |
| 82 | 145 |
| 83 | 146 |
| 84 def _parse_instruction(text): | 147 def _parse_instruction(text): |
| 85 match = INCLUDE_REGEXP.match(text) | 148 match = INCLUDE_REGEXP.match(text) |
| 86 if not match: | 149 if not match: |
| 87 raise ParseError('Unrecognized instruction', text) | 150 raise ParseError('Unrecognized instruction', text) |
| 88 return Include(match.group(1)) | 151 return Include(match.group(1)) |
| 89 | 152 |
| 90 | 153 |
| 154 def _parse_option(option): | |
| 155 if '=' in option: | |
| 156 return option.split('=', 1) | |
| 157 if option.startswith('~'): | |
| 158 return option[1:], False | |
| 159 return option, True | |
| 160 | |
| 161 | |
| 162 def _parse_filter_option(option): | |
| 163 name, value = _parse_option(option) | |
| 164 | |
| 165 if name not in ALL_OPTIONS: | |
| **Review comment — mathias (2017/08/01 06:31:35):** I don't think this part of the code should validat… [truncated]<br>**Reply — Vasily Kuznetsov (2017/08/02 16:21:17):** Following our conversation, I agree. Done. | |
| 166 raise ParseError('Unrecognized option', name) | |
| 167 | |
| 168 # Handle special cases of multivalued options. | |
| 169 if name == FILTER_OPTION.DOMAIN: | |
| 170 value = [_parse_option(o) for o in value.split('|')] | |
| 171 elif name == FILTER_OPTION.SITEKEY: | |
| 172 value = value.split('|') | |
| 173 | |
| 174 return name, value | |
| 175 | |
| 176 | |
| 177 def _parse_filter_options(options, separator=','): | |
| **Review comment — mathias (2017/08/01 06:31:35):** Why is the separator a parameter? The only place w… [truncated]<br>**Reply — Vasily Kuznetsov (2017/08/02 16:21:17):** This is left-over from an earlier version that use… [truncated] | |
| 178 return [_parse_filter_option(o) for o in options.split(separator)] | |
| 179 | |
| 180 | |
| 181 def _parse_blocking_filter(text): | |
| 182 # Based on RegExpFilter.fromText in lib/filterClasses.js | |
| 183 # in https://hg.adblockplus.org/adblockpluscore. | |
| 184 action = FILTER_ACTION.BLOCK | |
| 185 options = [] | |
| 186 selector = text | |
| 187 | |
| 188 if selector.startswith('@@'): | |
| 189 action = FILTER_ACTION.ALLOW | |
| 190 selector = selector[2:] | |
| 191 | |
| 192 if '$' in selector: | |
| 193 opt_match = FILTER_OPTIONS_REGEXP.search(selector) | |
| 194 if opt_match: | |
| 195 selector = selector[:opt_match.start(0)] | |
| 196 options = _parse_filter_options(opt_match.group(1)) | |
| 197 | |
| 198 if (len(selector) > 1 and | |
| 199 selector.startswith('/') and selector.endswith('/')): | |
| 200 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]} | |
| 201 else: | |
| 202 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector} | |
| 203 | |
| 204 return Filter(text, selector, action, options) | |
| 205 | |
| 206 | |
| 207 def _parse_hiding_filter(text, domain, type_flag, selector_value): | |
| 208 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value} | |
| 209 action = FILTER_ACTION.HIDE | |
| 210 options = [] | |
| 211 | |
| 212 if type_flag == '@': | |
| 213 action = FILTER_ACTION.SHOW | |
| 214 elif type_flag == '?': | |
| 215 selector['type'] = SELECTOR_TYPE.XCSS | |
| 216 | |
| 217 if domain: | |
| 218 domains = [_parse_option(d) for d in domain.split(',')] | |
| 219 options.append((FILTER_OPTION.DOMAIN, domains)) | |
| 220 | |
| 221 return Filter(text, selector, action, options) | |
| 222 | |
| 223 | |
| 224 def parse_filter(text): | |
| 225 """Parse one filter. | |
| 226 | |
| 227 :param text: Text representation of a filter. | |
| 228 :returns: Filter object. | |
| 229 """ | |
| 230 if '#' in text: | |
| 231 match = HIDING_FILTER_REGEXP.search(text) | |
| 232 if match: | |
| 233 return _parse_hiding_filter(text, *match.groups()) | |
| 234 return _parse_blocking_filter(text) | |
| 235 | |
| 236 | |
| 91 def parse_line(line_text): | 237 def parse_line(line_text): |
| 92 """Parse one line of a filter list. | 238 """Parse one line of a filter list. |
| 93 | 239 |
| 94 :param line_text: Line of a filter list (must be a unicode string). | 240 :param line_text: Line of a filter list (must be a unicode string). |
| 95 :returns: Parsed line object (see `_line_type`). | 241 :returns: Parsed line object (see `_line_type`). |
| 96 :raises ParseError: If the line can't be successfully parsed. | 242 :raises ParseError: If the line can't be successfully parsed. |
| 97 """ | 243 """ |
| 98 content = line_text.strip() | 244 content = line_text.strip() |
| 99 | 245 |
| 100 if content == '': | 246 if content == '': |
| 101 line = EmptyLine() | 247 line = EmptyLine() |
| 102 elif content.startswith('!'): | 248 elif content.startswith('!'): |
| 103 line = _parse_comment(content) | 249 line = _parse_comment(content) |
| 104 elif content.startswith('%') and content.endswith('%'): | 250 elif content.startswith('%') and content.endswith('%'): |
| 105 line = _parse_instruction(content) | 251 line = _parse_instruction(content) |
| 106 elif content.startswith('[') and content.endswith(']'): | 252 elif content.startswith('[') and content.endswith(']'): |
| 107 line = _parse_header(content) | 253 line = _parse_header(content) |
| 108 else: | 254 else: |
| 109 line = Filter(content) | 255 line = parse_filter(content) |
| 110 | 256 |
| 111 assert line.to_string().replace(' ', '') == content.replace(' ', '') | 257 assert line.to_string().replace(' ', '') == content.replace(' ', '') |
| 112 return line | 258 return line |
| 113 | 259 |
| 114 | 260 |
| 115 def parse_filterlist(lines): | 261 def parse_filterlist(lines): |
| 116 """Parse filter list from an iterable. | 262 """Parse filter list from an iterable. |
| 117 | 263 |
| 118 :param lines: List of strings or file or other iterable. | 264 :param lines: List of strings or file or other iterable. |
| 119 :returns: Iterator over parsed lines. | 265 :returns: Iterator over parsed lines. |
| 120 :raises ParseError: Can be thrown during iteration for invalid lines. | 266 :raises ParseError: Can be thrown during iteration for invalid lines. |
| 121 """ | 267 """ |
| 122 for line in lines: | 268 for line in lines: |
| 123 yield parse_line(line) | 269 yield parse_line(line) |
| OLD | NEW |