| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
| 2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-2017 eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
| 12 # | 12 # |
| 13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
| 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| 15 | 15 |
| 16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
| 17 | 17 |
| 18 import re | 18 import re |
| 19 from collections import namedtuple | 19 from collections import namedtuple |
| 20 | 20 |
| 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] | 21 __all__ = ['parse_filterlist', 'parse_line', 'parse_filter'] |
| 22 | 22 |
| 23 | 23 |
| 24 class ParseError(Exception): | 24 class ParseError(Exception): |
| 25 """Exception thrown by the parser when it encounters invalid input. | 25 """Internal exception used by the parser to signal invalid input.""" |
|
mathias
2017/07/26 20:37:15
Removing the custom __init__ function looks like a
| |
| 26 | |
| 27 :param error: Description of the error. | |
| 28 :param text: The text which was being parsed when an error occurred. | |
| 29 """ | |
| 30 | |
| 31 def __init__(self, error, text): | |
| 32 Exception.__init__(self, '{} in "{}"'.format(error, text)) | |
| 33 self.text = text | |
| 34 self.error = error | |
| 35 | 26 |
| 36 | 27 |
| 37 def line_type(name, field_names, format_string): | 28 def line_type(name, field_names, format_string): |
| 38 """Define a line type. | 29 """Define a line type. |
| 39 | 30 |
| 40 :param name: The name of the line type to define. | 31 :param name: The name of the line type to define. |
| 41 :param field_names: A sequence of field names or one space-separated | 32 :param field_names: A sequence of field names or one space-separated |
| 42 string that contains all field names. | 33 string that contains all field names. |
| 34 :param format_string: A format specifier for converting this line type | |
|
mathias
2017/07/26 20:37:15
Fixing the missing format_string parameter documen
| |
| 35 back to string representation. | |
| 43 :returns: Class created with `namedtuple` that has `.type` set to | 36 :returns: Class created with `namedtuple` that has `.type` set to |
| 44 lowercased `name` and supports conversion back to string with | 37 lowercased `name` and supports conversion back to string with |
| 45 `.to_string()` method. | 38 `.to_string()` method. |
| 46 """ | 39 """ |
| 47 lt = namedtuple(name, field_names) | 40 lt = namedtuple(name, field_names) |
| 48 lt.type = name.lower() | 41 lt.type = name.lower() |
| 49 lt.to_string = lambda self: format_string.format(self) | 42 lt.to_string = lambda self: format_string.format(self) |
| 50 return lt | 43 return lt |
| 51 | 44 |
| 52 | 45 |
| 46 InvalidLine = line_type('Invalid', 'text error', '{.text}') | |
| 53 Header = line_type('Header', 'version', '[{.version}]') | 47 Header = line_type('Header', 'version', '[{.version}]') |
| 54 EmptyLine = line_type('EmptyLine', '', '') | 48 EmptyLine = line_type('EmptyLine', '', '') |
| 55 Comment = line_type('Comment', 'text', '! {.text}') | 49 Comment = line_type('Comment', 'text', '! {.text}') |
| 56 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 50 Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| 57 Filter = line_type('Filter', 'expression', '{.expression}') | |
| 58 Include = line_type('Include', 'target', '%include {0.target}%') | 51 Include = line_type('Include', 'target', '%include {0.target}%') |
| 52 Filter = line_type('Filter', 'text selector action options', '{.text}') | |
| 59 | 53 |
| 60 | 54 |
| 61 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 55 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
| 62 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 56 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
| 63 'Version'} | 57 'Version'} |
| 64 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 58 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| 65 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 59 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
| 60 BFILTER_OPTIONS_REGEXP = re.compile( | |
| 61 r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$' | |
| 62 ) | |
| 63 HFILTER_REGEXP = re.compile( | |
| 64 r'^([^\/\*\|\@"!]*?)#(\@)?(?:([\w\-]+|\*)' | |
| 65 r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)|#([^{}]+))$' | |
| 66 ) | |
| 67 | |
| 68 # Types of resources to block (based on adblockpluscore/lib/filterClasses.js). | |
| 69 TYPES = { | |
| 70 'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media', | |
| 71 'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup', | |
| 72 'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide', | |
| 73 } | |
| 74 | |
| 75 # Special types used for whitelisting. | |
| 76 TYPES_WHITELIST = { | |
| 77 'document', 'elemhide', 'generichide', 'genericblock', | |
| 78 } | |
| 79 | |
| 80 # By default blocking filters apply to everything except whitelist-only types | |
| 81 # (based on adblockpluscore/lib/filterClasses.js, which appears to exclude popups as well; this set keeps them). | |
| 82 TYPES_DEFAULT = {t for t in TYPES if t not in TYPES_WHITELIST} | |
| 83 | |
| 84 # Type options that are synonyms for other types. | |
| 85 TYPE_SYNONYMS = { | |
| 86 'xbl': 'other', | |
| 87 'dtd': 'other', | |
| 88 'background': 'image', | |
| 89 } | |
| 66 | 90 |
| 67 | 91 |
| 68 def _parse_comment(text): | 92 def _parse_comment(text): |
| 69 match = METADATA_REGEXP.match(text) | 93 match = METADATA_REGEXP.match(text) |
| 70 if match and match.group(1) in METADATA_KEYS: | 94 if match and match.group(1) in METADATA_KEYS: |
| 71 return Metadata(match.group(1), match.group(2)) | 95 return Metadata(match.group(1), match.group(2)) |
| 72 return Comment(text[1:].strip()) | 96 return Comment(text[1:].strip()) |
| 73 | 97 |
| 74 | 98 |
| 75 def _parse_header(text): | 99 def _parse_header(text): |
| 76 match = HEADER_REGEXP.match(text) | 100 match = HEADER_REGEXP.match(text) |
| 77 if not match: | 101 if not match: |
| 78 raise ParseError('Malformed header', text) | 102 raise ParseError('Malformed header') |
|
mathias
2017/07/26 20:37:15
Please explain why you don't include the malformed
Vasily Kuznetsov
2017/07/27 11:05:02
My reasoning was that you can get to this place in
| |
| 79 return Header(match.group(1)) | 103 return Header(match.group(1)) |
| 80 | 104 |
| 81 | 105 |
| 82 def _parse_instruction(text): | 106 def _parse_instruction(text): |
| 83 match = INCLUDE_REGEXP.match(text) | 107 match = INCLUDE_REGEXP.match(text) |
| 84 if not match: | 108 if not match: |
| 85 raise ParseError('Unrecognized instruction', text) | 109 raise ParseError('Unrecognized instruction') |
| 86 return Include(match.group(1)) | 110 return Include(match.group(1)) |
| 87 | 111 |
| 88 | 112 |
| 113 def _separate_domains(domains): | |
| 114 options = {} | |
| 115 for d in domains: | |
| 116 if d.startswith('~'): | |
| 117 options.setdefault('domains-exclude', []).append(d.lstrip('~')) | |
| 118 else: | |
| 119 options.setdefault('domains-include', []).append(d) | |
| 120 if 'domains-include' in options: | |
| 121 options['domains-none'] = True | |
| 122 return options | |
| 123 | |
| 124 | |
| 125 def _separate_types(types): | |
| 126 """Convert a list of `(type, on_off)` tuples to options: | |
| 127 | |
| 128 - types-none: True if we start with nothing included, absent if we start | |
| 129 with TYPES_DEFAULT included. | |
| 130 - types-include: List of additional included types. | |
| 131 - types-exclude: List of excluded types. | |
| 132 """ | |
| 133 if not types: | |
| 134 return {} | |
| 135 | |
| 136 if types[0][1]: # If the first type is ON, we start with nothing... | |
| 137 types_default = set() | |
| 138 options = {'types-none': True} | |
| 139 else: # ...otherwise we start with default type set. | |
| 140 types_default = TYPES_DEFAULT | |
| 141 options = {} | |
| 142 | |
| 143 # Include/exclude any deviations from default. | |
| 144 for name, value in dict(types).items(): | |
| 145 if value and name not in types_default: | |
| 146 options.setdefault('types-include', []).append(name) | |
| 147 if not value and name in types_default: | |
| 148 options.setdefault('types-exclude', []).append(name) | |
| 149 | |
| 150 return options | |
| 151 | |
| 152 | |
| 153 def _parse_hiding_filter(text, match): | |
| 154 if match.group(5): | |
| 155 selector = {'type': 'css', 'value': match.group(5)} | |
| 156 else: | |
| 157 selector = { | |
| 158 'type': 'abp-simple', | |
| 159 'value': match.group(3) + match.group(4), | |
| 160 } | |
| 161 action = 'show' if match.group(2) else 'hide' | |
| 162 options = _separate_domains(list(filter(None, match.group(1).split(',')))) | |
| 163 return Filter(text, selector, action, options) | |
| 164 | |
| 165 | |
| 166 def _parse_filter_options(options): | |
| 167 # Based on RegExpFilter.fromText in lib/filterClasses.js | |
| 168 # in adblockpluscore. | |
| 169 parsed_options = {} | |
| 170 type_options = [] | |
| 171 | |
| 172 for option in options.split(','): | |
| 173 if '=' in option: | |
| 174 name, value = option.split('=', 1) | |
| 175 elif option.startswith('~'): | |
| 176 name, value = option[1:], False | |
| 177 else: | |
| 178 name, value = option, True | |
| 179 | |
| 180 if name in TYPE_SYNONYMS: | |
| 181 name = TYPE_SYNONYMS[name] | |
| 182 if name in TYPES: | |
| 183 type_options.append((name, value)) | |
| 184 elif name == 'domain': | |
| 185 parsed_options.update(_separate_domains(value.split('|'))) | |
| 186 elif name == 'sitekey': | |
| 187 parsed_options['sitekeys'] = value.split('|') | |
| 188 else: | |
| 189 parsed_options[name] = value | |
| 190 | |
| 191 parsed_options.update(_separate_types(type_options)) | |
| 192 return parsed_options | |
| 193 | |
| 194 | |
| 195 def _parse_blocking_filter(text): | |
| 196 # Based on RegExpFilter.fromText in lib/filterClasses.js | |
| 197 # in adblockpluscore. | |
| 198 action = 'block' | |
| 199 options = {} | |
| 200 selector = text | |
| 201 | |
| 202 if selector.startswith('@@'): | |
| 203 action = 'allow' | |
|
mathias
2017/07/26 20:37:15
I think we should have symbols like BFILTER_ACTION
Vasily Kuznetsov
2017/07/27 11:05:02
Probably not these exact names for the constants,
| |
| 204 selector = selector[2:] | |
| 205 | |
| 206 if '$' in selector: | |
| 207 opt_match = BFILTER_OPTIONS_REGEXP.search(selector) | |
| 208 if opt_match: | |
| 209 selector = selector[:opt_match.start(0)] | |
| 210 options = _parse_filter_options(opt_match.group(1)) | |
| 211 | |
| 212 if (len(selector) > 1 and | |
| 213 selector.startswith('/') and selector.endswith('/')): | |
| 214 selector = {'type': 'url-regexp', 'value': selector[1:-1]} | |
|
mathias
2017/07/26 20:37:15
I also think we should have symbols like SELECTOR_
Vasily Kuznetsov
2017/07/27 11:05:02
Acknowledged.
| |
| 215 else: | |
| 216 selector = {'type': 'url-pattern', 'value': selector} | |
| 217 | |
| 218 return Filter(text, selector, action, options) | |
| 219 | |
| 220 | |
| 221 def parse_filter(text): | |
| 222 """Parse one filter. | |
| 223 | |
| 224 :param text: Text representation of a filter. | |
| 225 :returns: filter object. | |
| 226 """ | |
| 227 match = HFILTER_REGEXP.match(text) if '#' in text else False | |
|
mathias
2017/07/26 20:37:15
Call me old-fashioned but I seriously dislike chan
Vasily Kuznetsov
2017/07/27 11:05:02
Completely agree about changing the type of the va
| |
| 228 if match: | |
| 229 return _parse_hiding_filter(text, match) | |
| 230 return _parse_blocking_filter(text) | |
| 231 | |
| 232 | |
| 89 def parse_line(line_text): | 233 def parse_line(line_text): |
| 90 """Parse one line of a filter list. | 234 """Parse one line of a filter list. |
| 91 | 235 |
| 92 :param line_text: Line of a filter list (must be a unicode string). | 236 :param line_text: Line of a filter list (must be a unicode string). |
| 93 :returns: Parsed line object (see `line_type`). | 237 :returns: Parsed line object (see `line_type`). |
| 94 :raises ParseError: If the line can't be successfully parsed. | 238 :raises ParseError: If the line can't be successfully parsed. |
| 95 """ | 239 """ |
| 96 content = line_text.strip() | 240 content = line_text.strip() |
| 97 | 241 |
| 98 if content == '': | 242 if content == '': |
| 99 line = EmptyLine() | 243 line = EmptyLine() |
| 100 elif content.startswith('!'): | 244 elif content.startswith('!'): |
| 101 line = _parse_comment(content) | 245 line = _parse_comment(content) |
| 102 elif content.startswith('%') and content.endswith('%'): | 246 elif content.startswith('%') and content.endswith('%'): |
| 103 line = _parse_instruction(content) | 247 line = _parse_instruction(content) |
| 104 elif content.startswith('[') and content.endswith(']'): | 248 elif content.startswith('[') and content.endswith(']'): |
| 105 line = _parse_header(content) | 249 line = _parse_header(content) |
| 106 else: | 250 else: |
| 107 line = Filter(content) | 251 line = parse_filter(content) |
| 108 | 252 |
| 109 assert line.to_string().replace(' ', '') == content.replace(' ', '') | 253 assert line.to_string().replace(' ', '') == content.replace(' ', '') |
| 110 return line | 254 return line |
| 111 | 255 |
| 112 | 256 |
| 113 def parse_filterlist(lines): | 257 def parse_filterlist(lines): |
| 114 """Parse filter list from an iterable. | 258 """Parse filter list from an iterable. |
| 115 | 259 |
| 116 :param lines: List of strings or file or other iterable. | 260 :param lines: List of strings or file or other iterable. |
| 117 :returns: Iterator over parsed lines. | 261 :returns: Iterator over parsed lines. |
| 118 :raises ParseError: Can be thrown during iteration for invalid lines. | 262 Lines that cannot be parsed are yielded as `InvalidLine` objects (with the error text) instead of raising `ParseError`. |
| 119 """ | 263 """ |
| 120 for line in lines: | 264 for line in lines: |
| 121 yield parse_line(line) | 265 try: |
| 266 yield parse_line(line) | |
| 267 except ParseError as pe: | |
| 268 yield InvalidLine(line.strip(), str(pe)) | |
| OLD | NEW |