| OLD | NEW | 
|---|---|
| 1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 
| 2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-2017 eyeo GmbH | 
| 3 # | 3 # | 
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify | 
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as | 
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. | 
| 7 # | 7 # | 
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, | 
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. | 
| 12 # | 12 # | 
| 13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License | 
| 14 # along with Adblock Plus.  If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus.  If not, see <http://www.gnu.org/licenses/>. | 
| 15 | 15 | 
| 16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals | 
| 17 | 17 | 
| 18 import re | 18 import re | 
| 19 from collections import namedtuple | 19 from collections import namedtuple | 
| 20 | 20 | 
| 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] | 21 __all__ = [ | 
|  | 22     'FILTER_ACTION', | 
|  | 23     'FILTER_OPTION', | 
|  | 24     'ParseError', | 
|  | 25     'SELECTOR_TYPE', | 
|  | 26     'parse_filterlist', | 
|  | 27     'parse_line', | 
|  | 28 ] | 
| 22 | 29 | 
| 23 | 30 | 
| 24 class ParseError(Exception): | 31 class ParseError(Exception): | 
| 25     """Exception thrown by the parser when it encounters invalid input. | 32     """Exception thrown by the parser when it encounters invalid input. | 
| 26 | 33 | 
| 27     :param error: Description of the error. | 34     :param error: Description of the error. | 
| 28     :param text: The text which was being parsed when an error occurred. | 35     :param text: The text which was being parsed when an error occurred. | 
| 29     """ | 36     """ | 
| 30 | 37 | 
| 31     def __init__(self, error, text): | 38     def __init__(self, error, text): | 
| 32         Exception.__init__(self, '{} in "{}"'.format(error, text)) | 39         Exception.__init__(self, '{} in "{}"'.format(error, text)) | 
| 33         self.text = text | 40         self.text = text | 
| 34         self.error = error | 41         self.error = error | 
| 35 | 42 | 
| 36 | 43 | 
|  | 44 # Constants related to filters (see https://adblockplus.org/filters). | 
|  | 45 class SELECTOR_TYPE:  # flake8: noqa (This class is an enumeration constant). | 
|  | 46     """Selector types""" | 
|  | 47     URL_PATTERN = 'url-pattern'  # Normal URL patterns. | 
|  | 48     URL_REGEXP = 'url-regexp'    # Regular expressions for URLs. | 
|  | 49     CSS = 'css'                  # CSS selectors for hiding filters. | 
|  | 50     XCSS = 'extended-css'        # Extended CSS selectors (to emulate CSS4). | 
|  | 51     ABP_SIMPLE = 'abp-simple'    # Simplified element hiding syntax. | 
|  | 52 | 
|  | 53 | 
|  | 54 class FILTER_ACTION:  # flake8: noqa (This class is an enumeration constant). | 
|  | 55     """Filter actions""" | 
|  | 56     BLOCK = 'block'              # Block the request. | 
|  | 57     ALLOW = 'allow'              # Allow the request (whitelist). | 
|  | 58     HIDE = 'hide'                # Hide selected element(s). | 
|  | 59     SHOW = 'show'                # Show selected element(s) (whitelist). | 
|  | 60 | 
|  | 61 | 
|  | 62 class FILTER_OPTION:  # flake8: noqa (This class is an enumeration constant). | 
|  | 63     """Filter options""" | 
|  | 64     # Resource types. | 
|  | 65     OTHER = 'other' | 
|  | 66     SCRIPT = 'script' | 
|  | 67     IMAGE = 'image' | 
|  | 68     STYLESHEET = 'stylesheet' | 
|  | 69     OBJECT = 'object' | 
|  | 70     SUBDOCUMENT = 'subdocument' | 
|  | 71     DOCUMENT = 'document' | 
|  | 72     WEBSOCKET = 'websocket' | 
|  | 73     WEBRTC = 'webrtc' | 
|  | 74     PING = 'ping' | 
|  | 75     XMLHTTPREQUEST = 'xmlhttprequest' | 
|  | 76     OBJECT_SUBREQUEST = 'object-subrequest' | 
|  | 77     MEDIA = 'media' | 
|  | 78     FONT = 'font' | 
|  | 79     POPUP = 'popup' | 
|  | 80     GENERICBLOCK = 'genericblock' | 
|  | 81     ELEMHIDE = 'elemhide' | 
|  | 82     GENERICHIDE = 'generichide' | 
|  | 83 | 
|  | 84     # Deprecated resource types. | 
|  | 85     BACKGROUND = 'background' | 
|  | 86     XBL = 'xbl' | 
|  | 87     DTD = 'dtd' | 
|  | 88 | 
|  | 89     # Other options. | 
|  | 90     MATCH_CASE = 'match-case' | 
|  | 91     DOMAIN = 'domain' | 
|  | 92     THIRD_PARTY = 'third-party' | 
|  | 93     COLLAPSE = 'collapse' | 
|  | 94     SITEKEY = 'sitekey' | 
|  | 95     DONOTTRACK = 'donottrack' | 
|  | 96 | 
|  | 97 | 
| 37 def _line_type(name, field_names, format_string): | 98 def _line_type(name, field_names, format_string): | 
| 38     """Define a line type. | 99     """Define a line type. | 
| 39 | 100 | 
| 40     :param name: The name of the line type to define. | 101     :param name: The name of the line type to define. | 
| 41     :param field_names: A sequence of field names or one space-separated | 102     :param field_names: A sequence of field names or one space-separated | 
| 42         string that contains all field names. | 103         string that contains all field names. | 
| 43     :param format_string: A format specifier for converting this line type | 104     :param format_string: A format specifier for converting this line type | 
| 44         back to string representation. | 105         back to string representation. | 
| 45     :returns: Class created with `namedtuple` that has `.type` set to | 106     :returns: Class created with `namedtuple` that has `.type` set to | 
| 46         lowercased `name` and supports conversion back to string with | 107         lowercased `name` and supports conversion back to string with | 
| 47         `.to_string()` method. | 108         `.to_string()` method. | 
| 48     """ | 109     """ | 
| 49     lt = namedtuple(name, field_names) | 110     lt = namedtuple(name, field_names) | 
| 50     lt.type = name.lower() | 111     lt.type = name.lower() | 
| 51     lt.to_string = lambda self: format_string.format(self) | 112     lt.to_string = lambda self: format_string.format(self) | 
| 52     return lt | 113     return lt | 
| 53 | 114 | 
| 54 | 115 | 
| 55 Header = _line_type('Header', 'version', '[{.version}]') | 116 Header = _line_type('Header', 'version', '[{.version}]') | 
| 56 EmptyLine = _line_type('EmptyLine', '', '') | 117 EmptyLine = _line_type('EmptyLine', '', '') | 
| 57 Comment = _line_type('Comment', 'text', '! {.text}') | 118 Comment = _line_type('Comment', 'text', '! {.text}') | 
| 58 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 
| 59 Filter = _line_type('Filter', 'expression', '{.expression}') | 120 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 
| 60 Include = _line_type('Include', 'target', '%include {0.target}%') | 121 Include = _line_type('Include', 'target', '%include {0.target}%') | 
| 61 | 122 | 
| 62 | 123 | 
| 63 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 
| 64 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 
| 65                  'Version'} | 126                  'Version'} | 
| 66 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 
| 67 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 
|  | 129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 
|  | 130 FILTER_OPTIONS_REGEXP = re.compile( | 
|  | 131     r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' | 
|  | 132 ) | 
| 68 | 133 | 
| 69 | 134 | 
| 70 def _parse_comment(text): | 135 def _parse_comment(text): | 
| 71     match = METADATA_REGEXP.match(text) | 136     match = METADATA_REGEXP.match(text) | 
| 72     if match and match.group(1) in METADATA_KEYS: | 137     if match and match.group(1) in METADATA_KEYS: | 
| 73         return Metadata(match.group(1), match.group(2)) | 138         return Metadata(match.group(1), match.group(2)) | 
| 74     return Comment(text[1:].strip()) | 139     return Comment(text[1:].strip()) | 
| 75 | 140 | 
| 76 | 141 | 
| 77 def _parse_header(text): | 142 def _parse_header(text): | 
| 78     match = HEADER_REGEXP.match(text) | 143     match = HEADER_REGEXP.match(text) | 
| 79     if not match: | 144     if not match: | 
| 80         raise ParseError('Malformed header', text) | 145         raise ParseError('Malformed header', text) | 
| 81     return Header(match.group(1)) | 146     return Header(match.group(1)) | 
| 82 | 147 | 
| 83 | 148 | 
| 84 def _parse_instruction(text): | 149 def _parse_instruction(text): | 
| 85     match = INCLUDE_REGEXP.match(text) | 150     match = INCLUDE_REGEXP.match(text) | 
| 86     if not match: | 151     if not match: | 
| 87         raise ParseError('Unrecognized instruction', text) | 152         raise ParseError('Unrecognized instruction', text) | 
| 88     return Include(match.group(1)) | 153     return Include(match.group(1)) | 
| 89 | 154 | 
| 90 | 155 | 
|  | 156 def _parse_option(option): | 
|  | 157     if '=' in option: | 
|  | 158         return option.split('=', 1) | 
|  | 159     if option.startswith('~'): | 
|  | 160         return option[1:], False | 
|  | 161     return option, True | 
|  | 162 | 
|  | 163 | 
|  | 164 def _parse_filter_option(option): | 
|  | 165     name, value = _parse_option(option) | 
|  | 166 | 
|  | 167     # Handle special cases of multivalued options. | 
|  | 168     if name == FILTER_OPTION.DOMAIN: | 
|  | 169         value = [_parse_option(o) for o in value.split('|')] | 
|  | 170     elif name == FILTER_OPTION.SITEKEY: | 
|  | 171         value = value.split('|') | 
|  | 172 | 
|  | 173     return name, value | 
|  | 174 | 
|  | 175 | 
|  | 176 def _parse_filter_options(options): | 
|  | 177     return [_parse_filter_option(o) for o in options.split(',')] | 
|  | 178 | 
|  | 179 | 
|  | 180 def _parse_blocking_filter(text): | 
|  | 181     # Based on RegExpFilter.fromText in lib/filterClasses.js | 
|  | 182     # in https://hg.adblockplus.org/adblockpluscore. | 
|  | 183     action = FILTER_ACTION.BLOCK | 
|  | 184     options = [] | 
|  | 185     selector = text | 
|  | 186 | 
|  | 187     if selector.startswith('@@'): | 
|  | 188         action = FILTER_ACTION.ALLOW | 
|  | 189         selector = selector[2:] | 
|  | 190 | 
|  | 191     if '$' in selector: | 
|  | 192         opt_match = FILTER_OPTIONS_REGEXP.search(selector) | 
|  | 193         if opt_match: | 
|  | 194             selector = selector[:opt_match.start(0)] | 
|  | 195             options = _parse_filter_options(opt_match.group(1)) | 
|  | 196 | 
|  | 197     if (len(selector) > 1 and | 
|  | 198             selector.startswith('/') and selector.endswith('/')): | 
|  | 199         selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]} | 
|  | 200     else: | 
|  | 201         selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector} | 
|  | 202 | 
|  | 203     return Filter(text, selector, action, options) | 
|  | 204 | 
|  | 205 | 
|  | 206 def _parse_hiding_filter(text, domain, type_flag, selector_value): | 
|  | 207     selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value} | 
|  | 208     action = FILTER_ACTION.HIDE | 
|  | 209     options = [] | 
|  | 210 | 
|  | 211     if type_flag == '@': | 
|  | 212         action = FILTER_ACTION.SHOW | 
|  | 213     elif type_flag == '?': | 
|  | 214         selector['type'] = SELECTOR_TYPE.XCSS | 
|  | 215 | 
|  | 216     if domain: | 
|  | 217         domains = [_parse_option(d) for d in domain.split(',')] | 
|  | 218         options.append((FILTER_OPTION.DOMAIN, domains)) | 
|  | 219 | 
|  | 220     return Filter(text, selector, action, options) | 
|  | 221 | 
|  | 222 | 
|  | 223 def parse_filter(text): | 
|  | 224     """Parse one filter. | 
|  | 225 | 
|  | 226     :param text: Text representation of a filter. | 
|  | 227     :returns: Filter object. | 
|  | 228     """ | 
|  | 229     if '#' in text: | 
|  | 230         match = HIDING_FILTER_REGEXP.search(text) | 
|  | 231         if match: | 
|  | 232             return _parse_hiding_filter(text, *match.groups()) | 
|  | 233     return _parse_blocking_filter(text) | 
|  | 234 | 
|  | 235 | 
| 91 def parse_line(line_text): | 236 def parse_line(line_text): | 
| 92     """Parse one line of a filter list. | 237     """Parse one line of a filter list. | 
| 93 | 238 | 
| 94     :param line_text: Line of a filter list (must be a unicode string). | 239     :param line_text: Line of a filter list (must be a unicode string). | 
| 95     :returns: Parsed line object (see `_line_type`). | 240     :returns: Parsed line object (see `_line_type`). | 
| 96     :raises ParseError: If the line can't be successfully parsed. | 241     :raises ParseError: If the line can't be successfully parsed. | 
| 97     """ | 242     """ | 
| 98     content = line_text.strip() | 243     content = line_text.strip() | 
| 99 | 244 | 
| 100     if content == '': | 245     if content == '': | 
| 101         line = EmptyLine() | 246         line = EmptyLine() | 
| 102     elif content.startswith('!'): | 247     elif content.startswith('!'): | 
| 103         line = _parse_comment(content) | 248         line = _parse_comment(content) | 
| 104     elif content.startswith('%') and content.endswith('%'): | 249     elif content.startswith('%') and content.endswith('%'): | 
| 105         line = _parse_instruction(content) | 250         line = _parse_instruction(content) | 
| 106     elif content.startswith('[') and content.endswith(']'): | 251     elif content.startswith('[') and content.endswith(']'): | 
| 107         line = _parse_header(content) | 252         line = _parse_header(content) | 
| 108     else: | 253     else: | 
| 109         line = Filter(content) | 254         line = parse_filter(content) | 
| 110 | 255 | 
| 111     assert line.to_string().replace(' ', '') == content.replace(' ', '') | 256     assert line.to_string().replace(' ', '') == content.replace(' ', '') | 
| 112     return line | 257     return line | 
| 113 | 258 | 
| 114 | 259 | 
| 115 def parse_filterlist(lines): | 260 def parse_filterlist(lines): | 
| 116     """Parse filter list from an iterable. | 261     """Parse filter list from an iterable. | 
| 117 | 262 | 
| 118     :param lines: List of strings or file or other iterable. | 263     :param lines: List of strings or file or other iterable. | 
| 119     :returns: Iterator over parsed lines. | 264     :returns: Iterator over parsed lines. | 
| 120     :raises ParseError: Can be thrown during iteration for invalid lines. | 265     :raises ParseError: Can be thrown during iteration for invalid lines. | 
| 121     """ | 266     """ | 
| 122     for line in lines: | 267     for line in lines: | 
| 123         yield parse_line(line) | 268         yield parse_line(line) | 
| OLD | NEW | 
|---|