| Index: abp/filters/parser.py |
| =================================================================== |
| --- a/abp/filters/parser.py |
| +++ b/abp/filters/parser.py |
| @@ -13,84 +13,228 @@ |
| # You should have received a copy of the GNU General Public License |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| from __future__ import unicode_literals |
| import re |
| from collections import namedtuple |
| -__all__ = ['parse_filterlist', 'parse_line', 'ParseError'] |
| +__all__ = ['parse_filterlist', 'parse_line', 'parse_filter'] |
| class ParseError(Exception): |
| - """Exception thrown by the parser when it encounters invalid input. |
| - |
| - :param error: Description of the error. |
| - :param text: The text which was being parsed when an error occurred. |
| - """ |
| - |
| - def __init__(self, error, text): |
| - Exception.__init__(self, '{} in "{}"'.format(error, text)) |
| - self.text = text |
| - self.error = error |
| + """Internal exception used by the parser to signal invalid input.""" |
|
mathias
2017/07/26 20:37:15
Removing the custom __init__ function looks like a
|
| def line_type(name, field_names, format_string): |
| """Define a line type. |
| :param name: The name of the line type to define. |
| :param field_names: A sequence of field names or one space-separated |
| string that contains all field names. |
| + :param format_string: A format specifier for converting this line type |
|
mathias
2017/07/26 20:37:15
Fixing the missing format_string parameter documen
|
| + back to string representation. |
| :returns: Class created with `namedtuple` that has `.type` set to |
| lowercased `name` and supports conversion back to string with |
| `.to_string()` method. |
| """ |
| lt = namedtuple(name, field_names) |
| lt.type = name.lower() |
| lt.to_string = lambda self: format_string.format(self) |
| return lt |
| +InvalidLine = line_type('Invalid', 'text error', '{.text}') |
| Header = line_type('Header', 'version', '[{.version}]') |
| EmptyLine = line_type('EmptyLine', '', '') |
| Comment = line_type('Comment', 'text', '! {.text}') |
| Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| -Filter = line_type('Filter', 'expression', '{.expression}') |
| Include = line_type('Include', 'target', '%include {0.target}%') |
| +Filter = line_type('Filter', 'text selector action options', '{.text}') |
| METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
| METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
| 'Version'} |
| INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
# Trailing option list of a blocking filter ("...$opt,~opt,opt=value").
# Group 1 captures the comma-separated options without the leading "$".
BFILTER_OPTIONS_REGEXP = re.compile(
    r'\$('                            # "$" starts the option list
    r'~?[\w\-]+(?:=[^,\s]+)?'         # first option, optional "~" / "=value"
    r'(?:,~?[\w\-]+(?:=[^,\s]+)?)*'   # any number of further options
    r')$'                             # the list runs to the end of the text
)

# Element hiding filter. Groups:
#   1 - text before the first "#" (comma-separated domain list),
#   2 - "@" when the filter is an exception, otherwise None,
#   3 - tag name or "*" (simple syntax only),
#   4 - zero or more "(attribute...)" rules (simple syntax only),
#   5 - selector text following "##" (CSS syntax only).
HFILTER_REGEXP = re.compile(
    r'^([^\/\*\|\@"!]*?)'
    r'#(\@)?'
    r'(?:([\w\-]+|\*)'
    r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)'
    r'|#([^{}]+))$'
)
| + |
# Types of resources to block (based on adblockpluscore/lib/filterClasses.js).
TYPES = {
    'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',
    'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',
    'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',
}

# Special types used for whitelisting.
TYPES_WHITELIST = {
    'document', 'elemhide', 'generichide', 'genericblock',
}

# By default blocking filters apply to everything except whitelist-only types
# and popups (based on adblockpluscore/lib/filterClasses.js). 'popup' is not
# a whitelist-only type, so it has to be removed from the defaults explicitly.
TYPES_DEFAULT = TYPES - TYPES_WHITELIST - {'popup'}

# Type options that are synonyms for other types.
TYPE_SYNONYMS = {
    'xbl': 'other',
    'dtd': 'other',
    'background': 'image',
}
def _parse_comment(text):
    """Parse a line that starts with "!".

    :param text: Stripped line text, including the leading "!".
    :returns: `Metadata` when the line has the form "! Key: value" and the
              key is one of `METADATA_KEYS`, otherwise a `Comment` holding
              the text after the "!" with surrounding whitespace removed.
    """
    found = METADATA_REGEXP.match(text)
    if found is not None:
        key, value = found.group(1), found.group(2)
        # Only well-known keys count as metadata; anything else that merely
        # looks like "key: value" stays an ordinary comment.
        if key in METADATA_KEYS:
            return Metadata(key, value)
    return Comment(text[1:].strip())
| def _parse_header(text): |
| match = HEADER_REGEXP.match(text) |
| if not match: |
| - raise ParseError('Malformed header', text) |
| + raise ParseError('Malformed header') |
|
mathias
2017/07/26 20:37:15
Please explain why you don't include the malformed
Vasily Kuznetsov
2017/07/27 11:05:02
My reasoning was that you can get to this place in
|
| return Header(match.group(1)) |
| def _parse_instruction(text): |
| match = INCLUDE_REGEXP.match(text) |
| if not match: |
| - raise ParseError('Unrecognized instruction', text) |
| + raise ParseError('Unrecognized instruction') |
| return Include(match.group(1)) |
| +def _separate_domains(domains): |
| + options = {} |
| + for d in domains: |
| + if d.startswith('~'): |
| + options.setdefault('domains-exclude', []).append(d.lstrip('~')) |
| + else: |
| + options.setdefault('domains-include', []).append(d) |
| + if 'domains-include' in options: |
| + options['domains-none'] = True |
| + return options |
| + |
| + |
| +def _separate_types(types): |
| + """Convert a list of `(type, on_off)` tuples to options: |
| + |
| + - types-none: True if we start with nothing included, absent if we start |
| + with TYPES_DEFAULT included. |
| + - types-include: List of additional included types. |
| + - types-exclude: List of excluded types. |
| + """ |
| + if not types: |
| + return {} |
| + |
| + if types[0][1]: # If the first type is ON, we start with nothing... |
| + types_default = set() |
| + options = {'types-none': True} |
| + else: # ...otherwise we start with default type set. |
| + types_default = TYPES_DEFAULT |
| + options = {} |
| + |
| + # Include/exclude any deviations from default. |
| + for name, value in dict(types).items(): |
| + if value and name not in types_default: |
| + options.setdefault('types-include', []).append(name) |
| + if not value and name in types_default: |
| + options.setdefault('types-exclude', []).append(name) |
| + |
| + return options |
| + |
| + |
def _parse_hiding_filter(text, match):
    """Build a `Filter` from an element hiding filter.

    :param text: Complete text of the filter.
    :param match: Match object whose groups 1-5 are laid out as in
                  `HFILTER_REGEXP`.
    :returns: `Filter` with action "show" for exception filters and "hide"
              otherwise, a "css" or "abp-simple" selector and the domain
              options produced by `_separate_domains`.
    """
    domain_list, exception, tag, attr_rules, css = match.group(1, 2, 3, 4, 5)

    if css:
        selector = {'type': 'css', 'value': css}
    else:
        # Simple syntax: tag name followed directly by its attribute rules.
        selector = {'type': 'abp-simple', 'value': tag + attr_rules}

    action = 'hide'
    if exception:
        action = 'show'

    # An empty domain part splits into [''], so drop empty pieces.
    domains = [domain for domain in domain_list.split(',') if domain]
    return Filter(text, selector, action, _separate_domains(domains))
| + |
| + |
def _parse_filter_options(options):
    """Parse the option list of a blocking filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js in
    adblockpluscore.

    :param options: Comma-separated options, without the leading "$".
    :returns: Dict of parsed options. Type options are condensed by
              `_separate_types`, "domain" is expanded by
              `_separate_domains`, "sitekey" becomes a "sitekeys" list and
              any other option is stored under its own name with its string
              value, or True / False for plain / "~"-negated options.
    :raises ParseError: If "domain" or "sitekey" is given without a value.
    """
    parsed_options = {}
    type_options = []

    for option in options.split(','):
        if '=' in option:
            name, value = option.split('=', 1)
        elif option.startswith('~'):
            name, value = option[1:], False
        else:
            name, value = option, True

        name = TYPE_SYNONYMS.get(name, name)

        if name in TYPES:
            type_options.append((name, value))
        elif name in ('domain', 'sitekey'):
            # These options are "|"-separated lists. Without "=value" the
            # value is a bool, and splitting it would fail with
            # AttributeError instead of reporting invalid input.
            if isinstance(value, bool):
                raise ParseError(
                    'Option "{}" requires a value'.format(name))
            if name == 'domain':
                parsed_options.update(_separate_domains(value.split('|')))
            else:
                parsed_options['sitekeys'] = value.split('|')
        else:
            parsed_options[name] = value

    parsed_options.update(_separate_types(type_options))
    return parsed_options
| + |
| + |
| +def _parse_blocking_filter(text): |
| + # Based on RegExpFilter.fromText in lib/filterClasses.js |
| + # in adblockpluscore. |
| + action = 'block' |
| + options = {} |
| + selector = text |
| + |
| + if selector.startswith('@@'): |
| + action = 'allow' |
|
mathias
2017/07/26 20:37:15
I think we should have symbols like BFILTER_ACTION
Vasily Kuznetsov
2017/07/27 11:05:02
Probably not these exact names for the constants,
|
| + selector = selector[2:] |
| + |
| + if '$' in selector: |
| + opt_match = BFILTER_OPTIONS_REGEXP.search(selector) |
| + if opt_match: |
| + selector = selector[:opt_match.start(0)] |
| + options = _parse_filter_options(opt_match.group(1)) |
| + |
| + if (len(selector) > 1 and |
| + selector.startswith('/') and selector.endswith('/')): |
| + selector = {'type': 'url-regexp', 'value': selector[1:-1]} |
|
mathias
2017/07/26 20:37:15
I also think we should have symbols like SELECTOR_
Vasily Kuznetsov
2017/07/27 11:05:02
Acknowledged.
|
| + else: |
| + selector = {'type': 'url-pattern', 'value': selector} |
| + |
| + return Filter(text, selector, action, options) |
| + |
| + |
| +def parse_filter(text): |
| + """Parse one filter. |
| + |
| + :param text: Text representation of a filter. |
| + :returns: filter object. |
| + """ |
| + match = HFILTER_REGEXP.match(text) if '#' in text else False |
|
mathias
2017/07/26 20:37:15
Call me old-fashioned but I seriously dislike chan
Vasily Kuznetsov
2017/07/27 11:05:02
Completely agree about changing the type of the va
|
| + if match: |
| + return _parse_hiding_filter(text, match) |
| + return _parse_blocking_filter(text) |
| + |
| + |
| def parse_line(line_text): |
| """Parse one line of a filter list. |
| :param line_text: Line of a filter list (must be a unicode string). |
| :returns: Parsed line object (see `line_type`). |
| :raises ParseError: If the line can't be successfully parsed. |
| """ |
| content = line_text.strip() |
| @@ -99,23 +243,26 @@ |
| line = EmptyLine() |
| elif content.startswith('!'): |
| line = _parse_comment(content) |
| elif content.startswith('%') and content.endswith('%'): |
| line = _parse_instruction(content) |
| elif content.startswith('[') and content.endswith(']'): |
| line = _parse_header(content) |
| else: |
| - line = Filter(content) |
| + line = parse_filter(content) |
| assert line.to_string().replace(' ', '') == content.replace(' ', '') |
| return line |
| def parse_filterlist(lines): |
| """Parse filter list from an iterable. |
| :param lines: List of strings or file or other iterable. |
| :returns: Iterator over parsed lines. |
| :raises ParseError: Can be thrown during iteration for invalid lines. |
| """ |
| for line in lines: |
| - yield parse_line(line) |
| + try: |
| + yield parse_line(line) |
| + except ParseError as pe: |
| + yield InvalidLine(line.strip(), str(pe)) |