abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Unified Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Patch Set: Created June 14, 2017, 5:32 p.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: abp/filters/parser.py

===================================================================

--- a/abp/filters/parser.py

+++ b/abp/filters/parser.py

@@ -13,84 +13,228 @@

# You should have received a copy of the GNU General Public License

# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import re

from collections import namedtuple

# Public API of the module.  `ParseError` stays exported because `parse_line`
# still raises it for malformed headers and instructions.
__all__ = ['parse_filterlist', 'parse_line', 'parse_filter', 'ParseError']

class ParseError(Exception):
    """Exception used by the parser to signal invalid input.

    :param error: Description of the error.
    :param text: Optional text which was being parsed when the error
                 occurred.  When given, it is appended to the message.
    """

    def __init__(self, error, text=None):
        # Both call styles are supported: the original `(error, text)` form
        # and the shorter `(error)` form used by the internal parsers.
        if text is None:
            message = error
        else:
            message = '{} in "{}"'.format(error, text)
        Exception.__init__(self, message)
        self.error = error
        self.text = text

mathias 2017/07/26 20:37:15 Removing the custom __init__ function looks like a

def line_type(name, field_names, format_string):
    """Create a class representing one kind of filter list line.

    :param name: The name of the line type to define.
    :param field_names: A sequence of field names or one space-separated
                        string that contains all field names.
    :param format_string: A format specifier for converting this line type
                          back to string representation.
    :returns: Class created with `namedtuple` that has `.type` set to
              lowercased `name` and supports conversion back to string with
              `.to_string()` method.
    """
    # NOTE(review): mathias (2017/07/26 20:37:15) commented on the added
    # `format_string` documentation: "Fixing the missing format_string
    # parameter documen..." (comment truncated in the review view).

    def to_string(self):
        # The instance is passed as the single positional argument so that
        # format fields such as `{.text}` or `{0.key}` read its attributes.
        return format_string.format(self)

    line_class = namedtuple(name, field_names)
    line_class.type = name.lower()
    line_class.to_string = to_string
    return line_class

# Line types produced by the parser.  Each one is a namedtuple class built by
# `line_type`; the format string controls how `.to_string()` renders it.
InvalidLine = line_type(
    name='Invalid', field_names='text error', format_string='{.text}')
Header = line_type(
    name='Header', field_names='version', format_string='[{.version}]')
EmptyLine = line_type(
    name='EmptyLine', field_names='', format_string='')
Comment = line_type(
    name='Comment', field_names='text', format_string='! {.text}')
Metadata = line_type(
    name='Metadata', field_names='key value',
    format_string='! {0.key}: {0.value}')
Include = line_type(
    name='Include', field_names='target',
    format_string='%include {0.target}%')
# A filter keeps its original text and renders back to exactly that text.
Filter = line_type(
    name='Filter', field_names='text selector action options',
    format_string='{.text}')

# `! Key: value` comments; only keys listed in METADATA_KEYS count as metadata.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
                 'Version'}

# `%include target%` instructions.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

# `[Adblock]` / `[Adblock Plus x.y]` headers (case-insensitive).
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)

# Trailing `$option,~option,name=value` part of a blocking filter.
# Group 1 is the comma-separated option list without the leading `$`.
BFILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$'
)

# Element hiding filters.  Groups:
#   1 - comma-separated domain list (may be empty),
#   2 - `@` when the filter is an exception (`#@#`),
#   3 - tag name or `*` of an old-style simple selector,
#   4 - zero or more `(attr=value)` constraints of a simple selector,
#   5 - CSS selector of a `##selector` filter.
HFILTER_REGEXP = re.compile(
    r'^([^\/\*\|\@"!]*?)#(\@)?(?:([\w\-]+|\*)'
    r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)|#([^{}]+))$'
)

# Types of resources to block (based on adblockpluscore/lib/filterClasses.js).
TYPES = {
    'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',
    'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',
    'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',
}

# Special types used for whitelisting.
TYPES_WHITELIST = {
    'document', 'elemhide', 'generichide', 'genericblock',
}

# By default blocking filters apply to everything except whitelist-only types
# and popups (based on adblockpluscore/lib/filterClasses.js).
TYPES_DEFAULT = TYPES - TYPES_WHITELIST - {'popup'}

# Type options that are synonyms for other types.
TYPE_SYNONYMS = {
    'xbl': 'other',
    'dtd': 'other',
    'background': 'image',
}

def _parse_comment(text):
    """Parse a line starting with `!` into `Metadata` or `Comment`.

    :param text: Stripped line content beginning with `!`.
    :returns: `Metadata` when the line has the `! Key: value` shape and the
              key is one of `METADATA_KEYS`, otherwise a `Comment` holding
              the text after the leading `!` with whitespace stripped.
    """
    found = METADATA_REGEXP.match(text)
    if found:
        key, value = found.group(1), found.group(2)
        if key in METADATA_KEYS:
            return Metadata(key, value)
    # Anything else, including unknown metadata keys, is a plain comment.
    comment_body = text[1:]
    return Comment(comment_body.strip())

def _parse_header(text):
    """Parse a `[...]` line into a `Header`.

    :param text: Stripped line content enclosed in square brackets.
    :returns: `Header` carrying the version string captured by
              `HEADER_REGEXP`.
    :raises ParseError: If the text does not match `HEADER_REGEXP`.
    """
    found = HEADER_REGEXP.match(text)
    if found:
        return Header(found.group(1))
    # NOTE(review): mathias (2017/07/26 20:37:15) asked why the malformed text
    # is no longer included in the error; Vasily Kuznetsov's reply
    # (2017/07/27 11:05:02) is truncated in the review view.
    raise ParseError('Malformed header')

def _parse_instruction(text):
    """Parse a `%...%` line into an `Include` instruction.

    :param text: Stripped line content enclosed in percent signs.
    :returns: `Include` with the target captured by `INCLUDE_REGEXP`.
    :raises ParseError: If the text does not match `INCLUDE_REGEXP`.
    """
    found = INCLUDE_REGEXP.match(text)
    if found:
        return Include(found.group(1))
    raise ParseError('Unrecognized instruction')

+def _separate_domains(domains):

+ options = {}

+ for d in domains:

+ if d.startswith('~'):

+ options.setdefault('domains-exclude', []).append(d.lstrip('~'))

+ else:

+ options.setdefault('domains-include', []).append(d)

+ if 'domains-include' in options:

+ options['domains-none'] = True

+ return options

+def _separate_types(types):

+ """Convert a list of `(type, on_off)` tuples to options:

+ - types-none: True if we start with nothing included, absent if we start

+ with TYPES_DEFAULT included.

+ - types-include: List of additional included types.

+ - types-exclude: List of excluded types.

+ """

+ if not types:

+ return {}

+ if types[0][1]: # If the first type is ON, we start with nothing...

+ types_default = set()

+ options = {'types-none': True}

+ else: # ...otherwise we start with default type set.

+ types_default = TYPES_DEFAULT

+ options = {}

+ # Include/exclude any deviations from default.

+ for name, value in dict(types).items():

+ if value and name not in types_default:

+ options.setdefault('types-include', []).append(name)

+ if not value and name in types_default:

+ options.setdefault('types-exclude', []).append(name)

+ return options

def _parse_hiding_filter(text, match):
    """Build a `Filter` for an element hiding filter.

    :param text: Original filter text.
    :param match: Match object produced by `HFILTER_REGEXP` for `text`.
    :returns: `Filter` whose selector is a dict with `type` (`css` or
              `abp-simple`) and `value`, whose action is `show` for
              exception filters and `hide` otherwise, and whose options
              come from the domain list before the `#`.
    """
    domain_list, exception_mark, tag, constraints, css = match.group(
        1, 2, 3, 4, 5)

    if css:
        selector = {'type': 'css', 'value': css}
    else:
        # Old-style selector: tag name followed by `(attr=value)` constraints.
        selector = {'type': 'abp-simple', 'value': tag + constraints}

    if exception_mark:
        action = 'show'
    else:
        action = 'hide'

    # Drop empty items produced by splitting an empty or sloppy domain list.
    domains = [d for d in domain_list.split(',') if d]
    return Filter(text, selector, action, _separate_domains(domains))

def _parse_filter_options(options):
    """Parse the comma-separated option list of a blocking filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js in
    adblockpluscore.

    :param options: Option string without the leading `$`.
    :returns: Dict of parsed options.  Type options are summarised by
              `_separate_types`, `domain` by `_separate_domains`, `sitekey`
              becomes a `sitekeys` list; any other option is stored under
              its own name with its value (True, False or the string after
              `=`).
    :raises ParseError: If `domain` or `sitekey` is given without a value.
    """
    parsed_options = {}
    type_options = []

    for option in options.split(','):
        if '=' in option:
            name, value = option.split('=', 1)
        elif option.startswith('~'):
            name, value = option[1:], False
        else:
            name, value = option, True

        name = TYPE_SYNONYMS.get(name, name)

        if name in TYPES:
            type_options.append((name, value))
        elif name in ('domain', 'sitekey'):
            # These options are lists separated by `|`; a bare `domain`,
            # `~domain` or `sitekey` carries a boolean and cannot be split.
            if isinstance(value, bool):
                raise ParseError(
                    'Option "{}" requires a value'.format(name))
            items = value.split('|')
            if name == 'domain':
                parsed_options.update(_separate_domains(items))
            else:
                parsed_options['sitekeys'] = items
        else:
            parsed_options[name] = value

    parsed_options.update(_separate_types(type_options))
    return parsed_options

def _parse_blocking_filter(text):
    """Build a `Filter` for a blocking or exception (`@@`) filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js in
    adblockpluscore.

    :param text: Original filter text.
    :returns: `Filter` whose action is `allow` for `@@` filters and `block`
              otherwise, whose selector is a dict with `type`
              (`url-regexp` or `url-pattern`) and `value`, and whose options
              come from the trailing `$...` part, if any.
    """
    # NOTE(review): mathias (2017/07/26 20:37:15) suggested named symbols for
    # the actions ("symbols like BFILTER_ACTION...", truncated) and for the
    # selector types ("symbols like SELECTOR_...", truncated); Vasily
    # Kuznetsov (2017/07/27 11:05:02) replied "Probably not these exact names
    # for the constants, ..." (truncated) and acknowledged the second point.
    if text.startswith('@@'):
        action, pattern = 'allow', text[2:]
    else:
        action, pattern = 'block', text

    options = {}
    # Only run the options regexp when a `$` is present at all.
    opt_match = BFILTER_OPTIONS_REGEXP.search(pattern) if '$' in pattern else None
    if opt_match:
        options = _parse_filter_options(opt_match.group(1))
        pattern = pattern[:opt_match.start(0)]

    # A pattern wrapped in slashes (and longer than a lone `/`) is a regexp.
    is_regexp = (
        len(pattern) > 1 and
        pattern.startswith('/') and
        pattern.endswith('/')
    )
    if is_regexp:
        selector = {'type': 'url-regexp', 'value': pattern[1:-1]}
    else:
        selector = {'type': 'url-pattern', 'value': pattern}

    return Filter(text, selector, action, options)

def parse_filter(text):
    """Parse one filter.

    :param text: Text representation of a filter.
    :returns: filter object.
    """
    # NOTE(review): mathias (2017/07/26 20:37:15) and Vasily Kuznetsov
    # (2017/07/27 11:05:02) discussed the `match` variable changing type
    # (both comments are truncated in the review view).
    #
    # Text without `#` cannot be a hiding filter, so the regexp is skipped.
    if '#' in text:
        hiding_match = HFILTER_REGEXP.match(text)
        if hiding_match:
            return _parse_hiding_filter(text, hiding_match)
    return _parse_blocking_filter(text)

def parse_line(line_text):

"""Parse one line of a filter list.

:param line_text: Line of a filter list (must be a unicode string).

:returns: Parsed line object (see `line_type`).

:raises ParseError: If the line can't be successfully parsed.

"""

content = line_text.strip()

@@ -99,23 +243,26 @@

line = EmptyLine()

elif content.startswith('!'):

line = _parse_comment(content)

elif content.startswith('%') and content.endswith('%'):

line = _parse_instruction(content)

elif content.startswith('[') and content.endswith(']'):

line = _parse_header(content)

else:

- line = Filter(content)

+ line = parse_filter(content)

assert line.to_string().replace(' ', '') == content.replace(' ', '')

return line

def parse_filterlist(lines):
    """Parse filter list from an iterable.

    Lines that cannot be parsed do not abort the iteration: they are
    yielded as `InvalidLine` objects carrying the stripped line text and
    the error description.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines.
    """
    for line in lines:
        # Keep the `try` limited to the parsing call so that exceptions
        # thrown into the generator at the `yield` are not swallowed here.
        try:
            parsed = parse_line(line)
        except ParseError as pe:
            parsed = InvalidLine(line.strip(), str(pe))
        yield parsed

« no previous file with comments | « no previous file | setup.py » ('j') | setup.py » ('J')