| Index: abp/filters/parser.py |
| =================================================================== |
| --- a/abp/filters/parser.py |
| +++ b/abp/filters/parser.py |
| @@ -13,32 +13,50 @@ |
| # You should have received a copy of the GNU General Public License |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| from __future__ import unicode_literals |
| import re |
| from collections import namedtuple |
| -__all__ = ['parse_filterlist', 'parse_line', 'ParseError'] |
| +__all__ = ['parse_filterlist', 'parse_line', 'ParseError', 'ST', 'FA'] |
| class ParseError(Exception): |
| """Exception thrown by the parser when it encounters invalid input. |
| :param error: Description of the error. |
| :param text: The text which was being parsed when an error occurred. |
| """ |
| def __init__(self, error, text): |
| Exception.__init__(self, '{} in "{}"'.format(error, text)) |
| self.text = text |
| self.error = error |
| +# Constants related to filters (see https://adblockplus.org/filters). |
| +class ST: |

mathias
2017/07/28 16:43:29
Why abbreviating here (ST) and below (FA)?
Vasily Kuznetsov
2017/07/28 17:38:10
To be completely honest, the reason is kind of stu
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
|
| + """Selector types: values stored under the 'type' key of a selector.""" |
| + URL_PATTERN = 'url-pattern' # Normal URL patterns. |
| + URL_REGEXP = 'url-regexp' # Regular expressions for URLs. |
| + CSS = 'css' # CSS selectors for hiding filters. |
| + XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). |
| + ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. |
| + |
| + |
| +class FA: |
| + """Filter actions: values stored in the `action` field of a `Filter`.""" |
| + BLOCK = 'block' # Block the request. |
| + ALLOW = 'allow' # Allow the request (whitelist). |
| + HIDE = 'hide' # Hide selected element(s). |
| + SHOW = 'show' # Show selected element(s) (whitelist). |
| + |
| + |
| def _line_type(name, field_names, format_string): |
| """Define a line type. |
| :param name: The name of the line type to define. |
| :param field_names: A sequence of field names or one space-separated |
| string that contains all field names. |
| :param format_string: A format specifier for converting this line type |
| back to string representation. |
| @@ -51,25 +69,32 @@ |
| lt.to_string = lambda self: format_string.format(self) |
| return lt |
| Header = _line_type('Header', 'version', '[{.version}]') |
| EmptyLine = _line_type('EmptyLine', '', '') |
| Comment = _line_type('Comment', 'text', '! {.text}') |
| Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| -Filter = _line_type('Filter', 'expression', '{.expression}') |
| +# Filter fields: `text` is the original filter text (used for conversion back |
| +# to string), `selector` is a dict with 'type' and 'value' keys, `action` is |
| +# one of the FA constants and `options` is a list of (name, value) tuples. |
| +Filter = _line_type('Filter', 'text selector action options', '{.text}') |
| Include = _line_type('Include', 'target', '%include {0.target}%') |
| METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
| METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
| 'Version'} |
| INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
| +# Element hiding filter: an optional domain list, then '#', an optional flag |
| +# ('@' or '?'), another '#' and the selector. |
| +# Groups: 1 = domains, 2 = flag, 3 = selector. |
| +HFILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |

mathias
2017/07/28 16:43:29
Why abbreviating? What's wrong about HIDING_FILTER
Vasily Kuznetsov
2017/07/28 17:38:10
It's shorter this way. But I don't feel very stron
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
|
| +# Blocking filter whose pattern is itself a regular expression: optional '@@', |
| +# then '/.../', then an optional '$'-introduced option list. |
| +# NOTE(review): not referenced by the code visible in this diff. |
| +BFILTER_REGEXP_REGEXP = re.compile( |

mathias
2017/07/28 16:43:30
I was wondering about the *_REGEXP_REGEXP name, bu
Vasily Kuznetsov
2017/07/28 17:38:10
It's the regular expression for blocking filters w
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
|
| + r'^(@@)?\/.*\/(?:\$~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)?$' |
| +) |
| +# Option list at the end of a blocking filter. Group 1 is the comma-separated |
| +# options without the leading '$'. |
| +BFILTER_OPTIONS_REGEXP = re.compile( |
| + r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' |
| +) |
| def _parse_comment(text): |
| match = METADATA_REGEXP.match(text) |
| if match and match.group(1) in METADATA_KEYS: |
| return Metadata(match.group(1), match.group(2)) |
| return Comment(text[1:].strip()) |
| @@ -83,16 +108,92 @@ |
| def _parse_instruction(text): |
| match = INCLUDE_REGEXP.match(text) |
| if not match: |
| raise ParseError('Unrecognized instruction', text) |
| return Include(match.group(1)) |
| +def _parse_option(option): |
| +    """Parse a single filter option into a ``(name, value)`` pair. |
| + |
| +    :param option: Text of one option, e.g. ``script``, ``~script`` or |
| +                   ``domain=a.com|~b.com``. |
| +    :returns: Tuple ``(name, value)``. ``value`` is the text after the first |
| +              ``=`` if there is one, ``False`` for a negated (``~name``) |
| +              option and ``True`` otherwise. ``domain=...`` is returned as |
| +              ``('domains', [(name, value), ...])`` and ``sitekey=...`` as |
| +              ``('sitekeys', [key, ...])``. |
| +    """ |
| +    has_value = '=' in option |
| +    if has_value: |
| +        name, value = option.split('=', 1) |
| +    elif option.startswith('~'): |
| +        name, value = option[1:], False |
| +    else: |
| +        name, value = option, True |
| + |
| +    # Handle special cases of multivalued options. Only split when a value was |
| +    # actually given: for a bare ``domain`` or ``sitekey`` the value is a |
| +    # boolean and trying to split it would raise AttributeError. |
| +    if has_value and name == 'domain': |

mathias
2017/07/28 16:43:30
Wouldn't it make sense to enumerate recognized OPT
Vasily Kuznetsov
2017/07/28 17:38:10
Yeah, probably makes sense to make some kind of en
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
|
| +        name, value = 'domains', _parse_options(value, '|') |

mathias
2017/07/28 16:43:30
Why using a different / plural key for the parsed
Vasily Kuznetsov
2017/07/28 17:38:10
Because semantically it's a list, always, so calli
Vasily Kuznetsov
2017/07/28 18:57:49
Done.
|
| +    elif has_value and name == 'sitekey': |
| +        name, value = 'sitekeys', value.split('|') |
| + |
| +    return name, value |
| + |
| + |
| +def _parse_options(options, separator=','): |
| +    """Split ``options`` on ``separator`` and parse every piece. |
| + |
| +    :param options: Text containing one or more options. |
| +    :param separator: String that separates the individual options. |
| +    :returns: List with one ``_parse_option`` result per piece, in order. |
| +    """ |
| +    parsed = [] |
| +    for chunk in options.split(separator): |
| +        parsed.append(_parse_option(chunk)) |
| +    return parsed |
| + |
| + |
| +def _parse_blocking_filter(text): |
| +    """Parse the text of a blocking (or exception) filter. |
| + |
| +    :param text: Text representation of the filter. |
| +    :returns: `Filter` whose action is ``FA.ALLOW`` when the text starts with |
| +              ``@@`` and ``FA.BLOCK`` otherwise. |
| +    """ |
| +    # Based on RegExpFilter.fromText in lib/filterClasses.js |
| +    # in https://hg.adblockplus.org/adblockpluscore. |
| +    is_exception = text.startswith('@@') |
| +    pattern = text[2:] if is_exception else text |
| +    action = FA.ALLOW if is_exception else FA.BLOCK |
| + |
| +    # Cut a trailing '$option,...' list off the pattern, if there is one. |
| +    options = [] |
| +    found = BFILTER_OPTIONS_REGEXP.search(pattern) if '$' in pattern else None |
| +    if found: |
| +        options = _parse_options(found.group(1)) |
| +        pattern = pattern[:found.start(0)] |
| + |
| +    # A pattern wrapped in slashes is a regular expression; a lone '/' is not. |
| +    is_regexp = (len(pattern) > 1 and |
| +                 pattern.startswith('/') and pattern.endswith('/')) |
| +    if is_regexp: |
| +        selector = {'type': ST.URL_REGEXP, 'value': pattern[1:-1]} |
| +    else: |
| +        selector = {'type': ST.URL_PATTERN, 'value': pattern} |
| + |
| +    return Filter(text, selector, action, options) |
| + |
| + |
| +def _parse_hiding_filter(text, domains, type_flag, selector_value): |
| +    """Build a `Filter` from the parts of an element hiding filter. |
| + |
| +    :param text: Complete text of the filter. |
| +    :param domains: Comma-separated domain list (may be empty). |
| +    :param type_flag: ``'@'`` for an exception, ``'?'`` for an extended CSS |
| +                      selector, anything else for a plain hiding filter. |
| +    :param selector_value: Selector part of the filter. |
| +    :returns: `Filter` with action ``FA.SHOW`` or ``FA.HIDE``. |
| +    """ |
| +    if type_flag == '?': |
| +        selector_type = ST.XCSS |
| +    else: |
| +        selector_type = ST.CSS |
| +    action = FA.SHOW if type_flag == '@' else FA.HIDE |
| + |
| +    # The domain list, when present, is reported as a single 'domains' option. |
| +    options = [('domains', _parse_options(domains))] if domains else [] |
| + |
| +    selector = {'type': selector_type, 'value': selector_value} |
| +    return Filter(text, selector, action, options) |
| + |
| + |
| +def parse_filter(text): |
| +    """Parse one filter. |
| + |
| +    Text that matches the element hiding syntax is parsed as a hiding filter, |
| +    everything else as a blocking filter. |
| + |
| +    :param text: Text representation of a filter. |
| +    :returns: Filter object. |

mathias
2017/07/28 16:43:29
I think this should be upper-case "Filter".
Vasily Kuznetsov
2017/07/28 17:38:10
Yes.
|
| +    """ |
| +    hiding_match = HFILTER_REGEXP.search(text) if '#' in text else None |
| +    if hiding_match is None: |
| +        return _parse_blocking_filter(text) |
| +    domains, type_flag, selector_value = hiding_match.groups() |
| +    return _parse_hiding_filter(text, domains, type_flag, selector_value) |
| + |
| + |
| def parse_line(line_text): |
| """Parse one line of a filter list. |
| :param line_text: Line of a filter list (must be a unicode string). |
| :returns: Parsed line object (see `_line_type`). |
| :raises ParseError: If the line can't be successfully parsed. |
| """ |
| content = line_text.strip() |
| @@ -101,17 +202,17 @@ |
| line = EmptyLine() |
| elif content.startswith('!'): |
| line = _parse_comment(content) |
| elif content.startswith('%') and content.endswith('%'): |
| line = _parse_instruction(content) |
| elif content.startswith('[') and content.endswith(']'): |
| line = _parse_header(content) |
| else: |
| - line = Filter(content) |
| + line = parse_filter(content) |
| assert line.to_string().replace(' ', '') == content.replace(' ', '') |
| return line |
| def parse_filterlist(lines): |
| """Parse filter list from an iterable. |