 Issue 29465715:
  Fixes 4969 - Add parsing of filters  (Closed)
    
  
    Issue 29465715:
  Fixes 4969 - Add parsing of filters  (Closed) 
  | Index: abp/filters/parser.py | 
| =================================================================== | 
| --- a/abp/filters/parser.py | 
| +++ b/abp/filters/parser.py | 
| @@ -13,32 +13,93 @@ | 
| # You should have received a copy of the GNU General Public License | 
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 
| from __future__ import unicode_literals | 
| import re | 
| from collections import namedtuple | 
| -__all__ = ['parse_filterlist', 'parse_line', 'ParseError'] | 
| +__all__ = [ | 
| 
Vasily Kuznetsov
2017/08/02 16:21:18
I updated my formatting rules to include "prefer o
 | 
| + 'FILTER_ACTION', | 
| + 'FILTER_OPTION', | 
| + 'ParseError', | 
| + 'SELECTOR_TYPE', | 
| + 'parse_filterlist', | 
| + 'parse_line', | 
| +] | 
| class ParseError(Exception): | 
| """Exception thrown by the parser when it encounters invalid input. | 
| :param error: Description of the error. | 
| :param text: The text which was being parsed when an error occurred. | 
| """ | 
| def __init__(self, error, text): | 
| Exception.__init__(self, '{} in "{}"'.format(error, text)) | 
| self.text = text | 
| self.error = error | 
| +# Constants related to filters (see https://adblockplus.org/filters). | 
| +class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant). | 
| + """Selector types""" | 
| + URL_PATTERN = 'url-pattern' # Normal URL patterns. | 
| + URL_REGEXP = 'url-regexp' # Regular expressions for URLs. | 
| + CSS = 'css' # CSS selectors for hiding filters. | 
| + XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). | 
| + ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. | 
| + | 
| + | 
| +class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant). | 
| + """Filter actions""" | 
| + BLOCK = 'block' # Block the request. | 
| + ALLOW = 'allow' # Allow the request (whitelist). | 
| + HIDE = 'hide' # Hide selected element(s). | 
| + SHOW = 'show' # Show selected element(s) (whitelist). | 
| + | 
| + | 
| +class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant). | 
| + """Filter options""" | 
| + # Resource types. | 
| + OTHER = 'other' | 
| + SCRIPT = 'script' | 
| + IMAGE = 'image' | 
| + STYLESHEET = 'stylesheet' | 
| + OBJECT = 'object' | 
| + SUBDOCUMENT = 'subdocument' | 
| + DOCUMENT = 'document' | 
| + WEBSOCKET = 'websocket' | 
| + WEBRTC = 'webrtc' | 
| + PING = 'ping' | 
| + XMLHTTPREQUEST = 'xmlhttprequest' | 
| + OBJECT_SUBREQUEST = 'object-subrequest' | 
| + MEDIA = 'media' | 
| + FONT = 'font' | 
| + POPUP = 'popup' | 
| + GENERICBLOCK = 'genericblock' | 
| + ELEMHIDE = 'elemhide' | 
| + GENERICHIDE = 'generichide' | 
| + | 
| + # Deprecated resource types. | 
| + BACKGROUND = 'background' | 
| + XBL = 'xbl' | 
| + DTD = 'dtd' | 
| + | 
| + # Other options. | 
| + MATCH_CASE = 'match-case' | 
| + DOMAIN = 'domain' | 
| + THIRD_PARTY = 'third-party' | 
| + COLLAPSE = 'collapse' | 
| + SITEKEY = 'sitekey' | 
| + DONOTTRACK = 'donottrack' | 
| + | 
| + | 
| def _line_type(name, field_names, format_string): | 
| """Define a line type. | 
| :param name: The name of the line type to define. | 
| :param field_names: A sequence of field names or one space-separated | 
| string that contains all field names. | 
| :param format_string: A format specifier for converting this line type | 
| back to string representation. | 
| @@ -51,25 +112,29 @@ | 
| lt.to_string = lambda self: format_string.format(self) | 
| return lt | 
| Header = _line_type('Header', 'version', '[{.version}]') | 
| EmptyLine = _line_type('EmptyLine', '', '') | 
| Comment = _line_type('Comment', 'text', '! {.text}') | 
| Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 
| -Filter = _line_type('Filter', 'expression', '{.expression}') | 
| +Filter = _line_type('Filter', 'text selector action options', '{.text}') | 
| Include = _line_type('Include', 'target', '%include {0.target}%') | 
| METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') | 
| METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', | 
| 'Version'} | 
| INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 
| HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 
| +HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 
| +FILTER_OPTIONS_REGEXP = re.compile( | 
| + r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' | 
| +) | 
| def _parse_comment(text): | 
| match = METADATA_REGEXP.match(text) | 
| if match and match.group(1) in METADATA_KEYS: | 
| return Metadata(match.group(1), match.group(2)) | 
| return Comment(text[1:].strip()) | 
| @@ -83,16 +148,96 @@ | 
| def _parse_instruction(text): | 
| match = INCLUDE_REGEXP.match(text) | 
| if not match: | 
| raise ParseError('Unrecognized instruction', text) | 
| return Include(match.group(1)) | 
| +def _parse_option(option): | 
| + if '=' in option: | 
| + return option.split('=', 1) | 
| + if option.startswith('~'): | 
| + return option[1:], False | 
| + return option, True | 
| + | 
| + | 
| +def _parse_filter_option(option): | 
| + name, value = _parse_option(option) | 
| + | 
| + # Handle special cases of multivalued options. | 
| + if name == FILTER_OPTION.DOMAIN: | 
| + value = [_parse_option(o) for o in value.split('|')] | 
| + elif name == FILTER_OPTION.SITEKEY: | 
| + value = value.split('|') | 
| + | 
| + return name, value | 
| + | 
| + | 
| +def _parse_filter_options(options): | 
| + return [_parse_filter_option(o) for o in options.split(',')] | 
| + | 
| + | 
| +def _parse_blocking_filter(text): | 
| + # Based on RegExpFilter.fromText in lib/filterClasses.js | 
| + # in https://hg.adblockplus.org/adblockpluscore. | 
| + action = FILTER_ACTION.BLOCK | 
| + options = [] | 
| + selector = text | 
| + | 
| + if selector.startswith('@@'): | 
| + action = FILTER_ACTION.ALLOW | 
| + selector = selector[2:] | 
| + | 
| + if '$' in selector: | 
| + opt_match = FILTER_OPTIONS_REGEXP.search(selector) | 
| + if opt_match: | 
| + selector = selector[:opt_match.start(0)] | 
| + options = _parse_filter_options(opt_match.group(1)) | 
| + | 
| + if (len(selector) > 1 and | 
| + selector.startswith('/') and selector.endswith('/')): | 
| + selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]} | 
| + else: | 
| + selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector} | 
| + | 
| + return Filter(text, selector, action, options) | 
| + | 
| + | 
| +def _parse_hiding_filter(text, domain, type_flag, selector_value): | 
| + selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value} | 
| + action = FILTER_ACTION.HIDE | 
| + options = [] | 
| + | 
| + if type_flag == '@': | 
| + action = FILTER_ACTION.SHOW | 
| + elif type_flag == '?': | 
| + selector['type'] = SELECTOR_TYPE.XCSS | 
| + | 
| + if domain: | 
| + domains = [_parse_option(d) for d in domain.split(',')] | 
| + options.append((FILTER_OPTION.DOMAIN, domains)) | 
| + | 
| + return Filter(text, selector, action, options) | 
| + | 
| + | 
| +def parse_filter(text): | 
| + """Parse one filter. | 
| + | 
| + :param text: Text representation of a filter. | 
| + :returns: Filter object. | 
| + """ | 
| + if '#' in text: | 
| + match = HIDING_FILTER_REGEXP.search(text) | 
| + if match: | 
| + return _parse_hiding_filter(text, *match.groups()) | 
| + return _parse_blocking_filter(text) | 
| + | 
| + | 
| def parse_line(line_text): | 
| """Parse one line of a filter list. | 
| :param line_text: Line of a filter list (must be a unicode string). | 
| :returns: Parsed line object (see `_line_type`). | 
| :raises ParseError: If the line can't be successfully parsed. | 
| """ | 
| content = line_text.strip() | 
| @@ -101,17 +246,17 @@ | 
| line = EmptyLine() | 
| elif content.startswith('!'): | 
| line = _parse_comment(content) | 
| elif content.startswith('%') and content.endswith('%'): | 
| line = _parse_instruction(content) | 
| elif content.startswith('[') and content.endswith(']'): | 
| line = _parse_header(content) | 
| else: | 
| - line = Filter(content) | 
| + line = parse_filter(content) | 
| assert line.to_string().replace(' ', '') == content.replace(' ', '') | 
| return line | 
| def parse_filterlist(lines): | 
| """Parse filter list from an iterable. |