Index: abp/filters/parser.py |
=================================================================== |
--- a/abp/filters/parser.py |
+++ b/abp/filters/parser.py |
@@ -13,32 +13,91 @@ |
# You should have received a copy of the GNU General Public License |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
from __future__ import unicode_literals |
import re |
from collections import namedtuple |
-__all__ = ['parse_filterlist', 'parse_line', 'ParseError'] |
+__all__ = ['parse_filterlist', 'parse_line', 'ParseError', |
+ 'SELECTOR_TYPE', 'FILTER_ACTION', 'FILTER_OPTION'] |
class ParseError(Exception): |
"""Exception thrown by the parser when it encounters invalid input. |
:param error: Description of the error. |
:param text: The text which was being parsed when an error occurred. |
""" |
def __init__(self, error, text): |
Exception.__init__(self, '{} in "{}"'.format(error, text)) |
self.text = text |
self.error = error |
+# Constants related to filters (see https://adblockplus.org/filters). |
+class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant). |
+ """Selector types""" |
+ URL_PATTERN = 'url-pattern' # Normal URL patterns. |
+ URL_REGEXP = 'url-regexp' # Regular expressions for URLs. |
+ CSS = 'css' # CSS selectors for hiding filters. |
+ XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4). |
+ ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax. |
+ |
+ |
+class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant). |
+ """Filter actions""" |
+ BLOCK = 'block' # Block the request. |
+ ALLOW = 'allow' # Allow the request (whitelist). |
+ HIDE = 'hide' # Hide selected element(s). |
+ SHOW = 'show' # Show selected element(s) (whitelist). |
+ |
+ |
+class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant). |
+ """Filter options""" |
+ # Resource types. |
+ OTHER = 'other' |
+ SCRIPT = 'script' |
+ IMAGE = 'image' |
+ STYLESHEET = 'stylesheet' |
+ OBJECT = 'object' |
+ SUBDOCUMENT = 'subdocument' |
+ DOCUMENT = 'document' |
+ WEBSOCKET = 'websocket' |
+ WEBRTC = 'webrtc' |
+ PING = 'ping' |
+ XMLHTTPREQUEST = 'xmlhttprequest' |
+ OBJECT_SUBREQUEST = 'object-subrequest' |
+ MEDIA = 'media' |
+ FONT = 'font' |
+ POPUP = 'popup' |
+ GENERICBLOCK = 'genericblock' |
+ ELEMHIDE = 'elemhide' |
+ GENERICHIDE = 'generichide' |
+ |
+ # Deprecated resource types. |
+ BACKGROUND = 'background' |
+ XBL = 'xbl' |
+ DTD = 'dtd' |
+ |
+ # Other options. |
+ MATCH_CASE = 'match-case' |
+ DOMAIN = 'domain' |
+ THIRD_PARTY = 'third-party' |
+ COLLAPSE = 'collapse' |
+ SITEKEY = 'sitekey' |
+ DONOTTRACK = 'donottrack' |
+ |
+ |
+ALL_OPTIONS = {opt for name, opt in vars(FILTER_OPTION).items() |
+ if not name.startswith('__')} |
+ |
+ |
def _line_type(name, field_names, format_string): |
"""Define a line type. |
:param name: The name of the line type to define. |
:param field_names: A sequence of field names or one space-separated |
string that contains all field names. |
:param format_string: A format specifier for converting this line type |
back to string representation. |
@@ -51,25 +110,29 @@ |
lt.to_string = lambda self: format_string.format(self) |
return lt |
Header = _line_type('Header', 'version', '[{.version}]') |
EmptyLine = _line_type('EmptyLine', '', '') |
Comment = _line_type('Comment', 'text', '! {.text}') |
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
-Filter = _line_type('Filter', 'expression', '{.expression}') |
+Filter = _line_type('Filter', 'text selector action options', '{.text}') |
Include = _line_type('Include', 'target', '%include {0.target}%') |
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)') |
METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect', |
'Version'} |
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
+HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
+FILTER_OPTIONS_REGEXP = re.compile( |
+ r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$' |
+) |
def _parse_comment(text): |
match = METADATA_REGEXP.match(text) |
if match and match.group(1) in METADATA_KEYS: |
return Metadata(match.group(1), match.group(2)) |
return Comment(text[1:].strip()) |
@@ -83,16 +146,99 @@ |
def _parse_instruction(text): |
match = INCLUDE_REGEXP.match(text) |
if not match: |
raise ParseError('Unrecognized instruction', text) |
return Include(match.group(1)) |
+def _parse_option(option): |
+ if '=' in option: |
+ return option.split('=', 1) |
+ if option.startswith('~'): |
+ return option[1:], False |
+ return option, True |
+ |
+ |
+def _parse_filter_option(option): |
+ name, value = _parse_option(option) |
+ |
+ if name not in ALL_OPTIONS: |
mathias
2017/08/01 06:31:35
I don't think this part of the code should validat
Vasily Kuznetsov
2017/08/02 16:21:17
Following our conversation, I agree. Done
|
+ raise ParseError('Unrecognized option', name) |
+ |
+ # Handle special cases of multivalued options. |
+ if name == FILTER_OPTION.DOMAIN: |
+ value = [_parse_option(o) for o in value.split('|')] |
+ elif name == FILTER_OPTION.SITEKEY: |
+ value = value.split('|') |
+ |
+ return name, value |
+ |
+ |
+def _parse_filter_options(options, separator=','): |
mathias
2017/08/01 06:31:35
Why is the separator a parameter? The only place w
Vasily Kuznetsov
2017/08/02 16:21:17
This is left-over from an earlier version that use
|
+ return [_parse_filter_option(o) for o in options.split(separator)] |
+ |
+ |
+def _parse_blocking_filter(text): |
+ # Based on RegExpFilter.fromText in lib/filterClasses.js |
+ # in https://hg.adblockplus.org/adblockpluscore. |
+ action = FILTER_ACTION.BLOCK |
+ options = [] |
+ selector = text |
+ |
+ if selector.startswith('@@'): |
+ action = FILTER_ACTION.ALLOW |
+ selector = selector[2:] |
+ |
+ if '$' in selector: |
+ opt_match = FILTER_OPTIONS_REGEXP.search(selector) |
+ if opt_match: |
+ selector = selector[:opt_match.start(0)] |
+ options = _parse_filter_options(opt_match.group(1)) |
+ |
+ if (len(selector) > 1 and |
+ selector.startswith('/') and selector.endswith('/')): |
+ selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]} |
+ else: |
+ selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector} |
+ |
+ return Filter(text, selector, action, options) |
+ |
+ |
+def _parse_hiding_filter(text, domain, type_flag, selector_value): |
+ selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value} |
+ action = FILTER_ACTION.HIDE |
+ options = [] |
+ |
+ if type_flag == '@': |
+ action = FILTER_ACTION.SHOW |
+ elif type_flag == '?': |
+ selector['type'] = SELECTOR_TYPE.XCSS |
+ |
+ if domain: |
+ domains = [_parse_option(d) for d in domain.split(',')] |
+ options.append((FILTER_OPTION.DOMAIN, domains)) |
+ |
+ return Filter(text, selector, action, options) |
+ |
+ |
+def parse_filter(text): |
+ """Parse one filter. |
+ |
+ :param text: Text representation of a filter. |
+ :returns: Filter object. |
+ """ |
+ if '#' in text: |
+ match = HIDING_FILTER_REGEXP.search(text) |
+ if match: |
+ return _parse_hiding_filter(text, *match.groups()) |
+ return _parse_blocking_filter(text) |
+ |
+ |
def parse_line(line_text): |
"""Parse one line of a filter list. |
:param line_text: Line of a filter list (must be a unicode string). |
:returns: Parsed line object (see `_line_type`). |
:raises ParseError: If the line can't be successfully parsed. |
""" |
content = line_text.strip() |
@@ -101,17 +247,17 @@ |
line = EmptyLine() |
elif content.startswith('!'): |
line = _parse_comment(content) |
elif content.startswith('%') and content.endswith('%'): |
line = _parse_instruction(content) |
elif content.startswith('[') and content.endswith(']'): |
line = _parse_header(content) |
else: |
- line = Filter(content) |
+ line = parse_filter(content) |
assert line.to_string().replace(' ', '') == content.replace(' ', '') |
return line |
def parse_filterlist(lines): |
"""Parse filter list from an iterable. |