Index: abp/filters/parser.py |
=================================================================== |
--- a/abp/filters/parser.py |
+++ b/abp/filters/parser.py |
@@ -13,84 +13,228 @@ |
# You should have received a copy of the GNU General Public License |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
from __future__ import unicode_literals |
import re |
from collections import namedtuple |
# Public API of the module. ParseError stays exported because parse_line
# still raises it, so callers need the name to catch it.
__all__ = ['parse_filterlist', 'parse_line', 'parse_filter', 'ParseError']
class ParseError(Exception):
    """Signal that the parser was given input it cannot parse.

    The exception carries only the message it is constructed with, as any
    plain `Exception` subclass does.
    """
mathias
2017/07/26 20:37:15
Removing the custom __init__ function looks like a
|
def line_type(name, field_names, format_string):
    """Create a namedtuple-based class for one kind of filter list line.

    :param name: Name of the line type; it becomes the class name.
    :param field_names: A sequence of field names or one space-separated
                        string that contains all field names.
    :param format_string: `str.format` template that receives the instance
                          as its single positional argument and renders it
                          back to its string representation.
    :returns: Class created with `namedtuple` whose `.type` is `name`
              lowercased and whose `.to_string()` method renders the
              instance with `format_string`.
    """
    def to_string(self):
        # The instance is passed as argument 0, so templates can refer to
        # its fields as `{0.field}` (or `{.field}`).
        return format_string.format(self)

    line_class = namedtuple(name, field_names)
    line_class.type = name.lower()
    line_class.to_string = to_string
    return line_class
# Line types produced by the parser. `format_string` is the template that
# `.to_string()` uses to turn a parsed line back into text.
InvalidLine = line_type(
    name='Invalid', field_names='text error', format_string='{.text}')
Header = line_type(
    name='Header', field_names='version', format_string='[{.version}]')
EmptyLine = line_type(
    name='EmptyLine', field_names='', format_string='')
Comment = line_type(
    name='Comment', field_names='text', format_string='! {.text}')
Metadata = line_type(
    name='Metadata', field_names='key value',
    format_string='! {0.key}: {0.value}')
Include = line_type(
    name='Include', field_names='target',
    format_string='%include {0.target}%')
# A filter keeps its original text (used for rendering) next to the parsed
# selector, action and options.
Filter = line_type(
    name='Filter', field_names='text selector action options',
    format_string='{.text}')
# "! Key: value" comment lines; group 1 is the key, group 2 the value.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
# Only comments whose key is listed here are treated as metadata.
METADATA_KEYS = {
    'Homepage',
    'Title',
    'Expires',
    'Checksum',
    'Redirect',
    'Version',
}
# "%include target%" instructions; group 1 is the target.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# "[Adblock]" / "[Adblock Plus x.y]" headers, matched case-insensitively.
HEADER_REGEXP = re.compile(
    r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]',
    flags=re.IGNORECASE,
)
# Trailing "$opt1,~opt2,opt3=value" part of a blocking filter; group 1 is
# the comma-separated option list without the leading "$".
BFILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$'
)
# Element hiding filters. Groups:
#   1 - text before the first "#" (comma-separated domain list),
#   2 - "@" when present (exception rule),
#   3 - tag name or "*" and 4 - parenthesised attribute rules
#       (old simplified syntax), or
#   5 - selector following the second "#".
HFILTER_REGEXP = re.compile(
    r'^([^\/\*\|\@"!]*?)#(\@)?(?:([\w\-]+|\*)'
    r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)|#([^{}]+))$'
)
+ |
# Types of resources to block (based on adblockpluscore/lib/filterClasses.js).
TYPES = {
    'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',
    'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',
    'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',
}

# Special types used for whitelisting.
TYPES_WHITELIST = {
    'document', 'elemhide', 'generichide', 'genericblock',
}

# By default blocking filters apply to everything except whitelist-only types
# and popups (based on adblockpluscore/lib/filterClasses.js). 'popup' is not
# a whitelist-only type, so it has to be removed from the default explicitly.
TYPES_DEFAULT = TYPES - TYPES_WHITELIST - {'popup'}

# Type options that are synonyms for other types.
TYPE_SYNONYMS = {
    'xbl': 'other',
    'dtd': 'other',
    'background': 'image',
}
def _parse_comment(text):
    """Parse a comment line into a `Metadata` or a `Comment` object.

    :param text: Comment line, including the leading "!".
    :returns: `Metadata` when the line matches `METADATA_REGEXP` with a key
              listed in `METADATA_KEYS`, otherwise `Comment`.
    """
    found = METADATA_REGEXP.match(text)
    if found is None or found.group(1) not in METADATA_KEYS:
        # Plain comment: drop the first character and surrounding whitespace.
        return Comment(text[1:].strip())
    key, value = found.groups()
    return Metadata(key, value)
def _parse_header(text):
    """Parse a filter list header line.

    :param text: Header line text.
    :returns: `Header` holding the text captured by `HEADER_REGEXP`.
    :raises ParseError: If `text` does not match `HEADER_REGEXP`.
    """
    found = HEADER_REGEXP.match(text)
    if found is None:
        # NOTE(review): the offending text is not part of the message; a
        # reviewer questioned this. Presumably callers already hold the text
        # (parse_filterlist stores it in InvalidLine) -- confirm.
        raise ParseError('Malformed header')
    return Header(found.group(1))
def _parse_instruction(text):
    """Parse a `%...%` instruction line.

    :param text: Instruction line text.
    :returns: `Include` holding the target captured by `INCLUDE_REGEXP`.
    :raises ParseError: If `text` does not match `INCLUDE_REGEXP`.
    """
    found = INCLUDE_REGEXP.match(text)
    if found is None:
        raise ParseError('Unrecognized instruction')
    return Include(found.group(1))
def _separate_domains(domains):
    """Split a list of domain specifiers into include/exclude options.

    :param domains: Iterable of domain strings; a leading "~" marks a domain
                    as excluded.
    :returns: Dict with the following optional keys:

              - domains-include: list of domains without "~" prefix,
              - domains-exclude: list of "~"-prefixed domains (prefix removed),
              - domains-none: True, present only when at least one domain is
                included.

              Empty entries (such as those produced by a trailing separator
              or a lone "~") are ignored.
    """
    options = {}
    for d in domains:
        if d.startswith('~'):
            key, name = 'domains-exclude', d.lstrip('~')
        else:
            key, name = 'domains-include', d
        if not name:
            # Nothing left to include or exclude; don't record ''.
            continue
        options.setdefault(key, []).append(name)
    if 'domains-include' in options:
        options['domains-none'] = True
    return options
+ |
+ |
def _separate_types(types):
    """Convert a list of `(type, on_off)` tuples to options.

    :param types: List of `(name, value)` tuples in the order in which the
                  type options appeared; `value` is truthy for enabled types.
    :returns: Dict with the following optional keys:

              - types-none: True if we start with nothing included, absent
                if we start with TYPES_DEFAULT included.
              - types-include: List of additional included types.
              - types-exclude: List of excluded types.

              When a type is listed more than once its last value wins. The
              lists follow the order of first appearance in `types`, so the
              result is the same on every Python version.
    """
    if not types:
        return {}

    if types[0][1]:  # If the first type is ON, we start with nothing...
        baseline = set()
        options = {'types-none': True}
    else:  # ...otherwise we start with default type set.
        baseline = TYPES_DEFAULT
        options = {}

    # Collapse duplicates while remembering the order of first appearance
    # (a plain dict does not guarantee iteration order on older Pythons).
    final_state = {}
    seen_order = []
    for name, value in types:
        if name not in final_state:
            seen_order.append(name)
        final_state[name] = value

    # Include/exclude any deviations from the baseline.
    for name in seen_order:
        value = final_state[name]
        if value and name not in baseline:
            options.setdefault('types-include', []).append(name)
        if not value and name in baseline:
            options.setdefault('types-exclude', []).append(name)

    return options
+ |
+ |
def _parse_hiding_filter(text, match):
    """Build a `Filter` from a successful `HFILTER_REGEXP` match.

    :param text: Full text of the filter.
    :param match: Match object produced by `HFILTER_REGEXP` for `text`.
    :returns: `Filter` whose selector is of type 'css' or 'abp-simple',
              whose action is 'show' or 'hide' and whose options come from
              the domain list in front of the "#".
    """
    domains_part, exception, tag, attr_rules, css = match.groups()

    if css:
        selector = {'type': 'css', 'value': css}
    else:
        # Old simplified syntax: tag name followed by attribute rules.
        selector = {'type': 'abp-simple', 'value': tag + attr_rules}

    action = 'show' if exception else 'hide'
    domains = [domain for domain in domains_part.split(',') if domain]
    return Filter(text, selector, action, _separate_domains(domains))
+ |
+ |
def _parse_filter_options(options):
    """Parse the option list of a blocking filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js
    in adblockpluscore.

    :param options: Comma-separated options (the text after "$").
    :returns: Dict of parsed options: the keys produced by
              `_separate_domains` and `_separate_types`, 'sitekeys' (list)
              and any other option stored under its own name with its
              string value, True (plain option) or False ("~" prefix).
    :raises ParseError: If `domain` or `sitekey` is given without a value.
    """
    parsed_options = {}
    type_options = []

    for option in options.split(','):
        if '=' in option:
            name, value = option.split('=', 1)
        elif option.startswith('~'):
            name, value = option[1:], False
        else:
            name, value = option, True

        name = TYPE_SYNONYMS.get(name, name)
        if name in TYPES:
            type_options.append((name, value))
        elif name in ('domain', 'sitekey'):
            if isinstance(value, bool):
                # "$domain" / "$~sitekey" etc. carry no list to split;
                # report invalid input instead of failing on bool.split().
                raise ParseError(
                    'Option "{}" requires a value'.format(name))
            if name == 'domain':
                parsed_options.update(_separate_domains(value.split('|')))
            else:
                parsed_options['sitekeys'] = value.split('|')
        else:
            parsed_options[name] = value

    # Type options are order-sensitive, so they are resolved together.
    parsed_options.update(_separate_types(type_options))
    return parsed_options
+ |
+ |
def _parse_blocking_filter(text):
    """Build a `Filter` from the text of a blocking or exception filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js
    in adblockpluscore.

    :param text: Full text of the filter.
    :returns: `Filter` with action 'allow' (text starts with "@@") or
              'block', a selector of type 'url-regexp' or 'url-pattern' and
              the options parsed from the trailing "$..." part, if any.
    """
    # NOTE(review): reviewers suggested named constants for the action and
    # selector type strings used below; the author acknowledged this.
    is_exception = text.startswith('@@')
    pattern = text[2:] if is_exception else text

    options = {}
    if '$' in pattern:
        found = BFILTER_OPTIONS_REGEXP.search(pattern)
        if found:
            # Cut the options off; only the URL part stays in the pattern.
            pattern = pattern[:found.start(0)]
            options = _parse_filter_options(found.group(1))

    # "/.../" (with something that can be sliced between the slashes) is a
    # regular expression, anything else is a URL pattern.
    if len(pattern) > 1 and pattern[0] == '/' and pattern[-1] == '/':
        selector = {'type': 'url-regexp', 'value': pattern[1:-1]}
    else:
        selector = {'type': 'url-pattern', 'value': pattern}

    action = 'allow' if is_exception else 'block'
    return Filter(text, selector, action, options)
+ |
+ |
def parse_filter(text):
    """Parse one filter.

    Text that contains "#" and matches `HFILTER_REGEXP` is parsed as an
    element hiding filter; everything else is parsed as a blocking filter.

    :param text: Text representation of a filter.
    :returns: filter object.
    """
    # The substring test is a cheap way to skip the regexp for filters that
    # cannot be hiding filters.
    if '#' in text:
        match = HFILTER_REGEXP.match(text)
        if match:
            return _parse_hiding_filter(text, match)
    return _parse_blocking_filter(text)
+ |
+ |
def parse_line(line_text): |
"""Parse one line of a filter list. |
:param line_text: Line of a filter list (must be a unicode string). |
:returns: Parsed line object (see `line_type`). |
:raises ParseError: If the line can't be successfully parsed. |
""" |
content = line_text.strip() |
@@ -99,23 +243,26 @@ |
line = EmptyLine() |
elif content.startswith('!'): |
line = _parse_comment(content) |
elif content.startswith('%') and content.endswith('%'): |
line = _parse_instruction(content) |
elif content.startswith('[') and content.endswith(']'): |
line = _parse_header(content) |
else: |
- line = Filter(content) |
+ line = parse_filter(content) |
assert line.to_string().replace(' ', '') == content.replace(' ', '') |
return line |
def parse_filterlist(lines):
    """Parse filter list from an iterable.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines. A line for which `parse_line`
              raises `ParseError` is not propagated as an exception; it is
              yielded as `InvalidLine` holding the stripped line text and
              the error message.
    """
    for line in lines:
        # Only the parsing is guarded, so that exceptions thrown into the
        # generator by its consumer are not mistaken for parse errors.
        try:
            parsed = parse_line(line)
        except ParseError as pe:
            parsed = InvalidLine(line.strip(), str(pe))
        yield parsed