Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Patch Set: Created June 14, 2017, 5:32 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | setup.py » ('j') | setup.py » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: abp/filters/parser.py
===================================================================
--- a/abp/filters/parser.py
+++ b/abp/filters/parser.py
@@ -13,84 +13,228 @@
# You should have received a copy of the GNU General Public License
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
import re
from collections import namedtuple
-__all__ = ['parse_filterlist', 'parse_line', 'ParseError']
+__all__ = ['parse_filterlist', 'parse_line', 'parse_filter']
class ParseError(Exception):
- """Exception thrown by the parser when it encounters invalid input.
-
- :param error: Description of the error.
- :param text: The text which was being parsed when an error occurred.
- """
-
- def __init__(self, error, text):
- Exception.__init__(self, '{} in "{}"'.format(error, text))
- self.text = text
- self.error = error
+ """Internal exception used by the parser to signal invalid input."""
mathias 2017/07/26 20:37:15 Removing the custom __init__ function looks like a
def line_type(name, field_names, format_string):
"""Define a line type.
:param name: The name of the line type to define.
:param field_names: A sequence of field names or one space-separated
string that contains all field names.
+ :param format_string: A format specifier for converting this line type
mathias 2017/07/26 20:37:15 Fixing the missing format_string parameter documen
+ back to string representation.
:returns: Class created with `namedtuple` that has `.type` set to
lowercased `name` and supports conversion back to string with
`.to_string()` method.
"""
lt = namedtuple(name, field_names)
lt.type = name.lower()
lt.to_string = lambda self: format_string.format(self)
return lt
+InvalidLine = line_type('Invalid', 'text error', '{.text}')
Header = line_type('Header', 'version', '[{.version}]')
EmptyLine = line_type('EmptyLine', '', '')
Comment = line_type('Comment', 'text', '! {.text}')
Metadata = line_type('Metadata', 'key value', '! {0.key}: {0.value}')
-Filter = line_type('Filter', 'expression', '{.expression}')
Include = line_type('Include', 'target', '%include {0.target}%')
+Filter = line_type('Filter', 'text selector action options', '{.text}')
# Metadata comment such as "! Title: Foo": group 1 is the key, group 2 the
# value.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
# A comment is only treated as metadata when its key is one of these.
METADATA_KEYS = {
    'Checksum',
    'Expires',
    'Homepage',
    'Redirect',
    'Title',
    'Version',
}
# Include instruction: group 1 is the include target.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# Filter list header (matched case-insensitively): group 1 is the text
# between the brackets.
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', re.I)
# Options at the end of a blocking filter: group 1 is the comma-separated
# option list that follows the `$`.
BFILTER_OPTIONS_REGEXP = re.compile(
    r'\$('
    r'~?[\w\-]+(?:=[^,\s]+)?'           # first option, optionally `=value`
    r'(?:,~?[\w\-]+(?:=[^,\s]+)?)*'     # further comma-separated options
    r')$'
)
# Element hiding filter. Capture groups:
#   1 - comma-separated domain list in front of the `#`,
#   2 - `@` when the filter is an exception rule,
#   3 - tag name or `*` (simple syntax),
#   4 - parenthesised attribute rules (simple syntax),
#   5 - selector following `##` / `#@#`.
HFILTER_REGEXP = re.compile(
    r'^([^\/\*\|\@"!]*?)'
    r'#(\@)?'
    r'(?:'
    r'([\w\-]+|\*)'
    r'((?:\([\w\-]+(?:[$^*]?=[^\(\)"]*)?\))*)'
    r'|#([^{}]+)'
    r')$'
)
+
# Types of resources to block (based on adblockpluscore/lib/filterClasses.js).
TYPES = {
    'font', 'websocket', 'object-subrequest', 'script', 'elemhide', 'media',
    'image', 'object', 'ping', 'genericblock', 'stylesheet', 'other', 'popup',
    'xmlhttprequest', 'document', 'webrtc', 'subdocument', 'generichide',
}

# Special types used for whitelisting.
TYPES_WHITELIST = {
    'document', 'elemhide', 'generichide', 'genericblock',
}

# By default blocking filters apply to everything except whitelist-only types
# and popups (based on adblockpluscore/lib/filterClasses.js). 'popup' has to
# be removed explicitly because it is not one of the whitelist-only types;
# otherwise an explicit `$popup` option would look like a no-op.
TYPES_DEFAULT = TYPES - TYPES_WHITELIST - {'popup'}

# Type options that are synonyms for other types.
TYPE_SYNONYMS = {
    'xbl': 'other',
    'dtd': 'other',
    'background': 'image',
}
def _parse_comment(text):
    """Parse a line that starts with `!`.

    :param text: Stripped text of the line, including the leading `!`.
    :returns: `Metadata` when the line has the `! key: value` form and the
              key is listed in `METADATA_KEYS`, otherwise a `Comment`
              holding the text after the `!`.
    """
    found = METADATA_REGEXP.match(text)
    if found is None or found.group(1) not in METADATA_KEYS:
        # Anything that is not recognised metadata is an ordinary comment.
        return Comment(text[1:].strip())
    key, value = found.group(1), found.group(2)
    return Metadata(key, value)
def _parse_header(text):
match = HEADER_REGEXP.match(text)
if not match:
- raise ParseError('Malformed header', text)
+ raise ParseError('Malformed header')
mathias 2017/07/26 20:37:15 Please explain why you don't include the malformed
Vasily Kuznetsov 2017/07/27 11:05:02 My reasoning was that you can get to this place in
return Header(match.group(1))
def _parse_instruction(text):
match = INCLUDE_REGEXP.match(text)
if not match:
- raise ParseError('Unrecognized instruction', text)
+ raise ParseError('Unrecognized instruction')
return Include(match.group(1))
+def _separate_domains(domains):
+ options = {}
+ for d in domains:
+ if d.startswith('~'):
+ options.setdefault('domains-exclude', []).append(d.lstrip('~'))
+ else:
+ options.setdefault('domains-include', []).append(d)
+ if 'domains-include' in options:
+ options['domains-none'] = True
+ return options
+
+
def _separate_types(types):
    """Translate a list of `(type, on_off)` tuples into type options.

    The result can contain these keys:

    - types-none: True when matching starts from no types at all; absent
      when it starts from `TYPES_DEFAULT`.
    - types-include: List of types enabled on top of the starting set.
    - types-exclude: List of types removed from the starting set.

    :param types: List of `(type, on_off)` tuples in the order they were
                  given in the filter.
    :returns: Dict of options; empty if `types` is empty.
    """
    if not types:
        return {}

    # The first listed type decides the starting point: an enabled type means
    # "only the listed types", a disabled one means "defaults minus these".
    first_enabled = types[0][1]
    baseline = set() if first_enabled else TYPES_DEFAULT
    result = {'types-none': True} if first_enabled else {}

    # dict() keeps only the last setting given for each type; record just the
    # settings that differ from the starting set.
    for type_name, enabled in dict(types).items():
        in_baseline = type_name in baseline
        if enabled and not in_baseline:
            result.setdefault('types-include', []).append(type_name)
        elif not enabled and in_baseline:
            result.setdefault('types-exclude', []).append(type_name)

    return result
+
+
def _parse_hiding_filter(text, match):
    """Build a `Filter` from an element hiding filter.

    :param text: Full text of the filter.
    :param match: Match object produced by `HFILTER_REGEXP` for `text`.
    :returns: `Filter` with action 'show' when group 2 of the match is set
              and 'hide' otherwise; the selector is of type 'css' when
              group 5 matched, else 'abp-simple' built from groups 3 and 4.
              Options come from the domain list in group 1.
    """
    css = match.group(5)
    if css:
        selector = {'type': 'css', 'value': css}
    else:
        simple = match.group(3) + match.group(4)
        selector = {'type': 'abp-simple', 'value': simple}

    if match.group(2):
        action = 'show'
    else:
        action = 'hide'

    # Drop empty entries, e.g. when there is no domain list at all.
    domains = [name for name in match.group(1).split(',') if name]
    return Filter(text, selector, action, _separate_domains(domains))
+
+
def _parse_filter_options(options):
    """Parse the option list of a blocking filter.

    Based on RegExpFilter.fromText in lib/filterClasses.js in adblockpluscore.

    :param options: Comma-separated options (the text after `$`), for
                    example `script,~image,domain=a.com|~b.com`.
    :returns: Dict of parsed options. Type options are summarised by
              `_separate_types`, `domain` by `_separate_domains`, `sitekey`
              becomes a `sitekeys` list and any other option is stored under
              its own name with its value (True/False for options without
              an explicit value).
    :raises ParseError: If `domain` or `sitekey` is given without a value.
    """
    parsed_options = {}
    type_options = []

    for option in options.split(','):
        if '=' in option:
            name, value = option.split('=', 1)
        elif option.startswith('~'):
            name, value = option[1:], False
        else:
            name, value = option, True

        name = TYPE_SYNONYMS.get(name, name)
        if name in TYPES:
            type_options.append((name, value))
        elif name in ('domain', 'sitekey'):
            # Without `=value` the value is a bool and cannot be split into
            # a list; report it as invalid input instead of crashing.
            if isinstance(value, bool):
                raise ParseError('Option "{}" requires a value'.format(name))
            if name == 'domain':
                parsed_options.update(_separate_domains(value.split('|')))
            else:
                parsed_options['sitekeys'] = value.split('|')
        else:
            parsed_options[name] = value

    parsed_options.update(_separate_types(type_options))
    return parsed_options
+
+
+def _parse_blocking_filter(text):
+ # Based on RegExpFilter.fromText in lib/filterClasses.js
+ # in adblockpluscore.
+ action = 'block'
+ options = {}
+ selector = text
+
+ if selector.startswith('@@'):
+ action = 'allow'
mathias 2017/07/26 20:37:15 I think we should have symbols like BFILTER_ACTION
Vasily Kuznetsov 2017/07/27 11:05:02 Probably not these exact names for the constants,
+ selector = selector[2:]
+
+ if '$' in selector:
+ opt_match = BFILTER_OPTIONS_REGEXP.search(selector)
+ if opt_match:
+ selector = selector[:opt_match.start(0)]
+ options = _parse_filter_options(opt_match.group(1))
+
+ if (len(selector) > 1 and
+ selector.startswith('/') and selector.endswith('/')):
+ selector = {'type': 'url-regexp', 'value': selector[1:-1]}
mathias 2017/07/26 20:37:15 I also think we should have symbols like SELECTOR_
Vasily Kuznetsov 2017/07/27 11:05:02 Acknowledged.
+ else:
+ selector = {'type': 'url-pattern', 'value': selector}
+
+ return Filter(text, selector, action, options)
+
+
+def parse_filter(text):
+ """Parse one filter.
+
+ :param text: Text representation of a filter.
+ :returns: filter object.
+ """
+ match = HFILTER_REGEXP.match(text) if '#' in text else False
mathias 2017/07/26 20:37:15 Call me old-fashioned but I seriously dislike chan
Vasily Kuznetsov 2017/07/27 11:05:02 Completely agree about changing the type of the va
+ if match:
+ return _parse_hiding_filter(text, match)
+ return _parse_blocking_filter(text)
+
+
def parse_line(line_text):
"""Parse one line of a filter list.
:param line_text: Line of a filter list (must be a unicode string).
:returns: Parsed line object (see `line_type`).
:raises ParseError: If the line can't be successfully parsed.
"""
content = line_text.strip()
@@ -99,23 +243,26 @@
line = EmptyLine()
elif content.startswith('!'):
line = _parse_comment(content)
elif content.startswith('%') and content.endswith('%'):
line = _parse_instruction(content)
elif content.startswith('[') and content.endswith(']'):
line = _parse_header(content)
else:
- line = Filter(content)
+ line = parse_filter(content)
assert line.to_string().replace(' ', '') == content.replace(' ', '')
return line
def parse_filterlist(lines):
"""Parse filter list from an iterable.
:param lines: List of strings or file or other iterable.
:returns: Iterator over parsed lines.
:raises ParseError: Can be thrown during iteration for invalid lines.
"""
for line in lines:
- yield parse_line(line)
+ try:
+ yield parse_line(line)
+ except ParseError as pe:
+ yield InvalidLine(line.strip(), str(pe))
« no previous file with comments | « no previous file | setup.py » ('j') | setup.py » ('J')

Powered by Google App Engine
This is Rietveld