| Index: abp/filters/parser.py |
| =================================================================== |
| --- a/abp/filters/parser.py |
| +++ b/abp/filters/parser.py |
| @@ -135,30 +135,32 @@ |
| Header = _line_type('Header', 'version', '[{.version}]') |
| EmptyLine = _line_type('EmptyLine', '', '') |
| Comment = _line_type('Comment', 'text', '! {.text}') |
| Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| Filter = _line_type('Filter', 'text selector action options', '{.text}') |
| Include = _line_type('Include', 'target', '%include {0.target}%') |
| -METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)') |
| +METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)') |
| INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| -HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
| +HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$', |
|
Sebastian Noack
2018/09/15 16:08:32
I changed this regular expression like this in my
Vasily Kuznetsov
2018/09/17 10:40:27
Yeah, you're right. I think the logic of parse_lin
Vasily Kuznetsov
2018/09/18 12:41:14
Done.
|
| + flags=re.I) |
| HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
| FILTER_OPTIONS_REGEXP = re.compile( |
| r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
| ) |
| def _parse_header(text): |
| match = HEADER_REGEXP.match(text) |
| - if not match: |
| + version = match.group(1) if match else None |
| + if not version: |
| raise ParseError('Malformed header', text) |
| - return Header(match.group(1)) |
| + return Header(version) |
| def _parse_instruction(text): |
| match = INCLUDE_REGEXP.match(text) |
| if not match: |
| raise ParseError('Unrecognized instruction', text) |
| return Include(match.group(1)) |
| @@ -246,56 +248,75 @@ |
| """ |
| if '#' in text: |
| match = HIDING_FILTER_REGEXP.search(text) |
| if match: |
| return _parse_hiding_filter(text, *match.groups()) |
| return _parse_blocking_filter(text) |
| -def parse_line(line_text): |
| +def parse_line(line_text, mode='body'): |
|
Sebastian Noack
2018/09/15 16:08:32
Having the "mode" as part of the public API, requi
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to keep the mode in the public API. T
|
| """Parse one line of a filter list. |
| - Note that parse_line() doesn't handle special comments, hence never returns |
| - a Metadata() object, Adblock Plus only considers metadata when parsing the |
| - whole filter list and only if they are given at the top of the filter list. |
| + The types of lines that the parser recognizes depend on the mode. In |
| + body mode the parser only recognizes filters, comments, processing |
| + instructions and empty lines. In metadata mode it additionally recognizes |
| + metadata. In start mode it also recognizes headers. |
| + |
| + Note: checksum metadata lines are recognized in all modes for backwards |
| + compatibility. Historically, checksums can occur at the bottom of the |
| + filter list. They are no longer used by Adblock Plus, but in order to |
| + strip them (in abp.filters.renderer), we have to make sure to still parse |
| + them regardless of their position in the filter list. |
| Parameters |
| ---------- |
| line_text : str |
| Line of a filter list. |
| + mode : str |
| + Parsing mode, one of "start", "metadata" or "body" (default). |
| Returns |
| ------- |
| namedtuple |
| Parsed line (see `_line_type`). |
| Raises |
| ------ |
| ParseError |
| ParseError: If the line can't be parsed. |
| + |
| """ |
| + MODES = {'body', 'start', 'metadata'} |
| + if mode not in MODES: |
| + raise ValueError('mode should be one of {}'.format(MODES)) |
| + |
| if isinstance(line_text, type(b'')): |
| line_text = line_text.decode('utf-8') |
| content = line_text.strip() |
|
Sebastian Noack
2018/09/15 16:08:32
Adblock Plus doesn't strip the line before process
Vasily Kuznetsov
2018/09/17 10:40:27
The behavior of ABP for the headers seems right. I
Sebastian Noack
2018/09/17 18:11:52
Adblock Plus extracts metadata (and the header) be
Vasily Kuznetsov
2018/09/18 12:41:14
Acknowledged.
|
| if content == '': |
| - line = EmptyLine() |
| - elif content.startswith('!'): |
| - line = Comment(content[1:].lstrip()) |
| - elif content.startswith('%') and content.endswith('%'): |
| - line = _parse_instruction(content) |
| - elif content.startswith('[') and content.endswith(']'): |
| - line = _parse_header(content) |
| - else: |
| - line = parse_filter(content) |
| + return EmptyLine() |
| - assert line.to_string().replace(' ', '') == content.replace(' ', '') |
| - return line |
| + if content.startswith('!'): |
| + match = METADATA_REGEXP.match(line_text) |
| + if match: |
| + key, value = match.groups() |
| + if mode != 'body' or key.lower() == 'checksum': |
|
Sebastian Noack
2018/09/15 16:08:32
We probably should keep the comment why we treat c
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to also keep the note about checksums
Sebastian Noack
2018/09/17 18:11:52
I didn't notice that you moved that note to the do
Vasily Kuznetsov
2018/09/18 12:41:14
It needs to be in the docstring because it's part
|
| + return Metadata(key, value) |
| + return Comment(content[1:].lstrip()) |
| + |
| + if content.startswith('%') and content.endswith('%'): |
| + return _parse_instruction(content) |
| + |
| + if mode == 'start' and content.startswith('[') and content.endswith(']'): |
| + return _parse_header(content) |
| + |
| + return parse_filter(content) |
| def parse_filterlist(lines): |
| """Parse filter list from an iterable. |
| Parameters |
| ---------- |
| lines: iterable of str |
| @@ -309,30 +330,20 @@ |
| Raises |
| ------ |
| ParseError |
| Thrown during iteration for invalid filter list lines. |
| TypeError |
| If `lines` is not iterable. |
| """ |
| - metadata_closed = False |
| + mode = 'start' |
|
Sebastian Noack
2018/09/17 18:11:52
Maybe "position" would be more accurate name for t
Vasily Kuznetsov
2018/09/18 12:41:14
Yeah, "position" is a better name. I changed it.
|
| for line in lines: |
| - result = parse_line(line) |
| - |
| - if result.type == 'comment': |
| - match = METADATA_REGEXP.match(result.text) |
| - if match: |
| - key, value = match.groups() |
| + parsed_line = parse_line(line, mode) |
| + yield parsed_line |
| - # Historically, checksums can occur at the bottom of the |
| - # filter list. Checksums are no longer used by Adblock Plus, |
| - # but in order to strip them (in abp.filters.renderer), |
| - # we have to make sure to still parse them regardless of |
| - # their position in the filter list. |
| - if not metadata_closed or key.lower() == 'checksum': |
| - result = Metadata(key, value) |
| - |
| - if result.type not in {'header', 'metadata'}: |
| - metadata_closed = True |
| - |
| - yield result |
| + if mode != 'body' and parsed_line.type in {'header', 'metadata'}: |
| + # Continue parsing metadata if it's not over... |
| + mode = 'metadata' |
| + else: |
| + # ...otherwise switch to parsing filter list body. |
| + mode = 'body' |