| Index: abp/filters/parser.py |
| =================================================================== |
| --- a/abp/filters/parser.py |
| +++ b/abp/filters/parser.py |
| @@ -140,7 +140,7 @@ |
| Include = _line_type('Include', 'target', '%include {0.target}%') |
| -METADATA_REGEXP = re.compile(r'!\s*([\w-]+)\s*:(?!//)\s*(.*)') |
| +METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)') |
| INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
| HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
| @@ -149,11 +149,25 @@ |
| ) |
| -def _parse_comment(text): |
| +def _parse_comment(text, metadata_closed): |
| match = METADATA_REGEXP.match(text) |
| if match: |
| - return Metadata(match.group(1), match.group(2)) |
| - return Comment(text[1:].strip()) |
| + key, value = match.groups() |
| + |
| + # Historically, checksums can occur at the bottom of the filter list. |
| + # Checksums are no longer used by Adblock Plus, but in order to strip |
| + # them (in abp.filters.renderer), we have to make sure to still parse |
| + # them regardless of their position in the filter list. |
| + if key and (not metadata_closed or key.lower() == 'checksum'): |
| + return Metadata(key, value), metadata_closed |
| + else: |
| + # The regular expression matches as well if we see a malformed key |
| + # (e.g. "Last modified"). In that case we want to keep looking for |
| + # more metadata, but return a Comment instead of a Metadata object. |
| + # Hence we only consider the metadata section closed if the |
| + # regular expression doesn't match. |
| + metadata_closed = True |
| + return Comment(text[1:].lstrip()), metadata_closed |
| def _parse_header(text): |
| @@ -258,6 +272,28 @@ |
| return _parse_blocking_filter(text) |
| +def _parse_line(line_text, metadata_closed): |
| + if isinstance(line_text, type(b'')): |
| + line_text = line_text.decode('utf-8') |
| + |
| + content = line_text.strip() |
| + |
| + if content.startswith('!'): |
| + line, metadata_closed = _parse_comment(content, metadata_closed) |
| + elif content.startswith('[') and content.endswith(']'): |
| + line = _parse_header(content) |
| + else: |
| + if content == '': |
| + line = EmptyLine() |
| + elif content.startswith('%') and content.endswith('%'): |
| + line = _parse_instruction(content) |
| + else: |
| + line = parse_filter(content) |
| + metadata_closed = True |
| + |
| + return line, metadata_closed |
| + |
| + |
| def parse_line(line_text): |
| """Parse one line of a filter list. |
| @@ -276,25 +312,7 @@ |
| ParseError |
| ParseError: If the line can't be parsed. |
| """ |
| - if isinstance(line_text, type(b'')): |
| - line_text = line_text.decode('utf-8') |
| - |
| - content = line_text.strip() |
| - |
| - if content == '': |
| - line = EmptyLine() |
| - elif content.startswith('!'): |
| - line = _parse_comment(content) |
| - elif content.startswith('%') and content.endswith('%'): |
| - line = _parse_instruction(content) |
| - elif content.startswith('[') and content.endswith(']'): |
| - line = _parse_header(content) |
| - else: |
| - line = parse_filter(content) |
| - |
| - assert line.to_string().replace(' ', '') == content.replace(' ', '') |
| - return line |
| - |
| + return _parse_line(line_text, True)[0] |
|
Vasily Kuznetsov
2018/09/04 10:03:30
Here we're changing the behavior of `parse_line` -
Sebastian Noack
2018/09/04 16:01:03
I moved the logic as requested to parse_filterlist
Vasily Kuznetsov
2018/09/04 16:49:50
What I had in mind was to keep _parse_comment() wi
|
| def parse_filterlist(lines): |
| """Parse filter list from an iterable. |
| @@ -317,5 +335,7 @@ |
| If `lines` is not iterable. |
| """ |
| + metadata_closed = False |
|
Vasily Kuznetsov
2018/09/04 10:03:30
What do you think about filter lists like this?
Sebastian Noack
2018/09/04 16:01:03
In practice nobody seems to put comments in betwee
Vasily Kuznetsov
2018/09/04 16:49:50
Well, "Last modified" is considered a comment and
|
| for line in lines: |
| - yield parse_line(line) |
| + parsed, metadata_closed = _parse_line(line, metadata_closed) |
| + yield parsed |