Index: abp/filters/parser.py |
=================================================================== |
--- a/abp/filters/parser.py |
+++ b/abp/filters/parser.py |
@@ -140,7 +140,7 @@ |
Include = _line_type('Include', 'target', '%include {0.target}%') |
-METADATA_REGEXP = re.compile(r'!\s*([\w-]+)\s*:(?!//)\s*(.*)') |
+METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)') |
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
@@ -149,11 +149,25 @@ |
) |
-def _parse_comment(text): |
+def _parse_comment(text, metadata_closed): |
match = METADATA_REGEXP.match(text) |
if match: |
- return Metadata(match.group(1), match.group(2)) |
- return Comment(text[1:].strip()) |
+ key, value = match.groups() |
+ |
+ # Historically, checksums can occur at the bottom of the filter list. |
+ # Checksums are no longer used by Adblock Plus, but in order to strip |
+ # them (in abp.filters.renderer), we have to make sure to still parse |
+ # them regardless of their position in the filter list. |
+ if key and (not metadata_closed or key.lower() == 'checksum'): |
+ return Metadata(key, value), metadata_closed |
+ else: |
+ # The regular expression matches as well if we see a malformed key |
+ # (e.g. "Last modified"). In that case we want to keep looking for |
+ # more metadata, but return a Comment instead of a Metadata object. |
+ # Hence we only consider the metadata section closed if the |
+ # regular expression doesn't match. |
+ metadata_closed = True |
+ return Comment(text[1:].lstrip()), metadata_closed |
def _parse_header(text): |
@@ -258,6 +272,28 @@ |
return _parse_blocking_filter(text) |
+def _parse_line(line_text, metadata_closed): |
+ if isinstance(line_text, type(b'')): |
+ line_text = line_text.decode('utf-8') |
+ |
+ content = line_text.strip() |
+ |
+ if content.startswith('!'): |
+ line, metadata_closed = _parse_comment(content, metadata_closed) |
+ elif content.startswith('[') and content.endswith(']'): |
+ line = _parse_header(content) |
+ else: |
+ if content == '': |
+ line = EmptyLine() |
+ elif content.startswith('%') and content.endswith('%'): |
+ line = _parse_instruction(content) |
+ else: |
+ line = parse_filter(content) |
+ metadata_closed = True |
+ |
+ return line, metadata_closed |
+ |
+ |
def parse_line(line_text): |
"""Parse one line of a filter list. |
@@ -276,25 +312,7 @@ |
ParseError |
ParseError: If the line can't be parsed. |
""" |
- if isinstance(line_text, type(b'')): |
- line_text = line_text.decode('utf-8') |
- |
- content = line_text.strip() |
- |
- if content == '': |
- line = EmptyLine() |
- elif content.startswith('!'): |
- line = _parse_comment(content) |
- elif content.startswith('%') and content.endswith('%'): |
- line = _parse_instruction(content) |
- elif content.startswith('[') and content.endswith(']'): |
- line = _parse_header(content) |
- else: |
- line = parse_filter(content) |
- |
- assert line.to_string().replace(' ', '') == content.replace(' ', '') |
- return line |
- |
+ return _parse_line(line_text, True)[0] |
Vasily Kuznetsov
2018/09/04 10:03:30
Here we're changing the behavior of `parse_line` -
Sebastian Noack
2018/09/04 16:01:03
I moved the logic as requested to parse_filterlist
Vasily Kuznetsov
2018/09/04 16:49:50
What I had in mind was to keep _parse_comment() wi
|
def parse_filterlist(lines): |
"""Parse filter list from an iterable. |
@@ -317,5 +335,7 @@ |
If `lines` is not iterable. |
""" |
+ metadata_closed = False |
Vasily Kuznetsov
2018/09/04 10:03:30
What do you think about filter lists like this?
Sebastian Noack
2018/09/04 16:01:03
In practice nobody seems to put comments in betwee
Vasily Kuznetsov
2018/09/04 16:49:50
Well, "Last modified" is considered a comment and
|
for line in lines: |
- yield parse_line(line) |
+ parsed, metadata_closed = _parse_line(line, metadata_closed) |
+ yield parsed |