Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: abp/filters/parser.py

Issue 29873561: Issue 6920 - Only parse metadata from the top of the file (Closed)
Patch Set: Created Sept. 3, 2018, 7:50 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
Index: abp/filters/parser.py
===================================================================
--- a/abp/filters/parser.py
+++ b/abp/filters/parser.py
@@ -140,7 +140,7 @@
Include = _line_type('Include', 'target', '%include {0.target}%')
-METADATA_REGEXP = re.compile(r'!\s*([\w-]+)\s*:(?!//)\s*(.*)')
+METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)')
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
@@ -149,11 +149,25 @@
)
-def _parse_comment(text):
+def _parse_comment(text, metadata_closed):
match = METADATA_REGEXP.match(text)
if match:
- return Metadata(match.group(1), match.group(2))
- return Comment(text[1:].strip())
+ key, value = match.groups()
+
+ # Historically, checksums can occur at the bottom of the filter list.
+ # Checksums are no longer used by Adblock Plus, but in order to strip
+ # them (in abp.filters.renderer), we have to make sure to still parse
+ # them regardless of their position in the filter list.
+ if key and (not metadata_closed or key.lower() == 'checksum'):
+ return Metadata(key, value), metadata_closed
+ else:
+ # The regular expression matches as well if we see a malformed key
+ # (e.g. "Last modified"). In that case we want to keep looking for
+ # more metadata, but return a Comment instead of a Metadata object.
+ # Hence we only consider the metadata section closed if the
+ # regular expression doesn't match.
+ metadata_closed = True
+ return Comment(text[1:].lstrip()), metadata_closed
def _parse_header(text):
@@ -258,6 +272,28 @@
return _parse_blocking_filter(text)
+def _parse_line(line_text, metadata_closed):
+ if isinstance(line_text, type(b'')):
+ line_text = line_text.decode('utf-8')
+
+ content = line_text.strip()
+
+ if content.startswith('!'):
+ line, metadata_closed = _parse_comment(content, metadata_closed)
+ elif content.startswith('[') and content.endswith(']'):
+ line = _parse_header(content)
+ else:
+ if content == '':
+ line = EmptyLine()
+ elif content.startswith('%') and content.endswith('%'):
+ line = _parse_instruction(content)
+ else:
+ line = parse_filter(content)
+ metadata_closed = True
+
+ return line, metadata_closed
+
+
def parse_line(line_text):
"""Parse one line of a filter list.
@@ -276,25 +312,7 @@
ParseError
ParseError: If the line can't be parsed.
"""
- if isinstance(line_text, type(b'')):
- line_text = line_text.decode('utf-8')
-
- content = line_text.strip()
-
- if content == '':
- line = EmptyLine()
- elif content.startswith('!'):
- line = _parse_comment(content)
- elif content.startswith('%') and content.endswith('%'):
- line = _parse_instruction(content)
- elif content.startswith('[') and content.endswith(']'):
- line = _parse_header(content)
- else:
- line = parse_filter(content)
-
- assert line.to_string().replace(' ', '') == content.replace(' ', '')
- return line
-
+ return _parse_line(line_text, True)[0]
Vasily Kuznetsov 2018/09/04 10:03:30 Here we're changing the behavior of `parse_line` -
Sebastian Noack 2018/09/04 16:01:03 I moved the logic as requested to parse_filterlist
Vasily Kuznetsov 2018/09/04 16:49:50 What I had in mind was to keep _parse_comment() wi
def parse_filterlist(lines):
"""Parse filter list from an iterable.
@@ -317,5 +335,7 @@
If `lines` is not iterable.
"""
+ metadata_closed = False
Vasily Kuznetsov 2018/09/04 10:03:30 What do you think about filter lists like this?
Sebastian Noack 2018/09/04 16:01:03 In practice nobody seems to put comments in betwee
Vasily Kuznetsov 2018/09/04 16:49:50 Well, "Last modified" is considered a comment and
for line in lines:
- yield parse_line(line)
+ parsed, metadata_closed = _parse_line(line, metadata_closed)
+ yield parsed
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')

Powered by Google App Engine
This is Rietveld