Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: abp/filters/parser.py

Issue 29880577: Issue 6877 - Only parse headers in the first line of the filter list (Closed)
Patch Set: Initial Created Sept. 14, 2018, 4:43 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | abp/filters/rpy.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: abp/filters/parser.py
===================================================================
--- a/abp/filters/parser.py
+++ b/abp/filters/parser.py
@@ -135,30 +135,32 @@
Header = _line_type('Header', 'version', '[{.version}]')
EmptyLine = _line_type('EmptyLine', '', '')
Comment = _line_type('Comment', 'text', '! {.text}')
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
Filter = _line_type('Filter', 'text selector action options', '{.text}')
Include = _line_type('Include', 'target', '%include {0.target}%')
-METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)')
+METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)')
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
-HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
+HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$',
Sebastian Noack 2018/09/15 16:08:32 I changed this regular expressions like this in my
Vasily Kuznetsov 2018/09/17 10:40:27 Yeah, you're right. I think the logic of parse_lin
Vasily Kuznetsov 2018/09/18 12:41:14 Done.
+ flags=re.I)
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
FILTER_OPTIONS_REGEXP = re.compile(
r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'
)
def _parse_header(text):
match = HEADER_REGEXP.match(text)
- if not match:
+ version = match.group(1) if match else None
+ if not version:
raise ParseError('Malformed header', text)
- return Header(match.group(1))
+ return Header(version)
def _parse_instruction(text):
match = INCLUDE_REGEXP.match(text)
if not match:
raise ParseError('Unrecognized instruction', text)
return Include(match.group(1))
@@ -246,56 +248,75 @@
"""
if '#' in text:
match = HIDING_FILTER_REGEXP.search(text)
if match:
return _parse_hiding_filter(text, *match.groups())
return _parse_blocking_filter(text)
-def parse_line(line_text):
+def parse_line(line_text, mode='body'):
Sebastian Noack 2018/09/15 16:08:32 Having the "mode" as part of the public API, requi
Vasily Kuznetsov 2018/09/17 10:40:27 I would like to keep the mode in the public API. T
"""Parse one line of a filter list.
- Note that parse_line() doesn't handle special comments, hence never returns
- a Metadata() object, Adblock Plus only considers metadata when parsing the
- whole filter list and only if they are given at the top of the filter list.
+ The types of lines that the parser recognizes depend on the mode. In
+ body mode the parser only recognizes filters, comments, processing
+ instructions and empty lines. In metadata mode it in addition recognizes
+ metadata. In start mode it also recognizes headers.
+
+ Note: checksum metadata lines are recognized in all modes for backwards
+ compatibility. Historically, checksums can occur at the bottom of the
+ filter list. They are no longer used by Adblock Plus, but in order to
+ strip them (in abp.filters.renderer), we have to make sure to still parse
+ them regardless of their position in the filter list.
Parameters
----------
line_text : str
Line of a filter list.
+ mode : str
+ Parsing mode, one of "start", "metadata" or "body" (default).
Returns
-------
namedtuple
Parsed line (see `_line_type`).
Raises
------
ParseError
ParseError: If the line can't be parsed.
+
"""
+ MODES = {'body', 'start', 'metadata'}
+ if mode not in MODES:
+ raise ValueError('mode should be one of {}'.format(MODES))
+
if isinstance(line_text, type(b'')):
line_text = line_text.decode('utf-8')
content = line_text.strip()
Sebastian Noack 2018/09/15 16:08:32 Adblock Plus doesn't strip the line before process
Vasily Kuznetsov 2018/09/17 10:40:27 The behavior of ABP for the headers seems right. I
Sebastian Noack 2018/09/17 18:11:52 Adblock Plus extracts metadata (and the header) be
Vasily Kuznetsov 2018/09/18 12:41:14 Acknowledged.
if content == '':
- line = EmptyLine()
- elif content.startswith('!'):
- line = Comment(content[1:].lstrip())
- elif content.startswith('%') and content.endswith('%'):
- line = _parse_instruction(content)
- elif content.startswith('[') and content.endswith(']'):
- line = _parse_header(content)
- else:
- line = parse_filter(content)
+ return EmptyLine()
- assert line.to_string().replace(' ', '') == content.replace(' ', '')
- return line
+ if content.startswith('!'):
+ match = METADATA_REGEXP.match(line_text)
+ if match:
+ key, value = match.groups()
+ if mode != 'body' or key.lower() == 'checksum':
Sebastian Noack 2018/09/15 16:08:32 We probably should keep the comment why we treat c
Vasily Kuznetsov 2018/09/17 10:40:27 I would like to also keep the note about checksums
Sebastian Noack 2018/09/17 18:11:52 I didn't notice that you moved that note to the do
Vasily Kuznetsov 2018/09/18 12:41:14 It needs to be in the docstring because it's part
+ return Metadata(key, value)
+ return Comment(content[1:].lstrip())
+
+ if content.startswith('%') and content.endswith('%'):
+ return _parse_instruction(content)
+
+ if mode == 'start' and content.startswith('[') and content.endswith(']'):
+ return _parse_header(content)
+
+ return parse_filter(content)
def parse_filterlist(lines):
"""Parse filter list from an iterable.
Parameters
----------
lines: iterable of str
@@ -309,30 +330,20 @@
Raises
------
ParseError
Thrown during iteration for invalid filter list lines.
TypeError
If `lines` is not iterable.
"""
- metadata_closed = False
+ mode = 'start'
Sebastian Noack 2018/09/17 18:11:52 Maybe "position" would be more accurate name for t
Vasily Kuznetsov 2018/09/18 12:41:14 Yeah, "position" is a better name. I changed it.
for line in lines:
- result = parse_line(line)
-
- if result.type == 'comment':
- match = METADATA_REGEXP.match(result.text)
- if match:
- key, value = match.groups()
+ parsed_line = parse_line(line, mode)
+ yield parsed_line
- # Historically, checksums can occur at the bottom of the
- # filter list. Checksums are no longer used by Adblock Plus,
- # but in order to strip them (in abp.filters.renderer),
- # we have to make sure to still parse them regardless of
- # their position in the filter list.
- if not metadata_closed or key.lower() == 'checksum':
- result = Metadata(key, value)
-
- if result.type not in {'header', 'metadata'}:
- metadata_closed = True
-
- yield result
+ if mode != 'body' and parsed_line.type in {'header', 'metadata'}:
+ # Continue parsing metadata if it's not over...
+ mode = 'metadata'
+ else:
+ # ...otherwise switch to parsing filter list body.
+ mode = 'body'
« no previous file with comments | « no previous file | abp/filters/rpy.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld