abp/filters/parser.py - Issue 29880577: Issue 6877 - Only parse headers in the first line of the filter list

Unified Diff: abp/filters/parser.py

Issue 29880577: Issue 6877 - Only parse headers in the first line of the filter list (Closed)

Patch Set: Initial Created Sept. 14, 2018, 4:43 p.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: abp/filters/parser.py

===================================================================

--- a/abp/filters/parser.py

+++ b/abp/filters/parser.py

@@ -135,30 +135,32 @@

Header = _line_type('Header', 'version', '[{.version}]')

EmptyLine = _line_type('EmptyLine', '', '')

Comment = _line_type('Comment', 'text', '! {.text}')

Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

Filter = _line_type('Filter', 'text selector action options', '{.text}')

Include = _line_type('Include', 'target', '%include {0.target}%')

-METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)')

+METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)')

INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

-HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)

+HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$',

Sebastian Noack 2018/09/15 16:08:32 I changed this regular expression like this in my

Vasily Kuznetsov 2018/09/17 10:40:27 Yeah, you're right. I think the logic of parse_lin

Vasily Kuznetsov 2018/09/18 12:41:14 Done.

+ flags=re.I)

HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')

FILTER_OPTIONS_REGEXP = re.compile(

r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'

)

def _parse_header(text):

match = HEADER_REGEXP.match(text)

- if not match:

+ version = match.group(1) if match else None

+ if not version:

raise ParseError('Malformed header', text)

- return Header(match.group(1))

+ return Header(version)

def _parse_instruction(text):

match = INCLUDE_REGEXP.match(text)

if not match:

raise ParseError('Unrecognized instruction', text)

return Include(match.group(1))

@@ -246,56 +248,75 @@

"""

if '#' in text:

match = HIDING_FILTER_REGEXP.search(text)

if match:

return _parse_hiding_filter(text, *match.groups())

return _parse_blocking_filter(text)

-def parse_line(line_text):

+def parse_line(line_text, mode='body'):

Sebastian Noack 2018/09/15 16:08:32 Having the "mode" as part of the public API, requi

Vasily Kuznetsov 2018/09/17 10:40:27 I would like to keep the mode in the public API. T

"""Parse one line of a filter list.

- Note that parse_line() doesn't handle special comments, hence never returns

- a Metadata() object, Adblock Plus only considers metadata when parsing the

- whole filter list and only if they are given at the top of the filter list.

+ The types of lines that the parser recognizes depend on the mode. In

+ body mode the parser only recognizes filters, comments, processing

+ instructions and empty lines. In metadata mode it in addition recognizes

+ metadata. In start mode it also recognizes headers.

+ Note: checksum metadata lines are recognized in all modes for backwards

+ compatibility. Historically, checksums can occur at the bottom of the

+ filter list. They are no longer used by Adblock Plus, but in order to

+ strip them (in abp.filters.renderer), we have to make sure to still parse

+ them regardless of their position in the filter list.

Parameters

----------

line_text : str

Line of a filter list.

+ mode : str

+ Parsing mode, one of "start", "metadata" or "body" (default).

Returns

-------

namedtuple

Parsed line (see `_line_type`).

Raises

------

ParseError

ParseError: If the line can't be parsed.

"""

+ MODES = {'body', 'start', 'metadata'}

+ if mode not in MODES:

+ raise ValueError('mode should be one of {}'.format(MODES))

if isinstance(line_text, type(b'')):

line_text = line_text.decode('utf-8')

content = line_text.strip()

Sebastian Noack 2018/09/15 16:08:32 Adblock Plus doesn't strip the line before process

Vasily Kuznetsov 2018/09/17 10:40:27 The behavior of ABP for the headers seems right. I

Sebastian Noack 2018/09/17 18:11:52 Adblock Plus extracts metadata (and the header) be

Vasily Kuznetsov 2018/09/18 12:41:14 Acknowledged.

if content == '':

- line = EmptyLine()

- elif content.startswith('!'):

- line = Comment(content[1:].lstrip())

- elif content.startswith('%') and content.endswith('%'):

- line = _parse_instruction(content)

- elif content.startswith('[') and content.endswith(']'):

- line = _parse_header(content)

- else:

- line = parse_filter(content)

+ return EmptyLine()

- assert line.to_string().replace(' ', '') == content.replace(' ', '')

- return line

+ if content.startswith('!'):

+ match = METADATA_REGEXP.match(line_text)

+ if match:

+ key, value = match.groups()

+ if mode != 'body' or key.lower() == 'checksum':

Sebastian Noack 2018/09/15 16:08:32 We probably should keep the comment why we treat c

Vasily Kuznetsov 2018/09/17 10:40:27 I would like to also keep the note about checksums

Sebastian Noack 2018/09/17 18:11:52 I didn't notice that you moved that note to the do

Vasily Kuznetsov 2018/09/18 12:41:14 It needs to be in the docstring because it's part

+ return Metadata(key, value)

+ return Comment(content[1:].lstrip())

+ if content.startswith('%') and content.endswith('%'):

+ return _parse_instruction(content)

+ if mode == 'start' and content.startswith('[') and content.endswith(']'):

+ return _parse_header(content)

+ return parse_filter(content)

def parse_filterlist(lines):

"""Parse filter list from an iterable.

Parameters

----------

lines: iterable of str

@@ -309,30 +330,20 @@

Raises

------

ParseError

Thrown during iteration for invalid filter list lines.

TypeError

If `lines` is not iterable.

"""

- metadata_closed = False

+ mode = 'start'

Sebastian Noack 2018/09/17 18:11:52 Maybe "position" would be more accurate name for t

Vasily Kuznetsov 2018/09/18 12:41:14 Yeah, "position" is a better name. I changed it.

for line in lines:

- result = parse_line(line)

- if result.type == 'comment':

- match = METADATA_REGEXP.match(result.text)

- if match:

- key, value = match.groups()

+ parsed_line = parse_line(line, mode)

+ yield parsed_line

- # Historically, checksums can occur at the bottom of the

- # filter list. Checksums are no longer used by Adblock Plus,

- # but in order to strip them (in abp.filters.renderer),

- # we have to make sure to still parse them regardless of

- # their position in the filter list.

- if not metadata_closed or key.lower() == 'checksum':

- result = Metadata(key, value)

- if result.type not in {'header', 'metadata'}:

- metadata_closed = True

- yield result

+ if mode != 'body' and parsed_line.type in {'header', 'metadata'}:

+ # Continue parsing metadata if it's not over...

+ mode = 'metadata'

+ else:

+ # ...otherwise switch to parsing filter list body.

+ mode = 'body'

« no previous file with comments | « no previous file | abp/filters/rpy.py » ('j') | no next file with comments »