Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29880577: Issue 6877 - Only parse headers in the first line of the filter list (Closed)
Patch Set: Fix header parsing, improve argument naming and documentation Created Sept. 18, 2018, 6:06 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | abp/filters/rpy.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-present eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 133
134 134
135 Header = _line_type('Header', 'version', '[{.version}]') 135 Header = _line_type('Header', 'version', '[{.version}]')
136 EmptyLine = _line_type('EmptyLine', '', '') 136 EmptyLine = _line_type('EmptyLine', '', '')
137 Comment = _line_type('Comment', 'text', '! {.text}') 137 Comment = _line_type('Comment', 'text', '! {.text}')
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') 139 Filter = _line_type('Filter', 'text selector action options', '{.text}')
140 Include = _line_type('Include', 'target', '%include {0.target}%') 140 Include = _line_type('Include', 'target', '%include {0.target}%')
141 141
142 142
143 METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)') 143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)')
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]',
146 flags=re.I)
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') 147 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
147 FILTER_OPTIONS_REGEXP = re.compile( 148 FILTER_OPTIONS_REGEXP = re.compile(
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' 149 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'
149 ) 150 )
150 151
151 152
152 def _parse_header(text):
153 match = HEADER_REGEXP.match(text)
154 if not match:
155 raise ParseError('Malformed header', text)
156 return Header(match.group(1))
157
158
159 def _parse_instruction(text): 153 def _parse_instruction(text):
160 match = INCLUDE_REGEXP.match(text) 154 match = INCLUDE_REGEXP.match(text)
161 if not match: 155 if not match:
162 raise ParseError('Unrecognized instruction', text) 156 raise ParseError('Unrecognized instruction', text)
163 return Include(match.group(1)) 157 return Include(match.group(1))
164 158
165 159
166 def _parse_option(option): 160 def _parse_option(option):
167 if '=' in option: 161 if '=' in option:
168 return option.split('=', 1) 162 return option.split('=', 1)
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
244 Parsed filter. 238 Parsed filter.
245 239
246 """ 240 """
247 if '#' in text: 241 if '#' in text:
248 match = HIDING_FILTER_REGEXP.search(text) 242 match = HIDING_FILTER_REGEXP.search(text)
249 if match: 243 if match:
250 return _parse_hiding_filter(text, *match.groups()) 244 return _parse_hiding_filter(text, *match.groups())
251 return _parse_blocking_filter(text) 245 return _parse_blocking_filter(text)
252 246
253 247
254 def parse_line(line_text): 248 def parse_line(line, position='body'):
255 """Parse one line of a filter list. 249 """Parse one line of a filter list.
256 250
257 Note that parse_line() doesn't handle special comments, hence never returns 251 The types of lines that the parser recognizes depend on the position.
258 a Metadata() object, Adblock Plus only considers metadata when parsing the 252 If position="body", the parser only recognizes filters, comments,
259 whole filter list and only if they are given at the top of the filter list. 253 processing instructions and empty lines. If position="metadata", it in
254 addition recognizes metadata. If position="start", it also recognizes
255 headers.
256
257 Note: Checksum metadata lines are recognized in all positions for backwards
258 compatibility. Historically, checksums can occur at the bottom of the
259 filter list. They are no longer used by Adblock Plus, but in order to
260 strip them (in abp.filters.renderer), we have to make sure to still parse
261 them regardless of their position in the filter list.
260 262
261 Parameters 263 Parameters
262 ---------- 264 ----------
263 line_text : str 265 line : str
264 Line of a filter list. 266 Line of a filter list.
267 position : str
268 Position in the filter list, one of "start", "metadata" or "body"
269 (default is "body").
265 270
266 Returns 271 Returns
267 ------- 272 -------
268 namedtuple 273 namedtuple
269 Parsed line (see `_line_type`). 274 Parsed line (see `_line_type`).
270 275
271 Raises 276 Raises
272 ------ 277 ------
273 ParseError 278 ParseError
274 ParseError: If the line can't be parsed. 279 ParseError: If the line can't be parsed.
280
275 """ 281 """
276 if isinstance(line_text, type(b'')): 282 POSITIONS = {'body', 'start', 'metadata'}
277 line_text = line_text.decode('utf-8') 283 if position not in POSITIONS:
284 raise ValueError('position should be one of {}'.format(POSITIONS))
278 285
279 content = line_text.strip() 286 if isinstance(line, type(b'')):
287 line = line.decode('utf-8')
280 288
281 if content == '': 289 stripped = line.strip()
282 line = EmptyLine()
283 elif content.startswith('!'):
284 line = Comment(content[1:].lstrip())
285 elif content.startswith('%') and content.endswith('%'):
286 line = _parse_instruction(content)
287 elif content.startswith('[') and content.endswith(']'):
288 line = _parse_header(content)
289 else:
290 line = parse_filter(content)
291 290
292 assert line.to_string().replace(' ', '') == content.replace(' ', '') 291 if stripped == '':
293 return line 292 return EmptyLine()
293
294 if position == 'start':
295 match = HEADER_REGEXP.search(line)
296 if match:
297 version = match.group(1)
298 if not version:
299 raise ParseError('Malformed header', line)
300 return Header(version)
301
302 if stripped.startswith('!'):
303 match = METADATA_REGEXP.match(line)
304 if match:
305 key, value = match.groups()
306 if position != 'body' or key.lower() == 'checksum':
307 return Metadata(key, value)
308 return Comment(stripped[1:].lstrip())
309
310 if stripped.startswith('%') and stripped.endswith('%'):
311 return _parse_instruction(stripped)
312
313 return parse_filter(stripped)
294 314
295 315
296 def parse_filterlist(lines): 316 def parse_filterlist(lines):
297 """Parse filter list from an iterable. 317 """Parse filter list from an iterable.
298 318
299 Parameters 319 Parameters
300 ---------- 320 ----------
301 lines: iterable of str 321 lines: iterable of str
302 Lines of the filter list. 322 Lines of the filter list.
303 323
304 Returns 324 Returns
305 ------- 325 -------
306 iterator of namedtuple 326 iterator of namedtuple
307 Parsed lines of the filter list. 327 Parsed lines of the filter list.
308 328
309 Raises 329 Raises
310 ------ 330 ------
311 ParseError 331 ParseError
312 Thrown during iteration for invalid filter list lines. 332 Thrown during iteration for invalid filter list lines.
313 TypeError 333 TypeError
314 If `lines` is not iterable. 334 If `lines` is not iterable.
315 335
316 """ 336 """
317 metadata_closed = False 337 position = 'start'
318 338
319 for line in lines: 339 for line in lines:
320 result = parse_line(line) 340 parsed_line = parse_line(line, position)
341 yield parsed_line
321 342
322 if result.type == 'comment': 343 if position != 'body' and parsed_line.type in {'header', 'metadata'}:
323 match = METADATA_REGEXP.match(result.text) 344 # Continue parsing metadata until it's over...
324 if match: 345 position = 'metadata'
325 key, value = match.groups() 346 else:
326 347 # ...then switch to parsing the body.
327 # Historically, checksums can occur at the bottom of the 348 position = 'body'
328 # filter list. Checksums are no longer used by Adblock Plus,
329 # but in order to strip them (in abp.filters.renderer),
330 # we have to make sure to still parse them regardless of
331 # their position in the filter list.
332 if not metadata_closed or key.lower() == 'checksum':
333 result = Metadata(key, value)
334
335 if result.type not in {'header', 'metadata'}:
336 metadata_closed = True
337
338 yield result
OLDNEW
« no previous file with comments | « no previous file | abp/filters/rpy.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld