Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29880555: Issue 6877 - Only parse headers in the first line of the filter list (Closed)
Patch Set: Created Sept. 14, 2018, 2:40 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-present eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 133
134 134
135 Header = _line_type('Header', 'version', '[{.version}]') 135 Header = _line_type('Header', 'version', '[{.version}]')
136 EmptyLine = _line_type('EmptyLine', '', '') 136 EmptyLine = _line_type('EmptyLine', '', '')
137 Comment = _line_type('Comment', 'text', '! {.text}') 137 Comment = _line_type('Comment', 'text', '! {.text}')
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') 139 Filter = _line_type('Filter', 'text selector action options', '{.text}')
140 Include = _line_type('Include', 'target', '%include {0.target}%') 140 Include = _line_type('Include', 'target', '%include {0.target}%')
141 141
142 142
143 METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)') 143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)')
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$', flags=re.I)
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
147 FILTER_OPTIONS_REGEXP = re.compile( 147 FILTER_OPTIONS_REGEXP = re.compile(
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'
149 ) 149 )
150 150
151 151
152 def _parse_header(text):
153 match = HEADER_REGEXP.match(text)
154 if not match:
155 raise ParseError('Malformed header', text)
156 return Header(match.group(1))
157
158
159 def _parse_instruction(text): 152 def _parse_instruction(text):
160 match = INCLUDE_REGEXP.match(text) 153 match = INCLUDE_REGEXP.match(text)
161 if not match: 154 if not match:
162 raise ParseError('Unrecognized instruction', text) 155 raise ParseError('Unrecognized instruction', text)
163 return Include(match.group(1)) 156 return Include(match.group(1))
164 157
165 158
166 def _parse_option(option): 159 def _parse_option(option):
167 if '=' in option: 160 if '=' in option:
168 return option.split('=', 1) 161 return option.split('=', 1)
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
244 Parsed filter. 237 Parsed filter.
245 238
246 """ 239 """
247 if '#' in text: 240 if '#' in text:
248 match = HIDING_FILTER_REGEXP.search(text) 241 match = HIDING_FILTER_REGEXP.search(text)
249 if match: 242 if match:
250 return _parse_hiding_filter(text, *match.groups()) 243 return _parse_hiding_filter(text, *match.groups())
251 return _parse_blocking_filter(text) 244 return _parse_blocking_filter(text)
252 245
253 246
247 def _decode_if_bytes(s):
248 return s.decode('utf-8') if isinstance(s, type(b'')) else s
249
250
254 def parse_line(line_text): 251 def parse_line(line_text):
255 """Parse one line of a filter list. 252 """Parse one line of a filter list.
256 253
257 Note that parse_line() doesn't handle special comments, hence never returns 254 Note that parse_line() doesn't handle headers and special comments,
258 a Metadata() object, Adblock Plus only considers metadata when parsing the 255 hence never returns a Header() or Metadata() object. Adblock Plus only
259 whole filter list and only if they are given at the top of the filter list. 256 considers headers and metadata when parsing the whole filter list and
257 only if they are given at the top of the filter list.
260 258
261 Parameters 259 Parameters
262 ---------- 260 ----------
263 line_text : str 261 line_text : str
264 Line of a filter list. 262 Line of a filter list.
265 263
266 Returns 264 Returns
267 ------- 265 -------
268 namedtuple 266 namedtuple
269 Parsed line (see `_line_type`). 267 Parsed line (see `_line_type`).
270 268
271 Raises 269 Raises
272 ------ 270 ------
273 ParseError 271 ParseError
274 ParseError: If the line can't be parsed. 272 ParseError: If the line can't be parsed.
275 """ 273 """
276 if isinstance(line_text, type(b'')): 274 content = _decode_if_bytes(line_text).strip()
277 line_text = line_text.decode('utf-8')
278
279 content = line_text.strip()
280 275
281 if content == '': 276 if content == '':
282 line = EmptyLine() 277 line = EmptyLine()
283 elif content.startswith('!'): 278 elif content.startswith('!'):
284 line = Comment(content[1:].lstrip()) 279 line = Comment(content[1:].lstrip())
285 elif content.startswith('%') and content.endswith('%'): 280 elif content.startswith('%') and content.endswith('%'):
286 line = _parse_instruction(content) 281 line = _parse_instruction(content)
287 elif content.startswith('[') and content.endswith(']'):
288 line = _parse_header(content)
289 else: 282 else:
290 line = parse_filter(content) 283 line = parse_filter(content)
291 284
292 assert line.to_string().replace(' ', '') == content.replace(' ', '') 285 assert line.to_string().replace(' ', '') == content.replace(' ', '')
293 return line 286 return line
294 287
295 288
296 def parse_filterlist(lines): 289 def parse_filterlist(lines):
297 """Parse filter list from an iterable. 290 """Parse filter list from an iterable.
298 291
(...skipping 10 matching lines...) Expand all
309 Raises 302 Raises
310 ------ 303 ------
311 ParseError 304 ParseError
312 Thrown during iteration for invalid filter list lines. 305 Thrown during iteration for invalid filter list lines.
313 TypeError 306 TypeError
314 If `lines` is not iterable. 307 If `lines` is not iterable.
315 308
316 """ 309 """
317 metadata_closed = False 310 metadata_closed = False
318 311
319 for line in lines: 312 for i, line in enumerate(lines):
320 result = parse_line(line) 313 text = _decode_if_bytes(line)
321 314
322 if result.type == 'comment': 315 if i == 0:
323 match = METADATA_REGEXP.match(result.text) 316 match = HEADER_REGEXP.match(text)
324 if match: 317 if match:
325 key, value = match.groups() 318 version = match.group(1)
319 if not version:
320 raise ParseError('Malformed header', text)
326 321
327 # Historically, checksums can occur at the bottom of the 322 yield Header(version)
328 # filter list. Checksums are no longer used by Adblock Plus, 323 continue
329 # but in order to strip them (in abp.filters.renderer),
330 # we have to make sure to still parse them regardless of
331 # their position in the filter list.
332 if not metadata_closed or key.lower() == 'checksum':
333 result = Metadata(key, value)
334 324
335 if result.type not in {'header', 'metadata'}: 325 match = METADATA_REGEXP.match(text)
336 metadata_closed = True 326 if match:
327 key, value = match.groups()
337 328
338 yield result 329 # Historically, checksums can occur at the bottom of the
330 # filter list. Checksums are no longer used by Adblock Plus,
331 # but in order to strip them (in abp.filters.renderer),
332 # we have to make sure to still parse them regardless of
333 # their position in the filter list.
334 if not metadata_closed or key.lower() == 'checksum':
335 yield Metadata(key, value)
336 continue
337
338 metadata_closed = True
339 yield parse_line(text)
OLD NEW
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld