Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29873561: Issue 6920 - Only parse metadata from the top of the file (Closed)
Patch Set: Created Sept. 3, 2018, 7:50 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-present eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 133
134 134
135 Header = _line_type('Header', 'version', '[{.version}]') 135 Header = _line_type('Header', 'version', '[{.version}]')
136 EmptyLine = _line_type('EmptyLine', '', '') 136 EmptyLine = _line_type('EmptyLine', '', '')
137 Comment = _line_type('Comment', 'text', '! {.text}') 137 Comment = _line_type('Comment', 'text', '! {.text}')
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') 139 Filter = _line_type('Filter', 'text selector action options', '{.text}')
140 Include = _line_type('Include', 'target', '%include {0.target}%') 140 Include = _line_type('Include', 'target', '%include {0.target}%')
141 141
142 142
143 METADATA_REGEXP = re.compile(r'!\s*([\w-]+)\s*:(?!//)\s*(.*)') 143 METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)')
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
147 FILTER_OPTIONS_REGEXP = re.compile( 147 FILTER_OPTIONS_REGEXP = re.compile(
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'
149 ) 149 )
150 150
151 151
152 def _parse_comment(text): 152 def _parse_comment(text, metadata_closed):
153 match = METADATA_REGEXP.match(text) 153 match = METADATA_REGEXP.match(text)
154 if match: 154 if match:
155 return Metadata(match.group(1), match.group(2)) 155 key, value = match.groups()
156 return Comment(text[1:].strip()) 156
157 # Historically, checksums can occur at the bottom of the filter list.
158 # Checksums are no longer used by Adblock Plus, but in order to strip
159 # them (in abp.filters.renderer), we have to make sure to still parse
160 # them regardless of their position in the filter list.
161 if key and (not metadata_closed or key.lower() == 'checksum'):
162 return Metadata(key, value), metadata_closed
163 else:
164 # The regular expression matches as well if we see a malformed key
165 # (e.g. "Last modified"). In that case we want to keep looking for
166 # more metadata, but return a Comment instead of a Metadata object.
167 # Hence we only consider the metadata section closed if the
168 # regular expression doesn't match.
169 metadata_closed = True
170 return Comment(text[1:].lstrip()), metadata_closed
157 171
158 172
159 def _parse_header(text): 173 def _parse_header(text):
160 match = HEADER_REGEXP.match(text) 174 match = HEADER_REGEXP.match(text)
161 if not match: 175 if not match:
162 raise ParseError('Malformed header', text) 176 raise ParseError('Malformed header', text)
163 return Header(match.group(1)) 177 return Header(match.group(1))
164 178
165 179
166 def _parse_instruction(text): 180 def _parse_instruction(text):
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
251 Parsed filter. 265 Parsed filter.
252 266
253 """ 267 """
254 if '#' in text: 268 if '#' in text:
255 match = HIDING_FILTER_REGEXP.search(text) 269 match = HIDING_FILTER_REGEXP.search(text)
256 if match: 270 if match:
257 return _parse_hiding_filter(text, *match.groups()) 271 return _parse_hiding_filter(text, *match.groups())
258 return _parse_blocking_filter(text) 272 return _parse_blocking_filter(text)
259 273
260 274
275 def _parse_line(line_text, metadata_closed):
276 if isinstance(line_text, type(b'')):
277 line_text = line_text.decode('utf-8')
278
279 content = line_text.strip()
280
281 if content.startswith('!'):
282 line, metadata_closed = _parse_comment(content, metadata_closed)
283 elif content.startswith('[') and content.endswith(']'):
284 line = _parse_header(content)
285 else:
286 if content == '':
287 line = EmptyLine()
288 elif content.startswith('%') and content.endswith('%'):
289 line = _parse_instruction(content)
290 else:
291 line = parse_filter(content)
292 metadata_closed = True
293
294 return line, metadata_closed
295
296
261 def parse_line(line_text): 297 def parse_line(line_text):
262 """Parse one line of a filter list. 298 """Parse one line of a filter list.
263 299
264 Parameters 300 Parameters
265 ---------- 301 ----------
266 line_text : str 302 line_text : str
267 Line of a filter list. 303 Line of a filter list.
268 304
269 Returns 305 Returns
270 ------- 306 -------
271 namedtuple 307 namedtuple
272 Parsed line (see `_line_type`). 308 Parsed line (see `_line_type`).
273 309
274 Raises 310 Raises
275 ------ 311 ------
276 ParseError 312 ParseError
277 ParseError: If the line can't be parsed. 313 ParseError: If the line can't be parsed.
278 """ 314 """
279 if isinstance(line_text, type(b'')): 315 return _parse_line(line_text, True)[0]
Vasily Kuznetsov 2018/09/04 10:03:30 Here we're changing the behavior of `parse_line` -
Sebastian Noack 2018/09/04 16:01:03 I moved the logic as requested to parse_filterlist
Vasily Kuznetsov 2018/09/04 16:49:50 What I had in mind was to keep _parse_comment() wi
280 line_text = line_text.decode('utf-8')
281
282 content = line_text.strip()
283
284 if content == '':
285 line = EmptyLine()
286 elif content.startswith('!'):
287 line = _parse_comment(content)
288 elif content.startswith('%') and content.endswith('%'):
289 line = _parse_instruction(content)
290 elif content.startswith('[') and content.endswith(']'):
291 line = _parse_header(content)
292 else:
293 line = parse_filter(content)
294
295 assert line.to_string().replace(' ', '') == content.replace(' ', '')
296 return line
297
298 316
299 def parse_filterlist(lines): 317 def parse_filterlist(lines):
300 """Parse filter list from an iterable. 318 """Parse filter list from an iterable.
301 319
302 Parameters 320 Parameters
303 ---------- 321 ----------
304 lines: iterable of str 322 lines: iterable of str
305 Lines of the filter list. 323 Lines of the filter list.
306 324
307 Returns 325 Returns
308 ------- 326 -------
309 iterator of namedtuple 327 iterator of namedtuple
310 Parsed lines of the filter list. 328 Parsed lines of the filter list.
311 329
312 Raises 330 Raises
313 ------ 331 ------
314 ParseError 332 ParseError
315 Thrown during iteration for invalid filter list lines. 333 Thrown during iteration for invalid filter list lines.
316 TypeError 334 TypeError
317 If `lines` is not iterable. 335 If `lines` is not iterable.
318 336
319 """ 337 """
338 metadata_closed = False
Vasily Kuznetsov 2018/09/04 10:03:30 What do you think about filter lists like this?
Sebastian Noack 2018/09/04 16:01:03 In practice nobody seems to put comments in betwee
Vasily Kuznetsov 2018/09/04 16:49:50 Well, "Last modified" is considered a comment and
320 for line in lines: 339 for line in lines:
321 yield parse_line(line) 340 parsed, metadata_closed = _parse_line(line, metadata_closed)
341 yield parsed
OLDNEW
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')

Powered by Google App Engine
This is Rietveld