Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: abp/filters/parser.py

Issue 29873561: Issue 6920 - Only parse metadata from the top of the file (Closed)
Left Patch Set: Created Sept. 3, 2018, 7:50 p.m.
Right Patch Set: Test 'Last modified' case Created Sept. 5, 2018, 9:09 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | tests/test_parser.py » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-present eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 133
134 134
135 Header = _line_type('Header', 'version', '[{.version}]') 135 Header = _line_type('Header', 'version', '[{.version}]')
136 EmptyLine = _line_type('EmptyLine', '', '') 136 EmptyLine = _line_type('EmptyLine', '', '')
137 Comment = _line_type('Comment', 'text', '! {.text}') 137 Comment = _line_type('Comment', 'text', '! {.text}')
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') 139 Filter = _line_type('Filter', 'text selector action options', '{.text}')
140 Include = _line_type('Include', 'target', '%include {0.target}%') 140 Include = _line_type('Include', 'target', '%include {0.target}%')
141 141
142 142
143 METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)') 143 METADATA_REGEXP = re.compile(r'([\w-]+)\s*:\s*(.*)')
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
147 FILTER_OPTIONS_REGEXP = re.compile( 147 FILTER_OPTIONS_REGEXP = re.compile(
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'
149 ) 149 )
150 150
151 151
152 def _parse_comment(text, metadata_closed):
153 match = METADATA_REGEXP.match(text)
154 if match:
155 key, value = match.groups()
156
157 # Historically, checksums can occur at the bottom of the filter list.
158 # Checksums are no longer used by Adblock Plus, but in order to strip
159 # them (in abp.filters.renderer), we have to make sure to still parse
160 # them regardless of their position in the filter list.
161 if key and (not metadata_closed or key.lower() == 'checksum'):
162 return Metadata(key, value), metadata_closed
163 else:
164 # The regular expression matches as well if we see a malformed key
165 # (e.g. "Last modified"). In that case we want to keep looking for
166 # more metadata, but return a Comment instead of a Metadata object.
167 # Hence we only consider the metadata section closed if the
168 # regular expression doesn't match.
169 metadata_closed = True
170 return Comment(text[1:].lstrip()), metadata_closed
171
172
173 def _parse_header(text): 152 def _parse_header(text):
174 match = HEADER_REGEXP.match(text) 153 match = HEADER_REGEXP.match(text)
175 if not match: 154 if not match:
176 raise ParseError('Malformed header', text) 155 raise ParseError('Malformed header', text)
177 return Header(match.group(1)) 156 return Header(match.group(1))
178 157
179 158
180 def _parse_instruction(text): 159 def _parse_instruction(text):
181 match = INCLUDE_REGEXP.match(text) 160 match = INCLUDE_REGEXP.match(text)
182 if not match: 161 if not match:
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after
265 Parsed filter. 244 Parsed filter.
266 245
267 """ 246 """
268 if '#' in text: 247 if '#' in text:
269 match = HIDING_FILTER_REGEXP.search(text) 248 match = HIDING_FILTER_REGEXP.search(text)
270 if match: 249 if match:
271 return _parse_hiding_filter(text, *match.groups()) 250 return _parse_hiding_filter(text, *match.groups())
272 return _parse_blocking_filter(text) 251 return _parse_blocking_filter(text)
273 252
274 253
275 def _parse_line(line_text, metadata_closed):
276 if isinstance(line_text, type(b'')):
277 line_text = line_text.decode('utf-8')
278
279 content = line_text.strip()
280
281 if content.startswith('!'):
282 line, metadata_closed = _parse_comment(content, metadata_closed)
283 elif content.startswith('[') and content.endswith(']'):
284 line = _parse_header(content)
285 else:
286 if content == '':
287 line = EmptyLine()
288 elif content.startswith('%') and content.endswith('%'):
289 line = _parse_instruction(content)
290 else:
291 line = parse_filter(content)
292 metadata_closed = True
293
294 return line, metadata_closed
295
296
297 def parse_line(line_text): 254 def parse_line(line_text):
298 """Parse one line of a filter list. 255 """Parse one line of a filter list.
256
257 Note that parse_line() doesn't handle special comments and hence never
258 returns a Metadata() object. Adblock Plus only considers metadata when parsing
259 the whole filter list, and only if it is given at the top of the filter list.
299 260
300 Parameters 261 Parameters
301 ---------- 262 ----------
302 line_text : str 263 line_text : str
303 Line of a filter list. 264 Line of a filter list.
304 265
305 Returns 266 Returns
306 ------- 267 -------
307 namedtuple 268 namedtuple
308 Parsed line (see `_line_type`). 269 Parsed line (see `_line_type`).
309 270
310 Raises 271 Raises
311 ------ 272 ------
312 ParseError 273 ParseError
313 ParseError: If the line can't be parsed. 274 ParseError: If the line can't be parsed.
314 """ 275 """
315 return _parse_line(line_text, True)[0] 276 if isinstance(line_text, type(b'')):
Vasily Kuznetsov 2018/09/04 10:03:30 Here we're changing the behavior of `parse_line` -
Sebastian Noack 2018/09/04 16:01:03 I moved the logic as requested to parse_filterlist
Vasily Kuznetsov 2018/09/04 16:49:50 What I had in mind was to keep _parse_comment() wi
277 line_text = line_text.decode('utf-8')
278
279 content = line_text.strip()
280
281 if content == '':
282 line = EmptyLine()
283 elif content.startswith('!'):
284 line = Comment(content[1:].lstrip())
285 elif content.startswith('%') and content.endswith('%'):
286 line = _parse_instruction(content)
287 elif content.startswith('[') and content.endswith(']'):
288 line = _parse_header(content)
289 else:
290 line = parse_filter(content)
291
292 assert line.to_string().replace(' ', '') == content.replace(' ', '')
293 return line
294
316 295
317 def parse_filterlist(lines): 296 def parse_filterlist(lines):
318 """Parse filter list from an iterable. 297 """Parse filter list from an iterable.
319 298
320 Parameters 299 Parameters
321 ---------- 300 ----------
322 lines: iterable of str 301 lines: iterable of str
323 Lines of the filter list. 302 Lines of the filter list.
324 303
325 Returns 304 Returns
326 ------- 305 -------
327 iterator of namedtuple 306 iterator of namedtuple
328 Parsed lines of the filter list. 307 Parsed lines of the filter list.
329 308
330 Raises 309 Raises
331 ------ 310 ------
332 ParseError 311 ParseError
333 Thrown during iteration for invalid filter list lines. 312 Thrown during iteration for invalid filter list lines.
334 TypeError 313 TypeError
335 If `lines` is not iterable. 314 If `lines` is not iterable.
336 315
337 """ 316 """
338 metadata_closed = False 317 metadata_closed = False
Vasily Kuznetsov 2018/09/04 10:03:30 What do you think about filter lists like this?
Sebastian Noack 2018/09/04 16:01:03 In practice nobody seems to put comments in betwee
Vasily Kuznetsov 2018/09/04 16:49:50 Well, "Last modified" is considered a comment and
318
339 for line in lines: 319 for line in lines:
340 parsed, metadata_closed = _parse_line(line, metadata_closed) 320 result = parse_line(line)
341 yield parsed 321
322 if isinstance(result, Comment):
323 match = METADATA_REGEXP.match(result.text)
324 if match:
325 key, value = match.groups()
326
327 # Historically, checksums can occur at the bottom of the
328 # filter list. Checksums are no longer used by Adblock Plus,
329 # but in order to strip them (in abp.filters.renderer),
330 # we have to make sure to still parse them regardless of
331 # their position in the filter list.
332 if not metadata_closed or key.lower() == 'checksum':
333 yield Metadata(key, value)
334 continue
335
336 if not result.text:
337 metadata_closed = True
338 elif not isinstance(result, Header):
339 metadata_closed = True
340
341 yield result
LEFTRIGHT

Powered by Google App Engine
This is Rietveld