Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
(...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
135 Header = _line_type('Header', 'version', '[{.version}]') | 135 Header = _line_type('Header', 'version', '[{.version}]') |
136 EmptyLine = _line_type('EmptyLine', '', '') | 136 EmptyLine = _line_type('EmptyLine', '', '') |
137 Comment = _line_type('Comment', 'text', '! {.text}') | 137 Comment = _line_type('Comment', 'text', '! {.text}') |
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
140 Include = _line_type('Include', 'target', '%include {0.target}%') | 140 Include = _line_type('Include', 'target', '%include {0.target}%') |
141 | 141 |
142 | 142 |
143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)') | 143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)') |
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$', | 145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]', |
Sebastian Noack
2018/09/15 16:08:32
I changed this regular expression like this in my
Vasily Kuznetsov
2018/09/17 10:40:27
Yeah, you're right. I think the logic of parse_lin
Vasily Kuznetsov
2018/09/18 12:41:14
Done.
| |
146 flags=re.I) | 146 flags=re.I) |
147 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 147 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
148 FILTER_OPTIONS_REGEXP = re.compile( | 148 FILTER_OPTIONS_REGEXP = re.compile( |
149 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' | 149 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
150 ) | 150 ) |
151 | |
152 | |
153 def _parse_header(text): | |
154 match = HEADER_REGEXP.match(text) | |
155 version = match.group(1) if match else None | |
156 if not version: | |
157 raise ParseError('Malformed header', text) | |
158 return Header(version) | |
159 | 151 |
160 | 152 |
161 def _parse_instruction(text): | 153 def _parse_instruction(text): |
162 match = INCLUDE_REGEXP.match(text) | 154 match = INCLUDE_REGEXP.match(text) |
163 if not match: | 155 if not match: |
164 raise ParseError('Unrecognized instruction', text) | 156 raise ParseError('Unrecognized instruction', text) |
165 return Include(match.group(1)) | 157 return Include(match.group(1)) |
166 | 158 |
167 | 159 |
168 def _parse_option(option): | 160 def _parse_option(option): |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
246 Parsed filter. | 238 Parsed filter. |
247 | 239 |
248 """ | 240 """ |
249 if '#' in text: | 241 if '#' in text: |
250 match = HIDING_FILTER_REGEXP.search(text) | 242 match = HIDING_FILTER_REGEXP.search(text) |
251 if match: | 243 if match: |
252 return _parse_hiding_filter(text, *match.groups()) | 244 return _parse_hiding_filter(text, *match.groups()) |
253 return _parse_blocking_filter(text) | 245 return _parse_blocking_filter(text) |
254 | 246 |
255 | 247 |
256 def parse_line(line_text, mode='body'): | 248 def parse_line(line, position='body'): |
Sebastian Noack
2018/09/15 16:08:32
Having the "mode" as part of the public API, requi
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to keep the mode in the public API. T
| |
257 """Parse one line of a filter list. | 249 """Parse one line of a filter list. |
258 | 250 |
259 The types of lines that the parser recognizes depend on the mode. In | 251 The types of lines that the parser recognizes depend on the position. |
260 body mode the parser only recognizes filters, comments, processing | 252 If position="body", the parser only recognizes filters, comments, |
261 instructions and empty lines. In metadata mode it in addition recognizes | 253 processing instructions and empty lines. If position="metadata", it in |
262 metadata. In start mode it also recognizes headers. | 254 addition recognizes metadata. If position="start", it also recognizes |
263 | 255 headers. |
264 Note: checksum metadata lines are recognized in all modes for backwards | 256 |
257 Note: Checksum metadata lines are recognized in all positions for backwards | |
265 compatibility. Historically, checksums can occur at the bottom of the | 258 compatibility. Historically, checksums can occur at the bottom of the |
266 filter list. They are no longer used by Adblock Plus, but in order to | 259 filter list. They are no longer used by Adblock Plus, but in order to |
267 strip them (in abp.filters.renderer), we have to make sure to still parse | 260 strip them (in abp.filters.renderer), we have to make sure to still parse |
268 them regardless of their position in the filter list. | 261 them regardless of their position in the filter list. |
269 | 262 |
270 Parameters | 263 Parameters |
271 ---------- | 264 ---------- |
272 line_text : str | 265 line : str |
273 Line of a filter list. | 266 Line of a filter list. |
274 mode : str | 267 position : str |
275 Parsing mode, one of "start", "metadata" or "body" (default). | 268 Position in the filter list, one of "start", "metadata" or "body" |
269 (default is "body"). | |
276 | 270 |
277 Returns | 271 Returns |
278 ------- | 272 ------- |
279 namedtuple | 273 namedtuple |
280 Parsed line (see `_line_type`). | 274 Parsed line (see `_line_type`). |
281 | 275 |
282 Raises | 276 Raises |
283 ------ | 277 ------ |
284 ParseError | 278 ParseError |
285 ParseError: If the line can't be parsed. | 279 ParseError: If the line can't be parsed. |
286 | 280 |
287 """ | 281 """ |
288 MODES = {'body', 'start', 'metadata'} | 282 POSITIONS = {'body', 'start', 'metadata'} |
289 if mode not in MODES: | 283 if position not in POSITIONS: |
290 raise ValueError('mode should be one of {}'.format(MODES)) | 284 raise ValueError('position should be one of {}'.format(POSITIONS)) |
291 | 285 |
292 if isinstance(line_text, type(b'')): | 286 if isinstance(line, type(b'')): |
293 line_text = line_text.decode('utf-8') | 287 line = line.decode('utf-8') |
294 | 288 |
295 content = line_text.strip() | 289 stripped = line.strip() |
296 | 290 |
297 if content == '': | 291 if stripped == '': |
298 return EmptyLine() | 292 return EmptyLine() |
299 | 293 |
300 if content.startswith('!'): | 294 if position == 'start': |
301 match = METADATA_REGEXP.match(line_text) | 295 match = HEADER_REGEXP.search(line) |
296 if match: | |
297 version = match.group(1) | |
298 if not version: | |
299 raise ParseError('Malformed header', line) | |
300 return Header(version) | |
301 | |
302 if stripped.startswith('!'): | |
303 match = METADATA_REGEXP.match(line) | |
302 if match: | 304 if match: |
303 key, value = match.groups() | 305 key, value = match.groups() |
304 if mode != 'body' or key.lower() == 'checksum': | 306 if position != 'body' or key.lower() == 'checksum': |
Sebastian Noack
2018/09/15 16:08:32
We probably should keep the comment why we treat c
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to also keep the note about checksums
Sebastian Noack
2018/09/17 18:11:52
I didn't notice that you moved that note to the do
Vasily Kuznetsov
2018/09/18 12:41:14
It needs to be in the docstring because it's part
| |
305 return Metadata(key, value) | 307 return Metadata(key, value) |
306 return Comment(content[1:].lstrip()) | 308 return Comment(stripped[1:].lstrip()) |
307 | 309 |
308 if content.startswith('%') and content.endswith('%'): | 310 if stripped.startswith('%') and stripped.endswith('%'): |
309 return _parse_instruction(content) | 311 return _parse_instruction(stripped) |
310 | 312 |
311 if mode == 'start' and content.startswith('[') and content.endswith(']'): | 313 return parse_filter(stripped) |
312 return _parse_header(content) | |
313 | |
314 return parse_filter(content) | |
315 | 314 |
316 | 315 |
317 def parse_filterlist(lines): | 316 def parse_filterlist(lines): |
318 """Parse filter list from an iterable. | 317 """Parse filter list from an iterable. |
319 | 318 |
320 Parameters | 319 Parameters |
321 ---------- | 320 ---------- |
322 lines: iterable of str | 321 lines: iterable of str |
323 Lines of the filter list. | 322 Lines of the filter list. |
324 | 323 |
325 Returns | 324 Returns |
326 ------- | 325 ------- |
327 iterator of namedtuple | 326 iterator of namedtuple |
328 Parsed lines of the filter list. | 327 Parsed lines of the filter list. |
329 | 328 |
330 Raises | 329 Raises |
331 ------ | 330 ------ |
332 ParseError | 331 ParseError |
333 Thrown during iteration for invalid filter list lines. | 332 Thrown during iteration for invalid filter list lines. |
334 TypeError | 333 TypeError |
335 If `lines` is not iterable. | 334 If `lines` is not iterable. |
336 | 335 |
337 """ | 336 """ |
338 mode = 'start' | 337 position = 'start' |
Sebastian Noack
2018/09/17 18:11:52
Maybe "position" would be more accurate name for t
Vasily Kuznetsov
2018/09/18 12:41:14
Yeah, "position" is a better name. I changed it.
| |
339 | 338 |
340 for line in lines: | 339 for line in lines: |
341 parsed_line = parse_line(line, mode) | 340 parsed_line = parse_line(line, position) |
342 yield parsed_line | 341 yield parsed_line |
343 | 342 |
344 if mode != 'body' and parsed_line.type in {'header', 'metadata'}: | 343 if position != 'body' and parsed_line.type in {'header', 'metadata'}: |
345 # Continue parsing metadata if it's not over... | 344 # Continue parsing metadata until it's over... |
346 mode = 'metadata' | 345 position = 'metadata' |
347 else: | 346 else: |
348 # ...otherwise switch to parsing filter list body. | 347 # ...then switch to parsing the body. |
349 mode = 'body' | 348 position = 'body' |
LEFT | RIGHT |