| OLD | NEW |
| 1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
| 2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| (...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 133 | 133 |
| 134 | 134 |
| 135 Header = _line_type('Header', 'version', '[{.version}]') | 135 Header = _line_type('Header', 'version', '[{.version}]') |
| 136 EmptyLine = _line_type('EmptyLine', '', '') | 136 EmptyLine = _line_type('EmptyLine', '', '') |
| 137 Comment = _line_type('Comment', 'text', '! {.text}') | 137 Comment = _line_type('Comment', 'text', '! {.text}') |
| 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
| 140 Include = _line_type('Include', 'target', '%include {0.target}%') | 140 Include = _line_type('Include', 'target', '%include {0.target}%') |
| 141 | 141 |
| 142 | 142 |
| 143 METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)') | 143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)') |
| 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]', |
| 146 flags=re.I) |
| 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 147 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
| 147 FILTER_OPTIONS_REGEXP = re.compile( | 148 FILTER_OPTIONS_REGEXP = re.compile( |
| 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' | 149 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
| 149 ) | 150 ) |
| 150 | 151 |
| 151 | 152 |
| 152 def _parse_header(text): | |
| 153 match = HEADER_REGEXP.match(text) | |
| 154 if not match: | |
| 155 raise ParseError('Malformed header', text) | |
| 156 return Header(match.group(1)) | |
| 157 | |
| 158 | |
| 159 def _parse_instruction(text): | 153 def _parse_instruction(text): |
| 160 match = INCLUDE_REGEXP.match(text) | 154 match = INCLUDE_REGEXP.match(text) |
| 161 if not match: | 155 if not match: |
| 162 raise ParseError('Unrecognized instruction', text) | 156 raise ParseError('Unrecognized instruction', text) |
| 163 return Include(match.group(1)) | 157 return Include(match.group(1)) |
| 164 | 158 |
| 165 | 159 |
| 166 def _parse_option(option): | 160 def _parse_option(option): |
| 167 if '=' in option: | 161 if '=' in option: |
| 168 return option.split('=', 1) | 162 return option.split('=', 1) |
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 244 Parsed filter. | 238 Parsed filter. |
| 245 | 239 |
| 246 """ | 240 """ |
| 247 if '#' in text: | 241 if '#' in text: |
| 248 match = HIDING_FILTER_REGEXP.search(text) | 242 match = HIDING_FILTER_REGEXP.search(text) |
| 249 if match: | 243 if match: |
| 250 return _parse_hiding_filter(text, *match.groups()) | 244 return _parse_hiding_filter(text, *match.groups()) |
| 251 return _parse_blocking_filter(text) | 245 return _parse_blocking_filter(text) |
| 252 | 246 |
| 253 | 247 |
| 254 def parse_line(line_text): | 248 def parse_line(line, position='body'): |
| 255 """Parse one line of a filter list. | 249 """Parse one line of a filter list. |
| 256 | 250 |
| 257 Note that parse_line() doesn't handle special comments, hence never returns | 251 The types of lines that the parser recognizes depend on the position. |
| 258 a Metadata() object, Adblock Plus only considers metadata when parsing the | 252 If position="body", the parser only recognizes filters, comments, |
| 259 whole filter list and only if they are given at the top of the filter list. | 253 processing instructions and empty lines. If position="metadata", it in |
| 254 addition recognizes metadata. If position="start", it also recognizes |
| 255 headers. |
| 256 |
| 257 Note: Checksum metadata lines are recognized in all positions for backwards |
| 258 compatibility. Historically, checksums can occur at the bottom of the |
| 259 filter list. They are no longer used by Adblock Plus, but in order to |
| 260 strip them (in abp.filters.renderer), we have to make sure to still parse |
| 261 them regardless of their position in the filter list. |
| 260 | 262 |
| 261 Parameters | 263 Parameters |
| 262 ---------- | 264 ---------- |
| 263 line_text : str | 265 line : str |
| 264 Line of a filter list. | 266 Line of a filter list. |
| 267 position : str |
| 268 Position in the filter list, one of "start", "metadata" or "body" |
| 269 (default is "body"). |
| 265 | 270 |
| 266 Returns | 271 Returns |
| 267 ------- | 272 ------- |
| 268 namedtuple | 273 namedtuple |
| 269 Parsed line (see `_line_type`). | 274 Parsed line (see `_line_type`). |
| 270 | 275 |
| 271 Raises | 276 Raises |
| 272 ------ | 277 ------ |
| 273 ParseError | 278 ParseError |
| 274 ParseError: If the line can't be parsed. | 279 ParseError: If the line can't be parsed. |
| 280 |
| 275 """ | 281 """ |
| 276 if isinstance(line_text, type(b'')): | 282 POSITIONS = {'body', 'start', 'metadata'} |
| 277 line_text = line_text.decode('utf-8') | 283 if position not in POSITIONS: |
| 284 raise ValueError('position should be one of {}'.format(POSITIONS)) |
| 278 | 285 |
| 279 content = line_text.strip() | 286 if isinstance(line, type(b'')): |
| 287 line = line.decode('utf-8') |
| 280 | 288 |
| 281 if content == '': | 289 stripped = line.strip() |
| 282 line = EmptyLine() | |
| 283 elif content.startswith('!'): | |
| 284 line = Comment(content[1:].lstrip()) | |
| 285 elif content.startswith('%') and content.endswith('%'): | |
| 286 line = _parse_instruction(content) | |
| 287 elif content.startswith('[') and content.endswith(']'): | |
| 288 line = _parse_header(content) | |
| 289 else: | |
| 290 line = parse_filter(content) | |
| 291 | 290 |
| 292 assert line.to_string().replace(' ', '') == content.replace(' ', '') | 291 if stripped == '': |
| 293 return line | 292 return EmptyLine() |
| 293 |
| 294 if position == 'start': |
| 295 match = HEADER_REGEXP.search(line) |
| 296 if match: |
| 297 version = match.group(1) |
| 298 if not version: |
| 299 raise ParseError('Malformed header', line) |
| 300 return Header(version) |
| 301 |
| 302 if stripped.startswith('!'): |
| 303 match = METADATA_REGEXP.match(line) |
| 304 if match: |
| 305 key, value = match.groups() |
| 306 if position != 'body' or key.lower() == 'checksum': |
| 307 return Metadata(key, value) |
| 308 return Comment(stripped[1:].lstrip()) |
| 309 |
| 310 if stripped.startswith('%') and stripped.endswith('%'): |
| 311 return _parse_instruction(stripped) |
| 312 |
| 313 return parse_filter(stripped) |
| 294 | 314 |
| 295 | 315 |
| 296 def parse_filterlist(lines): | 316 def parse_filterlist(lines): |
| 297 """Parse filter list from an iterable. | 317 """Parse filter list from an iterable. |
| 298 | 318 |
| 299 Parameters | 319 Parameters |
| 300 ---------- | 320 ---------- |
| 301 lines: iterable of str | 321 lines: iterable of str |
| 302 Lines of the filter list. | 322 Lines of the filter list. |
| 303 | 323 |
| 304 Returns | 324 Returns |
| 305 ------- | 325 ------- |
| 306 iterator of namedtuple | 326 iterator of namedtuple |
| 307 Parsed lines of the filter list. | 327 Parsed lines of the filter list. |
| 308 | 328 |
| 309 Raises | 329 Raises |
| 310 ------ | 330 ------ |
| 311 ParseError | 331 ParseError |
| 312 Thrown during iteration for invalid filter list lines. | 332 Thrown during iteration for invalid filter list lines. |
| 313 TypeError | 333 TypeError |
| 314 If `lines` is not iterable. | 334 If `lines` is not iterable. |
| 315 | 335 |
| 316 """ | 336 """ |
| 317 metadata_closed = False | 337 position = 'start' |
| 318 | 338 |
| 319 for line in lines: | 339 for line in lines: |
| 320 result = parse_line(line) | 340 parsed_line = parse_line(line, position) |
| 341 yield parsed_line |
| 321 | 342 |
| 322 if result.type == 'comment': | 343 if position != 'body' and parsed_line.type in {'header', 'metadata'}: |
| 323 match = METADATA_REGEXP.match(result.text) | 344 # Continue parsing metadata until it's over... |
| 324 if match: | 345 position = 'metadata' |
| 325 key, value = match.groups() | 346 else: |
| 326 | 347 # ...then switch to parsing the body. |
| 327 # Historically, checksums can occur at the bottom of the | 348 position = 'body' |
| 328 # filter list. Checksums are no longer used by Adblock Plus, | |
| 329 # but in order to strip them (in abp.filters.renderer), | |
| 330 # we have to make sure to still parse them regardless of | |
| 331 # their position in the filter list. | |
| 332 if not metadata_closed or key.lower() == 'checksum': | |
| 333 result = Metadata(key, value) | |
| 334 | |
| 335 if result.type not in {'header', 'metadata'}: | |
| 336 metadata_closed = True | |
| 337 | |
| 338 yield result | |
| OLD | NEW |