| Left: | ||
| Right: |
| LEFT | RIGHT |
|---|---|
| 1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
| 2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| (...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 133 | 133 |
| 134 | 134 |
| 135 Header = _line_type('Header', 'version', '[{.version}]') | 135 Header = _line_type('Header', 'version', '[{.version}]') |
| 136 EmptyLine = _line_type('EmptyLine', '', '') | 136 EmptyLine = _line_type('EmptyLine', '', '') |
| 137 Comment = _line_type('Comment', 'text', '! {.text}') | 137 Comment = _line_type('Comment', 'text', '! {.text}') |
| 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
| 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
| 140 Include = _line_type('Include', 'target', '%include {0.target}%') | 140 Include = _line_type('Include', 'target', '%include {0.target}%') |
| 141 | 141 |
| 142 | 142 |
| 143 METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)') | 143 METADATA_REGEXP = re.compile(r'([\w-]+)\s*:\s*(.*)') |
| 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
| 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
| 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
| 147 FILTER_OPTIONS_REGEXP = re.compile( | 147 FILTER_OPTIONS_REGEXP = re.compile( |
| 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' | 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
| 149 ) | 149 ) |
| 150 | 150 |
| 151 | 151 |
| 152 def _parse_comment(text, metadata_closed): | |
| 153 match = METADATA_REGEXP.match(text) | |
| 154 if match: | |
| 155 key, value = match.groups() | |
| 156 | |
| 157 # Historically, checksums can occur at the bottom of the filter list. | |
| 158 # Checksums are no longer used by Adblock Plus, but in order to strip | |
| 159 # them (in abp.filters.renderer), we have to make sure to still parse | |
| 160 # them regardless of their position in the filter list. | |
| 161 if key and (not metadata_closed or key.lower() == 'checksum'): | |
| 162 return Metadata(key, value), metadata_closed | |
| 163 else: | |
| 164 # The regular expression matches as well if we see a malformed key | |
| 165 # (e.g. "Last modified"). In that case we want to keep looking for | |
| 166 # more metadata, but return a Comment instead of a Metadata object. | |
| 167 # Hence we only consider the metadata section closed if the | |
| 168 # regular expression doesn't match. | |
| 169 metadata_closed = True | |
| 170 return Comment(text[1:].lstrip()), metadata_closed | |
| 171 | |
| 172 | |
| 173 def _parse_header(text): | 152 def _parse_header(text): |
| 174 match = HEADER_REGEXP.match(text) | 153 match = HEADER_REGEXP.match(text) |
| 175 if not match: | 154 if not match: |
| 176 raise ParseError('Malformed header', text) | 155 raise ParseError('Malformed header', text) |
| 177 return Header(match.group(1)) | 156 return Header(match.group(1)) |
| 178 | 157 |
| 179 | 158 |
| 180 def _parse_instruction(text): | 159 def _parse_instruction(text): |
| 181 match = INCLUDE_REGEXP.match(text) | 160 match = INCLUDE_REGEXP.match(text) |
| 182 if not match: | 161 if not match: |
| (...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 265 Parsed filter. | 244 Parsed filter. |
| 266 | 245 |
| 267 """ | 246 """ |
| 268 if '#' in text: | 247 if '#' in text: |
| 269 match = HIDING_FILTER_REGEXP.search(text) | 248 match = HIDING_FILTER_REGEXP.search(text) |
| 270 if match: | 249 if match: |
| 271 return _parse_hiding_filter(text, *match.groups()) | 250 return _parse_hiding_filter(text, *match.groups()) |
| 272 return _parse_blocking_filter(text) | 251 return _parse_blocking_filter(text) |
| 273 | 252 |
| 274 | 253 |
| 275 def _parse_line(line_text, metadata_closed): | |
| 276 if isinstance(line_text, type(b'')): | |
| 277 line_text = line_text.decode('utf-8') | |
| 278 | |
| 279 content = line_text.strip() | |
| 280 | |
| 281 if content.startswith('!'): | |
| 282 line, metadata_closed = _parse_comment(content, metadata_closed) | |
| 283 elif content.startswith('[') and content.endswith(']'): | |
| 284 line = _parse_header(content) | |
| 285 else: | |
| 286 if content == '': | |
| 287 line = EmptyLine() | |
| 288 elif content.startswith('%') and content.endswith('%'): | |
| 289 line = _parse_instruction(content) | |
| 290 else: | |
| 291 line = parse_filter(content) | |
| 292 metadata_closed = True | |
| 293 | |
| 294 return line, metadata_closed | |
| 295 | |
| 296 | |
| 297 def parse_line(line_text): | 254 def parse_line(line_text): |
| 298 """Parse one line of a filter list. | 255 """Parse one line of a filter list. |
| 256 | |
| 257 Note that parse_line() doesn't handle special comments, hence never returns | |
| 258 a Metadata() object. Adblock Plus only considers metadata when parsing the | |
| 259 whole filter list, and only if it is given at the top of the filter list. | |
| 299 | 260 |
| 300 Parameters | 261 Parameters |
| 301 ---------- | 262 ---------- |
| 302 line_text : str | 263 line_text : str |
| 303 Line of a filter list. | 264 Line of a filter list. |
| 304 | 265 |
| 305 Returns | 266 Returns |
| 306 ------- | 267 ------- |
| 307 namedtuple | 268 namedtuple |
| 308 Parsed line (see `_line_type`). | 269 Parsed line (see `_line_type`). |
| 309 | 270 |
| 310 Raises | 271 Raises |
| 311 ------ | 272 ------ |
| 312 ParseError | 273 ParseError |
| 313 ParseError: If the line can't be parsed. | 274 ParseError: If the line can't be parsed. |
| 314 """ | 275 """ |
| 315 return _parse_line(line_text, True)[0] | 276 if isinstance(line_text, type(b'')): |
|
Vasily Kuznetsov
2018/09/04 10:03:30
Here we're changing the behavior of `parse_line` -
Sebastian Noack
2018/09/04 16:01:03
I moved the logic as requested to parse_filterlist
Vasily Kuznetsov
2018/09/04 16:49:50
What I had in mind was to keep _parse_comment() wi
| |
| 277 line_text = line_text.decode('utf-8') | |
| 278 | |
| 279 content = line_text.strip() | |
| 280 | |
| 281 if content == '': | |
| 282 line = EmptyLine() | |
| 283 elif content.startswith('!'): | |
| 284 line = Comment(content[1:].lstrip()) | |
| 285 elif content.startswith('%') and content.endswith('%'): | |
| 286 line = _parse_instruction(content) | |
| 287 elif content.startswith('[') and content.endswith(']'): | |
| 288 line = _parse_header(content) | |
| 289 else: | |
| 290 line = parse_filter(content) | |
| 291 | |
| 292 assert line.to_string().replace(' ', '') == content.replace(' ', '') | |
| 293 return line | |
| 294 | |
| 316 | 295 |
| 317 def parse_filterlist(lines): | 296 def parse_filterlist(lines): |
| 318 """Parse filter list from an iterable. | 297 """Parse filter list from an iterable. |
| 319 | 298 |
| 320 Parameters | 299 Parameters |
| 321 ---------- | 300 ---------- |
| 322 lines: iterable of str | 301 lines: iterable of str |
| 323 Lines of the filter list. | 302 Lines of the filter list. |
| 324 | 303 |
| 325 Returns | 304 Returns |
| 326 ------- | 305 ------- |
| 327 iterator of namedtuple | 306 iterator of namedtuple |
| 328 Parsed lines of the filter list. | 307 Parsed lines of the filter list. |
| 329 | 308 |
| 330 Raises | 309 Raises |
| 331 ------ | 310 ------ |
| 332 ParseError | 311 ParseError |
| 333 Thrown during iteration for invalid filter list lines. | 312 Thrown during iteration for invalid filter list lines. |
| 334 TypeError | 313 TypeError |
| 335 If `lines` is not iterable. | 314 If `lines` is not iterable. |
| 336 | 315 |
| 337 """ | 316 """ |
| 338 metadata_closed = False | 317 metadata_closed = False |
|
Vasily Kuznetsov
2018/09/04 10:03:30
What do you think about filter lists like this?
Sebastian Noack
2018/09/04 16:01:03
In practice nobody seems to put comments in betwee
Vasily Kuznetsov
2018/09/04 16:49:50
Well, "Last modified" is considered a comment and
| |
| 318 | |
| 339 for line in lines: | 319 for line in lines: |
| 340 parsed, metadata_closed = _parse_line(line, metadata_closed) | 320 result = parse_line(line) |
| 341 yield parsed | 321 |
| 322 if isinstance(result, Comment): | |
| 323 match = METADATA_REGEXP.match(result.text) | |
| 324 if match: | |
| 325 key, value = match.groups() | |
| 326 | |
| 327 # Historically, checksums can occur at the bottom of the | |
| 328 # filter list. Checksums are no longer used by Adblock Plus, | |
| 329 # but in order to strip them (in abp.filters.renderer), | |
| 330 # we have to make sure to still parse them regardless of | |
| 331 # their position in the filter list. | |
| 332 if not metadata_closed or key.lower() == 'checksum': | |
| 333 yield Metadata(key, value) | |
| 334 continue | |
| 335 | |
| 336 if not result.text: | |
| 337 metadata_closed = True | |
| 338 elif not isinstance(result, Header): | |
| 339 metadata_closed = True | |
| 340 | |
| 341 yield result | |
| LEFT | RIGHT |