Left: | ||
Right: |
OLD | NEW |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
133 | 133 |
134 | 134 |
135 Header = _line_type('Header', 'version', '[{.version}]') | 135 Header = _line_type('Header', 'version', '[{.version}]') |
136 EmptyLine = _line_type('EmptyLine', '', '') | 136 EmptyLine = _line_type('EmptyLine', '', '') |
137 Comment = _line_type('Comment', 'text', '! {.text}') | 137 Comment = _line_type('Comment', 'text', '! {.text}') |
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
140 Include = _line_type('Include', 'target', '%include {0.target}%') | 140 Include = _line_type('Include', 'target', '%include {0.target}%') |
141 | 141 |
142 | 142 |
143 METADATA_REGEXP = re.compile(r'!\s*([\w-]+)\s*:(?!//)\s*(.*)') | 143 METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)') |
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
147 FILTER_OPTIONS_REGEXP = re.compile( | 147 FILTER_OPTIONS_REGEXP = re.compile( |
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' | 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
149 ) | 149 ) |
150 | 150 |
151 | 151 |
152 def _parse_comment(text): | 152 def _parse_comment(text, metadata_closed): |
153 match = METADATA_REGEXP.match(text) | 153 match = METADATA_REGEXP.match(text) |
154 if match: | 154 if match: |
155 return Metadata(match.group(1), match.group(2)) | 155 key, value = match.groups() |
156 return Comment(text[1:].strip()) | 156 |
157 # Historically, checksums can occur at the bottom of the filter list. | |
158 # Checksums are no longer used by Adblock Plus, but in order to strip | |
159 # them (in abp.filters.renderer), we have to make sure to still parse | |
160 # them regardless of their position in the filter list. | |
161 if key and (not metadata_closed or key.lower() == 'checksum'): | |
162 return Metadata(key, value), metadata_closed | |
163 else: | |
164 # The regular expression matches as well if we see a malformed key | |
165 # (e.g. "Last modified"). In that case we want to keep looking for | |
166 # more metadata, but return a Comment instead of a Metadata object. | |
167 # Hence we only consider the metadata section closed if the | |
168 # regular expression doesn't match. | |
169 metadata_closed = True | |
170 return Comment(text[1:].lstrip()), metadata_closed | |
157 | 171 |
158 | 172 |
159 def _parse_header(text): | 173 def _parse_header(text): |
160 match = HEADER_REGEXP.match(text) | 174 match = HEADER_REGEXP.match(text) |
161 if not match: | 175 if not match: |
162 raise ParseError('Malformed header', text) | 176 raise ParseError('Malformed header', text) |
163 return Header(match.group(1)) | 177 return Header(match.group(1)) |
164 | 178 |
165 | 179 |
166 def _parse_instruction(text): | 180 def _parse_instruction(text): |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
251 Parsed filter. | 265 Parsed filter. |
252 | 266 |
253 """ | 267 """ |
254 if '#' in text: | 268 if '#' in text: |
255 match = HIDING_FILTER_REGEXP.search(text) | 269 match = HIDING_FILTER_REGEXP.search(text) |
256 if match: | 270 if match: |
257 return _parse_hiding_filter(text, *match.groups()) | 271 return _parse_hiding_filter(text, *match.groups()) |
258 return _parse_blocking_filter(text) | 272 return _parse_blocking_filter(text) |
259 | 273 |
260 | 274 |
275 def _parse_line(line_text, metadata_closed): | |
276 if isinstance(line_text, type(b'')): | |
277 line_text = line_text.decode('utf-8') | |
278 | |
279 content = line_text.strip() | |
280 | |
281 if content.startswith('!'): | |
282 line, metadata_closed = _parse_comment(content, metadata_closed) | |
283 elif content.startswith('[') and content.endswith(']'): | |
284 line = _parse_header(content) | |
285 else: | |
286 if content == '': | |
287 line = EmptyLine() | |
288 elif content.startswith('%') and content.endswith('%'): | |
289 line = _parse_instruction(content) | |
290 else: | |
291 line = parse_filter(content) | |
292 metadata_closed = True | |
293 | |
294 return line, metadata_closed | |
295 | |
296 | |
261 def parse_line(line_text): | 297 def parse_line(line_text): |
262 """Parse one line of a filter list. | 298 """Parse one line of a filter list. |
263 | 299 |
264 Parameters | 300 Parameters |
265 ---------- | 301 ---------- |
266 line_text : str | 302 line_text : str |
267 Line of a filter list. | 303 Line of a filter list. |
268 | 304 |
269 Returns | 305 Returns |
270 ------- | 306 ------- |
271 namedtuple | 307 namedtuple |
272 Parsed line (see `_line_type`). | 308 Parsed line (see `_line_type`). |
273 | 309 |
274 Raises | 310 Raises |
275 ------ | 311 ------ |
276 ParseError | 312 ParseError |
277 ParseError: If the line can't be parsed. | 313 ParseError: If the line can't be parsed. |
278 """ | 314 """ |
279 if isinstance(line_text, type(b'')): | 315 return _parse_line(line_text, True)[0] |
Vasily Kuznetsov
2018/09/04 10:03:30
Here we're changing the behavior of `parse_line` -
Sebastian Noack
2018/09/04 16:01:03
I moved the logic as requested to parse_filterlist
Vasily Kuznetsov
2018/09/04 16:49:50
What I had in mind was to keep _parse_comment() wi
| |
280 line_text = line_text.decode('utf-8') | |
281 | |
282 content = line_text.strip() | |
283 | |
284 if content == '': | |
285 line = EmptyLine() | |
286 elif content.startswith('!'): | |
287 line = _parse_comment(content) | |
288 elif content.startswith('%') and content.endswith('%'): | |
289 line = _parse_instruction(content) | |
290 elif content.startswith('[') and content.endswith(']'): | |
291 line = _parse_header(content) | |
292 else: | |
293 line = parse_filter(content) | |
294 | |
295 assert line.to_string().replace(' ', '') == content.replace(' ', '') | |
296 return line | |
297 | |
298 | 316 |
299 def parse_filterlist(lines): | 317 def parse_filterlist(lines): |
300 """Parse filter list from an iterable. | 318 """Parse filter list from an iterable. |
301 | 319 |
302 Parameters | 320 Parameters |
303 ---------- | 321 ---------- |
304 lines: iterable of str | 322 lines: iterable of str |
305 Lines of the filter list. | 323 Lines of the filter list. |
306 | 324 |
307 Returns | 325 Returns |
308 ------- | 326 ------- |
309 iterator of namedtuple | 327 iterator of namedtuple |
310 Parsed lines of the filter list. | 328 Parsed lines of the filter list. |
311 | 329 |
312 Raises | 330 Raises |
313 ------ | 331 ------ |
314 ParseError | 332 ParseError |
315 Thrown during iteration for invalid filter list lines. | 333 Thrown during iteration for invalid filter list lines. |
316 TypeError | 334 TypeError |
317 If `lines` is not iterable. | 335 If `lines` is not iterable. |
318 | 336 |
319 """ | 337 """ |
338 metadata_closed = False | |
Vasily Kuznetsov
2018/09/04 10:03:30
What do you think about filter lists like this?
Sebastian Noack
2018/09/04 16:01:03
In practice nobody seems to put comments in betwee
Vasily Kuznetsov
2018/09/04 16:49:50
Well, "Last modified" is considered a comment and
| |
320 for line in lines: | 339 for line in lines: |
321 yield parse_line(line) | 340 parsed, metadata_closed = _parse_line(line, metadata_closed) |
341 yield parsed | |
OLD | NEW |