Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
133 | 133 |
134 | 134 |
135 Header = _line_type('Header', 'version', '[{.version}]') | 135 Header = _line_type('Header', 'version', '[{.version}]') |
136 EmptyLine = _line_type('EmptyLine', '', '') | 136 EmptyLine = _line_type('EmptyLine', '', '') |
137 Comment = _line_type('Comment', 'text', '! {.text}') | 137 Comment = _line_type('Comment', 'text', '! {.text}') |
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
140 Include = _line_type('Include', 'target', '%include {0.target}%') | 140 Include = _line_type('Include', 'target', '%include {0.target}%') |
141 | 141 |
142 | 142 |
143 METADATA_REGEXP = re.compile(r'!\s*(?:([\w-]+)|(?:\S.*?))\s*:\s*(.*)') | 143 METADATA_REGEXP = re.compile(r'([\w-]+)\s*:\s*(.*)') |
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
147 FILTER_OPTIONS_REGEXP = re.compile( | 147 FILTER_OPTIONS_REGEXP = re.compile( |
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' | 148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
149 ) | 149 ) |
150 | 150 |
151 | 151 |
152 def _parse_comment(text, metadata_closed): | |
153 match = METADATA_REGEXP.match(text) | |
154 if match: | |
155 key, value = match.groups() | |
156 | |
157 # Historically, checksums can occur at the bottom of the filter list. | |
158 # Checksums are no longer used by Adblock Plus, but in order to strip | |
159 # them (in abp.filters.renderer), we have to make sure to still parse | |
160 # them regardless of their position in the filter list. | |
161 if key and (not metadata_closed or key.lower() == 'checksum'): | |
162 return Metadata(key, value), metadata_closed | |
163 else: | |
164 # The regular expression matches as well if we see a malformed key | |
165 # (e.g. "Last modified"). In that case we want to keep looking for | |
166 # more metadata, but return a Comment instead of a Metadata object. | |
167 # Hence we only consider the metadata section closed if the | |
168 # regular expression doesn't match. | |
169 metadata_closed = True | |
170 return Comment(text[1:].lstrip()), metadata_closed | |
171 | |
172 | |
173 def _parse_header(text): | 152 def _parse_header(text): |
174 match = HEADER_REGEXP.match(text) | 153 match = HEADER_REGEXP.match(text) |
175 if not match: | 154 if not match: |
176 raise ParseError('Malformed header', text) | 155 raise ParseError('Malformed header', text) |
177 return Header(match.group(1)) | 156 return Header(match.group(1)) |
178 | 157 |
179 | 158 |
180 def _parse_instruction(text): | 159 def _parse_instruction(text): |
181 match = INCLUDE_REGEXP.match(text) | 160 match = INCLUDE_REGEXP.match(text) |
182 if not match: | 161 if not match: |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
265 Parsed filter. | 244 Parsed filter. |
266 | 245 |
267 """ | 246 """ |
268 if '#' in text: | 247 if '#' in text: |
269 match = HIDING_FILTER_REGEXP.search(text) | 248 match = HIDING_FILTER_REGEXP.search(text) |
270 if match: | 249 if match: |
271 return _parse_hiding_filter(text, *match.groups()) | 250 return _parse_hiding_filter(text, *match.groups()) |
272 return _parse_blocking_filter(text) | 251 return _parse_blocking_filter(text) |
273 | 252 |
274 | 253 |
275 def _parse_line(line_text, metadata_closed): | |
276 if isinstance(line_text, type(b'')): | |
277 line_text = line_text.decode('utf-8') | |
278 | |
279 content = line_text.strip() | |
280 | |
281 if content.startswith('!'): | |
282 line, metadata_closed = _parse_comment(content, metadata_closed) | |
283 elif content.startswith('[') and content.endswith(']'): | |
284 line = _parse_header(content) | |
285 else: | |
286 if content == '': | |
287 line = EmptyLine() | |
288 elif content.startswith('%') and content.endswith('%'): | |
289 line = _parse_instruction(content) | |
290 else: | |
291 line = parse_filter(content) | |
292 metadata_closed = True | |
293 | |
294 return line, metadata_closed | |
295 | |
296 | |
297 def parse_line(line_text): | 254 def parse_line(line_text): |
298 """Parse one line of a filter list. | 255 """Parse one line of a filter list. |
256 | |
257 Note that parse_line() doesn't handle special comments, hence never returns | |
| 258 a Metadata() object. Adblock Plus only considers metadata when parsing the |
259 whole filter list and only if they are given at the top of the filter list. | |
299 | 260 |
300 Parameters | 261 Parameters |
301 ---------- | 262 ---------- |
302 line_text : str | 263 line_text : str |
303 Line of a filter list. | 264 Line of a filter list. |
304 | 265 |
305 Returns | 266 Returns |
306 ------- | 267 ------- |
307 namedtuple | 268 namedtuple |
308 Parsed line (see `_line_type`). | 269 Parsed line (see `_line_type`). |
309 | 270 |
310 Raises | 271 Raises |
311 ------ | 272 ------ |
312 ParseError | 273 ParseError |
313 If the line can't be parsed. | 274 If the line can't be parsed. |
314 """ | 275 """ |
315 return _parse_line(line_text, True)[0] | 276 if isinstance(line_text, type(b'')): |
Vasily Kuznetsov
2018/09/04 10:03:30
Here we're changing the behavior of `parse_line` -
Sebastian Noack
2018/09/04 16:01:03
I moved the logic as requested to parse_filterlist
Vasily Kuznetsov
2018/09/04 16:49:50
What I had in mind was to keep _parse_comment() wi
| |
277 line_text = line_text.decode('utf-8') | |
278 | |
279 content = line_text.strip() | |
280 | |
281 if content == '': | |
282 line = EmptyLine() | |
283 elif content.startswith('!'): | |
284 line = Comment(content[1:].lstrip()) | |
285 elif content.startswith('%') and content.endswith('%'): | |
286 line = _parse_instruction(content) | |
287 elif content.startswith('[') and content.endswith(']'): | |
288 line = _parse_header(content) | |
289 else: | |
290 line = parse_filter(content) | |
291 | |
292 assert line.to_string().replace(' ', '') == content.replace(' ', '') | |
293 return line | |
294 | |
316 | 295 |
317 def parse_filterlist(lines): | 296 def parse_filterlist(lines): |
318 """Parse filter list from an iterable. | 297 """Parse filter list from an iterable. |
319 | 298 |
320 Parameters | 299 Parameters |
321 ---------- | 300 ---------- |
322 lines: iterable of str | 301 lines: iterable of str |
323 Lines of the filter list. | 302 Lines of the filter list. |
324 | 303 |
325 Returns | 304 Returns |
326 ------- | 305 ------- |
327 iterator of namedtuple | 306 iterator of namedtuple |
328 Parsed lines of the filter list. | 307 Parsed lines of the filter list. |
329 | 308 |
330 Raises | 309 Raises |
331 ------ | 310 ------ |
332 ParseError | 311 ParseError |
333 Thrown during iteration for invalid filter list lines. | 312 Thrown during iteration for invalid filter list lines. |
334 TypeError | 313 TypeError |
335 If `lines` is not iterable. | 314 If `lines` is not iterable. |
336 | 315 |
337 """ | 316 """ |
338 metadata_closed = False | 317 metadata_closed = False |
Vasily Kuznetsov
2018/09/04 10:03:30
What do you think about filter lists like this?
Sebastian Noack
2018/09/04 16:01:03
In practice nobody seems to put comments in betwee
Vasily Kuznetsov
2018/09/04 16:49:50
Well, "Last modified" is considered a comment and
| |
318 | |
339 for line in lines: | 319 for line in lines: |
340 parsed, metadata_closed = _parse_line(line, metadata_closed) | 320 result = parse_line(line) |
341 yield parsed | 321 |
322 if isinstance(result, Comment): | |
323 match = METADATA_REGEXP.match(result.text) | |
324 if match: | |
325 key, value = match.groups() | |
326 | |
327 # Historically, checksums can occur at the bottom of the | |
328 # filter list. Checksums are no longer used by Adblock Plus, | |
329 # but in order to strip them (in abp.filters.renderer), | |
330 # we have to make sure to still parse them regardless of | |
331 # their position in the filter list. | |
332 if not metadata_closed or key.lower() == 'checksum': | |
333 yield Metadata(key, value) | |
334 continue | |
335 | |
336 if not result.text: | |
337 metadata_closed = True | |
338 elif not isinstance(result, Header): | |
339 metadata_closed = True | |
340 | |
341 yield result | |
LEFT | RIGHT |