Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)
Patch Set: remove all interpretation and keep only parsing, add support for element hiding emulation filters, … Created July 27, 2017, 7:16 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>, 1 # This file is part of Adblock Plus <https://adblockplus.org/>,
2 # Copyright (C) 2006-2017 eyeo GmbH 2 # Copyright (C) 2006-2017 eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import re 18 import re
19 from collections import namedtuple 19 from collections import namedtuple
20 20
21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError'] 21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError', 'ST', 'FA']
22 22
23 23
class ParseError(Exception):
    """Exception thrown by the parser when it encounters invalid input.

    The exception message has the form ``<error> in "<text>"``; both parts
    are also stored as attributes of the exception.

    :param error: Description of the error.
    :param text: The text which was being parsed when an error occurred.
    """

    def __init__(self, error, text):
        Exception.__init__(self, '{} in "{}"'.format(error, text))
        self.text = text
        self.error = error
35 35
36 36
# Constants related to filters (see https://adblockplus.org/filters).
# NOTE(review): the abbreviated class names (ST here, FA for filter actions)
# were questioned during review; a rename has to be coordinated with every
# user of these names.
class ST:
    """Selector types."""

    URL_PATTERN = 'url-pattern'  # Normal URL patterns.
    URL_REGEXP = 'url-regexp'    # Regular expressions for URLs.
    CSS = 'css'                  # CSS selectors for hiding filters.
    XCSS = 'extended-css'        # Extended CSS selectors (to emulate CSS4).
    ABP_SIMPLE = 'abp-simple'    # Simplified element hiding syntax.
45
46
class FA:
    """Filter actions."""

    BLOCK = 'block'  # Block the request.
    ALLOW = 'allow'  # Allow the request (whitelist).
    HIDE = 'hide'    # Hide selected element(s).
    SHOW = 'show'    # Show selected element(s) (whitelist).
53
54
def _line_type(name, field_names, format_string):
    """Define a line type.

    :param name: The name of the line type to define.
    :param field_names: A sequence of field names or one space-separated
                        string that contains all field names.
    :param format_string: A format specifier for converting this line type
                          back to string representation. It is formatted
                          with the line object as its single positional
                          argument.
    :returns: Class created with `namedtuple` that has `.type` set to
              lowercased `name` and supports conversion back to string with
              `.to_string()` method.
    """
    def to_string(self):
        # The instance itself is the only format argument, so format
        # strings address fields as `{.field}` or `{0.field}`.
        return format_string.format(self)

    lt = namedtuple(name, field_names)
    lt.type = name.lower()
    lt.to_string = to_string
    return lt
53 71
54 72
# Line types produced by the parser. Each is a namedtuple class built by
# `_line_type`; the last argument is the template used by `.to_string()`.
Header = _line_type('Header', 'version', '[{.version}]')
EmptyLine = _line_type('EmptyLine', '', '')
Comment = _line_type('Comment', 'text', '! {.text}')
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')
# A filter keeps its original text (used for the string representation)
# next to the parsed selector, action and options.
Filter = _line_type('Filter', 'text selector action options', '{.text}')
Include = _line_type('Include', 'target', '%include {0.target}%')
61 79
62 80
# `! Key: value` comment lines; only keys listed in METADATA_KEYS are
# treated as metadata by the comment parser.
METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')
METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',
                 'Version'}
# `%include <target>%` instructions.
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')
# `[Adblock]` / `[Adblock Plus <version>]` headers (case-insensitive).
HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)
# Element hiding filters: `<domains>#<flag>#<selector>` where the optional
# flag is `@` or `?`. Groups: domains, flag, selector.
# NOTE(review): the abbreviated HFILTER_/BFILTER_ prefixes were questioned
# during review; a rename has to be coordinated with every user.
HFILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
# Blocking filters whose pattern is itself a regular expression
# (`/.../`), optionally preceded by `@@` and followed by `$options`.
# NOTE(review): the doubled `_REGEXP_REGEXP` suffix was queried in review.
BFILTER_REGEXP_REGEXP = re.compile(
    r'^(@@)?\/.*\/(?:\$~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)?$'
)
# Trailing `$option[,option...]` part of a blocking filter; group 1 is the
# comma-separated option list without the leading `$`.
BFILTER_OPTIONS_REGEXP = re.compile(
    r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'
)
68 93
69 94
def _parse_comment(text):
    """Parse a line starting with `!` into `Metadata` or `Comment`.

    :param text: Stripped line text, including the leading `!`.
    :returns: `Metadata` when the line looks like `! Key: value` and the key
              is one of `METADATA_KEYS`, otherwise `Comment` with the text
              after the `!` stripped of surrounding whitespace.
    """
    match = METADATA_REGEXP.match(text)
    if match and match.group(1) in METADATA_KEYS:
        return Metadata(match.group(1), match.group(2))
    return Comment(text[1:].strip())
75 100
76 101
def _parse_header(text):
    """Parse a `[...]` header line.

    :param text: Stripped line text, including the square brackets.
    :returns: `Header` holding the text matched inside the brackets.
    :raises ParseError: If the text doesn't match `HEADER_REGEXP`.
    """
    match = HEADER_REGEXP.match(text)
    if not match:
        raise ParseError('Malformed header', text)
    return Header(match.group(1))
82 107
83 108
def _parse_instruction(text):
    """Parse a `%...%` instruction line.

    Only `%include <target>%` is recognized.

    :param text: Stripped line text, including the surrounding `%`.
    :returns: `Include` holding the include target.
    :raises ParseError: If the text doesn't match `INCLUDE_REGEXP`.
    """
    match = INCLUDE_REGEXP.match(text)
    if not match:
        raise ParseError('Unrecognized instruction', text)
    return Include(match.group(1))
89 114
90 115
def _parse_option(option):
    """Parse one filter option into a `(name, value)` pair.

    - `name=value` gives `(name, value)` with `value` as a string;
    - `~name` gives `(name, False)`;
    - `name` gives `(name, True)`.

    Two multivalued options get special treatment when they carry a value:
    `domain=a|~b` becomes `('domains', [('a', True), ('b', False)])` and
    `sitekey=x|y` becomes `('sitekeys', ['x', 'y'])`. Without a value
    (`domain`, `~sitekey`, ...) they are returned like any other flag, since
    there is nothing to split.

    :param option: Text of a single option (no separators).
    :returns: `(name, value)` tuple.
    """
    # NOTE(review): reviewers suggested enumerating the recognized option
    # names and questioned the plural `domains` / `sitekeys` keys.
    if '=' not in option:
        if option.startswith('~'):
            return option[1:], False
        return option, True

    name, value = option.split('=', 1)

    # Handle special cases of multivalued options. This is done only for
    # the `name=value` form: a bare flag has a boolean value that can't be
    # split.
    if name == 'domain':
        return 'domains', _parse_options(value, '|')
    if name == 'sitekey':
        return 'sitekeys', value.split('|')

    return name, value


def _parse_options(options, separator=','):
    """Parse a separated list of options.

    :param options: Option list as one string.
    :param separator: String that separates the individual options.
    :returns: List of `(name, value)` tuples, see `_parse_option`.
    """
    return [_parse_option(o) for o in options.split(separator)]
135
136
def _parse_blocking_filter(text):
    """Parse a blocking or whitelisting (`@@`) filter.

    :param text: Text representation of the filter.
    :returns: `Filter` whose selector is a dict with `type`
              (`ST.URL_REGEXP` or `ST.URL_PATTERN`) and `value`, whose
              action is `FA.BLOCK` or `FA.ALLOW` and whose options is a list
              of `(name, value)` tuples.
    """
    # Based on RegExpFilter.fromText in lib/filterClasses.js
    # in https://hg.adblockplus.org/adblockpluscore.
    action = FA.BLOCK
    options = []
    selector = text

    if selector.startswith('@@'):
        action = FA.ALLOW
        selector = selector[2:]

    # Cheap substring test first; the regexp only runs when options are
    # possible at all.
    if '$' in selector:
        opt_match = BFILTER_OPTIONS_REGEXP.search(selector)
        if opt_match:
            # Cut the `$options` tail off the selector.
            selector = selector[:opt_match.start(0)]
            options = _parse_options(opt_match.group(1))

    # `/.../` denotes a regular expression; a lone `/` is a plain pattern.
    if (len(selector) > 1 and
            selector.startswith('/') and selector.endswith('/')):
        selector = {'type': ST.URL_REGEXP, 'value': selector[1:-1]}
    else:
        selector = {'type': ST.URL_PATTERN, 'value': selector}

    return Filter(text, selector, action, options)
161
162
def _parse_hiding_filter(text, domains, type_flag, selector_value):
    """Build a `Filter` for an element hiding filter.

    The last three arguments are the groups of `HFILTER_REGEXP`.

    :param text: Text representation of the whole filter.
    :param domains: Comma-separated domain list before the first `#`
                    (may be empty).
    :param type_flag: `'@'` for an exception (action `FA.SHOW`), `'?'` for
                      an extended CSS selector (`ST.XCSS`), otherwise a
                      plain hiding filter.
    :param selector_value: The selector after the second `#`.
    :returns: `Filter` with a `{'type': ..., 'value': ...}` selector and,
              when domains are given, a single `('domains', [...])` option.
    """
    selector = {'type': ST.CSS, 'value': selector_value}
    action = FA.HIDE
    options = []

    if type_flag == '@':
        action = FA.SHOW
    elif type_flag == '?':
        selector['type'] = ST.XCSS

    if domains:
        # Domains go through the generic option parser, so `~domain`
        # yields `(domain, False)` and `domain` yields `(domain, True)`.
        options.append(('domains', _parse_options(domains)))

    return Filter(text, selector, action, options)
177
178
def parse_filter(text):
    """Parse one filter.

    Text that matches `HFILTER_REGEXP` is parsed as an element hiding
    filter; everything else is parsed as a blocking filter.

    :param text: Text representation of a filter.
    :returns: `Filter` object.
    """
    # Cheap substring test first: hiding filters always contain `#`.
    if '#' in text:
        match = HFILTER_REGEXP.search(text)
        if match:
            return _parse_hiding_filter(text, *match.groups())
    return _parse_blocking_filter(text)
190
191
def _without_whitespace(text):
    """Return `text` with all whitespace characters removed."""
    return ''.join(text.split())


def parse_line(line_text):
    """Parse one line of a filter list.

    :param line_text: Line of a filter list (must be a unicode string).
    :returns: Parsed line object (see `_line_type`).
    :raises ParseError: If the line can't be successfully parsed.
    """
    content = line_text.strip()

    if content == '':
        line = EmptyLine()
    elif content.startswith('!'):
        line = _parse_comment(content)
    elif content.startswith('%') and content.endswith('%'):
        line = _parse_instruction(content)
    elif content.startswith('[') and content.endswith(']'):
        line = _parse_header(content)
    else:
        line = parse_filter(content)

    # Round-trip check: the parsed line must reproduce the input up to
    # whitespace. The sub-parsers match from the start of the line only, so
    # this is what catches unparsed trailing content (e.g. `[Adblock] x]`).
    # It is an explicit check rather than an `assert` so that it still runs
    # under `python -O` and reports the documented exception type.
    if _without_whitespace(line.to_string()) != _without_whitespace(content):
        raise ParseError('Malformed line', content)
    return line
113 214
114 215
def parse_filterlist(lines):
    """Parse filter list from an iterable.

    Parsing is lazy: each line is parsed when the returned iterator reaches
    it.

    :param lines: List of strings or file or other iterable.
    :returns: Iterator over parsed lines.
    :raises ParseError: Can be thrown during iteration for invalid lines.
    """
    for line in lines:
        yield parse_line(line)
OLD | NEW
« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld