abp/filters/parser.py - Issue 29465720: Issue 4970 - Document the library API of python-abp

Delta Between Two Patch Sets: abp/filters/parser.py

Issue 29465720: Issue 4970 - Document the library API of python-abp (Closed)

Left Patch Set: Update README to match the changes from https://codereview.adblockplus.org/29465715/ Created Aug. 7, 2017, 8:28 p.m.

Right Patch Set: Rebase to match the new master and retouch the docstrings. Created Oct. 24, 2017, 4:06 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Right: Side by side diff | Download

LEFT	RIGHT
(no file at all)
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

	16 """Parser for ABP filterlist format."""

	17

16 from __future__ import unicode_literals	18 from __future__ import unicode_literals

17	19

18 import re	20 import re

19 from collections import namedtuple	21 from collections import namedtuple

20	22

21 __all__ = [	23 __all__ = [

22 'FILTER_ACTION',	24 'FILTER_ACTION',

23 'FILTER_OPTION',	25 'FILTER_OPTION',

	26 'SELECTOR_TYPE',

24 'ParseError',	27 'ParseError',

25 'SELECTOR_TYPE',

26 'parse_filterlist',	28 'parse_filterlist',

27 'parse_line',	29 'parse_line',

28 ]	30 ]

29	31

30	32

31 class ParseError(Exception):	33 class ParseError(Exception):

32 """Exception thrown by the parser when it encounters invalid input.	34 """Exception thrown by the parser when it encounters invalid input.

33	35

34 :param error: Description of the error.	36 Parameters

35 :param text: The text which was being parsed when an error occurred.	37 ----------

	38 error : str

	39 Description of the error.

	40 text : str

	41 The source text that caused an error.

	42
	Vasily Kuznetsov 2017/10/24 16:11:00 pep8-docstrings demands an empty line at the end of numpy-style docstrings, although it seems somewhat inconsistently. Anyway, they are more readable this way.
36 """	43 """

37	44

38 def __init__(self, error, text):	45 def __init__(self, error, text):

39 Exception.__init__(self, '{} in "{}"'.format(error, text))	46 Exception.__init__(self, '{} in "{}"'.format(error, text))

40 self.text = text	47 self.text = text

41 self.error = error	48 self.error = error

42	49

43	50

44 # Constants related to filters (see https://adblockplus.org/filters).	51 # Constants related to filters (see https://adblockplus.org/filters).

45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).	52 class SELECTOR_TYPE: # flake8: noqa (this is a namespace of constants).

46 """Selector types"""	53 """Selector type constants."""

47 URL_PATTERN = 'url-pattern' # Normal URL patterns.	54 URL_PATTERN = 'url-pattern' # Normal URL patterns.

48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.	55 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.

49 CSS = 'css' # CSS selectors for hiding filters.	56 CSS = 'css' # CSS selectors for hiding filters.

50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).	57 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).

51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.	58 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.

52	59

53	60

54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).	61 class FILTER_ACTION: # flake8: noqa (this is a namespace of constants).

55 """Filter actions"""	62 """Filter action constants."""

56 BLOCK = 'block' # Block the request.	63 BLOCK = 'block' # Block the request.

57 ALLOW = 'allow' # Allow the request (whitelist).	64 ALLOW = 'allow' # Allow the request (whitelist).

58 HIDE = 'hide' # Hide selected element(s).	65 HIDE = 'hide' # Hide selected element(s).

59 SHOW = 'show' # Show selected element(s) (whitelist).	66 SHOW = 'show' # Show selected element(s) (whitelist).

60	67

61	68

62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).	69 class FILTER_OPTION: # flake8: noqa (this is a namespace of constants).

63 """Filter options"""	70 """Filter option constants."""

64 # Resource types.	71 # Resource types.

65 OTHER = 'other'	72 OTHER = 'other'

66 SCRIPT = 'script'	73 SCRIPT = 'script'

67 IMAGE = 'image'	74 IMAGE = 'image'

68 STYLESHEET = 'stylesheet'	75 STYLESHEET = 'stylesheet'

69 OBJECT = 'object'	76 OBJECT = 'object'

70 SUBDOCUMENT = 'subdocument'	77 SUBDOCUMENT = 'subdocument'

71 DOCUMENT = 'document'	78 DOCUMENT = 'document'

72 WEBSOCKET = 'websocket'	79 WEBSOCKET = 'websocket'

73 WEBRTC = 'webrtc'	80 WEBRTC = 'webrtc'

(...skipping 17 matching lines...) Expand all Loading...
91 DOMAIN = 'domain'	98 DOMAIN = 'domain'

92 THIRD_PARTY = 'third-party'	99 THIRD_PARTY = 'third-party'

93 COLLAPSE = 'collapse'	100 COLLAPSE = 'collapse'

94 SITEKEY = 'sitekey'	101 SITEKEY = 'sitekey'

95 DONOTTRACK = 'donottrack'	102 DONOTTRACK = 'donottrack'

96	103

97	104

98 def _line_type(name, field_names, format_string):	105 def _line_type(name, field_names, format_string):

99 """Define a line type.	106 """Define a line type.

100	107

101 :param name: The name of the line type to define.	108 Parameters

102 :param field_names: A sequence of field names or one space-separated	109 ----------

103 string that contains all field names.	110 name: str

104 :param format_string: A format specifier for converting this line type	111 The name of the line type to define.

105 back to string representation.	112 field_names: str or list

106 :returns: Class created with `namedtuple` that has `.type` set to	113 A sequence of field names or one space-separated string that contains

107 lowercased `name` and supports conversion back to string with	114 all field names.

108 `.to_string()` method.	115 format_string: str

	116 A format specifier for converting this line type back to string

	117 representation.

	118

	119 Returns

	120 -------

	121 class

	122 Class created with `namedtuple` that has `.type` set to lowercased

	123 `name` and supports conversion back to string with `.to_string()`

	124 method.

	125

109 """	126 """

110 lt = namedtuple(name, field_names)	127 lt = namedtuple(name, field_names)

111 lt.type = name.lower()	128 lt.type = name.lower()

112 lt.to_string = lambda self: format_string.format(self)	129 lt.to_string = lambda self: format_string.format(self)

113 return lt	130 return lt

114	131

115	132

116 Header = _line_type('Header', 'version', '[{.version}]')	133 Header = _line_type('Header', 'version', '[{.version}]')

117 EmptyLine = _line_type('EmptyLine', '', '')	134 EmptyLine = _line_type('EmptyLine', '', '')

118 Comment = _line_type('Comment', 'text', '! {.text}')	135 Comment = _line_type('Comment', 'text', '! {.text}')

(...skipping 97 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
216 if domain:	233 if domain:

217 domains = [_parse_option(d) for d in domain.split(',')]	234 domains = [_parse_option(d) for d in domain.split(',')]

218 options.append((FILTER_OPTION.DOMAIN, domains))	235 options.append((FILTER_OPTION.DOMAIN, domains))

219	236

220 return Filter(text, selector, action, options)	237 return Filter(text, selector, action, options)

221	238

222	239

223 def parse_filter(text):	240 def parse_filter(text):

224 """Parse one filter.	241 """Parse one filter.

225	242

226 :param text: Text representation of a filter.	243 Parameters

227 :returns: Filter object.	244 ----------

	245 text : str

	246 Filter to parse in ABP filter list syntax.

	247

	248 Returns

	249 -------

	250 namedtuple

	251 Parsed filter.

	252

228 """	253 """

229 if '#' in text:	254 if '#' in text:

230 match = HIDING_FILTER_REGEXP.search(text)	255 match = HIDING_FILTER_REGEXP.search(text)

231 if match:	256 if match:

232 return _parse_hiding_filter(text, *match.groups())	257 return _parse_hiding_filter(text, *match.groups())

233 return _parse_blocking_filter(text)	258 return _parse_blocking_filter(text)

234	259

235	260

236 def parse_line(line_text):	261 def parse_line(line_text):

237 """Parse one line of a filter list.	262 """Parse one line of a filter list.

238	263

239 :param line_text: Line of a filter list (must be a unicode string).	264 Parameters

240 :returns: Parsed line object (see `_line_type`).	265 ----------

241 :raises ParseError: If the line can't be successfully parsed.	266 line_text : str

	267 Line of a filter list.

	268

	269 Returns

	270 -------

	271 namedtuple

	272 Parsed line (see `_line_type`).

	273

	274 Raises

	275 ------

	276 ParseError

	277 ParseError: If the line can't be parsed.

242 """	278 """

243 content = line_text.strip()	279 content = line_text.strip()

244	280

245 if content == '':	281 if content == '':

246 line = EmptyLine()	282 line = EmptyLine()

247 elif content.startswith('!'):	283 elif content.startswith('!'):

248 line = _parse_comment(content)	284 line = _parse_comment(content)

249 elif content.startswith('%') and content.endswith('%'):	285 elif content.startswith('%') and content.endswith('%'):

250 line = _parse_instruction(content)	286 line = _parse_instruction(content)

251 elif content.startswith('[') and content.endswith(']'):	287 elif content.startswith('[') and content.endswith(']'):

252 line = _parse_header(content)	288 line = _parse_header(content)

253 else:	289 else:

254 line = parse_filter(content)	290 line = parse_filter(content)

255	291

256 assert line.to_string().replace(' ', '') == content.replace(' ', '')	292 assert line.to_string().replace(' ', '') == content.replace(' ', '')

257 return line	293 return line

258	294

259	295

260 def parse_filterlist(lines):	296 def parse_filterlist(lines):

261 """Parse filter list from an iterable.	297 """Parse filter list from an iterable.

262	298

263 :param lines: List of strings or file or other iterable.	299 Parameters

264 :returns: Iterator over parsed lines.	300 ----------

265 :raises ParseError: Can be thrown during iteration for invalid lines.	301 lines: iterable of str

	302 Lines of the filter list.

	303

	304 Returns

	305 -------

	306 iterator of namedtuple

	307 Parsed lines of the filter list.

	308

	309 Raises

	310 ------

	311 ParseError

	312 Thrown during iteration for invalid filter list lines.

	313 TypeError
	Vasily Kuznetsov 2017/10/24 16:11:00 As suggested by Matze, this error is easy to foresee, so we might as well document it.
	314 If `lines` is not iterable.

	315

266 """	316 """

267 for line in lines:	317 for line in lines:

268 yield parse_line(line)	318 yield parse_line(line)

LEFT	RIGHT