abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Patch Set: Address review comments on patch set 2 Created July 28, 2017, 6:52 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-2017 eyeo GmbH	2 # Copyright (C) 2006-2017 eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 from __future__ import unicode_literals	16 from __future__ import unicode_literals

17	17

18 import re	18 import re

19 from collections import namedtuple	19 from collections import namedtuple

20	20

21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError']	21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError',

	22 'SELECTOR_TYPE', 'FILTER_ACTION', 'FILTER_OPTION']

22	23

23	24

24 class ParseError(Exception):	25 class ParseError(Exception):

25 """Exception thrown by the parser when it encounters invalid input.	26 """Exception thrown by the parser when it encounters invalid input.

26	27

27 :param error: Description of the error.	28 :param error: Description of the error.

28 :param text: The text which was being parsed when an error occurred.	29 :param text: The text which was being parsed when an error occurred.

29 """	30 """

30	31

31 def __init__(self, error, text):	32 def __init__(self, error, text):

32 Exception.__init__(self, '{} in "{}"'.format(error, text))	33 Exception.__init__(self, '{} in "{}"'.format(error, text))

33 self.text = text	34 self.text = text

34 self.error = error	35 self.error = error

35	36

36	37

	38 # Constants related to filters (see https://adblockplus.org/filters).

	39 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).

	40 """Selector types"""

	41 URL_PATTERN = 'url-pattern' # Normal URL patterns.

	42 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.

	43 CSS = 'css' # CSS selectors for hiding filters.

	44 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).

	45 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.

	46

	47

	48 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).

	49 """Filter actions"""

	50 BLOCK = 'block' # Block the request.

	51 ALLOW = 'allow' # Allow the request (whitelist).

	52 HIDE = 'hide' # Hide selected element(s).

	53 SHOW = 'show' # Show selected element(s) (whitelist).

	54

	55

	56 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).

	57 """Filter options"""

	58 # Resource types.

	59 OTHER = 'other'

	60 SCRIPT = 'script'

	61 IMAGE = 'image'

	62 STYLESHEET = 'stylesheet'

	63 OBJECT = 'object'

	64 SUBDOCUMENT = 'subdocument'

	65 DOCUMENT = 'document'

	66 WEBSOCKET = 'websocket'

	67 WEBRTC = 'webrtc'

	68 PING = 'ping'

	69 XMLHTTPREQUEST = 'xmlhttprequest'

	70 OBJECT_SUBREQUEST = 'object-subrequest'

	71 MEDIA = 'media'

	72 FONT = 'font'

	73 POPUP = 'popup'

	74 GENERICBLOCK = 'genericblock'

	75 ELEMHIDE = 'elemhide'

	76 GENERICHIDE = 'generichide'

	77

	78 # Deprecated resource types.

	79 BACKGROUND = 'background'

	80 XBL = 'xbl'

	81 DTD = 'dtd'

	82

	83 # Other options.

	84 MATCH_CASE = 'match-case'

	85 DOMAIN = 'domain'

	86 THIRD_PARTY = 'third-party'

	87 COLLAPSE = 'collapse'

	88 SITEKEY = 'sitekey'

	89 DONOTTRACK = 'donottrack'

	90

	91

	92 ALL_OPTIONS = {opt for name, opt in vars(FILTER_OPTION).items()

	93 if not name.startswith('__')}

	94

	95

37 def _line_type(name, field_names, format_string):	96 def _line_type(name, field_names, format_string):

38 """Define a line type.	97 """Define a line type.

39	98

40 :param name: The name of the line type to define.	99 :param name: The name of the line type to define.

41 :param field_names: A sequence of field names or one space-separated	100 :param field_names: A sequence of field names or one space-separated

42 string that contains all field names.	101 string that contains all field names.

43 :param format_string: A format specifier for converting this line type	102 :param format_string: A format specifier for converting this line type

44 back to string representation.	103 back to string representation.

45 :returns: Class created with `namedtuple` that has `.type` set to	104 :returns: Class created with `namedtuple` that has `.type` set to

46 lowercased `name` and supports conversion back to string with	105 lowercased `name` and supports conversion back to string with

47 `.to_string()` method.	106 `.to_string()` method.

48 """	107 """

49 lt = namedtuple(name, field_names)	108 lt = namedtuple(name, field_names)

50 lt.type = name.lower()	109 lt.type = name.lower()

51 lt.to_string = lambda self: format_string.format(self)	110 lt.to_string = lambda self: format_string.format(self)

52 return lt	111 return lt

53	112

54	113

55 Header = _line_type('Header', 'version', '[{.version}]')	114 Header = _line_type('Header', 'version', '[{.version}]')

56 EmptyLine = _line_type('EmptyLine', '', '')	115 EmptyLine = _line_type('EmptyLine', '', '')

57 Comment = _line_type('Comment', 'text', '! {.text}')	116 Comment = _line_type('Comment', 'text', '! {.text}')

58 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')	117 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

59 Filter = _line_type('Filter', 'expression', '{.expression}')	118 Filter = _line_type('Filter', 'text selector action options', '{.text}')

60 Include = _line_type('Include', 'target', '%include {0.target}%')	119 Include = _line_type('Include', 'target', '%include {0.target}%')

61	120

62	121

63 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')	122 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')

64 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',	123 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',

65 'Version'}	124 'Version'}

66 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	125 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

67 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)	126 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)

	127 HIDING_FILTER_REGEXP = re.compile(r'^([^/*\|@"!]*?)#([@?])?#(.+)$')

	128 FILTER_OPTIONS_REGEXP = re.compile(

	129 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'

	130 )

68	131

69	132

70 def _parse_comment(text):	133 def _parse_comment(text):

71 match = METADATA_REGEXP.match(text)	134 match = METADATA_REGEXP.match(text)

72 if match and match.group(1) in METADATA_KEYS:	135 if match and match.group(1) in METADATA_KEYS:

73 return Metadata(match.group(1), match.group(2))	136 return Metadata(match.group(1), match.group(2))

74 return Comment(text[1:].strip())	137 return Comment(text[1:].strip())

75	138

76	139

77 def _parse_header(text):	140 def _parse_header(text):

78 match = HEADER_REGEXP.match(text)	141 match = HEADER_REGEXP.match(text)

79 if not match:	142 if not match:

80 raise ParseError('Malformed header', text)	143 raise ParseError('Malformed header', text)

81 return Header(match.group(1))	144 return Header(match.group(1))

82	145

83	146

84 def _parse_instruction(text):	147 def _parse_instruction(text):

85 match = INCLUDE_REGEXP.match(text)	148 match = INCLUDE_REGEXP.match(text)

86 if not match:	149 if not match:

87 raise ParseError('Unrecognized instruction', text)	150 raise ParseError('Unrecognized instruction', text)

88 return Include(match.group(1))	151 return Include(match.group(1))

89	152

90	153

	154 def _parse_option(option):

	155 if '=' in option:

	156 return option.split('=', 1)

	157 if option.startswith('~'):

	158 return option[1:], False

	159 return option, True

	160

	161

	162 def _parse_filter_option(option):

	163 name, value = _parse_option(option)

	164

	165 if name not in ALL_OPTIONS:
	mathias 2017/08/01 06:31:35: I don't think this part of the code should validate whether an option is recognized, at least not unconditionally, for the sake of compatibility with future versions, but also because parsing and validating are not necessarily the same step.
	Vasily Kuznetsov 2017/08/02 16:21:17: Following our conversation, I agree. Done.
	166 raise ParseError('Unrecognized option', name)

	167

	168 # Handle special cases of multivalued options.

	169 if name == FILTER_OPTION.DOMAIN:

	170 value = [_parse_option(o) for o in value.split('\|')]

	171 elif name == FILTER_OPTION.SITEKEY:

	172 value = value.split('\|')

	173

	174 return name, value

	175

	176

	177 def _parse_filter_options(options, separator=','):
	mathias 2017/08/01 06:31:35: Why is the separator a parameter? The only place where this private function is invoked is _parse_blocking_filter, and that one does not bother passing a custom separator.
	Vasily Kuznetsov 2017/08/02 16:21:17: This is left over from an earlier version that used this parameter. Now removed.
	178 return [_parse_filter_option(o) for o in options.split(separator)]

	179

	180

	181 def _parse_blocking_filter(text):

	182 # Based on RegExpFilter.fromText in lib/filterClasses.js

	183 # in https://hg.adblockplus.org/adblockpluscore.

	184 action = FILTER_ACTION.BLOCK

	185 options = []

	186 selector = text

	187

	188 if selector.startswith('@@'):

	189 action = FILTER_ACTION.ALLOW

	190 selector = selector[2:]

	191

	192 if '$' in selector:

	193 opt_match = FILTER_OPTIONS_REGEXP.search(selector)

	194 if opt_match:

	195 selector = selector[:opt_match.start(0)]

	196 options = _parse_filter_options(opt_match.group(1))

	197

	198 if (len(selector) > 1 and

	199 selector.startswith('/') and selector.endswith('/')):

	200 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}

	201 else:

	202 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}

	203

	204 return Filter(text, selector, action, options)

	205

	206

	207 def _parse_hiding_filter(text, domain, type_flag, selector_value):

	208 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}

	209 action = FILTER_ACTION.HIDE

	210 options = []

	211

	212 if type_flag == '@':

	213 action = FILTER_ACTION.SHOW

	214 elif type_flag == '?':

	215 selector['type'] = SELECTOR_TYPE.XCSS

	216

	217 if domain:

	218 domains = [_parse_option(d) for d in domain.split(',')]

	219 options.append((FILTER_OPTION.DOMAIN, domains))

	220

	221 return Filter(text, selector, action, options)

	222

	223

	224 def parse_filter(text):

	225 """Parse one filter.

	226

	227 :param text: Text representation of a filter.

	228 :returns: Filter object.

	229 """

	230 if '#' in text:

	231 match = HIDING_FILTER_REGEXP.search(text)

	232 if match:

	233 return _parse_hiding_filter(text, *match.groups())

	234 return _parse_blocking_filter(text)

	235

	236

91 def parse_line(line_text):	237 def parse_line(line_text):

92 """Parse one line of a filter list.	238 """Parse one line of a filter list.

93	239

94 :param line_text: Line of a filter list (must be a unicode string).	240 :param line_text: Line of a filter list (must be a unicode string).

95 :returns: Parsed line object (see `_line_type`).	241 :returns: Parsed line object (see `_line_type`).

96 :raises ParseError: If the line can't be successfully parsed.	242 :raises ParseError: If the line can't be successfully parsed.

97 """	243 """

98 content = line_text.strip()	244 content = line_text.strip()

99	245

100 if content == '':	246 if content == '':

101 line = EmptyLine()	247 line = EmptyLine()

102 elif content.startswith('!'):	248 elif content.startswith('!'):

103 line = _parse_comment(content)	249 line = _parse_comment(content)

104 elif content.startswith('%') and content.endswith('%'):	250 elif content.startswith('%') and content.endswith('%'):

105 line = _parse_instruction(content)	251 line = _parse_instruction(content)

106 elif content.startswith('[') and content.endswith(']'):	252 elif content.startswith('[') and content.endswith(']'):

107 line = _parse_header(content)	253 line = _parse_header(content)

108 else:	254 else:

109 line = Filter(content)	255 line = parse_filter(content)

110	256

111 assert line.to_string().replace(' ', '') == content.replace(' ', '')	257 assert line.to_string().replace(' ', '') == content.replace(' ', '')

112 return line	258 return line

113	259

114	260

115 def parse_filterlist(lines):	261 def parse_filterlist(lines):

116 """Parse filter list from an iterable.	262 """Parse filter list from an iterable.

117	263

118 :param lines: List of strings or file or other iterable.	264 :param lines: List of strings or file or other iterable.

119 :returns: Iterator over parsed lines.	265 :returns: Iterator over parsed lines.

120 :raises ParseError: Can be thrown during iteration for invalid lines.	266 :raises ParseError: Can be thrown during iteration for invalid lines.

121 """	267 """

122 for line in lines:	268 for line in lines:

123 yield parse_line(line)	269 yield parse_line(line)

OLD	NEW

« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | tests/test_parser.py » ('J')