abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Patch Set: Address review comments on patch set 3 Created Aug. 2, 2017, 4:15 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-2017 eyeo GmbH	2 # Copyright (C) 2006-2017 eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 from __future__ import unicode_literals	16 from __future__ import unicode_literals

17	17

18 import re	18 import re

19 from collections import namedtuple	19 from collections import namedtuple

20	20

21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError']	21 __all__ = [
	Vasily Kuznetsov 2017/08/02 16:21:18 I updated my formatting rules to include "prefer one item per line notation for lists, sets and dictionary literals that don't fit on one line" after our conversation. This is updated accordingly.
	22 'FILTER_ACTION',

	23 'FILTER_OPTION',

	24 'ParseError',

	25 'SELECTOR_TYPE',

	26 'parse_filterlist',

	27 'parse_line',

	28 ]

22	29

23	30

24 class ParseError(Exception):	31 class ParseError(Exception):

25 """Exception thrown by the parser when it encounters invalid input.	32 """Exception thrown by the parser when it encounters invalid input.

26	33

27 :param error: Description of the error.	34 :param error: Description of the error.

28 :param text: The text which was being parsed when an error occurred.	35 :param text: The text which was being parsed when an error occurred.

29 """	36 """

30	37

31 def __init__(self, error, text):	38 def __init__(self, error, text):

32 Exception.__init__(self, '{} in "{}"'.format(error, text))	39 Exception.__init__(self, '{} in "{}"'.format(error, text))

33 self.text = text	40 self.text = text

34 self.error = error	41 self.error = error

35	42

36	43

	44 # Constants related to filters (see https://adblockplus.org/filters).

	45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).

	46 """Selector types"""

	47 URL_PATTERN = 'url-pattern' # Normal URL patterns.

	48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.

	49 CSS = 'css' # CSS selectors for hiding filters.

	50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).

	51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.

	52

	53

	54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).

	55 """Filter actions"""

	56 BLOCK = 'block' # Block the request.

	57 ALLOW = 'allow' # Allow the request (whitelist).

	58 HIDE = 'hide' # Hide selected element(s).

	59 SHOW = 'show' # Show selected element(s) (whitelist).

	60

	61

	62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).

	63 """Filter options"""

	64 # Resource types.

	65 OTHER = 'other'

	66 SCRIPT = 'script'

	67 IMAGE = 'image'

	68 STYLESHEET = 'stylesheet'

	69 OBJECT = 'object'

	70 SUBDOCUMENT = 'subdocument'

	71 DOCUMENT = 'document'

	72 WEBSOCKET = 'websocket'

	73 WEBRTC = 'webrtc'

	74 PING = 'ping'

	75 XMLHTTPREQUEST = 'xmlhttprequest'

	76 OBJECT_SUBREQUEST = 'object-subrequest'

	77 MEDIA = 'media'

	78 FONT = 'font'

	79 POPUP = 'popup'

	80 GENERICBLOCK = 'genericblock'

	81 ELEMHIDE = 'elemhide'

	82 GENERICHIDE = 'generichide'

	83

	84 # Deprecated resource types.

	85 BACKGROUND = 'background'

	86 XBL = 'xbl'

	87 DTD = 'dtd'

	88

	89 # Other options.

	90 MATCH_CASE = 'match-case'

	91 DOMAIN = 'domain'

	92 THIRD_PARTY = 'third-party'

	93 COLLAPSE = 'collapse'

	94 SITEKEY = 'sitekey'

	95 DONOTTRACK = 'donottrack'

	96

	97

37 def _line_type(name, field_names, format_string):	98 def _line_type(name, field_names, format_string):

38 """Define a line type.	99 """Define a line type.

39	100

40 :param name: The name of the line type to define.	101 :param name: The name of the line type to define.

41 :param field_names: A sequence of field names or one space-separated	102 :param field_names: A sequence of field names or one space-separated

42 string that contains all field names.	103 string that contains all field names.

43 :param format_string: A format specifier for converting this line type	104 :param format_string: A format specifier for converting this line type

44 back to string representation.	105 back to string representation.

45 :returns: Class created with `namedtuple` that has `.type` set to	106 :returns: Class created with `namedtuple` that has `.type` set to

46 lowercased `name` and supports conversion back to string with	107 lowercased `name` and supports conversion back to string with

47 `.to_string()` method.	108 `.to_string()` method.

48 """	109 """

49 lt = namedtuple(name, field_names)	110 lt = namedtuple(name, field_names)

50 lt.type = name.lower()	111 lt.type = name.lower()

51 lt.to_string = lambda self: format_string.format(self)	112 lt.to_string = lambda self: format_string.format(self)

52 return lt	113 return lt

53	114

54	115

55 Header = _line_type('Header', 'version', '[{.version}]')	116 Header = _line_type('Header', 'version', '[{.version}]')

56 EmptyLine = _line_type('EmptyLine', '', '')	117 EmptyLine = _line_type('EmptyLine', '', '')

57 Comment = _line_type('Comment', 'text', '! {.text}')	118 Comment = _line_type('Comment', 'text', '! {.text}')

58 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')	119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

59 Filter = _line_type('Filter', 'expression', '{.expression}')	120 Filter = _line_type('Filter', 'text selector action options', '{.text}')

60 Include = _line_type('Include', 'target', '%include {0.target}%')	121 Include = _line_type('Include', 'target', '%include {0.target}%')

61	122

62	123

63 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')	124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')

64 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',	125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',

65 'Version'}	126 'Version'}

66 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

67 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)	128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)

	129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')

	130 FILTER_OPTIONS_REGEXP = re.compile(

	131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'

	132 )

68	133

69	134

70 def _parse_comment(text):	135 def _parse_comment(text):

71 match = METADATA_REGEXP.match(text)	136 match = METADATA_REGEXP.match(text)

72 if match and match.group(1) in METADATA_KEYS:	137 if match and match.group(1) in METADATA_KEYS:

73 return Metadata(match.group(1), match.group(2))	138 return Metadata(match.group(1), match.group(2))

74 return Comment(text[1:].strip())	139 return Comment(text[1:].strip())

75	140

76	141

77 def _parse_header(text):	142 def _parse_header(text):

78 match = HEADER_REGEXP.match(text)	143 match = HEADER_REGEXP.match(text)

79 if not match:	144 if not match:

80 raise ParseError('Malformed header', text)	145 raise ParseError('Malformed header', text)

81 return Header(match.group(1))	146 return Header(match.group(1))

82	147

83	148

84 def _parse_instruction(text):	149 def _parse_instruction(text):

85 match = INCLUDE_REGEXP.match(text)	150 match = INCLUDE_REGEXP.match(text)

86 if not match:	151 if not match:

87 raise ParseError('Unrecognized instruction', text)	152 raise ParseError('Unrecognized instruction', text)

88 return Include(match.group(1))	153 return Include(match.group(1))

89	154

90	155

	156 def _parse_option(option):

	157 if '=' in option:

	158 return option.split('=', 1)

	159 if option.startswith('~'):

	160 return option[1:], False

	161 return option, True

	162

	163

	164 def _parse_filter_option(option):

	165 name, value = _parse_option(option)

	166

	167 # Handle special cases of multivalued options.

	168 if name == FILTER_OPTION.DOMAIN:

	169 value = [_parse_option(o) for o in value.split('|')]

	170 elif name == FILTER_OPTION.SITEKEY:

	171 value = value.split('|')

	172

	173 return name, value

	174

	175

	176 def _parse_filter_options(options):

	177 return [_parse_filter_option(o) for o in options.split(',')]

	178

	179

	180 def _parse_blocking_filter(text):

	181 # Based on RegExpFilter.fromText in lib/filterClasses.js

	182 # in https://hg.adblockplus.org/adblockpluscore.

	183 action = FILTER_ACTION.BLOCK

	184 options = []

	185 selector = text

	186

	187 if selector.startswith('@@'):

	188 action = FILTER_ACTION.ALLOW

	189 selector = selector[2:]

	190

	191 if '$' in selector:

	192 opt_match = FILTER_OPTIONS_REGEXP.search(selector)

	193 if opt_match:

	194 selector = selector[:opt_match.start(0)]

	195 options = _parse_filter_options(opt_match.group(1))

	196

	197 if (len(selector) > 1 and

	198 selector.startswith('/') and selector.endswith('/')):

	199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}

	200 else:

	201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}

	202

	203 return Filter(text, selector, action, options)

	204

	205

	206 def _parse_hiding_filter(text, domain, type_flag, selector_value):

	207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}

	208 action = FILTER_ACTION.HIDE

	209 options = []

	210

	211 if type_flag == '@':

	212 action = FILTER_ACTION.SHOW

	213 elif type_flag == '?':

	214 selector['type'] = SELECTOR_TYPE.XCSS

	215

	216 if domain:

	217 domains = [_parse_option(d) for d in domain.split(',')]

	218 options.append((FILTER_OPTION.DOMAIN, domains))

	219

	220 return Filter(text, selector, action, options)

	221

	222

	223 def parse_filter(text):

	224 """Parse one filter.

	225

	226 :param text: Text representation of a filter.

	227 :returns: Filter object.

	228 """

	229 if '#' in text:

	230 match = HIDING_FILTER_REGEXP.search(text)

	231 if match:

	232 return _parse_hiding_filter(text, *match.groups())

	233 return _parse_blocking_filter(text)

	234

	235

91 def parse_line(line_text):	236 def parse_line(line_text):

92 """Parse one line of a filter list.	237 """Parse one line of a filter list.

93	238

94 :param line_text: Line of a filter list (must be a unicode string).	239 :param line_text: Line of a filter list (must be a unicode string).

95 :returns: Parsed line object (see `_line_type`).	240 :returns: Parsed line object (see `_line_type`).

96 :raises ParseError: If the line can't be successfully parsed.	241 :raises ParseError: If the line can't be successfully parsed.

97 """	242 """

98 content = line_text.strip()	243 content = line_text.strip()

99	244

100 if content == '':	245 if content == '':

101 line = EmptyLine()	246 line = EmptyLine()

102 elif content.startswith('!'):	247 elif content.startswith('!'):

103 line = _parse_comment(content)	248 line = _parse_comment(content)

104 elif content.startswith('%') and content.endswith('%'):	249 elif content.startswith('%') and content.endswith('%'):

105 line = _parse_instruction(content)	250 line = _parse_instruction(content)

106 elif content.startswith('[') and content.endswith(']'):	251 elif content.startswith('[') and content.endswith(']'):

107 line = _parse_header(content)	252 line = _parse_header(content)

108 else:	253 else:

109 line = Filter(content)	254 line = parse_filter(content)

110	255

111 assert line.to_string().replace(' ', '') == content.replace(' ', '')	256 assert line.to_string().replace(' ', '') == content.replace(' ', '')

112 return line	257 return line

113	258

114	259

115 def parse_filterlist(lines):	260 def parse_filterlist(lines):

116 """Parse filter list from an iterable.	261 """Parse filter list from an iterable.

117	262

118 :param lines: List of strings or file or other iterable.	263 :param lines: List of strings or file or other iterable.

119 :returns: Iterator over parsed lines.	264 :returns: Iterator over parsed lines.

120 :raises ParseError: Can be thrown during iteration for invalid lines.	265 :raises ParseError: Can be thrown during iteration for invalid lines.

121 """	266 """

122 for line in lines:	267 for line in lines:

123 yield parse_line(line)	268 yield parse_line(line)

OLD	NEW

« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »