abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Patch Set: Rebase to 1f5d7ead9bff Created Oct. 24, 2017, 3:58 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 from __future__ import unicode_literals	16 from __future__ import unicode_literals

17	17

18 import re	18 import re

19 from collections import namedtuple	19 from collections import namedtuple

20	20

21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError']	21 __all__ = [

	22 'FILTER_ACTION',

	23 'FILTER_OPTION',

	24 'ParseError',

	25 'SELECTOR_TYPE',

	26 'parse_filterlist',

	27 'parse_line',

	28 ]

22	29

23	30

24 class ParseError(Exception):	31 class ParseError(Exception):

25 """Exception thrown by the parser when it encounters invalid input.	32 """Exception thrown by the parser when it encounters invalid input.

26	33

27 :param error: Description of the error.	34 :param error: Description of the error.

28 :param text: The text which was being parsed when an error occurred.	35 :param text: The text which was being parsed when an error occurred.

29 """	36 """

30	37

31 def __init__(self, error, text):	38 def __init__(self, error, text):

32 Exception.__init__(self, '{} in "{}"'.format(error, text))	39 Exception.__init__(self, '{} in "{}"'.format(error, text))

33 self.text = text	40 self.text = text

34 self.error = error	41 self.error = error

35	42

36	43

	44 # Constants related to filters (see https://adblockplus.org/filters).

	45 class SELECTOR_TYPE: # flake8: noqa (This class is an enumeration constant).

	46 """Selector types"""

	47 URL_PATTERN = 'url-pattern' # Normal URL patterns.

	48 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.

	49 CSS = 'css' # CSS selectors for hiding filters.

	50 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).

	51 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.

	52

	53

	54 class FILTER_ACTION: # flake8: noqa (This class is an enumeration constant).

	55 """Filter actions"""

	56 BLOCK = 'block' # Block the request.

	57 ALLOW = 'allow' # Allow the request (whitelist).

	58 HIDE = 'hide' # Hide selected element(s).

	59 SHOW = 'show' # Show selected element(s) (whitelist).

	60

	61

	62 class FILTER_OPTION: # flake8: noqa (This class is an enumeration constant).

	63 """Filter options"""

	64 # Resource types.

	65 OTHER = 'other'

	66 SCRIPT = 'script'

	67 IMAGE = 'image'

	68 STYLESHEET = 'stylesheet'

	69 OBJECT = 'object'

	70 SUBDOCUMENT = 'subdocument'

	71 DOCUMENT = 'document'

	72 WEBSOCKET = 'websocket'

	73 WEBRTC = 'webrtc'

	74 PING = 'ping'

	75 XMLHTTPREQUEST = 'xmlhttprequest'

	76 OBJECT_SUBREQUEST = 'object-subrequest'

	77 MEDIA = 'media'

	78 FONT = 'font'

	79 POPUP = 'popup'

	80 GENERICBLOCK = 'genericblock'

	81 ELEMHIDE = 'elemhide'

	82 GENERICHIDE = 'generichide'

	83

	84 # Deprecated resource types.

	85 BACKGROUND = 'background'

	86 XBL = 'xbl'

	87 DTD = 'dtd'

	88

	89 # Other options.

	90 MATCH_CASE = 'match-case'

	91 DOMAIN = 'domain'

	92 THIRD_PARTY = 'third-party'

	93 COLLAPSE = 'collapse'

	94 SITEKEY = 'sitekey'

	95 DONOTTRACK = 'donottrack'

	96

	97

37 def _line_type(name, field_names, format_string):	98 def _line_type(name, field_names, format_string):

38 """Define a line type.	99 """Define a line type.

39	100

40 :param name: The name of the line type to define.	101 :param name: The name of the line type to define.

41 :param field_names: A sequence of field names or one space-separated	102 :param field_names: A sequence of field names or one space-separated

42 string that contains all field names.	103 string that contains all field names.

43 :param format_string: A format specifier for converting this line type	104 :param format_string: A format specifier for converting this line type

44 back to string representation.	105 back to string representation.

45 :returns: Class created with `namedtuple` that has `.type` set to	106 :returns: Class created with `namedtuple` that has `.type` set to

46 lowercased `name` and supports conversion back to string with	107 lowercased `name` and supports conversion back to string with

47 `.to_string()` method.	108 `.to_string()` method.

48 """	109 """

49 lt = namedtuple(name, field_names)	110 lt = namedtuple(name, field_names)

50 lt.type = name.lower()	111 lt.type = name.lower()

51 lt.to_string = lambda self: format_string.format(self)	112 lt.to_string = lambda self: format_string.format(self)

52 return lt	113 return lt

53	114

54	115

55 Header = _line_type('Header', 'version', '[{.version}]')	116 Header = _line_type('Header', 'version', '[{.version}]')

56 EmptyLine = _line_type('EmptyLine', '', '')	117 EmptyLine = _line_type('EmptyLine', '', '')

57 Comment = _line_type('Comment', 'text', '! {.text}')	118 Comment = _line_type('Comment', 'text', '! {.text}')

58 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')	119 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

59 Filter = _line_type('Filter', 'expression', '{.expression}')	120 Filter = _line_type('Filter', 'text selector action options', '{.text}')

60 Include = _line_type('Include', 'target', '%include {0.target}%')	121 Include = _line_type('Include', 'target', '%include {0.target}%')

61	122

62	123

63 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')	124 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')

64 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',	125 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',

65 'Version'}	126 'Version'}

66 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	127 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

67 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)	128 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\sPlus\s[\d\.]+?)?)\]', flags=re.I)

	129 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')

	130 FILTER_OPTIONS_REGEXP = re.compile(

	131 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'

	132 )

68	133

69	134

70 def _parse_comment(text):	135 def _parse_comment(text):

71 match = METADATA_REGEXP.match(text)	136 match = METADATA_REGEXP.match(text)

72 if match and match.group(1) in METADATA_KEYS:	137 if match and match.group(1) in METADATA_KEYS:

73 return Metadata(match.group(1), match.group(2))	138 return Metadata(match.group(1), match.group(2))

74 return Comment(text[1:].strip())	139 return Comment(text[1:].strip())

75	140

76	141

77 def _parse_header(text):	142 def _parse_header(text):

78 match = HEADER_REGEXP.match(text)	143 match = HEADER_REGEXP.match(text)

79 if not match:	144 if not match:

80 raise ParseError('Malformed header', text)	145 raise ParseError('Malformed header', text)

81 return Header(match.group(1))	146 return Header(match.group(1))

82	147

83	148

84 def _parse_instruction(text):	149 def _parse_instruction(text):

85 match = INCLUDE_REGEXP.match(text)	150 match = INCLUDE_REGEXP.match(text)

86 if not match:	151 if not match:

87 raise ParseError('Unrecognized instruction', text)	152 raise ParseError('Unrecognized instruction', text)

88 return Include(match.group(1))	153 return Include(match.group(1))

89	154

90	155

	156 def _parse_option(option):

	157 if '=' in option:

	158 return option.split('=', 1)

	159 if option.startswith('~'):

	160 return option[1:], False

	161 return option, True

	162

	163

	164 def _parse_filter_option(option):

	165 name, value = _parse_option(option)

	166

	167 # Handle special cases of multivalued options.

	168 if name == FILTER_OPTION.DOMAIN:

	169 value = [_parse_option(o) for o in value.split('|')]

	170 elif name == FILTER_OPTION.SITEKEY:

	171 value = value.split('|')

	172

	173 return name, value

	174

	175

	176 def _parse_filter_options(options):

	177 return [_parse_filter_option(o) for o in options.split(',')]

	178

	179

	180 def _parse_blocking_filter(text):

	181 # Based on RegExpFilter.fromText in lib/filterClasses.js

	182 # in https://hg.adblockplus.org/adblockpluscore.

	183 action = FILTER_ACTION.BLOCK

	184 options = []

	185 selector = text

	186

	187 if selector.startswith('@@'):

	188 action = FILTER_ACTION.ALLOW

	189 selector = selector[2:]

	190

	191 if '$' in selector:

	192 opt_match = FILTER_OPTIONS_REGEXP.search(selector)

	193 if opt_match:

	194 selector = selector[:opt_match.start(0)]

	195 options = _parse_filter_options(opt_match.group(1))

	196

	197 if (len(selector) > 1 and

	198 selector.startswith('/') and selector.endswith('/')):

	199 selector = {'type': SELECTOR_TYPE.URL_REGEXP, 'value': selector[1:-1]}

	200 else:

	201 selector = {'type': SELECTOR_TYPE.URL_PATTERN, 'value': selector}

	202

	203 return Filter(text, selector, action, options)

	204

	205

	206 def _parse_hiding_filter(text, domain, type_flag, selector_value):

	207 selector = {'type': SELECTOR_TYPE.CSS, 'value': selector_value}

	208 action = FILTER_ACTION.HIDE

	209 options = []

	210

	211 if type_flag == '@':

	212 action = FILTER_ACTION.SHOW

	213 elif type_flag == '?':

	214 selector['type'] = SELECTOR_TYPE.XCSS

	215

	216 if domain:

	217 domains = [_parse_option(d) for d in domain.split(',')]

	218 options.append((FILTER_OPTION.DOMAIN, domains))

	219

	220 return Filter(text, selector, action, options)

	221

	222

	223 def parse_filter(text):

	224 """Parse one filter.

	225

	226 :param text: Text representation of a filter.

	227 :returns: Filter object.

	228 """

	229 if '#' in text:

	230 match = HIDING_FILTER_REGEXP.search(text)

	231 if match:

	232 return _parse_hiding_filter(text, *match.groups())

	233 return _parse_blocking_filter(text)

	234

	235

91 def parse_line(line_text):	236 def parse_line(line_text):

92 """Parse one line of a filter list.	237 """Parse one line of a filter list.

93	238

94 :param line_text: Line of a filter list (must be a unicode string).	239 :param line_text: Line of a filter list (must be a unicode string).

95 :returns: Parsed line object (see `_line_type`).	240 :returns: Parsed line object (see `_line_type`).

96 :raises ParseError: If the line can't be successfully parsed.	241 :raises ParseError: If the line can't be successfully parsed.

97 """	242 """

98 content = line_text.strip()	243 content = line_text.strip()

99	244

100 if content == '':	245 if content == '':

101 line = EmptyLine()	246 line = EmptyLine()

102 elif content.startswith('!'):	247 elif content.startswith('!'):

103 line = _parse_comment(content)	248 line = _parse_comment(content)

104 elif content.startswith('%') and content.endswith('%'):	249 elif content.startswith('%') and content.endswith('%'):

105 line = _parse_instruction(content)	250 line = _parse_instruction(content)

106 elif content.startswith('[') and content.endswith(']'):	251 elif content.startswith('[') and content.endswith(']'):

107 line = _parse_header(content)	252 line = _parse_header(content)

108 else:	253 else:

109 line = Filter(content)	254 line = parse_filter(content)

110	255

111 assert line.to_string().replace(' ', '') == content.replace(' ', '')	256 assert line.to_string().replace(' ', '') == content.replace(' ', '')

112 return line	257 return line

113	258

114	259

115 def parse_filterlist(lines):	260 def parse_filterlist(lines):

116 """Parse filter list from an iterable.	261 """Parse filter list from an iterable.

117	262

118 :param lines: List of strings or file or other iterable.	263 :param lines: List of strings or file or other iterable.

119 :returns: Iterator over parsed lines.	264 :returns: Iterator over parsed lines.

120 :raises ParseError: Can be thrown during iteration for invalid lines.	265 :raises ParseError: Can be thrown during iteration for invalid lines.

121 """	266 """

122 for line in lines:	267 for line in lines:

123 yield parse_line(line)	268 yield parse_line(line)

OLD	NEW

« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »