abp/filters/parser.py - Issue 29465715: Fixes 4969 - Add parsing of filters

Side by Side Diff: abp/filters/parser.py

Issue 29465715: Fixes 4969 - Add parsing of filters (Closed)

Patch Set: remove all interpretation and keep only parsing, add support for element hiding emulation filters, … Created July 27, 2017, 7:16 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-2017 eyeo GmbH	2 # Copyright (C) 2006-2017 eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 from __future__ import unicode_literals	16 from __future__ import unicode_literals

17	17

18 import re	18 import re

19 from collections import namedtuple	19 from collections import namedtuple

20	20

21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError']	21 __all__ = ['parse_filterlist', 'parse_line', 'ParseError', 'ST', 'FA']

22	22

23	23

24 class ParseError(Exception):	24 class ParseError(Exception):

25 """Exception thrown by the parser when it encounters invalid input.	25 """Exception thrown by the parser when it encounters invalid input.

26	26

27 :param error: Description of the error.	27 :param error: Description of the error.

28 :param text: The text which was being parsed when an error occurred.	28 :param text: The text which was being parsed when an error occurred.

29 """	29 """

30	30

31 def __init__(self, error, text):	31 def __init__(self, error, text):

32 Exception.__init__(self, '{} in "{}"'.format(error, text))	32 Exception.__init__(self, '{} in "{}"'.format(error, text))

33 self.text = text	33 self.text = text

34 self.error = error	34 self.error = error

35	35

36	36

	37 # Constants related to filters (see https://adblockplus.org/filters).

	38 class ST:
	mathias 2017/07/28 16:43:29
	Why abbreviating here (ST) and below (FA)?

	Vasily Kuznetsov 2017/07/28 17:38:10
	On 2017/07/28 16:43:29, mathias wrote:
	> Why abbreviating here (ST) and below (FA)?
	To be completely honest, the reason is kind of stupid: to make flake8 happy :/
	This class was called SELECTOR_TYPE originally and the following one FILTER_ACTION. So then you'd end up with constants like SELECTOR_TYPE.CSS and FILTER_ACTION.ALLOW. And then if you (as a user of the library) like shortness, you can import them `from abp.filters import SELECTOR_TYPE as ST` and get your short ST.CSS. But flake8 doesn't like class names that are not CamelCased. I thought about rolling a small special class to hold constants, kind of like:
	SELECTOR_TYPE = Namespace(
	    URL_PATTERN='url-pattern',
	    URL_REGEXP='url-regexp',
	    ...
	)
	But then you need to implement this `Namespace` class (or it can be a function), which is annoying and inelegant. Another option is to just disable N801 in flake8, that could probably also do it.

	Vasily Kuznetsov 2017/07/28 18:57:49
	On 2017/07/28 17:38:10, Vasily Kuznetsov wrote:
	> [the reply above, quoted in full]
	Done.
	39 """Selector types"""

	40 URL_PATTERN = 'url-pattern' # Normal URL patterns.

	41 URL_REGEXP = 'url-regexp' # Regular expressions for URLs.

	42 CSS = 'css' # CSS selectors for hiding filters.

	43 XCSS = 'extended-css' # Extended CSS selectors (to emulate CSS4).

	44 ABP_SIMPLE = 'abp-simple' # Simplified element hiding syntax.

	45

	46

	47 class FA:

	48 """Filter actions"""

	49 BLOCK = 'block' # Block the request.

	50 ALLOW = 'allow' # Allow the request (whitelist).

	51 HIDE = 'hide' # Hide selected element(s).

	52 SHOW = 'show' # Show selected element(s) (whitelist).

	53

	54

37 def _line_type(name, field_names, format_string):	55 def _line_type(name, field_names, format_string):

38 """Define a line type.	56 """Define a line type.

39	57

40 :param name: The name of the line type to define.	58 :param name: The name of the line type to define.

41 :param field_names: A sequence of field names or one space-separated	59 :param field_names: A sequence of field names or one space-separated

42 string that contains all field names.	60 string that contains all field names.

43 :param format_string: A format specifier for converting this line type	61 :param format_string: A format specifier for converting this line type

44 back to string representation.	62 back to string representation.

45 :returns: Class created with `namedtuple` that has `.type` set to	63 :returns: Class created with `namedtuple` that has `.type` set to

46 lowercased `name` and supports conversion back to string with	64 lowercased `name` and supports conversion back to string with

47 `.to_string()` method.	65 `.to_string()` method.

48 """	66 """

49 lt = namedtuple(name, field_names)	67 lt = namedtuple(name, field_names)

50 lt.type = name.lower()	68 lt.type = name.lower()

51 lt.to_string = lambda self: format_string.format(self)	69 lt.to_string = lambda self: format_string.format(self)

52 return lt	70 return lt

53	71

54	72

55 Header = _line_type('Header', 'version', '[{.version}]')	73 Header = _line_type('Header', 'version', '[{.version}]')

56 EmptyLine = _line_type('EmptyLine', '', '')	74 EmptyLine = _line_type('EmptyLine', '', '')

57 Comment = _line_type('Comment', 'text', '! {.text}')	75 Comment = _line_type('Comment', 'text', '! {.text}')

58 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')	76 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

59 Filter = _line_type('Filter', 'expression', '{.expression}')	77 Filter = _line_type('Filter', 'text selector action options', '{.text}')

60 Include = _line_type('Include', 'target', '%include {0.target}%')	78 Include = _line_type('Include', 'target', '%include {0.target}%')

61	79

62	80

63 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')	81 METADATA_REGEXP = re.compile(r'!\s*(\w+)\s*:\s*(.*)')

64 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',	82 METADATA_KEYS = {'Homepage', 'Title', 'Expires', 'Checksum', 'Redirect',

65 'Version'}	83 'Version'}

66 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	84 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

67 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)	85 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)

	86 HFILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')
	mathias 2017/07/28 16:43:29
	Why abbreviating? What's wrong about HIDING_FILTER_REGEXP here and BLOCKING_FILTER_REGEXP below?

	Vasily Kuznetsov 2017/07/28 17:38:10
	On 2017/07/28 16:43:29, mathias wrote:
	> Why abbreviating? What's wrong about HIDING_FILTER_REGEXP here and
	> BLOCKING_FILTER_REGEXP below?
	It's shorter this way. But I don't feel very strong about it, will rename them.

	Vasily Kuznetsov 2017/07/28 18:57:49
	On 2017/07/28 17:38:10, Vasily Kuznetsov wrote:
	> It's shorter this way. But I don't feel very strong about it, will rename them.
	Done.
	87 BFILTER_REGEXP_REGEXP = re.compile(
	mathias 2017/07/28 16:43:30
	I was wondering about the *_REGEXP_REGEXP name, but then where is this actually used anyway?

	Vasily Kuznetsov 2017/07/28 17:38:10
	On 2017/07/28 16:43:30, mathias wrote:
	> I was wondering about the *_REGEXP_REGEXP name, but then where is this actually
	> used anyway?
	It's the regular expression for blocking filters which use regular expressions, so the name is legit. However, you're right that I'm not actually using it. Just copied it from ABP source. The funny thing, it seems that there it's not used either :) (see https://hg.adblockplus.org/adblockpluscore/file/tip/lib/filterClasses.js) I will remove it.

	Vasily Kuznetsov 2017/07/28 18:57:49
	On 2017/07/28 16:43:30, mathias wrote:
	> I was wondering about the *_REGEXP_REGEXP name, but then where is this actually
	> used anyway?
	Done.
	88 r'^(@@)?\/.*\/(?:\$~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)?$'

	89 )

	90 BFILTER_OPTIONS_REGEXP = re.compile(

	91 r'\$(~?[\w-]+(?:=[^,\s]+)?(?:,~?[\w-]+(?:=[^,\s]+)?)*)$'

	92 )

68	93

69	94

70 def _parse_comment(text):	95 def _parse_comment(text):

71 match = METADATA_REGEXP.match(text)	96 match = METADATA_REGEXP.match(text)

72 if match and match.group(1) in METADATA_KEYS:	97 if match and match.group(1) in METADATA_KEYS:

73 return Metadata(match.group(1), match.group(2))	98 return Metadata(match.group(1), match.group(2))

74 return Comment(text[1:].strip())	99 return Comment(text[1:].strip())

75	100

76	101

77 def _parse_header(text):	102 def _parse_header(text):

78 match = HEADER_REGEXP.match(text)	103 match = HEADER_REGEXP.match(text)

79 if not match:	104 if not match:

80 raise ParseError('Malformed header', text)	105 raise ParseError('Malformed header', text)

81 return Header(match.group(1))	106 return Header(match.group(1))

82	107

83	108

84 def _parse_instruction(text):	109 def _parse_instruction(text):

85 match = INCLUDE_REGEXP.match(text)	110 match = INCLUDE_REGEXP.match(text)

86 if not match:	111 if not match:

87 raise ParseError('Unrecognized instruction', text)	112 raise ParseError('Unrecognized instruction', text)

88 return Include(match.group(1))	113 return Include(match.group(1))

89	114

90	115

	116 def _parse_option(option):

	117 if '=' in option:

	118 name, value = option.split('=', 1)

	119 elif option.startswith('~'):

	120 name, value = option[1:], False

	121 else:

	122 name, value = option, True

	123

	124 # Handle special cases of multivalued options.

	125 if name == 'domain':
	mathias 2017/07/28 16:43:30
	Wouldn't it make sense to enumerate recognized OPTION_$NAME symbols as well?

	Vasily Kuznetsov 2017/07/28 17:38:10
	On 2017/07/28 16:43:30, mathias wrote:
	> Wouldn't it make sense to enumerate recognized OPTION_$NAME symbols as well?
	Yeah, probably makes sense to make some kind of enum for these things. It will be needed anyway later, for interpretation, and might also be useful for the users of the library.

	Vasily Kuznetsov 2017/07/28 18:57:49
	On 2017/07/28 16:43:30, mathias wrote:
	> Wouldn't it make sense to enumerate recognized OPTION_$NAME symbols as well?
	Done.
	126 name, value = 'domains', _parse_options(value, '|')
	mathias 2017/07/28 16:43:30
	Why using a different / plural key for the parsed version?

	Vasily Kuznetsov 2017/07/28 17:38:10
	On 2017/07/28 16:43:30, mathias wrote:
	> Why using a different / plural key for the parsed version?
	Because semantically it's a list, always, so calling it 'domain' is a bit confusing. I see your point, however, that it might be not very obvious if the option is named differently than what it's called in the unparsed filter. Perhaps this could be solved in a mutually beneficial way if the key will be 'domain' but the constant will be called `OPTION.DOMAINS`. Although this is also confusing :/

	Vasily Kuznetsov 2017/07/28 18:57:49
	On 2017/07/28 16:43:30, mathias wrote:
	> Why using a different / plural key for the parsed version?
	Done.
	127 elif name == 'sitekey':

	128 name, value = 'sitekeys', value.split('|')

	129

	130 return name, value

	131

	132

	133 def _parse_options(options, separator=','):

	134 return [_parse_option(o) for o in options.split(separator)]

	135

	136

	137 def _parse_blocking_filter(text):

	138 # Based on RegExpFilter.fromText in lib/filterClasses.js

	139 # in https://hg.adblockplus.org/adblockpluscore.

	140 action = FA.BLOCK

	141 options = []

	142 selector = text

	143

	144 if selector.startswith('@@'):

	145 action = FA.ALLOW

	146 selector = selector[2:]

	147

	148 if '$' in selector:

	149 opt_match = BFILTER_OPTIONS_REGEXP.search(selector)

	150 if opt_match:

	151 selector = selector[:opt_match.start(0)]

	152 options = _parse_options(opt_match.group(1))

	153

	154 if (len(selector) > 1 and

	155 selector.startswith('/') and selector.endswith('/')):

	156 selector = {'type': ST.URL_REGEXP, 'value': selector[1:-1]}

	157 else:

	158 selector = {'type': ST.URL_PATTERN, 'value': selector}

	159

	160 return Filter(text, selector, action, options)

	161

	162

	163 def _parse_hiding_filter(text, domains, type_flag, selector_value):

	164 selector = {'type': ST.CSS, 'value': selector_value}

	165 action = FA.HIDE

	166 options = []

	167

	168 if type_flag == '@':

	169 action = FA.SHOW

	170 elif type_flag == '?':

	171 selector['type'] = ST.XCSS

	172

	173 if domains:

	174 options.append(('domains', _parse_options(domains)))

	175

	176 return Filter(text, selector, action, options)

	177

	178

	179 def parse_filter(text):

	180 """Parse one filter.

	181

	182 :param text: Text representation of a filter.

	183 :returns: filter object.
	mathias 2017/07/28 16:43:29
	I think this should be upper-case "Filter".

	Vasily Kuznetsov 2017/07/28 17:38:10
	On 2017/07/28 16:43:29, mathias wrote:
	> I think this should be upper-case "Filter".
	Yes.
	184 """

	185 if '#' in text:

	186 match = HFILTER_REGEXP.search(text)

	187 if match:

	188 return _parse_hiding_filter(text, *match.groups())

	189 return _parse_blocking_filter(text)

	190

	191

91 def parse_line(line_text):	192 def parse_line(line_text):

92 """Parse one line of a filter list.	193 """Parse one line of a filter list.

93	194

94 :param line_text: Line of a filter list (must be a unicode string).	195 :param line_text: Line of a filter list (must be a unicode string).

95 :returns: Parsed line object (see `_line_type`).	196 :returns: Parsed line object (see `_line_type`).

96 :raises ParseError: If the line can't be successfully parsed.	197 :raises ParseError: If the line can't be successfully parsed.

97 """	198 """

98 content = line_text.strip()	199 content = line_text.strip()

99	200

100 if content == '':	201 if content == '':

101 line = EmptyLine()	202 line = EmptyLine()

102 elif content.startswith('!'):	203 elif content.startswith('!'):

103 line = _parse_comment(content)	204 line = _parse_comment(content)

104 elif content.startswith('%') and content.endswith('%'):	205 elif content.startswith('%') and content.endswith('%'):

105 line = _parse_instruction(content)	206 line = _parse_instruction(content)

106 elif content.startswith('[') and content.endswith(']'):	207 elif content.startswith('[') and content.endswith(']'):

107 line = _parse_header(content)	208 line = _parse_header(content)

108 else:	209 else:

109 line = Filter(content)	210 line = parse_filter(content)

110	211

111 assert line.to_string().replace(' ', '') == content.replace(' ', '')	212 assert line.to_string().replace(' ', '') == content.replace(' ', '')

112 return line	213 return line

113	214

114	215

115 def parse_filterlist(lines):	216 def parse_filterlist(lines):

116 """Parse filter list from an iterable.	217 """Parse filter list from an iterable.

117	218

118 :param lines: List of strings or file or other iterable.	219 :param lines: List of strings or file or other iterable.

119 :returns: Iterator over parsed lines.	220 :returns: Iterator over parsed lines.

120 :raises ParseError: Can be thrown during iteration for invalid lines.	221 :raises ParseError: Can be thrown during iteration for invalid lines.

121 """	222 """

122 for line in lines:	223 for line in lines:

123 yield parse_line(line)	224 yield parse_line(line)

OLD	NEW

« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »