abp/filters/parser.py - Issue 29873561: Issue 6920 - Only parse metadata from the top of the file

Side by Side Diff: abp/filters/parser.py

Issue 29873561: Issue 6920 - Only parse metadata from the top of the file (Closed)

Patch Set: Documented behavior for parse_line(), simplified end-of-metadata semantics Created Sept. 4, 2018, 6:26 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
133	133

134	134

135 Header = _line_type('Header', 'version', '[{.version}]')	135 Header = _line_type('Header', 'version', '[{.version}]')

136 EmptyLine = _line_type('EmptyLine', '', '')	136 EmptyLine = _line_type('EmptyLine', '', '')

137 Comment = _line_type('Comment', 'text', '! {.text}')	137 Comment = _line_type('Comment', 'text', '! {.text}')

138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')	138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

139 Filter = _line_type('Filter', 'text selector action options', '{.text}')	139 Filter = _line_type('Filter', 'text selector action options', '{.text}')

140 Include = _line_type('Include', 'target', '%include {0.target}%')	140 Include = _line_type('Include', 'target', '%include {0.target}%')

141	141

142	142

143 METADATA_REGEXP = re.compile(r'!\s*([\w-]+)\s*:(?!//)\s*(.*)')	143 METADATA_REGEXP = re.compile(r'([\w-]+)\s*:\s*(.*)')

144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)	145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)

146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')	146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')

147 FILTER_OPTIONS_REGEXP = re.compile(	147 FILTER_OPTIONS_REGEXP = re.compile(

148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'	148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'

149 )	149 )

150	150

151	151

152 def _parse_comment(text):

153 match = METADATA_REGEXP.match(text)

154 if match:

155 return Metadata(match.group(1), match.group(2))

156 return Comment(text[1:].strip())

157

158

159 def _parse_header(text):	152 def _parse_header(text):

160 match = HEADER_REGEXP.match(text)	153 match = HEADER_REGEXP.match(text)

161 if not match:	154 if not match:

162 raise ParseError('Malformed header', text)	155 raise ParseError('Malformed header', text)

163 return Header(match.group(1))	156 return Header(match.group(1))

164	157

165	158

166 def _parse_instruction(text):	159 def _parse_instruction(text):

167 match = INCLUDE_REGEXP.match(text)	160 match = INCLUDE_REGEXP.match(text)

168 if not match:	161 if not match:

(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
254 if '#' in text:	247 if '#' in text:

255 match = HIDING_FILTER_REGEXP.search(text)	248 match = HIDING_FILTER_REGEXP.search(text)

256 if match:	249 if match:

257 return _parse_hiding_filter(text, *match.groups())	250 return _parse_hiding_filter(text, *match.groups())

258 return _parse_blocking_filter(text)	251 return _parse_blocking_filter(text)

259	252

260	253

261 def parse_line(line_text):	254 def parse_line(line_text):

262 """Parse one line of a filter list.	255 """Parse one line of a filter list.

263	256

	257 Note that parse_line() doesn't handle special comments, hence never returns

	258 a Metadata() object, Adblock Plus only considers metadata when parsing the

	259 whole filter list and only if they are given at the top of the filter list.

	260

264 Parameters	261 Parameters

265 ----------	262 ----------

266 line_text : str	263 line_text : str

267 Line of a filter list.	264 Line of a filter list.

268	265

269 Returns	266 Returns

270 -------	267 -------

271 namedtuple	268 namedtuple

272 Parsed line (see `_line_type`).	269 Parsed line (see `_line_type`).

273	270

274 Raises	271 Raises

275 ------	272 ------

276 ParseError	273 ParseError

277 ParseError: If the line can't be parsed.	274 ParseError: If the line can't be parsed.

278 """	275 """

279 if isinstance(line_text, type(b'')):	276 if isinstance(line_text, type(b'')):

280 line_text = line_text.decode('utf-8')	277 line_text = line_text.decode('utf-8')

281	278

282 content = line_text.strip()	279 content = line_text.strip()

283	280

284 if content == '':	281 if content == '':

285 line = EmptyLine()	282 line = EmptyLine()

286 elif content.startswith('!'):	283 elif content.startswith('!'):

287 line = _parse_comment(content)	284 line = Comment(content[1:].lstrip())

288 elif content.startswith('%') and content.endswith('%'):	285 elif content.startswith('%') and content.endswith('%'):

289 line = _parse_instruction(content)	286 line = _parse_instruction(content)

290 elif content.startswith('[') and content.endswith(']'):	287 elif content.startswith('[') and content.endswith(']'):

291 line = _parse_header(content)	288 line = _parse_header(content)

292 else:	289 else:

293 line = parse_filter(content)	290 line = parse_filter(content)

294	291

295 assert line.to_string().replace(' ', '') == content.replace(' ', '')	292 assert line.to_string().replace(' ', '') == content.replace(' ', '')

296 return line	293 return line

297	294

(...skipping 12 matching lines...) Expand all Loading...
310 Parsed lines of the filter list.	307 Parsed lines of the filter list.

311	308

312 Raises	309 Raises

313 ------	310 ------

314 ParseError	311 ParseError

315 Thrown during iteration for invalid filter list lines.	312 Thrown during iteration for invalid filter list lines.

316 TypeError	313 TypeError

317 If `lines` is not iterable.	314 If `lines` is not iterable.

318	315

319 """	316 """

	317 metadata_closed = False

	318

320 for line in lines:	319 for line in lines:

321 yield parse_line(line)	320 result = parse_line(line)

	321

	322 if isinstance(result, Comment):

	323 match = METADATA_REGEXP.match(result.text)

	324 if match:

	325 key, value = match.groups()

	326

	327 # Historically, checksums can occur at the bottom of the

	328 # filter list. Checksums are no longer used by Adblock Plus,

	329 # but in order to strip them (in abp.filters.renderer),

	330 # we have to make sure to still parse them regardless of

	331 # their position in the filter list.

	332 if not metadata_closed or key.lower() == 'checksum':

	333 yield Metadata(key, value)

	334 continue

	335

	336 if not result.text:

	337 metadata_closed = True

	338 elif isinstance(result, (EmptyLine, Filter)):
	Vasily Kuznetsov 2018/09/04 19:50:56 I just thought that since this could also be used to parse fragments of filter lists, we should probably also close metadata on an include. Does this make sense? Sebastian Noack 2018/09/04 20:23:32 Done.
	339 metadata_closed = True

	340

	341 yield result

OLD	NEW

« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »