abp/filters/parser.py - Issue 29880555: Issue 6877 - Only parse headers in the first line of the filter list

Side by Side Diff: abp/filters/parser.py

Issue 29880555: Issue 6877 - Only parse headers in the first line of the filter list (Closed)

Patch Set: Created Sept. 14, 2018, 2:40 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of Adblock Plus <https://adblockplus.org/>,	1 # This file is part of Adblock Plus <https://adblockplus.org/>,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
133	133

134	134

135 Header = _line_type('Header', 'version', '[{.version}]')	135 Header = _line_type('Header', 'version', '[{.version}]')

136 EmptyLine = _line_type('EmptyLine', '', '')	136 EmptyLine = _line_type('EmptyLine', '', '')

137 Comment = _line_type('Comment', 'text', '! {.text}')	137 Comment = _line_type('Comment', 'text', '! {.text}')

138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')	138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}')

139 Filter = _line_type('Filter', 'text selector action options', '{.text}')	139 Filter = _line_type('Filter', 'text selector action options', '{.text}')

140 Include = _line_type('Include', 'target', '%include {0.target}%')	140 Include = _line_type('Include', 'target', '%include {0.target}%')

141	141

142	142

143 METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)')	143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)')

144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')	144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%')

145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I)	145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$', flags=re.I)

146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')	146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$')

147 FILTER_OPTIONS_REGEXP = re.compile(	147 FILTER_OPTIONS_REGEXP = re.compile(

148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'	148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$'

149 )	149 )

150	150

151	151

152 def _parse_header(text):

153 match = HEADER_REGEXP.match(text)

154 if not match:

155 raise ParseError('Malformed header', text)

156 return Header(match.group(1))

157

158

159 def _parse_instruction(text):	152 def _parse_instruction(text):

160 match = INCLUDE_REGEXP.match(text)	153 match = INCLUDE_REGEXP.match(text)

161 if not match:	154 if not match:

162 raise ParseError('Unrecognized instruction', text)	155 raise ParseError('Unrecognized instruction', text)

163 return Include(match.group(1))	156 return Include(match.group(1))

164	157

165	158

166 def _parse_option(option):	159 def _parse_option(option):

167 if '=' in option:	160 if '=' in option:

168 return option.split('=', 1)	161 return option.split('=', 1)

(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
244 Parsed filter.	237 Parsed filter.

245	238

246 """	239 """

247 if '#' in text:	240 if '#' in text:

248 match = HIDING_FILTER_REGEXP.search(text)	241 match = HIDING_FILTER_REGEXP.search(text)

249 if match:	242 if match:

250 return _parse_hiding_filter(text, *match.groups())	243 return _parse_hiding_filter(text, *match.groups())

251 return _parse_blocking_filter(text)	244 return _parse_blocking_filter(text)

252	245

253	246

	247 def _decode_if_bytes(s):

	248 return s.decode('utf-8') if isinstance(s, type(b'')) else s

	249

	250

254 def parse_line(line_text):	251 def parse_line(line_text):

255 """Parse one line of a filter list.	252 """Parse one line of a filter list.

256	253

257 Note that parse_line() doesn't handle special comments, hence never returns	254 Note that parse_line() doesn't handle headers and special comments,

258 a Metadata() object, Adblock Plus only considers metadata when parsing the	255 hence never returns a Header() or Metadata() object. Adblock Plus only

259 whole filter list and only if they are given at the top of the filter list.	256 considers headers and metadata when parsing the whole filter list and

	257 only if they are given at the top of the filter list.

260	258

261 Parameters	259 Parameters

262 ----------	260 ----------

263 line_text : str	261 line_text : str

264 Line of a filter list.	262 Line of a filter list.

265	263

266 Returns	264 Returns

267 -------	265 -------

268 namedtuple	266 namedtuple

269 Parsed line (see `_line_type`).	267 Parsed line (see `_line_type`).

270	268

271 Raises	269 Raises

272 ------	270 ------

273 ParseError	271 ParseError

274 ParseError: If the line can't be parsed.	272 ParseError: If the line can't be parsed.

275 """	273 """

276 if isinstance(line_text, type(b'')):	274 content = _decode_if_bytes(line_text).strip()

277 line_text = line_text.decode('utf-8')

278

279 content = line_text.strip()

280	275

281 if content == '':	276 if content == '':

282 line = EmptyLine()	277 line = EmptyLine()

283 elif content.startswith('!'):	278 elif content.startswith('!'):

284 line = Comment(content[1:].lstrip())	279 line = Comment(content[1:].lstrip())

285 elif content.startswith('%') and content.endswith('%'):	280 elif content.startswith('%') and content.endswith('%'):

286 line = _parse_instruction(content)	281 line = _parse_instruction(content)

287 elif content.startswith('[') and content.endswith(']'):

288 line = _parse_header(content)

289 else:	282 else:

290 line = parse_filter(content)	283 line = parse_filter(content)

291	284

292 assert line.to_string().replace(' ', '') == content.replace(' ', '')	285 assert line.to_string().replace(' ', '') == content.replace(' ', '')

293 return line	286 return line

294	287

295	288

296 def parse_filterlist(lines):	289 def parse_filterlist(lines):

297 """Parse filter list from an iterable.	290 """Parse filter list from an iterable.

298	291

(...skipping 10 matching lines...) Expand all Loading...
309 Raises	302 Raises

310 ------	303 ------

311 ParseError	304 ParseError

312 Thrown during iteration for invalid filter list lines.	305 Thrown during iteration for invalid filter list lines.

313 TypeError	306 TypeError

314 If `lines` is not iterable.	307 If `lines` is not iterable.

315	308

316 """	309 """

317 metadata_closed = False	310 metadata_closed = False

318	311

319 for line in lines:	312 for i, line in enumerate(lines):

320 result = parse_line(line)	313 text = _decode_if_bytes(line)

321	314

322 if result.type == 'comment':	315 if i == 0:

323 match = METADATA_REGEXP.match(result.text)	316 match = HEADER_REGEXP.match(text)

324 if match:	317 if match:

325 key, value = match.groups()	318 version = match.group(1)

	319 if not version:

	320 raise ParseError('Malformed header', text)

326	321

327 # Historically, checksums can occur at the bottom of the	322 yield Header(version)

328 # filter list. Checksums are no longer used by Adblock Plus,	323 continue

329 # but in order to strip them (in abp.filters.renderer),

330 # we have to make sure to still parse them regardless of

331 # their position in the filter list.

332 if not metadata_closed or key.lower() == 'checksum':

333 result = Metadata(key, value)

334	324

335 if result.type not in {'header', 'metadata'}:	325 match = METADATA_REGEXP.match(text)

336 metadata_closed = True	326 if match:

	327 key, value = match.groups()

337	328

338 yield result	329 # Historically, checksums can occur at the bottom of the

	330 # filter list. Checksums are no longer used by Adblock Plus,

	331 # but in order to strip them (in abp.filters.renderer),

	332 # we have to make sure to still parse them regardless of

	333 # their position in the filter list.

	334 if not metadata_closed or key.lower() == 'checksum':

	335 yield Metadata(key, value)

	336 continue

	337

	338 metadata_closed = True

	339 yield parse_line(text)

OLD	NEW

« no previous file with comments | « no previous file | tests/test_parser.py » ('j') | no next file with comments »