cms/converters.py - Issue 29472555: Issue 4867 - Add global get_pages_metadata to template converters

Side by Side Diff: cms/converters.py

Issue 29472555: Issue 4867 - Add global get_pages_metadata to template converters (Closed)

Patch Set: refactor file metadata parsing, fix tests Created June 26, 2017, 7:22 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of the Adblock Plus web scripts,	1 # This file is part of the Adblock Plus web scripts,

2 # Copyright (C) 2006-2017 eyeo GmbH	2 # Copyright (C) 2006-2017 eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

(...skipping 98 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
109 # the document.	109 # the document.

110 self._append_text(data)	110 self._append_text(data)

111	111

112 def handle_entityref(self, name):	112 def handle_entityref(self, name):

113 self._append_text(self.unescape('&{};'.format(name)))	113 self._append_text(self.unescape('&{};'.format(name)))

114	114

115 def handle_charref(self, name):	115 def handle_charref(self, name):

116 self._append_text(self.unescape('&#{};'.format(name)))	116 self._append_text(self.unescape('&#{};'.format(name)))

117	117

118	118

	119 def get_page_metadata(page, data):

	120 """Generator which gets per page metadata and corresponding line indices"""

	121 for i, line in enumerate(data.splitlines()):

	122 if not re.search(r'^\s[\w\-]+\s=', line):

	123 break

	124 name, value = line.split('=', 1)

	125 value = value.strip()

	126 if value.startswith('[') and value.endswith(']'):

	127 value = [element.strip() for element in value[1:-1].split(',')]

	128 yield name.strip(), value, i

	129

	130

119 class Converter:	131 class Converter:

120 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'}	132 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'}

121 missing_translations = 0	133 missing_translations = 0

122 total_translations = 0	134 total_translations = 0

123	135

124 def __init__(self, params, key='pagedata'):	136 def __init__(self, params, key='pagedata'):

125 self._params = params	137 self._params = params

126 self._key = key	138 self._key = key

127 self._attribute_parser = AttributeParser(self.whitelist)	139 self._attribute_parser = AttributeParser(self.whitelist)

128 self._seen_defaults = {}	140 self._seen_defaults = {}

129	141

130 # Read in any parameters specified at the beginning of the file	142 # Read in any parameters specified at the beginning of the file

131 data, filename = params[key]	143 data, filename = params[key]

132 lines = data.splitlines(True)	144 lines = data.splitlines(True)

133 for i, line in enumerate(lines):	145 for name, value, i in get_page_metadata(params['page'], data):
	Vasily Kuznetsov 2017/06/27 13:32:15 This refactoring moved the code into a separate fu This refactoring moved the code into a separate function, but the code is still very interdependent: this `i` in return tuples has nothing to do with metadata and why are we even returning an iterator of tuples, wouldn't it be better to return a dictionary instead. I propose that we go one step further. How about we change `get_page_metadata` to `parse_page(data) -> (metadata, pagecontent)` (where `metadata` is a dictionary and `pagecontent` is a string). This function will have all parsing logic in it, from splitting lines to making the dictionary out of metadata. Then we can just `params.update(metadata)` and `params[key] = (pagecontent, filename)`. Jon Sonesen 2017/06/28 14:31:19 The line index is required to strip the metadata f Show quoted text On 2017/06/27 13:32:15, Vasily Kuznetsov wrote: > This refactoring moved the code into a separate function, but the code is still > very interdependent: this `i` in return tuples has nothing to do with metadata > and why are we even returning an iterator of tuples, wouldn't it be better to > return a dictionary instead. > I propose that we go one step further. How about we change `get_page_metadata` > to `parse_page(data) -> (metadata, pagecontent)` (where `metadata` is a > dictionary and `pagecontent` is a string). This function will have all parsing > logic in it, from splitting lines to making the dictionary out of metadata. Then > we can just `params.update(metadata)` and `params[key] = (pagecontent, > filename)`. The line index is required to strip the metadata from the page data which gets inserted to params['page_content']. Jon Sonesen 2017/07/03 09:06:06 Done. 
Show quoted text On 2017/06/27 13:32:15, Vasily Kuznetsov wrote: > This refactoring moved the code into a separate function, but the code is still > very interdependent: this `i` in return tuples has nothing to do with metadata > and why are we even returning an iterator of tuples, wouldn't it be better to > return a dictionary instead. > I propose that we go one step further. How about we change `get_page_metadata` > to `parse_page(data) -> (metadata, pagecontent)` (where `metadata` is a > dictionary and `pagecontent` is a string). This function will have all parsing > logic in it, from splitting lines to making the dictionary out of metadata. Then > we can just `params.update(metadata)` and `params[key] = (pagecontent, > filename)`. Done.
134 if not re.search(r'^\s[\w\-]+\s=', line):	146 params[name] = value

135 break

136 name, value = line.split('=', 1)

137 params[name.strip()] = value.strip()

138 lines[i] = '\n'	147 lines[i] = '\n'

139 params[key] = (''.join(lines), filename)	148 params[key] = (''.join(lines), filename)

140	149

141 def localize_string(	150 def localize_string(

142 self, page, name, default, comment, localedata, escapes):	151 self, page, name, default, comment, localedata, escapes):

143	152

144 def escape(s):	153 def escape(s):

145 return re.sub(r'.',	154 return re.sub(r'.',

146 lambda match: escapes.get(match.group(0),	155 lambda match: escapes.get(match.group(0),

147 match.group(0)),	156 match.group(0)),

(...skipping 228 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
376 filters = {	385 filters = {

377 'translate': self.translate,	386 'translate': self.translate,

378 'linkify': self.linkify,	387 'linkify': self.linkify,

379 'toclist': self.toclist,	388 'toclist': self.toclist,

380 }	389 }

381	390

382 globals = {	391 globals = {

383 'get_string': self.get_string,	392 'get_string': self.get_string,

384 'has_string': self.has_string,	393 'has_string': self.has_string,

385 'get_page_content': self.get_page_content,	394 'get_page_content': self.get_page_content,

	395 'get_pages_metadata': self.get_pages_metadata,

386 }	396 }

387	397

388 for dirname, dictionary in [('filters', filters),	398 for dirname, dictionary in [('filters', filters),

389 ('globals', globals)]:	399 ('globals', globals)]:

390 for filename in self._params['source'].list_files(dirname):	400 for filename in self._params['source'].list_files(dirname):

391 root, ext = os.path.splitext(filename)	401 root, ext = os.path.splitext(filename)

392 if ext.lower() != '.py':	402 if ext.lower() != '.py':

393 continue	403 continue

394	404

395 path = os.path.join(dirname, filename)	405 path = os.path.join(dirname, filename)

(...skipping 68 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
464 locale = self._params['locale']	474 locale = self._params['locale']

465	475

466 locale, url = self._params['source'].resolve_link(page, locale)	476 locale, url = self._params['source'].resolve_link(page, locale)

467 return jinja2.Markup('<a{}>'.format(''.join(	477 return jinja2.Markup('<a{}>'.format(''.join(

468 ' {}="{}"'.format(name, jinja2.escape(value)) for name, value in [	478 ' {}="{}"'.format(name, jinja2.escape(value)) for name, value in [

469 ('href', url),	479 ('href', url),

470 ('hreflang', locale)	480 ('hreflang', locale)

471 ] + attrs.items()	481 ] + attrs.items()

472 )))	482 )))

473	483

	484 def get_pages_metadata(self, filters=None):

	485 if not isinstance(filters, dict) and filters:
	Vasily Kuznetsov 2017/06/27 13:32:15 This code seems to allow things like `filters = [] This code seems to allow things like `filters = []` or `filters = 0` (which will crash inside of `filter_metadata`). The check should probably be something like `if filters is not None and not isinstance(...):` instead. Jon Sonesen 2017/06/28 14:31:19 Acknowledged. Show quoted text On 2017/06/27 13:32:15, Vasily Kuznetsov wrote: > This code seems to allow things like `filters = []` or `filters = 0` (which will > crash inside of `filter_metadata`). The check should probably be something like > `if filters is not None and not isinstance(...):` instead. Acknowledged. Jon Sonesen 2017/07/03 09:06:06 Done. Show quoted text On 2017/06/27 13:32:15, Vasily Kuznetsov wrote: > This code seems to allow things like `filters = []` or `filters = 0` (which will > crash inside of `filter_metadata`). The check should probably be something like > `if filters is not None and not isinstance(...):` instead. Done.
	486 raise TypeError('Filters are not a dictionary')

	487

	488 return_data = []

	489 for page_name, _format in self._params['source'].list_pages():

	490 data, filename = self._params['source'].read_page(page_name,

	491 _format)

	492 page_data = {'page': page_name}

	493 for name, value, i in get_page_metadata(page_name, data):

	494 page_data[name] = value

	495 if self.filter_metadata(filters, page_data) is True:

	496 return_data.append(page_data)

	497 return return_data

	498

	499 def filter_metadata(self, filters, metadata):

	500 # if only the page key is in the metadata then there

	501 # was no user defined metadata

	502 if metadata.keys() == ['page']:
	Vasily Kuznetsov 2017/06/27 13:32:15 Is this a requirement that such metadata should be Is this a requirement that such metadata should be ignored? This is an honest question, I see why you might want to do this, but I can also imagine use cases for including those pages. Jon Sonesen 2017/06/28 14:31:19 This is done to keep a uniform api, the previous i Show quoted text On 2017/06/27 13:32:15, Vasily Kuznetsov wrote: > Is this a requirement that such metadata should be ignored? This is an honest > question, I see why you might want to do this, but I can also imagine use cases > for including those pages. This is done to keep a uniform api, the previous implementation does not include files which have no user defined metadata so adding it now may cause a regression, however this is probably best directed to @julian since he is the user. Jon Sonesen 2017/07/03 09:06:06 Still waiting on Julian's opinion. I guess I get wh Show quoted text On 2017/06/28 14:31:19, Jon Sonesen wrote: > On 2017/06/27 13:32:15, Vasily Kuznetsov wrote: > > Is this a requirement that such metadata should be ignored? This is an honest > > question, I see why you might want to do this, but I can also imagine use > cases > > for including those pages. > > This is done to keep a uniform api, the previous implementation does not include > files which have no user defined metadata so adding it now may cause a > regression, however this is probably best directed to @julian since he is the > user. Still waiting on Julian's opinion. I guess I get what you mean here, since having the pages as members regardless of their metadata could be useful. juliandoucette 2017/07/03 21:48:51 I don't understand the question. Can you please ex Show quoted text On 2017/06/28 14:31:19, Jon Sonesen wrote: > however this is probably best directed to @julian since he is the user. I don't understand the question. Can you please explain? 
Vasily Kuznetsov 2017/07/04 07:43:48 The question is: if a page has no explicit metadat Show quoted text On 2017/07/03 21:48:51, juliandoucette wrote: > On 2017/06/28 14:31:19, Jon Sonesen wrote: > > however this is probably best directed to @julian since he is the user. > > I don't understand the question. Can you please explain? > The question is: if a page has no explicit metadata (so it will only have 'page' key in the dictionary) but matches the query (probably means that the query was empty), should it be in the results? Current implementation always filters out pages with no explicit metadata. juliandoucette 2017/07/04 09:57:24 Or that I queried by page name? Show quoted text On 2017/07/04 07:43:48, Vasily Kuznetsov wrote: > The question is: if a page has no explicit metadata (so it will only have 'page' > key in the dictionary) but matches the query (probably means that the query was > empty), Or that I queried by page name? Show quoted text > should it be in the results? Current implementation always filters out > pages with no explicit metadata. So I would get an object like this? { "page": "name" "content": "..." } If so, why would I want to filter this out? Vasily Kuznetsov 2017/07/04 10:23:34 The content is not in the dictionary currently. It Show quoted text On 2017/07/04 09:57:24, juliandoucette wrote: > On 2017/07/04 07:43:48, Vasily Kuznetsov wrote: > > The question is: if a page has no explicit metadata (so it will only have > 'page' > > key in the dictionary) but matches the query (probably means that the query > was > > empty), > > Or that I queried by page name? > > > should it be in the results? Current implementation always filters out > > pages with no explicit metadata. > > So I would get an object like this? > > { > "page": "name" > "content": "..." > } > > If so, why would I want to filter this out? The content is not in the dictionary currently. It can be added, but I would say that should be another ticket. 
In any case, I was doubting that you would want to filter such pages out, thanks for confirming that. Jon: I would say this `if` should be removed.
	503 return False

	504 if filters is None:

	505 return True

	506 for filter_name, filter_value in filters.items():

	507 if filter_name not in metadata:

	508 return False

	509 if isinstance(metadata[filter_name], list):

	510 if isinstance(filter_value, basestring):

	511 filter_value = [filter_value]

	512 for option in filter_value:

	513 if str(option) not in metadata[filter_name]:

	514 return False

	515 elif filter_value != metadata[filter_name]:

	516 return False

	517 return True

	518

474 def toclist(self, content):	519 def toclist(self, content):

475 toc_re = r'<h(\d)\s[^<>]\bid="([^<>"]+)"[^<>]>(.*?)</h\1>'	520 toc_re = r'<h(\d)\s[^<>]\bid="([^<>"]+)"[^<>]>(.*?)</h\1>'

476 flat = []	521 flat = []

477 for match in re.finditer(toc_re, content, re.S):	522 for match in re.finditer(toc_re, content, re.S):

478 flat.append({	523 flat.append({

479 'level': int(match.group(1)),	524 'level': int(match.group(1)),

480 'anchor': jinja2.Markup(match.group(2)).unescape(),	525 'anchor': jinja2.Markup(match.group(2)).unescape(),

481 'title': jinja2.Markup(match.group(3)).unescape(),	526 'title': jinja2.Markup(match.group(3)).unescape(),

482 'subitems': [],	527 'subitems': [],

483 })	528 })

484	529

485 structured = []	530 structured = []

486 stack = [{'level': 0, 'subitems': structured}]	531 stack = [{'level': 0, 'subitems': structured}]

487 for item in flat:	532 for item in flat:

488 while stack[-1]['level'] >= item['level']:	533 while stack[-1]['level'] >= item['level']:

489 stack.pop()	534 stack.pop()

490 stack[-1]['subitems'].append(item)	535 stack[-1]['subitems'].append(item)

491 stack.append(item)	536 stack.append(item)

492 return structured	537 return structured

493	538

494 converters = {	539 converters = {

495 'html': RawConverter,	540 'html': RawConverter,

496 'md': MarkdownConverter,	541 'md': MarkdownConverter,

497 'tmpl': TemplateConverter,	542 'tmpl': TemplateConverter,

498 }	543 }

OLD	NEW

« no previous file with comments | « no previous file | tests/expected_output/global » ('j') | no next file with comments »