Left: | ||
Right: |
OLD | NEW |
---|---|
1 # This file is part of the Adblock Plus web scripts, | 1 # This file is part of the Adblock Plus web scripts, |
2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-2017 eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
12 # | 12 # |
13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
15 | 15 |
16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
17 | 17 |
18 import os | 18 import os |
19 import HTMLParser | 19 import HTMLParser |
20 import re | 20 import re |
21 import urlparse | 21 import urlparse |
22 import json | |
22 | 23 |
23 import jinja2 | 24 import jinja2 |
24 import markdown | 25 import markdown |
25 | 26 |
26 | 27 |
# Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs
# are inserted into the <head> tag
orig_isBlockLevel = markdown.util.isBlockLevel


def isBlockLevel(tag):
    """Treat <head> as block-level; defer to Markdown for everything else."""
    return tag == 'head' or orig_isBlockLevel(tag)


markdown.util.isBlockLevel = isBlockLevel
38 | 40 |
# Characters that must be escaped in HTML text, mapped to their character
# references. (The identity mapping shown in the rendered diff was an
# artifact of HTML-unescaping; these are the intended replacement strings.)
html_escapes = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '"': '&quot;',
    "'": '&#39;',
}
46 | 48 |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
110 # the document. | 112 # the document. |
111 self._append_text(data) | 113 self._append_text(data) |
112 | 114 |
113 def handle_entityref(self, name): | 115 def handle_entityref(self, name): |
114 self._append_text(self.unescape('&{};'.format(name))) | 116 self._append_text(self.unescape('&{};'.format(name))) |
115 | 117 |
116 def handle_charref(self, name): | 118 def handle_charref(self, name): |
117 self._append_text(self.unescape('&#{};'.format(name))) | 119 self._append_text(self.unescape('&#{};'.format(name))) |
118 | 120 |
119 | 121 |
def parse_metadata(page, data):
    """Parse the metadata of a page's source.

    If ``data`` starts with a JSON object, that object (with the page name
    added under the key ``'page'``) is the metadata and everything after it
    is the body. Otherwise the legacy ``name = value`` header format is
    parsed instead: one assignment per line, stopping at the first line
    that is not an assignment; a value written as ``[a, b, c]`` becomes a
    list of stripped strings.

    Returns a ``(metadata, page_data)`` tuple where ``metadata`` is a dict
    and ``page_data`` is the remaining source text.
    """
    try:
        json_data, index = json.JSONDecoder().raw_decode(data)
        # raw_decode() happily parses scalar JSON too (a body starting
        # with a number, "true", a quoted string, ...); only an object is
        # valid metadata, so reject anything else and use the fallback.
        if not isinstance(json_data, dict):
            raise ValueError('metadata must be a JSON object')
        json_data['page'] = page
        return json_data, data[index:].strip()
    except ValueError:
        metadata = {'page': page}
        lines = data.splitlines(True)
        for i, line in enumerate(lines):
            if not re.search(r'^\s*[\w\-]+\s*=', line):
                break
            name, value = line.split('=', 1)
            value = value.strip()
            if value.startswith('[') and value.endswith(']'):
                value = [element.strip() for element in value[1:-1].split(',')]
            # Blank out consumed header lines so only the body survives
            # when the lines are joined back together below.
            lines[i] = ''
            metadata[name.strip()] = value
        return metadata, ''.join(lines)
142 | |
143 | |
def parse_page_content(page, data):
    """Separate page content into metadata (dict) and body text (str).

    When the source begins with an HTML comment, the metadata is parsed
    from the comment's contents and the comment markers are re-emitted
    around whatever text parse_metadata() leaves behind; otherwise the
    whole source is handed to parse_metadata() directly.
    """
    # The markers are fixed strings, so plain str methods suffice — no
    # regular expressions needed. Fall through to the plain path when the
    # leading comment is unterminated instead of crashing on a None match.
    if data.lstrip().startswith('<!--') and '-->' in data:
        start_index = data.index('<!--') + len('<!--')
        end_index = data.index('-->')
        comment = data[start_index:end_index]
        rest = data[end_index + len('-->'):]
        metadata, comment_data = parse_metadata(page, comment.strip())
        page_data_result = '<!--\n{}\n-->\n\n{}'.format(comment_data,
                                                        rest.strip())
        return metadata, page_data_result
    return parse_metadata(page, data.strip())
137 | 159 |
138 | 160 |
139 class Converter: | 161 class Converter: |
140 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} | 162 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
141 missing_translations = 0 | 163 missing_translations = 0 |
142 total_translations = 0 | 164 total_translations = 0 |
143 | 165 |
144 def __init__(self, params, key='pagedata'): | 166 def __init__(self, params, key='pagedata'): |
145 self._params = params | 167 self._params = params |
146 self._key = key | 168 self._key = key |
(...skipping 404 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
551 | 573 |
552 structured = [] | 574 structured = [] |
553 stack = [{'level': 0, 'subitems': structured}] | 575 stack = [{'level': 0, 'subitems': structured}] |
554 for item in flat: | 576 for item in flat: |
555 while stack[-1]['level'] >= item['level']: | 577 while stack[-1]['level'] >= item['level']: |
556 stack.pop() | 578 stack.pop() |
557 stack[-1]['subitems'].append(item) | 579 stack[-1]['subitems'].append(item) |
558 stack.append(item) | 580 stack.append(item) |
559 return structured | 581 return structured |
560 | 582 |
583 | |
# Dispatch table: page-source file extension -> converter class.
converters = dict(
    html=RawConverter,
    md=MarkdownConverter,
    tmpl=TemplateConverter,
)
OLD | NEW |