| Index: cms/converters.py |
| =================================================================== |
| --- a/cms/converters.py |
| +++ b/cms/converters.py |
| @@ -14,31 +14,33 @@ |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| from __future__ import unicode_literals |
| import os |
| import HTMLParser |
| import re |
| import urlparse |
| +import json |
| import jinja2 |
| import markdown |
| # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs |
| # are inserted into the <head> tag |
| orig_isBlockLevel = markdown.util.isBlockLevel |
| def isBlockLevel(tag): |
|     if tag == 'head': |
|         return True |
|     return orig_isBlockLevel(tag) |
| + |
| markdown.util.isBlockLevel = isBlockLevel |
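
As a quick illustration of the patched behaviour (an assumed Python 2 session, after importing this module): the wrapper now reports <head> as block-level, while other tags still go through the original check. Depending on the markdown version, the original function may return a regex match object rather than a strict boolean, hence the bool() below.

    >>> markdown.util.isBlockLevel('head')
    True
    >>> bool(markdown.util.isBlockLevel('span'))
    False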
| html_escapes = { |
|     '<': '&lt;', |
|     '>': '&gt;', |
|     '&': '&amp;', |
|     '"': '&quot;', |
|     "'": '&#39;', |
| @@ -112,33 +114,53 @@ |
|     def handle_entityref(self, name): |
|         self._append_text(self.unescape('&{};'.format(name))) |
|     def handle_charref(self, name): |
|         self._append_text(self.unescape('&#{};'.format(name))) |
| +def parse_metadata(page, data): |
|
Vasily Kuznetsov
2017/08/21 18:19:50
There's an implicit duplication here, where we man…
rosie
2017/08/23 18:13:04
Done.
rosie
2017/08/23 18:13:04
Looks good. :)
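
For context, a rough usage sketch of the two metadata styles the new parse_metadata helper accepts (page name and fields invented for illustration):

    # JSON front matter, consumed via json.JSONDecoder().raw_decode,
    # which also reports where the JSON object ends:
    metadata, body = parse_metadata(
        'about', '{"title": "About"}\nBody text')
    # metadata == {'page': 'about', 'title': 'About'}, body == 'Body text'

    # Legacy "key = value" lines, handled by the fallback branch:
    metadata, body = parse_metadata(
        'about', 'title = About\ntags = [a, b]\nBody text')
    # metadata['tags'] == ['a', 'b'], body == 'Body text'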
|
| +    try: |
| +        decoder = json.JSONDecoder() |
| +        json_data, index = decoder.raw_decode(data) |
| +        json_data['page'] = page |
| +        return json_data, data[index:].strip() |
| +    except ValueError: |
| +        metadata = {'page': page} |
| +        lines = data.splitlines(True) |
| +        for i, line in enumerate(lines): |
| +            if not re.search(r'^\s*[\w\-]+\s*=', line): |
| +                break |
| +            name, value = line.split('=', 1) |
| +            value = value.strip() |
| +            if value.startswith('[') and value.endswith(']'): |
| +                value = [element.strip() for element in value[1:-1].split(',')] |
| +            lines[i] = '' |
| +            metadata[name.strip()] = value |
| +        page_data = ''.join(lines) |
| +        return metadata, page_data |
| + |
| + |
| def parse_page_content(page, data): |
| """Separate page content into metadata (dict) and body text (str)""" |
| -    page_data = {'page': page} |
| -    lines = data.splitlines(True) |
| -    for i, line in enumerate(lines): |
| -        if line.strip() in {'<!--', '-->'}: |
| -            lines[i] = '' |
| -            continue |
| -        if not re.search(r'^\s*[\w\-]+\s*=', line): |
| -            break |
| -        name, value = line.split('=', 1) |
| -        value = value.strip() |
| -        if value.startswith('[') and value.endswith(']'): |
| -            value = [element.strip() for element in value[1:-1].split(',')] |
| -        lines[i] = '\n' |
| -        page_data[name.strip()] = value |
| -    return page_data, ''.join(lines) |
| +    # If metadata is in a comment block, extract it |
| +    if data.lstrip().startswith('<!--'): |
| +        start_index = re.search('<!--', data).end() |
|
Vasily Kuznetsov
2017/08/21 18:19:50
I think regular expressions are a bit of an overkill…
rosie
2017/08/23 18:13:05
Yeah, that looks cleaner and avoids regular expressions…
Vasily Kuznetsov
2017/08/25 10:12:28
There's not much added complexity to the constants…
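
The alternative discussed above would swap the re.search calls for module-level string constants and plain str methods; a sketch of that shape (constant names are illustrative, not taken from the patch):

    COMMENT_START = '<!--'
    COMMENT_END = '-->'

    if data.lstrip().startswith(COMMENT_START):
        start_index = data.index(COMMENT_START) + len(COMMENT_START)
        end_index = data.index(COMMENT_END)
        comment = data[start_index:end_index]
        page_data = data[end_index + len(COMMENT_END):]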
|
| +        end_match = re.search('-->', data) |
| +        end_index = end_match.start() |
| +        comment = data[start_index:end_index] |
| +        page_data = data[end_match.end():] |
| +        metadata, comment_data = parse_metadata(page, comment.strip()) |
| +        page_data_result = '{}{}{}{}'.format('<!--\n', comment_data, |
| +                                             '\n-->\n\n', page_data.strip()) |
| +        return metadata, page_data_result |
| +    metadata, page_data = parse_metadata(page, data.strip()) |
| +    return metadata, page_data |
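
To make the comment-block branch concrete, a small worked example of the rewritten function (input invented for illustration):

    metadata, page_data = parse_page_content(
        'about', '<!--\ntitle = About\n-->\nBody text')
    # metadata == {'page': 'about', 'title': 'About'}
    # The consumed metadata lines are dropped but the comment markers
    # are preserved: page_data == '<!--\n\n-->\n\nBody text'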
| class Converter: |
|     whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
|     missing_translations = 0 |
|     total_translations = 0 |
|     def __init__(self, params, key='pagedata'): |
| @@ -553,13 +575,14 @@ |
|         stack = [{'level': 0, 'subitems': structured}] |
|         for item in flat: |
|             while stack[-1]['level'] >= item['level']: |
|                 stack.pop() |
|             stack[-1]['subitems'].append(item) |
|             stack.append(item) |
|         return structured |
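
As a quick illustration of this stack walk: it nests a flat list of items carrying numeric levels by appending each item to the closest preceding item with a smaller level (toy data, assuming each item already has a 'subitems' list, as the sentinel entry suggests):

    flat = [{'level': 1, 'subitems': []},
            {'level': 2, 'subitems': []},
            {'level': 1, 'subitems': []}]
    # After the loop, structured holds the two level-1 items,
    # and the level-2 item sits in the first one's 'subitems'.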
| + |
| converters = { |
|     'html': RawConverter, |
|     'md': MarkdownConverter, |
|     'tmpl': TemplateConverter, |
| } |
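
For orientation, this mapping is keyed by source-file extension, so calling code can pick a converter along these lines (a sketch; the real call sites are not shown in this review):

    ext = 'page.md'.rsplit('.', 1)[1]
    converter_class = converters[ext]  # MarkdownConverter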