Index: cms/converters.py |
=================================================================== |
--- a/cms/converters.py |
+++ b/cms/converters.py |
@@ -14,31 +14,33 @@ |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
from __future__ import unicode_literals |
import os |
import HTMLParser |
import re |
import urlparse |
+import json |
import jinja2 |
import markdown |
# Monkey-patch Markdown's isBlockLevel function so that <head> counts as a
# block-level tag; this ensures that no paragraphs are inserted into it.
orig_isBlockLevel = markdown.util.isBlockLevel


def isBlockLevel(tag):
    """Return True for 'head', otherwise defer to Markdown's own check."""
    return tag == 'head' or orig_isBlockLevel(tag)


markdown.util.isBlockLevel = isBlockLevel
html_escapes = { |
'<': '<', |
'>': '>', |
'&': '&', |
'"': '"', |
"'": ''', |
@@ -112,33 +114,53 @@ |
def handle_entityref(self, name):
    """Append the unescaped form of the named entity reference ``&name;``."""
    entity = '&{};'.format(name)
    self._append_text(self.unescape(entity))
def handle_charref(self, name):
    """Append the unescaped form of the character reference ``&#name;``."""
    reference = '&#{};'.format(name)
    self._append_text(self.unescape(reference))
def parse_metadata(page, data):
    """Split *data* into page metadata (dict) and the remaining text (str).

    Two metadata formats are recognised at the very start of *data*:

    * A JSON object. It is decoded with ``JSONDecoder.raw_decode``, its
      ``'page'`` key is set to *page*, and the text following the object is
      returned with surrounding whitespace stripped.
    * Legacy ``name = value`` lines, consumed until the first line that does
      not look like an assignment. A value written as ``[a, b, c]`` becomes
      a list of stripped strings. The metadata dict starts out as
      ``{'page': page}`` and the consumed lines are removed from the
      returned text.

    If neither format matches, the metadata is just ``{'page': page}`` and
    *data* is returned unchanged.
    """
    try:
        json_data, index = json.JSONDecoder().raw_decode(data)
    except ValueError:
        json_data = None

    # Only a JSON *object* can be metadata. Content that merely begins with
    # some other JSON value (a number, a quoted string, an array) is regular
    # page text and must go through the legacy parser instead of failing on
    # the item assignment below.
    if isinstance(json_data, dict):
        json_data['page'] = page
        return json_data, data[index:].strip()

    metadata = {'page': page}
    lines = data.splitlines(True)
    for i, line in enumerate(lines):
        if not re.search(r'^\s*[\w\-]+\s*=', line):
            break
        name, value = line.split('=', 1)
        value = value.strip()
        if value.startswith('[') and value.endswith(']'):
            value = [element.strip() for element in value[1:-1].split(',')]
        # Blank out the consumed line so it does not reappear in the body.
        lines[i] = ''
        metadata[name.strip()] = value
    return metadata, ''.join(lines)
+ |
+ |
def parse_page_content(page, data):
    """Separate page content into metadata (dict) and body text (str).

    If *data* (ignoring leading whitespace) starts with an HTML comment,
    the metadata is parsed from inside that comment. Whatever part of the
    comment is not metadata is kept and re-emitted as a comment in front of
    the stripped remainder of the page. Otherwise the metadata is parsed
    from the start of the stripped *data*.

    A leading ``<!--`` that is never closed is not treated as a metadata
    comment; the content is then parsed as if no comment were present.
    """
    comment_open = '<!--'
    comment_close = '-->'

    # If metadata is in a comment block, extract it
    if data.lstrip().startswith(comment_open):
        start_index = data.index(comment_open) + len(comment_open)
        end_index = data.find(comment_close)
        # find() returns -1 for an unclosed comment; fall through to the
        # plain parsing path below rather than failing.
        if end_index != -1:
            comment = data[start_index:end_index]
            page_data = data[end_index + len(comment_close):]
            metadata, comment_data = parse_metadata(page, comment.strip())
            page_data_result = '{}{}{}{}'.format('<!--\n', comment_data,
                                                 '\n-->\n\n',
                                                 page_data.strip())
            return metadata, page_data_result

    metadata, page_data = parse_metadata(page, data.strip())
    return metadata, page_data
class Converter: |
whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
missing_translations = 0 |
total_translations = 0 |
def __init__(self, params, key='pagedata'): |
@@ -553,13 +575,14 @@ |
stack = [{'level': 0, 'subitems': structured}] |
for item in flat: |
while stack[-1]['level'] >= item['level']: |
stack.pop() |
stack[-1]['subitems'].append(item) |
stack.append(item) |
return structured |
+ |
# Converter classes by key; the keys look like page source file extensions
# (NOTE(review): confirm against the code that looks entries up).
converters = {
    'html': RawConverter,
    'md': MarkdownConverter,
    'tmpl': TemplateConverter,
}