| Index: cms/converters.py |
| =================================================================== |
| --- a/cms/converters.py |
| +++ b/cms/converters.py |
| @@ -14,31 +14,33 @@ |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| from __future__ import unicode_literals |
| import os |
| import HTMLParser |
| import re |
| import urlparse |
| +import json |
| import jinja2 |
| import markdown |
| # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs |
| # are inserted into the <head> tag |
| orig_isBlockLevel = markdown.util.isBlockLevel |
| def isBlockLevel(tag): |
|     if tag == 'head': |
|         return True |
|     return orig_isBlockLevel(tag) |
| + |
| markdown.util.isBlockLevel = isBlockLevel |
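
As a quick illustration of the patched behaviour (an assumed Python 2 session, after importing this module): the wrapper now reports <head> as block-level, while other tags still go through the original check. Depending on the markdown version, the original function may return a regex match object rather than a strict boolean, hence the bool() below.

    >>> markdown.util.isBlockLevel('head')
    True
    >>> bool(markdown.util.isBlockLevel('span'))
    False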
| html_escapes = { |
|     '<': '&lt;', |
|     '>': '&gt;', |
|     '&': '&amp;', |
|     '"': '&quot;', |
|     "'": '&#39;', |
| @@ -112,33 +114,53 @@ |
|     def handle_entityref(self, name): |
|         self._append_text(self.unescape('&{};'.format(name))) |
|     def handle_charref(self, name): |
|         self._append_text(self.unescape('&#{};'.format(name))) |
| +def parse_metadata(page, data): |
|
Vasily Kuznetsov
2017/08/21 18:19:50
There's an implicit duplication here, where we man…
rosie
2017/08/23 18:13:04
Done.
rosie
2017/08/23 18:13:04
Looks good. :)
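
For context, a rough usage sketch of the two metadata styles the new parse_metadata helper accepts (page name and fields invented for illustration):

    # JSON front matter, consumed via json.JSONDecoder().raw_decode,
    # which also reports where the JSON object ends:
    metadata, body = parse_metadata(
        'about', '{"title": "About"}\nBody text')
    # metadata == {'page': 'about', 'title': 'About'}, body == 'Body text'

    # Legacy "key = value" lines, handled by the fallback branch:
    metadata, body = parse_metadata(
        'about', 'title = About\ntags = [a, b]\nBody text')
    # metadata['tags'] == ['a', 'b'], body == 'Body text'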
|
| +    try: |
| +        decoder = json.JSONDecoder() |
| +        json_data, index = decoder.raw_decode(data) |
| +        json_data['page'] = page |
| +        return json_data, data[index:].strip() |
| +    except ValueError: |
| +        metadata = {'page': page} |
| +        lines = data.splitlines(True) |
| +        for i, line in enumerate(lines): |
| +            if not re.search(r'^\s*[\w\-]+\s*=', line): |
| +                break |
| +            name, value = line.split('=', 1) |
| +            value = value.strip() |
| +            if value.startswith('[') and value.endswith(']'): |
| +                value = [element.strip() for element in value[1:-1].split(',')] |
| +            lines[i] = '' |
| +            metadata[name.strip()] = value |
| +        page_data = ''.join(lines) |
| +        return metadata, page_data |
| + |
| + |
| def parse_page_content(page, data): |
| """Separate page content into metadata (dict) and body text (str)""" |
| -    page_data = {'page': page} |
| -    lines = data.splitlines(True) |
| -    for i, line in enumerate(lines): |
| -        if line.strip() in {'<!--', '-->'}: |
| -            lines[i] = '' |
| -            continue |
| -        if not re.search(r'^\s*[\w\-]+\s*=', line): |
| -            break |
| -        name, value = line.split('=', 1) |
| -        value = value.strip() |
| -        if value.startswith('[') and value.endswith(']'): |
| -            value = [element.strip() for element in value[1:-1].split(',')] |
| -        lines[i] = '\n' |
| -        page_data[name.strip()] = value |
| -    return page_data, ''.join(lines) |
| +    # If metadata is in a comment block, extract it |
| +    if data.lstrip().startswith('<!--'): |
| +        start_index = re.search('<!--', data).end() |
|
Vasily Kuznetsov
2017/08/21 18:19:50
I think regular expressions are a bit of an overkill…
rosie
2017/08/23 18:13:05
Yeah, that looks cleaner and avoids regular expressions…
Vasily Kuznetsov
2017/08/25 10:12:28
There's not much added complexity to the constants…
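
The alternative discussed above would swap the re.search calls for module-level string constants and plain str methods; a sketch of that shape (constant names are illustrative, not taken from the patch):

    COMMENT_START = '<!--'
    COMMENT_END = '-->'

    if data.lstrip().startswith(COMMENT_START):
        start_index = data.index(COMMENT_START) + len(COMMENT_START)
        end_index = data.index(COMMENT_END)
        comment = data[start_index:end_index]
        page_data = data[end_index + len(COMMENT_END):]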
|
| +        end_match = re.search('-->', data) |
| +        end_index = end_match.start() |
| +        comment = data[start_index:end_index] |
| +        page_data = data[end_match.end():] |
| +        metadata, comment_data = parse_metadata(page, comment.strip()) |
| +        page_data_result = '{}{}{}{}'.format('<!--\n', comment_data, |
| +                                             '\n-->\n\n', page_data.strip()) |
| +        return metadata, page_data_result |
| +    metadata, page_data = parse_metadata(page, data.strip()) |
| +    return metadata, page_data |
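
To make the comment-block branch concrete, a small worked example of the rewritten function (input invented for illustration):

    metadata, page_data = parse_page_content(
        'about', '<!--\ntitle = About\n-->\nBody text')
    # metadata == {'page': 'about', 'title': 'About'}
    # The consumed metadata lines are dropped but the comment markers
    # are preserved: page_data == '<!--\n\n-->\n\nBody text'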
| class Converter: |
|     whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
|     missing_translations = 0 |
|     total_translations = 0 |
|     def __init__(self, params, key='pagedata'): |
| @@ -553,13 +575,14 @@ |
|         stack = [{'level': 0, 'subitems': structured}] |
|         for item in flat: |
|             while stack[-1]['level'] >= item['level']: |
|                 stack.pop() |
|             stack[-1]['subitems'].append(item) |
|             stack.append(item) |
|         return structured |
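
As a quick illustration of this stack walk: it nests a flat list of items carrying numeric levels by appending each item to the closest preceding item with a smaller level (toy data, assuming each item already has a 'subitems' list, as the sentinel entry suggests):

    flat = [{'level': 1, 'subitems': []},
            {'level': 2, 'subitems': []},
            {'level': 1, 'subitems': []}]
    # After the loop, structured holds the two level-1 items,
    # and the level-2 item sits in the first one's 'subitems'.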
| + |
| converters = { |
|     'html': RawConverter, |
|     'md': MarkdownConverter, |
|     'tmpl': TemplateConverter, |
| } |
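
For orientation, this mapping is keyed by source-file extension, so calling code can pick a converter along these lines (a sketch; the real call sites are not shown in this review):

    ext = 'page.md'.rsplit('.', 1)[1]
    converter_class = converters[ext]  # MarkdownConverter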