| Index: cms/converters.py |
| =================================================================== |
| --- a/cms/converters.py |
| +++ b/cms/converters.py |
| @@ -14,31 +14,33 @@ |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| from __future__ import unicode_literals |
| import os |
| import HTMLParser |
| import re |
| import urlparse |
| +import json |
| import jinja2 |
| import markdown |
| # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs |
| # are inserted into the <head> tag |
| orig_isBlockLevel = markdown.util.isBlockLevel |
| def isBlockLevel(tag): |
| if tag == 'head': |
| return True |
| return orig_isBlockLevel(tag) |
| + |
|
Sebastian Noack
2017/08/29 22:49:39
Adding this blank line is unrelated.
rosie
2018/03/26 02:32:21
True. My linter was showing a warning because ther
Sebastian Noack
2018/03/26 02:57:47
Strictly, there should be two blank lines surround
|
| markdown.util.isBlockLevel = isBlockLevel |
| html_escapes = { |
| '<': '&lt;', |
| '>': '&gt;', |
| '&': '&amp;', |
| '"': '&quot;', |
| "'": '&#39;', |
| @@ -112,33 +114,54 @@ |
| def handle_entityref(self, name): |
| self._append_text(self.unescape('&{};'.format(name))) |
| def handle_charref(self, name): |
| self._append_text(self.unescape('&#{};'.format(name))) |
| +def parse_metadata(page, data): |
| + metadata = {'page': page} |
| + try: |
| + decoder = json.JSONDecoder() |
| + json_data, index = decoder.raw_decode(data) |
| + metadata.update(json_data) |
| + page_data = data[index:].strip() |
|
Sebastian Noack
2017/08/29 22:49:39
Note that when parsing the legacy non-JSON format,
Vasily Kuznetsov
2017/08/30 10:50:00
This consideration is rather non-obvious, for exam
rosie
2018/03/26 02:32:21
(This function was moved to utils.py.) Now, the li
|
| + except ValueError: |
|
Sebastian Noack
2017/08/29 22:49:39
In case the data can be interpreted as JSON, but r
rosie
2018/03/26 02:32:21
The check to make sure a dict is returned now happ
|
| + lines = data.splitlines(True) |
| + for i, line in enumerate(lines): |
| + if not re.search(r'^\s*[\w\-]+\s*=', line): |
|
Sebastian Noack
2017/08/29 22:49:39
There is some redundancy in the regular expression
rosie
2018/03/26 02:32:20
Done.
|
| + break |
| + name, value = line.split('=', 1) |
| + value = value.strip() |
| + if value.startswith('[') and value.endswith(']'): |
| + value = [element.strip() for element in value[1:-1].split(',')] |
| + lines[i] = '' |
| + metadata[name.strip()] = value |
| + page_data = ''.join(lines) |
| + return metadata, page_data |
| + |
| + |
| def parse_page_content(page, data): |
| """Separate page content into metadata (dict) and body text (str)""" |
| - page_data = {'page': page} |
| - lines = data.splitlines(True) |
| - for i, line in enumerate(lines): |
| - if line.strip() in {'<!--', '-->'}: |
| - lines[i] = '' |
| - continue |
| - if not re.search(r'^\s*[\w\-]+\s*=', line): |
| - break |
| - name, value = line.split('=', 1) |
| - value = value.strip() |
| - if value.startswith('[') and value.endswith(']'): |
| - value = [element.strip() for element in value[1:-1].split(',')] |
| - lines[i] = '\n' |
| - page_data[name.strip()] = value |
| - return page_data, ''.join(lines) |
| + # If metadata is in a comment block, extract it |
| + comment_start = '<!--' |
|
Sebastian Noack
2017/08/29 22:49:39
It seems much simpler to use a regular expression
Vasily Kuznetsov
2017/08/30 10:50:00
I think it was my advice to not use regexps here b
rosie
2018/03/26 02:32:20
Done.
|
| + comment_end = '-->' |
| + if data.lstrip().startswith(comment_start): |
| + start_index = data.index(comment_start) + len(comment_start) |
| + end_index = data.index(comment_end) |
| + comment = data[start_index:end_index] |
| + page_tail = data[end_index + len(comment_end):] |
| + metadata, comment_data = parse_metadata(page, comment.strip()) |
|
Sebastian Noack
2017/08/29 22:49:39
Does stripping the comment have any effect here? I
Vasily Kuznetsov
2017/08/30 10:50:01
JSON parsing fails on leading space, but if we wan
rosie
2018/03/26 02:32:21
Done.
|
| + page_data = '{}\n{}\n{}\n\n{}'.format(comment_start, comment_data, |
| + comment_end, page_tail.strip()) |
| + else: |
| + metadata, page_data = parse_metadata(page, data.strip()) |
| + return metadata, page_data |
| class Converter: |
| whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
| missing_translations = 0 |
| total_translations = 0 |
| def __init__(self, params, key='pagedata'): |