Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: cms/converters.py

Issue 29516687: Issue 4488 - Add support for JSON page front matter (Closed) Base URL: https://hg.adblockplus.org/cms
Patch Set: Cleaned up duplication, removed unnecessary regex Created Aug. 23, 2017, 6:12 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/expected_output/en/metadata_json » ('j') | tests/expected_output/en/sitemap » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 # This file is part of the Adblock Plus web scripts, 1 # This file is part of the Adblock Plus web scripts,
2 # Copyright (C) 2006-present eyeo GmbH 2 # Copyright (C) 2006-present eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 from __future__ import unicode_literals 16 from __future__ import unicode_literals
17 17
18 import os 18 import os
19 import HTMLParser 19 import HTMLParser
20 import re 20 import re
21 import urlparse 21 import urlparse
22 import json
22 23
23 import jinja2 24 import jinja2
24 import markdown 25 import markdown
25 26
26 27
27 # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs 28 # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs
28 # are inserted into the <head> tag 29 # are inserted into the <head> tag
29 orig_isBlockLevel = markdown.util.isBlockLevel 30 orig_isBlockLevel = markdown.util.isBlockLevel
30 31
31 32
32 def isBlockLevel(tag): 33 def isBlockLevel(tag):
33 if tag == 'head': 34 if tag == 'head':
34 return True 35 return True
35 return orig_isBlockLevel(tag) 36 return orig_isBlockLevel(tag)
36 37
38
Sebastian Noack 2017/08/29 22:49:39 Adding this blank line is unrelated.
rosie 2018/03/26 02:32:21 True. My linter was showing a warning because ther
Sebastian Noack 2018/03/26 02:57:47 Strictly, there should be two blank lines surround
37 markdown.util.isBlockLevel = isBlockLevel 39 markdown.util.isBlockLevel = isBlockLevel
38 40
39 html_escapes = { 41 html_escapes = {
40 '<': '&lt;', 42 '<': '&lt;',
41 '>': '&gt;', 43 '>': '&gt;',
42 '&': '&amp;', 44 '&': '&amp;',
43 '"': '&quot;', 45 '"': '&quot;',
44 "'": '&#39;', 46 "'": '&#39;',
45 } 47 }
46 48
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
110 # the document. 112 # the document.
111 self._append_text(data) 113 self._append_text(data)
112 114
113 def handle_entityref(self, name): 115 def handle_entityref(self, name):
114 self._append_text(self.unescape('&{};'.format(name))) 116 self._append_text(self.unescape('&{};'.format(name)))
115 117
116 def handle_charref(self, name): 118 def handle_charref(self, name):
117 self._append_text(self.unescape('&#{};'.format(name))) 119 self._append_text(self.unescape('&#{};'.format(name)))
118 120
119 121
122 def parse_metadata(page, data):
123 metadata = {'page': page}
124 try:
125 decoder = json.JSONDecoder()
126 json_data, index = decoder.raw_decode(data)
127 metadata.update(json_data)
128 page_data = data[index:].strip()
Sebastian Noack 2017/08/29 22:49:39 Note that when parsing the legacy non-JSON format,
Vasily Kuznetsov 2017/08/30 10:50:00 This consideration is rather non-obvious, for exam
rosie 2018/03/26 02:32:21 (This function was moved to utils.py.) Now, the li
129 except ValueError:
Sebastian Noack 2017/08/29 22:49:39 In case the data can be interpreted as JSON, but r
rosie 2018/03/26 02:32:21 The check to make sure a dict is returned now happ
130 lines = data.splitlines(True)
131 for i, line in enumerate(lines):
132 if not re.search(r'^\s*[\w\-]+\s*=', line):
Sebastian Noack 2017/08/29 22:49:39 There is some redundancy in the regular expression
rosie 2018/03/26 02:32:20 Done.
133 break
134 name, value = line.split('=', 1)
135 value = value.strip()
136 if value.startswith('[') and value.endswith(']'):
137 value = [element.strip() for element in value[1:-1].split(',')]
138 lines[i] = ''
139 metadata[name.strip()] = value
140 page_data = ''.join(lines)
141 return metadata, page_data
142
143
120 def parse_page_content(page, data): 144 def parse_page_content(page, data):
121 """Separate page content into metadata (dict) and body text (str)""" 145 """Separate page content into metadata (dict) and body text (str)"""
122 page_data = {'page': page} 146 # If metadata is in a comment block, extract it
123 lines = data.splitlines(True) 147 comment_start = '<!--'
Sebastian Noack 2017/08/29 22:49:39 It seems much simpler to use a regular expression
Vasily Kuznetsov 2017/08/30 10:50:00 I think it was my advice to not use regexps here b
rosie 2018/03/26 02:32:20 Done.
124 for i, line in enumerate(lines): 148 comment_end = '-->'
125 if line.strip() in {'<!--', '-->'}: 149 if data.lstrip().startswith(comment_start):
126 lines[i] = '' 150 start_index = data.index(comment_start) + len(comment_start)
127 continue 151 end_index = data.index(comment_end)
128 if not re.search(r'^\s*[\w\-]+\s*=', line): 152 comment = data[start_index:end_index]
129 break 153 page_tail = data[end_index + len(comment_end):]
130 name, value = line.split('=', 1) 154 metadata, comment_data = parse_metadata(page, comment.strip())
Sebastian Noack 2017/08/29 22:49:39 Does stripping the comment have any effect here? I
Vasily Kuznetsov 2017/08/30 10:50:01 JSON parsing fails on leading space, but if we wan
rosie 2018/03/26 02:32:21 Done.
131 value = value.strip() 155 page_data = '{}\n{}\n{}\n\n{}'.format(comment_start, comment_data,
132 if value.startswith('[') and value.endswith(']'): 156 comment_end, page_tail.strip())
133 value = [element.strip() for element in value[1:-1].split(',')] 157 else:
134 lines[i] = '\n' 158 metadata, page_data = parse_metadata(page, data.strip())
135 page_data[name.strip()] = value 159 return metadata, page_data
136 return page_data, ''.join(lines)
137 160
138 161
139 class Converter: 162 class Converter:
140 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} 163 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'}
141 missing_translations = 0 164 missing_translations = 0
142 total_translations = 0 165 total_translations = 0
143 166
144 def __init__(self, params, key='pagedata'): 167 def __init__(self, params, key='pagedata'):
145 self._params = params 168 self._params = params
146 self._key = key 169 self._key = key
(...skipping 409 matching lines...) Expand 10 before | Expand all | Expand 10 after
556 stack.pop() 579 stack.pop()
557 stack[-1]['subitems'].append(item) 580 stack[-1]['subitems'].append(item)
558 stack.append(item) 581 stack.append(item)
559 return structured 582 return structured
560 583
561 converters = { 584 converters = {
562 'html': RawConverter, 585 'html': RawConverter,
563 'md': MarkdownConverter, 586 'md': MarkdownConverter,
564 'tmpl': TemplateConverter, 587 'tmpl': TemplateConverter,
565 } 588 }
OLD | NEW
« no previous file with comments | « no previous file | tests/expected_output/en/metadata_json » ('j') | tests/expected_output/en/sitemap » ('J')

Powered by Google App Engine
This is Rietveld