Left: | ||
Right: |
OLD | NEW |
---|---|
1 # This file is part of the Adblock Plus web scripts, | 1 # This file is part of the Adblock Plus web scripts, |
2 # Copyright (C) 2006-2017 eyeo GmbH | 2 # Copyright (C) 2006-2017 eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
12 # | 12 # |
13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
15 | 15 |
16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
17 | 17 |
18 import os | 18 import os |
19 import HTMLParser | 19 import HTMLParser |
20 import re | 20 import re |
21 import urlparse | 21 import urlparse |
22 import json | |
22 | 23 |
23 import jinja2 | 24 import jinja2 |
24 import markdown | 25 import markdown |
25 | 26 |
26 | 27 |
# Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs
# are inserted into the <head> tag
orig_isBlockLevel = markdown.util.isBlockLevel


def isBlockLevel(tag):
    """Treat <head> as block-level; defer to Markdown for everything else."""
    return tag == 'head' or orig_isBlockLevel(tag)


markdown.util.isBlockLevel = isBlockLevel
38 | 40 |
# Characters that must be escaped in HTML text, mapped to their character
# references. (The identity mapping shown in the rendered diff was an
# artifact of HTML-unescaping; these are the intended replacement strings.)
html_escapes = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '"': '&quot;',
    "'": '&#39;',
}
46 | 48 |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
110 # the document. | 112 # the document. |
111 self._append_text(data) | 113 self._append_text(data) |
112 | 114 |
113 def handle_entityref(self, name): | 115 def handle_entityref(self, name): |
114 self._append_text(self.unescape('&{};'.format(name))) | 116 self._append_text(self.unescape('&{};'.format(name))) |
115 | 117 |
116 def handle_charref(self, name): | 118 def handle_charref(self, name): |
117 self._append_text(self.unescape('&#{};'.format(name))) | 119 self._append_text(self.unescape('&#{};'.format(name))) |
118 | 120 |
119 | 121 |
def parse_metadata(page, data):
    """Parse the metadata of a page's source.

    If ``data`` starts with a JSON object, that object (with the page name
    added under the key ``'page'``) is the metadata and everything after it
    is the body. Otherwise the legacy ``name = value`` header format is
    parsed instead: one assignment per line, stopping at the first line
    that is not an assignment; a value written as ``[a, b, c]`` becomes a
    list of stripped strings.

    Returns a ``(metadata, page_data)`` tuple where ``metadata`` is a dict
    and ``page_data`` is the remaining source text.
    """
    try:
        json_data, index = json.JSONDecoder().raw_decode(data)
        # raw_decode() happily parses scalar JSON too (a body starting
        # with a number, "true", a quoted string, ...); only an object is
        # valid metadata, so reject anything else and use the fallback.
        if not isinstance(json_data, dict):
            raise ValueError('metadata must be a JSON object')
        json_data['page'] = page
        return json_data, data[index:].strip()
    except ValueError:
        metadata = {'page': page}
        lines = data.splitlines(True)
        for i, line in enumerate(lines):
            if not re.search(r'^\s*[\w\-]+\s*=', line):
                break
            name, value = line.split('=', 1)
            value = value.strip()
            if value.startswith('[') and value.endswith(']'):
                value = [element.strip() for element in value[1:-1].split(',')]
            # Blank out consumed header lines so only the body survives
            # when the lines are joined back together below.
            lines[i] = ''
            metadata[name.strip()] = value
        return metadata, ''.join(lines)
142 | |
143 | |
def parse_page_content(page, data):
    """Separate page content into metadata (dict) and body text (str).

    When the source begins with an HTML comment, the metadata is parsed
    from the comment's contents and the comment markers are re-emitted
    around whatever text parse_metadata() leaves behind; otherwise the
    whole source is handed to parse_metadata() directly.
    """
    # The markers are fixed strings, so plain str methods suffice — no
    # regular expressions needed. Fall through to the plain path when the
    # leading comment is unterminated instead of crashing on a None match.
    if data.lstrip().startswith('<!--') and '-->' in data:
        start_index = data.index('<!--') + len('<!--')
        end_index = data.index('-->')
        comment = data[start_index:end_index]
        rest = data[end_index + len('-->'):]
        metadata, comment_data = parse_metadata(page, comment.strip())
        page_data_result = '<!--\n{}\n-->\n\n{}'.format(comment_data,
                                                        rest.strip())
        return metadata, page_data_result
    return parse_metadata(page, data.strip())
137 | 159 |
138 | 160 |
139 class Converter: | 161 class Converter: |
140 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} | 162 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
141 missing_translations = 0 | 163 missing_translations = 0 |
142 total_translations = 0 | 164 total_translations = 0 |
143 | 165 |
144 def __init__(self, params, key='pagedata'): | 166 def __init__(self, params, key='pagedata'): |
145 self._params = params | 167 self._params = params |
146 self._key = key | 168 self._key = key |
(...skipping 404 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
551 | 573 |
552 structured = [] | 574 structured = [] |
553 stack = [{'level': 0, 'subitems': structured}] | 575 stack = [{'level': 0, 'subitems': structured}] |
554 for item in flat: | 576 for item in flat: |
555 while stack[-1]['level'] >= item['level']: | 577 while stack[-1]['level'] >= item['level']: |
556 stack.pop() | 578 stack.pop() |
557 stack[-1]['subitems'].append(item) | 579 stack[-1]['subitems'].append(item) |
558 stack.append(item) | 580 stack.append(item) |
559 return structured | 581 return structured |
560 | 582 |
583 | |
# Dispatch table: page-source file extension -> converter class.
converters = dict(
    html=RawConverter,
    md=MarkdownConverter,
    tmpl=TemplateConverter,
)
OLD | NEW |