| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 # This file is part of the Adblock Plus web scripts, | 1 # This file is part of the Adblock Plus web scripts, |
| 2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 # GNU General Public License for more details. | 11 # GNU General Public License for more details. |
| 12 # | 12 # |
| 13 # You should have received a copy of the GNU General Public License | 13 # You should have received a copy of the GNU General Public License |
| 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| 15 | 15 |
| 16 from __future__ import unicode_literals | 16 from __future__ import unicode_literals |
| 17 | 17 |
| 18 import os | 18 import os |
| 19 import HTMLParser | 19 import HTMLParser |
| 20 import re | 20 import re |
| 21 import urlparse | 21 import urlparse |
| 22 | 22 |
| 23 import jinja2 | 23 import jinja2 |
| 24 import markdown | 24 import markdown |
| 25 | 25 |
| 26 from cms import utils | |
|
Vasily Kuznetsov
2017/10/27 18:35:01
It makes more sense for `utils` to be imported int
mathias
2017/10/30 15:37:00
Acknowledged.
| |
| 26 | 27 |
| 27 # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs | 28 # Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs |
| 28 # are inserted into the <head> tag | 29 # are inserted into the <head> tag |
| 29 orig_isBlockLevel = markdown.util.isBlockLevel | 30 orig_isBlockLevel = markdown.util.isBlockLevel |
| 30 | 31 |
| 31 | 32 |
| 32 def isBlockLevel(tag): | 33 def isBlockLevel(tag): |
| 33 if tag == 'head': | 34 if tag == 'head': |
| 34 return True | 35 return True |
| 35 return orig_isBlockLevel(tag) | 36 return orig_isBlockLevel(tag) |
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 111 # the document. | 112 # the document. |
| 112 self._append_text(data) | 113 self._append_text(data) |
| 113 | 114 |
| 114 def handle_entityref(self, name): | 115 def handle_entityref(self, name): |
| 115 self._append_text(self.unescape('&{};'.format(name))) | 116 self._append_text(self.unescape('&{};'.format(name))) |
| 116 | 117 |
| 117 def handle_charref(self, name): | 118 def handle_charref(self, name): |
| 118 self._append_text(self.unescape('&#{};'.format(name))) | 119 self._append_text(self.unescape('&#{};'.format(name))) |
| 119 | 120 |
| 120 | 121 |
| 121 def parse_page_content(page, data): | |
|
Vasily Kuznetsov
2017/10/27 18:35:01
This function has nothing to do with converters, i
| |
| 122 """Separate page content into metadata (dict) and body text (str)""" | |
| 123 page_data = {'page': page} | |
| 124 lines = data.splitlines(True) | |
| 125 for i, line in enumerate(lines): | |
| 126 if line.strip() in {'<!--', '-->'}: | |
| 127 lines[i] = '' | |
| 128 continue | |
| 129 if not re.search(r'^\s*[\w\-]+\s*=', line): | |
| 130 break | |
| 131 name, value = line.split('=', 1) | |
| 132 value = value.strip() | |
| 133 if value.startswith('[') and value.endswith(']'): | |
| 134 value = [element.strip() for element in value[1:-1].split(',')] | |
| 135 lines[i] = '\n' | |
| 136 page_data[name.strip()] = value | |
| 137 return page_data, ''.join(lines) | |
| 138 | |
| 139 | |
| 140 class Converter: | 122 class Converter: |
| 141 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} | 123 whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
| 142 missing_translations = 0 | 124 missing_translations = 0 |
| 143 total_translations = 0 | 125 total_translations = 0 |
| 144 | 126 |
| 145 def __init__(self, params, key='pagedata'): | 127 def __init__(self, data, filename, params): |
|
Vasily Kuznetsov
2017/10/27 18:35:01
This signature is rather cryptic, what we want to
| |
| 128 self._data = data | |
| 129 self._filename = filename | |
| 146 self._params = params | 130 self._params = params |
| 147 self._key = key | |
| 148 self._attribute_parser = AttributeParser(self.whitelist) | 131 self._attribute_parser = AttributeParser(self.whitelist) |
| 149 self._seen_defaults = {} | 132 self._seen_defaults = {} |
| 150 | 133 |
| 151 # Read in any parameters specified at the beginning of the file | |
| 152 # and override converter defaults with page specific params | |
| 153 data, filename = params[key] | |
|
Vasily Kuznetsov
2017/10/27 18:35:01
This is a side effect that is not related to conve
| |
| 154 page_data, body_text = parse_page_content(params['page'], data) | |
| 155 params.update(page_data) | |
| 156 params[key] = (body_text, filename) | |
| 157 | |
| 158 def localize_string( | 134 def localize_string( |
| 159 self, page, name, default, comment, localedata, escapes): | 135 self, page, name, default, comment, localedata, escapes): |
| 160 | 136 |
| 161 def escape(s): | 137 def escape(s): |
| 162 return re.sub(r'.', | 138 return re.sub(r'.', |
| 163 lambda match: escapes.get(match.group(0), | 139 lambda match: escapes.get(match.group(0), |
| 164 match.group(0)), | 140 match.group(0)), |
| 165 s, flags=re.S) | 141 s, flags=re.S) |
| 166 | 142 |
| 167 def re_escape(s): | 143 def re_escape(s): |
| (...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 288 return text | 264 return text |
| 289 | 265 |
| 290 include_start_regex = '<' | 266 include_start_regex = '<' |
| 291 include_end_regex = '>' | 267 include_end_regex = '>' |
| 292 | 268 |
| 293 def resolve_includes(self, text): | 269 def resolve_includes(self, text): |
| 294 def resolve_include(match): | 270 def resolve_include(match): |
| 295 name = match.group(1) | 271 name = match.group(1) |
| 296 for format_, converter_class in converters.iteritems(): | 272 for format_, converter_class in converters.iteritems(): |
| 297 if self._params['source'].has_include(name, format_): | 273 if self._params['source'].has_include(name, format_): |
| 298 self._params['includedata'] = ( | 274 data, filename = ( |
|
Vasily Kuznetsov
2017/10/27 18:35:01
This key is not used anywhere, so we don't need to
| |
| 299 self._params['source'].read_include(name, format_)) | 275 self._params['source'].read_include(name, format_)) |
| 300 | 276 |
| 301 converter = converter_class(self._params, | 277 # XXX: allowing includes to modify params of the whole page |
| 302 key='includedata') | 278 # seems like a bad idea but we have to support this because |
| 279 # it's used by www.adblockplus.org. | |
| 280 metadata, rest = utils.extract_page_metadata(data) | |
|
Vasily Kuznetsov
2017/10/27 18:35:01
We have to maintain the ability of includes to wri
mathias
2017/10/30 15:37:00
Acknowledged.
| |
| 281 self._params.update(metadata) | |
| 282 | |
| 283 converter = converter_class(rest, filename, self._params) | |
| 303 result = converter() | 284 result = converter() |
| 304 self.missing_translations += converter.missing_translations | 285 self.missing_translations += converter.missing_translations |
| 305 self.total_translations += converter.total_translations | 286 self.total_translations += converter.total_translations |
| 306 return result | 287 return result |
| 307 raise Exception('Failed to resolve include {}' | 288 raise Exception('Failed to resolve include {}' |
| 308 ' on page {}'.format(name, self._params['page'])) | 289 ' on page {}'.format(name, self._params['page'])) |
| 309 | 290 |
| 310 return re.sub( | 291 return re.sub( |
| 311 r'{}\?\s*include\s+([^\s<>"]+)\s*\?{}'.format( | 292 r'{}\?\s*include\s+([^\s<>"]+)\s*\?{}'.format( |
| 312 self.include_start_regex, | 293 self.include_start_regex, |
| 313 self.include_end_regex | 294 self.include_end_regex |
| 314 ), | 295 ), |
| 315 resolve_include, | 296 resolve_include, |
| 316 text | 297 text |
| 317 ) | 298 ) |
| 318 | 299 |
| 319 def __call__(self): | 300 def __call__(self): |
| 320 result = self.get_html(*self._params[self._key]) | 301 result = self.get_html(self._data, self._filename) |
| 321 result = self.resolve_includes(result) | 302 return self.resolve_includes(result) |
| 322 if self._key == 'pagedata': | |
|
Vasily Kuznetsov
2017/10/27 18:35:01
The return types of the two branches of if are dif
| |
| 323 head = [] | |
| 324 | |
| 325 def add_to_head(match): | |
| 326 head.append(match.group(1)) | |
| 327 return '' | |
| 328 body = re.sub(r'<head>(.*?)</head>', add_to_head, result, | |
| 329 flags=re.S) | |
| 330 return ''.join(head), body | |
| 331 return result | |
| 332 | 303 |
| 333 | 304 |
| 334 class RawConverter(Converter): | 305 class RawConverter(Converter): |
| 335 def get_html(self, source, filename): | 306 def get_html(self, source, filename): |
| 336 result = self.insert_localized_strings(source, html_escapes) | 307 result = self.insert_localized_strings(source, html_escapes) |
| 337 result = self.process_links(result) | 308 result = self.process_links(result) |
| 338 return result | 309 return result |
| 339 | 310 |
| 340 | 311 |
| 341 class MarkdownConverter(Converter): | 312 class MarkdownConverter(Converter): |
| (...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 465 )) | 436 )) |
| 466 | 437 |
| 467 def has_string(self, name, page=None): | 438 def has_string(self, name, page=None): |
| 468 if page is None: | 439 if page is None: |
| 469 page = self._params['page'] | 440 page = self._params['page'] |
| 470 | 441 |
| 471 localedata = self._get_locale_data(page) | 442 localedata = self._get_locale_data(page) |
| 472 return name in localedata | 443 return name in localedata |
| 473 | 444 |
| 474 def get_page_content(self, page, locale=None): | 445 def get_page_content(self, page, locale=None): |
| 475 from cms.utils import get_page_params | |
| 476 | |
| 477 if locale is None: | 446 if locale is None: |
| 478 locale = self._params['locale'] | 447 locale = self._params['locale'] |
| 479 return get_page_params(self._params['source'], locale, page) | 448 return utils.get_page_params(self._params['source'], locale, page) |
| 480 | 449 |
| 481 def linkify(self, page, locale=None, **attrs): | 450 def linkify(self, page, locale=None, **attrs): |
| 482 if locale is None: | 451 if locale is None: |
| 483 locale = self._params['locale'] | 452 locale = self._params['locale'] |
| 484 | 453 |
| 485 locale, url = self._params['source'].resolve_link(page, locale) | 454 locale, url = self._params['source'].resolve_link(page, locale) |
| 486 return jinja2.Markup('<a{}>'.format(''.join( | 455 return jinja2.Markup('<a{}>'.format(''.join( |
| 487 ' {}="{}"'.format(name, jinja2.escape(value)) for name, value in [ | 456 ' {}="{}"'.format(name, jinja2.escape(value)) for name, value in [ |
| 488 ('href', url), | 457 ('href', url), |
| 489 ('hreflang', locale) | 458 ('hreflang', locale) |
| 490 ] + attrs.items() | 459 ] + attrs.items() |
| 491 ))) | 460 ))) |
| 492 | 461 |
| 493 def get_pages_metadata(self, filters=None): | 462 def get_pages_metadata(self, filters=None): |
| 494 if filters is not None and not isinstance(filters, dict): | 463 if filters is not None and not isinstance(filters, dict): |
| 495 raise TypeError('Filters are not a dictionary') | 464 raise TypeError('Filters are not a dictionary') |
| 496 | 465 |
| 497 return_data = [] | 466 return_data = [] |
| 498 for page_name, _format in self._params['source'].list_pages(): | 467 for page_name, _format in self._params['source'].list_pages(): |
| 499 data, filename = self._params['source'].read_page(page_name, | 468 data, filename = self._params['source'].read_page(page_name, |
| 500 _format) | 469 _format) |
| 501 page_data = parse_page_content(page_name, data)[0] | 470 page_data = utils.extract_page_metadata(data)[0] |
| 471 page_data['page'] = page_name | |
|
mathias
2017/10/30 15:37:00
Shouldn't this use setdefault(), in order to allow
Vasily Kuznetsov
2017/11/07 17:08:29
Yeah, you're right, this would be needed to preser
| |
| 502 if self.filter_metadata(filters, page_data) is True: | 472 if self.filter_metadata(filters, page_data) is True: |
| 503 return_data.append(page_data) | 473 return_data.append(page_data) |
| 504 return return_data | 474 return return_data |
| 505 | 475 |
| 506 def filter_metadata(self, filters, metadata): | 476 def filter_metadata(self, filters, metadata): |
| 507 # if only the page key is in the metadata then there | 477 # if only the page key is in the metadata then there |
| 508 # was no user defined metadata | 478 # was no user defined metadata |
| 509 if metadata.keys() == ['page']: | 479 if metadata.keys() == ['page']: |
| 510 return False | 480 return False |
| 511 if filters is None: | 481 if filters is None: |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 558 stack[-1]['subitems'].append(item) | 528 stack[-1]['subitems'].append(item) |
| 559 stack.append(item) | 529 stack.append(item) |
| 560 return structured | 530 return structured |
| 561 | 531 |
| 562 | 532 |
| 563 converters = { | 533 converters = { |
| 564 'html': RawConverter, | 534 'html': RawConverter, |
| 565 'md': MarkdownConverter, | 535 'md': MarkdownConverter, |
| 566 'tmpl': TemplateConverter, | 536 'tmpl': TemplateConverter, |
| 567 } | 537 } |
| OLD | NEW |