| Index: compiled/RegExpFilter.cpp |
| =================================================================== |
| new file mode 100644 |
| --- /dev/null |
| +++ b/compiled/RegExpFilter.cpp |
| @@ -0,0 +1,305 @@ |
| +#include <climits> |
| +#include <unordered_map> |
| + |
| +#include <emscripten.h> |
| + |
| +#include "RegExpFilter.h" |
| +#include "WhiteListFilter.h" |
| +#include "InvalidFilter.h" |
| +#include "StringScanner.h" |
| + |
| +namespace |
| +{ |
| +  // Single-bit content type flags. A filter's contentType member and the |
| +  // typeMask argument of RegExpFilter::Matches() are bitwise combinations of |
| +  // these values. |
| +  enum |
| +  { |
| +    TYPE_OTHER = 0x1, |
| +    TYPE_SCRIPT = 0x2, |
| +    TYPE_IMAGE = 0x4, |
| +    TYPE_STYLESHEET = 0x8, |
| +    TYPE_OBJECT = 0x10, |
| +    TYPE_SUBDOCUMENT = 0x20, |
| +    TYPE_DOCUMENT = 0x40, |
| +    TYPE_PING = 0x400, |
| +    TYPE_XMLHTTPREQUEST = 0x800, |
| +    TYPE_OBJECT_SUBREQUEST = 0x1000, |
| +    TYPE_MEDIA = 0x4000, |
| +    TYPE_FONT = 0x8000, |
| +    TYPE_POPUP = 0x8000000, |
| +    TYPE_GENERICBLOCK = 0x10000000, |
| +    TYPE_GENERICHIDE = 0x20000000, |
| +    TYPE_ELEMHIDE = 0x40000000, |
| +  }; |
| + |
| +  // Maps normalized option names (upper case, '-' replaced by '_') to their |
| +  // type flag. Used for option lookup in ProcessOption() and exported to |
| +  // JavaScript by InitJSTypes(). The table is only ever read, hence const. |
| +  const std::unordered_map<std::u16string,int> typeMap({ |
| +    {u"OTHER", TYPE_OTHER}, |
| +    {u"SCRIPT", TYPE_SCRIPT}, |
| +    {u"IMAGE", TYPE_IMAGE}, |
| +    {u"STYLESHEET", TYPE_STYLESHEET}, |
| +    {u"OBJECT", TYPE_OBJECT}, |
| +    {u"SUBDOCUMENT", TYPE_SUBDOCUMENT}, |
| +    {u"DOCUMENT", TYPE_DOCUMENT}, |
| +    {u"XBL", TYPE_OTHER},  // Backwards compat |
| +    {u"PING", TYPE_PING}, |
| +    {u"XMLHTTPREQUEST", TYPE_XMLHTTPREQUEST}, |
| +    {u"OBJECT_SUBREQUEST", TYPE_OBJECT_SUBREQUEST}, |
| +    {u"DTD", TYPE_OTHER},  // Backwards compat |
| +    {u"MEDIA", TYPE_MEDIA}, |
| +    {u"FONT", TYPE_FONT}, |
| +    {u"BACKGROUND", TYPE_IMAGE},  // Backwards compat |
| + |
| +    {u"POPUP", TYPE_POPUP}, |
| +    {u"GENERICBLOCK", TYPE_GENERICBLOCK}, |
| +    {u"GENERICHIDE", TYPE_GENERICHIDE}, |
| +    {u"ELEMHIDE", TYPE_ELEMHIDE}, |
| +  }); |
| + |
| +  // Content type mask used when a filter does not name any type option, and |
| +  // the starting point when the first type option is negated: every bit of |
| +  // INT_MAX except the flags listed here. |
| +  const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP | |
| +      TYPE_GENERICBLOCK | TYPE_GENERICHIDE); |
| + |
| +  // Calls the JavaScript-side regexps.create() with the address of the regular |
| +  // expression source string and the case sensitivity flag, and returns the |
| +  // integer it produces (used by the callers as the regular expression ID). |
| +  int GenerateRegExp(const std::u16string& regexp, bool matchCase) |
| +  { |
| +    return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase); |
| +  } |
| +} |
| + |
| +// Constructs a filter from its complete text, the pattern part and the raw |
| +// comma-separated option list. |
| +// |
| +// Every option is handed to ProcessOption(); if no option set a content type, |
| +// defaultTypeMask is used. A pattern enclosed in slashes is compiled right |
| +// away, any other pattern is stored in regexpSource and compiled by Matches() |
| +// on first use. |
| +// |
| +// Throws std::u16string if regexps.getError() reports an error for the |
| +// compiled expression; exceptions thrown by ProcessOption() propagate as well. |
| +RegExpFilter::RegExpFilter(const std::u16string& text, |
| +    const std::u16string& pattern, const std::u16string& options) |
| +    : ActiveFilter(text, true), regexpId(0), contentType(-1), matchCase(false), |
| +      thirdParty(TrippleState::ANY) |
| +{ |
| +  // Boundaries of the option currently being scanned, as indexes into options. |
| +  // A negative value means "not seen yet". |
| +  int nameBegin = 0; |
| +  int nameEnd = -1; |
| +  int valueBegin = -1; |
| + |
| +  // The appended comma makes sure the last option is flushed as well. |
| +  StringScanner scanner(options + u","); |
| +  while (!scanner.done()) |
| +  { |
| +    const auto current = scanner.next(); |
| +    const auto pos = scanner.position(); |
| +    if (current == u'=') |
| +    { |
| +      // Only the first '=' separates name and value. |
| +      if (nameEnd < 0) |
| +      { |
| +        nameEnd = pos; |
| +        valueBegin = nameEnd + 1; |
| +      } |
| +    } |
| +    else if (current == u',') |
| +    { |
| +      // Option without a value: the name extends up to the separator. |
| +      if (nameEnd < 0) |
| +        nameEnd = pos; |
| +      ProcessOption(options, nameBegin, nameEnd, valueBegin, pos); |
| + |
| +      // Reset the state for the next option. |
| +      nameBegin = pos + 1; |
| +      nameEnd = -1; |
| +      valueBegin = -1; |
| +    } |
| +  } |
| + |
| +  if (contentType < 0) |
| +    contentType = defaultTypeMask; |
| + |
| +  const size_t patternLength = pattern.length(); |
| +  const bool isRegExp = patternLength >= 2 && pattern[0] == u'/' && |
| +      pattern[patternLength - 1] == u'/'; |
| +  if (!isRegExp) |
| +  { |
| +    // Conversion into a regular expression is postponed until Matches(). |
| +    regexpSource = pattern; |
| +  } |
| +  else |
| +  { |
| +    // Strip the enclosing slashes and compile immediately. |
| +    regexpId = GenerateRegExp(pattern.substr(1, patternLength - 2), matchCase); |
| + |
| +    std::u16string* error = reinterpret_cast<std::u16string*>(EM_ASM_INT(return regexps.getError($0), regexpId)); |
| +    if (error) |
| +    { |
| +      // The destructor does not run when the constructor throws, so the |
| +      // JavaScript-side entry is released here. |
| +      EM_ASM_ARGS(regexps.delete($0), regexpId); |
| +      throw std::u16string(*error); |
| +    } |
| +  } |
| +} |
| + |
| +// Releases the JavaScript-side regular expression, if one has been created. |
| +RegExpFilter::~RegExpFilter() |
| +{ |
| +  // regexpId stays zero until a regular expression has been generated. |
| +  if (!regexpId) |
| +    return; |
| + |
| +  EM_ASM_ARGS(regexps.delete($0), regexpId); |
| +} |
| + |
| +// Applies a single filter option to this filter. |
| +// |
| +// options is the complete option list; the remaining parameters are indexes |
| +// into it: [optionStart, optionEnd) is the option name (optionally preceded by |
| +// '~' to negate it) and [valueStart, valueEnd) is its value. valueStart is |
| +// negative if the option has no value. |
| +// |
| +// Throws std::u16string for an option name that is not recognized. |
| +void RegExpFilter::ProcessOption(const std::u16string& options, |
| +    int optionStart, int optionEnd, int valueStart, int valueEnd) |
| +{ |
| +  // Empty option, nothing to do. |
| +  if (optionEnd <= optionStart) |
| +    return; |
| + |
| +  const bool reverse = (options[optionStart] == u'~'); |
| +  if (reverse) |
| +    ++optionStart; |
| + |
| +  // Normalize the name: ASCII upper case, dashes become underscores. |
| +  std::u16string name = options.substr(optionStart, optionEnd - optionStart); |
| +  for (char16_t& ch : name) |
| +  { |
| +    if (ch == u'-') |
| +      ch = u'_'; |
| +    else if (ch >= u'a' && ch <= u'z') |
| +      ch = ch + u'A' - u'a'; |
| +  } |
| + |
| +  // Content type options add to or remove from the type mask. |
| +  const auto typeEntry = typeMap.find(name); |
| +  if (typeEntry != typeMap.end()) |
| +  { |
| +    // First type option: a negated option starts from the default mask, a |
| +    // regular one from an empty mask. |
| +    if (contentType < 0) |
| +      contentType = reverse ? defaultTypeMask : 0; |
| + |
| +    if (reverse) |
| +      contentType &= ~typeEntry->second; |
| +    else |
| +      contentType |= typeEntry->second; |
| +    return; |
| +  } |
| + |
| +  const bool hasValue = (valueStart >= 0 && valueEnd > valueStart); |
| + |
| +  if (name == u"DOMAIN") |
| +  { |
| +    if (hasValue) |
| +      ParseDomains(options.substr(valueStart, valueEnd - valueStart), u'|'); |
| +    return; |
| +  } |
| + |
| +  if (name == u"SITEKEY") |
| +  { |
| +    if (!hasValue) |
| +      return; |
| + |
| +    // Split the value on '|'; the appended separator flushes the last key. |
| +    StringScanner scanner(options.substr(valueStart, valueEnd - valueStart) + u"|"); |
| +    size_t keyStart = 0; |
| +    while (!scanner.done()) |
| +    { |
| +      if (scanner.next() != u'|') |
| +        continue; |
| + |
| +      const auto pos = scanner.position(); |
| +      // Empty keys are skipped. |
| +      if (pos > keyStart) |
| +        sitekeys.insert(options.substr(valueStart + keyStart, pos - keyStart)); |
| +      keyStart = pos + 1; |
| +    } |
| +    return; |
| +  } |
| + |
| +  if (name == u"MATCH_CASE") |
| +  { |
| +    matchCase = !reverse; |
| +    return; |
| +  } |
| + |
| +  if (name == u"THIRD_PARTY") |
| +  { |
| +    thirdParty = reverse ? TrippleState::NO : TrippleState::YES; |
| +    return; |
| +  } |
| + |
| +  if (name == u"COLLAPSE") |
| +  { |
| +    collapse = reverse ? TrippleState::NO : TrippleState::YES; |
| +    return; |
| +  } |
| + |
| +  throw std::u16string(u"Unknown option " + name); |
| +} |
| + |
| +// Creates a filter object for the given filter text. |
| +// |
| +// A text starting with "@@" produces a WhiteListFilter, any other text a |
| +// RegExpFilter. The text following the marker is split at the first '$' into |
| +// the pattern and the option list. If constructing the filter throws a |
| +// std::u16string, an InvalidFilter carrying that string as reason is returned |
| +// instead. The returned object is allocated with new. |
| +Filter* RegExpFilter::Create(const std::u16string& text) |
| +{ |
| +  const bool isException = (text.compare(0, 2, u"@@") == 0); |
| +  const size_t patternStart = isException ? 2 : 0; |
| + |
| +  std::u16string pattern; |
| +  std::u16string options; |
| +  const size_t separator = text.find(u'$', patternStart); |
| +  if (separator == std::u16string::npos) |
| +  { |
| +    // No options: everything after the marker is the pattern. |
| +    pattern = text.substr(patternStart); |
| +  } |
| +  else |
| +  { |
| +    pattern = text.substr(patternStart, separator - patternStart); |
| +    options = text.substr(separator + 1); |
| +  } |
| + |
| +  try |
| +  { |
| +    if (isException) |
| +      return new WhiteListFilter(text, pattern, options); |
| +    return new RegExpFilter(text, pattern, options); |
| +  } |
| +  catch (const std::u16string& reason) |
| +  { |
| +    return new InvalidFilter(text, reason); |
| +  } |
| +} |
| + |
| +// Copies every entry of typeMap into the JavaScript object |
| +// Module.RegExpFilter_typeMap, keyed by the type name. |
| +void RegExpFilter::InitJSTypes() |
| +{ |
| +  for (const auto& entry : typeMap) |
| +  { |
| +    // The name is passed by address and read on the JavaScript side via |
| +    // getStringData(). |
| +    EM_ASM_ARGS(Module.RegExpFilter_typeMap[getStringData($0)] = $1, &entry.first, entry.second); |
| +  } |
| +} |
| + |
| +// Translates a filter pattern into regular expression source text. |
| +// |
| +//   '*'   becomes ".*"; consecutive and leading wildcards are dropped |
| +//   '^'   becomes a character class of separator characters, or end of input |
| +//   '|'   is an anchor at the very start or end of the pattern ("||" at the |
| +//         start is the extended anchor), and an escaped literal elsewhere |
| +//   ASCII letters, digits and characters >= 128 are copied unchanged |
| +//   any other character is escaped with a backslash |
| +// |
| +// Note: This doesn't remove trailing wildcards, otherwise the result should |
| +// be identical to Filter.toRegExp(). |
| +const std::u16string RegExpFilter::RegExpFromSource(const std::u16string& source) |
| +{ |
| +  std::u16string regexp; |
| +  const size_t length = source.length(); |
| + |
| +  // Starting out with a wildcard as "previous character" makes a leading |
| +  // wildcard in the pattern disappear. |
| +  char16_t previous = u'*'; |
| +  for (size_t pos = 0; pos < length; ++pos) |
| +  { |
| +    const char16_t current = source[pos]; |
| +    if (current == u'*') |
| +    { |
| +      if (previous != u'*') |
| +        regexp += u".*"; |
| +    } |
| +    else if (current == u'^') |
| +    { |
| +      regexp += u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)"; |
| +    } |
| +    else if (current == u'|') |
| +    { |
| +      if (pos == 0) |
| +      { |
| +        // Anchor at expression start, maybe extended anchor? |
| +        const bool extended = (pos + 1 < length && source[pos + 1] == u'|'); |
| +        if (extended) |
| +        { |
| +          regexp += u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"; |
| +          // Skip the second pipe of the extended anchor. |
| +          ++pos; |
| +        } |
| +        else |
| +        { |
| +          regexp += u"^"; |
| +        } |
| +      } |
| +      else if (pos == length - 1) |
| +      { |
| +        // Anchor at expression end, ignore if following separator placeholder |
| +        if (previous != u'^') |
| +          regexp += u"$"; |
| +      } |
| +      else |
| +      { |
| +        // Not actually an anchor, escape it |
| +        regexp += u"\\|"; |
| +      } |
| +    } |
| +    else |
| +    { |
| +      const bool keepAsIs = |
| +          (current >= u'a' && current <= u'z') || |
| +          (current >= u'A' && current <= u'Z') || |
| +          (current >= u'0' && current <= u'9') || |
| +          current >= 128; |
| +      if (!keepAsIs) |
| +        regexp += u"\\"; |
| +      regexp.append(1, current); |
| +    } |
| +    previous = current; |
| +  } |
| +  return regexp; |
| +} |
| + |
| +// Returns the type of this filter, which is always Type::BLOCKING here. |
| +Filter::Type RegExpFilter::GetType() const |
| +{ |
| +  return Type::BLOCKING; |
| +} |
| + |
| +// Checks whether this filter applies to a request. |
| +// |
| +// location    string tested against the filter's regular expression |
| +// typeMask    content type flags, must share at least one bit with the |
| +//             filter's contentType |
| +// docDomain   passed on to IsActiveOnDomain() |
| +// thirdParty  checked against the filter's third-party restriction, if any |
| +// sitekey     passed on to IsActiveOnDomain() |
| +// |
| +// The regular expression is generated from regexpSource on the first call |
| +// that gets past the preliminary checks. |
| +bool RegExpFilter::Matches(const std::u16string& location, int typeMask, |
| +    const std::u16string& docDomain, bool thirdParty, |
| +    const std::u16string& sitekey) |
| +{ |
| +  // Checks that do not need the regular expression come first, in the same |
| +  // order in which they short-circuit. |
| +  if ((this->contentType & typeMask) == 0) |
| +    return false; |
| +  if (this->thirdParty == TrippleState::YES && !thirdParty) |
| +    return false; |
| +  if (this->thirdParty == TrippleState::NO && thirdParty) |
| +    return false; |
| +  if (!IsActiveOnDomain(docDomain, sitekey)) |
| +    return false; |
| + |
| +  if (!regexpId) |
| +  { |
| +    // No regular expression yet: generate it now, the source text isn't |
| +    // needed afterwards. |
| +    regexpId = GenerateRegExp(RegExpFromSource(regexpSource), matchCase); |
| +    regexpSource.resize(0); |
| +  } |
| + |
| +  return EM_ASM_INT(return regexps.test($0, $1), regexpId, &location); |
| +} |