| LEFT | RIGHT | 
 |    1 #include <climits> | 
 |    2  | 
 |    3 #include <emscripten.h> | 
 |    4  | 
|    1 #include "RegExpFilter.h" |    5 #include "RegExpFilter.h" | 
|    2  |    6 #include "StringScanner.h" | 
|    3 RegExpFilter::RegExpFilter(const std::u16string& text) |    7 #include "StringMap.h" | 
|    4     : ActiveFilter(text) |    8  | 
|    5 { |    9 namespace | 
|    6 } |   10 { | 
|    7  |   11   enum | 
|    8 RegExpFilter* RegExpFilter::Create(const std::u16string& text) |   12   { | 
|    9 { |   13     TYPE_OTHER = 0x1, | 
|   10   return new RegExpFilter(text); |   14     TYPE_SCRIPT = 0x2, | 
|   11 } |   15     TYPE_IMAGE = 0x4, | 
|   12  |   16     TYPE_STYLESHEET = 0x8, | 
|   13 Filter::Type RegExpFilter::GetType() |   17     TYPE_OBJECT = 0x10, | 
|   14 { |   18     TYPE_SUBDOCUMENT = 0x20, | 
|   15   return Type::BLOCKING; |   19     TYPE_DOCUMENT = 0x40, | 
|   16 } |   20     TYPE_PING = 0x400, | 
 |   21     TYPE_XMLHTTPREQUEST = 0x800, | 
 |   22     TYPE_OBJECT_SUBREQUEST = 0x1000, | 
 |   23     TYPE_MEDIA = 0x4000, | 
 |   24     TYPE_FONT = 0x8000, | 
 |   25     TYPE_POPUP = 0x8000000, | 
 |   26     TYPE_GENERICBLOCK = 0x10000000, | 
 |   27     TYPE_GENERICHIDE = 0x20000000, | 
 |   28     TYPE_ELEMHIDE = 0x40000000, | 
 |   29   }; | 
 |   30  | 
  // Maps filter option names to the content type flag they select. Keys are
  // lower-case and use '-' as separator because ParseOption() lower-cases the
  // option name and replaces '_' with '-' before looking it up here.
  // InitJSTypes() iterates over this map to publish the values to JavaScript.
  StringMap<int> typeMap {
    {u"other"_str, TYPE_OTHER},
    {u"script"_str, TYPE_SCRIPT},
    {u"image"_str, TYPE_IMAGE},
    {u"stylesheet"_str, TYPE_STYLESHEET},
    {u"object"_str, TYPE_OBJECT},
    {u"subdocument"_str, TYPE_SUBDOCUMENT},
    {u"document"_str, TYPE_DOCUMENT},
    {u"xbl"_str, TYPE_OTHER},          // Backwards compat
    {u"ping"_str, TYPE_PING},
    {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST},
    {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST},
    {u"dtd"_str, TYPE_OTHER},          // Backwards compat
    {u"media"_str, TYPE_MEDIA},
    {u"font"_str, TYPE_FONT},
    {u"background"_str, TYPE_IMAGE},   // Backwards compat

    // These types are excluded from defaultTypeMask, so a filter covers them
    // only when the option is given explicitly.
    {u"popup"_str, TYPE_POPUP},
    {u"genericblock"_str, TYPE_GENERICBLOCK},
    {u"generichide"_str, TYPE_GENERICHIDE},
    {u"elemhide"_str, TYPE_ELEMHIDE},
  };
 |   53  | 
 |   54   const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | | 
 |   55       TYPE_POPUP | TYPE_GENERICBLOCK | TYPE_GENERICHIDE); | 
 |   56  | 
 |   57   int GenerateRegExp(const String& regexp, bool matchCase) | 
 |   58   { | 
 |   59     return EM_ASM_INT(return regexps.create($0, $1), ®exp, matchCase); | 
 |   60   } | 
 |   61  | 
 |   62   void NormalizeWhitespace(DependentString& text) | 
 |   63   { | 
 |   64     // We want to remove all spaces but bail out early in the common scenario | 
 |   65     // that the string contains no spaces. | 
 |   66  | 
 |   67     // Look for the first space | 
 |   68     String::size_type len = text.length(); | 
 |   69     String::size_type pos; | 
 |   70     for (pos = 0; pos < len; pos++) | 
 |   71       if (text[pos] == ' ') | 
 |   72         break; | 
 |   73  | 
 |   74     if (pos >= len) | 
 |   75       return; | 
 |   76  | 
 |   77     // Found spaces, move characters to remove them | 
 |   78     String::size_type delta = 1; | 
 |   79     for (pos = pos + 1; pos < len; pos++) | 
 |   80     { | 
 |   81       if (text[pos] == ' ') | 
 |   82         delta++; | 
 |   83       else | 
 |   84         text[pos - delta] = text[pos]; | 
 |   85     } | 
 |   86     text.reset(text, 0, len - delta); | 
 |   87   } | 
 |   88  | 
 |   89   void ParseOption(String& text, DependentString& error, RegExpFilterData& data, | 
 |   90       int optionStart, int optionEnd, int valueStart, int valueEnd) | 
 |   91   { | 
 |   92     if (optionEnd <= optionStart) | 
 |   93       return; | 
 |   94  | 
 |   95     bool reverse = false; | 
 |   96     if (text[optionStart] == u'~') | 
 |   97     { | 
 |   98       reverse = true; | 
 |   99       optionStart++; | 
 |  100     } | 
 |  101  | 
 |  102     DependentString name(text, optionStart, optionEnd - optionStart); | 
 |  103     for (size_t i = 0; i < name.length(); ++i) | 
 |  104     { | 
 |  105       char16_t currChar = name[i]; | 
 |  106       if (currChar >= u'A' && currChar <= u'Z') | 
 |  107         name[i] = currChar + u'a' - u'A'; | 
 |  108       else if (currChar == u'_') | 
 |  109         name[i] = u'-'; | 
 |  110     } | 
 |  111  | 
 |  112     auto it = typeMap.find(name); | 
 |  113     if (it) | 
 |  114     { | 
 |  115       if (data.mContentType < 0) | 
 |  116         data.mContentType = reverse ? defaultTypeMask : 0; | 
 |  117       if (reverse) | 
 |  118         data.mContentType &= ~it->second; | 
 |  119       else | 
 |  120         data.mContentType |= it->second; | 
 |  121     } | 
 |  122     else if (name.equals(u"domain"_str)) | 
 |  123     { | 
 |  124       if (valueStart >= 0 && valueEnd > valueStart) | 
 |  125       { | 
 |  126         data.mDomainsStart = valueStart; | 
 |  127         data.mDomainsEnd = valueEnd; | 
 |  128         DependentString(text, valueStart, valueEnd - valueStart).toLower(); | 
 |  129       } | 
 |  130     } | 
 |  131     else if (name.equals(u"sitekey"_str)) | 
 |  132     { | 
 |  133       if (valueStart >= 0 && valueEnd > valueStart) | 
 |  134       { | 
 |  135         data.mSitekeysStart = valueStart; | 
 |  136         data.mSitekeysEnd = valueEnd; | 
 |  137       } | 
 |  138     } | 
 |  139     else if (name.equals(u"match-case"_str)) | 
 |  140       data.mMatchCase = !reverse; | 
 |  141     else if (name.equals(u"third-party"_str)) | 
 |  142       data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES; | 
 |  143     else if (name.equals(u"collapse"_str)) | 
 |  144       data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES; | 
 |  145     else | 
 |  146       error.reset(u"filter_unknown_option"_str); | 
 |  147   } | 
 |  148  | 
 |  149   void ParseOptions(String& text, DependentString& error, RegExpFilterData& data
     , | 
 |  150       String::size_type optionsStart) | 
 |  151   { | 
 |  152     data.mMatchCase = false; | 
 |  153     data.mThirdParty = TrippleState::ANY; | 
 |  154     data.mCollapse = TrippleState::ANY; | 
 |  155     data.mDomainsStart = String::npos; | 
 |  156     data.mSitekeysStart = String::npos; | 
 |  157     if (optionsStart >= text.length()) | 
 |  158     { | 
 |  159       data.mContentType = defaultTypeMask; | 
 |  160       return; | 
 |  161     } | 
 |  162  | 
 |  163     data.mContentType = -1; | 
 |  164  | 
 |  165     int optionStart = data.mPatternEnd + 1; | 
 |  166     int optionEnd = -1; | 
 |  167     int valueStart = -1; | 
 |  168  | 
 |  169     StringScanner scanner(text, optionStart, u','); | 
 |  170     bool done = false; | 
 |  171     while (!done) | 
 |  172     { | 
 |  173       done = scanner.done(); | 
 |  174       switch (scanner.next()) | 
 |  175       { | 
 |  176         case u'=': | 
 |  177           if (optionEnd < 0) | 
 |  178           { | 
 |  179             optionEnd = scanner.position(); | 
 |  180             valueStart = optionEnd + 1; | 
 |  181           } | 
 |  182           break; | 
 |  183         case u',': | 
 |  184           if (optionEnd < 0) | 
 |  185             optionEnd = scanner.position(); | 
 |  186           ParseOption(text, error, data, optionStart, optionEnd, valueStart, | 
 |  187               scanner.position()); | 
 |  188           if (!error.empty()) | 
 |  189             return; | 
 |  190  | 
 |  191           optionStart = scanner.position() + 1; | 
 |  192           optionEnd = -1; | 
 |  193           valueStart = -1; | 
 |  194           break; | 
 |  195       } | 
 |  196     } | 
 |  197  | 
 |  198     if (data.mContentType < 0) | 
 |  199       data.mContentType = defaultTypeMask; | 
 |  200   } | 
 |  201 } | 
 |  202  | 
 |  203 RegExpFilter::RegExpFilter(Type type, const String& text, const RegExpFilterData
     & data) | 
 |  204     : ActiveFilter(type, text, true), mData(data) | 
 |  205 { | 
 |  206 } | 
 |  207  | 
 |  208 RegExpFilter::~RegExpFilter() | 
 |  209 { | 
 |  210   if (mData.HasRegExp()) | 
 |  211     EM_ASM_ARGS(regexps.delete($0), mData.mRegexpId); | 
 |  212 } | 
 |  213  | 
 |  214 Filter::Type RegExpFilter::Parse(DependentString& text, DependentString& error, | 
 |  215     RegExpFilterData& data) | 
 |  216 { | 
 |  217   NormalizeWhitespace(text); | 
 |  218  | 
 |  219   Filter::Type type = Type::BLOCKING; | 
 |  220  | 
 |  221   data.mPatternStart = 0; | 
 |  222   if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@') | 
 |  223   { | 
 |  224     type = Type::WHITELIST; | 
 |  225     data.mPatternStart = 2; | 
 |  226   } | 
 |  227  | 
 |  228   data.mPatternEnd = text.find(u'$', data.mPatternStart); | 
 |  229   if (data.mPatternEnd == text.npos) | 
 |  230     data.mPatternEnd = text.length(); | 
 |  231  | 
 |  232   ParseOptions(text, error, data, data.mPatternEnd + 1); | 
 |  233   if (!error.empty()) | 
 |  234     return Type::INVALID; | 
 |  235  | 
 |  236   if (data.mPatternEnd - data.mPatternStart >= 2 && | 
 |  237       text[data.mPatternStart] == u'/' && | 
 |  238       text[data.mPatternEnd - 1] == u'/') | 
 |  239   { | 
 |  240     data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1, | 
 |  241         data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase)); | 
 |  242     if (data.mRegexpId == -1) | 
 |  243     { | 
 |  244       error.reset(u"filter_invalid_regexp"_str); | 
 |  245       return Type::INVALID; | 
 |  246     } | 
 |  247   } | 
 |  248  | 
 |  249   return type; | 
 |  250 } | 
 |  251  | 
 |  252 void RegExpFilter::ParseSitekeys(const String& sitekeys) const | 
 |  253 { | 
 |  254   StringScanner scanner(sitekeys, 0, u'|'); | 
 |  255   size_t start = 0; | 
 |  256   bool done = false; | 
 |  257   while (!done) | 
 |  258   { | 
 |  259     done = scanner.done(); | 
 |  260     if (scanner.next() == u'|') | 
 |  261     { | 
 |  262       if (scanner.position() > start) | 
 |  263         AddSitekey(DependentString(sitekeys, start, scanner.position() - start))
     ; | 
 |  264       start = scanner.position() + 1; | 
 |  265     } | 
 |  266   } | 
 |  267 } | 
 |  268  | 
 |  269 void RegExpFilter::InitJSTypes() | 
 |  270 { | 
 |  271   EM_ASM(exports.RegExpFilter.typeMap = {};); | 
 |  272   for (auto it = typeMap.begin(); it != typeMap.end(); ++it) | 
 |  273     EM_ASM_ARGS(exports.RegExpFilter.typeMap[readString($0).replace("-", "_").to
     UpperCase()] = $1, &(it->first), it->second); | 
 |  274 } | 
 |  275  | 
 |  276 OwnedString RegExpFilter::RegExpFromSource(const String& source) | 
 |  277 { | 
 |  278   /* TODO: this is very inefficient */ | 
 |  279  | 
 |  280   // Note: This doesn't remove trailing wildcards, otherwise the result should | 
 |  281   // be identical to Filter.toRegExp(). | 
 |  282   OwnedString result; | 
 |  283   String::value_type prevChar = u'*'; | 
 |  284   for (String::size_type i = 0; i < source.length(); ++i) | 
 |  285   { | 
 |  286     String::value_type currChar = source[i]; | 
 |  287     switch (currChar) | 
 |  288     { | 
 |  289       case u'*': | 
 |  290         if (prevChar != u'*') | 
 |  291           result.append(u".*"_str); | 
 |  292         break; | 
 |  293       case u'^': | 
 |  294         result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x
     60\\x7B-\\x7F]|$)"_str); | 
 |  295         break; | 
 |  296       case u'|': | 
 |  297         if (i == 0) | 
 |  298         { | 
 |  299           // Anchor at expression start, maybe extended anchor? | 
 |  300           if (i + 1 < source.length() && source[i + 1] == u'|') | 
 |  301           { | 
 |  302             result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str); | 
 |  303             ++i; | 
 |  304           } | 
 |  305           else | 
 |  306             result.append(u'^'); | 
 |  307         } | 
 |  308         else if (i == source.length() - 1) | 
 |  309         { | 
 |  310           // Anchor at expression end, ignore if following separator placeholder | 
 |  311           if (prevChar != u'^') | 
 |  312             result.append(u'$'); | 
 |  313         } | 
 |  314         else | 
 |  315         { | 
 |  316           // Not actually an anchor, escape it | 
 |  317           result.append(u"\\|"_str); | 
 |  318         } | 
 |  319         break; | 
 |  320       default: | 
 |  321         if (!(currChar >= u'a' && currChar <= u'z') && | 
 |  322             !(currChar >= u'A' && currChar <= u'Z') && | 
 |  323             !(currChar >= u'0' && currChar <= u'9') && | 
 |  324             currChar < 128) | 
 |  325         { | 
 |  326           result.append(u'\\'); | 
 |  327         } | 
 |  328         result.append(currChar); | 
 |  329     } | 
 |  330     prevChar = currChar; | 
 |  331   } | 
 |  332   return result; | 
 |  333 } | 
 |  334  | 
 |  335 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const | 
 |  336 { | 
 |  337   if (!mData.DomainsParsingDone()) | 
 |  338   { | 
 |  339     ParseDomains(mData.GetDomainsSource(mText), u'|'); | 
 |  340     mData.SetDomainsParsingDone(); | 
 |  341   } | 
 |  342   return ActiveFilter::GetDomains(); | 
 |  343 } | 
 |  344  | 
 |  345 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const | 
 |  346 { | 
 |  347   if (!mData.SitekeyParsingDone()) | 
 |  348   { | 
 |  349     ParseSitekeys(mData.GetSitekeysSource(mText)); | 
 |  350     mData.SetSitekeysParsingDone(); | 
 |  351   } | 
 |  352   return ActiveFilter::GetSitekeys(); | 
 |  353 } | 
 |  354  | 
 |  355 bool RegExpFilter::Matches(const String& location, int typeMask, | 
 |  356     DependentString& docDomain, bool thirdParty, const String& sitekey) const | 
 |  357 { | 
 |  358   if (!(mData.mContentType & typeMask) || | 
 |  359       (mData.mThirdParty == TrippleState::YES && !thirdParty) || | 
 |  360       (mData.mThirdParty == TrippleState::NO && thirdParty) || | 
 |  361       !IsActiveOnDomain(docDomain, sitekey)) | 
 |  362   { | 
 |  363     return false; | 
 |  364   } | 
 |  365  | 
 |  366   if (!mData.RegExpParsingDone()) | 
 |  367   { | 
 |  368     const OwnedString pattern(mData.GetRegExpSource(mText)); | 
 |  369     mData.SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mData.mMatchCase))
     ; | 
 |  370   } | 
 |  371   return EM_ASM_INT(return regexps.test($0, $1), mData.mRegexpId, &location); | 
 |  372 } | 
| LEFT | RIGHT |