OLD | NEW |
(Empty) | |
| 1 #include <climits> |
| 2 #include <unordered_map> |
| 3 |
| 4 #include <emscripten.h> |
| 5 |
| 6 #include "RegExpFilter.h" |
| 7 #include "WhiteListFilter.h" |
| 8 #include "InvalidFilter.h" |
| 9 #include "StringScanner.h" |
| 10 |
| 11 namespace |
| 12 { |
| 13 enum |
| 14 { |
| 15 TYPE_OTHER = 0x1, |
| 16 TYPE_SCRIPT = 0x2, |
| 17 TYPE_IMAGE = 0x4, |
| 18 TYPE_STYLESHEET = 0x8, |
| 19 TYPE_OBJECT = 0x10, |
| 20 TYPE_SUBDOCUMENT = 0x20, |
| 21 TYPE_DOCUMENT = 0x40, |
| 22 TYPE_PING = 0x400, |
| 23 TYPE_XMLHTTPREQUEST = 0x800, |
| 24 TYPE_OBJECT_SUBREQUEST = 0x1000, |
| 25 TYPE_MEDIA = 0x4000, |
| 26 TYPE_FONT = 0x8000, |
| 27 TYPE_POPUP = 0x8000000, |
| 28 TYPE_GENERICBLOCK = 0x10000000, |
| 29 TYPE_GENERICHIDE = 0x20000000, |
| 30 TYPE_ELEMHIDE = 0x40000000, |
| 31 }; |
| 32 |
| 33 std::unordered_map<std::u16string,int> typeMap({ |
| 34 {u"OTHER", TYPE_OTHER}, |
| 35 {u"SCRIPT", TYPE_SCRIPT}, |
| 36 {u"IMAGE", TYPE_IMAGE}, |
| 37 {u"STYLESHEET", TYPE_STYLESHEET}, |
| 38 {u"OBJECT", TYPE_OBJECT}, |
| 39 {u"SUBDOCUMENT", TYPE_SUBDOCUMENT}, |
| 40 {u"DOCUMENT", TYPE_DOCUMENT}, |
| 41 {u"XBL", TYPE_OTHER}, // Backwards compat |
| 42 {u"PING", TYPE_PING}, |
| 43 {u"XMLHTTPREQUEST", TYPE_XMLHTTPREQUEST}, |
| 44 {u"OBJECT_SUBREQUEST", TYPE_OBJECT_SUBREQUEST}, |
| 45 {u"DTD", TYPE_OTHER}, // Backwards compat |
| 46 {u"MEDIA", TYPE_MEDIA}, |
| 47 {u"FONT", TYPE_FONT}, |
| 48 {u"BACKGROUND", TYPE_IMAGE}, // Backwards compat |
| 49 |
| 50 {u"POPUP", TYPE_POPUP}, |
| 51 {u"GENERICBLOCK", TYPE_GENERICBLOCK}, |
| 52 {u"GENERICHIDE", TYPE_GENERICHIDE}, |
| 53 {u"ELEMHIDE", TYPE_ELEMHIDE}, |
| 54 }); |
| 55 |
| 56 int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP | |
| 57 TYPE_GENERICBLOCK | TYPE_GENERICHIDE); |
| 58 |
| 59 int GenerateRegExp(const std::u16string& regexp, bool matchCase) |
| 60 { |
| 61 return EM_ASM_INT(return regexps.create($0, $1), ®exp, matchCase); |
| 62 } |
| 63 } |
| 64 |
| 65 RegExpFilter::RegExpFilter(const std::u16string& text, |
| 66 const std::u16string& pattern, const std::u16string& options) |
| 67 : ActiveFilter(text, true), regexpId(0), contentType(-1), matchCase(false), |
| 68 thirdParty(TrippleState::ANY) |
| 69 { |
| 70 int optionStart = 0; |
| 71 int optionEnd = -1; |
| 72 int valueStart = -1; |
| 73 StringScanner scanner(options + u","); |
| 74 while (!scanner.done()) |
| 75 { |
| 76 switch (scanner.next()) |
| 77 { |
| 78 case u'=': |
| 79 if (optionEnd < 0) |
| 80 { |
| 81 optionEnd = scanner.position(); |
| 82 valueStart = optionEnd + 1; |
| 83 } |
| 84 break; |
| 85 case u',': |
| 86 if (optionEnd < 0) |
| 87 optionEnd = scanner.position(); |
| 88 ProcessOption(options, optionStart, optionEnd, valueStart, scanner.posit
ion()); |
| 89 optionStart = scanner.position() + 1; |
| 90 optionEnd = -1; |
| 91 valueStart = -1; |
| 92 break; |
| 93 } |
| 94 } |
| 95 if (contentType < 0) |
| 96 contentType = defaultTypeMask; |
| 97 |
| 98 size_t len = pattern.length(); |
| 99 if (len >= 2 && pattern[0] == u'/' && pattern[len - 1] == u'/') |
| 100 { |
| 101 regexpId = GenerateRegExp(pattern.substr(1, len - 2), matchCase); |
| 102 |
| 103 std::u16string* error = reinterpret_cast<std::u16string*>(EM_ASM_INT(return
regexps.getError($0), regexpId)); |
| 104 if (error) |
| 105 { |
| 106 EM_ASM_ARGS(regexps.delete($0), regexpId); |
| 107 throw std::u16string(*error); |
| 108 } |
| 109 } |
| 110 else |
| 111 regexpSource = pattern; |
| 112 } |
| 113 |
| 114 RegExpFilter::~RegExpFilter() |
| 115 { |
| 116 if (regexpId) |
| 117 EM_ASM_ARGS(regexps.delete($0), regexpId); |
| 118 } |
| 119 |
| 120 void RegExpFilter::ProcessOption(const std::u16string& options, |
| 121 int optionStart, int optionEnd, int valueStart, int valueEnd) |
| 122 { |
| 123 if (optionEnd <= optionStart) |
| 124 return; |
| 125 |
| 126 bool reverse = false; |
| 127 if (options[optionStart] == u'~') |
| 128 { |
| 129 reverse = true; |
| 130 optionStart++; |
| 131 } |
| 132 |
| 133 std::u16string name(options.substr(optionStart, optionEnd - optionStart)); |
| 134 for (size_t i = 0, l = name.length(); i < l; ++i) |
| 135 { |
| 136 char16_t currChar = name[i]; |
| 137 if (currChar >= u'a' && currChar <= u'z') |
| 138 name[i] = currChar + u'A' - u'a'; |
| 139 else if (currChar == u'-') |
| 140 name[i] = u'_'; |
| 141 } |
| 142 |
| 143 auto it = typeMap.find(name); |
| 144 if (it != typeMap.end()) |
| 145 { |
| 146 if (contentType < 0) |
| 147 contentType = reverse ? defaultTypeMask : 0; |
| 148 if (reverse) |
| 149 contentType &= ~it->second; |
| 150 else |
| 151 contentType |= it->second; |
| 152 } |
| 153 else if (!name.compare(u"DOMAIN")) |
| 154 { |
| 155 if (valueStart >= 0 && valueEnd > valueStart) |
| 156 ParseDomains(options.substr(valueStart, valueEnd - valueStart), u'|'); |
| 157 } |
| 158 else if (!name.compare(u"SITEKEY")) |
| 159 { |
| 160 if (valueStart >= 0 && valueEnd > valueStart) |
| 161 { |
| 162 StringScanner scanner(options.substr(valueStart, valueEnd - valueStart) +
u"|"); |
| 163 size_t start = 0; |
| 164 while (!scanner.done()) |
| 165 { |
| 166 if (scanner.next() == u'|') |
| 167 { |
| 168 if (scanner.position() > start) |
| 169 sitekeys.insert(options.substr(valueStart + start, scanner.position(
) - start)); |
| 170 start = scanner.position() + 1; |
| 171 } |
| 172 } |
| 173 } |
| 174 } |
| 175 else if (!name.compare(u"MATCH_CASE")) |
| 176 matchCase = !reverse; |
| 177 else if (!name.compare(u"THIRD_PARTY")) |
| 178 thirdParty = reverse ? TrippleState::NO : TrippleState::YES; |
| 179 else if (!name.compare(u"COLLAPSE")) |
| 180 collapse = reverse ? TrippleState::NO : TrippleState::YES; |
| 181 else |
| 182 throw std::u16string(u"Unknown option " + name); |
| 183 } |
| 184 |
| 185 Filter* RegExpFilter::Create(const std::u16string& text) |
| 186 { |
| 187 bool blocking = true; |
| 188 size_t patternStart = 0; |
| 189 if (!text.compare(0, 2, u"@@")) |
| 190 { |
| 191 blocking = false; |
| 192 patternStart = 2; |
| 193 } |
| 194 |
| 195 size_t patternEnd = text.find(u'$', patternStart); |
| 196 size_t patternLength = (patternEnd != std::u16string::npos ? |
| 197 patternEnd - patternStart : patternEnd); |
| 198 std::u16string pattern(text.substr(patternStart, patternLength)); |
| 199 std::u16string options(patternEnd != std::u16string::npos ? |
| 200 text.substr(patternEnd + 1) : u""); |
| 201 |
| 202 try |
| 203 { |
| 204 if (blocking) |
| 205 return new RegExpFilter(text, pattern, options); |
| 206 else |
| 207 return new WhiteListFilter(text, pattern, options); |
| 208 } |
| 209 catch (const std::u16string& reason) |
| 210 { |
| 211 return new InvalidFilter(text, reason); |
| 212 } |
| 213 } |
| 214 |
| 215 void RegExpFilter::InitJSTypes() |
| 216 { |
| 217 for (auto it = typeMap.begin(); it != typeMap.end(); ++it) |
| 218 EM_ASM_ARGS(Module.RegExpFilter_typeMap[getStringData($0)] = $1, &(it->first
), it->second); |
| 219 } |
| 220 |
| 221 const std::u16string RegExpFilter::RegExpFromSource(const std::u16string& source
) |
| 222 { |
| 223 // Note: This doesn't remove trailing wildcards, otherwise the result should |
| 224 // be identical to Filter.toRegExp(). |
| 225 std::u16string result; |
| 226 char16_t prevChar = u'*'; |
| 227 for (size_t i = 0, l = source.length(); i < l; ++i) |
| 228 { |
| 229 char16_t currChar = source[i]; |
| 230 switch (currChar) |
| 231 { |
| 232 case u'*': |
| 233 if (prevChar != u'*') |
| 234 result += u".*"; |
| 235 break; |
| 236 case u'^': |
| 237 result += u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\
x7B-\\x7F]|$)"; |
| 238 break; |
| 239 case u'|': |
| 240 if (i == 0) |
| 241 { |
| 242 // Anchor at expression start, maybe extended anchor? |
| 243 if (i + 1 < l && source[i + 1] == u'|') |
| 244 { |
| 245 result += u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"; |
| 246 ++i; |
| 247 } |
| 248 else |
| 249 result += u"^"; |
| 250 } |
| 251 else if (i == l - 1) |
| 252 { |
| 253 // Anchor at expression end, ignore if following separator placeholder |
| 254 if (prevChar != u'^') |
| 255 result += u"$"; |
| 256 } |
| 257 else |
| 258 { |
| 259 // Not actually an anchor, escape it |
| 260 result += u"\\|"; |
| 261 } |
| 262 break; |
| 263 default: |
| 264 if ((currChar >= u'a' && currChar <= u'z') || |
| 265 (currChar >= u'A' && currChar <= u'Z') || |
| 266 (currChar >= u'0' && currChar <= u'9') || |
| 267 currChar >= 128) |
| 268 { |
| 269 result += currChar; |
| 270 } |
| 271 else |
| 272 { |
| 273 result += u"\\"; |
| 274 result.append(1, currChar); |
| 275 } |
| 276 } |
| 277 prevChar = currChar; |
| 278 } |
| 279 return result; |
| 280 } |
| 281 |
| 282 Filter::Type RegExpFilter::GetType() const |
| 283 { |
| 284 return Type::BLOCKING; |
| 285 } |
| 286 |
| 287 bool RegExpFilter::Matches(const std::u16string& location, int typeMask, |
| 288 const std::u16string& docDomain, bool thirdParty, |
| 289 const std::u16string& sitekey) |
| 290 { |
| 291 if (!(this->contentType & typeMask) || |
| 292 (this->thirdParty == TrippleState::YES && !thirdParty) || |
| 293 (this->thirdParty == TrippleState::NO && thirdParty) || |
| 294 !IsActiveOnDomain(docDomain, sitekey)) |
| 295 { |
| 296 return false; |
| 297 } |
| 298 |
| 299 if (!regexpId) |
| 300 { |
| 301 regexpId = GenerateRegExp(RegExpFromSource(regexpSource), matchCase); |
| 302 regexpSource.resize(0); |
| 303 } |
| 304 return EM_ASM_INT(return regexps.test($0, $1), regexpId, &location); |
| 305 } |
OLD | NEW |