Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 #include <climits> | |
2 | |
3 #include <emscripten.h> | |
4 | |
5 #include "RegExpFilter.h" | |
6 #include "StringScanner.h" | |
7 #include "StringMap.h" | |
8 | |
9 namespace | |
10 { | |
// Content type flags. Every enumerator occupies its own bit, so several types
// can be OR-ed together into a single mask.
enum
{
  TYPE_OTHER = 1 << 0,
  TYPE_SCRIPT = 1 << 1,
  TYPE_IMAGE = 1 << 2,
  TYPE_STYLESHEET = 1 << 3,
  TYPE_OBJECT = 1 << 4,
  TYPE_SUBDOCUMENT = 1 << 5,
  TYPE_DOCUMENT = 1 << 6,
  TYPE_PING = 1 << 10,
  TYPE_XMLHTTPREQUEST = 1 << 11,
  TYPE_OBJECT_SUBREQUEST = 1 << 12,
  TYPE_MEDIA = 1 << 14,
  TYPE_FONT = 1 << 15,
  TYPE_POPUP = 1 << 27,
  TYPE_GENERICBLOCK = 1 << 28,
  TYPE_GENERICHIDE = 1 << 29,
  TYPE_ELEMHIDE = 1 << 30,
};
30 | |
31 StringMap<int> typeMap { | |
32 {u"other"_str, TYPE_OTHER}, | |
33 {u"script"_str, TYPE_SCRIPT}, | |
34 {u"image"_str, TYPE_IMAGE}, | |
35 {u"stylesheet"_str, TYPE_STYLESHEET}, | |
36 {u"object"_str, TYPE_OBJECT}, | |
37 {u"subdocument"_str, TYPE_SUBDOCUMENT}, | |
38 {u"document"_str, TYPE_DOCUMENT}, | |
39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat | |
40 {u"ping"_str, TYPE_PING}, | |
41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST}, | |
42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST}, | |
43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat | |
44 {u"media"_str, TYPE_MEDIA}, | |
45 {u"font"_str, TYPE_FONT}, | |
46 {u"background"_str, TYPE_IMAGE}, // Backwards compat | |
47 | |
48 {u"popup"_str, TYPE_POPUP}, | |
49 {u"genericblock"_str, TYPE_GENERICBLOCK}, | |
50 {u"generichide"_str, TYPE_GENERICHIDE}, | |
51 {u"elemhide"_str, TYPE_ELEMHIDE}, | |
52 }; | |
53 | |
54 int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP | | |
sergei
2016/02/17 12:54:38
Should not it be const?
Wladimir Palant
2016/02/18 16:06:45
Done.
| |
55 TYPE_GENERICBLOCK | TYPE_GENERICHIDE); | |
56 | |
57 int GenerateRegExp(const String& regexp, bool matchCase) | |
58 { | |
59 return EM_ASM_INT(return regexps.create($0, $1), ®exp, matchCase); | |
60 } | |
61 | |
62 void NormalizeWhitespace(DependentString& text) | |
63 { | |
64 // We want to remove all spaces but bail out early in the common scenario | |
65 // that the string contains no spaces. | |
66 | |
67 // Look for the first space | |
68 String::size_type len = text.length(); | |
69 String::size_type pos; | |
70 for (pos = 0; pos < len; pos++) | |
71 if (text[pos] == ' ') | |
72 break; | |
73 | |
74 if (pos >= len) | |
75 return; | |
76 | |
77 // Found spaces, move characters to remove them | |
78 String::size_type delta = 1; | |
79 for (pos = pos + 1; pos < len; pos++) | |
80 { | |
81 if (text[pos] == ' ') | |
82 delta++; | |
83 else | |
84 text[pos - delta] = text[pos]; | |
85 } | |
86 text.reset(text, 0, len - delta); | |
87 } | |
88 | |
89 void ParseOption(String& text, OwnedString& error, RegExpFilterData& data, | |
90 int optionStart, int optionEnd, int valueStart, int valueEnd) | |
91 { | |
92 if (optionEnd <= optionStart) | |
93 return; | |
94 | |
95 bool reverse = false; | |
96 if (text[optionStart] == u'~') | |
97 { | |
98 reverse = true; | |
99 optionStart++; | |
100 } | |
101 | |
102 DependentString name(text, optionStart, optionEnd - optionStart); | |
103 for (size_t i = 0; i < name.length(); ++i) | |
104 { | |
105 char16_t currChar = name[i]; | |
106 if (currChar >= u'A' && currChar <= u'Z') | |
107 name[i] = currChar + u'a' - u'A'; | |
108 else if (currChar == u'_') | |
109 name[i] = u'-'; | |
110 } | |
111 | |
112 auto it = typeMap.find(name); | |
113 if (it != typeMap.end()) | |
114 { | |
115 if (data.mContentType < 0) | |
116 data.mContentType = reverse ? defaultTypeMask : 0; | |
117 if (reverse) | |
118 data.mContentType &= ~it->second; | |
119 else | |
120 data.mContentType |= it->second; | |
121 } | |
122 else if (name.equals(u"domain"_str)) | |
123 { | |
124 if (valueStart >= 0 && valueEnd > valueStart) | |
125 { | |
126 data.mDomainsStart = valueStart; | |
127 data.mDomainsEnd = valueEnd; | |
128 ActiveFilter::ToLower(text, data.mDomainsStart, data.mDomainsEnd); | |
129 } | |
130 } | |
131 else if (name.equals(u"sitekey"_str)) | |
132 { | |
133 if (valueStart >= 0 && valueEnd > valueStart) | |
134 { | |
135 data.mSitekeysStart = valueStart; | |
136 data.mSitekeysEnd = valueEnd; | |
137 } | |
138 } | |
139 else if (name.equals(u"match-case"_str)) | |
140 data.mMatchCase = !reverse; | |
141 else if (name.equals(u"third-party"_str)) | |
142 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES; | |
143 else if (name.equals(u"collapse"_str)) | |
144 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES; | |
145 else | |
146 { | |
147 error = u"Unknown option "_str; | |
148 error.append(name); | |
149 } | |
150 } | |
151 | |
152 void ParseOptions(String& text, OwnedString& error, RegExpFilterData& data, | |
153 String::size_type optionsStart) | |
154 { | |
155 data.mMatchCase = false; | |
156 data.mThirdParty = TrippleState::ANY; | |
157 data.mCollapse = TrippleState::ANY; | |
158 data.mDomainsStart = String::npos; | |
159 data.mSitekeysStart = String::npos; | |
160 if (optionsStart >= text.length()) | |
161 { | |
162 data.mContentType = defaultTypeMask; | |
163 return; | |
164 } | |
165 | |
166 data.mContentType = -1; | |
167 | |
168 int optionStart = data.mPatternEnd + 1; | |
169 int optionEnd = -1; | |
170 int valueStart = -1; | |
171 | |
172 StringScanner scanner(text, optionStart, u','); | |
173 bool done = false; | |
174 while (!done) | |
175 { | |
176 done = scanner.done(); | |
177 switch (scanner.next()) | |
178 { | |
179 case u'=': | |
180 if (optionEnd < 0) | |
181 { | |
182 optionEnd = scanner.position(); | |
183 valueStart = optionEnd + 1; | |
184 } | |
185 break; | |
186 case u',': | |
187 if (optionEnd < 0) | |
188 optionEnd = scanner.position(); | |
189 ParseOption(text, error, data, optionStart, optionEnd, valueStart, | |
190 scanner.position()); | |
191 if (!error.empty()) | |
192 return; | |
193 | |
194 optionStart = scanner.position() + 1; | |
195 optionEnd = -1; | |
196 valueStart = -1; | |
197 break; | |
198 } | |
199 } | |
200 | |
201 if (data.mContentType < 0) | |
202 data.mContentType = defaultTypeMask; | |
203 } | |
204 } | |
205 | |
206 RegExpFilter::RegExpFilter(const String& text, const RegExpFilterData& data) | |
207 : ActiveFilter(text, true), RegExpFilterData(data) | |
208 { | |
209 } | |
210 | |
211 RegExpFilter::~RegExpFilter() | |
212 { | |
213 if (HasRegExp()) | |
214 EM_ASM_ARGS(regexps.delete($0), mRegexpId); | |
215 } | |
216 | |
217 Filter::Type RegExpFilter::Parse(DependentString& text, OwnedString& error, | |
218 RegExpFilterData& data) | |
219 { | |
220 NormalizeWhitespace(text); | |
221 | |
222 bool blocking = true; | |
223 | |
224 data.mPatternStart = 0; | |
225 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@') | |
226 { | |
227 blocking = false; | |
228 data.mPatternStart = 2; | |
229 } | |
230 | |
231 data.mPatternEnd = text.find(u'$', data.mPatternStart); | |
232 if (data.mPatternEnd == text.npos) | |
233 data.mPatternEnd = text.length(); | |
234 | |
235 ParseOptions(text, error, data, data.mPatternEnd + 1); | |
236 if (!error.empty()) | |
237 return Type::INVALID; | |
238 | |
239 if (data.mPatternEnd - data.mPatternStart >= 2 && | |
240 text[data.mPatternStart] == u'/' && | |
241 text[data.mPatternEnd - 1] == u'/') | |
242 { | |
243 data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1, | |
244 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase)); | |
245 | |
246 int errorLength = EM_ASM_INT(return regexps.getErrorLength($0), | |
247 data.mRegexpId); | |
248 if (errorLength >= 0) | |
249 { | |
250 OwnedString regexpError(errorLength); | |
251 EM_ASM_ARGS(regexps.getError($0, $1), data.mRegexpId, regexpError.data()); | |
252 error = std::move(regexpError); | |
253 return Type::INVALID; | |
254 } | |
255 } | |
256 | |
257 if (blocking) | |
258 return Type::BLOCKING; | |
259 else | |
260 return Type::WHITELIST; | |
261 } | |
262 | |
263 void RegExpFilter::ParseSitekeys(const String& sitekeys) const | |
264 { | |
265 StringScanner scanner(sitekeys, 0, u'|'); | |
266 size_t start = 0; | |
267 bool done = false; | |
268 while (!done) | |
269 { | |
270 done = scanner.done(); | |
271 if (scanner.next() == u'|') | |
272 { | |
273 if (scanner.position() > start) | |
274 AddSitekey(DependentString(sitekeys, start, scanner.position() - start)) ; | |
275 start = scanner.position() + 1; | |
276 } | |
277 } | |
278 } | |
279 | |
280 void RegExpFilter::InitJSTypes() | |
281 { | |
282 EM_ASM(exports.RegExpFilter.typeMap = {};); | |
283 for (auto it = typeMap.begin(); it != typeMap.end(); ++it) | |
284 EM_ASM_ARGS(exports.RegExpFilter.typeMap[getStringData($0).replace("-", "_") .toUpperCase()] = $1, &(it->first), it->second); | |
285 } | |
286 | |
287 OwnedString RegExpFilter::RegExpFromSource(const String& source) | |
288 { | |
289 /* TODO: this is very inefficient */ | |
290 | |
291 // Note: This doesn't remove trailing wildcards, otherwise the result should | |
292 // be identical to Filter.toRegExp(). | |
293 OwnedString result; | |
294 String::value_type prevChar = u'*'; | |
295 for (String::size_type i = 0; i < source.length(); ++i) | |
296 { | |
297 String::value_type currChar = source[i]; | |
298 switch (currChar) | |
299 { | |
300 case u'*': | |
301 if (prevChar != u'*') | |
302 result.append(u".*"_str); | |
303 break; | |
304 case u'^': | |
305 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x 60\\x7B-\\x7F]|$)"_str); | |
306 break; | |
307 case u'|': | |
308 if (i == 0) | |
309 { | |
310 // Anchor at expression start, maybe extended anchor? | |
311 if (i + 1 < source.length() && source[i + 1] == u'|') | |
312 { | |
313 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str); | |
314 ++i; | |
315 } | |
316 else | |
317 result.append(u'^'); | |
318 } | |
319 else if (i == source.length() - 1) | |
320 { | |
321 // Anchor at expression end, ignore if following separator placeholder | |
322 if (prevChar != u'^') | |
323 result.append(u'$'); | |
324 } | |
325 else | |
326 { | |
327 // Not actually an anchor, escape it | |
328 result.append(u"\\|"_str); | |
329 } | |
330 break; | |
331 default: | |
332 if (!(currChar >= u'a' && currChar <= u'z') && | |
333 !(currChar >= u'A' && currChar <= u'Z') && | |
334 !(currChar >= u'0' && currChar <= u'9') && | |
335 currChar < 128) | |
336 { | |
337 result.append(u'\\'); | |
338 } | |
339 result.append(currChar); | |
340 } | |
341 prevChar = currChar; | |
342 } | |
343 return std::move(result); | |
344 } | |
345 | |
346 Filter::Type RegExpFilter::GetType() const | |
347 { | |
348 return Type::BLOCKING; | |
349 } | |
350 | |
351 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const | |
352 { | |
353 if (!DomainsParsingDone()) | |
354 { | |
355 ParseDomains(GetDomainsSource(mText), u'|'); | |
356 SetDomainsParsingDone(); | |
357 } | |
358 return ActiveFilter::GetDomains(); | |
359 } | |
360 | |
361 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const | |
362 { | |
363 if (!SitekeyParsingDone()) | |
364 { | |
365 ParseSitekeys(GetSitekeysSource(mText)); | |
366 SetSitekeysParsingDone(); | |
367 } | |
368 return ActiveFilter::GetSitekeys(); | |
369 } | |
370 | |
371 bool RegExpFilter::Matches(const String& location, int typeMask, | |
372 DependentString& docDomain, bool thirdParty, const String& sitekey) const | |
373 { | |
374 if (!(mContentType & typeMask) || | |
375 (mThirdParty == TrippleState::YES && !thirdParty) || | |
376 (mThirdParty == TrippleState::NO && thirdParty) || | |
377 !IsActiveOnDomain(docDomain, sitekey)) | |
378 { | |
379 return false; | |
380 } | |
381 | |
382 if (!RegExpParsingDone()) | |
383 { | |
384 const OwnedString pattern(GetRegExpSource(mText)); | |
385 SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mMatchCase)); | |
386 } | |
387 return EM_ASM_INT(return regexps.test($0, $1), mRegexpId, &location); | |
388 } | |
OLD | NEW |