Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: compiled/RegExpFilter.cpp

Issue 29333474: Issue 4125 - [emscripten] Convert filter classes to C++ (Closed)
Patch Set: Merged filter parsing and normalization Created Feb. 4, 2016, 3:01 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « compiled/RegExpFilter.h ('k') | compiled/String.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #include <climits>
2
3 #include <emscripten.h>
4
5 #include "RegExpFilter.h"
6 #include "StringScanner.h"
7 #include "StringMap.h"
8
9 namespace
10 {
// Content type flags. Every enumerator occupies a distinct bit so that any
// combination of types can be stored in a single int mask (see
// defaultTypeMask and RegExpFilterData::mContentType usage below).
enum
{
  // Regular request types
  TYPE_OTHER = 1 << 0,
  TYPE_SCRIPT = 1 << 1,
  TYPE_IMAGE = 1 << 2,
  TYPE_STYLESHEET = 1 << 3,
  TYPE_OBJECT = 1 << 4,
  TYPE_SUBDOCUMENT = 1 << 5,
  TYPE_DOCUMENT = 1 << 6,
  TYPE_PING = 1 << 10,
  TYPE_XMLHTTPREQUEST = 1 << 11,
  TYPE_OBJECT_SUBREQUEST = 1 << 12,
  TYPE_MEDIA = 1 << 14,
  TYPE_FONT = 1 << 15,

  // Types living in the high bits; all of these are excluded from
  // defaultTypeMask.
  TYPE_POPUP = 1 << 27,
  TYPE_GENERICBLOCK = 1 << 28,
  TYPE_GENERICHIDE = 1 << 29,
  TYPE_ELEMHIDE = 1 << 30,
};
30
31 StringMap<int> typeMap {
32 {u"other"_str, TYPE_OTHER},
33 {u"script"_str, TYPE_SCRIPT},
34 {u"image"_str, TYPE_IMAGE},
35 {u"stylesheet"_str, TYPE_STYLESHEET},
36 {u"object"_str, TYPE_OBJECT},
37 {u"subdocument"_str, TYPE_SUBDOCUMENT},
38 {u"document"_str, TYPE_DOCUMENT},
39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat
40 {u"ping"_str, TYPE_PING},
41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST},
42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST},
43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat
44 {u"media"_str, TYPE_MEDIA},
45 {u"font"_str, TYPE_FONT},
46 {u"background"_str, TYPE_IMAGE}, // Backwards compat
47
48 {u"popup"_str, TYPE_POPUP},
49 {u"genericblock"_str, TYPE_GENERICBLOCK},
50 {u"generichide"_str, TYPE_GENERICHIDE},
51 {u"elemhide"_str, TYPE_ELEMHIDE},
52 };
53
54 int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP |
55 TYPE_GENERICBLOCK | TYPE_GENERICHIDE);
56
57 int GenerateRegExp(const String& regexp, bool matchCase)
58 {
59 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase);
60 }
61
62 void NormalizeWhitespace(String& text)
63 {
64 // We want to remove all spaces but bail out early in the common scenario
65 // that the string contains no spaces.
66
67 // Look for the first space
68 String::size_type len = text.length();
69 String::size_type pos;
70 for (pos = 0; pos < len; pos++)
71 if (text[pos] == ' ')
72 break;
73
74 if (pos >= len)
75 return;
76
77 // Found spaces, move characters to remove them
78 String::size_type delta = 1;
79 for (pos = pos + 1; pos < len; pos++)
80 {
81 if (text[pos] == ' ')
82 delta++;
83 else
84 text[pos - delta] = text[pos];
85 }
86 text.reset(text, 0, len - delta);
87 }
88 }
89
90 RegExpFilter::RegExpFilter(const String& text, const RegExpFilterData& data)
91 : ActiveFilter(text, true), RegExpFilterData(data)
92 {
93 }
94
95 RegExpFilter::~RegExpFilter()
96 {
97 if (HasRegExp())
98 EM_ASM_ARGS(regexps.delete($0), mRegexpId);
99 }
100
101 Filter::Type RegExpFilter::Parse(String& text, String& error,
102 RegExpFilterData& data)
103 {
104 NormalizeWhitespace(text);
105
106 bool blocking = true;
107
108 data.mPatternStart = 0;
109 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@')
110 {
111 blocking = false;
112 data.mPatternStart = 2;
113 }
114
115 data.mPatternEnd = text.find(u'$', data.mPatternStart);
116 if (data.mPatternEnd == text.npos)
117 data.mPatternEnd = text.length();
118
119 ParseOptions(text, error, data, data.mPatternEnd + 1);
120 if (!error.empty())
121 return Type::INVALID;
122
123 if (data.mPatternEnd - data.mPatternStart >= 2 &&
124 text[data.mPatternStart] == u'/' &&
125 text[data.mPatternEnd - 1] == u'/')
126 {
127 data.SetRegExp(GenerateRegExp(String(text, data.mPatternStart + 1,
128 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase));
129
130 int errorLength = EM_ASM_INT(return regexps.getErrorLength($0),
131 data.mRegexpId);
132 if (errorLength >= 0)
133 {
134 String regexpError(errorLength);
135 EM_ASM_ARGS(regexps.getError($0, $1), data.mRegexpId, regexpError.data());
136 error.reset(std::move(regexpError));
137 return Type::INVALID;
138 }
139 }
140
141 if (blocking)
142 return Type::BLOCKING;
143 else
144 return Type::WHITELIST;
145 }
146
147 void RegExpFilter::ParseOptions(String& text, String& error,
148 RegExpFilterData& data, String::size_type optionsStart)
149 {
150 data.mMatchCase = false;
151 data.mThirdParty = TrippleState::ANY;
152 data.mCollapse = TrippleState::ANY;
153 data.mDomainsStart = String::npos;
154 data.mSitekeysStart = String::npos;
155 if (optionsStart >= text.length())
156 {
157 data.mContentType = defaultTypeMask;
158 return;
159 }
160
161 data.mContentType = -1;
162
163 int optionStart = data.mPatternEnd + 1;
164 int optionEnd = -1;
165 int valueStart = -1;
166
167 StringScanner scanner(text, optionStart, u',');
168 bool done = false;
169 while (!done)
170 {
171 done = scanner.done();
172 switch (scanner.next())
173 {
174 case u'=':
175 if (optionEnd < 0)
176 {
177 optionEnd = scanner.position();
178 valueStart = optionEnd + 1;
179 }
180 break;
181 case u',':
182 if (optionEnd < 0)
183 optionEnd = scanner.position();
184 ParseOption(text, error, data, optionStart, optionEnd, valueStart,
185 scanner.position());
186 if (!error.empty())
187 return;
188
189 optionStart = scanner.position() + 1;
190 optionEnd = -1;
191 valueStart = -1;
192 break;
193 }
194 }
195
196 if (data.mContentType < 0)
197 data.mContentType = defaultTypeMask;
198 }
199
200 void RegExpFilter::ParseOption(String& text, String& error,
201 RegExpFilterData& data, int optionStart, int optionEnd, int valueStart,
202 int valueEnd)
203 {
204 if (optionEnd <= optionStart)
205 return;
206
207 bool reverse = false;
208 if (text[optionStart] == u'~')
209 {
210 reverse = true;
211 optionStart++;
212 }
213
214 String name(text, optionStart, optionEnd - optionStart);
215 for (size_t i = 0; i < name.length(); ++i)
216 {
217 char16_t currChar = name[i];
218 if (currChar >= u'A' && currChar <= u'Z')
219 name[i] = currChar + u'a' - u'A';
220 else if (currChar == u'_')
221 name[i] = u'-';
222 }
223
224 auto it = typeMap.find(name);
225 if (it != typeMap.end())
226 {
227 if (data.mContentType < 0)
228 data.mContentType = reverse ? defaultTypeMask : 0;
229 if (reverse)
230 data.mContentType &= ~it->second;
231 else
232 data.mContentType |= it->second;
233 }
234 else if (name.equals(u"domain"_str))
235 {
236 if (valueStart >= 0 && valueEnd > valueStart)
237 {
238 data.mDomainsStart = valueStart;
239 data.mDomainsEnd = valueEnd;
240 ToLower(text, data.mDomainsStart, data.mDomainsEnd);
241 }
242 }
243 else if (name.equals(u"sitekey"_str))
244 {
245 if (valueStart >= 0 && valueEnd > valueStart)
246 {
247 data.mSitekeysStart = valueStart;
248 data.mSitekeysEnd = valueEnd;
249 }
250 }
251 else if (name.equals(u"match-case"_str))
252 data.mMatchCase = !reverse;
253 else if (name.equals(u"third-party"_str))
254 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES;
255 else if (name.equals(u"collapse"_str))
256 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES;
257 else
258 {
259 error.reset(u"Unknown option "_str);
260 error.append(name);
261 }
262 }
263
264 void RegExpFilter::ParseSitekeys(const String& sitekeys) const
265 {
266 StringScanner scanner(sitekeys, 0, u'|');
267 size_t start = 0;
268 bool done = false;
269 while (!done)
270 {
271 done = scanner.done();
272 if (scanner.next() == u'|')
273 {
274 if (scanner.position() > start)
275 AddSitekey(String(sitekeys, start, scanner.position() - start));
276 start = scanner.position() + 1;
277 }
278 }
279 }
280
281 void RegExpFilter::InitJSTypes()
282 {
283 EM_ASM(exports.RegExpFilter.typeMap = {};);
284 for (auto it = typeMap.begin(); it != typeMap.end(); ++it)
285 EM_ASM_ARGS(exports.RegExpFilter.typeMap[getStringData($0).replace("-", "_") .toUpperCase()] = $1, &(it->first), it->second);
286 }
287
288 String RegExpFilter::RegExpFromSource(const String& source)
289 {
290 /* TODO: this is very inefficient */
291
292 // Note: This doesn't remove trailing wildcards, otherwise the result should
293 // be identical to Filter.toRegExp().
294 String result;
295 String::value_type prevChar = u'*';
296 for (String::size_type i = 0; i < source.length(); ++i)
297 {
298 String::value_type currChar = source[i];
299 switch (currChar)
300 {
301 case u'*':
302 if (prevChar != u'*')
303 result.append(u".*"_str);
304 break;
305 case u'^':
306 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x 60\\x7B-\\x7F]|$)"_str);
307 break;
308 case u'|':
309 if (i == 0)
310 {
311 // Anchor at expression start, maybe extended anchor?
312 if (i + 1 < source.length() && source[i + 1] == u'|')
313 {
314 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str);
315 ++i;
316 }
317 else
318 result.append(u'^');
319 }
320 else if (i == source.length() - 1)
321 {
322 // Anchor at expression end, ignore if following separator placeholder
323 if (prevChar != u'^')
324 result.append(u'$');
325 }
326 else
327 {
328 // Not actually an anchor, escape it
329 result.append(u"\\|"_str);
330 }
331 break;
332 default:
333 if (!(currChar >= u'a' && currChar <= u'z') &&
334 !(currChar >= u'A' && currChar <= u'Z') &&
335 !(currChar >= u'0' && currChar <= u'9') &&
336 currChar < 128)
337 {
338 result.append(u'\\');
339 }
340 result.append(currChar);
341 }
342 prevChar = currChar;
343 }
344 return std::move(result.ensure_own_buffer());
345 }
346
347 Filter::Type RegExpFilter::GetType() const
348 {
349 return Type::BLOCKING;
350 }
351
352 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const
353 {
354 if (!DomainsParsingDone())
355 {
356 ParseDomains(GetDomainsSource(mText), u'|');
357 SetDomainsParsingDone();
358 }
359 return ActiveFilter::GetDomains();
360 }
361
362 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const
363 {
364 if (!SitekeyParsingDone())
365 {
366 ParseSitekeys(GetSitekeysSource(mText));
367 SetSitekeysParsingDone();
368 }
369 return ActiveFilter::GetSitekeys();
370 }
371
372 bool RegExpFilter::Matches(const String& location, int typeMask,
373 String& docDomain, bool thirdParty, const String& sitekey) const
374 {
375 if (!(mContentType & typeMask) ||
376 (mThirdParty == TrippleState::YES && !thirdParty) ||
377 (mThirdParty == TrippleState::NO && thirdParty) ||
378 !IsActiveOnDomain(docDomain, sitekey))
379 {
380 return false;
381 }
382
383 if (!RegExpParsingDone())
384 {
385 const String pattern(GetRegExpSource(mText));
386 SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mMatchCase));
387 }
388 return EM_ASM_INT(return regexps.test($0, $1), mRegexpId, &location);
389 }
OLDNEW
« no previous file with comments | « compiled/RegExpFilter.h ('k') | compiled/String.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld