Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: compiled/RegExpFilter.cpp

Issue 29333474: Issue 4125 - [emscripten] Convert filter classes to C++ (Closed)
Patch Set: Addressed Sergei's comments again and added some asserts Created Feb. 23, 2016, 12:30 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #include <climits>
2
3 #include <emscripten.h>
4
5 #include "RegExpFilter.h"
6 #include "StringScanner.h"
7 #include "StringMap.h"
8
9 namespace
10 {
// Content type flags. Every flag occupies its own bit so that flags can be
// combined into a single integer mask.
enum
{
  TYPE_OTHER = 1 << 0,
  TYPE_SCRIPT = 1 << 1,
  TYPE_IMAGE = 1 << 2,
  TYPE_STYLESHEET = 1 << 3,
  TYPE_OBJECT = 1 << 4,
  TYPE_SUBDOCUMENT = 1 << 5,
  TYPE_DOCUMENT = 1 << 6,
  TYPE_PING = 1 << 10,
  TYPE_XMLHTTPREQUEST = 1 << 11,
  TYPE_OBJECT_SUBREQUEST = 1 << 12,
  TYPE_MEDIA = 1 << 14,
  TYPE_FONT = 1 << 15,
  TYPE_POPUP = 1 << 27,
  TYPE_GENERICBLOCK = 1 << 28,
  TYPE_GENERICHIDE = 1 << 29,
  TYPE_ELEMHIDE = 1 << 30,
};
30
31 StringMap<int> typeMap {
32 {u"other"_str, TYPE_OTHER},
33 {u"script"_str, TYPE_SCRIPT},
34 {u"image"_str, TYPE_IMAGE},
35 {u"stylesheet"_str, TYPE_STYLESHEET},
36 {u"object"_str, TYPE_OBJECT},
37 {u"subdocument"_str, TYPE_SUBDOCUMENT},
38 {u"document"_str, TYPE_DOCUMENT},
39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat
40 {u"ping"_str, TYPE_PING},
41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST},
42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST},
43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat
44 {u"media"_str, TYPE_MEDIA},
45 {u"font"_str, TYPE_FONT},
46 {u"background"_str, TYPE_IMAGE}, // Backwards compat
47
48 {u"popup"_str, TYPE_POPUP},
49 {u"genericblock"_str, TYPE_GENERICBLOCK},
50 {u"generichide"_str, TYPE_GENERICHIDE},
51 {u"elemhide"_str, TYPE_ELEMHIDE},
52 };
53
54 const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE |
55 TYPE_POPUP | TYPE_GENERICBLOCK | TYPE_GENERICHIDE);
56
57 int GenerateRegExp(const String& regexp, bool matchCase)
58 {
59 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase);
60 }
61
62 void NormalizeWhitespace(DependentString& text)
63 {
64 // We want to remove all spaces but bail out early in the common scenario
65 // that the string contains no spaces.
66
67 // Look for the first space
68 String::size_type len = text.length();
69 String::size_type pos;
70 for (pos = 0; pos < len; pos++)
71 if (text[pos] == ' ')
72 break;
73
74 if (pos >= len)
75 return;
76
77 // Found spaces, move characters to remove them
78 String::size_type delta = 1;
79 for (pos = pos + 1; pos < len; pos++)
80 {
81 if (text[pos] == ' ')
82 delta++;
83 else
84 text[pos - delta] = text[pos];
85 }
86 text.reset(text, 0, len - delta);
87 }
88
89 void ParseOption(String& text, DependentString& error, RegExpFilterData& data,
90 int optionStart, int optionEnd, int valueStart, int valueEnd)
91 {
92 if (optionEnd <= optionStart)
93 return;
94
95 bool reverse = false;
96 if (text[optionStart] == u'~')
97 {
98 reverse = true;
99 optionStart++;
100 }
101
102 DependentString name(text, optionStart, optionEnd - optionStart);
103 for (size_t i = 0; i < name.length(); ++i)
104 {
105 char16_t currChar = name[i];
106 if (currChar >= u'A' && currChar <= u'Z')
107 name[i] = currChar + u'a' - u'A';
108 else if (currChar == u'_')
109 name[i] = u'-';
110 }
111
112 auto it = typeMap.find(name);
113 if (it)
114 {
115 if (data.mContentType < 0)
116 data.mContentType = reverse ? defaultTypeMask : 0;
117 if (reverse)
118 data.mContentType &= ~it->second;
119 else
120 data.mContentType |= it->second;
121 }
122 else if (name.equals(u"domain"_str))
123 {
124 if (valueStart >= 0 && valueEnd > valueStart)
125 {
126 data.mDomainsStart = valueStart;
127 data.mDomainsEnd = valueEnd;
128 ActiveFilter::ToLower(DependentString(text, data.mDomainsStart,
Wladimir Palant 2016/02/23 12:37:47 This should be valueStart rather than data.mDomain
129 valueEnd - valueStart));
130 }
131 }
132 else if (name.equals(u"sitekey"_str))
133 {
134 if (valueStart >= 0 && valueEnd > valueStart)
135 {
136 data.mSitekeysStart = valueStart;
137 data.mSitekeysEnd = valueEnd;
138 }
139 }
140 else if (name.equals(u"match-case"_str))
141 data.mMatchCase = !reverse;
142 else if (name.equals(u"third-party"_str))
143 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES;
144 else if (name.equals(u"collapse"_str))
145 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES;
146 else
147 error.reset(u"filter_unknown_option"_str);
148 }
149
150 void ParseOptions(String& text, DependentString& error, RegExpFilterData& data ,
151 String::size_type optionsStart)
152 {
153 data.mMatchCase = false;
154 data.mThirdParty = TrippleState::ANY;
155 data.mCollapse = TrippleState::ANY;
156 data.mDomainsStart = String::npos;
157 data.mSitekeysStart = String::npos;
158 if (optionsStart >= text.length())
159 {
160 data.mContentType = defaultTypeMask;
161 return;
162 }
163
164 data.mContentType = -1;
165
166 int optionStart = data.mPatternEnd + 1;
167 int optionEnd = -1;
168 int valueStart = -1;
169
170 StringScanner scanner(text, optionStart, u',');
171 bool done = false;
172 while (!done)
173 {
174 done = scanner.done();
175 switch (scanner.next())
176 {
177 case u'=':
178 if (optionEnd < 0)
179 {
180 optionEnd = scanner.position();
181 valueStart = optionEnd + 1;
182 }
183 break;
184 case u',':
185 if (optionEnd < 0)
186 optionEnd = scanner.position();
187 ParseOption(text, error, data, optionStart, optionEnd, valueStart,
188 scanner.position());
189 if (!error.empty())
190 return;
191
192 optionStart = scanner.position() + 1;
193 optionEnd = -1;
194 valueStart = -1;
195 break;
196 }
197 }
198
199 if (data.mContentType < 0)
200 data.mContentType = defaultTypeMask;
201 }
202 }
203
204 RegExpFilter::RegExpFilter(const String& text, const RegExpFilterData& data)
205 : ActiveFilter(text, true), RegExpFilterData(data)
206 {
207 }
208
209 RegExpFilter::~RegExpFilter()
210 {
211 if (HasRegExp())
212 EM_ASM_ARGS(regexps.delete($0), mRegexpId);
213 }
214
215 Filter::Type RegExpFilter::Parse(DependentString& text, DependentString& error,
216 RegExpFilterData& data)
217 {
218 NormalizeWhitespace(text);
219
220 bool blocking = true;
221
222 data.mPatternStart = 0;
223 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@')
224 {
225 blocking = false;
226 data.mPatternStart = 2;
227 }
228
229 data.mPatternEnd = text.find(u'$', data.mPatternStart);
230 if (data.mPatternEnd == text.npos)
231 data.mPatternEnd = text.length();
232
233 ParseOptions(text, error, data, data.mPatternEnd + 1);
234 if (!error.empty())
235 return Type::INVALID;
236
237 if (data.mPatternEnd - data.mPatternStart >= 2 &&
238 text[data.mPatternStart] == u'/' &&
239 text[data.mPatternEnd - 1] == u'/')
240 {
241 data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1,
242 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase));
243 if (data.mRegexpId == -1)
244 {
245 error.reset(u"filter_invalid_regexp"_str);
246 return Type::INVALID;
247 }
248 }
249
250 if (blocking)
251 return Type::BLOCKING;
252 else
253 return Type::WHITELIST;
254 }
255
256 void RegExpFilter::ParseSitekeys(const String& sitekeys) const
257 {
258 StringScanner scanner(sitekeys, 0, u'|');
259 size_t start = 0;
260 bool done = false;
261 while (!done)
262 {
263 done = scanner.done();
264 if (scanner.next() == u'|')
265 {
266 if (scanner.position() > start)
267 AddSitekey(DependentString(sitekeys, start, scanner.position() - start)) ;
268 start = scanner.position() + 1;
269 }
270 }
271 }
272
273 void RegExpFilter::InitJSTypes()
274 {
275 EM_ASM(exports.RegExpFilter.typeMap = {};);
276 for (auto it = typeMap.begin(); it != typeMap.end(); ++it)
277 EM_ASM_ARGS(exports.RegExpFilter.typeMap[getStringData($0).replace("-", "_") .toUpperCase()] = $1, &(it->first), it->second);
278 }
279
280 OwnedString RegExpFilter::RegExpFromSource(const String& source)
281 {
282 /* TODO: this is very inefficient */
283
284 // Note: This doesn't remove trailing wildcards, otherwise the result should
285 // be identical to Filter.toRegExp().
286 OwnedString result;
287 String::value_type prevChar = u'*';
288 for (String::size_type i = 0; i < source.length(); ++i)
289 {
290 String::value_type currChar = source[i];
291 switch (currChar)
292 {
293 case u'*':
294 if (prevChar != u'*')
295 result.append(u".*"_str);
296 break;
297 case u'^':
298 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x 60\\x7B-\\x7F]|$)"_str);
299 break;
300 case u'|':
301 if (i == 0)
302 {
303 // Anchor at expression start, maybe extended anchor?
304 if (i + 1 < source.length() && source[i + 1] == u'|')
305 {
306 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str);
307 ++i;
308 }
309 else
310 result.append(u'^');
311 }
312 else if (i == source.length() - 1)
313 {
314 // Anchor at expression end, ignore if following separator placeholder
315 if (prevChar != u'^')
316 result.append(u'$');
317 }
318 else
319 {
320 // Not actually an anchor, escape it
321 result.append(u"\\|"_str);
322 }
323 break;
324 default:
325 if (!(currChar >= u'a' && currChar <= u'z') &&
326 !(currChar >= u'A' && currChar <= u'Z') &&
327 !(currChar >= u'0' && currChar <= u'9') &&
328 currChar < 128)
329 {
330 result.append(u'\\');
331 }
332 result.append(currChar);
333 }
334 prevChar = currChar;
335 }
336 return result;
337 }
338
339 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const
340 {
341 if (!DomainsParsingDone())
342 {
343 ParseDomains(GetDomainsSource(mText), u'|');
344 SetDomainsParsingDone();
345 }
346 return ActiveFilter::GetDomains();
347 }
348
349 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const
350 {
351 if (!SitekeyParsingDone())
352 {
353 ParseSitekeys(GetSitekeysSource(mText));
354 SetSitekeysParsingDone();
355 }
356 return ActiveFilter::GetSitekeys();
357 }
358
359 bool RegExpFilter::Matches(const String& location, int typeMask,
360 DependentString& docDomain, bool thirdParty, const String& sitekey) const
361 {
362 if (!(mContentType & typeMask) ||
363 (mThirdParty == TrippleState::YES && !thirdParty) ||
364 (mThirdParty == TrippleState::NO && thirdParty) ||
365 !IsActiveOnDomain(docDomain, sitekey))
366 {
367 return false;
368 }
369
370 if (!RegExpParsingDone())
371 {
372 const OwnedString pattern(GetRegExpSource(mText));
373 SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mMatchCase));
374 }
375 return EM_ASM_INT(return regexps.test($0, $1), mRegexpId, &location);
376 }
OLDNEW
« no previous file with comments | « compiled/RegExpFilter.h ('k') | compiled/String.h » ('j') | compiled/String.h » ('J')

Powered by Google App Engine
This is Rietveld