Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: compiled/RegExpFilter.cpp

Issue 29333474: Issue 4125 - [emscripten] Convert filter classes to C++ (Closed)
Left Patch Set: Now passing all filter matching tests (without filter options) Created Jan. 18, 2016, 6:12 p.m.
Right Patch Set: Addressed comments from Patch Set 28 Created March 21, 2017, 10:04 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « compiled/RegExpFilter.h ('k') | compiled/String.h » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 #include <climits>
2
1 #include <emscripten.h> 3 #include <emscripten.h>
2 4
3 #include "RegExpFilter.h" 5 #include "RegExpFilter.h"
4 #include "WhiteListFilter.h" 6 #include "StringScanner.h"
5 #include "InvalidFilter.h" 7 #include "StringMap.h"
6 8
7 namespace 9 namespace
8 { 10 {
9 int GenerateRegExp(const std::u16string& source) 11 enum
10 { 12 {
11 // Note: This doesn't remove trailing wildcards, otherwise the result should 13 TYPE_OTHER = 0x1,
12 // be identical to Filter.toRegExp(). 14 TYPE_SCRIPT = 0x2,
13 std::u16string result; 15 TYPE_IMAGE = 0x4,
14 char16_t prevChar = u'*'; 16 TYPE_STYLESHEET = 0x8,
15 for (size_t i = 0, l = source.length(); i < l; ++i) 17 TYPE_OBJECT = 0x10,
16 { 18 TYPE_SUBDOCUMENT = 0x20,
17 char16_t currChar = source[i]; 19 TYPE_DOCUMENT = 0x40,
18 switch (currChar) 20 TYPE_PING = 0x400,
21 TYPE_XMLHTTPREQUEST = 0x800,
22 TYPE_OBJECT_SUBREQUEST = 0x1000,
23 TYPE_MEDIA = 0x4000,
24 TYPE_FONT = 0x8000,
25 TYPE_POPUP = 0x8000000,
26 TYPE_GENERICBLOCK = 0x10000000,
27 TYPE_GENERICHIDE = 0x20000000,
28 TYPE_ELEMHIDE = 0x40000000,
29 };
30
31 StringMap<int> typeMap {
32 {u"other"_str, TYPE_OTHER},
33 {u"script"_str, TYPE_SCRIPT},
34 {u"image"_str, TYPE_IMAGE},
35 {u"stylesheet"_str, TYPE_STYLESHEET},
36 {u"object"_str, TYPE_OBJECT},
37 {u"subdocument"_str, TYPE_SUBDOCUMENT},
38 {u"document"_str, TYPE_DOCUMENT},
39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat
40 {u"ping"_str, TYPE_PING},
41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST},
42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST},
43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat
44 {u"media"_str, TYPE_MEDIA},
45 {u"font"_str, TYPE_FONT},
46 {u"background"_str, TYPE_IMAGE}, // Backwards compat
47
48 {u"popup"_str, TYPE_POPUP},
49 {u"genericblock"_str, TYPE_GENERICBLOCK},
50 {u"generichide"_str, TYPE_GENERICHIDE},
51 {u"elemhide"_str, TYPE_ELEMHIDE},
52 };
53
54 const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE |
55 TYPE_POPUP | TYPE_GENERICBLOCK | TYPE_GENERICHIDE);
56
57 int GenerateRegExp(const String& regexp, bool matchCase)
58 {
59 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase);
60 }
61
62 void NormalizeWhitespace(DependentString& text)
63 {
64 // We want to remove all spaces but bail out early in the common scenario
65 // that the string contains no spaces.
66
67 // Look for the first space
68 String::size_type len = text.length();
69 String::size_type pos;
70 for (pos = 0; pos < len; pos++)
71 if (text[pos] == ' ')
72 break;
73
74 if (pos >= len)
75 return;
76
77 // Found spaces, move characters to remove them
78 String::size_type delta = 1;
79 for (pos = pos + 1; pos < len; pos++)
80 {
81 if (text[pos] == ' ')
82 delta++;
83 else
84 text[pos - delta] = text[pos];
85 }
86 text.reset(text, 0, len - delta);
87 }
88
89 void ParseOption(String& text, DependentString& error, RegExpFilterData& data,
90 int optionStart, int optionEnd, int valueStart, int valueEnd)
91 {
92 if (optionEnd <= optionStart)
93 return;
94
95 bool reverse = false;
96 if (text[optionStart] == u'~')
97 {
98 reverse = true;
99 optionStart++;
100 }
101
102 DependentString name(text, optionStart, optionEnd - optionStart);
103 for (size_t i = 0; i < name.length(); ++i)
104 {
105 char16_t currChar = name[i];
106 if (currChar >= u'A' && currChar <= u'Z')
107 name[i] = currChar + u'a' - u'A';
108 else if (currChar == u'_')
109 name[i] = u'-';
110 }
111
112 auto it = typeMap.find(name);
113 if (it)
114 {
115 if (data.mContentType < 0)
116 data.mContentType = reverse ? defaultTypeMask : 0;
117 if (reverse)
118 data.mContentType &= ~it->second;
119 else
120 data.mContentType |= it->second;
121 }
122 else if (name.equals(u"domain"_str))
123 {
124 if (valueStart >= 0 && valueEnd > valueStart)
19 { 125 {
20 case u'*': 126 data.mDomainsStart = valueStart;
21 if (prevChar != u'*') 127 data.mDomainsEnd = valueEnd;
22 result += u".*"; 128 DependentString(text, valueStart, valueEnd - valueStart).toLower();
129 }
130 }
131 else if (name.equals(u"sitekey"_str))
132 {
133 if (valueStart >= 0 && valueEnd > valueStart)
134 {
135 data.mSitekeysStart = valueStart;
136 data.mSitekeysEnd = valueEnd;
137 }
138 }
139 else if (name.equals(u"match-case"_str))
140 data.mMatchCase = !reverse;
141 else if (name.equals(u"third-party"_str))
142 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES;
143 else if (name.equals(u"collapse"_str))
144 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES;
145 else
146 error.reset(u"filter_unknown_option"_str);
147 }
148
149 void ParseOptions(String& text, DependentString& error, RegExpFilterData& data,
150 String::size_type optionsStart)
151 {
152 data.mMatchCase = false;
153 data.mThirdParty = TrippleState::ANY;
154 data.mCollapse = TrippleState::ANY;
155 data.mDomainsStart = String::npos;
156 data.mSitekeysStart = String::npos;
157 if (optionsStart >= text.length())
158 {
159 data.mContentType = defaultTypeMask;
160 return;
161 }
162
163 data.mContentType = -1;
164
165 int optionStart = data.mPatternEnd + 1;
166 int optionEnd = -1;
167 int valueStart = -1;
168
169 StringScanner scanner(text, optionStart, u',');
170 bool done = false;
171 while (!done)
172 {
173 done = scanner.done();
174 switch (scanner.next())
175 {
176 case u'=':
177 if (optionEnd < 0)
178 {
179 optionEnd = scanner.position();
180 valueStart = optionEnd + 1;
181 }
23 break; 182 break;
24 case u'^': 183 case u',':
25 result += u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)"; 184 if (optionEnd < 0)
185 optionEnd = scanner.position();
186 ParseOption(text, error, data, optionStart, optionEnd, valueStart,
187 scanner.position());
188 if (!error.empty())
189 return;
190
191 optionStart = scanner.position() + 1;
192 optionEnd = -1;
193 valueStart = -1;
26 break; 194 break;
27 case u'|': 195 }
28 if (i == 0) 196 }
197
198 if (data.mContentType < 0)
199 data.mContentType = defaultTypeMask;
200 }
201 }
202
203 RegExpFilter::RegExpFilter(Type type, const String& text, const RegExpFilterData& data)
204 : ActiveFilter(type, text, true), mData(data)
205 {
206 }
207
208 RegExpFilter::~RegExpFilter()
209 {
210 if (mData.HasRegExp())
211 EM_ASM_ARGS(regexps.delete($0), mData.mRegexpId);
212 }
213
214 Filter::Type RegExpFilter::Parse(DependentString& text, DependentString& error,
215 RegExpFilterData& data)
216 {
217 NormalizeWhitespace(text);
218
219 Filter::Type type = Type::BLOCKING;
220
221 data.mPatternStart = 0;
222 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@')
223 {
224 type = Type::WHITELIST;
225 data.mPatternStart = 2;
226 }
227
228 data.mPatternEnd = text.find(u'$', data.mPatternStart);
229 if (data.mPatternEnd == text.npos)
230 data.mPatternEnd = text.length();
231
232 ParseOptions(text, error, data, data.mPatternEnd + 1);
233 if (!error.empty())
234 return Type::INVALID;
235
236 if (data.mPatternEnd - data.mPatternStart >= 2 &&
237 text[data.mPatternStart] == u'/' &&
238 text[data.mPatternEnd - 1] == u'/')
239 {
240 data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1,
241 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase));
242 if (data.mRegexpId == -1)
243 {
244 error.reset(u"filter_invalid_regexp"_str);
245 return Type::INVALID;
246 }
247 }
248
249 return type;
250 }
251
252 void RegExpFilter::ParseSitekeys(const String& sitekeys) const
253 {
254 StringScanner scanner(sitekeys, 0, u'|');
255 size_t start = 0;
256 bool done = false;
257 while (!done)
258 {
259 done = scanner.done();
260 if (scanner.next() == u'|')
261 {
262 if (scanner.position() > start)
263 AddSitekey(DependentString(sitekeys, start, scanner.position() - start));
264 start = scanner.position() + 1;
265 }
266 }
267 }
268
269 void RegExpFilter::InitJSTypes()
270 {
271 EM_ASM(exports.RegExpFilter.typeMap = {};);
272 for (auto it = typeMap.begin(); it != typeMap.end(); ++it)
273 EM_ASM_ARGS(exports.RegExpFilter.typeMap[readString($0).replace("-", "_").toUpperCase()] = $1, &(it->first), it->second);
274 }
275
276 OwnedString RegExpFilter::RegExpFromSource(const String& source)
277 {
278 /* TODO: this is very inefficient */
279
280 // Note: This doesn't remove trailing wildcards, otherwise the result should
281 // be identical to Filter.toRegExp().
282 OwnedString result;
283 String::value_type prevChar = u'*';
284 for (String::size_type i = 0; i < source.length(); ++i)
285 {
286 String::value_type currChar = source[i];
287 switch (currChar)
288 {
289 case u'*':
290 if (prevChar != u'*')
291 result.append(u".*"_str);
292 break;
293 case u'^':
294 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)"_str);
295 break;
296 case u'|':
297 if (i == 0)
298 {
299 // Anchor at expression start, maybe extended anchor?
300 if (i + 1 < source.length() && source[i + 1] == u'|')
29 { 301 {
30 // Anchor at expression start, maybe extended anchor? 302 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str);
31 if (i + 1 < l && source[i + 1] == u'|') 303 ++i;
32 {
33 result += u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?";
34 ++i;
35 }
36 else
37 result += u"^";
38 }
39 else if (i == l - 1)
40 {
41 // Anchor at expression end, ignore if following separator placeholder
42 if (prevChar != u'^')
43 result += u"$";
44 } 304 }
45 else 305 else
46 { 306 result.append(u'^');
47 // Not actually an anchor, escape it 307 }
48 result += u"\\|"; 308 else if (i == source.length() - 1)
49 } 309 {
50 break; 310 // Anchor at expression end, ignore if following separator placeholder
51 default: 311 if (prevChar != u'^')
52 if ((currChar >= u'a' && currChar <= u'z') || 312 result.append(u'$');
53 (currChar >= u'A' && currChar <= u'Z') || 313 }
54 (currChar >= u'0' && currChar <= u'9') || 314 else
55 currChar >= 128) 315 {
56 { 316 // Not actually an anchor, escape it
57 result += currChar; 317 result.append(u"\\|"_str);
58 } 318 }
59 else 319 break;
60 { 320 default:
61 result += u"\\"; 321 if (!(currChar >= u'a' && currChar <= u'z') &&
62 result.append(1, currChar); 322 !(currChar >= u'A' && currChar <= u'Z') &&
63 } 323 !(currChar >= u'0' && currChar <= u'9') &&
64 } 324 currChar < 128)
65 prevChar = currChar; 325 {
66 } 326 result.append(u'\\');
67 return EM_ASM_INT(return regexps.create($0, $1), &result, false); 327 }
68 } 328 result.append(currChar);
69 } 329 }
70 330 prevChar = currChar;
71 RegExpFilter::RegExpFilter(const std::u16string& text, 331 }
72 const std::u16string& pattern, const std::u16string& options) 332 return result;
73 : ActiveFilter(text), regexpId(0) 333 }
74 { 334
75 size_t len = pattern.length(); 335 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const
76 if (len >= 2 && pattern[0] == u'/' && pattern[len - 1] == u'/') 336 {
77 { 337 if (!mData.DomainsParsingDone())
78 std::u16string param = pattern.substr(1, len - 2); 338 {
79 regexpId = EM_ASM_INT(return regexps.create($0, $1), &param, false); 339 ParseDomains(mData.GetDomainsSource(mText), u'|');
80 340 mData.SetDomainsParsingDone();
81 std::u16string* error = reinterpret_cast<std::u16string*>(EM_ASM_INT(return regexps.getError($0), regexpId)); 341 }
82 if (error) 342 return ActiveFilter::GetDomains();
83 { 343 }
84 EM_ASM_ARGS(regexps.delete($0), regexpId); 344
85 throw std::u16string(*error); 345 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const
86 } 346 {
87 } 347 if (!mData.SitekeyParsingDone())
88 else 348 {
89 regexpSource = pattern; 349 ParseSitekeys(mData.GetSitekeysSource(mText));
90 } 350 mData.SetSitekeysParsingDone();
91 351 }
92 RegExpFilter::~RegExpFilter() 352 return ActiveFilter::GetSitekeys();
93 { 353 }
94 if (regexpId) 354
95 EM_ASM_ARGS(regexps.delete($0), regexpId); 355 bool RegExpFilter::Matches(const String& location, int typeMask,
96 } 356 DependentString& docDomain, bool thirdParty, const String& sitekey) const
97 357 {
98 Filter* RegExpFilter::Create(const std::u16string& text) 358 if (!(mData.mContentType & typeMask) ||
99 { 359 (mData.mThirdParty == TrippleState::YES && !thirdParty) ||
100 bool blocking = true; 360 (mData.mThirdParty == TrippleState::NO && thirdParty) ||
101 size_t patternStart = 0; 361 !IsActiveOnDomain(docDomain, sitekey))
102 if (!text.compare(0, 2, u"@@")) 362 {
103 { 363 return false;
104 blocking = false; 364 }
105 patternStart = 2; 365
106 } 366 if (!mData.RegExpParsingDone())
107 367 {
108 size_t patternEnd = text.find(u'$', patternStart); 368 const OwnedString pattern(mData.GetRegExpSource(mText));
109 size_t patternLength = (patternEnd != std::u16string::npos ? 369 mData.SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mData.mMatchCase)) ;
110 patternEnd - patternStart : patternEnd); 370 }
111 std::u16string pattern(text.substr(patternStart, patternLength)); 371 return EM_ASM_INT(return regexps.test($0, $1), mData.mRegexpId, &location);
112 std::u16string options(patternEnd != std::u16string::npos ? 372 }
113 text.substr(patternEnd) : u"");
114
115 try
116 {
117 if (blocking)
118 return new RegExpFilter(text, pattern, options);
119 else
120 return new WhiteListFilter(text, pattern, options);
121 }
122 catch (const std::u16string& reason)
123 {
124 return new InvalidFilter(text, reason);
125 }
126 }
127
128 Filter::Type RegExpFilter::GetType() const
129 {
130 return Type::BLOCKING;
131 }
132
133 bool RegExpFilter::Matches(const std::u16string& location)
134 {
135 if (!regexpId)
136 {
137 regexpId = GenerateRegExp(regexpSource);
138 regexpSource.resize(0);
139 }
140 return EM_ASM_INT(return regexps.test($0, $1), regexpId, &location);
141 }
LEFTRIGHT

Powered by Google App Engine
This is Rietveld