Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: compiled/RegExpFilter.cpp

Issue 29333474: Issue 4125 - [emscripten] Convert filter classes to C++ (Closed)
Left Patch Set: Minor improvements Created Jan. 20, 2016, 2:41 p.m.
Right Patch Set: Addressed comments from Patch Set 28 Created March 21, 2017, 10:04 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « compiled/RegExpFilter.h ('k') | compiled/String.h » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 #include <climits> 1 #include <climits>
2 #include <unordered_map>
3 2
4 #include <emscripten.h> 3 #include <emscripten.h>
5 4
6 #include "RegExpFilter.h" 5 #include "RegExpFilter.h"
7 #include "WhiteListFilter.h"
8 #include "InvalidFilter.h"
9 #include "StringScanner.h" 6 #include "StringScanner.h"
7 #include "StringMap.h"
10 8
11 namespace 9 namespace
12 { 10 {
13 enum 11 enum
14 { 12 {
15 TYPE_OTHER = 0x1, 13 TYPE_OTHER = 0x1,
16 TYPE_SCRIPT = 0x2, 14 TYPE_SCRIPT = 0x2,
17 TYPE_IMAGE = 0x4, 15 TYPE_IMAGE = 0x4,
18 TYPE_STYLESHEET = 0x8, 16 TYPE_STYLESHEET = 0x8,
19 TYPE_OBJECT = 0x10, 17 TYPE_OBJECT = 0x10,
20 TYPE_SUBDOCUMENT = 0x20, 18 TYPE_SUBDOCUMENT = 0x20,
21 TYPE_DOCUMENT = 0x40, 19 TYPE_DOCUMENT = 0x40,
22 TYPE_PING = 0x400, 20 TYPE_PING = 0x400,
23 TYPE_XMLHTTPREQUEST = 0x800, 21 TYPE_XMLHTTPREQUEST = 0x800,
24 TYPE_OBJECT_SUBREQUEST = 0x1000, 22 TYPE_OBJECT_SUBREQUEST = 0x1000,
25 TYPE_MEDIA = 0x4000, 23 TYPE_MEDIA = 0x4000,
26 TYPE_FONT = 0x8000, 24 TYPE_FONT = 0x8000,
27 TYPE_POPUP = 0x8000000, 25 TYPE_POPUP = 0x8000000,
28 TYPE_GENERICBLOCK = 0x10000000, 26 TYPE_GENERICBLOCK = 0x10000000,
29 TYPE_GENERICHIDE = 0x20000000, 27 TYPE_GENERICHIDE = 0x20000000,
30 TYPE_ELEMHIDE = 0x40000000, 28 TYPE_ELEMHIDE = 0x40000000,
31 }; 29 };
32 30
33 std::unordered_map<std::u16string,int> typeMap({ 31 StringMap<int> typeMap {
34 {u"OTHER", TYPE_OTHER}, 32 {u"other"_str, TYPE_OTHER},
35 {u"SCRIPT", TYPE_SCRIPT}, 33 {u"script"_str, TYPE_SCRIPT},
36 {u"IMAGE", TYPE_IMAGE}, 34 {u"image"_str, TYPE_IMAGE},
37 {u"STYLESHEET", TYPE_STYLESHEET}, 35 {u"stylesheet"_str, TYPE_STYLESHEET},
38 {u"OBJECT", TYPE_OBJECT}, 36 {u"object"_str, TYPE_OBJECT},
39 {u"SUBDOCUMENT", TYPE_SUBDOCUMENT}, 37 {u"subdocument"_str, TYPE_SUBDOCUMENT},
40 {u"DOCUMENT", TYPE_DOCUMENT}, 38 {u"document"_str, TYPE_DOCUMENT},
41 {u"XBL", TYPE_OTHER}, // Backwards compat 39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat
42 {u"PING", TYPE_PING}, 40 {u"ping"_str, TYPE_PING},
43 {u"XMLHTTPREQUEST", TYPE_XMLHTTPREQUEST}, 41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST},
44 {u"OBJECT_SUBREQUEST", TYPE_OBJECT_SUBREQUEST}, 42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST},
45 {u"DTD", TYPE_OTHER}, // Backwards compat 43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat
46 {u"MEDIA", TYPE_MEDIA}, 44 {u"media"_str, TYPE_MEDIA},
47 {u"FONT", TYPE_FONT}, 45 {u"font"_str, TYPE_FONT},
48 {u"BACKGROUND", TYPE_IMAGE}, // Backwards compat 46 {u"background"_str, TYPE_IMAGE}, // Backwards compat
49 47
50 {u"POPUP", TYPE_POPUP}, 48 {u"popup"_str, TYPE_POPUP},
51 {u"GENERICBLOCK", TYPE_GENERICBLOCK}, 49 {u"genericblock"_str, TYPE_GENERICBLOCK},
52 {u"GENERICHIDE", TYPE_GENERICHIDE}, 50 {u"generichide"_str, TYPE_GENERICHIDE},
53 {u"ELEMHIDE", TYPE_ELEMHIDE}, 51 {u"elemhide"_str, TYPE_ELEMHIDE},
54 }); 52 };
55 53
56 int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP | 54 const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE |
57 TYPE_GENERICBLOCK | TYPE_GENERICHIDE); 55 TYPE_POPUP | TYPE_GENERICBLOCK | TYPE_GENERICHIDE);
58 56
59 int GenerateRegExp(const std::u16string& regexp, bool matchCase) 57 int GenerateRegExp(const String& regexp, bool matchCase)
60 { 58 {
61 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase); 59 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase);
62 } 60 }
63 } 61
64 62 void NormalizeWhitespace(DependentString& text)
65 RegExpFilter::RegExpFilter(const std::u16string& text, 63 {
66 const std::u16string& pattern, const std::u16string& options) 64 // We want to remove all spaces but bail out early in the common scenario
67 : ActiveFilter(text, true), regexpId(0), contentType(-1), matchCase(false), 65 // that the string contains no spaces.
68 thirdParty(TrippleState::ANY) 66
69 { 67 // Look for the first space
70 int optionStart = 0; 68 String::size_type len = text.length();
71 int optionEnd = -1; 69 String::size_type pos;
72 int valueStart = -1; 70 for (pos = 0; pos < len; pos++)
73 StringScanner scanner(options + u","); 71 if (text[pos] == ' ')
74 while (!scanner.done())
75 {
76 switch (scanner.next())
77 {
78 case u'=':
79 if (optionEnd < 0)
80 {
81 optionEnd = scanner.position();
82 valueStart = optionEnd + 1;
83 }
84 break; 72 break;
85 case u',': 73
86 if (optionEnd < 0) 74 if (pos >= len)
87 optionEnd = scanner.position(); 75 return;
88 ProcessOption(options, optionStart, optionEnd, valueStart, scanner.position()); 76
89 optionStart = scanner.position() + 1; 77 // Found spaces, move characters to remove them
90 optionEnd = -1; 78 String::size_type delta = 1;
91 valueStart = -1; 79 for (pos = pos + 1; pos < len; pos++)
92 break; 80 {
93 } 81 if (text[pos] == ' ')
94 } 82 delta++;
95 if (contentType < 0) 83 else
96 contentType = defaultTypeMask; 84 text[pos - delta] = text[pos];
97 85 }
98 size_t len = pattern.length(); 86 text.reset(text, 0, len - delta);
99 if (len >= 2 && pattern[0] == u'/' && pattern[len - 1] == u'/') 87 }
100 { 88
101 regexpId = GenerateRegExp(pattern.substr(1, len - 2), matchCase); 89 void ParseOption(String& text, DependentString& error, RegExpFilterData& data,
102 90 int optionStart, int optionEnd, int valueStart, int valueEnd)
103 std::u16string* error = reinterpret_cast<std::u16string*>(EM_ASM_INT(return regexps.getError($0), regexpId)); 91 {
104 if (error) 92 if (optionEnd <= optionStart)
105 { 93 return;
106 EM_ASM_ARGS(regexps.delete($0), regexpId); 94
107 throw std::u16string(*error); 95 bool reverse = false;
108 } 96 if (text[optionStart] == u'~')
109 } 97 {
110 else 98 reverse = true;
111 regexpSource = pattern; 99 optionStart++;
100 }
101
102 DependentString name(text, optionStart, optionEnd - optionStart);
103 for (size_t i = 0; i < name.length(); ++i)
104 {
105 char16_t currChar = name[i];
106 if (currChar >= u'A' && currChar <= u'Z')
107 name[i] = currChar + u'a' - u'A';
108 else if (currChar == u'_')
109 name[i] = u'-';
110 }
111
112 auto it = typeMap.find(name);
113 if (it)
114 {
115 if (data.mContentType < 0)
116 data.mContentType = reverse ? defaultTypeMask : 0;
117 if (reverse)
118 data.mContentType &= ~it->second;
119 else
120 data.mContentType |= it->second;
121 }
122 else if (name.equals(u"domain"_str))
123 {
124 if (valueStart >= 0 && valueEnd > valueStart)
125 {
126 data.mDomainsStart = valueStart;
127 data.mDomainsEnd = valueEnd;
128 DependentString(text, valueStart, valueEnd - valueStart).toLower();
129 }
130 }
131 else if (name.equals(u"sitekey"_str))
132 {
133 if (valueStart >= 0 && valueEnd > valueStart)
134 {
135 data.mSitekeysStart = valueStart;
136 data.mSitekeysEnd = valueEnd;
137 }
138 }
139 else if (name.equals(u"match-case"_str))
140 data.mMatchCase = !reverse;
141 else if (name.equals(u"third-party"_str))
142 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES;
143 else if (name.equals(u"collapse"_str))
144 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES;
145 else
146 error.reset(u"filter_unknown_option"_str);
147 }
148
149 void ParseOptions(String& text, DependentString& error, RegExpFilterData& data,
150 String::size_type optionsStart)
151 {
152 data.mMatchCase = false;
153 data.mThirdParty = TrippleState::ANY;
154 data.mCollapse = TrippleState::ANY;
155 data.mDomainsStart = String::npos;
156 data.mSitekeysStart = String::npos;
157 if (optionsStart >= text.length())
158 {
159 data.mContentType = defaultTypeMask;
160 return;
161 }
162
163 data.mContentType = -1;
164
165 int optionStart = data.mPatternEnd + 1;
166 int optionEnd = -1;
167 int valueStart = -1;
168
169 StringScanner scanner(text, optionStart, u',');
170 bool done = false;
171 while (!done)
172 {
173 done = scanner.done();
174 switch (scanner.next())
175 {
176 case u'=':
177 if (optionEnd < 0)
178 {
179 optionEnd = scanner.position();
180 valueStart = optionEnd + 1;
181 }
182 break;
183 case u',':
184 if (optionEnd < 0)
185 optionEnd = scanner.position();
186 ParseOption(text, error, data, optionStart, optionEnd, valueStart,
187 scanner.position());
188 if (!error.empty())
189 return;
190
191 optionStart = scanner.position() + 1;
192 optionEnd = -1;
193 valueStart = -1;
194 break;
195 }
196 }
197
198 if (data.mContentType < 0)
199 data.mContentType = defaultTypeMask;
200 }
201 }
202
203 RegExpFilter::RegExpFilter(Type type, const String& text, const RegExpFilterData& data)
204 : ActiveFilter(type, text, true), mData(data)
205 {
112 } 206 }
113 207
114 RegExpFilter::~RegExpFilter() 208 RegExpFilter::~RegExpFilter()
115 { 209 {
116 if (regexpId) 210 if (mData.HasRegExp())
117 EM_ASM_ARGS(regexps.delete($0), regexpId); 211 EM_ASM_ARGS(regexps.delete($0), mData.mRegexpId);
118 } 212 }
119 213
120 void RegExpFilter::ProcessOption(const std::u16string& options, 214 Filter::Type RegExpFilter::Parse(DependentString& text, DependentString& error,
121 int optionStart, int optionEnd, int valueStart, int valueEnd) 215 RegExpFilterData& data)
122 { 216 {
123 if (optionEnd <= optionStart) 217 NormalizeWhitespace(text);
124 return; 218
125 219 Filter::Type type = Type::BLOCKING;
126 bool reverse = false; 220
127 if (options[optionStart] == u'~') 221 data.mPatternStart = 0;
128 { 222 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@')
129 reverse = true; 223 {
130 optionStart++; 224 type = Type::WHITELIST;
131 } 225 data.mPatternStart = 2;
132 226 }
133 std::u16string name(options.substr(optionStart, optionEnd - optionStart)); 227
134 for (size_t i = 0, l = name.length(); i < l; ++i) 228 data.mPatternEnd = text.find(u'$', data.mPatternStart);
135 { 229 if (data.mPatternEnd == text.npos)
136 char16_t currChar = name[i]; 230 data.mPatternEnd = text.length();
137 if (currChar >= u'a' && currChar <= u'z') 231
138 name[i] = currChar + u'A' - u'a'; 232 ParseOptions(text, error, data, data.mPatternEnd + 1);
139 else if (currChar == u'-') 233 if (!error.empty())
140 name[i] = u'_'; 234 return Type::INVALID;
141 } 235
142 236 if (data.mPatternEnd - data.mPatternStart >= 2 &&
143 auto it = typeMap.find(name); 237 text[data.mPatternStart] == u'/' &&
144 if (it != typeMap.end()) 238 text[data.mPatternEnd - 1] == u'/')
145 { 239 {
146 if (contentType < 0) 240 data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1,
147 contentType = reverse ? defaultTypeMask : 0; 241 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase));
148 if (reverse) 242 if (data.mRegexpId == -1)
149 contentType &= ~it->second; 243 {
150 else 244 error.reset(u"filter_invalid_regexp"_str);
151 contentType |= it->second; 245 return Type::INVALID;
152 } 246 }
153 else if (!name.compare(u"DOMAIN")) 247 }
154 { 248
155 if (valueStart >= 0 && valueEnd > valueStart) 249 return type;
156 ParseDomains(options.substr(valueStart, valueEnd - valueStart), u'|'); 250 }
157 } 251
158 else if (!name.compare(u"SITEKEY")) 252 void RegExpFilter::ParseSitekeys(const String& sitekeys) const
159 { 253 {
160 if (valueStart >= 0 && valueEnd > valueStart) 254 StringScanner scanner(sitekeys, 0, u'|');
161 { 255 size_t start = 0;
162 StringScanner scanner(options.substr(valueStart, valueEnd - valueStart) + u"|"); 256 bool done = false;
163 size_t start = 0; 257 while (!done)
164 while (!scanner.done()) 258 {
165 { 259 done = scanner.done();
166 if (scanner.next() == u'|') 260 if (scanner.next() == u'|')
167 { 261 {
168 if (scanner.position() > start) 262 if (scanner.position() > start)
169 sitekeys.insert(options.substr(valueStart + start, scanner.position() - start)); 263 AddSitekey(DependentString(sitekeys, start, scanner.position() - start));
170 start = scanner.position() + 1; 264 start = scanner.position() + 1;
171 } 265 }
172 }
173 }
174 }
175 else if (!name.compare(u"MATCH_CASE"))
176 matchCase = !reverse;
177 else if (!name.compare(u"THIRD_PARTY"))
178 thirdParty = reverse ? TrippleState::NO : TrippleState::YES;
179 else if (!name.compare(u"COLLAPSE"))
180 collapse = reverse ? TrippleState::NO : TrippleState::YES;
181 else
182 throw std::u16string(u"Unknown option " + name);
183 }
184
185 Filter* RegExpFilter::Create(const std::u16string& text)
186 {
187 bool blocking = true;
188 size_t patternStart = 0;
189 if (!text.compare(0, 2, u"@@"))
190 {
191 blocking = false;
192 patternStart = 2;
193 }
194
195 size_t patternEnd = text.find(u'$', patternStart);
196 size_t patternLength = (patternEnd != std::u16string::npos ?
197 patternEnd - patternStart : patternEnd);
198 std::u16string pattern(text.substr(patternStart, patternLength));
199 std::u16string options(patternEnd != std::u16string::npos ?
200 text.substr(patternEnd + 1) : u"");
201
202 try
203 {
204 if (blocking)
205 return new RegExpFilter(text, pattern, options);
206 else
207 return new WhiteListFilter(text, pattern, options);
208 }
209 catch (const std::u16string& reason)
210 {
211 return new InvalidFilter(text, reason);
212 } 266 }
213 } 267 }
214 268
215 void RegExpFilter::InitJSTypes() 269 void RegExpFilter::InitJSTypes()
216 { 270 {
271 EM_ASM(exports.RegExpFilter.typeMap = {};);
217 for (auto it = typeMap.begin(); it != typeMap.end(); ++it) 272 for (auto it = typeMap.begin(); it != typeMap.end(); ++it)
218 EM_ASM_ARGS(Module.RegExpFilter_typeMap[getStringData($0)] = $1, &(it->first), it->second); 273 EM_ASM_ARGS(exports.RegExpFilter.typeMap[readString($0).replace("-", "_").toUpperCase()] = $1, &(it->first), it->second);
219 } 274 }
220 275
221 const std::u16string RegExpFilter::RegExpFromSource(const std::u16string& source) 276 OwnedString RegExpFilter::RegExpFromSource(const String& source)
222 { 277 {
278 /* TODO: this is very inefficient */
279
223 // Note: This doesn't remove trailing wildcards, otherwise the result should 280 // Note: This doesn't remove trailing wildcards, otherwise the result should
224 // be identical to Filter.toRegExp(). 281 // be identical to Filter.toRegExp().
225 std::u16string result; 282 OwnedString result;
226 char16_t prevChar = u'*'; 283 String::value_type prevChar = u'*';
227 for (size_t i = 0, l = source.length(); i < l; ++i) 284 for (String::size_type i = 0; i < source.length(); ++i)
228 { 285 {
229 char16_t currChar = source[i]; 286 String::value_type currChar = source[i];
230 switch (currChar) 287 switch (currChar)
231 { 288 {
232 case u'*': 289 case u'*':
233 if (prevChar != u'*') 290 if (prevChar != u'*')
234 result += u".*"; 291 result.append(u".*"_str);
235 break; 292 break;
236 case u'^': 293 case u'^':
237 result += u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)"; 294 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)"_str);
238 break; 295 break;
239 case u'|': 296 case u'|':
240 if (i == 0) 297 if (i == 0)
241 { 298 {
242 // Anchor at expression start, maybe extended anchor? 299 // Anchor at expression start, maybe extended anchor?
243 if (i + 1 < l && source[i + 1] == u'|') 300 if (i + 1 < source.length() && source[i + 1] == u'|')
244 { 301 {
245 result += u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"; 302 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str);
246 ++i; 303 ++i;
247 } 304 }
248 else 305 else
249 result += u"^"; 306 result.append(u'^');
250 } 307 }
251 else if (i == l - 1) 308 else if (i == source.length() - 1)
252 { 309 {
253 // Anchor at expression end, ignore if following separator placeholder 310 // Anchor at expression end, ignore if following separator placeholder
254 if (prevChar != u'^') 311 if (prevChar != u'^')
255 result += u"$"; 312 result.append(u'$');
256 } 313 }
257 else 314 else
258 { 315 {
259 // Not actually an anchor, escape it 316 // Not actually an anchor, escape it
260 result += u"\\|"; 317 result.append(u"\\|"_str);
261 } 318 }
262 break; 319 break;
263 default: 320 default:
264 if ((currChar >= u'a' && currChar <= u'z') || 321 if (!(currChar >= u'a' && currChar <= u'z') &&
265 (currChar >= u'A' && currChar <= u'Z') || 322 !(currChar >= u'A' && currChar <= u'Z') &&
266 (currChar >= u'0' && currChar <= u'9') || 323 !(currChar >= u'0' && currChar <= u'9') &&
267 currChar >= 128) 324 currChar < 128)
268 { 325 {
269 result += currChar; 326 result.append(u'\\');
270 } 327 }
271 else 328 result.append(currChar);
272 {
273 result += u"\\";
274 result.append(1, currChar);
275 }
276 } 329 }
277 prevChar = currChar; 330 prevChar = currChar;
278 } 331 }
279 return result; 332 return result;
280 } 333 }
281 334
282 Filter::Type RegExpFilter::GetType() const 335 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const
283 { 336 {
284 return Type::BLOCKING; 337 if (!mData.DomainsParsingDone())
285 } 338 {
286 339 ParseDomains(mData.GetDomainsSource(mText), u'|');
287 bool RegExpFilter::Matches(const std::u16string& location, int typeMask, 340 mData.SetDomainsParsingDone();
288 const std::u16string& docDomain, bool thirdParty, 341 }
289 const std::u16string& sitekey) 342 return ActiveFilter::GetDomains();
290 { 343 }
291 if (!(this->contentType & typeMask) || 344
292 (this->thirdParty == TrippleState::YES && !thirdParty) || 345 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const
293 (this->thirdParty == TrippleState::NO && thirdParty) || 346 {
347 if (!mData.SitekeyParsingDone())
348 {
349 ParseSitekeys(mData.GetSitekeysSource(mText));
350 mData.SetSitekeysParsingDone();
351 }
352 return ActiveFilter::GetSitekeys();
353 }
354
355 bool RegExpFilter::Matches(const String& location, int typeMask,
356 DependentString& docDomain, bool thirdParty, const String& sitekey) const
357 {
358 if (!(mData.mContentType & typeMask) ||
359 (mData.mThirdParty == TrippleState::YES && !thirdParty) ||
360 (mData.mThirdParty == TrippleState::NO && thirdParty) ||
294 !IsActiveOnDomain(docDomain, sitekey)) 361 !IsActiveOnDomain(docDomain, sitekey))
295 { 362 {
296 return false; 363 return false;
297 } 364 }
298 365
299 if (!regexpId) 366 if (!mData.RegExpParsingDone())
300 { 367 {
301 regexpId = GenerateRegExp(RegExpFromSource(regexpSource), matchCase); 368 const OwnedString pattern(mData.GetRegExpSource(mText));
302 regexpSource.resize(0); 369 mData.SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mData.mMatchCase));
303 } 370 }
304 return EM_ASM_INT(return regexps.test($0, $1), regexpId, &location); 371 return EM_ASM_INT(return regexps.test($0, $1), mData.mRegexpId, &location);
305 } 372 }
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld