compiled/RegExpFilter.cpp - Issue 29333474: Issue 4125 - [emscripten] Convert filter classes to C++

Delta Between Two Patch Sets: compiled/RegExpFilter.cpp

Issue 29333474: Issue 4125 - [emscripten] Convert filter classes to C++ (Closed)

Left Patch Set: Reworked JS binding generation Created Feb. 1, 2016, 9:14 p.m.

Right Patch Set: Addressed comments from Patch Set 28 Created March 21, 2017, 10:04 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 #include <climits>	1 #include <climits>

2	2

3 #include <emscripten.h>	3 #include <emscripten.h>

4	4

5 #include "RegExpFilter.h"	5 #include "RegExpFilter.h"

6 #include "WhitelistFilter.h"

7 #include "InvalidFilter.h"

8 #include "StringScanner.h"	6 #include "StringScanner.h"

9 #include "StringMap.h"	7 #include "StringMap.h"

10	8

11 namespace	9 namespace

12 {	10 {

13 enum	11 enum

14 {	12 {

15 TYPE_OTHER = 0x1,	13 TYPE_OTHER = 0x1,

16 TYPE_SCRIPT = 0x2,	14 TYPE_SCRIPT = 0x2,

17 TYPE_IMAGE = 0x4,	15 TYPE_IMAGE = 0x4,

(...skipping 28 matching lines...) Expand all Loading...
46 {u"media"_str, TYPE_MEDIA},	44 {u"media"_str, TYPE_MEDIA},

47 {u"font"_str, TYPE_FONT},	45 {u"font"_str, TYPE_FONT},

48 {u"background"_str, TYPE_IMAGE}, // Backwards compat	46 {u"background"_str, TYPE_IMAGE}, // Backwards compat

49	47

50 {u"popup"_str, TYPE_POPUP},	48 {u"popup"_str, TYPE_POPUP},

51 {u"genericblock"_str, TYPE_GENERICBLOCK},	49 {u"genericblock"_str, TYPE_GENERICBLOCK},

52 {u"generichide"_str, TYPE_GENERICHIDE},	50 {u"generichide"_str, TYPE_GENERICHIDE},

53 {u"elemhide"_str, TYPE_ELEMHIDE},	51 {u"elemhide"_str, TYPE_ELEMHIDE},

54 };	52 };

55	53

56 int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP |	54 const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE |

57 TYPE_GENERICBLOCK | TYPE_GENERICHIDE);	55 TYPE_POPUP | TYPE_GENERICBLOCK | TYPE_GENERICHIDE);

58	56

59 int GenerateRegExp(const String& regexp, bool matchCase)	57 int GenerateRegExp(const String& regexp, bool matchCase)

60 {	58 {

61 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase);	59 return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase);

62 }	60 }

63 }	61

64	62 void NormalizeWhitespace(DependentString& text)

65 RegExpFilter::RegExpFilter(const String& text,	63 {

66 String::size_type patternStart, String::size_type patternEnd)	64 // We want to remove all spaces but bail out early in the common scenario

67 : ActiveFilter(text, true), mRegexpId(0),	65 // that the string contains no spaces.

68 mRegexpSource(String(mText, patternStart, patternEnd - patternStart)),	66

69 mContentType(-1), mMatchCase(false), mThirdParty(TrippleState::ANY)	67 // Look for the first space

70 {	68 String::size_type len = text.length();

71 String options(mText, patternEnd + 1);	69 String::size_type pos;

72 StringScanner scanner(options, u',');	70 for (pos = 0; pos < len; pos++)

73 int optionStart = 0;	71 if (text[pos] == ' ')

74 int optionEnd = -1;	72 break;

75 int valueStart = -1;	73

	74 if (pos >= len)

	75 return;

	76

	77 // Found spaces, move characters to remove them

	78 String::size_type delta = 1;

	79 for (pos = pos + 1; pos < len; pos++)

	80 {

	81 if (text[pos] == ' ')

	82 delta++;

	83 else

	84 text[pos - delta] = text[pos];

	85 }

	86 text.reset(text, 0, len - delta);

	87 }

	88

	89 void ParseOption(String& text, DependentString& error, RegExpFilterData& data,

	90 int optionStart, int optionEnd, int valueStart, int valueEnd)

	91 {

	92 if (optionEnd <= optionStart)

	93 return;

	94

	95 bool reverse = false;

	96 if (text[optionStart] == u'~')

	97 {

	98 reverse = true;

	99 optionStart++;

	100 }

	101

	102 DependentString name(text, optionStart, optionEnd - optionStart);

	103 for (size_t i = 0; i < name.length(); ++i)

	104 {

	105 char16_t currChar = name[i];

	106 if (currChar >= u'A' && currChar <= u'Z')

	107 name[i] = currChar + u'a' - u'A';

	108 else if (currChar == u'_')

	109 name[i] = u'-';

	110 }

	111

	112 auto it = typeMap.find(name);

	113 if (it)

	114 {

	115 if (data.mContentType < 0)

	116 data.mContentType = reverse ? defaultTypeMask : 0;

	117 if (reverse)

	118 data.mContentType &= ~it->second;

	119 else

	120 data.mContentType |= it->second;

	121 }

	122 else if (name.equals(u"domain"_str))

	123 {

	124 if (valueStart >= 0 && valueEnd > valueStart)

	125 {

	126 data.mDomainsStart = valueStart;

	127 data.mDomainsEnd = valueEnd;

	128 DependentString(text, valueStart, valueEnd - valueStart).toLower();

	129 }

	130 }

	131 else if (name.equals(u"sitekey"_str))

	132 {

	133 if (valueStart >= 0 && valueEnd > valueStart)

	134 {

	135 data.mSitekeysStart = valueStart;

	136 data.mSitekeysEnd = valueEnd;

	137 }

	138 }

	139 else if (name.equals(u"match-case"_str))

	140 data.mMatchCase = !reverse;

	141 else if (name.equals(u"third-party"_str))

	142 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES;

	143 else if (name.equals(u"collapse"_str))

	144 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES;

	145 else

	146 error.reset(u"filter_unknown_option"_str);

	147 }

	148

	149 void ParseOptions(String& text, DependentString& error, RegExpFilterData& data,

	150 String::size_type optionsStart)

	151 {

	152 data.mMatchCase = false;

	153 data.mThirdParty = TrippleState::ANY;

	154 data.mCollapse = TrippleState::ANY;

	155 data.mDomainsStart = String::npos;

	156 data.mSitekeysStart = String::npos;

	157 if (optionsStart >= text.length())

	158 {

	159 data.mContentType = defaultTypeMask;

	160 return;

	161 }

	162

	163 data.mContentType = -1;

	164

	165 int optionStart = data.mPatternEnd + 1;

	166 int optionEnd = -1;

	167 int valueStart = -1;

	168

	169 StringScanner scanner(text, optionStart, u',');

	170 bool done = false;

	171 while (!done)

	172 {

	173 done = scanner.done();

	174 switch (scanner.next())

	175 {

	176 case u'=':

	177 if (optionEnd < 0)

	178 {

	179 optionEnd = scanner.position();

	180 valueStart = optionEnd + 1;

	181 }

	182 break;

	183 case u',':

	184 if (optionEnd < 0)

	185 optionEnd = scanner.position();

	186 ParseOption(text, error, data, optionStart, optionEnd, valueStart,

	187 scanner.position());

	188 if (!error.empty())

	189 return;

	190

	191 optionStart = scanner.position() + 1;

	192 optionEnd = -1;

	193 valueStart = -1;

	194 break;

	195 }

	196 }

	197

	198 if (data.mContentType < 0)

	199 data.mContentType = defaultTypeMask;

	200 }

	201 }

	202

	203 RegExpFilter::RegExpFilter(Type type, const String& text, const RegExpFilterData& data)

	204 : ActiveFilter(type, text, true), mData(data)

	205 {

	206 }

	207

	208 RegExpFilter::~RegExpFilter()

	209 {

	210 if (mData.HasRegExp())

	211 EM_ASM_ARGS(regexps.delete($0), mData.mRegexpId);

	212 }

	213

	214 Filter::Type RegExpFilter::Parse(DependentString& text, DependentString& error,

	215 RegExpFilterData& data)

	216 {

	217 NormalizeWhitespace(text);

	218

	219 Filter::Type type = Type::BLOCKING;

	220

	221 data.mPatternStart = 0;

	222 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@')

	223 {

	224 type = Type::WHITELIST;

	225 data.mPatternStart = 2;

	226 }

	227

	228 data.mPatternEnd = text.find(u'$', data.mPatternStart);

	229 if (data.mPatternEnd == text.npos)

	230 data.mPatternEnd = text.length();

	231

	232 ParseOptions(text, error, data, data.mPatternEnd + 1);

	233 if (!error.empty())

	234 return Type::INVALID;

	235

	236 if (data.mPatternEnd - data.mPatternStart >= 2 &&

	237 text[data.mPatternStart] == u'/' &&

	238 text[data.mPatternEnd - 1] == u'/')

	239 {

	240 data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1,

	241 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase));

	242 if (data.mRegexpId == -1)

	243 {

	244 error.reset(u"filter_invalid_regexp"_str);

	245 return Type::INVALID;

	246 }

	247 }

	248

	249 return type;

	250 }

	251

	252 void RegExpFilter::ParseSitekeys(const String& sitekeys) const

	253 {

	254 StringScanner scanner(sitekeys, 0, u'|');

	255 size_t start = 0;

76 bool done = false;	256 bool done = false;

77 while (!done)	257 while (!done)

78 {	258 {

79 done = scanner.done();	259 done = scanner.done();

80 switch (scanner.next())	260 if (scanner.next() == u'|')

81 {	261 {

82 case u'=':	262 if (scanner.position() > start)

83 if (optionEnd < 0)	263 AddSitekey(DependentString(sitekeys, start, scanner.position() - start));

84 {	264 start = scanner.position() + 1;

85 optionEnd = scanner.position();	265 }

86 valueStart = optionEnd + 1;

87 }

88 break;

89 case u',':

90 if (optionEnd < 0)

91 optionEnd = scanner.position();

92 ProcessOption(options, optionStart, optionEnd, valueStart, scanner.position());

93 optionStart = scanner.position() + 1;

94 optionEnd = -1;

95 valueStart = -1;

96 break;

97 }

98 }

99 if (mContentType < 0)

100 mContentType = defaultTypeMask;

101

102 size_t len = mRegexpSource.length();

103 if (len >= 2 && mRegexpSource[0] == u'/' && mRegexpSource[len - 1] == u'/')

104 {

105 mRegexpSource.reset(mRegexpSource, 1, len - 2);

106 mRegexpId = GenerateRegExp(mRegexpSource, mMatchCase);

107

108 int errorLength = EM_ASM_INT(return regexps.getErrorLength($0), mRegexpId);

109 if (errorLength >= 0)

110 {

111 String error(errorLength);

112 EM_ASM_ARGS(regexps.getError($0, $1), mRegexpId, error.data());

113 throw error;

114 }

115 }

116 }

117

118 RegExpFilter::~RegExpFilter()

119 {

120 if (mRegexpId)

121 EM_ASM_ARGS(regexps.delete($0), mRegexpId);

122 }

123

124 void RegExpFilter::ProcessOption(String& options, int optionStart,

125 int optionEnd, int valueStart, int valueEnd)

126 {

127 if (optionEnd <= optionStart)

128 return;

129

130 bool reverse = false;

131 if (options[optionStart] == u'~')

132 {

133 reverse = true;

134 optionStart++;

135 }

136

137 String name(options, optionStart, optionEnd - optionStart);

138 for (size_t i = 0; i < name.length(); ++i)

139 {

140 char16_t currChar = name[i];

141 if (currChar >= u'A' && currChar <= u'Z')

142 name[i] = currChar + u'a' - u'A';

143 else if (currChar == u'_')

144 name[i] = u'-';

145 }

146

147 auto it = typeMap.find(name);

148 if (it != typeMap.end())

149 {

150 if (mContentType < 0)

151 mContentType = reverse ? defaultTypeMask : 0;

152 if (reverse)

153 mContentType &= ~it->second;

154 else

155 mContentType |= it->second;

156 }

157 else if (name.equals(u"domain"_str))

158 {

159 if (valueStart >= 0 && valueEnd > valueStart)

160 ParseDomains(String(options, valueStart, valueEnd - valueStart), u'|');

161 }

162 else if (name.equals(u"sitekey"_str))

163 {

164 if (valueStart >= 0 && valueEnd > valueStart)

165 {

166 StringScanner scanner(String(options, valueStart, valueEnd - valueStart), u'|');

167 size_t start = 0;

168 bool done = false;

169 while (!done)

170 {

171 done = scanner.done();

172 if (scanner.next() == u'|')

173 {

174 if (scanner.position() > start)

175 AddSitekey(String(options, valueStart + start, scanner.position() - start));

176 start = scanner.position() + 1;

177 }

178 }

179 }

180 }

181 else if (name.equals(u"match-case"_str))

182 mMatchCase = !reverse;

183 else if (name.equals(u"third-party"_str))

184 mThirdParty = reverse ? TrippleState::NO : TrippleState::YES;

185 else if (name.equals(u"collapse"_str))

186 mCollapse = reverse ? TrippleState::NO : TrippleState::YES;

187 else

188 {

189 String error(u"Unknown option "_str);

190 error.append(name);

191 throw std::move(error);

192 }

193 }

194

195 Filter* RegExpFilter::Create(const String& text)

196 {

197 bool blocking = true;

198 String::size_type patternStart = 0;

199 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@')

200 {

201 blocking = false;

202 patternStart = 2;

203 }

204

205 String::size_type patternEnd = text.find(u'$', patternStart);

206 if (patternEnd == text.npos)

207 patternEnd = text.length();

208

209 try

210 {

211 if (blocking)

212 return new RegExpFilter(text, patternStart, patternEnd);

213 else

214 return new WhitelistFilter(text, patternStart, patternEnd);

215 }

216 catch (const String& reason)

217 {

218 return new InvalidFilter(text, reason);

219 }	266 }

220 }	267 }

221	268

222 void RegExpFilter::InitJSTypes()	269 void RegExpFilter::InitJSTypes()

223 {	270 {

224 EM_ASM(exports.RegExpFilter.typeMap = {};);	271 EM_ASM(exports.RegExpFilter.typeMap = {};);

225 for (auto it = typeMap.begin(); it != typeMap.end(); ++it)	272 for (auto it = typeMap.begin(); it != typeMap.end(); ++it)

226 EM_ASM_ARGS(exports.RegExpFilter.typeMap[getStringData($0).replace("-", "_").toUpperCase()] = $1, &(it->first), it->second);	273 EM_ASM_ARGS(exports.RegExpFilter.typeMap[readString($0).replace("-", "_").toUpperCase()] = $1, &(it->first), it->second);

227 }	274 }

228	275

229 String RegExpFilter::RegExpFromSource(const String& source)	276 OwnedString RegExpFilter::RegExpFromSource(const String& source)

230 {	277 {

231 /* TODO: this is very inefficient */	278 /* TODO: this is very inefficient */

232	279

233 // Note: This doesn't remove trailing wildcards, otherwise the result should	280 // Note: This doesn't remove trailing wildcards, otherwise the result should

234 // be identical to Filter.toRegExp().	281 // be identical to Filter.toRegExp().

235 String result;	282 OwnedString result;

236 String::value_type prevChar = u'*';	283 String::value_type prevChar = u'*';

237 for (String::size_type i = 0; i < source.length(); ++i)	284 for (String::size_type i = 0; i < source.length(); ++i)

238 {	285 {

239 String::value_type currChar = source[i];	286 String::value_type currChar = source[i];

240 switch (currChar)	287 switch (currChar)

241 {	288 {

242 case u'*':	289 case u'*':

243 if (prevChar != u'*')	290 if (prevChar != u'*')

244 result.append(u".*"_str);	291 result.append(u".*"_str);

245 break;	292 break;

(...skipping 29 matching lines...) Expand all Loading...
275 !(currChar >= u'A' && currChar <= u'Z') &&	322 !(currChar >= u'A' && currChar <= u'Z') &&

276 !(currChar >= u'0' && currChar <= u'9') &&	323 !(currChar >= u'0' && currChar <= u'9') &&

277 currChar < 128)	324 currChar < 128)

278 {	325 {

279 result.append(u'\\');	326 result.append(u'\\');

280 }	327 }

281 result.append(currChar);	328 result.append(currChar);

282 }	329 }

283 prevChar = currChar;	330 prevChar = currChar;

284 }	331 }

285 return std::move(result);	332 return result;

286 }	333 }

287	334

288 Filter::Type RegExpFilter::GetType() const	335 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const

289 {	336 {

290 return Type::BLOCKING;	337 if (!mData.DomainsParsingDone())

	338 {

	339 ParseDomains(mData.GetDomainsSource(mText), u'|');

	340 mData.SetDomainsParsingDone();

	341 }

	342 return ActiveFilter::GetDomains();

	343 }

	344

	345 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const

	346 {

	347 if (!mData.SitekeyParsingDone())

	348 {

	349 ParseSitekeys(mData.GetSitekeysSource(mText));

	350 mData.SetSitekeysParsingDone();

	351 }

	352 return ActiveFilter::GetSitekeys();

291 }	353 }

292	354

293 bool RegExpFilter::Matches(const String& location, int typeMask,	355 bool RegExpFilter::Matches(const String& location, int typeMask,

294 String& docDomain, bool thirdParty, const String& sitekey) const	356 DependentString& docDomain, bool thirdParty, const String& sitekey) const

295 {	357 {

296 if (!(mContentType & typeMask) ||	358 if (!(mData.mContentType & typeMask) ||

297 (mThirdParty == TrippleState::YES && !thirdParty) ||	359 (mData.mThirdParty == TrippleState::YES && !thirdParty) ||

298 (mThirdParty == TrippleState::NO && thirdParty) ||	360 (mData.mThirdParty == TrippleState::NO && thirdParty) ||

299 !IsActiveOnDomain(docDomain, sitekey))	361 !IsActiveOnDomain(docDomain, sitekey))

300 {	362 {

301 return false;	363 return false;

302 }	364 }

303	365

304 if (!mRegexpId)	366 if (!mData.RegExpParsingDone())

305 mRegexpId = GenerateRegExp(RegExpFromSource(mRegexpSource), mMatchCase);	367 {

306 return EM_ASM_INT(return regexps.test($0, $1), mRegexpId, &location);	368 const OwnedString pattern(mData.GetRegExpSource(mText));

307 }	369 mData.SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mData.mMatchCase));

	370 }

	371 return EM_ASM_INT(return regexps.test($0, $1), mData.mRegexpId, &location);

	372 }

LEFT	RIGHT