Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: compiled/filter/Matcher.cpp

Issue 29556737: Issue 5141 - Convert filter match to C++ (Closed) Base URL: https://hg.adblockplus.org/adblockpluscore/
Left Patch Set: Created Sept. 26, 2017, 9:34 p.m.
Right Patch Set: Fixed many issues. One test left out. Created Oct. 6, 2017, 1:45 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
LEFTRIGHT
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-present eyeo GmbH 3 * Copyright (C) 2006-present eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
16 */ 16 */
17 17
18 #include "Matcher.h" 18 #include "Matcher.h"
19 #include "RegExpFilter.h" 19 #include "RegExpFilter.h"
20 #include "../library.h" 20 #include "../library.h"
21 21
22 class CombinedMatcher : public MatcherBase 22 namespace {
23 { 23 const DependentString regexpRegExp =
24 private: 24 u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_str;
25 StringMap<Filter*> mResultCache; 25 const DependentString optionsRegExp =
hub 2017/09/26 21:49:00 I wanted to use FilterPtr in there, but it didn't
26 static const size_t MAX_CACHE_ENTRIES = 1000; 26 u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str;
27 int mMatchReId; 27 const DependentString candidateRegExp =
28 public: 28 u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str;
29 Matcher mBlacklist; 29 const DependentString matchRegExp = u"[a-z0-9%]{3,}"_str;
30 Matcher mWhitelist; 30 }
31 31
32 CombinedMatcher() 32 Matcher::Matcher()
33 : mMatchReId(GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true)) 33 : mFilterByKeyword(1024), mKeywordByFilter(1024),
34 { 34 mReId(-1), mOptionsReId(-1), mCandidatesReId(-1)
35 } 35 {
36 36 mReId = GenerateRegExp(regexpRegExp, true, false);
37 ~CombinedMatcher() 37 mOptionsReId = GenerateRegExp(optionsRegExp, true, false);
38 { 38 mCandidatesReId = GenerateRegExp(candidateRegExp, true, true);
39 DeleteRegExp(mMatchReId); 39 mMatchReId = GenerateRegExp(matchRegExp, true, true);
40 } 40 }
41 41
42 void ResetCache() 42 void Matcher::Add(Filter& filter)
43 { 43 {
44 for (auto filter : mResultCache) 44 if (mKeywordByFilter.find(filter.GetText()))
45 filter.second->ReleaseRef(); 45 return;
hub 2017/09/26 21:49:00 See above: if we could have the FilterPtr as the d
46 mResultCache.clear(); 46
47 } 47 auto keyword = FindKeyword(filter);
48 48
49 void Add(const FilterPtr& filter) override 49 mFilterByKeyword[keyword].push_back(FilterPtr(&filter));
sergei 2017/10/11 09:55:16 Although the review is already closed I think it's
50 { 50 mKeywordByFilter[filter.GetText()] =
51 if (filter->mType == Filter::Type::WHITELIST) 51 FilterKeyword(std::move(keyword), filter);
52 mWhitelist.Add(filter); 52 }
53 else 53
54 mBlacklist.Add(filter); 54 void Matcher::Remove(Filter& filter)
55 55 {
56 ResetCache(); 56 auto entry = mKeywordByFilter.find(filter.GetText());
57 } 57 if (!entry)
58 58 return;
59 void Remove(const FilterPtr& filter) override 59
60 { 60 auto& keyword = static_cast<const String&>(entry->second);
61 if (filter->mType == Filter::Type::WHITELIST) 61 auto list = mFilterByKeyword[keyword];
62 mWhitelist.Remove(filter); 62 if (list.size() == 1)
63 else 63 mFilterByKeyword.erase(keyword);
64 mBlacklist.Remove(filter); 64 else
65 65 list.erase(std::find(list.cbegin(), list.cend(), FilterPtr(&filter)));
66 ResetCache(); 66
67 } 67 mKeywordByFilter.erase(filter.GetText());
68 68 }
69 void Clear() override 69
70 { 70 void Matcher::Clear()
71 mBlacklist.Clear(); 71 {
72 mWhitelist.Clear(); 72 mFilterByKeyword.clear();
73 ResetCache(); 73 mKeywordByFilter.clear();
74 } 74 }
75 75
76 OwnedString FindKeyword(const FilterPtr& filter) override 76 bool Matcher::HasFilter(const Filter& filter) const
77 { 77 {
78 if (filter->mType == Filter::Type::WHITELIST) 78 return mKeywordByFilter.find(filter.GetText());
79 return mWhitelist.FindKeyword(filter); 79 }
80 return mBlacklist.FindKeyword(filter); 80
81 } 81 namespace
82 82 {
83 bool HasFilter(const FilterPtr& filter) const override 83 DependentString emptyString = u""_str;
84 { 84 }
85 if (filter->mType == Filter::Type::WHITELIST) 85
86 return mWhitelist.HasFilter(filter); 86 const String& Matcher::GetKeywordForFilter(const Filter& filter) const
87 return mBlacklist.HasFilter(filter); 87 {
88 } 88 auto entry = mKeywordByFilter.find(filter.GetText());
89 89 if (entry)
90 const String& GetKeywordForFilter(const FilterPtr& filter) override 90 return static_cast<const String&>(entry->second);
91 { 91 return emptyString;
92 if (filter->mType == Filter::Type::WHITELIST) 92 }
93 return mWhitelist.GetKeywordForFilter(filter); 93
94 return mBlacklist.GetKeywordForFilter(filter); 94 Filter* Matcher::MatchesAny(const String& location,
95 } 95 int typeMask, DependentString& docDomain, bool thirdParty,
96 96 const String& sitekey, bool specificOnly) const
97 Filter* MatchesAnyInternal(const String& location, 97 {
98 int typeMask, DependentString& docDomain, bool thirdParty, 98 OwnedString text(location);
99 const String& sitekey, bool specificOnly) 99 text.toLower();
100 { 100 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false);
101 ReMatchResults reResult; 101 if (text.match(mMatchReId, *reResult))
102 OwnedString text(location); 102 {
103 text.toLower(); 103 auto& candidates = reResult->candidates;
104 text.match(mMatchReId, &reResult);
105
106 auto& candidates = reResult.candidates;
107 candidates.push_back(OwnedString()); 104 candidates.push_back(OwnedString());
108 105 for (auto candidate : candidates)
109 Filter* blacklistHit = nullptr;
110 for (size_t i = 0, l = candidates.size(); i < l; i++)
111 { 106 {
112 auto substr = candidates[i]; 107 auto result = CheckEntryMatch(candidate, location, typeMask, docDomain,
113 if (mWhitelist.mFilterByKeyword.find(substr)) 108 thirdParty, sitekey, specificOnly);
114 { 109 if (result)
115 auto result = mWhitelist._CheckEntryMatch( 110 return result.release();
116 substr, location, typeMask, docDomain, thirdParty, sitekey, specificOnly);
117 if (result)
118 return result;
119 }
120 if (mBlacklist.mFilterByKeyword.find(substr) && !blacklistHit)
121 {
122 blacklistHit = mBlacklist._CheckEntryMatch(
123 substr, location, typeMask, docDomain, thirdParty, sitekey,
124 specificOnly);
125 }
126 } 111 }
127 return blacklistHit; 112 }
Wladimir Palant 2017/10/09 08:39:47 As mentioned in the issue description, we should n
sergei 2017/10/09 15:27:53 Although it merely converts the existing JS code a
Wladimir Palant 2017/10/10 07:39:05 I strongly disagree. Landing crappy code is always
128 } 113 return nullptr;
129 114 }
130 Filter* MatchesAny(const String& location, 115
131 int typeMask, DependentString& docDomain, bool thirdParty, 116 OwnedString Matcher::FindKeyword(const Filter& filter) const
132 const String& sitekey, bool specificOnly) override 117 {
133 { 118 OwnedString result;
134 OwnedString key(location); 119 OwnedString text(filter.GetText());
135 key.append(u" "_str); 120 if (TestRegExp(mReId, text))
136 key.append(typeMask);
137 key.append(u" "_str);
138 key.append(docDomain);
139 key.append(u" "_str);
140 key.append(thirdParty);
141 key.append(u" "_str);
142 key.append(sitekey);
143 key.append(u" "_str);
144 key.append(specificOnly);
145
146 auto cachedResult = mResultCache.find(key);
147 if (cachedResult)
148 {
149 cachedResult->second->AddRef();
150 return cachedResult->second;
151 }
152
153 Filter* result = MatchesAnyInternal(location, typeMask, docDomain,
154 thirdParty, sitekey, specificOnly);
155
156 if (mResultCache.size() >= MAX_CACHE_ENTRIES)
157 ResetCache();
158
159 result->AddRef();
160 mResultCache[key] = result;
161
162 result->AddRef();
163 return result;
164 }
165 };
166
167 MatcherBase* MatcherBase::mInstance = new CombinedMatcher;
168
169 Matcher::Matcher()
170 : mFilterReId(GenerateRegExp(DependentString(Filter::regexpRegExp), true, false))
171 , mOptionsReId(GenerateRegExp(DependentString(Filter::optionsRegExp), true, false))
172 , mCandidatesReId(GenerateRegExp(u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str, true, true))
173 {
174 }
175
176 Matcher::~Matcher()
177 {
178 DeleteRegExp(mFilterReId);
179 DeleteRegExp(mOptionsReId);
180 DeleteRegExp(mCandidatesReId);
181 }
182
183 OwnedString Matcher::FindKeyword(const FilterPtr& filter)
184 {
185 OwnedString result(u""_str);
186 OwnedString text(filter->GetText());
187 if (TestRegExp(mFilterReId, text))
188 return result; 121 return result;
189 122
190 // Remove options 123 // Remove options
191 auto index = ExecRegExp(mOptionsReId, text); 124 auto index = ExecRegExp(mOptionsReId, text);
192 if (index != -1) 125 if (index != String::npos)
193 text = text.substr(0, index); 126 text = DependentString(text, 0, index);
194 127
195 // Remove whitelist marker 128 // Remove whitelist marker
196 if (text[0] == '@' && text[1] == '@') 129 if (text.length() >= 2 && text[0] == '@' && text[1] == '@')
197 text = text.substr(2); 130 text = DependentString(text, 2);
198 131
199 text.toLower(); 132 text.toLower();
200 ReMatchResults keywords; 133 intrusive_ptr<ReMatchResults> keywords(new ReMatchResults, false);
201 auto match = text.match(mCandidatesReId, &keywords); 134 auto match = text.match(mCandidatesReId, *keywords);
202 if (!match) 135 if (!match)
203 return result; 136 return result;
204 137
205 auto& candidates = keywords.candidates; 138 auto& candidates = keywords->candidates;
206 139
207 auto& hash = mFilterByKeyword; 140 uint32_t resultCount = 0xffffff;
208 uint32_t resultCount = 0xffffffff;
209 uint32_t resultLength = 0; 141 uint32_t resultLength = 0;
210 for (uint32_t i = 0, l = candidates.size(); i < l; i++) 142 for (auto substr : candidates)
211 { 143 {
212 auto candidate = DependentString(candidates[i]).substr(1); 144 if (substr.empty())
213 auto count = (hash.find(candidate) ? hash[candidate].size() : 0); 145 continue;
146
147 auto candidate = DependentString(substr, 1);
148 auto entry = mFilterByKeyword.find(candidate);
149 auto count = entry ? entry->second.size() : 0;
214 if (count < resultCount || 150 if (count < resultCount ||
215 (count == resultCount && candidate.length() > resultLength)) 151 (count == resultCount && candidate.length() > resultLength))
216 { 152 {
217 result = candidate; 153 result = candidate;
218 resultCount = count; 154 resultCount = count;
219 resultLength = candidate.length(); 155 resultLength = candidate.length();
220 } 156 }
221 } 157 }
222
223 return result; 158 return result;
224 } 159 }
225 160
226 void Matcher::Add(const FilterPtr& filter) 161 FilterPtr Matcher::CheckEntryMatch(const String& keyword,
227 {
228 if (mKeywordByFilter.find(filter->GetText()))
229 return;
230
231 auto keyword = FindKeyword(filter);
232 auto oldEntry = mFilterByKeyword.find(keyword);
233 if (!oldEntry)
234 mFilterByKeyword[keyword] = std::vector<FilterPtr>{filter};
235 else
236 mFilterByKeyword[keyword].push_back(filter);
237 mKeywordByFilter[filter->GetText()] = keyword;
238 }
239
240 void Matcher::Remove(const FilterPtr& filter)
241 {
242 if (!mKeywordByFilter.find(filter->GetText()))
243 return;
244
245 auto keyword = mKeywordByFilter[filter->GetText()];
246 auto list = mFilterByKeyword[keyword];
247 if (list.size() == 1)
248 mFilterByKeyword.erase(keyword);
249 else
250 {
251 auto iter = std::find(list.cbegin(), list.cend(), filter);
252 list.erase(iter);
253 }
254 mKeywordByFilter.erase(filter->GetText());
255 }
256
257 void Matcher::Clear()
258 {
259 mFilterByKeyword.clear();
260 mKeywordByFilter.clear();
261 }
262
263 bool Matcher::HasFilter(const FilterPtr& filter) const
264 {
265 return mKeywordByFilter.find(filter->GetText());
266 }
267
268 static DependentString emptyString = u""_str;
269
270 const String& Matcher::GetKeywordForFilter(const FilterPtr& filter)
271 {
272 if (mKeywordByFilter.find(filter->GetText()))
273 return mKeywordByFilter[filter->GetText()];
274 return emptyString;
275 }
276
277 Filter* Matcher::_CheckEntryMatch(const String& keyword,
278 const String& location, 162 const String& location,
279 int typeMask, DependentString& docDomain, bool thirdParty, 163 int typeMask, DependentString& docDomain, bool thirdParty,
280 const String& sitekey, bool specificOnly) 164 const String& sitekey, bool specificOnly) const
281 { 165 {
282 auto list = mFilterByKeyword[keyword]; 166 auto entry = mFilterByKeyword.find(keyword);
283 for (auto filter : list) { 167 if (!entry)
168 return FilterPtr();
169
170 auto filters = entry->second;
171 for (auto filter : filters)
172 {
284 auto activeFilter = static_cast<ActiveFilter*>(filter.get()); 173 auto activeFilter = static_cast<ActiveFilter*>(filter.get());
hub 2017/09/26 21:49:00 This is done without checking. And it is ugly. And
sergei 2017/10/02 12:02:33 Although we don't pass other filters here, what do
285 if (specificOnly && activeFilter->IsGeneric() && 174 if (specificOnly && activeFilter->IsGeneric() &&
286 !(activeFilter->mType != Filter::Type::WHITELIST)) 175 (activeFilter->mType != Filter::Type::WHITELIST))
287 continue; 176 continue;
177
288 auto reFilter = static_cast<RegExpFilter*>(activeFilter); 178 auto reFilter = static_cast<RegExpFilter*>(activeFilter);
hub 2017/09/26 21:49:00 Similarly to the above: this is unchecked.
289 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) 179 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey))
290 { 180 return filter;
291 return filter.get(); 181 }
292 } 182
293 } 183 return FilterPtr();
294 return nullptr; 184 }
295 } 185
296 186 const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000;
297 Filter* Matcher::MatchesAny(const String& location, 187
188 CombinedMatcher::CombinedMatcher()
189 : mResultCache(1024), mMatchReId(-1)
190 {
191 mMatchReId = GenerateRegExp(matchRegExp, true, true);
192 }
193
194 void CombinedMatcher::Add(Filter& filter)
195 {
196 GetMatcher(filter).Add(filter);
197 ResetCache();
198 }
199
200 void CombinedMatcher::Remove(Filter& filter)
201 {
202 GetMatcher(filter).Remove(filter);
203 ResetCache();
204 }
205
206 void CombinedMatcher::Clear()
207 {
208 mBlacklist.Clear();
209 mWhitelist.Clear();
210 ResetCache();
211 }
212
213 bool CombinedMatcher::HasFilter(const Filter& filter) const
214 {
215 return GetMatcher(filter).HasFilter(filter);
216 }
217
218 const String& CombinedMatcher::GetKeywordForFilter(const Filter& filter) const
219 {
220 return GetMatcher(filter).GetKeywordForFilter(filter);
221 }
222
223 Filter* CombinedMatcher::MatchesAny(const String& location,
298 int typeMask, DependentString& docDomain, bool thirdParty, 224 int typeMask, DependentString& docDomain, bool thirdParty,
299 const String& sitekey, bool specificOnly) 225 const String& sitekey, bool specificOnly)
300 { 226 {
301 ReMatchResults reResult; 227 OwnedString key(location);
302 auto re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true); 228 key.append(u" "_str);
229 key.append(typeMask);
230 key.append(u" "_str);
231 key.append(docDomain);
232 key.append(u" "_str);
233 key.append(thirdParty);
234 key.append(u" "_str);
235 key.append(sitekey);
236 key.append(u" "_str);
237 key.append(specificOnly);
238
239 FilterPtr result;
240
241 auto cachedResult = mResultCache.find(key);
242 if (cachedResult)
243 result = cachedResult->second.filter();
244 else
245 {
246 result = MatchesAnyInternal(location, typeMask, docDomain,
247 thirdParty, sitekey, specificOnly);
248
249 if (mResultCache.size() >= MAX_CACHE_ENTRIES)
250 ResetCache();
251
252 CacheEntry cache(std::move(key), result);
253 mResultCache[cache.key()] = cache;
254 }
255
256 return result.release();
257 }
258
259 OwnedString CombinedMatcher::FindKeyword(const Filter& filter) const
260 {
261 return GetMatcher(filter).FindKeyword(filter);
262 }
263
264 void CombinedMatcher::ResetCache()
265 {
266 mResultCache.clear();
267 }
268
269 FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location,
270 int typeMask, DependentString& docDomain, bool thirdParty,
271 const String& sitekey, bool specificOnly) const
272 {
303 OwnedString text(location); 273 OwnedString text(location);
304 text.toLower(); 274 text.toLower();
305 MatchRegExp(re_id, text, &reResult); 275 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false);
306 auto& candidates = reResult.candidates; 276 text.match(mMatchReId, *reResult);
277
278 auto& candidates = reResult->candidates;
307 candidates.push_back(OwnedString()); 279 candidates.push_back(OwnedString());
308 for (size_t i = 0, l = candidates.size(); i < l; i++) 280
309 { 281 FilterPtr blacklistHit;
310 auto substr = candidates[i]; 282 for (auto substr : candidates)
311 if (mFilterByKeyword.find(substr)) 283 {
312 { 284 auto result = mWhitelist.CheckEntryMatch(
313 auto result = _CheckEntryMatch(substr, location, typeMask, docDomain, 285 substr, location, typeMask, docDomain, thirdParty, sitekey, false);
314 thirdParty, sitekey, specificOnly); 286 if (result)
315 if (result) 287 return result;
316 { 288
317 result->AddRef(); 289 if (!blacklistHit)
318 return result; 290 blacklistHit = mBlacklist.CheckEntryMatch(
319 } 291 substr, location, typeMask, docDomain, thirdParty, sitekey,
320 } 292 specificOnly);
321 } 293 }
322 294 return blacklistHit;
323 return nullptr; 295 }
324 }
LEFTRIGHT

Powered by Google App Engine
This is Rietveld