Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: compiled/filter/Matcher.cpp

Issue 29556737: Issue 5141 - Convert filter match to C++ (Closed) Base URL: https://hg.adblockplus.org/adblockpluscore/
Left Patch Set: Addressed most of the comment. Fixed some issues. Created Oct. 3, 2017, 7:31 p.m.
Right Patch Set: Fixed many issues. One test left out. Created Oct. 6, 2017, 1:45 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
LEFTRIGHT
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-present eyeo GmbH 3 * Copyright (C) 2006-present eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
16 */ 16 */
17 17
18 #include "Matcher.h" 18 #include "Matcher.h"
19 #include "RegExpFilter.h" 19 #include "RegExpFilter.h"
20 #include "../library.h" 20 #include "../library.h"
21 21
22 namespace { 22 namespace {
23 const DependentString regexpRegExp = 23 const DependentString regexpRegExp =
24 u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_s tr; 24 u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_s tr;
25 const DependentString optionsRegExp = 25 const DependentString optionsRegExp =
26 u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str; 26 u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str;
27 const DependentString candidateRegExp = 27 const DependentString candidateRegExp =
28 u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str; 28 u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str;
29 const DependentString matchRegExp = u"[a-z0-9%]{3,}"_str;
30 }
31
32 Matcher::Matcher()
33 : mFilterByKeyword(1024), mKeywordByFilter(1024),
34 mReId(-1), mOptionsReId(-1), mCandidatesReId(-1)
35 {
36 mReId = GenerateRegExp(regexpRegExp, true, false);
37 mOptionsReId = GenerateRegExp(optionsRegExp, true, false);
38 mCandidatesReId = GenerateRegExp(candidateRegExp, true, true);
39 mMatchReId = GenerateRegExp(matchRegExp, true, true);
29 } 40 }
30 41
31 void Matcher::Add(Filter& filter) 42 void Matcher::Add(Filter& filter)
32 { 43 {
33 if (mKeywordByFilter.find(filter.GetText())) 44 if (mKeywordByFilter.find(filter.GetText()))
34 return; 45 return;
35 46
36 auto keyword = FindKeyword(filter); 47 auto keyword = FindKeyword(filter);
37 48
38 mFilterByKeyword[keyword].push_back(FilterPtr(&filter)); 49 mFilterByKeyword[keyword].push_back(FilterPtr(&filter));
sergei 2017/10/11 09:55:16 Although the review is already closed I think it's
39 mKeywordByFilter[filter.GetText()] = keyword; 50 mKeywordByFilter[filter.GetText()] =
51 FilterKeyword(std::move(keyword), filter);
40 } 52 }
41 53
42 void Matcher::Remove(Filter& filter) 54 void Matcher::Remove(Filter& filter)
43 { 55 {
44 auto entry = mKeywordByFilter.find(filter.GetText()); 56 auto entry = mKeywordByFilter.find(filter.GetText());
45 if (!entry) 57 if (!entry)
46 return; 58 return;
47 59
48 auto keyword = entry->second; 60 auto& keyword = static_cast<const String&>(entry->second);
49 auto list = mFilterByKeyword[keyword]; 61 auto list = mFilterByKeyword[keyword];
50 if (list.size() == 1) 62 if (list.size() == 1)
51 mFilterByKeyword.erase(keyword); 63 mFilterByKeyword.erase(keyword);
52 else 64 else
53 list.erase(std::find(list.cbegin(), list.cend(), FilterPtr(&filter))); 65 list.erase(std::find(list.cbegin(), list.cend(), FilterPtr(&filter)));
54 66
55 mKeywordByFilter.erase(filter.GetText()); 67 mKeywordByFilter.erase(filter.GetText());
56 } 68 }
57 69
58 void Matcher::Clear() 70 void Matcher::Clear()
59 { 71 {
60 mFilterByKeyword.clear(); 72 mFilterByKeyword.clear();
61 mKeywordByFilter.clear(); 73 mKeywordByFilter.clear();
62 } 74 }
63 75
64 bool Matcher::HasFilter(const Filter& filter) const 76 bool Matcher::HasFilter(const Filter& filter) const
65 { 77 {
66 return mKeywordByFilter.find(filter.GetText()); 78 return mKeywordByFilter.find(filter.GetText());
67 } 79 }
68 80
69 namespace 81 namespace
70 { 82 {
71 DependentString emptyString = u""_str; 83 DependentString emptyString = u""_str;
72 } 84 }
73 85
74 const String& Matcher::GetKeywordForFilter(const Filter& filter) const 86 const String& Matcher::GetKeywordForFilter(const Filter& filter) const
75 { 87 {
76 auto entry = mKeywordByFilter.find(filter.GetText()); 88 auto entry = mKeywordByFilter.find(filter.GetText());
77 if (entry) 89 if (entry)
78 return entry->second; 90 return static_cast<const String&>(entry->second);
79 return emptyString; 91 return emptyString;
80 } 92 }
81 93
82 Filter* Matcher::MatchesAny(const String& location, 94 Filter* Matcher::MatchesAny(const String& location,
83 int typeMask, DependentString& docDomain, bool thirdParty, 95 int typeMask, DependentString& docDomain, bool thirdParty,
84 const String& sitekey, bool specificOnly) const 96 const String& sitekey, bool specificOnly) const
85 { 97 {
86 auto re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true);
87 OwnedString text(location); 98 OwnedString text(location);
88 text.toLower(); 99 text.toLower();
89 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); 100 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false);
90 MatchRegExp(re_id, text, reResult.get()); 101 if (text.match(mMatchReId, *reResult))
91 auto& candidates = reResult->candidates; 102 {
92 candidates.push_back(OwnedString()); 103 auto& candidates = reResult->candidates;
93 for (auto substr : candidates) 104 candidates.push_back(OwnedString());
94 { 105 for (auto candidate : candidates)
95 if (mFilterByKeyword.find(substr))
96 { 106 {
97 auto result = CheckEntryMatch(substr, location, typeMask, docDomain, 107 auto result = CheckEntryMatch(candidate, location, typeMask, docDomain,
98 thirdParty, sitekey, specificOnly); 108 thirdParty, sitekey, specificOnly);
99 if (result) 109 if (result)
100 return result.release(); 110 return result.release();
101 } 111 }
102 } 112 }
Wladimir Palant 2017/10/09 08:39:47 As mentioned in the issue description, we should n
sergei 2017/10/09 15:27:53 Although it merely converts the existing JS code a
Wladimir Palant 2017/10/10 07:39:05 I strongly disagree. Landing crappy code is always
103 return nullptr; 113 return nullptr;
104 } 114 }
105 115
106 OwnedString Matcher::FindKeyword(const Filter& filter) const 116 OwnedString Matcher::FindKeyword(const Filter& filter) const
107 { 117 {
108 OwnedString result(u""_str); 118 OwnedString result;
109 OwnedString text(filter.GetText()); 119 OwnedString text(filter.GetText());
110 auto re_id = GenerateRegExp(regexpRegExp, true, false); 120 if (TestRegExp(mReId, text))
111 if (TestRegExp(re_id, text))
112 return result; 121 return result;
113 122
114 // Remove options 123 // Remove options
115 auto options_re_id = GenerateRegExp(optionsRegExp, true, false); 124 auto index = ExecRegExp(mOptionsReId, text);
116 auto index = ExecRegExp(options_re_id, text);
117 if (index != String::npos) 125 if (index != String::npos)
118 text = text.substr(0, index); 126 text = DependentString(text, 0, index);
119 127
120 // Remove whitelist marker 128 // Remove whitelist marker
121 if (text.length() >= 2 && text[0] == '@' && text[1] == '@') 129 if (text.length() >= 2 && text[0] == '@' && text[1] == '@')
122 text = text.substr(2); 130 text = DependentString(text, 2);
123 131
124 text.toLower(); 132 text.toLower();
125 intrusive_ptr<ReMatchResults> keywords(new ReMatchResults, false); 133 intrusive_ptr<ReMatchResults> keywords(new ReMatchResults, false);
126 auto candidates_re_id = GenerateRegExp(candidateRegExp, true, true); 134 auto match = text.match(mCandidatesReId, *keywords);
127 auto match = text.match(candidates_re_id, keywords.get());
128 if (!match) 135 if (!match)
129 return result; 136 return result;
130 137
131 auto& candidates = keywords->candidates; 138 auto& candidates = keywords->candidates;
132 139
133 auto& hash = mFilterByKeyword; 140 uint32_t resultCount = 0xffffff;
134 uint32_t resultCount = 0xffffffff;
135 uint32_t resultLength = 0; 141 uint32_t resultLength = 0;
136 for (auto substr : candidates) 142 for (auto substr : candidates)
137 { 143 {
138 if (substr.empty()) 144 if (substr.empty())
139 continue; 145 continue;
140 146
141 auto candidate = substr.substr(1); 147 auto candidate = DependentString(substr, 1);
142 auto entry = hash.find(candidate); 148 auto entry = mFilterByKeyword.find(candidate);
143 auto count = entry ? entry->second.size() : 0; 149 auto count = entry ? entry->second.size() : 0;
144 if (count < resultCount || 150 if (count < resultCount ||
145 (count == resultCount && candidate.length() > resultLength)) 151 (count == resultCount && candidate.length() > resultLength))
146 { 152 {
147 result = candidate; 153 result = candidate;
148 resultCount = count; 154 resultCount = count;
149 resultLength = candidate.length(); 155 resultLength = candidate.length();
150 } 156 }
151 } 157 }
152 return result; 158 return result;
153 } 159 }
154 160
155 FilterPtr Matcher::CheckEntryMatch(const String& keyword, 161 FilterPtr Matcher::CheckEntryMatch(const String& keyword,
156 const String& location, 162 const String& location,
157 int typeMask, DependentString& docDomain, bool thirdParty, 163 int typeMask, DependentString& docDomain, bool thirdParty,
158 const String& sitekey, bool specificOnly) const 164 const String& sitekey, bool specificOnly) const
159 { 165 {
160 auto entry = mFilterByKeyword.find(keyword); 166 auto entry = mFilterByKeyword.find(keyword);
161 if (entry) 167 if (!entry)
162 { 168 return FilterPtr();
sergei 2017/10/04 08:54:33 Earlier return would be better here, in my opinion
hub 2017/10/06 13:49:19 Done.
163 auto list = entry->second; 169
164 for (auto filter : list) 170 auto filters = entry->second;
165 { 171 for (auto filter : filters)
166 auto activeFilter = static_cast<ActiveFilter*>(filter.get()); 172 {
167 if (specificOnly && activeFilter->IsGeneric() && 173 auto activeFilter = static_cast<ActiveFilter*>(filter.get());
168 !(activeFilter->mType != Filter::Type::WHITELIST)) 174 if (specificOnly && activeFilter->IsGeneric() &&
169 continue; 175 (activeFilter->mType != Filter::Type::WHITELIST))
170 176 continue;
171 auto reFilter = static_cast<RegExpFilter*>(activeFilter); 177
172 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) 178 auto reFilter = static_cast<RegExpFilter*>(activeFilter);
173 return filter; 179 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey))
174 } 180 return filter;
175 } 181 }
182
176 return FilterPtr(); 183 return FilterPtr();
177 } 184 }
178 185
179 const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000; 186 const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000;
180 187
188 CombinedMatcher::CombinedMatcher()
189 : mResultCache(1024), mMatchReId(-1)
190 {
191 mMatchReId = GenerateRegExp(matchRegExp, true, true);
192 }
193
181 void CombinedMatcher::Add(Filter& filter) 194 void CombinedMatcher::Add(Filter& filter)
182 { 195 {
183 if (filter.mType == Filter::Type::WHITELIST) 196 GetMatcher(filter).Add(filter);
184 mWhitelist.Add(filter);
185 else
186 mBlacklist.Add(filter);
187
188 ResetCache(); 197 ResetCache();
189 } 198 }
190 199
191 void CombinedMatcher::Remove(Filter& filter) 200 void CombinedMatcher::Remove(Filter& filter)
192 { 201 {
193 if (filter.mType == Filter::Type::WHITELIST) 202 GetMatcher(filter).Remove(filter);
194 mWhitelist.Remove(filter);
195 else
196 mBlacklist.Remove(filter);
197
198 ResetCache(); 203 ResetCache();
199 } 204 }
200 205
201 void CombinedMatcher::Clear() 206 void CombinedMatcher::Clear()
202 { 207 {
203 mBlacklist.Clear(); 208 mBlacklist.Clear();
204 mWhitelist.Clear(); 209 mWhitelist.Clear();
205 ResetCache(); 210 ResetCache();
206 } 211 }
207 212
208 bool CombinedMatcher::HasFilter(const Filter& filter) const 213 bool CombinedMatcher::HasFilter(const Filter& filter) const
209 { 214 {
210 return filter.mType == Filter::Type::WHITELIST ? 215 return GetMatcher(filter).HasFilter(filter);
211 mWhitelist.HasFilter(filter) : mBlacklist.HasFilter(filter);
212 } 216 }
213 217
214 const String& CombinedMatcher::GetKeywordForFilter(const Filter& filter) const 218 const String& CombinedMatcher::GetKeywordForFilter(const Filter& filter) const
215 { 219 {
216 return filter.mType == Filter::Type::WHITELIST ? 220 return GetMatcher(filter).GetKeywordForFilter(filter);
217 mWhitelist.GetKeywordForFilter(filter) : mBlacklist.GetKeywordForFilter(filt er);
218 } 221 }
219 222
220 Filter* CombinedMatcher::MatchesAny(const String& location, 223 Filter* CombinedMatcher::MatchesAny(const String& location,
221 int typeMask, DependentString& docDomain, bool thirdParty, 224 int typeMask, DependentString& docDomain, bool thirdParty,
222 const String& sitekey, bool specificOnly) 225 const String& sitekey, bool specificOnly)
223 { 226 {
224 OwnedString key(location); 227 OwnedString key(location);
225 key.append(u" "_str); 228 key.append(u" "_str);
226 key.append(typeMask); 229 key.append(typeMask);
227 key.append(u" "_str); 230 key.append(u" "_str);
228 key.append(docDomain); 231 key.append(docDomain);
229 key.append(u" "_str); 232 key.append(u" "_str);
230 key.append(thirdParty); 233 key.append(thirdParty);
231 key.append(u" "_str); 234 key.append(u" "_str);
232 key.append(sitekey); 235 key.append(sitekey);
233 key.append(u" "_str); 236 key.append(u" "_str);
234 key.append(specificOnly); 237 key.append(specificOnly);
235 238
236 FilterPtr result; 239 FilterPtr result;
237 240
238 auto cachedResult = mResultCache.find(key); 241 auto cachedResult = mResultCache.find(key);
239 if (cachedResult) 242 if (cachedResult)
240 result = cachedResult->second; 243 result = cachedResult->second.filter();
241 else 244 else
242 { 245 {
243 result = MatchesAnyInternal(location, typeMask, docDomain, 246 result = MatchesAnyInternal(location, typeMask, docDomain,
244 thirdParty, sitekey, specificOnly); 247 thirdParty, sitekey, specificOnly);
245 248
246 if (mResultCache.size() >= MAX_CACHE_ENTRIES) 249 if (mResultCache.size() >= MAX_CACHE_ENTRIES)
247 ResetCache(); 250 ResetCache();
248 251
249 mResultCache[key] = result; 252 CacheEntry cache(std::move(key), result);
253 mResultCache[cache.key()] = cache;
250 } 254 }
251 255
252 return result.release(); 256 return result.release();
253 } 257 }
254 258
255 OwnedString CombinedMatcher::FindKeyword(const Filter& filter) const 259 OwnedString CombinedMatcher::FindKeyword(const Filter& filter) const
256 { 260 {
257 return filter.mType == Filter::Type::WHITELIST ? 261 return GetMatcher(filter).FindKeyword(filter);
258 mWhitelist.FindKeyword(filter) : mBlacklist.FindKeyword(filter);
259 } 262 }
260 263
261 void CombinedMatcher::ResetCache() 264 void CombinedMatcher::ResetCache()
262 { 265 {
263 mResultCache.clear(); 266 mResultCache.clear();
264 } 267 }
265 268
266 FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location, 269 FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location,
267 int typeMask, DependentString& docDomain, bool thirdParty, 270 int typeMask, DependentString& docDomain, bool thirdParty,
268 const String& sitekey, bool specificOnly) const 271 const String& sitekey, bool specificOnly) const
269 { 272 {
270 OwnedString text(location); 273 OwnedString text(location);
271 text.toLower(); 274 text.toLower();
272 auto match_re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true);
273 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); 275 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false);
274 text.match(match_re_id, reResult.get()); 276 text.match(mMatchReId, *reResult);
275 277
276 auto& candidates = reResult->candidates; 278 auto& candidates = reResult->candidates;
277 candidates.push_back(OwnedString()); 279 candidates.push_back(OwnedString());
278 280
279 FilterPtr blacklistHit; 281 FilterPtr blacklistHit;
280 for (auto substr : candidates) 282 for (auto substr : candidates)
281 { 283 {
282 auto result = mWhitelist.CheckEntryMatch( 284 auto result = mWhitelist.CheckEntryMatch(
283 substr, location, typeMask, docDomain, thirdParty, sitekey, specificOnly); 285 substr, location, typeMask, docDomain, thirdParty, sitekey, false);
284 if (result) 286 if (result)
285 return result; 287 return result;
286 288
287 if (!blacklistHit) 289 if (!blacklistHit)
288 blacklistHit = mBlacklist.CheckEntryMatch( 290 blacklistHit = mBlacklist.CheckEntryMatch(
289 substr, location, typeMask, docDomain, thirdParty, sitekey, 291 substr, location, typeMask, docDomain, thirdParty, sitekey,
290 specificOnly); 292 specificOnly);
291 } 293 }
292 return blacklistHit; 294 return blacklistHit;
293 } 295 }
LEFTRIGHT

Powered by Google App Engine
This is Rietveld