Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 /* | 1 /* |
2 * This file is part of Adblock Plus <https://adblockplus.org/>, | 2 * This file is part of Adblock Plus <https://adblockplus.org/>, |
3 * Copyright (C) 2006-present eyeo GmbH | 3 * Copyright (C) 2006-present eyeo GmbH |
4 * | 4 * |
5 * Adblock Plus is free software: you can redistribute it and/or modify | 5 * Adblock Plus is free software: you can redistribute it and/or modify |
6 * it under the terms of the GNU General Public License version 3 as | 6 * it under the terms of the GNU General Public License version 3 as |
7 * published by the Free Software Foundation. | 7 * published by the Free Software Foundation. |
8 * | 8 * |
9 * Adblock Plus is distributed in the hope that it will be useful, | 9 * Adblock Plus is distributed in the hope that it will be useful, |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 * GNU General Public License for more details. | 12 * GNU General Public License for more details. |
13 * | 13 * |
14 * You should have received a copy of the GNU General Public License | 14 * You should have received a copy of the GNU General Public License |
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
16 */ | 16 */ |
17 | 17 |
18 #include "Matcher.h" | 18 #include "Matcher.h" |
19 #include "RegExpFilter.h" | 19 #include "RegExpFilter.h" |
20 #include "../library.h" | 20 #include "../library.h" |
21 | 21 |
22 class CombinedMatcher : public MatcherBase | 22 namespace { |
23 { | 23 const DependentString regexpRegExp = |
24 private: | 24 u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_str; |
25 StringMap<Filter*> mResultCache; | 25 const DependentString optionsRegExp = |
26 static const size_t MAX_CACHE_ENTRIES = 1000; | 26 u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str; |
27 int mMatchReId; | 27 const DependentString candidateRegExp = |
28 Matcher mBlacklist; | 28 u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str; |
29 Matcher mWhitelist; | 29 const DependentString matchRegExp = u"[a-z0-9%]{3,}"_str; |
30 | 30 } |
31 protected: | 31 |
32 OwnedString FindKeyword(const FilterPtr& filter) override | 32 Matcher::Matcher() |
33 { | 33 : mFilterByKeyword(1024), mKeywordByFilter(1024), |
34 if (filter->mType == Filter::Type::WHITELIST) | 34 mReId(-1), mOptionsReId(-1), mCandidatesReId(-1) |
35 return mWhitelist.FindKeyword(filter); | 35 { |
36 return mBlacklist.FindKeyword(filter); | 36 mReId = GenerateRegExp(regexpRegExp, true, false); |
37 } | 37 mOptionsReId = GenerateRegExp(optionsRegExp, true, false); |
38 | 38 mCandidatesReId = GenerateRegExp(candidateRegExp, true, true); |
39 public: | 39 mMatchReId = GenerateRegExp(matchRegExp, true, true); |
40 CombinedMatcher() | 40 } |
41 : mMatchReId(GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true)) | 41 |
42 { | 42 void Matcher::Add(Filter& filter) |
43 } | 43 { |
44 | 44 if (mKeywordByFilter.find(filter.GetText())) |
45 ~CombinedMatcher() | 45 return; |
46 { | 46 |
47 DeleteRegExp(mMatchReId); | 47 auto keyword = FindKeyword(filter); |
48 } | 48 |
49 | 49 mFilterByKeyword[keyword].push_back(FilterPtr(&filter)); |
sergei
2017/10/11 09:55:16
Although the review is already closed I think it's
| |
50 void ResetCache() | 50 mKeywordByFilter[filter.GetText()] = |
51 { | 51 FilterKeyword(std::move(keyword), filter); |
52 for (auto filter : mResultCache) | 52 } |
53 filter.second->ReleaseRef(); | 53 |
54 mResultCache.clear(); | 54 void Matcher::Remove(Filter& filter) |
55 } | 55 { |
56 | 56 auto entry = mKeywordByFilter.find(filter.GetText()); |
57 void Add(const FilterPtr& filter) override | 57 if (!entry) |
58 { | 58 return; |
59 if (filter->mType == Filter::Type::WHITELIST) | 59 |
60 mWhitelist.Add(filter); | 60 auto& keyword = static_cast<const String&>(entry->second); |
61 else | 61 auto list = mFilterByKeyword[keyword]; |
62 mBlacklist.Add(filter); | 62 if (list.size() == 1) |
63 | 63 mFilterByKeyword.erase(keyword); |
64 ResetCache(); | 64 else |
65 } | 65 list.erase(std::find(list.cbegin(), list.cend(), FilterPtr(&filter))); |
66 | 66 |
67 void Remove(const FilterPtr& filter) override | 67 mKeywordByFilter.erase(filter.GetText()); |
68 { | 68 } |
69 if (filter->mType == Filter::Type::WHITELIST) | 69 |
70 mWhitelist.Remove(filter); | 70 void Matcher::Clear() |
71 else | 71 { |
72 mBlacklist.Remove(filter); | 72 mFilterByKeyword.clear(); |
73 | 73 mKeywordByFilter.clear(); |
74 ResetCache(); | 74 } |
75 } | 75 |
76 | 76 bool Matcher::HasFilter(const Filter& filter) const |
77 void Clear() override | 77 { |
78 { | 78 return mKeywordByFilter.find(filter.GetText()); |
79 mBlacklist.Clear(); | 79 } |
80 mWhitelist.Clear(); | 80 |
81 ResetCache(); | 81 namespace |
82 } | 82 { |
83 | 83 DependentString emptyString = u""_str; |
84 bool HasFilter(const FilterPtr& filter) const override | 84 } |
85 { | 85 |
86 if (filter->mType == Filter::Type::WHITELIST) | 86 const String& Matcher::GetKeywordForFilter(const Filter& filter) const |
87 return mWhitelist.HasFilter(filter); | 87 { |
88 return mBlacklist.HasFilter(filter); | 88 auto entry = mKeywordByFilter.find(filter.GetText()); |
89 } | 89 if (entry) |
90 | 90 return static_cast<const String&>(entry->second); |
91 const String& GetKeywordForFilter(const FilterPtr& filter) override | 91 return emptyString; |
92 { | 92 } |
93 if (filter->mType == Filter::Type::WHITELIST) | 93 |
94 return mWhitelist.GetKeywordForFilter(filter); | 94 Filter* Matcher::MatchesAny(const String& location, |
95 return mBlacklist.GetKeywordForFilter(filter); | 95 int typeMask, DependentString& docDomain, bool thirdParty, |
96 } | 96 const String& sitekey, bool specificOnly) const |
97 | 97 { |
98 private: | 98 OwnedString text(location); |
99 Filter* MatchesAnyInternal(const String& location, | 99 text.toLower(); |
100 int typeMask, DependentString& docDomain, bool thirdParty, | 100 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); |
101 const String& sitekey, bool specificOnly) | 101 if (text.match(mMatchReId, *reResult)) |
102 { | 102 { |
103 ReMatchResults reResult; | 103 auto& candidates = reResult->candidates; |
104 OwnedString text(location); | |
105 text.toLower(); | |
106 text.match(mMatchReId, &reResult); | |
107 | |
108 auto& candidates = reResult.candidates; | |
109 candidates.push_back(OwnedString()); | 104 candidates.push_back(OwnedString()); |
110 | 105 for (auto candidate : candidates) |
111 Filter* blacklistHit = nullptr; | |
112 for (auto substr : candidates) | |
113 { | 106 { |
114 if (mWhitelist.mFilterByKeyword.find(substr)) | 107 auto result = CheckEntryMatch(candidate, location, typeMask, docDomain, |
115 { | 108 thirdParty, sitekey, specificOnly); |
116 auto result = mWhitelist.CheckEntryMatch( | 109 if (result) |
117 substr, location, typeMask, docDomain, thirdParty, sitekey, specificOnly); | 110 return result.release(); |
118 if (result) | |
119 return result; | |
120 } | |
121 if (mBlacklist.mFilterByKeyword.find(substr) && !blacklistHit) | |
122 { | |
123 blacklistHit = mBlacklist.CheckEntryMatch( | |
124 substr, location, typeMask, docDomain, thirdParty, sitekey, | |
125 specificOnly); | |
126 } | |
127 } | 111 } |
128 return blacklistHit; | 112 } |
Wladimir Palant
2017/10/09 08:39:47
As mentioned in the issue description, we should n
sergei
2017/10/09 15:27:53
Although it merely converts the existing JS code a
Wladimir Palant
2017/10/10 07:39:05
I strongly disagree. Landing crappy code is always
| |
129 } | 113 return nullptr; |
130 | 114 } |
131 public: | 115 |
132 Filter* MatchesAny(const String& location, | 116 OwnedString Matcher::FindKeyword(const Filter& filter) const |
133 int typeMask, DependentString& docDomain, bool thirdParty, | 117 { |
134 const String& sitekey, bool specificOnly) override | 118 OwnedString result; |
135 { | 119 OwnedString text(filter.GetText()); |
136 OwnedString key(location); | 120 if (TestRegExp(mReId, text)) |
137 key.append(u" "_str); | |
138 key.append(typeMask); | |
139 key.append(u" "_str); | |
140 key.append(docDomain); | |
141 key.append(u" "_str); | |
142 key.append(thirdParty); | |
143 key.append(u" "_str); | |
144 key.append(sitekey); | |
145 key.append(u" "_str); | |
146 key.append(specificOnly); | |
147 | |
148 auto cachedResult = mResultCache.find(key); | |
149 if (cachedResult) | |
150 { | |
151 cachedResult->second->AddRef(); | |
152 return cachedResult->second; | |
153 } | |
154 | |
155 Filter* result = MatchesAnyInternal(location, typeMask, docDomain, | |
156 thirdParty, sitekey, specificOnly); | |
157 | |
158 if (mResultCache.size() >= MAX_CACHE_ENTRIES) | |
159 ResetCache(); | |
160 | |
161 result->AddRef(); | |
162 mResultCache[key] = result; | |
163 | |
164 result->AddRef(); | |
165 return result; | |
166 } | |
167 }; | |
168 | |
169 MatcherBase* MatcherBase::mInstance = new CombinedMatcher; | |
170 | |
171 Matcher::Matcher() | |
172 : mFilterReId(GenerateRegExp(DependentString(Filter::regexpRegExp), true, false)) | |
173 , mOptionsReId(GenerateRegExp(DependentString(Filter::optionsRegExp), true, false)) | |
174 , mCandidatesReId(GenerateRegExp(u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str, true, true)) | |
175 { | |
176 } | |
177 | |
178 Matcher::~Matcher() | |
179 { | |
180 DeleteRegExp(mFilterReId); | |
181 DeleteRegExp(mOptionsReId); | |
182 DeleteRegExp(mCandidatesReId); | |
183 } | |
184 | |
185 OwnedString Matcher::FindKeyword(const FilterPtr& filter) | |
186 { | |
187 OwnedString result(u""_str); | |
188 OwnedString text(filter->GetText()); | |
189 if (TestRegExp(mFilterReId, text)) | |
190 return result; | 121 return result; |
191 | 122 |
192 // Remove options | 123 // Remove options |
193 auto index = ExecRegExp(mOptionsReId, text); | 124 auto index = ExecRegExp(mOptionsReId, text); |
194 if (index != -1) | 125 if (index != String::npos) |
195 text = text.substr(0, index); | 126 text = DependentString(text, 0, index); |
196 | 127 |
197 // Remove whitelist marker | 128 // Remove whitelist marker |
198 if (text[0] == '@' && text[1] == '@') | 129 if (text.length() >= 2 && text[0] == '@' && text[1] == '@') |
199 text = text.substr(2); | 130 text = DependentString(text, 2); |
200 | 131 |
201 text.toLower(); | 132 text.toLower(); |
202 ReMatchResults keywords; | 133 intrusive_ptr<ReMatchResults> keywords(new ReMatchResults, false); |
203 auto match = text.match(mCandidatesReId, &keywords); | 134 auto match = text.match(mCandidatesReId, *keywords); |
204 if (!match) | 135 if (!match) |
205 return result; | 136 return result; |
206 | 137 |
207 auto& candidates = keywords.candidates; | 138 auto& candidates = keywords->candidates; |
208 | 139 |
209 auto& hash = mFilterByKeyword; | 140 uint32_t resultCount = 0xffffff; |
210 uint32_t resultCount = 0xffffffff; | |
211 uint32_t resultLength = 0; | 141 uint32_t resultLength = 0; |
212 for (auto substr : candidates) | 142 for (auto substr : candidates) |
213 { | 143 { |
214 auto candidate = DependentString(substr).substr(1); | 144 if (substr.empty()) |
215 auto count = (hash.find(candidate) ? hash[candidate].size() : 0); | 145 continue; |
146 | |
147 auto candidate = DependentString(substr, 1); | |
148 auto entry = mFilterByKeyword.find(candidate); | |
149 auto count = entry ? entry->second.size() : 0; | |
216 if (count < resultCount || | 150 if (count < resultCount || |
217 (count == resultCount && candidate.length() > resultLength)) | 151 (count == resultCount && candidate.length() > resultLength)) |
218 { | 152 { |
219 result = candidate; | 153 result = candidate; |
220 resultCount = count; | 154 resultCount = count; |
221 resultLength = candidate.length(); | 155 resultLength = candidate.length(); |
222 } | 156 } |
223 } | 157 } |
224 | |
225 return result; | 158 return result; |
226 } | 159 } |
227 | 160 |
228 void Matcher::Add(const FilterPtr& filter) | 161 FilterPtr Matcher::CheckEntryMatch(const String& keyword, |
229 { | |
230 if (mKeywordByFilter.find(filter->GetText())) | |
231 return; | |
232 | |
233 auto keyword = FindKeyword(filter); | |
234 auto oldEntry = mFilterByKeyword.find(keyword); | |
235 if (!oldEntry) | |
236 mFilterByKeyword[keyword] = std::vector<FilterPtr>{filter}; | |
237 else | |
238 mFilterByKeyword[keyword].push_back(filter); | |
239 mKeywordByFilter[filter->GetText()] = keyword; | |
240 } | |
241 | |
242 void Matcher::Remove(const FilterPtr& filter) | |
243 { | |
244 if (!mKeywordByFilter.find(filter->GetText())) | |
245 return; | |
246 | |
247 auto keyword = mKeywordByFilter[filter->GetText()]; | |
248 auto list = mFilterByKeyword[keyword]; | |
249 if (list.size() == 1) | |
250 mFilterByKeyword.erase(keyword); | |
251 else | |
252 { | |
253 auto iter = std::find(list.cbegin(), list.cend(), filter); | |
254 list.erase(iter); | |
255 } | |
256 mKeywordByFilter.erase(filter->GetText()); | |
257 } | |
258 | |
259 void Matcher::Clear() | |
260 { | |
261 mFilterByKeyword.clear(); | |
262 mKeywordByFilter.clear(); | |
263 } | |
264 | |
265 bool Matcher::HasFilter(const FilterPtr& filter) const | |
266 { | |
267 return mKeywordByFilter.find(filter->GetText()); | |
268 } | |
269 | |
270 static DependentString emptyString = u""_str; | |
271 | |
272 const String& Matcher::GetKeywordForFilter(const FilterPtr& filter) | |
273 { | |
274 if (mKeywordByFilter.find(filter->GetText())) | |
275 return mKeywordByFilter[filter->GetText()]; | |
276 return emptyString; | |
277 } | |
278 | |
279 Filter* Matcher::CheckEntryMatch(const String& keyword, | |
280 const String& location, | 162 const String& location, |
281 int typeMask, DependentString& docDomain, bool thirdParty, | 163 int typeMask, DependentString& docDomain, bool thirdParty, |
282 const String& sitekey, bool specificOnly) | 164 const String& sitekey, bool specificOnly) const |
283 { | 165 { |
284 auto list = mFilterByKeyword[keyword]; | 166 auto entry = mFilterByKeyword.find(keyword); |
285 for (auto filter : list) { | 167 if (!entry) |
168 return FilterPtr(); | |
169 | |
170 auto filters = entry->second; | |
171 for (auto filter : filters) | |
172 { | |
286 auto activeFilter = static_cast<ActiveFilter*>(filter.get()); | 173 auto activeFilter = static_cast<ActiveFilter*>(filter.get()); |
287 if (specificOnly && activeFilter->IsGeneric() && | 174 if (specificOnly && activeFilter->IsGeneric() && |
288 !(activeFilter->mType != Filter::Type::WHITELIST)) | 175 (activeFilter->mType != Filter::Type::WHITELIST)) |
289 continue; | 176 continue; |
290 | 177 |
291 auto reFilter = static_cast<RegExpFilter*>(activeFilter); | 178 auto reFilter = static_cast<RegExpFilter*>(activeFilter); |
292 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) | 179 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) |
293 return filter.get(); | 180 return filter; |
294 } | 181 } |
295 return nullptr; | 182 |
296 } | 183 return FilterPtr(); |
297 | 184 } |
298 Filter* Matcher::MatchesAny(const String& location, | 185 |
186 const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000; | |
187 | |
188 CombinedMatcher::CombinedMatcher() | |
189 : mResultCache(1024), mMatchReId(-1) | |
190 { | |
191 mMatchReId = GenerateRegExp(matchRegExp, true, true); | |
192 } | |
193 | |
194 void CombinedMatcher::Add(Filter& filter) | |
195 { | |
196 GetMatcher(filter).Add(filter); | |
197 ResetCache(); | |
198 } | |
199 | |
200 void CombinedMatcher::Remove(Filter& filter) | |
201 { | |
202 GetMatcher(filter).Remove(filter); | |
203 ResetCache(); | |
204 } | |
205 | |
206 void CombinedMatcher::Clear() | |
207 { | |
208 mBlacklist.Clear(); | |
209 mWhitelist.Clear(); | |
210 ResetCache(); | |
211 } | |
212 | |
213 bool CombinedMatcher::HasFilter(const Filter& filter) const | |
214 { | |
215 return GetMatcher(filter).HasFilter(filter); | |
216 } | |
217 | |
218 const String& CombinedMatcher::GetKeywordForFilter(const Filter& filter) const | |
219 { | |
220 return GetMatcher(filter).GetKeywordForFilter(filter); | |
221 } | |
222 | |
223 Filter* CombinedMatcher::MatchesAny(const String& location, | |
299 int typeMask, DependentString& docDomain, bool thirdParty, | 224 int typeMask, DependentString& docDomain, bool thirdParty, |
300 const String& sitekey, bool specificOnly) | 225 const String& sitekey, bool specificOnly) |
301 { | 226 { |
302 ReMatchResults reResult; | 227 OwnedString key(location); |
303 auto re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true); | 228 key.append(u" "_str); |
229 key.append(typeMask); | |
230 key.append(u" "_str); | |
231 key.append(docDomain); | |
232 key.append(u" "_str); | |
233 key.append(thirdParty); | |
234 key.append(u" "_str); | |
235 key.append(sitekey); | |
236 key.append(u" "_str); | |
237 key.append(specificOnly); | |
238 | |
239 FilterPtr result; | |
240 | |
241 auto cachedResult = mResultCache.find(key); | |
242 if (cachedResult) | |
243 result = cachedResult->second.filter(); | |
244 else | |
245 { | |
246 result = MatchesAnyInternal(location, typeMask, docDomain, | |
247 thirdParty, sitekey, specificOnly); | |
248 | |
249 if (mResultCache.size() >= MAX_CACHE_ENTRIES) | |
250 ResetCache(); | |
251 | |
252 CacheEntry cache(std::move(key), result); | |
253 mResultCache[cache.key()] = cache; | |
254 } | |
255 | |
256 return result.release(); | |
257 } | |
258 | |
259 OwnedString CombinedMatcher::FindKeyword(const Filter& filter) const | |
260 { | |
261 return GetMatcher(filter).FindKeyword(filter); | |
262 } | |
263 | |
264 void CombinedMatcher::ResetCache() | |
265 { | |
266 mResultCache.clear(); | |
267 } | |
268 | |
269 FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location, | |
270 int typeMask, DependentString& docDomain, bool thirdParty, | |
271 const String& sitekey, bool specificOnly) const | |
272 { | |
304 OwnedString text(location); | 273 OwnedString text(location); |
305 text.toLower(); | 274 text.toLower(); |
306 MatchRegExp(re_id, text, &reResult); | 275 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); |
307 auto& candidates = reResult.candidates; | 276 text.match(mMatchReId, *reResult); |
277 | |
278 auto& candidates = reResult->candidates; | |
308 candidates.push_back(OwnedString()); | 279 candidates.push_back(OwnedString()); |
280 | |
281 FilterPtr blacklistHit; | |
309 for (auto substr : candidates) | 282 for (auto substr : candidates) |
310 if (mFilterByKeyword.find(substr)) | 283 { |
311 { | 284 auto result = mWhitelist.CheckEntryMatch( |
312 auto result = CheckEntryMatch(substr, location, typeMask, docDomain, | 285 substr, location, typeMask, docDomain, thirdParty, sitekey, false); |
313 thirdParty, sitekey, specificOnly); | 286 if (result) |
314 if (result) | 287 return result; |
315 { | 288 |
316 result->AddRef(); | 289 if (!blacklistHit) |
317 return result; | 290 blacklistHit = mBlacklist.CheckEntryMatch( |
318 } | 291 substr, location, typeMask, docDomain, thirdParty, sitekey, |
319 } | 292 specificOnly); |
320 | 293 } |
321 return nullptr; | 294 return blacklistHit; |
322 } | 295 } |
LEFT | RIGHT |