Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 /* | 1 /* |
2 * This file is part of Adblock Plus <https://adblockplus.org/>, | 2 * This file is part of Adblock Plus <https://adblockplus.org/>, |
3 * Copyright (C) 2006-present eyeo GmbH | 3 * Copyright (C) 2006-present eyeo GmbH |
4 * | 4 * |
5 * Adblock Plus is free software: you can redistribute it and/or modify | 5 * Adblock Plus is free software: you can redistribute it and/or modify |
6 * it under the terms of the GNU General Public License version 3 as | 6 * it under the terms of the GNU General Public License version 3 as |
7 * published by the Free Software Foundation. | 7 * published by the Free Software Foundation. |
8 * | 8 * |
9 * Adblock Plus is distributed in the hope that it will be useful, | 9 * Adblock Plus is distributed in the hope that it will be useful, |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 * GNU General Public License for more details. | 12 * GNU General Public License for more details. |
13 * | 13 * |
14 * You should have received a copy of the GNU General Public License | 14 * You should have received a copy of the GNU General Public License |
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
16 */ | 16 */ |
17 | 17 |
18 #include "Matcher.h" | 18 #include "Matcher.h" |
19 #include "RegExpFilter.h" | 19 #include "RegExpFilter.h" |
20 #include "../library.h" | 20 #include "../library.h" |
21 | 21 |
22 namespace { | 22 namespace { |
23 const DependentString regexpRegExp = | 23 const DependentString regexpRegExp = |
24 u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_str; | 24 u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_str; |
25 const DependentString optionsRegExp = | 25 const DependentString optionsRegExp = |
26 u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str; | 26 u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str; |
27 const DependentString candidateRegExp = | 27 const DependentString candidateRegExp = |
28 u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str; | 28 u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str; |
29 const DependentString matchRegExp = u"[a-z0-9%]{3,}"_str; | |
30 } | |
31 | |
32 Matcher::Matcher() | |
33 : mFilterByKeyword(1024), mKeywordByFilter(1024), | |
34 mReId(-1), mOptionsReId(-1), mCandidatesReId(-1) | |
35 { | |
36 mReId = GenerateRegExp(regexpRegExp, true, false); | |
37 mOptionsReId = GenerateRegExp(optionsRegExp, true, false); | |
38 mCandidatesReId = GenerateRegExp(candidateRegExp, true, true); | |
39 mMatchReId = GenerateRegExp(matchRegExp, true, true); | |
29 } | 40 } |
30 | 41 |
31 void Matcher::Add(Filter& filter) | 42 void Matcher::Add(Filter& filter) |
32 { | 43 { |
33 if (mKeywordByFilter.find(filter.GetText())) | 44 if (mKeywordByFilter.find(filter.GetText())) |
34 return; | 45 return; |
35 | 46 |
36 auto keyword = FindKeyword(filter); | 47 auto keyword = FindKeyword(filter); |
37 | 48 |
38 mFilterByKeyword[keyword].push_back(FilterPtr(&filter)); | 49 mFilterByKeyword[keyword].push_back(FilterPtr(&filter)); |
sergei
2017/10/11 09:55:16
Although the review is already closed I think it's
| |
39 mKeywordByFilter[filter.GetText()] = keyword; | 50 mKeywordByFilter[filter.GetText()] = |
51 FilterKeyword(std::move(keyword), filter); | |
40 } | 52 } |
41 | 53 |
42 void Matcher::Remove(Filter& filter) | 54 void Matcher::Remove(Filter& filter) |
43 { | 55 { |
44 auto entry = mKeywordByFilter.find(filter.GetText()); | 56 auto entry = mKeywordByFilter.find(filter.GetText()); |
45 if (!entry) | 57 if (!entry) |
46 return; | 58 return; |
47 | 59 |
48 auto keyword = entry->second; | 60 auto& keyword = static_cast<const String&>(entry->second); |
49 auto list = mFilterByKeyword[keyword]; | 61 auto list = mFilterByKeyword[keyword]; |
50 if (list.size() == 1) | 62 if (list.size() == 1) |
51 mFilterByKeyword.erase(keyword); | 63 mFilterByKeyword.erase(keyword); |
52 else | 64 else |
53 list.erase(std::find(list.cbegin(), list.cend(), FilterPtr(&filter))); | 65 list.erase(std::find(list.cbegin(), list.cend(), FilterPtr(&filter))); |
54 | 66 |
55 mKeywordByFilter.erase(filter.GetText()); | 67 mKeywordByFilter.erase(filter.GetText()); |
56 } | 68 } |
57 | 69 |
58 void Matcher::Clear() | 70 void Matcher::Clear() |
59 { | 71 { |
60 mFilterByKeyword.clear(); | 72 mFilterByKeyword.clear(); |
61 mKeywordByFilter.clear(); | 73 mKeywordByFilter.clear(); |
62 } | 74 } |
63 | 75 |
64 bool Matcher::HasFilter(const Filter& filter) const | 76 bool Matcher::HasFilter(const Filter& filter) const |
65 { | 77 { |
66 return mKeywordByFilter.find(filter.GetText()); | 78 return mKeywordByFilter.find(filter.GetText()); |
67 } | 79 } |
68 | 80 |
69 namespace | 81 namespace |
70 { | 82 { |
71 DependentString emptyString = u""_str; | 83 DependentString emptyString = u""_str; |
72 } | 84 } |
73 | 85 |
74 const String& Matcher::GetKeywordForFilter(const Filter& filter) const | 86 const String& Matcher::GetKeywordForFilter(const Filter& filter) const |
75 { | 87 { |
76 auto entry = mKeywordByFilter.find(filter.GetText()); | 88 auto entry = mKeywordByFilter.find(filter.GetText()); |
77 if (entry) | 89 if (entry) |
78 return entry->second; | 90 return static_cast<const String&>(entry->second); |
79 return emptyString; | 91 return emptyString; |
80 } | 92 } |
81 | 93 |
82 Filter* Matcher::MatchesAny(const String& location, | 94 Filter* Matcher::MatchesAny(const String& location, |
83 int typeMask, DependentString& docDomain, bool thirdParty, | 95 int typeMask, DependentString& docDomain, bool thirdParty, |
84 const String& sitekey, bool specificOnly) const | 96 const String& sitekey, bool specificOnly) const |
85 { | 97 { |
86 auto re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true); | |
87 OwnedString text(location); | 98 OwnedString text(location); |
88 text.toLower(); | 99 text.toLower(); |
89 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); | 100 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); |
90 MatchRegExp(re_id, text, reResult.get()); | 101 if (text.match(mMatchReId, *reResult)) |
91 auto& candidates = reResult->candidates; | 102 { |
92 candidates.push_back(OwnedString()); | 103 auto& candidates = reResult->candidates; |
93 for (auto substr : candidates) | 104 candidates.push_back(OwnedString()); |
94 { | 105 for (auto candidate : candidates) |
95 if (mFilterByKeyword.find(substr)) | |
96 { | 106 { |
97 auto result = CheckEntryMatch(substr, location, typeMask, docDomain, | 107 auto result = CheckEntryMatch(candidate, location, typeMask, docDomain, |
98 thirdParty, sitekey, specificOnly); | 108 thirdParty, sitekey, specificOnly); |
99 if (result) | 109 if (result) |
100 return result.release(); | 110 return result.release(); |
101 } | 111 } |
102 } | 112 } |
Wladimir Palant
2017/10/09 08:39:47
As mentioned in the issue description, we should n
sergei
2017/10/09 15:27:53
Although it merely converts the existing JS code a
Wladimir Palant
2017/10/10 07:39:05
I strongly disagree. Landing crappy code is always
| |
103 return nullptr; | 113 return nullptr; |
104 } | 114 } |
105 | 115 |
106 OwnedString Matcher::FindKeyword(const Filter& filter) const | 116 OwnedString Matcher::FindKeyword(const Filter& filter) const |
107 { | 117 { |
108 OwnedString result(u""_str); | 118 OwnedString result; |
109 OwnedString text(filter.GetText()); | 119 OwnedString text(filter.GetText()); |
110 auto re_id = GenerateRegExp(regexpRegExp, true, false); | 120 if (TestRegExp(mReId, text)) |
111 if (TestRegExp(re_id, text)) | |
112 return result; | 121 return result; |
113 | 122 |
114 // Remove options | 123 // Remove options |
115 auto options_re_id = GenerateRegExp(optionsRegExp, true, false); | 124 auto index = ExecRegExp(mOptionsReId, text); |
116 auto index = ExecRegExp(options_re_id, text); | |
117 if (index != String::npos) | 125 if (index != String::npos) |
118 text = text.substr(0, index); | 126 text = DependentString(text, 0, index); |
119 | 127 |
120 // Remove whitelist marker | 128 // Remove whitelist marker |
121 if (text.length() >= 2 && text[0] == '@' && text[1] == '@') | 129 if (text.length() >= 2 && text[0] == '@' && text[1] == '@') |
122 text = text.substr(2); | 130 text = DependentString(text, 2); |
123 | 131 |
124 text.toLower(); | 132 text.toLower(); |
125 intrusive_ptr<ReMatchResults> keywords(new ReMatchResults, false); | 133 intrusive_ptr<ReMatchResults> keywords(new ReMatchResults, false); |
126 auto candidates_re_id = GenerateRegExp(candidateRegExp, true, true); | 134 auto match = text.match(mCandidatesReId, *keywords); |
127 auto match = text.match(candidates_re_id, keywords.get()); | |
128 if (!match) | 135 if (!match) |
129 return result; | 136 return result; |
130 | 137 |
131 auto& candidates = keywords->candidates; | 138 auto& candidates = keywords->candidates; |
132 | 139 |
133 auto& hash = mFilterByKeyword; | 140 uint32_t resultCount = 0xffffff; |
134 uint32_t resultCount = 0xffffffff; | |
135 uint32_t resultLength = 0; | 141 uint32_t resultLength = 0; |
136 for (auto substr : candidates) | 142 for (auto substr : candidates) |
137 { | 143 { |
138 if (substr.empty()) | 144 if (substr.empty()) |
139 continue; | 145 continue; |
140 | 146 |
141 auto candidate = substr.substr(1); | 147 auto candidate = DependentString(substr, 1); |
142 auto entry = hash.find(candidate); | 148 auto entry = mFilterByKeyword.find(candidate); |
143 auto count = entry ? entry->second.size() : 0; | 149 auto count = entry ? entry->second.size() : 0; |
144 if (count < resultCount || | 150 if (count < resultCount || |
145 (count == resultCount && candidate.length() > resultLength)) | 151 (count == resultCount && candidate.length() > resultLength)) |
146 { | 152 { |
147 result = candidate; | 153 result = candidate; |
148 resultCount = count; | 154 resultCount = count; |
149 resultLength = candidate.length(); | 155 resultLength = candidate.length(); |
150 } | 156 } |
151 } | 157 } |
152 return result; | 158 return result; |
153 } | 159 } |
154 | 160 |
155 FilterPtr Matcher::CheckEntryMatch(const String& keyword, | 161 FilterPtr Matcher::CheckEntryMatch(const String& keyword, |
156 const String& location, | 162 const String& location, |
157 int typeMask, DependentString& docDomain, bool thirdParty, | 163 int typeMask, DependentString& docDomain, bool thirdParty, |
158 const String& sitekey, bool specificOnly) const | 164 const String& sitekey, bool specificOnly) const |
159 { | 165 { |
160 auto entry = mFilterByKeyword.find(keyword); | 166 auto entry = mFilterByKeyword.find(keyword); |
161 if (entry) | 167 if (!entry) |
162 { | 168 return FilterPtr(); |
sergei
2017/10/04 08:54:33
Earlier return would be better here, in my opinion
hub
2017/10/06 13:49:19
Done.
| |
163 auto list = entry->second; | 169 |
164 for (auto filter : list) | 170 auto filters = entry->second; |
165 { | 171 for (auto filter : filters) |
166 auto activeFilter = static_cast<ActiveFilter*>(filter.get()); | 172 { |
167 if (specificOnly && activeFilter->IsGeneric() && | 173 auto activeFilter = static_cast<ActiveFilter*>(filter.get()); |
168 !(activeFilter->mType != Filter::Type::WHITELIST)) | 174 if (specificOnly && activeFilter->IsGeneric() && |
169 continue; | 175 (activeFilter->mType != Filter::Type::WHITELIST)) |
170 | 176 continue; |
171 auto reFilter = static_cast<RegExpFilter*>(activeFilter); | 177 |
172 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) | 178 auto reFilter = static_cast<RegExpFilter*>(activeFilter); |
173 return filter; | 179 if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) |
174 } | 180 return filter; |
175 } | 181 } |
182 | |
176 return FilterPtr(); | 183 return FilterPtr(); |
177 } | 184 } |
178 | 185 |
179 const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000; | 186 const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000; |
180 | 187 |
188 CombinedMatcher::CombinedMatcher() | |
189 : mResultCache(1024), mMatchReId(-1) | |
190 { | |
191 mMatchReId = GenerateRegExp(matchRegExp, true, true); | |
192 } | |
193 | |
181 void CombinedMatcher::Add(Filter& filter) | 194 void CombinedMatcher::Add(Filter& filter) |
182 { | 195 { |
183 if (filter.mType == Filter::Type::WHITELIST) | 196 GetMatcher(filter).Add(filter); |
184 mWhitelist.Add(filter); | |
185 else | |
186 mBlacklist.Add(filter); | |
187 | |
188 ResetCache(); | 197 ResetCache(); |
189 } | 198 } |
190 | 199 |
191 void CombinedMatcher::Remove(Filter& filter) | 200 void CombinedMatcher::Remove(Filter& filter) |
192 { | 201 { |
193 if (filter.mType == Filter::Type::WHITELIST) | 202 GetMatcher(filter).Remove(filter); |
194 mWhitelist.Remove(filter); | |
195 else | |
196 mBlacklist.Remove(filter); | |
197 | |
198 ResetCache(); | 203 ResetCache(); |
199 } | 204 } |
200 | 205 |
201 void CombinedMatcher::Clear() | 206 void CombinedMatcher::Clear() |
202 { | 207 { |
203 mBlacklist.Clear(); | 208 mBlacklist.Clear(); |
204 mWhitelist.Clear(); | 209 mWhitelist.Clear(); |
205 ResetCache(); | 210 ResetCache(); |
206 } | 211 } |
207 | 212 |
208 bool CombinedMatcher::HasFilter(const Filter& filter) const | 213 bool CombinedMatcher::HasFilter(const Filter& filter) const |
209 { | 214 { |
210 return filter.mType == Filter::Type::WHITELIST ? | 215 return GetMatcher(filter).HasFilter(filter); |
211 mWhitelist.HasFilter(filter) : mBlacklist.HasFilter(filter); | |
212 } | 216 } |
213 | 217 |
214 const String& CombinedMatcher::GetKeywordForFilter(const Filter& filter) const | 218 const String& CombinedMatcher::GetKeywordForFilter(const Filter& filter) const |
215 { | 219 { |
216 return filter.mType == Filter::Type::WHITELIST ? | 220 return GetMatcher(filter).GetKeywordForFilter(filter); |
217 mWhitelist.GetKeywordForFilter(filter) : mBlacklist.GetKeywordForFilter(filter); | |
218 } | 221 } |
219 | 222 |
220 Filter* CombinedMatcher::MatchesAny(const String& location, | 223 Filter* CombinedMatcher::MatchesAny(const String& location, |
221 int typeMask, DependentString& docDomain, bool thirdParty, | 224 int typeMask, DependentString& docDomain, bool thirdParty, |
222 const String& sitekey, bool specificOnly) | 225 const String& sitekey, bool specificOnly) |
223 { | 226 { |
224 OwnedString key(location); | 227 OwnedString key(location); |
225 key.append(u" "_str); | 228 key.append(u" "_str); |
226 key.append(typeMask); | 229 key.append(typeMask); |
227 key.append(u" "_str); | 230 key.append(u" "_str); |
228 key.append(docDomain); | 231 key.append(docDomain); |
229 key.append(u" "_str); | 232 key.append(u" "_str); |
230 key.append(thirdParty); | 233 key.append(thirdParty); |
231 key.append(u" "_str); | 234 key.append(u" "_str); |
232 key.append(sitekey); | 235 key.append(sitekey); |
233 key.append(u" "_str); | 236 key.append(u" "_str); |
234 key.append(specificOnly); | 237 key.append(specificOnly); |
235 | 238 |
236 FilterPtr result; | 239 FilterPtr result; |
237 | 240 |
238 auto cachedResult = mResultCache.find(key); | 241 auto cachedResult = mResultCache.find(key); |
239 if (cachedResult) | 242 if (cachedResult) |
240 result = cachedResult->second; | 243 result = cachedResult->second.filter(); |
241 else | 244 else |
242 { | 245 { |
243 result = MatchesAnyInternal(location, typeMask, docDomain, | 246 result = MatchesAnyInternal(location, typeMask, docDomain, |
244 thirdParty, sitekey, specificOnly); | 247 thirdParty, sitekey, specificOnly); |
245 | 248 |
246 if (mResultCache.size() >= MAX_CACHE_ENTRIES) | 249 if (mResultCache.size() >= MAX_CACHE_ENTRIES) |
247 ResetCache(); | 250 ResetCache(); |
248 | 251 |
249 mResultCache[key] = result; | 252 CacheEntry cache(std::move(key), result); |
253 mResultCache[cache.key()] = cache; | |
250 } | 254 } |
251 | 255 |
252 return result.release(); | 256 return result.release(); |
253 } | 257 } |
254 | 258 |
255 OwnedString CombinedMatcher::FindKeyword(const Filter& filter) const | 259 OwnedString CombinedMatcher::FindKeyword(const Filter& filter) const |
256 { | 260 { |
257 return filter.mType == Filter::Type::WHITELIST ? | 261 return GetMatcher(filter).FindKeyword(filter); |
258 mWhitelist.FindKeyword(filter) : mBlacklist.FindKeyword(filter); | |
259 } | 262 } |
260 | 263 |
261 void CombinedMatcher::ResetCache() | 264 void CombinedMatcher::ResetCache() |
262 { | 265 { |
263 mResultCache.clear(); | 266 mResultCache.clear(); |
264 } | 267 } |
265 | 268 |
266 FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location, | 269 FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location, |
267 int typeMask, DependentString& docDomain, bool thirdParty, | 270 int typeMask, DependentString& docDomain, bool thirdParty, |
268 const String& sitekey, bool specificOnly) const | 271 const String& sitekey, bool specificOnly) const |
269 { | 272 { |
270 OwnedString text(location); | 273 OwnedString text(location); |
271 text.toLower(); | 274 text.toLower(); |
272 auto match_re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true); | |
273 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); | 275 intrusive_ptr<ReMatchResults> reResult(new ReMatchResults, false); |
274 text.match(match_re_id, reResult.get()); | 276 text.match(mMatchReId, *reResult); |
275 | 277 |
276 auto& candidates = reResult->candidates; | 278 auto& candidates = reResult->candidates; |
277 candidates.push_back(OwnedString()); | 279 candidates.push_back(OwnedString()); |
278 | 280 |
279 FilterPtr blacklistHit; | 281 FilterPtr blacklistHit; |
280 for (auto substr : candidates) | 282 for (auto substr : candidates) |
281 { | 283 { |
282 auto result = mWhitelist.CheckEntryMatch( | 284 auto result = mWhitelist.CheckEntryMatch( |
283 substr, location, typeMask, docDomain, thirdParty, sitekey, specificOnly); | 285 substr, location, typeMask, docDomain, thirdParty, sitekey, false); |
284 if (result) | 286 if (result) |
285 return result; | 287 return result; |
286 | 288 |
287 if (!blacklistHit) | 289 if (!blacklistHit) |
288 blacklistHit = mBlacklist.CheckEntryMatch( | 290 blacklistHit = mBlacklist.CheckEntryMatch( |
289 substr, location, typeMask, docDomain, thirdParty, sitekey, | 291 substr, location, typeMask, docDomain, thirdParty, sitekey, |
290 specificOnly); | 292 specificOnly); |
291 } | 293 } |
292 return blacklistHit; | 294 return blacklistHit; |
293 } | 295 } |
LEFT | RIGHT |