 Issue 29556737:
  Issue 5141 - Convert filter match to C++  (Closed) 
  Base URL: https://hg.adblockplus.org/adblockpluscore/
    
  
    Issue 29556737:
  Issue 5141 - Convert filter match to C++  (Closed) 
  Base URL: https://hg.adblockplus.org/adblockpluscore/| Index: compiled/filter/Matcher.cpp | 
| =================================================================== | 
| new file mode 100644 | 
| --- /dev/null | 
| +++ b/compiled/filter/Matcher.cpp | 
| @@ -0,0 +1,295 @@ | 
| +/* | 
| + * This file is part of Adblock Plus <https://adblockplus.org/>, | 
| + * Copyright (C) 2006-present eyeo GmbH | 
| + * | 
| + * Adblock Plus is free software: you can redistribute it and/or modify | 
| + * it under the terms of the GNU General Public License version 3 as | 
| + * published by the Free Software Foundation. | 
| + * | 
| + * Adblock Plus is distributed in the hope that it will be useful, | 
| + * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
| + * GNU General Public License for more details. | 
| + * | 
| + * You should have received a copy of the GNU General Public License | 
| + * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 
| + */ | 
| + | 
| +#include "Matcher.h" | 
| +#include "RegExpFilter.h" | 
| +#include "../library.h" | 
| + | 
| +const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000; | 
| + | 
| +OwnedString CombinedMatcher::FindKeyword(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:35
It's not important but still it would be better to
 
sergei
2017/10/02 12:02:36
the method should be const.
 
hub
2017/10/03 19:33:13
Done.
 
hub
2017/10/03 19:33:13
Done.
 | 
| +{ | 
| + if (filter->mType == Filter::Type::WHITELIST) | 
| 
sergei
2017/10/02 12:02:33
What do you think about having an inline function
 
hub
2017/10/03 19:33:11
Done.
 
sergei
2017/10/04 08:54:32
I meant that the code of CombinedMatcher::SomeMeth
 
hub
2017/10/06 13:49:17
Done.
 | 
| + return mWhitelist.FindKeyword(filter); | 
| + return mBlacklist.FindKeyword(filter); | 
| +} | 
| + | 
| +void CombinedMatcher::ResetCache() | 
| +{ | 
| + mResultCache.clear(); | 
| +} | 
| + | 
| +void CombinedMatcher::Add(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:36
Should the argument be `Filter&`?
 
hub
2017/10/03 19:33:11
Done.
 | 
| +{ | 
| + if (filter->mType == Filter::Type::WHITELIST) | 
| + mWhitelist.Add(filter); | 
| + else | 
| + mBlacklist.Add(filter); | 
| + | 
| + ResetCache(); | 
| +} | 
| + | 
| +void CombinedMatcher::Remove(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:37
Should the argument be `const Filter&`?
 
hub
2017/10/03 19:33:09
Done.
 | 
| +{ | 
| + if (filter->mType == Filter::Type::WHITELIST) | 
| + mWhitelist.Remove(filter); | 
| + else | 
| + mBlacklist.Remove(filter); | 
| + | 
| + ResetCache(); | 
| +} | 
| + | 
| +void CombinedMatcher::Clear() | 
| +{ | 
| + mBlacklist.Clear(); | 
| + mWhitelist.Clear(); | 
| + ResetCache(); | 
| +} | 
| + | 
| +bool CombinedMatcher::HasFilter(const FilterPtr& filter) const | 
| 
sergei
2017/10/02 12:02:34
Should the argument be `const Filter&`?
 
hub
2017/10/03 19:33:11
Done.
 | 
| +{ | 
| + if (filter->mType == Filter::Type::WHITELIST) | 
| + return mWhitelist.HasFilter(filter); | 
| + return mBlacklist.HasFilter(filter); | 
| +} | 
| + | 
| +const String& CombinedMatcher::GetKeywordForFilter(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:35
Should the argument be `const Filter&`?
 
sergei
2017/10/02 12:02:36
the method should be const.
 
hub
2017/10/03 19:33:11
Done.
 
hub
2017/10/03 19:33:12
Done.
 | 
| +{ | 
| + if (filter->mType == Filter::Type::WHITELIST) | 
| + return mWhitelist.GetKeywordForFilter(filter); | 
| + return mBlacklist.GetKeywordForFilter(filter); | 
| +} | 
| + | 
| +FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location, | 
| 
sergei
2017/10/02 12:02:34
the method should be const if it's possible.
 
hub
2017/10/03 19:33:10
Done.
 | 
| + int typeMask, DependentString& docDomain, bool thirdParty, | 
| + const String& sitekey, bool specificOnly) | 
| +{ | 
| + ReMatchResults reResult; | 
| + OwnedString text(location); | 
| + text.toLower(); | 
| + auto match_re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true); | 
| 
sergei
2017/10/02 12:02:36
It should be in anonymous namespace, otherwise a n
 
sergei
2017/10/04 08:54:31
This is not addressed.
 
hub
2017/10/06 13:49:17
Done.
 | 
| + text.match(match_re_id, &reResult); | 
| 
sergei
2017/10/02 12:02:35
Although it seems it does work here, I think for p
 
hub
2017/10/03 19:33:12
Done.
 | 
| + | 
| + auto& candidates = reResult.candidates; | 
| + candidates.push_back(OwnedString()); | 
| + | 
| + FilterPtr blacklistHit; | 
| + for (auto substr : candidates) | 
| + { | 
| + if (mWhitelist.mFilterByKeyword.find(substr)) | 
| 
sergei
2017/10/02 12:02:36
It's already changed in the master, do you mind to
 
hub
2017/10/03 19:33:13
Done.
 | 
| + { | 
| + auto result = mWhitelist.CheckEntryMatch( | 
| + substr, location, typeMask, docDomain, thirdParty, sitekey, specificOnly); | 
| + if (result) | 
| + return result; | 
| + } | 
| + if (mBlacklist.mFilterByKeyword.find(substr) && !blacklistHit) | 
| + { | 
| + blacklistHit = mBlacklist.CheckEntryMatch( | 
| + substr, location, typeMask, docDomain, thirdParty, sitekey, | 
| + specificOnly); | 
| + } | 
| + } | 
| + return blacklistHit; | 
| +} | 
| + | 
| +Filter* CombinedMatcher::MatchesAny(const String& location, | 
| + int typeMask, DependentString& docDomain, bool thirdParty, | 
| + const String& sitekey, bool specificOnly) | 
| 
sergei
2017/10/02 12:02:34
The method should be const if it's possible.
 
hub
2017/10/03 19:33:11
sadly the use of the cache makes it non-const. I c
 | 
| +{ | 
| + OwnedString key(location); | 
| + key.append(u" "_str); | 
| + key.append(typeMask); | 
| + key.append(u" "_str); | 
| + key.append(docDomain); | 
| + key.append(u" "_str); | 
| + key.append(thirdParty); | 
| + key.append(u" "_str); | 
| + key.append(sitekey); | 
| + key.append(u" "_str); | 
| + key.append(specificOnly); | 
| + | 
| + FilterPtr result; | 
| + | 
| + auto cachedResult = mResultCache.find(key); | 
| + if (cachedResult) | 
| + result = cachedResult->second; | 
| + else | 
| + { | 
| + result = MatchesAnyInternal(location, typeMask, docDomain, | 
| + thirdParty, sitekey, specificOnly); | 
| + | 
| + if (mResultCache.size() >= MAX_CACHE_ENTRIES) | 
| + ResetCache(); | 
| + | 
| + mResultCache[key] = result; | 
| + } | 
| + | 
| + result->AddRef(); | 
| + return result.get(); | 
| 
sergei
2017/10/02 12:02:34
It would be better to `return result.release();`.
 
hub
2017/10/03 19:33:10
Done.
 | 
| +} | 
| + | 
| +namespace { | 
| + const DependentString regexpRegExp = | 
| + u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_str; | 
| + const DependentString optionsRegExp = | 
| + u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str; | 
| + const DependentString candidateRegExp = | 
| + u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str; | 
| +} | 
| + | 
| +OwnedString Matcher::FindKeyword(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:34
Should the argument be `const Filter&`?
 
sergei
2017/10/02 12:02:36
should it be a const method?
 
hub
2017/10/03 19:33:11
Done.
 
hub
2017/10/03 19:33:12
Done.
 | 
| +{ | 
| + OwnedString result(u""_str); | 
| + OwnedString text(filter->GetText()); | 
| + auto re_id = GenerateRegExp(DependentString(regexpRegExp), true, false); | 
| 
sergei
2017/10/02 12:02:37
It and all other regexps below should be in the an
 
hub
2017/10/03 19:33:14
The mistake here is that a create a new DependentS
 
sergei
2017/10/04 08:54:32
Each call of GenerateRegExp increases global _rege
 
hub
2017/10/06 13:49:16
Done.
 | 
| + if (TestRegExp(re_id, text)) | 
| + return result; | 
| + | 
| + // Remove options | 
| + auto options_re_id = GenerateRegExp(DependentString(optionsRegExp), true, false); | 
| + auto index = ExecRegExp(options_re_id, text); | 
| + if (index != -1) | 
| 
sergei
2017/10/02 12:02:34
It would be better to use String::npos than -1.
 
hub
2017/10/03 19:33:13
Done.
 | 
| + text = text.substr(0, index); | 
| + | 
| + // Remove whitelist marker | 
| + if (text[0] == '@' && text[1] == '@') | 
| 
sergei
2017/10/02 12:02:37
Firstly we should check the length of the `text`.
 
hub
2017/10/03 19:33:11
Done.
 | 
| + text = text.substr(2); | 
| + | 
| + text.toLower(); | 
| + ReMatchResults keywords; | 
| + auto candidates_re_id = GenerateRegExp(candidateRegExp, true, true); | 
| + auto match = text.match(candidates_re_id, &keywords); | 
| + if (!match) | 
| + return result; | 
| + | 
| + auto& candidates = keywords.candidates; | 
| + | 
| + auto& hash = mFilterByKeyword; | 
| + uint32_t resultCount = 0xffffffff; | 
| + uint32_t resultLength = 0; | 
| + for (auto substr : candidates) | 
| + { | 
| + auto candidate = DependentString(substr).substr(1); | 
| + auto count = (hash.find(candidate) ? hash[candidate].size() : 0); | 
| 
sergei
2017/10/02 12:02:35
Basically braces are not needed here.
 
sergei
2017/10/02 12:02:37
It seems it could be optimized by
auto ii_hash = h
 
hub
2017/10/03 19:33:12
I have to do that for to make the function `const`
 
hub
2017/10/03 19:33:13
Done.
 
sergei
2017/10/04 08:54:32
It's just a side effect of the present code, there
 
hub
2017/10/06 13:49:16
I addressed that. Just as I said making this const
 | 
| + if (count < resultCount || | 
| + (count == resultCount && candidate.length() > resultLength)) | 
| + { | 
| + result = candidate; | 
| + resultCount = count; | 
| + resultLength = candidate.length(); | 
| + } | 
| + } | 
| + | 
| + return result; | 
| +} | 
| + | 
| +void Matcher::Add(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:36
What about passing `Filter&`?
 
hub
2017/10/03 19:33:10
Done.
 | 
| +{ | 
| + if (mKeywordByFilter.find(filter->GetText())) | 
| + return; | 
| + | 
| + auto keyword = FindKeyword(filter); | 
| + auto oldEntry = mFilterByKeyword.find(keyword); | 
| + if (!oldEntry) | 
| + mFilterByKeyword[keyword] = std::vector<FilterPtr>{filter}; | 
| + else | 
| + mFilterByKeyword[keyword].push_back(filter); | 
| 
sergei
2017/10/02 12:02:37
StringMap::operator[](const String& key) creates a
 
hub
2017/10/03 19:33:09
Done.
 | 
| + mKeywordByFilter[filter->GetText()] = keyword; | 
| 
sergei
2017/10/02 12:02:34
mKeywordByFilter stores DependentString, what if t
 
sergei
2017/10/04 08:54:32
What about having some
struct FilterKeyword
{
  Fi
 
hub
2017/10/06 13:49:17
Sounds like a good idea. Done.
 | 
| +} | 
| + | 
| +void Matcher::Remove(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:37
It seems the argument can be a const reference.
 
hub
2017/10/03 19:33:09
Done.
 | 
| +{ | 
| + if (!mKeywordByFilter.find(filter->GetText())) | 
| + return; | 
| + | 
| + auto keyword = mKeywordByFilter[filter->GetText()]; | 
| 
sergei
2017/10/02 12:02:37
There is also no need for double looking up.
 
hub
2017/10/03 19:33:12
Done.
 | 
| + auto list = mFilterByKeyword[keyword]; | 
| + if (list.size() == 1) | 
| + mFilterByKeyword.erase(keyword); | 
| + else | 
| + { | 
| + auto iter = std::find(list.cbegin(), list.cend(), filter); | 
| + list.erase(iter); | 
| 
sergei
2017/10/02 12:02:35
It can be one line but it does not matter.
 
hub
2017/10/03 19:33:10
Done.
 | 
| + } | 
| + mKeywordByFilter.erase(filter->GetText()); | 
| +} | 
| + | 
| +void Matcher::Clear() | 
| +{ | 
| + mFilterByKeyword.clear(); | 
| + mKeywordByFilter.clear(); | 
| +} | 
| + | 
| +bool Matcher::HasFilter(const FilterPtr& filter) const | 
| 
sergei
2017/10/02 12:02:35
the argument should be a const reference.
 
hub
2017/10/03 19:33:09
Done.
 | 
| +{ | 
| + return mKeywordByFilter.find(filter->GetText()); | 
| +} | 
| + | 
| +static DependentString emptyString = u""_str; | 
| 
sergei
2017/10/02 12:02:37
Although static in the compilation unit achieves t
 
hub
2017/10/03 19:33:10
Done.
 | 
| + | 
| +const String& Matcher::GetKeywordForFilter(const FilterPtr& filter) | 
| 
sergei
2017/10/02 12:02:36
the argument should be a const reference and the m
 
hub
2017/10/03 19:33:12
Done.
 | 
| +{ | 
| + if (mKeywordByFilter.find(filter->GetText())) | 
| + return mKeywordByFilter[filter->GetText()]; | 
| + return emptyString; | 
| 
sergei
2017/10/02 12:02:37
There is also no need for double looking up.
 
hub
2017/10/03 19:33:13
Done (needed for making the method `const`)
 | 
| +} | 
| + | 
| +FilterPtr Matcher::CheckEntryMatch(const String& keyword, | 
| + const String& location, | 
| + int typeMask, DependentString& docDomain, bool thirdParty, | 
| + const String& sitekey, bool specificOnly) | 
| 
sergei
2017/10/02 12:02:34
basically this method and the one below do not mod
 
hub
2017/10/03 19:33:10
Done.
 | 
| +{ | 
| + auto list = mFilterByKeyword[keyword]; | 
| + for (auto filter : list) { | 
| + auto activeFilter = static_cast<ActiveFilter*>(filter.get()); | 
| 
sergei
2017/10/02 12:02:35
opening brace { should be on the new line.
 
hub
2017/10/03 19:33:14
Done.
 | 
| + if (specificOnly && activeFilter->IsGeneric() && | 
| + !(activeFilter->mType != Filter::Type::WHITELIST)) | 
| + continue; | 
| + | 
| + auto reFilter = static_cast<RegExpFilter*>(activeFilter); | 
| + if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey)) | 
| + return filter; | 
| + } | 
| + return FilterPtr(); | 
| +} | 
| + | 
| +Filter* Matcher::MatchesAny(const String& location, | 
| + int typeMask, DependentString& docDomain, bool thirdParty, | 
| + const String& sitekey, bool specificOnly) | 
| +{ | 
| + ReMatchResults reResult; | 
| + auto re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true); | 
| + OwnedString text(location); | 
| + text.toLower(); | 
| + MatchRegExp(re_id, text, &reResult); | 
| + auto& candidates = reResult.candidates; | 
| + candidates.push_back(OwnedString()); | 
| + for (auto substr : candidates) | 
| + if (mFilterByKeyword.find(substr)) | 
| + { | 
| + auto result = CheckEntryMatch(substr, location, typeMask, docDomain, | 
| + thirdParty, sitekey, specificOnly); | 
| + if (result) | 
| + { | 
| + result->AddRef(); | 
| + return result.get(); | 
| 
sergei
2017/10/02 12:02:36
just return `result.release();`
 
hub
2017/10/03 19:33:12
Done.
 | 
| + } | 
| + } | 
| + | 
| + return nullptr; | 
| +} |