Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: compiled/filter/Matcher.cpp

Issue 29556737: Issue 5141 - Convert filter match to C++ (Closed) Base URL: https://hg.adblockplus.org/adblockpluscore/
Patch Set: Some more cleanup Created Sept. 29, 2017, 4:12 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: compiled/filter/Matcher.cpp
===================================================================
new file mode 100644
--- /dev/null
+++ b/compiled/filter/Matcher.cpp
@@ -0,0 +1,295 @@
+/*
+ * This file is part of Adblock Plus <https://adblockplus.org/>,
+ * Copyright (C) 2006-present eyeo GmbH
+ *
+ * Adblock Plus is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * Adblock Plus is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "Matcher.h"
+#include "RegExpFilter.h"
+#include "../library.h"
+
+const size_t CombinedMatcher::MAX_CACHE_ENTRIES = 1000;
+
+OwnedString CombinedMatcher::FindKeyword(const FilterPtr& filter)
sergei 2017/10/02 12:02:35 It's not important but still it would be better to
sergei 2017/10/02 12:02:36 the method should be const.
hub 2017/10/03 19:33:13 Done.
hub 2017/10/03 19:33:13 Done.
+{
+ if (filter->mType == Filter::Type::WHITELIST)
sergei 2017/10/02 12:02:33 What do you think about having an inline function
hub 2017/10/03 19:33:11 Done.
sergei 2017/10/04 08:54:32 I meant that the code of CombinedMatcher::SomeMeth
hub 2017/10/06 13:49:17 Done.
+ return mWhitelist.FindKeyword(filter);
+ return mBlacklist.FindKeyword(filter);
+}
+
+void CombinedMatcher::ResetCache()
+{
+ mResultCache.clear();
+}
+
+void CombinedMatcher::Add(const FilterPtr& filter)
sergei 2017/10/02 12:02:36 Should the argument be `Filter&`?
hub 2017/10/03 19:33:11 Done.
+{
+ if (filter->mType == Filter::Type::WHITELIST)
+ mWhitelist.Add(filter);
+ else
+ mBlacklist.Add(filter);
+
+ ResetCache();
+}
+
+void CombinedMatcher::Remove(const FilterPtr& filter)
sergei 2017/10/02 12:02:37 Should the argument be `const Filter&`?
hub 2017/10/03 19:33:09 Done.
+{
+ if (filter->mType == Filter::Type::WHITELIST)
+ mWhitelist.Remove(filter);
+ else
+ mBlacklist.Remove(filter);
+
+ ResetCache();
+}
+
+void CombinedMatcher::Clear()
+{
+ mBlacklist.Clear();
+ mWhitelist.Clear();
+ ResetCache();
+}
+
+bool CombinedMatcher::HasFilter(const FilterPtr& filter) const
sergei 2017/10/02 12:02:34 Should the argument be `const Filter&`?
hub 2017/10/03 19:33:11 Done.
+{
+ if (filter->mType == Filter::Type::WHITELIST)
+ return mWhitelist.HasFilter(filter);
+ return mBlacklist.HasFilter(filter);
+}
+
+const String& CombinedMatcher::GetKeywordForFilter(const FilterPtr& filter)
sergei 2017/10/02 12:02:35 Should the argument be `const Filter&`?
sergei 2017/10/02 12:02:36 the method should be const.
hub 2017/10/03 19:33:11 Done.
hub 2017/10/03 19:33:12 Done.
+{
+ if (filter->mType == Filter::Type::WHITELIST)
+ return mWhitelist.GetKeywordForFilter(filter);
+ return mBlacklist.GetKeywordForFilter(filter);
+}
+
+FilterPtr CombinedMatcher::MatchesAnyInternal(const String& location,
sergei 2017/10/02 12:02:34 the method should be const if it's possible.
hub 2017/10/03 19:33:10 Done.
+ int typeMask, DependentString& docDomain, bool thirdParty,
+ const String& sitekey, bool specificOnly)
+{
+ ReMatchResults reResult;
+ OwnedString text(location);
+ text.toLower();
+ auto match_re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true);
sergei 2017/10/02 12:02:36 It should be in anonymous namespace, otherwise a n
sergei 2017/10/04 08:54:31 This is not addressed.
hub 2017/10/06 13:49:17 Done.
+ text.match(match_re_id, &reResult);
sergei 2017/10/02 12:02:35 Although it seems it does work here, I think for p
hub 2017/10/03 19:33:12 Done.
+
+ auto& candidates = reResult.candidates;
+ candidates.push_back(OwnedString());
+
+ FilterPtr blacklistHit;
+ for (auto substr : candidates)
+ {
+ if (mWhitelist.mFilterByKeyword.find(substr))
sergei 2017/10/02 12:02:36 It's already changed in the master, do you mind to
hub 2017/10/03 19:33:13 Done.
+ {
+ auto result = mWhitelist.CheckEntryMatch(
+ substr, location, typeMask, docDomain, thirdParty, sitekey, specificOnly);
+ if (result)
+ return result;
+ }
+ if (mBlacklist.mFilterByKeyword.find(substr) && !blacklistHit)
+ {
+ blacklistHit = mBlacklist.CheckEntryMatch(
+ substr, location, typeMask, docDomain, thirdParty, sitekey,
+ specificOnly);
+ }
+ }
+ return blacklistHit;
+}
+
+Filter* CombinedMatcher::MatchesAny(const String& location,
+ int typeMask, DependentString& docDomain, bool thirdParty,
+ const String& sitekey, bool specificOnly)
sergei 2017/10/02 12:02:34 The method should be const if it's possible.
hub 2017/10/03 19:33:11 sadly the use of the cache makes it non-const. I c
+{
+ OwnedString key(location);
+ key.append(u" "_str);
+ key.append(typeMask);
+ key.append(u" "_str);
+ key.append(docDomain);
+ key.append(u" "_str);
+ key.append(thirdParty);
+ key.append(u" "_str);
+ key.append(sitekey);
+ key.append(u" "_str);
+ key.append(specificOnly);
+
+ FilterPtr result;
+
+ auto cachedResult = mResultCache.find(key);
+ if (cachedResult)
+ result = cachedResult->second;
+ else
+ {
+ result = MatchesAnyInternal(location, typeMask, docDomain,
+ thirdParty, sitekey, specificOnly);
+
+ if (mResultCache.size() >= MAX_CACHE_ENTRIES)
+ ResetCache();
+
+ mResultCache[key] = result;
+ }
+
+ result->AddRef();
+ return result.get();
sergei 2017/10/02 12:02:34 It would be better to `return result.release();`.
hub 2017/10/03 19:33:10 Done.
+}
+
+namespace {
+ const DependentString regexpRegExp =
+ u"^(@@)?/.*/(?:\\$~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)?$"_str;
+ const DependentString optionsRegExp =
+ u"\\$(~?[\\w-]+(?:=[^,\\s]+)?(?:,~?[\\w-]+(?:=[^,\\s]+)?)*)$"_str;
+ const DependentString candidateRegExp =
+ u"[^a-z0-9%*][a-z0-9%]{3,}(?=[^a-z0-9%*])"_str;
+}
+
+OwnedString Matcher::FindKeyword(const FilterPtr& filter)
sergei 2017/10/02 12:02:34 Should the argument be `const Filter&`?
sergei 2017/10/02 12:02:36 should it be a const method?
hub 2017/10/03 19:33:11 Done.
hub 2017/10/03 19:33:12 Done.
+{
+ OwnedString result(u""_str);
+ OwnedString text(filter->GetText());
+ auto re_id = GenerateRegExp(DependentString(regexpRegExp), true, false);
sergei 2017/10/02 12:02:37 It and all other regexps below should be in the an
hub 2017/10/03 19:33:14 The mistake here is that a create a new DependentS
sergei 2017/10/04 08:54:32 Each call of GenerateRegExp increases global _rege
hub 2017/10/06 13:49:16 Done.
+ if (TestRegExp(re_id, text))
+ return result;
+
+ // Remove options
+ auto options_re_id = GenerateRegExp(DependentString(optionsRegExp), true, false);
+ auto index = ExecRegExp(options_re_id, text);
+ if (index != -1)
sergei 2017/10/02 12:02:34 It would be better to use String::npos than -1.
hub 2017/10/03 19:33:13 Done.
+ text = text.substr(0, index);
+
+ // Remove whitelist marker
+ if (text[0] == '@' && text[1] == '@')
sergei 2017/10/02 12:02:37 Firstly we should check the length of the `text`.
hub 2017/10/03 19:33:11 Done.
+ text = text.substr(2);
+
+ text.toLower();
+ ReMatchResults keywords;
+ auto candidates_re_id = GenerateRegExp(candidateRegExp, true, true);
+ auto match = text.match(candidates_re_id, &keywords);
+ if (!match)
+ return result;
+
+ auto& candidates = keywords.candidates;
+
+ auto& hash = mFilterByKeyword;
+ uint32_t resultCount = 0xffffffff;
+ uint32_t resultLength = 0;
+ for (auto substr : candidates)
+ {
+ auto candidate = DependentString(substr).substr(1);
+ auto count = (hash.find(candidate) ? hash[candidate].size() : 0);
sergei 2017/10/02 12:02:35 Basically braces are not needed here.
sergei 2017/10/02 12:02:37 It seems it could be optimized by auto ii_hash = h
hub 2017/10/03 19:33:12 I have to do that for to make the function `const`
hub 2017/10/03 19:33:13 Done.
sergei 2017/10/04 08:54:32 It's just a side effect of the present code, there
hub 2017/10/06 13:49:16 I addressed that. Just as I said making this const
+ if (count < resultCount ||
+ (count == resultCount && candidate.length() > resultLength))
+ {
+ result = candidate;
+ resultCount = count;
+ resultLength = candidate.length();
+ }
+ }
+
+ return result;
+}
+
+void Matcher::Add(const FilterPtr& filter)
sergei 2017/10/02 12:02:36 What about passing `Filter&`?
hub 2017/10/03 19:33:10 Done.
+{
+ if (mKeywordByFilter.find(filter->GetText()))
+ return;
+
+ auto keyword = FindKeyword(filter);
+ auto oldEntry = mFilterByKeyword.find(keyword);
+ if (!oldEntry)
+ mFilterByKeyword[keyword] = std::vector<FilterPtr>{filter};
+ else
+ mFilterByKeyword[keyword].push_back(filter);
sergei 2017/10/02 12:02:37 StringMap::operator[](const String& key) creates a
hub 2017/10/03 19:33:09 Done.
+ mKeywordByFilter[filter->GetText()] = keyword;
sergei 2017/10/02 12:02:34 mKeywordByFilter stores DependentString, what if t
sergei 2017/10/04 08:54:32 What about having some struct FilterKeyword { Fi
hub 2017/10/06 13:49:17 Sounds like a good idea. Done.
+}
+
+void Matcher::Remove(const FilterPtr& filter)
sergei 2017/10/02 12:02:37 It seems the argument can be a const reference.
hub 2017/10/03 19:33:09 Done.
+{
+ if (!mKeywordByFilter.find(filter->GetText()))
+ return;
+
+ auto keyword = mKeywordByFilter[filter->GetText()];
sergei 2017/10/02 12:02:37 There is also no need for double looking up.
hub 2017/10/03 19:33:12 Done.
+ auto list = mFilterByKeyword[keyword];
+ if (list.size() == 1)
+ mFilterByKeyword.erase(keyword);
+ else
+ {
+ auto iter = std::find(list.cbegin(), list.cend(), filter);
+ list.erase(iter);
sergei 2017/10/02 12:02:35 It can be one line but it does not matter.
hub 2017/10/03 19:33:10 Done.
+ }
+ mKeywordByFilter.erase(filter->GetText());
+}
+
+void Matcher::Clear()
+{
+ mFilterByKeyword.clear();
+ mKeywordByFilter.clear();
+}
+
+bool Matcher::HasFilter(const FilterPtr& filter) const
sergei 2017/10/02 12:02:35 the argument should be a const reference.
hub 2017/10/03 19:33:09 Done.
+{
+ return mKeywordByFilter.find(filter->GetText());
+}
+
+static DependentString emptyString = u""_str;
sergei 2017/10/02 12:02:37 Although static in the compilation unit achieves t
hub 2017/10/03 19:33:10 Done.
+
+const String& Matcher::GetKeywordForFilter(const FilterPtr& filter)
sergei 2017/10/02 12:02:36 the argument should be a const reference and the m
hub 2017/10/03 19:33:12 Done.
+{
+ if (mKeywordByFilter.find(filter->GetText()))
+ return mKeywordByFilter[filter->GetText()];
+ return emptyString;
sergei 2017/10/02 12:02:37 There is also no need for double looking up.
hub 2017/10/03 19:33:13 Done (needed for making the method `const`)
+}
+
+FilterPtr Matcher::CheckEntryMatch(const String& keyword,
+ const String& location,
+ int typeMask, DependentString& docDomain, bool thirdParty,
+ const String& sitekey, bool specificOnly)
sergei 2017/10/02 12:02:34 basically this method and the one below do not mod
hub 2017/10/03 19:33:10 Done.
+{
+ auto list = mFilterByKeyword[keyword];
+ for (auto filter : list) {
+ auto activeFilter = static_cast<ActiveFilter*>(filter.get());
sergei 2017/10/02 12:02:35 opening brace { should be on the new line.
hub 2017/10/03 19:33:14 Done.
+ if (specificOnly && activeFilter->IsGeneric() &&
+ !(activeFilter->mType != Filter::Type::WHITELIST))
+ continue;
+
+ auto reFilter = static_cast<RegExpFilter*>(activeFilter);
+ if (reFilter->Matches(location, typeMask, docDomain, thirdParty, sitekey))
+ return filter;
+ }
+ return FilterPtr();
+}
+
+Filter* Matcher::MatchesAny(const String& location,
+ int typeMask, DependentString& docDomain, bool thirdParty,
+ const String& sitekey, bool specificOnly)
+{
+ ReMatchResults reResult;
+ auto re_id = GenerateRegExp(u"[a-z0-9%]{3,}"_str, true, true);
+ OwnedString text(location);
+ text.toLower();
+ MatchRegExp(re_id, text, &reResult);
+ auto& candidates = reResult.candidates;
+ candidates.push_back(OwnedString());
+ for (auto substr : candidates)
+ if (mFilterByKeyword.find(substr))
+ {
+ auto result = CheckEntryMatch(substr, location, typeMask, docDomain,
+ thirdParty, sitekey, specificOnly);
+ if (result)
+ {
+ result->AddRef();
+ return result.get();
sergei 2017/10/02 12:02:36 just return `result.release();`
hub 2017/10/03 19:33:12 Done.
+ }
+ }
+
+ return nullptr;
+}

Powered by Google App Engine
This is Rietveld