Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: compiled/matcher/Matcher.cpp

Issue 29581602: Issue 5141 - [emscripten] Convert filter matcher to C++ Base URL: https://hg.adblockplus.org/adblockpluscore
Patch Set: Created Oct. 17, 2017, 12:31 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: compiled/matcher/Matcher.cpp
===================================================================
new file mode 100644
--- /dev/null
+++ b/compiled/matcher/Matcher.cpp
@@ -0,0 +1,175 @@
+/*
+ * This file is part of Adblock Plus <https://adblockplus.org/>,
+ * Copyright (C) 2006-present eyeo GmbH
+ *
+ * Adblock Plus is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * Adblock Plus is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "Matcher.h"
+#include "../filter/WhitelistFilter.h"
+
+class RollingHash
+{
+private:
+ static const String::size_type SUBSTRING_LENGTH = 8;
+ static const uint32_t VALUE_SIZE = 32;
+ typedef uint32_t result_type;
+ String::size_type mCount;
+ result_type mResult;
+ String::value_type mCurrentChars[SUBSTRING_LENGTH];
+
+private:
+ result_type Rotate(result_type value, uint32_t bits)
sergei 2017/10/18 10:58:43 should it be static?
+ {
+ return value << bits | value >> (VALUE_SIZE - bits);
sergei 2017/10/18 10:58:42 Mind adding static_assert(std::is_unsigned<result_
sergei 2017/10/18 10:58:42 I wonder whether our compilers are smart enough to
+ }
+
+public:
+ RollingHash()
+ : mCount(0), mResult(0)
+ {
+ }
+
+ void Advance(String::value_type newChar)
+ {
+ auto pos = mCount++ % SUBSTRING_LENGTH;
+ mResult = Rotate(mResult, 1) ^ newChar;
+ if (mCount > SUBSTRING_LENGTH)
sergei 2017/10/18 10:58:43 Do we really need this condition? Can we rather se
+ mResult ^= Rotate(mCurrentChars[pos], SUBSTRING_LENGTH);
Wladimir Palant 2017/10/17 12:51:22 Normally, one wouldn't use the char value directly
+ mCurrentChars[pos] = newChar;
+ }
+
+ int32_t PrefixLength()
hub 2017/11/20 17:38:33 IMHO this should be const method.
+ {
+ return static_cast<int32_t>(mCount) - SUBSTRING_LENGTH;
+ }
Wladimir Palant 2017/10/17 12:51:22 This goal of this method is to avoid exposing SUBS
sergei 2017/10/18 10:58:43 IMO not only the caller should do the calculation
+
+ result_type GetResult()
hub 2017/11/20 17:38:33 and const method here too.
+ {
+ return mResult & 0x7FFFFFFF;
Wladimir Palant 2017/10/17 12:51:22 We cut off the top bit because key values 0xFFFFFF
sergei 2017/10/18 10:58:42 Maybe put a special const value masking the range
+ }
+};
+
+Matcher* Matcher::Create()
+{
+ return new Matcher();
+}
+
+void Matcher::Add(Filter& filter)
+{
+ RegExpFilter* regExpFilter = filter.As<RegExpFilter>();
+ if (!regExpFilter)
+ return;
+
+ RollingHash hash;
+ DependentString str(regExpFilter->GetRegExpSource());
+ int32_t minPrefixLength = 0;
+ for (auto i = 0; i < str.length(); i++)
sergei 2017/10/18 10:58:43 Here auto resolves into `int`, but should be Strin
+ {
+ auto currentChar = str[i];
+ if (currentChar == u'*' || currentChar == u'^' || currentChar == u'|')
+ minPrefixLength = i + 1;
+ hash.Advance(currentChar);
+ if (hash.PrefixLength() >= minPrefixLength)
+ {
+ auto candidate = hash.GetResult();
+ auto entry = mFilters.find(candidate);
+ if (!entry)
+ {
+ entry.assign(candidate, RegExpFilterPtr(regExpFilter));
+ return;
+ }
+ }
+ }
+
+ mUnoptimizedFilters.push_back(RegExpFilterPtr(regExpFilter));
+}
+
+void Matcher::Remove(const Filter& filter)
+{
+ const RegExpFilter* regExpFilter = filter.As<RegExpFilter>();
+ if (!regExpFilter)
+ return;
+
+ RollingHash hash;
+ DependentString str(regExpFilter->GetRegExpSource());
+ String::size_type minPrefixLength = 0;
+ for (auto i = 0; i < str.length(); i++)
+ {
+ auto currentChar = str[i];
+ if (currentChar == u'*' || currentChar == u'^' || currentChar == u'|')
sergei 2017/10/18 10:58:43 What about putting this condition into an inline f
+ minPrefixLength = i + 1;
+ hash.Advance(currentChar);
+ if (hash.PrefixLength() >= minPrefixLength)
+ {
+ auto candidate = hash.GetResult();
+ auto entry = mFilters.find(candidate);
+ if (entry && entry->second == &filter)
sergei 2017/10/18 10:58:43 The type of second is RegExpFilterPtr, so finally
+ {
+ mFilters.erase(candidate);
+ return;
+ }
+ }
+ }
+
+ for (auto ptr : mUnoptimizedFilters)
+ if (ptr == &filter)
+ ptr.reset();
+}
+
+RegExpFilter* Matcher::MatchesAny(const String& location, int typeMask,
+ DependentString& docDomain, bool thirdParty, const String& siteKey,
+ bool specificOnly)
+{
+ RegExpFilterPtr result;
+
+ RollingHash hash;
+ for (auto i = 0; i < location.length(); i++)
+ {
+ hash.Advance(location[i]);
+ if (hash.PrefixLength() >= 0)
+ {
+ auto entry = mFilters.find(hash.GetResult());
+ if (!entry)
+ continue;
+
+ if (specificOnly && !entry->second->As<WhitelistFilter>() && entry->second->IsGeneric())
+ continue;
+
+ if (!entry->second->Matches(location, typeMask, docDomain, thirdParty, siteKey))
+ continue;
+
+ result.reset(entry->second.get());
+ if (result->As<WhitelistFilter>())
+ return result.release();
+ }
+ }
+
+ for (auto ptr : mUnoptimizedFilters)
+ {
+ if (!ptr)
+ continue;
+
+ if (specificOnly && !ptr->As<WhitelistFilter>() && ptr->IsGeneric())
+ continue;
+
+ if (!ptr->Matches(location, typeMask, docDomain, thirdParty, siteKey))
+ continue;
+
+ result.reset(ptr.get());
+ if (result->As<WhitelistFilter>())
+ return result.release();
+ }
+
+ return result.release();
+}

Powered by Google App Engine
This is Rietveld