Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Rebase with minor changes Created July 20, 2017, 3:45 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
Index: lib/abp2blocklist.js
===================================================================
--- a/lib/abp2blocklist.js
+++ b/lib/abp2blocklist.js
@@ -42,16 +42,54 @@
typeMap.WEBRTC |
typeMap.OBJECT_SUBREQUEST |
typeMap.PING |
typeMap.OTHER;
const whitelistableRequestTypes = httpRequestTypes |
typeMap.WEBSOCKET |
typeMap.WEBRTC;
/**
 * Schedule a function to run on the next turn of the event loop.
 *
 * @param {function} func The function to invoke later
 * @returns {Promise} A promise resolved with the function's return value
 */
function callLater(func)
{
  return new Promise(resolve =>
  {
    let invoke = () => resolve(func());

    // Node.js exposes a global process object; prefer its nextTick
    // scheduler there, and fall back to setTimeout elsewhere.
    if (typeof process == "undefined")
      setTimeout(invoke, 0);
    else
      process.nextTick(invoke);
  });
}
+
/**
 * Run a list of functions sequentially, chaining their results as
 * promises.
 *
 * The functions may be given either as a single array or as individual
 * arguments. Whenever 100ms or more have elapsed since the last pause,
 * the next call is deferred to a later event loop turn so the browser
 * does not freeze up.
 *
 * @param {function[]} funcs The functions to run in order
 * @returns {Promise} A promise resolved once all functions have run,
 *                    with the return value of the last one
 */
function async(funcs)
{
  if (!Array.isArray(funcs))
    funcs = Array.from(arguments);

  let lastPause = Date.now();

  let chain = Promise.resolve();

  for (let func of funcs)
  {
    chain = chain.then(() =>
    {
      let now = Date.now();

      // Keep going synchronously unless it has been 100ms or longer since
      // the last pause; in that case yield to the event loop first.
      if (now - lastPause < 100)
        return func();

      lastPause = now;
      return callLater(func);
    });
  }

  return chain;
}
+
function parseDomains(domains, included, excluded)
{
for (let domain in domains)
{
if (domain != "")
{
let enabled = domains[domain];
domain = punycode.toASCII(domain.toLowerCase());
@@ -609,24 +647,505 @@
if (unlessDomain)
rule.trigger["unless-domain"] = unlessDomain;
rules.push(rule);
}
}
/**
 * Check if two strings are a close match
 *
 * Two strings are a close match if they are exactly one edit operation
 * apart. On success this returns the edit, one of "substitute", "delete",
 * or "insert", along with the index in the source string where the edit
 * must occur in order to arrive at the target string; otherwise it
 * returns null.
 *
 * A deletion or insertion of a contiguous range of characters counts as a
 * single edit: "internal" and "international" are one edit apart and
 * therefore a close match.
 *
 * Notes:
 *
 * 1) The format of the inputs is not taken into account. For regular
 * expressions, "[ab]" and "[bc]" could be considered equivalent, but this
 * function still returns null for them because they are two edits apart.
 *
 * 2) To be friendly to callers passing in regular expressions, null is
 * returned whenever the delta contains a special character ("\", "?",
 * "+", etc.). For example, given "Hello" and "Hello, how are you?", this
 * returns null.
 *
 * 3) Callers passing in regular expressions must assume that the parts
 * where two such expressions differ can be treated as plain strings. For
 * example, "^https?://example.com/ads" and "^https?://example.com/adv"
 * differ only in the last character, so they can safely be merged into
 * "^https?://example.com/ad[sv]".
 *
 * @param {string} s The source string
 * @param {string} t The target string
 *
 * @returns {object} An object describing the single edit operation that
 *                   must occur in the source string in order to arrive at
 *                   the target string
 */
function closeMatch(s, t)
{
  let diff = s.length - t.length;

  // Make sure s is the longer of the two strings for the purpose of the
  // calculation below.
  if (diff < 0)
    [s, t] = [t, s];

  // Length of the common prefix: scan forward until the first mismatch.
  let prefix = 0;
  while (prefix < s.length && s[prefix] == t[prefix])
    prefix++;

  // Length of the common suffix: scan backward, stopping early if we
  // would run into the prefix.
  let suffix = 0;
  while (suffix < t.length && t.length - suffix != prefix &&
         s[s.length - suffix - 1] == t[t.length - suffix - 1])
    suffix++;

  if (diff == 0)
  {
    // Strings of equal length are a close match only if the delta is
    // exactly one character.
    if (t.length - suffix - prefix != 1)
      return null;
  }
  else if (prefix != t.length - suffix)
  {
    // For strings of unequal length, every character of the shorter
    // string must be accounted for by the common prefix and suffix.
    return null;
  }

  // If the delta contains any special characters, it's not a close match.
  for (let k = prefix; k < s.length - suffix; k++)
  {
    if (".+$?{}()[]\\".includes(s[k]))
      return null;
  }

  if (diff == 0)
    return {type: "substitute", index: prefix};

  let edit = {type: diff > 0 ? "delete" : "insert", index: prefix};

  // A multi-character edit carries the end of the edited range too.
  if (Math.abs(diff) > 1)
    edit.endIndex = s.length - suffix;

  return edit;
}
+
/**
 * Eliminate rules whose URL filter is already covered by another rule's
 * URL filter.
 *
 * If one URL filter is a prefix of another, the longer filter only
 * matches a subset of what the shorter one matches, making the rule with
 * the longer filter redundant.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {boolean} exhaustive Whether to compare each rule against every
 *                             following rule rather than only those
 *                             within a heuristic range
 * @returns {Promise} A promise resolved with the rules that remain
 */
function eliminateRedundantRulesByURLFilter(rulesInfo, exhaustive)
{
  const heuristicRange = 1000;

  // Throw out obviously redundant rules.
  return async(rulesInfo.map((ruleInfo, index) => () =>
  {
    // If this rule is already marked as redundant, don't bother comparing
    // it with other rules.
    if (ruleInfo.redundant)
      return;

    let limit = exhaustive ? rulesInfo.length :
                Math.min(index + heuristicRange, rulesInfo.length);

    let source = ruleInfo.rule.trigger["url-filter"];

    for (let j = index + 1; j < limit; j++)
    {
      if (rulesInfo[j].redundant)
        continue;

      let target = rulesInfo[j].rule.trigger["url-filter"];

      if (source.length >= target.length)
      {
        // If one URL filter is a substring of the other starting at the
        // beginning, the other one is clearly redundant.
        if (source.substring(0, target.length) == target)
        {
          ruleInfo.redundant = true;
          break;
        }
      }
      else if (target.substring(0, source.length) == source)
      {
        rulesInfo[j].redundant = true;
      }
    }
  }))
  .then(() => rulesInfo.filter(ruleInfo => !ruleInfo.redundant));
}
+
/**
 * Find rules whose URL filters closely match the rule at the given index
 * and record the matches on the rule information objects for later
 * merging.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {number} index The index of the rule to find matches for
 * @param {boolean} exhaustive Whether to look for matches throughout the
 *                             rule set rather than only within a
 *                             heuristic range
 */
function findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range by default. If we increase this value,
  // it can give us more matches and a smaller resulting rule set, but possibly
  // at a significant performance cost.
  //
  // If the exhaustive option is true, we simply ignore this value and look for
  // matches throughout the rule set.
  const heuristicRange = 1000;

  let limit = exhaustive ? rulesInfo.length :
              Math.min(index + heuristicRange, rulesInfo.length);

  let source = rulesInfo[index].rule.trigger["url-filter"];

  for (let j = index + 1; j < limit; j++)
  {
    let target = rulesInfo[j].rule.trigger["url-filter"];

    let edit = closeMatch(source, target);
    if (!edit)
      continue;

    let match = {edit};
    let urlFilter = source;
    let ruleInfo = rulesInfo[index];

    if (edit.type == "insert")
    {
      // Convert the insertion into a deletion and stick it on the target
      // rule instead. We can only group deletions and substitutions;
      // therefore insertions must be treated as deletions on the target
      // rule.
      edit.type = "delete";
      urlFilter = target;
      ruleInfo = rulesInfo[j];
      match.index = index;
    }
    else
    {
      match.index = j;
    }

    if (edit.endIndex)
    {
      // An edit with an end index spans multiple characters. We only care
      // about a single such match per rule because the number of
      // characters doesn't matter: we can only merge with one other rule.
      if (!ruleInfo.multiEditMatch)
        ruleInfo.multiEditMatch = match;

      continue;
    }

    // For single character edits, multiple rules can be merged into one,
    // e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
    if (!ruleInfo.matches)
      ruleInfo.matches = new Array(urlFilter.length);

    // Matches at a particular index. For example, for a source string
    // "ads", both target strings "ad" (deletion) and "adv" (substitution)
    // match at index 2, hence they are grouped together to possibly be
    // merged later into "ad[sv]?".
    let matchesForIndex = ruleInfo.matches[edit.index];

    if (matchesForIndex)
      matchesForIndex.push(match);
    else
      ruleInfo.matches[edit.index] = matchesForIndex = [match];

    // Keep track of the best set of matches. We later sort by this to get
    // best results.
    if (!ruleInfo.bestMatches ||
        matchesForIndex.length > ruleInfo.bestMatches.length)
      ruleInfo.bestMatches = matchesForIndex;
  }
}
+
/**
 * Merge closely matching rules based on the matches previously recorded
 * by findMatchesForRuleByURLFilter.
 *
 * Single character edits are merged into a character class, optionally
 * with a "?" quantifier for deletions (e.g. "ad", "ads", and "adv"
 * become "ad[sv]?"); a multiple character edit is merged into an
 * optional group (e.g. "ad" and "adserver" become "ad(server)?").
 *
 * Rules merged into another rule are flagged with "merged"; rules that
 * absorbed others are flagged with "mergedInto" and have their
 * "url-filter" rewritten in place.
 *
 * @param {object[]} rulesInfo The rule information objects
 */
function mergeCandidateRulesByURLFilter(rulesInfo)
{
  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return ruleInfo.bestMatches || ruleInfo.multiEditMatch
  });

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    // A rule with only a multiple character edit counts as a single match.
    let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
                  ruleInfo1.multiEditMatch ? 1 : 0;
    let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
                  ruleInfo2.multiEditMatch ? 1 : 0;

    return weight2 - weight1;
  });

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
    {
      matchesForIndex = (matchesForIndex || []).filter(match =>
      {
        // Filter out rules that have either already been merged into other
        // rules or have had other rules merged into them.
        return !rulesInfo[match.index].merged &&
               !rulesInfo[match.index].mergedInto;
      });

      return matchesForIndex.length > best.length ? matchesForIndex : best;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
        !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length > 0)
    {
      let urlFilter = rule.trigger["url-filter"];

      let editIndex = best[0].edit.index;

      if (!multiEdit)
      {
        // Merge all the matching rules into this one.

        // The character class always contains this rule's own character at
        // the edit index; a deletion contributes the "?" quantifier instead.
        let characters = [urlFilter[editIndex]];
        let quantifier = "";

        for (let match of best)
        {
          if (match.edit.type == "delete")
          {
            quantifier = "?";
          }
          else
          {
            let character = rulesInfo[match.index].rule
                            .trigger["url-filter"][editIndex];

            // Insert any hyphen at the beginning so it gets interpreted as a
            // literal hyphen.
            if (character == "-")
              characters.unshift(character);
            else
              characters.push(character);
          }

          // Mark the target rule as merged so other rules don't try to merge
          // it again.
          rulesInfo[match.index].merged = true;
        }

        // First insert the quantifier after the edit position, then replace
        // the single character with the character class if needed.
        urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
                    urlFilter.substring(editIndex + 1);
        if (characters.length > 1)
        {
          urlFilter = urlFilter.substring(0, editIndex) + "[" +
                      characters.join("") + "]" +
                      urlFilter.substring(editIndex + 1);
        }
      }
      else
      {
        let editEndIndex = best[0].edit.endIndex;

        // Mark the target rule as merged so other rules don't try to merge it
        // again.
        rulesInfo[best[0].index].merged = true;

        // Wrap the edited range in an optional group.
        urlFilter = urlFilter.substring(0, editIndex) + "(" +
                    urlFilter.substring(editIndex, editEndIndex) + ")?" +
                    urlFilter.substring(editEndIndex);
      }

      rule.trigger["url-filter"] = urlFilter;

      // Mark this rule as one that has had other rules merged into it.
      ruleInfo.mergedInto = true;
    }
  }
}
+
/**
 * Merge rules with closely matching URL filters.
 *
 * Matches are first collected for every rule, then the candidates are
 * merged in place.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {boolean} exhaustive Whether to look for matches throughout the
 *                             rule set rather than only within a
 *                             heuristic range
 * @returns {Promise} A promise resolved once merging is done
 */
function mergeRulesByURLFilter(rulesInfo, exhaustive)
{
  let matchFinders = rulesInfo.map(
    (ruleInfo, index) =>
      () => findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
  );

  return async(matchFinders)
  .then(() => mergeCandidateRulesByURLFilter(rulesInfo));
}
+
/**
 * Merge the given rules into one by taking the union of the values of an
 * array property (e.g. "if-domain" in the trigger).
 *
 * All rules except the first are marked as merged; the first rule is
 * marked as having had other rules merged into it and receives the
 * combined values.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {string} propertyType The rule sub-object holding the property
 *                              (e.g. "trigger")
 * @param {string} property The name of the array property
 */
function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
{
  if (rulesInfo.length <= 1)
    return;

  let [first, ...rest] = rulesInfo;

  let values = new Set(first.rule[propertyType][property]);

  for (let ruleInfo of rest)
  {
    for (let value of ruleInfo.rule[propertyType][property] || [])
      values.add(value);

    ruleInfo.merged = true;
  }

  if (values.size > 0)
    first.rule[propertyType][property] = Array.from(values);

  first.mergedInto = true;
}
+
/**
 * Group rules that are identical except for the given property.
 *
 * Each rule's trigger and action are shallow-copied, the given property
 * is removed from the copy, and the JSON serialization of what remains
 * is used as the grouping key.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {string} propertyType The rule sub-object holding the property
 *                              (e.g. "trigger")
 * @param {string} property The name of the property to ignore when
 *                          grouping
 * @returns {Map.<string,object[]>} The rules grouped by the serialized
 *                                  remainder of the rule
 */
function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
{
  let groups = new Map();

  for (let ruleInfo of rulesInfo)
  {
    let {trigger, action} = ruleInfo.rule;

    let copy = {
      trigger: Object.assign({}, trigger),
      action: Object.assign({}, action)
    };

    delete copy[propertyType][property];

    let groupKey = JSON.stringify(copy);

    if (groups.has(groupKey))
      groups.get(groupKey).push(ruleInfo);
    else
      groups.set(groupKey, [ruleInfo]);
  }

  return groups;
}
+
/**
 * Merge closely matching rules to reduce the size of the rule set.
 *
 * First, rules that are identical except for their URL filter are
 * grouped, redundant rules are eliminated, and closely matching URL
 * filters are merged. Then rules that are identical except for one of
 * their array properties ("resource-type", "if-domain") are grouped and
 * merged by taking the union of the property's values.
 *
 * @param {object[]} rules The rules to merge
 * @param {boolean} exhaustive Whether to look for merge candidates
 *                             throughout the rule set rather than only
 *                             within a heuristic range
 * @returns {Promise.<object[]>} A promise resolved with the merged rules
 */
function mergeRules(rules, exhaustive)
{
  let rulesInfo = rules.map(rule => ({rule}));

  let arrayPropertiesToMergeBy = ["resource-type", "if-domain"];

  return async(() =>
  {
    let map = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                            "url-filter");
    return async(Array.from(map.values()).map(mergeableRulesInfo => () =>
      eliminateRedundantRulesByURLFilter(mergeableRulesInfo, exhaustive)
      .then(rulesInfo => mergeRulesByURLFilter(rulesInfo, exhaustive))
    ))
    .then(() =>
    {
      // Filter out rules that are redundant or have been merged into other
      // rules.
      rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.redundant &&
                                               !ruleInfo.merged);
    });
  })
  .then(() => async(arrayPropertiesToMergeBy.map(arrayProperty => () =>
  {
    let map = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                            arrayProperty);
    return async(Array.from(map.values()).map(mergeableRulesInfo => () =>
      mergeRulesByArrayProperty(mergeableRulesInfo, "trigger", arrayProperty)
    ))
    .then(() =>
    {
      rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
    });
  })))
  .then(() => rulesInfo.map(ruleInfo => ruleInfo.rule));
}
+
let ContentBlockerList =
/**
* Create a new Adblock Plus filter to content blocker list converter
*
+ * @param {object} options Options for content blocker list generation
+ *
* @constructor
*/
-exports.ContentBlockerList = function ()
+exports.ContentBlockerList = function (options)
{
+ const defaultOptions = {
+ merge: "auto"
+ };
+
+ this.options = Object.assign({}, defaultOptions, options);
+
this.requestFilters = [];
this.requestExceptions = [];
this.elemhideFilters = [];
this.elemhideExceptions = [];
this.genericblockExceptions = [];
this.generichideExceptions = [];
this.elemhideSelectorExceptions = new Map();
};
@@ -671,22 +1190,26 @@
domains = this.elemhideSelectorExceptions[filter.selector] = [];
parseDomains(filter.domains, domains, []);
}
};
/**
* Generate content blocker list for all filters that were added
- *
- * @returns {Filter} filter Filter to convert
*/
-ContentBlockerList.prototype.generateRules = function(filter)
+ContentBlockerList.prototype.generateRules = function()
{
- let rules = [];
+ let cssRules = [];
+ let cssExceptionRules = [];
+ let blockingRules = [];
+ let blockingExceptionRules = [];
+
+ let ruleGroups = [cssRules, cssExceptionRules,
+ blockingRules, blockingExceptionRules];
let genericSelectors = [];
let groupedElemhideFilters = new Map();
for (let filter of this.elemhideFilters)
{
let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
if (!result)
@@ -723,35 +1246,57 @@
let genericSelectorExceptionDomains =
extractFilterDomains(this.generichideExceptions);
elemhideExceptionDomains.forEach(name =>
{
genericSelectorExceptionDomains.add(name);
});
- addCSSRules(rules, genericSelectors, "^https?://",
+ addCSSRules(cssRules, genericSelectors, "^https?://",
genericSelectorExceptionDomains);
groupedElemhideFilters.forEach((selectors, matchDomain) =>
{
- addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);
+ addCSSRules(cssRules, selectors, matchDomain, elemhideExceptionDomains);
});
let requestFilterExceptionDomains = [];
for (let filter of this.genericblockExceptions)
{
let parsed = parseFilterRegexpSource(filter.regexpSource);
if (parsed.hostname)
requestFilterExceptionDomains.push(parsed.hostname);
}
for (let filter of this.requestFilters)
{
- convertFilterAddRules(rules, filter, "block", true,
+ convertFilterAddRules(blockingRules, filter, "block", true,
requestFilterExceptionDomains);
}
for (let filter of this.requestExceptions)
- convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
+ convertFilterAddRules(blockingExceptionRules, filter,
kzar 2017/07/25 12:18:53 Nit: Please use braces for this for loop since it
Manish Jethani 2017/07/28 09:17:36 Done.
+ "ignore-previous-rules", true);
+
+ return async(ruleGroups.map((group, index) => () =>
+ {
+ let next = () =>
+ {
+ if (index == ruleGroups.length - 1)
+ return ruleGroups.reduce((all, rules) => all.concat(rules), []);
+ };
- return rules;
+ if (this.options.merge == "all" ||
+ (this.options.merge == "auto" &&
+ ruleGroups.reduce((n, group) => n + group.length, 0) > 50000))
+ {
+ return mergeRules(ruleGroups[index], this.options.merge == "all")
+ .then(rules =>
+ {
+ ruleGroups[index] = rules;
+ return next();
+ });
+ }
+
+ return next();
+ }));
};
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld