lib/abp2blocklist.js - Issue 29426594: Issue 3673 - Merge closely matching rules

Unified Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist

Patch Set: Fix bugs and add unit tests Created May 3, 2017, 12:25 a.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: lib/abp2blocklist.js

===================================================================

--- a/lib/abp2blocklist.js

+++ b/lib/abp2blocklist.js

@@ -361,16 +361,259 @@

newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');

i = pos.end;

}

newSelector.push(selector.substring(i));

return newSelector.join("");

}

+function closeMatch(s, t)

+ // This function returns the one edit operation ("substitute", "delete", or

+ // "insert"), with its index in the source string, that turns the source into

+ // the target string, or null when they are not one supported edit apart.

+ let diff = s.length - t.length;

+ // If the string lengths differ by more than one character, we cannot arrive

+ // at target from source in a single edit operation.

+ if (diff < -1 || diff > 1)

+ return null;

+ // If target is longer than source, swap them for the purpose of our

+ // calculation.

+ if (diff == -1)

+ {

+ let tmp = s;

+ s = t;

+ t = tmp;

+ }

+ let edit = null;

+ for (let i = 0, j = 0; i < s.length; i++)

+ {

+ if (s[i] == t[j])

+ {

+ j++;

+ }

+ else if (edit)

+ {

+ // Since we want one and only one edit operation, we must bail here.

+ return null;

+ }

+ else if ((s[i] == "." || s[i] == "+" || s[i] == "$" || s[i] == "?" ||

+ s[i] == "{" || s[i] == "}" || s[i] == "(" || s[i] == ")" ||

+ s[i] == "[" || s[i] == "]" || s[i] == "\\") ||

+ (t[j] == "." || t[j] == "+" || t[j] == "$" || t[j] == "?" ||

+ t[j] == "{" || t[j] == "}" || t[j] == "(" || t[j] == ")" ||

+ t[j] == "[" || t[j] == "]" || t[j] == "\\"))

+ {

+ // We don't deal with special characters for now.

+ return null;

+ }

+ else

+ {

+ switch (diff)

+ {

+ case 0:

+ // If both strings are equal in length, this is a substitution.

+ edit = {type: "substitute", index: i};

+ j++;

+ break;

+ case 1:

+ // If the source string is longer, this is a deletion.

+ edit = {type: "delete", index: i};

+ break;

+ default:

+ edit = {type: "insert", index: i};

+ }

+ return edit;

+function ruleWithoutURLFilter(rule)

+ let copy = {

+ trigger: Object.assign({}, rule.trigger),

+ action: Object.assign({}, rule.action)

+ };

+ delete copy.trigger["url-filter"];

+ return copy;

+function mergeCloselyMatchingRules(rules)

+ // Closely matching rules are likely to be within a certain range. We only

+ // look for matches within this range. If we increase this value, it can give

+ // us more matches and a smaller resulting rule set, but possibly at a

+ // significant performance cost.

+ const heuristicRange = 100;

+ let rulesInfo = new Array(rules.length);

+ rules.forEach((rule, index) =>

+ {

+ rulesInfo[index] = {rule};

+ if (rule.action.type == "ignore-previous-rules")

+ {

+ rulesInfo[index].skip = true;

+ }

+ else

+ {

+ // Save a stringified version of the rule, but without the URL filter. We

+ // use this for comparison later.

+ rulesInfo[index].stringifiedWithoutURLFilter =

+ JSON.stringify(ruleWithoutURLFilter(rule));

+ }

+ });

+ for (let i = 0; i < rules.length; i++)

+ {

+ if (rulesInfo[i].skip)

+ continue;

+ for (let j = i + 1; j < i + heuristicRange && j < rules.length; j++)

+ {

+ if (rulesInfo[j].skip)

+ continue;

+ // Check if the rules are identical except for the URL filter.

+ if (rulesInfo[i].stringifiedWithoutURLFilter ==

+ rulesInfo[j].stringifiedWithoutURLFilter)

+ {

+ let source = rules[i].trigger["url-filter"];

+ let target = rules[j].trigger["url-filter"];

+ // Find out if the Levenshtein distance between the URL filters is 1.

+ let edit = closeMatch(source, target);

+ if (edit)

+ {

+ let urlFilter, ruleInfo, match = {edit};

+ if (edit.type == "insert")

+ {

+ // Convert the insertion into a deletion and stick it on the target

+ // rule instead. We can only group deletions and substitutions;

+ // therefore insertions must be treated as deletions on the target

+ // rule, to be dealt with later.

+ urlFilter = target;

+ ruleInfo = rulesInfo[j];

+ match.index = i;

+ edit.type = "delete";

+ }

+ else

+ {

+ urlFilter = source;

+ ruleInfo = rulesInfo[i];

+ match.index = j;

+ }

+ if (!ruleInfo.matches)

+ ruleInfo.matches = new Array(urlFilter.length + 1);

+ let matchesForIndex = ruleInfo.matches[edit.index];

+ if (matchesForIndex)

+ {

+ matchesForIndex.push(match);

+ }

+ else

+ {

+ matchesForIndex = [match];

+ ruleInfo.matches[edit.index] = matchesForIndex;

+ }

+ if (!ruleInfo.bestMatches ||

+ matchesForIndex.length > ruleInfo.bestMatches.length)

+ ruleInfo.bestMatches = matchesForIndex;

+ }

+ let candidateRulesInfo = rulesInfo.filter(ruleInfo => ruleInfo.bestMatches);

+ // For best results, we have to sort the candidates by the number of matches.

+ // For example, we want "ads", "bds", "adv", "bdv", and "bdx" to generate

+ // "ad[sv]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and "bdx" (3

+ // rules).

+ candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>

+ {

+ return ruleInfo2.bestMatches.length - ruleInfo1.bestMatches.length;

+ });

+ for (let ruleInfo of candidateRulesInfo)

+ {

+ let rule = ruleInfo.rule;

+ if (rule._merged)

+ continue;

+ // Find the best set of rules to group, which is simply the largest set.

+ let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>

+ {

+ matchesForIndex = (matchesForIndex || []).filter(match =>

+ {

+ // Filter out rules that have either already been merged into other

+ // rules or have had other rules merged into them.

+ return !rules[match.index]._merged &&

+ !rulesInfo[match.index].mergedInto;

+ });

+ return matchesForIndex.length > best.length ? matchesForIndex : best;

+ },

+ []);

+ if (best.length > 0)

+ {

+ // Merge all the matching rules into this one.

+ let editIndex = best[0].edit.index;

+ let characters = [];

+ let quantifier = "";

+ for (let match of best)

+ {

+ if (match.edit.type == "delete")

+ quantifier = "?";

+ else

+ characters.push(rules[match.index].trigger["url-filter"][editIndex]);

+ rules[match.index]._merged = true;

+ }

+ let urlFilter = rule.trigger["url-filter"];

+ urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +

+ urlFilter.substring(editIndex + 1);

+ if (characters.length > 0)

+ {

+ urlFilter = urlFilter.substring(0, editIndex) + "[" +

+ urlFilter[editIndex] + characters.join("") + "]" +

+ urlFilter.substring(editIndex + 1);

+ }

+ rule.trigger["url-filter"] = urlFilter;

+ ruleInfo.mergedInto = true;

+ }

+ return rules.filter(rule => !rule._merged);

let ContentBlockerList =

/**

* Create a new Adblock Plus filter to content blocker list converter

* @constructor

exports.ContentBlockerList = function ()

{

@@ -419,17 +662,17 @@

}

};

/**

* Generate content blocker list for all filters that were added

* @returns {Filter} filter Filter to convert

-ContentBlockerList.prototype.generateRules = function(filter)

+ContentBlockerList.prototype.generateRules = function({merge = false} = {})

{

let rules = [];

let groupedElemhideFilters = new Map();

for (let filter of this.elemhideFilters)

{

let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);

if (!result)

@@ -467,10 +710,15 @@

for (let filter of this.elemhideExceptions)

convertFilterAddRules(rules, filter, "ignore-previous-rules", false);

for (let filter of this.requestFilters)

convertFilterAddRules(rules, filter, "block", true);

for (let filter of this.requestExceptions)

convertFilterAddRules(rules, filter, "ignore-previous-rules", true);

- return rules.filter(rule => !hasNonASCI(rule));

+ rules = rules.filter(rule => !hasNonASCI(rule));

+ if (merge)

+ rules = mergeCloselyMatchingRules(rules);

+ return rules;

};

« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »