Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Rebase with minor changes Created July 20, 2017, 3:45 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
Index: lib/abp2blocklist.js
===================================================================
--- a/lib/abp2blocklist.js
+++ b/lib/abp2blocklist.js
@@ -42,16 +42,54 @@
typeMap.WEBRTC |
typeMap.OBJECT_SUBREQUEST |
typeMap.PING |
typeMap.OTHER;
const whitelistableRequestTypes = httpRequestTypes |
typeMap.WEBSOCKET |
typeMap.WEBRTC;
/**
 * Schedule a function to run on the next turn of the event loop.
 *
 * @param {function} func The function to invoke later
 * @returns {Promise} A promise resolved with the function's return value
 */
function callLater(func)
{
  return new Promise(resolve =>
  {
    let invoke = () => resolve(func());

    // Node.js exposes a global process object; prefer its nextTick
    // scheduler there, and fall back to setTimeout elsewhere.
    if (typeof process == "undefined")
      setTimeout(invoke, 0);
    else
      process.nextTick(invoke);
  });
}
+
/**
 * Run a list of functions sequentially, chaining their results as
 * promises.
 *
 * The functions may be given either as a single array or as individual
 * arguments. Whenever 100ms or more have elapsed since the last pause,
 * the next call is deferred to a later event loop turn so the browser
 * does not freeze up.
 *
 * @param {function[]} funcs The functions to run in order
 * @returns {Promise} A promise resolved once all functions have run,
 *                    with the return value of the last one
 */
function async(funcs)
{
  if (!Array.isArray(funcs))
    funcs = Array.from(arguments);

  let lastPause = Date.now();

  let chain = Promise.resolve();

  for (let func of funcs)
  {
    chain = chain.then(() =>
    {
      let now = Date.now();

      // Keep going synchronously unless it has been 100ms or longer since
      // the last pause; in that case yield to the event loop first.
      if (now - lastPause < 100)
        return func();

      lastPause = now;
      return callLater(func);
    });
  }

  return chain;
}
+
function parseDomains(domains, included, excluded)
{
for (let domain in domains)
{
if (domain != "")
{
let enabled = domains[domain];
domain = punycode.toASCII(domain.toLowerCase());
@@ -609,24 +647,505 @@
if (unlessDomain)
rule.trigger["unless-domain"] = unlessDomain;
rules.push(rule);
}
}
/**
 * Check if two strings are a close match
 *
 * Two strings are a close match if they are exactly one edit operation
 * apart. On success this returns the edit, one of "substitute", "delete",
 * or "insert", along with the index in the source string where the edit
 * must occur in order to arrive at the target string; otherwise it
 * returns null.
 *
 * A deletion or insertion of a contiguous range of characters counts as a
 * single edit: "internal" and "international" are one edit apart and
 * therefore a close match.
 *
 * Notes:
 *
 * 1) The format of the inputs is not taken into account. For regular
 * expressions, "[ab]" and "[bc]" could be considered equivalent, but this
 * function still returns null for them because they are two edits apart.
 *
 * 2) To be friendly to callers passing in regular expressions, null is
 * returned whenever the delta contains a special character ("\", "?",
 * "+", etc.). For example, given "Hello" and "Hello, how are you?", this
 * returns null.
 *
 * 3) Callers passing in regular expressions must assume that the parts
 * where two such expressions differ can be treated as plain strings. For
 * example, "^https?://example.com/ads" and "^https?://example.com/adv"
 * differ only in the last character, so they can safely be merged into
 * "^https?://example.com/ad[sv]".
 *
 * @param {string} s The source string
 * @param {string} t The target string
 *
 * @returns {object} An object describing the single edit operation that
 *                   must occur in the source string in order to arrive at
 *                   the target string
 */
function closeMatch(s, t)
{
  let diff = s.length - t.length;

  // Make sure s is the longer of the two strings for the purpose of the
  // calculation below.
  if (diff < 0)
    [s, t] = [t, s];

  // Length of the common prefix: scan forward until the first mismatch.
  let prefix = 0;
  while (prefix < s.length && s[prefix] == t[prefix])
    prefix++;

  // Length of the common suffix: scan backward, stopping early if we
  // would run into the prefix.
  let suffix = 0;
  while (suffix < t.length && t.length - suffix != prefix &&
         s[s.length - suffix - 1] == t[t.length - suffix - 1])
    suffix++;

  if (diff == 0)
  {
    // Strings of equal length are a close match only if the delta is
    // exactly one character.
    if (t.length - suffix - prefix != 1)
      return null;
  }
  else if (prefix != t.length - suffix)
  {
    // For strings of unequal length, every character of the shorter
    // string must be accounted for by the common prefix and suffix.
    return null;
  }

  // If the delta contains any special characters, it's not a close match.
  for (let k = prefix; k < s.length - suffix; k++)
  {
    if (".+$?{}()[]\\".includes(s[k]))
      return null;
  }

  if (diff == 0)
    return {type: "substitute", index: prefix};

  let edit = {type: diff > 0 ? "delete" : "insert", index: prefix};

  // A multi-character edit carries the end of the edited range too.
  if (Math.abs(diff) > 1)
    edit.endIndex = s.length - suffix;

  return edit;
}
+
/**
 * Eliminate rules whose URL filter is already covered by another rule's
 * URL filter.
 *
 * If one URL filter is a prefix of another, the longer filter only
 * matches a subset of what the shorter one matches, making the rule with
 * the longer filter redundant.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {boolean} exhaustive Whether to compare each rule against every
 *                             following rule rather than only those
 *                             within a heuristic range
 * @returns {Promise} A promise resolved with the rules that remain
 */
function eliminateRedundantRulesByURLFilter(rulesInfo, exhaustive)
{
  const heuristicRange = 1000;

  // Throw out obviously redundant rules.
  return async(rulesInfo.map((ruleInfo, index) => () =>
  {
    // If this rule is already marked as redundant, don't bother comparing
    // it with other rules.
    if (ruleInfo.redundant)
      return;

    let limit = exhaustive ? rulesInfo.length :
                Math.min(index + heuristicRange, rulesInfo.length);

    let source = ruleInfo.rule.trigger["url-filter"];

    for (let j = index + 1; j < limit; j++)
    {
      if (rulesInfo[j].redundant)
        continue;

      let target = rulesInfo[j].rule.trigger["url-filter"];

      if (source.length >= target.length)
      {
        // If one URL filter is a substring of the other starting at the
        // beginning, the other one is clearly redundant.
        if (source.substring(0, target.length) == target)
        {
          ruleInfo.redundant = true;
          break;
        }
      }
      else if (target.substring(0, source.length) == source)
      {
        rulesInfo[j].redundant = true;
      }
    }
  }))
  .then(() => rulesInfo.filter(ruleInfo => !ruleInfo.redundant));
}
+
/**
 * Find rules whose URL filters closely match the rule at the given index
 * and record the matches on the rule information objects for later
 * merging.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {number} index The index of the rule to find matches for
 * @param {boolean} exhaustive Whether to look for matches throughout the
 *                             rule set rather than only within a
 *                             heuristic range
 */
function findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range by default. If we increase this value,
  // it can give us more matches and a smaller resulting rule set, but possibly
  // at a significant performance cost.
  //
  // If the exhaustive option is true, we simply ignore this value and look for
  // matches throughout the rule set.
  const heuristicRange = 1000;

  let limit = exhaustive ? rulesInfo.length :
              Math.min(index + heuristicRange, rulesInfo.length);

  let source = rulesInfo[index].rule.trigger["url-filter"];

  for (let j = index + 1; j < limit; j++)
  {
    let target = rulesInfo[j].rule.trigger["url-filter"];

    let edit = closeMatch(source, target);
    if (!edit)
      continue;

    let match = {edit};
    let urlFilter = source;
    let ruleInfo = rulesInfo[index];

    if (edit.type == "insert")
    {
      // Convert the insertion into a deletion and stick it on the target
      // rule instead. We can only group deletions and substitutions;
      // therefore insertions must be treated as deletions on the target
      // rule.
      edit.type = "delete";
      urlFilter = target;
      ruleInfo = rulesInfo[j];
      match.index = index;
    }
    else
    {
      match.index = j;
    }

    if (edit.endIndex)
    {
      // An edit with an end index spans multiple characters. We only care
      // about a single such match per rule because the number of
      // characters doesn't matter: we can only merge with one other rule.
      if (!ruleInfo.multiEditMatch)
        ruleInfo.multiEditMatch = match;

      continue;
    }

    // For single character edits, multiple rules can be merged into one,
    // e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
    if (!ruleInfo.matches)
      ruleInfo.matches = new Array(urlFilter.length);

    // Matches at a particular index. For example, for a source string
    // "ads", both target strings "ad" (deletion) and "adv" (substitution)
    // match at index 2, hence they are grouped together to possibly be
    // merged later into "ad[sv]?".
    let matchesForIndex = ruleInfo.matches[edit.index];

    if (matchesForIndex)
      matchesForIndex.push(match);
    else
      ruleInfo.matches[edit.index] = matchesForIndex = [match];

    // Keep track of the best set of matches. We later sort by this to get
    // best results.
    if (!ruleInfo.bestMatches ||
        matchesForIndex.length > ruleInfo.bestMatches.length)
      ruleInfo.bestMatches = matchesForIndex;
  }
}
+
/**
 * Merge closely matching rules based on the matches previously recorded
 * by findMatchesForRuleByURLFilter.
 *
 * Single character edits are merged into a character class, optionally
 * with a "?" quantifier for deletions (e.g. "ad", "ads", and "adv"
 * become "ad[sv]?"); a multiple character edit is merged into an
 * optional group (e.g. "ad" and "adserver" become "ad(server)?").
 *
 * Rules merged into another rule are flagged with "merged"; rules that
 * absorbed others are flagged with "mergedInto" and have their
 * "url-filter" rewritten in place.
 *
 * @param {object[]} rulesInfo The rule information objects
 */
function mergeCandidateRulesByURLFilter(rulesInfo)
{
  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return ruleInfo.bestMatches || ruleInfo.multiEditMatch
  });

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    // A rule with only a multiple character edit counts as a single match.
    let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
                  ruleInfo1.multiEditMatch ? 1 : 0;
    let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
                  ruleInfo2.multiEditMatch ? 1 : 0;

    return weight2 - weight1;
  });

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
    {
      matchesForIndex = (matchesForIndex || []).filter(match =>
      {
        // Filter out rules that have either already been merged into other
        // rules or have had other rules merged into them.
        return !rulesInfo[match.index].merged &&
               !rulesInfo[match.index].mergedInto;
      });

      return matchesForIndex.length > best.length ? matchesForIndex : best;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
        !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length > 0)
    {
      let urlFilter = rule.trigger["url-filter"];

      let editIndex = best[0].edit.index;

      if (!multiEdit)
      {
        // Merge all the matching rules into this one.

        // The character class always contains this rule's own character at
        // the edit index; a deletion contributes the "?" quantifier instead.
        let characters = [urlFilter[editIndex]];
        let quantifier = "";

        for (let match of best)
        {
          if (match.edit.type == "delete")
          {
            quantifier = "?";
          }
          else
          {
            let character = rulesInfo[match.index].rule
                            .trigger["url-filter"][editIndex];

            // Insert any hyphen at the beginning so it gets interpreted as a
            // literal hyphen.
            if (character == "-")
              characters.unshift(character);
            else
              characters.push(character);
          }

          // Mark the target rule as merged so other rules don't try to merge
          // it again.
          rulesInfo[match.index].merged = true;
        }

        // First insert the quantifier after the edit position, then replace
        // the single character with the character class if needed.
        urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
                    urlFilter.substring(editIndex + 1);
        if (characters.length > 1)
        {
          urlFilter = urlFilter.substring(0, editIndex) + "[" +
                      characters.join("") + "]" +
                      urlFilter.substring(editIndex + 1);
        }
      }
      else
      {
        let editEndIndex = best[0].edit.endIndex;

        // Mark the target rule as merged so other rules don't try to merge it
        // again.
        rulesInfo[best[0].index].merged = true;

        // Wrap the edited range in an optional group.
        urlFilter = urlFilter.substring(0, editIndex) + "(" +
                    urlFilter.substring(editIndex, editEndIndex) + ")?" +
                    urlFilter.substring(editEndIndex);
      }

      rule.trigger["url-filter"] = urlFilter;

      // Mark this rule as one that has had other rules merged into it.
      ruleInfo.mergedInto = true;
    }
  }
}
+
/**
 * Merge rules with closely matching URL filters.
 *
 * Matches are first collected for every rule, then the candidates are
 * merged in place.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {boolean} exhaustive Whether to look for matches throughout the
 *                             rule set rather than only within a
 *                             heuristic range
 * @returns {Promise} A promise resolved once merging is done
 */
function mergeRulesByURLFilter(rulesInfo, exhaustive)
{
  let matchFinders = rulesInfo.map(
    (ruleInfo, index) =>
      () => findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
  );

  return async(matchFinders)
  .then(() => mergeCandidateRulesByURLFilter(rulesInfo));
}
+
/**
 * Merge the given rules into one by taking the union of the values of an
 * array property (e.g. "if-domain" in the trigger).
 *
 * All rules except the first are marked as merged; the first rule is
 * marked as having had other rules merged into it and receives the
 * combined values.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {string} propertyType The rule sub-object holding the property
 *                              (e.g. "trigger")
 * @param {string} property The name of the array property
 */
function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
{
  if (rulesInfo.length <= 1)
    return;

  let [first, ...rest] = rulesInfo;

  let values = new Set(first.rule[propertyType][property]);

  for (let ruleInfo of rest)
  {
    for (let value of ruleInfo.rule[propertyType][property] || [])
      values.add(value);

    ruleInfo.merged = true;
  }

  if (values.size > 0)
    first.rule[propertyType][property] = Array.from(values);

  first.mergedInto = true;
}
+
/**
 * Group rules that are identical except for the given property.
 *
 * Each rule's trigger and action are shallow-copied, the given property
 * is removed from the copy, and the JSON serialization of what remains
 * is used as the grouping key.
 *
 * @param {object[]} rulesInfo The rule information objects
 * @param {string} propertyType The rule sub-object holding the property
 *                              (e.g. "trigger")
 * @param {string} property The name of the property to ignore when
 *                          grouping
 * @returns {Map.<string,object[]>} The rules grouped by the serialized
 *                                  remainder of the rule
 */
function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
{
  let groups = new Map();

  for (let ruleInfo of rulesInfo)
  {
    let {trigger, action} = ruleInfo.rule;

    let copy = {
      trigger: Object.assign({}, trigger),
      action: Object.assign({}, action)
    };

    delete copy[propertyType][property];

    let groupKey = JSON.stringify(copy);

    if (groups.has(groupKey))
      groups.get(groupKey).push(ruleInfo);
    else
      groups.set(groupKey, [ruleInfo]);
  }

  return groups;
}
+
/**
 * Merge closely matching rules to reduce the size of the rule set.
 *
 * First, rules that are identical except for their URL filter are
 * grouped, redundant rules are eliminated, and closely matching URL
 * filters are merged. Then rules that are identical except for one of
 * their array properties ("resource-type", "if-domain") are grouped and
 * merged by taking the union of the property's values.
 *
 * @param {object[]} rules The rules to merge
 * @param {boolean} exhaustive Whether to look for merge candidates
 *                             throughout the rule set rather than only
 *                             within a heuristic range
 * @returns {Promise.<object[]>} A promise resolved with the merged rules
 */
function mergeRules(rules, exhaustive)
{
  let rulesInfo = rules.map(rule => ({rule}));

  let arrayPropertiesToMergeBy = ["resource-type", "if-domain"];

  return async(() =>
  {
    let map = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                            "url-filter");
    return async(Array.from(map.values()).map(mergeableRulesInfo => () =>
      eliminateRedundantRulesByURLFilter(mergeableRulesInfo, exhaustive)
      .then(rulesInfo => mergeRulesByURLFilter(rulesInfo, exhaustive))
    ))
    .then(() =>
    {
      // Filter out rules that are redundant or have been merged into other
      // rules.
      rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.redundant &&
                                               !ruleInfo.merged);
    });
  })
  .then(() => async(arrayPropertiesToMergeBy.map(arrayProperty => () =>
  {
    let map = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                            arrayProperty);
    return async(Array.from(map.values()).map(mergeableRulesInfo => () =>
      mergeRulesByArrayProperty(mergeableRulesInfo, "trigger", arrayProperty)
    ))
    .then(() =>
    {
      rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
    });
  })))
  .then(() => rulesInfo.map(ruleInfo => ruleInfo.rule));
}
+
let ContentBlockerList =
/**
* Create a new Adblock Plus filter to content blocker list converter
*
+ * @param {object} options Options for content blocker list generation
+ *
* @constructor
*/
-exports.ContentBlockerList = function ()
+exports.ContentBlockerList = function (options)
{
+ const defaultOptions = {
+ merge: "auto"
+ };
+
+ this.options = Object.assign({}, defaultOptions, options);
+
this.requestFilters = [];
this.requestExceptions = [];
this.elemhideFilters = [];
this.elemhideExceptions = [];
this.genericblockExceptions = [];
this.generichideExceptions = [];
this.elemhideSelectorExceptions = new Map();
};
@@ -671,22 +1190,26 @@
domains = this.elemhideSelectorExceptions[filter.selector] = [];
parseDomains(filter.domains, domains, []);
}
};
/**
* Generate content blocker list for all filters that were added
- *
- * @returns {Filter} filter Filter to convert
*/
-ContentBlockerList.prototype.generateRules = function(filter)
+ContentBlockerList.prototype.generateRules = function()
{
- let rules = [];
+ let cssRules = [];
+ let cssExceptionRules = [];
+ let blockingRules = [];
+ let blockingExceptionRules = [];
+
+ let ruleGroups = [cssRules, cssExceptionRules,
+ blockingRules, blockingExceptionRules];
let genericSelectors = [];
let groupedElemhideFilters = new Map();
for (let filter of this.elemhideFilters)
{
let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
if (!result)
@@ -723,35 +1246,57 @@
let genericSelectorExceptionDomains =
extractFilterDomains(this.generichideExceptions);
elemhideExceptionDomains.forEach(name =>
{
genericSelectorExceptionDomains.add(name);
});
- addCSSRules(rules, genericSelectors, "^https?://",
+ addCSSRules(cssRules, genericSelectors, "^https?://",
genericSelectorExceptionDomains);
groupedElemhideFilters.forEach((selectors, matchDomain) =>
{
- addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);
+ addCSSRules(cssRules, selectors, matchDomain, elemhideExceptionDomains);
});
let requestFilterExceptionDomains = [];
for (let filter of this.genericblockExceptions)
{
let parsed = parseFilterRegexpSource(filter.regexpSource);
if (parsed.hostname)
requestFilterExceptionDomains.push(parsed.hostname);
}
for (let filter of this.requestFilters)
{
- convertFilterAddRules(rules, filter, "block", true,
+ convertFilterAddRules(blockingRules, filter, "block", true,
requestFilterExceptionDomains);
}
for (let filter of this.requestExceptions)
- convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
+ convertFilterAddRules(blockingExceptionRules, filter,
kzar 2017/07/25 12:18:53 Nit: Please use braces for this for loop since it
Manish Jethani 2017/07/28 09:17:36 Done.
+ "ignore-previous-rules", true);
+
+ return async(ruleGroups.map((group, index) => () =>
+ {
+ let next = () =>
+ {
+ if (index == ruleGroups.length - 1)
+ return ruleGroups.reduce((all, rules) => all.concat(rules), []);
+ };
- return rules;
+ if (this.options.merge == "all" ||
+ (this.options.merge == "auto" &&
+ ruleGroups.reduce((n, group) => n + group.length, 0) > 50000))
+ {
+ return mergeRules(ruleGroups[index], this.options.merge == "all")
+ .then(rules =>
+ {
+ ruleGroups[index] = rules;
+ return next();
+ });
+ }
+
+ return next();
+ }));
};
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld