Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Make it work in Safari Created May 5, 2017, 4:36 a.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: lib/abp2blocklist.js
===================================================================
--- a/lib/abp2blocklist.js
+++ b/lib/abp2blocklist.js
@@ -33,16 +33,63 @@
| typeMap.POPUP
| typeMap.OBJECT
| typeMap.OBJECT_SUBREQUEST
| typeMap.XMLHTTPREQUEST
| typeMap.PING
| typeMap.SUBDOCUMENT
| typeMap.OTHER);
/**
 * Computes a 32-bit digest of a string using the Pearson hashing algorithm.
 *
 * Four 8-bit Pearson passes are run over the message; pass i offsets the
 * first character by i so that each pass yields a different byte. The four
 * bytes are packed little-endian into one integer.
 *
 * Only the lowest 8 bits of each UTF-16 code unit are used. For best results,
 * encode the input so that every character fits into 8 bits (e.g. UTF-8 or
 * encodeURIComponent).
 *
 * @param {string} message The text to hash
 * @returns {number} The digest as a signed 32-bit integer (it is negative
 *                   when the top bit of the fourth byte is set); 0 for the
 *                   empty string
 */
function pearsonHash(message)
{
  // A table of all numbers from 0 through 255 shuffled up.
  const table = [
    0xe5, 0x0a, 0x9f, 0x79, 0x99, 0xad, 0x10, 0x85, 0x5d, 0x55, 0x75, 0x2e,
    0x04, 0x1a, 0xb5, 0x7d, 0x96, 0xe6, 0xa3, 0xc6, 0x82, 0x87, 0xb2, 0xef,
    0x00, 0x64, 0x70, 0x4b, 0xe4, 0x2f, 0x37, 0x52, 0x90, 0x1d, 0x08, 0x68,
    0x3a, 0x26, 0x74, 0xaa, 0xc1, 0x80, 0x17, 0xc2, 0x0f, 0xec, 0xc8, 0x1f,
    0xe2, 0x3c, 0xe1, 0xa1, 0x8f, 0xfd, 0x9d, 0x0b, 0xd5, 0xcc, 0xd4, 0xd9,
    0xf0, 0x3d, 0x5e, 0x57, 0xae, 0x12, 0x46, 0xb0, 0x63, 0x94, 0x61, 0x9a,
    0xbb, 0x76, 0x0c, 0x3f, 0xc4, 0x59, 0xdc, 0x5c, 0xb4, 0xc7, 0x73, 0x39,
    0x65, 0x2c, 0x2a, 0xc3, 0xed, 0x20, 0x54, 0xfe, 0xfb, 0xd0, 0xa6, 0x33,
    0x4a, 0x9e, 0xe7, 0x49, 0xea, 0x58, 0xaf, 0x35, 0x30, 0x95, 0x2b, 0x56,
    0x14, 0xff, 0xb1, 0xd6, 0x27, 0x6a, 0x88, 0x89, 0x43, 0x4c, 0xca, 0xb9,
    0x21, 0x8a, 0x78, 0xf1, 0x18, 0x1e, 0xd1, 0xe0, 0x60, 0xf8, 0x3e, 0xdd,
    0x25, 0x16, 0xde, 0xc0, 0x98, 0x28, 0x7a, 0x3b, 0x1b, 0x45, 0xa5, 0xb3,
    0xe3, 0x84, 0xd3, 0xb8, 0xbd, 0x47, 0xe9, 0xfa, 0xc9, 0xb6, 0xbe, 0x6c,
    0x9c, 0xac, 0xda, 0xfc, 0x41, 0x0d, 0x07, 0x91, 0x6b, 0x6f, 0x03, 0xcb,
    0xbc, 0x8d, 0x06, 0x01, 0xd2, 0x8c, 0x19, 0x5a, 0x02, 0xba, 0x4f, 0x6e,
    0x2d, 0xf4, 0xcf, 0x7e, 0x6d, 0x42, 0x93, 0x31, 0xbf, 0xdf, 0x5f, 0x67,
    0x53, 0xf6, 0x38, 0x9b, 0xa7, 0xe8, 0xee, 0x5b, 0x0e, 0x22, 0xdb, 0x51,
    0x8e, 0x69, 0x97, 0x32, 0x36, 0xce, 0x77, 0x4d, 0xa4, 0xf2, 0x23, 0xc5,
    0x11, 0x05, 0xab, 0xf9, 0x13, 0xd8, 0x7b, 0xa8, 0x40, 0x66, 0xd7, 0x24,
    0x86, 0xa0, 0xeb, 0xf3, 0x81, 0x4e, 0x50, 0xf7, 0xb7, 0x7f, 0x83, 0xa2,
    0xa9, 0x09, 0xcd, 0x62, 0x7c, 0x92, 0x8b, 0x71, 0x44, 0xf5, 0x72, 0x1c,
    0x29, 0x48, 0x34, 0x15
  ];

  // There is nothing to fold for an empty message; keep returning 0 for it,
  // as this function always has.
  if (message.length == 0)
    return 0;

  let digest = 0;

  for (let i = 0; i < 4; i++)
  {
    // Masking with 0xff keeps every lookup inside the 256-entry table, even
    // if the caller passes characters outside the 8-bit range.
    let d = table[((message.charCodeAt(0) & 0xff) + i) % 256];
    for (let j = 1; j < message.length; j++)
      d = table[d ^ (message.charCodeAt(j) & 0xff)];

    // Place this pass's byte at bit offset i * 8 of the digest.
    digest |= d << (i * 8);
  }

  return digest;
}
+
function parseDomains(domains, included, excluded)
{
for (let domain in domains)
{
if (domain != "")
{
let enabled = domains[domain];
domain = punycode.toASCII(domain.toLowerCase());
@@ -361,16 +408,425 @@
newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
i = pos.end;
}
newSelector.push(selector.substring(i));
return newSelector.join("");
}
// Characters that have a special meaning in regular expressions. An edit whose
// delta touches any of these is never reported by closeMatch, because the
// caller would otherwise turn it into a broken or semantically different
// regular expression (e.g. "^?http" or "ad.[*s]").
const closeMatchSpecialCharacters = new Set([
  ".", "+", "$", "?", "{", "}", "(", ")", "[", "]", "\\", "*", "^", "|"
]);

/**
 * Finds the single edit that turns the source string into the target string.
 *
 * The edit is one of "substitute", "delete", and "insert", along with the
 * index at which it occurs. For "substitute" and "delete" the index refers to
 * the source string; for "insert" it refers to the target string (i.e. the
 * longer of the two). If the strings are identical or are not a close match,
 * null is returned.
 *
 * If singleCharacterOnly is false, deletions or insertions of a contiguous
 * range of characters at a single position are treated as one edit. For
 * example, "internal" and "international" are considered to be one edit
 * apart, inserting the substring "tiona" from the latter into the former. Such
 * edits carry an additional endIndex property (exclusive end of the range in
 * the longer string).
 *
 * A few things to note:
 *
 *   1) This function only treats its inputs as raw strings. For example,
 *      "[ab]" and "[bc]" are two edits apart here even though a caller
 *      treating them as regular expressions might see it differently.
 *
 *   2) To be friendly to calling code that passes in regular expressions,
 *      null is returned if a special character (".", "+", "$", "?", "{", "}",
 *      "(", ")", "[", "]", "\", "*", "^" or "|") is involved in the edit. For
 *      example, given "Hello" and "Hello, how are you?", it returns null
 *      rather than a multiple character insertion.
 *
 *   3) The calling code within this file does pass in regular expressions,
 *      assuming that the parts where two of them differ can always be treated
 *      as normal strings. For example, "^https?://.*\/ads" and
 *      "^https?://.*\/adv" differ only in the last character and can be merged
 *      into "^https?://.*\/ad[sv]". If the differing characters originally
 *      appeared within square brackets, the caller would have to do extra work
 *      to merge correctly; it assumes that this is never the case.
 *
 * @param {string} s The source string
 * @param {string} t The target string
 * @param {boolean} singleCharacterOnly Whether to consider only edits of
 *                                      exactly one character
 * @returns {?{type: string, index: number, endIndex: (number|undefined)}}
 */
function closeMatch(s, t, singleCharacterOnly)
{
  let diff = s.length - t.length;

  // If the string lengths differ by more than one character, we cannot arrive
  // at target from source in a single character edit operation.
  if (singleCharacterOnly && (diff < -1 || diff > 1))
    return null;

  // If target is longer than source, swap them for the purpose of our
  // calculation. From here on s is never shorter than t, while the sign of
  // diff still tells us which one the caller passed in as the source.
  if (diff < 0)
    [s, t] = [t, s];

  let edit = null;

  if (diff == 0 || diff == 1 || diff == -1)
  {
    // The string lengths differ by one character at most, use the simple
    // algorithm to find a single character edit.
    for (let i = 0, j = 0; i < s.length; i++)
    {
      if (s[i] == t[j])
      {
        j++;
        continue;
      }

      // Since we want one and only one edit operation, a second mismatch
      // means the strings are not a close match.
      if (edit)
        return null;

      // We don't deal with special characters.
      if (closeMatchSpecialCharacters.has(s[i]) ||
          closeMatchSpecialCharacters.has(t[j]))
        return null;

      if (diff == 0)
      {
        // If both strings are equal in length, this is a substitution.
        edit = {type: "substitute", index: i};
        j++;
      }
      else if (diff > 0)
      {
        // If the source string is longer, this is a deletion.
        edit = {type: "delete", index: i};
      }
      else
      {
        // The target string is longer (the strings were swapped above), so
        // from the caller's point of view this is an insertion.
        edit = {type: "insert", index: i};
      }
    }
  }
  else if (!singleCharacterOnly)
  {
    // Try another algorithm to find a multiple character deletion or
    // insertion: strip the common prefix and the common suffix and check that
    // the shorter string is fully consumed by them.

    // Length of the common prefix.
    let i = 0;
    while (i < s.length && s[i] == t[i])
      i++;

    // Length of the common suffix, not overlapping the prefix within t.
    let j = 0;
    while (j < t.length && t.length - j != i &&
           s[s.length - j - 1] == t[t.length - j - 1])
      j++;

    // Prefix and suffix must cover the shorter string entirely, otherwise
    // the strings differ in more than one place.
    if (i != t.length - j)
      return null;

    // If there are any special characters in the delta, bail.
    for (let k = i; k < s.length - j; k++)
    {
      if (closeMatchSpecialCharacters.has(s[k]))
        return null;
    }

    edit = {
      type: diff > 0 ? "delete" : "insert",
      index: i,
      endIndex: s.length - j
    };
  }

  return edit;
}
+
/**
 * Merges rules that are identical except for a small difference in their
 * "url-filter" into a single rule with a combined regular expression.
 *
 * Rules differing in one character are merged using a character class and/or
 * the "?" quantifier (e.g. "ad", "ads" and "adv" become "ad[sv]?"). If
 * options.advanced is set, rules differing in a contiguous range of characters
 * are merged as well (e.g. "ad" and "adserver" become "ad(server)?").
 *
 * Rules of type "ignore-previous-rules" are never merged. The "url-filter" of
 * a rule that absorbs other rules is modified in place on the rule object
 * passed in.
 *
 * @param {Object[]} rules The rules to merge
 * @param {Object} [options]
 * @param {boolean} [options.advanced=false] Also look for multiple character
 *                                           insertions and deletions
 * @param {boolean} [options.exhaustive=false] Compare every rule with every
 *   later rule rather than only the ones within the heuristic range; rules
 *   are then compared by their Pearson hash
 * @returns {Object[]} The rules that remain after merging, in their original
 *                     order
 */
function mergeCloselyMatchingRules(rules, options)
{
  const defaultOptions = {advanced: false, exhaustive: false};

  options = Object.assign({}, defaultOptions, options);

  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range. If we increase this value, it can give
  // us more matches and a smaller resulting rule set, but possibly at a
  // significant performance cost.
  const heuristicRange = 100;

  // Builds a character class that matches exactly the given characters. A
  // hyphen goes first and a caret goes last: a hyphen between two other
  // characters would form a range and a leading caret would negate the class,
  // either of which would change what the merged rule matches.
  function buildCharacterClass(characters)
  {
    let unique = new Set(characters);
    let hasHyphen = unique.delete("-");
    let hasCaret = unique.delete("^");

    return "[" + (hasHyphen ? "-" : "") + Array.from(unique).join("") +
           (hasCaret ? "^" : "") + "]";
  }

  let rulesInfo = rules.map(rule =>
  {
    let info = {rule};

    if (rule.action.type == "ignore-previous-rules")
    {
      info.skip = true;
      return info;
    }

    // Save a hash of the rule but without the URL filter. We use this for
    // comparison later.
    let copy = {
      trigger: Object.assign({}, rule.trigger),
      action: Object.assign({}, rule.action)
    };

    delete copy.trigger["url-filter"];

    let stringified = JSON.stringify(copy);

    // In exhaustive mode a compact numeric hash is stored instead of the full
    // string. The Pearson hash function expects all characters to be within
    // the 8-bit range, hence the encoding.
    info.ruleHash = options.exhaustive ?
                      pearsonHash(encodeURIComponent(stringified)) :
                      stringified;

    return info;
  });

  for (let i = 0; i < rules.length; i++)
  {
    if (rulesInfo[i].skip)
      continue;

    let limit = options.exhaustive ? rules.length :
                Math.min(i + heuristicRange, rules.length);

    for (let j = i + 1; j < limit; j++)
    {
      if (rulesInfo[j].skip)
        continue;

      // Only rules that are identical except for the URL filter qualify.
      if (rulesInfo[i].ruleHash != rulesInfo[j].ruleHash)
        continue;

      let source = rules[i].trigger["url-filter"];
      let target = rules[j].trigger["url-filter"];

      let edit = closeMatch(source, target, !options.advanced);

      if (!edit)
        continue;

      let urlFilter, ruleInfo, match = {edit};

      if (edit.type == "insert")
      {
        // Convert the insertion into a deletion and stick it on the target
        // rule instead. We can only group deletions and substitutions;
        // therefore insertions must be treated as deletions on the target
        // rule.
        urlFilter = target;
        ruleInfo = rulesInfo[j];
        match.index = i;
        edit.type = "delete";
      }
      else
      {
        urlFilter = source;
        ruleInfo = rulesInfo[i];
        match.index = j;
      }

      // If the edit has an end index, it represents a multiple character
      // edit.
      if (edit.endIndex)
      {
        // We only care about a single multiple character edit because the
        // number of characters for such a match doesn't matter, we can
        // only merge with one other rule.
        if (!ruleInfo.multiEditMatch)
          ruleInfo.multiEditMatch = match;

        continue;
      }

      // For single character edits, multiple rules can be merged into
      // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
      if (!ruleInfo.matches)
        ruleInfo.matches = new Array(urlFilter.length + 1);

      // Matches at a particular index. For example, for a source string
      // "ads", both target strings "ad" (deletion) and "adv"
      // (substitution) match at index 2, hence they are grouped together
      // to possibly be merged later into "ad[sv]?".
      let matchesForIndex = ruleInfo.matches[edit.index];

      if (matchesForIndex)
      {
        matchesForIndex.push(match);
      }
      else
      {
        matchesForIndex = [match];
        ruleInfo.matches[edit.index] = matchesForIndex;
      }

      // Keep track of the best set of matches. We later sort by this to
      // get best results.
      if (!ruleInfo.bestMatches ||
          matchesForIndex.length > ruleInfo.bestMatches.length)
        ruleInfo.bestMatches = matchesForIndex;
    }
  }

  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(
    ruleInfo => ruleInfo.bestMatches || ruleInfo.multiEditMatch
  );

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  let weightOf = ruleInfo =>
  {
    if (ruleInfo.bestMatches)
      return ruleInfo.bestMatches.length;

    return ruleInfo.multiEditMatch ? 1 : 0;
  };

  candidateRulesInfo.sort(
    (ruleInfo1, ruleInfo2) => weightOf(ruleInfo2) - weightOf(ruleInfo1)
  );

  // A rule can take part in a merge only if it has neither been merged into
  // another rule nor had other rules merged into it.
  let isAvailable = match => !rulesInfo[match.index].merged &&
                             !rulesInfo[match.index].mergedInto;

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((largest, matchesForIndex) =>
    {
      let available = (matchesForIndex || []).filter(isAvailable);

      return available.length > largest.length ? available : largest;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        isAvailable(ruleInfo.multiEditMatch))
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length == 0)
      continue;

    let urlFilter = rule.trigger["url-filter"];
    let editIndex = best[0].edit.index;

    if (!multiEdit)
    {
      // Merge all the matching rules into this one.

      let characters = [];
      let quantifier = "";

      for (let match of best)
      {
        if (match.edit.type == "delete")
        {
          // The other rule lacks the character at this index, so the
          // character becomes optional.
          quantifier = "?";
        }
        else
        {
          characters.push(rules[match.index].trigger["url-filter"][editIndex]);
        }

        // Mark the target rule as merged so other rules don't try to merge
        // it again.
        rulesInfo[match.index].merged = true;
      }

      // The character at the edit index is replaced by a character class if
      // there were any substitutions, and made optional if there were any
      // deletions.
      let unit = urlFilter[editIndex];

      if (characters.length > 0)
        unit = buildCharacterClass([unit].concat(characters));

      urlFilter = urlFilter.substring(0, editIndex) + unit + quantifier +
                  urlFilter.substring(editIndex + 1);
    }
    else
    {
      let editEndIndex = best[0].edit.endIndex;

      // Mark the target rule as merged so other rules don't try to merge it
      // again.
      rulesInfo[best[0].index].merged = true;

      urlFilter = urlFilter.substring(0, editIndex) + "(" +
                  urlFilter.substring(editIndex, editEndIndex) + ")?" +
                  urlFilter.substring(editEndIndex);
    }

    rule.trigger["url-filter"] = urlFilter;

    // Mark this rule as one that has had other rules merged into it.
    ruleInfo.mergedInto = true;
  }

  // Filter out rules that have been merged into other rules.
  return rulesInfo.filter(ruleInfo => !ruleInfo.merged)
                  .map(ruleInfo => ruleInfo.rule);
}
+
let ContentBlockerList =
/**
* Create a new Adblock Plus filter to content blocker list converter
*
* @constructor
*/
exports.ContentBlockerList = function ()
{
@@ -419,18 +875,27 @@
}
};
/**
* Generate content blocker list for all filters that were added
*
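* @param {Object} [options] Options controlling rule merging (merge, fastMerge, advancedMerge, exhaustiveMerge)
* @returns {Array} The generated content blocker rules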
*/
-ContentBlockerList.prototype.generateRules = function(filter)
+ContentBlockerList.prototype.generateRules = function(options)
{
+ const defaultOptions = {
+ merge: false,
+ fastMerge: true,
+ advancedMerge: null,
+ exhaustiveMerge: null
+ };
+
+ options = Object.assign({}, defaultOptions, options);
+
let rules = [];
let groupedElemhideFilters = new Map();
for (let filter of this.elemhideFilters)
{
let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
if (!result)
continue;
@@ -467,10 +932,26 @@
for (let filter of this.elemhideExceptions)
convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
for (let filter of this.requestFilters)
convertFilterAddRules(rules, filter, "block", true);
for (let filter of this.requestExceptions)
convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
- return rules.filter(rule => !hasNonASCI(rule));
+ rules = rules.filter(rule => !hasNonASCI(rule));
+
+ if (options.merge)
+ {
+ // If the more specific options are specified (e.g. "advanced" and
+ // "exhaustive"), they override the more general options (e.g. "fast").
+ let mergeOptions = {
+ advanced: options.advancedMerge ||
+ (!options.fastMerge && options.advancedMerge != false),
+ exhaustive: options.exhaustiveMerge ||
+ (!options.fastMerge && options.exhaustiveMerge != false)
+ };
+
+ rules = mergeCloselyMatchingRules(rules, mergeOptions);
+ }
+
+ return rules;
};
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld