| Index: lib/abp2blocklist.js |
| =================================================================== |
| --- a/lib/abp2blocklist.js |
| +++ b/lib/abp2blocklist.js |
| @@ -361,24 +361,495 @@ |
| newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); |
| i = pos.end; |
| } |
| newSelector.push(selector.substring(i)); |
| return newSelector.join(""); |
| } |
/**
 * Check if two strings are a close match
 *
 * This function returns an edit operation, one of "substitute", "delete", and
 * "insert", along with an index in the source string where the edit must occur
 * in order to arrive at the target string. If the strings are not a close
 * match, it returns null.
 *
 * Two strings are considered to be a close match if they are one edit
 * operation apart.
 *
 * Deletions or insertions of a contiguous range of characters from one string
 * into the other, at the same index, are treated as a single edit. For
 * example, "internal" and "international" are considered to be one edit apart
 * and therefore a close match.
 *
 * A few things to note:
 *
 *   1) This function does not care about the format of the input strings. For
 *   example, the caller may pass in regular expressions, where "[ab]" and
 *   "[bc]" could be considered to be a close match, since the order within the
 *   brackets doesn't matter. This function will still return null for this set
 *   of inputs since they are two edits apart.
 *
 *   2) To be friendly to calling code that might be passing in regular
 *   expressions, this function will simply return null if it encounters a
 *   special character (e.g. "\", "?", "+", "*", etc.) in the delta. For
 *   example, given "Hello" and "Hello, how are you?", it will return null.
 *   For a substitution the differing character of *both* strings is checked,
 *   so the result does not depend on the order of the arguments.
 *
 *   3) If the caller does indeed pass in regular expressions, it must make the
 *   important assumption that the parts where two such regular expressions may
 *   differ can always be treated as normal strings. For example,
 *   "^https?://example.com/ads" and "^https?://example.com/adv" differ only in
 *   the last character, therefore the regular expressions can safely be merged
 *   into "^https?://example.com/ad[sv]".
 *
 * The returned object has the shape {type, index[, endIndex]}. For a "delete"
 * the indices refer to the source string; for an "insert" they refer to the
 * target string (i.e. the longer of the two). endIndex is only present when
 * more than one character is deleted or inserted.
 *
 * @param {string} s The source string
 * @param {string} t The target string
 *
 * @returns {?object} An object describing the single edit operation that must
 *                    occur in the source string in order to arrive at the
 *                    target string, or null if the strings are not a close
 *                    match
 */
function closeMatch(s, t)
{
  // Characters that carry a special meaning in a regular expression. A delta
  // containing any of these cannot be treated as plain text.
  const specialCharacters = ".+*?^$|{}()[]\\";

  let diff = s.length - t.length;

  // If target is longer than source, swap them for the purpose of our
  // calculation. From here on s is never shorter than t.
  if (diff < 0)
  {
    let tmp = s;
    s = t;
    t = tmp;
  }

  let edit = null;

  let i = 0, j = 0;

  // Start from the beginning and keep going until we hit a character that
  // doesn't match.
  for (; i < s.length; i++)
  {
    if (s[i] != t[i])
      break;
  }

  // Now do exactly the same from the end, but also stop if we reach the
  // position where we terminated the previous loop.
  for (; j < t.length; j++)
  {
    if (t.length - j == i || s[s.length - j - 1] != t[t.length - j - 1])
      break;
  }

  if (diff == 0)
  {
    // If the strings are equal in length and the delta isn't exactly one
    // character, it's not a close match.
    if (t.length - j - i != 1)
      return null;
  }
  else if (i != t.length - j)
  {
    // For strings of unequal length, if we haven't found a match for every
    // single character in the shorter string counting from both the beginning
    // and the end, it's not a close match.
    return null;
  }

  for (let k = i; k < s.length - j; k++)
  {
    // If the delta contains any special characters, it's not a close match.
    if (specialCharacters.includes(s[k]))
      return null;
  }

  // For a substitution the differing character of the other string ends up in
  // the merged pattern as well, so it must be a plain character too.
  if (diff == 0 && specialCharacters.includes(t[i]))
    return null;

  if (diff == 0)
  {
    edit = {type: "substitute", index: i};
  }
  else if (diff > 0)
  {
    edit = {type: "delete", index: i};

    if (diff > 1)
      edit.endIndex = s.length - j;
  }
  else
  {
    edit = {type: "insert", index: i};

    if (diff < -1)
      edit.endIndex = s.length - j;
  }

  return edit;
}
| + |
/**
 * Drop rules whose URL filter is made redundant by a shorter URL filter
 *
 * A rule is marked as redundant (ruleInfo.redundant = true) if its URL filter
 * starts with the URL filter of another rule in the list, since anything the
 * longer filter matches is then matched by the shorter one too. Of two
 * identical filters, the earlier one is marked.
 *
 * The one exception is when the longer filter continues with "?", "*", "{" or
 * "|" right after the common prefix: these make the last part of the prefix
 * optional (or introduce an alternative), so the longer filter can match URLs
 * the shorter one doesn't and must be kept.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}, where
 *                             rule.trigger["url-filter"] is a string
 *
 * @returns {object[]} A new array containing only the entries that were not
 *                     marked as redundant
 */
function eliminateRedundantRulesByURLFilter(rulesInfo)
{
  // Returns true if everything matched by the longer filter is also matched
  // by the shorter one, judging by the common prefix.
  function isCoveredBy(longer, shorter)
  {
    if (longer.substring(0, shorter.length) != shorter)
      return false;

    // undefined if both filters are identical, which is fine.
    let next = longer[shorter.length];

    return next != "?" && next != "*" && next != "{" && next != "|";
  }

  for (let i = 0; i < rulesInfo.length; i++)
  {
    // If this rule is already marked as redundant, don't bother comparing it
    // with other rules.
    if (rulesInfo[i].redundant)
      continue;

    for (let j = i + 1; j < rulesInfo.length; j++)
    {
      if (rulesInfo[j].redundant)
        continue;

      let source = rulesInfo[i].rule.trigger["url-filter"];
      let target = rulesInfo[j].rule.trigger["url-filter"];

      if (source.length >= target.length)
      {
        if (isCoveredBy(source, target))
        {
          // No point in comparing a redundant rule with the remaining ones.
          rulesInfo[i].redundant = true;
          break;
        }
      }
      else if (isCoveredBy(target, source))
      {
        rulesInfo[j].redundant = true;
      }
    }
  }

  return rulesInfo.filter(ruleInfo => !ruleInfo.redundant);
}
| + |
/**
 * Merge rules whose URL filters are one edit apart
 *
 * Every pair of rules (within a limited window, unless exhaustive is true) is
 * compared using closeMatch(). Rules that differ from a given rule by a single
 * character at the same index are folded into that rule using a character
 * class and/or the "?" quantifier (e.g. "ad", "ads" and "adv" become
 * "ad[sv]?"); a rule that differs by a run of characters is folded in using an
 * optional group (e.g. "ad" and "adserver" become "ad(server)?").
 *
 * The function works in place: the "url-filter" of the surviving rule is
 * rewritten, the surviving ruleInfo gets mergedInto = true and every ruleInfo
 * that was folded into another one gets merged = true. In exhaustive mode,
 * rules found redundant by eliminateRedundantRulesByURLFilter() are marked
 * redundant = true. It is up to the caller to drop the marked entries.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}, where
 *                             rule.trigger["url-filter"] is a string
 * @param {boolean} exhaustive Whether to compare every rule with every other
 *                             rule instead of only nearby ones
 */
function mergeRulesByURLFilter(rulesInfo, exhaustive)
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range by default. If we increase this value,
  // it can give us more matches and a smaller resulting rule set, but possibly
  // at a significant performance cost.
  //
  // If the exhaustive option is true, we simply ignore this value and look for
  // matches throughout the rule set.
  const heuristicRange = 10;

  if (exhaustive)
  {
    // Throw out obviously redundant rules. Note that all the indices recorded
    // below refer to this filtered list.
    rulesInfo = eliminateRedundantRulesByURLFilter(rulesInfo);
  }

  if (rulesInfo.length <= 1)
    return;

  for (let i = 0; i < rulesInfo.length; i++)
  {
    let limit = exhaustive ? rulesInfo.length :
                Math.min(i + heuristicRange, rulesInfo.length);

    for (let j = i + 1; j < limit; j++)
    {
      let source = rulesInfo[i].rule.trigger["url-filter"];
      let target = rulesInfo[j].rule.trigger["url-filter"];

      let edit = closeMatch(source, target);

      if (edit)
      {
        let urlFilter, ruleInfo, match = {edit};

        if (edit.type == "insert")
        {
          // Convert the insertion into a deletion and stick it on the target
          // rule instead. We can only group deletions and substitutions;
          // therefore insertions must be treated as deletions on the target
          // rule.
          urlFilter = target;
          ruleInfo = rulesInfo[j];
          match.index = i;
          edit.type = "delete";
        }
        else
        {
          urlFilter = source;
          ruleInfo = rulesInfo[i];
          match.index = j;
        }

        // If the edit has an end index, it represents a multiple character
        // edit.
        let multiEdit = !!edit.endIndex;

        if (multiEdit)
        {
          // We only care about a single multiple character edit because the
          // number of characters for such a match doesn't matter, we can
          // only merge with one other rule.
          if (!ruleInfo.multiEditMatch)
            ruleInfo.multiEditMatch = match;
        }
        else
        {
          // For single character edits, multiple rules can be merged into
          // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
          if (!ruleInfo.matches)
            ruleInfo.matches = new Array(urlFilter.length);

          // Matches at a particular index. For example, for a source string
          // "ads", both target strings "ad" (deletion) and "adv"
          // (substitution) match at index 2, hence they are grouped together
          // to possibly be merged later into "ad[sv]?".
          let matchesForIndex = ruleInfo.matches[edit.index];

          if (matchesForIndex)
          {
            matchesForIndex.push(match);
          }
          else
          {
            matchesForIndex = [match];
            ruleInfo.matches[edit.index] = matchesForIndex;
          }

          // Keep track of the best set of matches. We later sort by this to
          // get best results.
          if (!ruleInfo.bestMatches ||
              matchesForIndex.length > ruleInfo.bestMatches.length)
            ruleInfo.bestMatches = matchesForIndex;
        }
      }
    }
  }

  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return ruleInfo.bestMatches || ruleInfo.multiEditMatch;
  });

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
                  ruleInfo1.multiEditMatch ? 1 : 0;
    let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
                  ruleInfo2.multiEditMatch ? 1 : 0;

    return weight2 - weight1;
  });

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
    {
      matchesForIndex = (matchesForIndex || []).filter(match =>
      {
        // Filter out rules that have either already been merged into other
        // rules or have had other rules merged into them.
        return !rulesInfo[match.index].merged &&
               !rulesInfo[match.index].mergedInto;
      });

      return matchesForIndex.length > best.length ? matchesForIndex : best;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
        !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length > 0)
    {
      let urlFilter = rule.trigger["url-filter"];

      let editIndex = best[0].edit.index;

      if (!multiEdit)
      {
        // Merge all the matching rules into this one.

        let characters = [];
        let quantifier = "";

        for (let match of best)
        {
          if (match.edit.type == "delete")
          {
            quantifier = "?";
          }
          else
          {
            let character = rulesInfo[match.index].rule
                                                  .trigger["url-filter"][editIndex];
            characters.push(character);
          }

          // Mark the target rule as merged so other rules don't try to merge
          // it again.
          rulesInfo[match.index].merged = true;
        }

        urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
                    urlFilter.substring(editIndex + 1);
        if (characters.length > 0)
        {
          // The class consists of this rule's own character followed by the
          // substitutes. A hyphen between two other members would turn them
          // into a range (e.g. "[s-v]" also matches "t" and "u"), so it is
          // always emitted last, where it is a literal.
          let members = [urlFilter[editIndex]].concat(characters);
          let classBody = members.filter(member => member != "-").join("");

          if (members.indexOf("-") != -1)
            classBody += "-";

          urlFilter = urlFilter.substring(0, editIndex) + "[" +
                      classBody + "]" +
                      urlFilter.substring(editIndex + 1);
        }
      }
      else
      {
        let editEndIndex = best[0].edit.endIndex;

        // Mark the target rule as merged so other rules don't try to merge it
        // again.
        rulesInfo[best[0].index].merged = true;

        urlFilter = urlFilter.substring(0, editIndex) + "(" +
                    urlFilter.substring(editIndex, editEndIndex) + ")?" +
                    urlFilter.substring(editEndIndex);
      }

      rule.trigger["url-filter"] = urlFilter;

      // Mark this rule as one that has had other rules merged into it.
      ruleInfo.mergedInto = true;
    }
  }
}
| + |
/**
 * Merge a group of rules that differ only in one array property
 *
 * All the rules in the group are folded into the first one: its property is
 * set to the union of the values of all the rules, it is marked with
 * mergedInto = true, and every other entry is marked with merged = true so the
 * caller can drop it.
 *
 * A rule that doesn't have the property at all is not restricted by it (e.g. a
 * trigger without "if-domain" applies on all domains). If any rule in the
 * group lacks the property, the merged rule must not be restricted either, so
 * the property is removed from it rather than set to the union.
 *
 * The rulesInfo array itself is left untouched.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}
 * @param {string} propertyType The part of the rule the property lives in
 *                              (e.g. "trigger")
 * @param {string} property The name of the array property (e.g. "if-domain")
 */
function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
{
  if (rulesInfo.length <= 1)
    return;

  let firstRuleInfo = rulesInfo[0];

  let valueSet = new Set();
  let unrestricted = false;

  for (let i = 0; i < rulesInfo.length; i++)
  {
    let values = rulesInfo[i].rule[propertyType][property];

    if (values == null)
    {
      unrestricted = true;
    }
    else
    {
      for (let value of values)
        valueSet.add(value);
    }

    // Everything but the first rule disappears into the first rule.
    if (i > 0)
      rulesInfo[i].merged = true;
  }

  if (unrestricted)
    delete firstRuleInfo.rule[propertyType][property];
  else if (valueSet.size > 0)
    firstRuleInfo.rule[propertyType][property] = Array.from(valueSet);

  firstRuleInfo.mergedInto = true;
}
| + |
/**
 * Group rules that are identical except for one property
 *
 * For each rule, a shallow copy of its trigger and action is made, the given
 * property is removed from the copy, and the JSON serialization of what
 * remains is used as the group key. The rules themselves are not modified.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}
 * @param {string} propertyType Either "trigger" or "action"
 * @param {string} property The name of the property to ignore when comparing
 *
 * @returns {Map.<string,object[]>} The entries of rulesInfo, in their original
 *                                  order, keyed by group
 */
function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
{
  let groups = new Map();

  rulesInfo.forEach(info =>
  {
    let {trigger, action} = info.rule;

    // Work on shallow copies so that removing the property doesn't affect the
    // actual rule.
    let stripped = {
      trigger: Object.assign({}, trigger),
      action: Object.assign({}, action)
    };

    delete stripped[propertyType][property];

    let key = JSON.stringify(stripped);

    if (!groups.has(key))
      groups.set(key, []);

    groups.get(key).push(info);
  });

  return groups;
}
| + |
/**
 * Reduce a list of rules by merging those that can be expressed as one
 *
 * Rules are first merged by URL filter, then by the "resource-type" and
 * "if-domain" array properties, in that order. After each pass the rules that
 * were folded into another rule (or found redundant) are dropped.
 *
 * @param {object[]} rules The rules to merge
 * @param {object} [options] Options; the only one read here is "exhaustive"
 *                           (default false), which is passed on to
 *                           mergeRulesByURLFilter()
 *
 * @returns {object[]} The remaining rules
 */
function mergeRules(rules, options)
{
  let exhaustive = Object.assign({exhaustive: false}, options).exhaustive;

  // Wrap each rule so that bookkeeping flags don't end up on the rule itself.
  let rulesInfo = rules.map(rule => ({rule}));

  let urlFilterGroups = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                                      "url-filter");

  for (let group of urlFilterGroups.values())
  {
    if (group.length > 1)
      mergeRulesByURLFilter(group, exhaustive);
  }

  // Filter out rules that are redundant or have been merged into other rules.
  rulesInfo = rulesInfo.filter(info => !(info.redundant || info.merged));

  for (let arrayProperty of ["resource-type", "if-domain"])
  {
    let groups = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                               arrayProperty);

    for (let group of groups.values())
    {
      if (group.length > 1)
        mergeRulesByArrayProperty(group, "trigger", arrayProperty);
    }

    rulesInfo = rulesInfo.filter(info => !info.merged);
  }

  return rulesInfo.map(info => info.rule);
}
| + |
| let ContentBlockerList = |
| /** |
| * Create a new Adblock Plus filter to content blocker list converter |
| * |
| + * @param {object} options Options for content blocker list generation |
| + * |
| * @constructor |
| */ |
| -exports.ContentBlockerList = function () |
| +exports.ContentBlockerList = function(options) |
| { |
| + const defaultOptions = { |
| + merge: false, |
| + exhaustiveMerge: false |
| + }; |
| + |
| + this.options = Object.assign({}, defaultOptions, options); |
| + |
| this.requestFilters = []; |
| this.requestExceptions = []; |
| this.elemhideFilters = []; |
| this.elemhideExceptions = []; |
| this.elemhideSelectorExceptions = new Map(); |
| }; |
| /** |
| @@ -416,20 +887,18 @@ |
| domains = this.elemhideSelectorExceptions[filter.selector] = []; |
| parseDomains(filter.domains, domains, []); |
| } |
| }; |
| /** |
| * Generate content blocker list for all filters that were added |
| - * |
| - * @returns {Filter} filter Filter to convert |
| */ |
| -ContentBlockerList.prototype.generateRules = function(filter) |
| +ContentBlockerList.prototype.generateRules = function() |
| { |
| let rules = []; |
| let groupedElemhideFilters = new Map(); |
| for (let filter of this.elemhideFilters) |
| { |
| let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); |
| if (!result) |
| @@ -467,10 +936,15 @@ |
| for (let filter of this.elemhideExceptions) |
| convertFilterAddRules(rules, filter, "ignore-previous-rules", false); |
| for (let filter of this.requestFilters) |
| convertFilterAddRules(rules, filter, "block", true); |
| for (let filter of this.requestExceptions) |
| convertFilterAddRules(rules, filter, "ignore-previous-rules", true); |
| - return rules.filter(rule => !hasNonASCI(rule)); |
| + rules = rules.filter(rule => !hasNonASCI(rule)); |
| + |
| + if (this.options.merge) |
| + rules = mergeRules(rules, {exhaustive: this.options.exhaustiveMerge}); |
| + |
| + return rules; |
| }; |