Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Improved matching algorithm Created May 4, 2017, 2:36 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
16 */ 16 */
17 17
18 /** @module abp2blocklist */ 18 /** @module abp2blocklist */
19 19
20 "use strict"; 20 "use strict";
21 21
22 const crypto = require("crypto");
23
22 let filterClasses = require("filterClasses"); 24 let filterClasses = require("filterClasses");
23 let tldjs = require("tldjs"); 25 let tldjs = require("tldjs");
24 let punycode = require("punycode"); 26 let punycode = require("punycode");
25 27
26 const selectorLimit = 5000; 28 const selectorLimit = 5000;
27 const typeMap = filterClasses.RegExpFilter.typeMap; 29 const typeMap = filterClasses.RegExpFilter.typeMap;
28 const whitelistableRequestTypes = (typeMap.IMAGE 30 const whitelistableRequestTypes = (typeMap.IMAGE
29 | typeMap.STYLESHEET 31 | typeMap.STYLESHEET
30 | typeMap.SCRIPT 32 | typeMap.SCRIPT
31 | typeMap.FONT 33 | typeMap.FONT
(...skipping 327 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 361 {
360 newSelector.push(selector.substring(i, pos.start)); 362 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 363 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 364 i = pos.end;
363 } 365 }
364 newSelector.push(selector.substring(i)); 366 newSelector.push(selector.substring(i));
365 367
366 return newSelector.join(""); 368 return newSelector.join("");
367 } 369 }
368 370
/**
 * Check whether two strings are exactly one edit apart.
 *
 * The edit is one of "substitute", "delete" and "insert", reported along with
 * the index at which it must occur. If the strings are not a close match
 * (including when they are identical), null is returned.
 *
 * Indices always refer to the longer of the two strings: for "delete" that is
 * the source string, for "insert" it is the target string (i.e. an insertion
 * is the deletion of the same range from the target).
 *
 * If singleCharacterOnly is false, deletions or insertions of a contiguous
 * range of characters from one string into the other, at the same index, are
 * treated as a single edit. For example, "internal" and "international" are
 * considered to be one edit apart, inserting the substring "tiona" from the
 * latter into the former. Such edits carry an additional endIndex property
 * (exclusive).
 *
 * A few things to note:
 *
 *   1) This function does not care about how the input strings are treated
 *      by the caller. It only treats them as raw strings. For example, the
 *      caller may treat them as regular expressions, where "[ab]" and "[bc]"
 *      could be considered to have an edit distance of 1, since the order
 *      within the brackets does not matter. This function will still return
 *      null for this set of inputs since they are two edits apart.
 *
 *   2) To be friendly to calling code that might be passing in regular
 *      expressions anyway, this function will simply return null if it
 *      encounters a special character (any of "\", "^", "$", ".", "*", "+",
 *      "?", "(", ")", "[", "]", "{", "}", "|") in the delta. For example,
 *      given "Hello" and "Hello, how are you?", it will return null instead
 *      of "{type: 'insert', index: 5, endIndex: 19}". It also returns null if
 *      a multiple character delta is immediately followed by a quantifier,
 *      because wrapping the delta in "(...)?" would change what that
 *      quantifier applies to.
 *
 *   3) The calling code within this file does indeed pass in regular
 *      expressions (the strict subset of JavaScript regular expressions
 *      supported by WebKit for content blockers), making the important
 *      assumption that the parts where two such regular expressions may
 *      differ can always be treated as normal strings.
 *
 *      For example, "^https?://.*\/ads" and "^https?://.*\/adv" differ only
 *      in the last character, therefore the regular expressions can safely
 *      be merged into "^https?://.*\/ad[sv]". If, for example, the characters
 *      in the delta were to appear within square brackets originally in the
 *      input strings (e.g. "^https?://.*\/ad[sx]" and "^https?://.*\/ad[vx]"),
 *      the calling code would have to do extra work to merge the two regular
 *      expressions correctly. The calling code within this file assumes that
 *      this is never the case.
 *
 * @param {string} s Source string
 * @param {string} t Target string
 * @param {object} [options]
 * @param {boolean} [options.singleCharacterOnly=false]
 *   Only look for edits that affect a single character
 * @returns {?object} An object with the properties type, index and (for
 *   multiple character edits only) endIndex, or null
 */
function closeMatch(s, t, {singleCharacterOnly = false} = {})
{
  // Characters that carry a meaning in regular expressions. "^" matters in
  // particular because a substitution involving it could end up as the first
  // character of a character class, negating the class.
  const specialCharacters = "\\^$.*+?()[]{}|";
  const quantifierCharacters = "*+?{";

  // Indexing past the end of a string yields undefined, which is never
  // special.
  let isSpecial = character => character != null &&
                               specialCharacters.includes(character);
  let isQuantifier = character => character != null &&
                                  quantifierCharacters.includes(character);

  let diff = s.length - t.length;

  // If the string lengths differ by more than one character, we cannot arrive
  // at target from source in a single character edit operation.
  if (singleCharacterOnly && (diff < -1 || diff > 1))
    return null;

  // If target is longer than source, swap them for the purpose of our
  // calculation. Note that diff intentionally keeps its original sign so we
  // can still tell deletions (diff > 0) from insertions (diff < 0) below.
  if (diff < 0)
  {
    let tmp = s;
    s = t;
    t = tmp;
  }

  let edit = null;

  if (diff >= -1 && diff <= 1)
  {
    // The string lengths differ by one character at most, use the simple
    // algorithm to find a single character edit.
    for (let i = 0, j = 0; i < s.length; i++)
    {
      if (s[i] == t[j])
      {
        j++;
      }
      else if (edit)
      {
        // Since we want one and only one edit operation, we must bail here.
        return null;
      }
      else if (isSpecial(s[i]) || isSpecial(t[j]))
      {
        // We don't deal with special characters for now.
        return null;
      }
      else if (diff == 0)
      {
        // If both strings are equal in length, this is a substitution.
        edit = {type: "substitute", index: i};
        j++;
      }
      else if (diff > 0)
      {
        // If the source string is longer, this is a deletion.
        edit = {type: "delete", index: i};
      }
      else
      {
        // Otherwise the target string is longer, this is an insertion.
        edit = {type: "insert", index: i};
      }
    }
  }
  else
  {
    // Try another algorithm to find a multiple character deletion or
    // insertion. We can only get here if singleCharacterOnly is false.

    let i = 0;
    let j = 0;

    // Length of the common prefix.
    for (; i < s.length; i++)
    {
      if (s[i] != t[i])
        break;
    }

    // Length of the common suffix, not overlapping the common prefix within
    // the shorter string.
    for (; j < t.length; j++)
    {
      if (t.length - j == i ||
          s[s.length - j - 1] != t[t.length - j - 1])
        break;
    }

    // Prefix and suffix must cover the shorter string entirely, otherwise
    // there is more than one edit.
    if (i != t.length - j)
      return null;

    let endIndex = s.length - j;

    // If there are any special characters in the delta, bail.
    for (let k = i; k < endIndex; k++)
    {
      if (isSpecial(s[k]))
        return null;
    }

    // If the delta is followed by a quantifier, making the delta optional
    // would make the quantifier apply to something else (or produce an
    // invalid expression like "(erv)?+"), bail.
    if (isQuantifier(s[endIndex]))
      return null;

    edit = {type: diff > 0 ? "delete" : "insert", index: i, endIndex};
  }

  return edit;
}
518
/**
 * Build a regular expression character class matching any of the given
 * characters.
 *
 * Duplicates are dropped. A "-" is always placed first and a "^" is always
 * placed last, so that neither is interpreted as a range or as a negation.
 * The caller is expected to pass at least two distinct characters, none of
 * which is "]" or "\".
 *
 * @param {string[]} characters
 * @returns {string}
 */
function makeCharacterClass(characters)
{
  let unique = Array.from(new Set(characters));
  let others = unique.filter(character => character != "-" &&
                                          character != "^");

  return "[" + (unique.indexOf("-") != -1 ? "-" : "") + others.join("") +
         (unique.indexOf("^") != -1 ? "^" : "") + "]";
}

/**
 * Merge rules that are identical except for their URL filters, where the URL
 * filters are exactly one edit apart according to closeMatch.
 *
 * For example, rules with the URL filters "ad", "ads" and "adv" are merged
 * into a single rule with the URL filter "ad[sv]?", and (with the advanced
 * option) "ad" and "adserver" are merged into "ad(server)?".
 *
 * Rules with the action type "ignore-previous-rules" are never merged. Note
 * that the "url-filter" of a rule that has other rules merged into it is
 * rewritten in place on the rule object passed in.
 *
 * @param {object[]} rules Content blocker rules
 * @param {object} [options]
 * @param {boolean} [options.advanced=false]
 *   Also merge rules that are a multiple character insertion or deletion
 *   apart
 * @param {boolean} [options.exhaustive=false]
 *   Compare each rule with all the rules following it rather than only with
 *   the ones within the heuristic range
 * @returns {object[]} The rules that remain after merging, in their original
 *   relative order
 */
function mergeCloselyMatchingRules(rules,
                                   {advanced = false, exhaustive = false} = {})
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range. If we increase this value, it can give
  // us more matches and a smaller resulting rule set, but possibly at a
  // significant performance cost.
  const heuristicRange = 100;

  let rulesInfo = new Array(rules.length);

  rules.forEach((rule, index) =>
  {
    rulesInfo[index] = {rule};

    if (rule.action.type == "ignore-previous-rules")
    {
      rulesInfo[index].skip = true;
    }
    else
    {
      // Save a hash of the rule but without the URL filter. We use this for
      // comparison later.
      let copy = {
        trigger: Object.assign({}, rule.trigger),
        action: Object.assign({}, rule.action)
      };

      delete copy.trigger["url-filter"];

      // Keep the full digest: two rules with the same hash get merged, so a
      // collision on a truncated hash would silently change what is blocked.
      rulesInfo[index].ruleHash = crypto.createHash("sha1")
                                        .update(JSON.stringify(copy))
                                        .digest("hex");
    }
  });

  for (let i = 0; i < rules.length; i++)
  {
    if (rulesInfo[i].skip)
      continue;

    let limit = exhaustive ? rules.length :
                Math.min(i + heuristicRange, rules.length);

    for (let j = i + 1; j < limit; j++)
    {
      if (rulesInfo[j].skip)
        continue;

      // Check if the rules are identical except for the URL filter.
      if (rulesInfo[i].ruleHash != rulesInfo[j].ruleHash)
        continue;

      let source = rules[i].trigger["url-filter"];
      let target = rules[j].trigger["url-filter"];

      let edit = closeMatch(source, target, {singleCharacterOnly: !advanced});

      if (!edit)
        continue;

      let urlFilter, ruleInfo, match = {edit};

      if (edit.type == "insert")
      {
        // Convert the insertion into a deletion and stick it on the target
        // rule instead. We can only group deletions and substitutions;
        // therefore insertions must be treated as deletions on the target
        // rule.
        urlFilter = target;
        ruleInfo = rulesInfo[j];
        match.index = i;
        edit.type = "delete";
      }
      else
      {
        urlFilter = source;
        ruleInfo = rulesInfo[i];
        match.index = j;
      }

      // If the edit has an end index, it represents a multiple character
      // edit.
      let multiEdit = typeof edit.endIndex == "number";

      if (multiEdit)
      {
        // We only care about a single multiple character edit because the
        // number of characters for such a match doesn't matter, we can
        // only merge with one other rule.
        if (!ruleInfo.multiEditMatch)
          ruleInfo.multiEditMatch = match;
      }
      else
      {
        // For single character edits, multiple rules can be merged into
        // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
        if (!ruleInfo.matches)
          ruleInfo.matches = new Array(urlFilter.length + 1);

        // Matches at a particular index. For example, for a source string
        // "ads", both target strings "ad" (deletion) and "adv"
        // (substitution) match at index 2, hence they are grouped together
        // to possibly be merged later into "ad[sv]?".
        let matchesForIndex = ruleInfo.matches[edit.index];

        if (matchesForIndex)
        {
          matchesForIndex.push(match);
        }
        else
        {
          matchesForIndex = [match];
          ruleInfo.matches[edit.index] = matchesForIndex;
        }

        // Keep track of the best set of matches. We later sort by this to
        // get best results.
        if (!ruleInfo.bestMatches ||
            matchesForIndex.length > ruleInfo.bestMatches.length)
          ruleInfo.bestMatches = matchesForIndex;
      }
    }
  }

  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return ruleInfo.bestMatches || ruleInfo.multiEditMatch;
  });

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
                  ruleInfo1.multiEditMatch ? 1 : 0;
    let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
                  ruleInfo2.multiEditMatch ? 1 : 0;

    return weight2 - weight1;
  });

  // A rule can still take part in a merge if it has neither been merged into
  // another rule nor had other rules merged into it.
  let isAvailable = match => !rulesInfo[match.index].merged &&
                             !rulesInfo[match.index].mergedInto;

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((largest, matchesForIndex) =>
    {
      let available = (matchesForIndex || []).filter(isAvailable);

      return available.length > largest.length ? available : largest;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        isAvailable(ruleInfo.multiEditMatch))
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length == 0)
      continue;

    let urlFilter = rule.trigger["url-filter"];

    let editIndex = best[0].edit.index;

    if (!multiEdit)
    {
      // Merge all the matching rules into this one.

      let characters = [];
      let quantifier = "";

      for (let match of best)
      {
        if (match.edit.type == "delete")
        {
          // The other rule lacks the character at the edit index, so the
          // character becomes optional.
          quantifier = "?";
        }
        else
        {
          let character = rules[match.index].trigger["url-filter"][editIndex];
          characters.push(character);
        }

        // Mark the target rule as merged so other rules don't try to merge
        // it again.
        rulesInfo[match.index].merged = true;
      }

      // The character at the edit index is either kept as is or, if there
      // are any substitutions, replaced with a character class containing
      // all the alternatives. The class is built in a way that keeps "-"
      // and "^" literal (e.g. "_", "-" and "z" give "[-_z]", not the range
      // "[_-z]").
      let atom = urlFilter[editIndex];
      if (characters.length > 0)
        atom = makeCharacterClass([atom].concat(characters));

      urlFilter = urlFilter.substring(0, editIndex) + atom + quantifier +
                  urlFilter.substring(editIndex + 1);
    }
    else
    {
      let editEndIndex = best[0].edit.endIndex;

      // Mark the target rule as merged so other rules don't try to merge it
      // again.
      rulesInfo[best[0].index].merged = true;

      urlFilter = urlFilter.substring(0, editIndex) + "(" +
                  urlFilter.substring(editIndex, editEndIndex) + ")?" +
                  urlFilter.substring(editEndIndex);
    }

    rule.trigger["url-filter"] = urlFilter;

    // Mark this rule as one that has had other rules merged into it.
    ruleInfo.mergedInto = true;
  }

  // Filter out rules that have been merged into other rules.
  return rulesInfo.filter(ruleInfo => !ruleInfo.merged)
                  .map(ruleInfo => ruleInfo.rule);
}
766
369 let ContentBlockerList = 767 let ContentBlockerList =
370 /** 768 /**
371 * Create a new Adblock Plus filter to content blocker list converter 769 * Create a new Adblock Plus filter to content blocker list converter
372 * 770 *
373 * @constructor 771 * @constructor
374 */ 772 */
375 exports.ContentBlockerList = function () 773 exports.ContentBlockerList = function ()
376 { 774 {
377 this.requestFilters = []; 775 this.requestFilters = [];
378 this.requestExceptions = []; 776 this.requestExceptions = [];
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 815
418 parseDomains(filter.domains, domains, []); 816 parseDomains(filter.domains, domains, []);
419 } 817 }
420 }; 818 };
421 819
422 /** 820 /**
423 * Generate content blocker list for all filters that were added 821 * Generate content blocker list for all filters that were added
424 * 822 *
425 * @returns {Filter} filter Filter to convert 823 * @returns {Object[]} rules Content blocker rules generated from the added filters
426 */ 824 */
427 ContentBlockerList.prototype.generateRules = function(filter) 825 ContentBlockerList.prototype.generateRules = function({
826 merge = false,
827 fastMerge = true,
828 advancedMerge,
829 exhaustiveMerge
830 } = {})
428 { 831 {
429 let rules = []; 832 let rules = [];
430 833
431 let groupedElemhideFilters = new Map(); 834 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 835 for (let filter of this.elemhideFilters)
433 { 836 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 837 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 838 if (!result)
436 continue; 839 continue;
437 840
(...skipping 27 matching lines...) Expand all
465 } 868 }
466 }); 869 });
467 870
468 for (let filter of this.elemhideExceptions) 871 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 872 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 873 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 874 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 875 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 876 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 877
475 return rules.filter(rule => !hasNonASCI(rule)); 878 rules = rules.filter(rule => !hasNonASCI(rule));
879
880 if (merge)
881 {
882 // If the more specific options are specified (e.g. "advanced" and
883 // "exhaustive"), they override the more general options (e.g. "fast").
884 let mergeOptions = {
885 advanced: advancedMerge || (!fastMerge && advancedMerge != false),
886 exhaustive: exhaustiveMerge || (!fastMerge && exhaustiveMerge != false)
887 };
888
889 rules = mergeCloselyMatchingRules(rules, mergeOptions);
890 }
891
892 return rules;
476 }; 893 };
OLDNEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld