Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Make it work in Safari Created May 5, 2017, 4:36 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 20 matching lines...) Expand all
31 | typeMap.FONT 31 | typeMap.FONT
32 | typeMap.MEDIA 32 | typeMap.MEDIA
33 | typeMap.POPUP 33 | typeMap.POPUP
34 | typeMap.OBJECT 34 | typeMap.OBJECT
35 | typeMap.OBJECT_SUBREQUEST 35 | typeMap.OBJECT_SUBREQUEST
36 | typeMap.XMLHTTPREQUEST 36 | typeMap.XMLHTTPREQUEST
37 | typeMap.PING 37 | typeMap.PING
38 | typeMap.SUBDOCUMENT 38 | typeMap.SUBDOCUMENT
39 | typeMap.OTHER); 39 | typeMap.OTHER);
40 40
41 function pearsonHash(message)
42 {
43 // This is an implementation of the Pearson hashing algorithm, where we
44 // generate a 32-bit digest for a given message.
45
46 // Note that this code will only look at the lowest 8 bits of each character.
47 // For best results, encode the input as UTF-8.
48
49 // A table of all numbers from 0 through 255 shuffled up.
50 const table = [
51 0xe5, 0x0a, 0x9f, 0x79, 0x99, 0xad, 0x10, 0x85, 0x5d, 0x55, 0x75, 0x2e,
52 0x04, 0x1a, 0xb5, 0x7d, 0x96, 0xe6, 0xa3, 0xc6, 0x82, 0x87, 0xb2, 0xef,
53 0x00, 0x64, 0x70, 0x4b, 0xe4, 0x2f, 0x37, 0x52, 0x90, 0x1d, 0x08, 0x68,
54 0x3a, 0x26, 0x74, 0xaa, 0xc1, 0x80, 0x17, 0xc2, 0x0f, 0xec, 0xc8, 0x1f,
55 0xe2, 0x3c, 0xe1, 0xa1, 0x8f, 0xfd, 0x9d, 0x0b, 0xd5, 0xcc, 0xd4, 0xd9,
56 0xf0, 0x3d, 0x5e, 0x57, 0xae, 0x12, 0x46, 0xb0, 0x63, 0x94, 0x61, 0x9a,
57 0xbb, 0x76, 0x0c, 0x3f, 0xc4, 0x59, 0xdc, 0x5c, 0xb4, 0xc7, 0x73, 0x39,
58 0x65, 0x2c, 0x2a, 0xc3, 0xed, 0x20, 0x54, 0xfe, 0xfb, 0xd0, 0xa6, 0x33,
59 0x4a, 0x9e, 0xe7, 0x49, 0xea, 0x58, 0xaf, 0x35, 0x30, 0x95, 0x2b, 0x56,
60 0x14, 0xff, 0xb1, 0xd6, 0x27, 0x6a, 0x88, 0x89, 0x43, 0x4c, 0xca, 0xb9,
61 0x21, 0x8a, 0x78, 0xf1, 0x18, 0x1e, 0xd1, 0xe0, 0x60, 0xf8, 0x3e, 0xdd,
62 0x25, 0x16, 0xde, 0xc0, 0x98, 0x28, 0x7a, 0x3b, 0x1b, 0x45, 0xa5, 0xb3,
63 0xe3, 0x84, 0xd3, 0xb8, 0xbd, 0x47, 0xe9, 0xfa, 0xc9, 0xb6, 0xbe, 0x6c,
64 0x9c, 0xac, 0xda, 0xfc, 0x41, 0x0d, 0x07, 0x91, 0x6b, 0x6f, 0x03, 0xcb,
65 0xbc, 0x8d, 0x06, 0x01, 0xd2, 0x8c, 0x19, 0x5a, 0x02, 0xba, 0x4f, 0x6e,
66 0x2d, 0xf4, 0xcf, 0x7e, 0x6d, 0x42, 0x93, 0x31, 0xbf, 0xdf, 0x5f, 0x67,
67 0x53, 0xf6, 0x38, 0x9b, 0xa7, 0xe8, 0xee, 0x5b, 0x0e, 0x22, 0xdb, 0x51,
68 0x8e, 0x69, 0x97, 0x32, 0x36, 0xce, 0x77, 0x4d, 0xa4, 0xf2, 0x23, 0xc5,
69 0x11, 0x05, 0xab, 0xf9, 0x13, 0xd8, 0x7b, 0xa8, 0x40, 0x66, 0xd7, 0x24,
70 0x86, 0xa0, 0xeb, 0xf3, 0x81, 0x4e, 0x50, 0xf7, 0xb7, 0x7f, 0x83, 0xa2,
71 0xa9, 0x09, 0xcd, 0x62, 0x7c, 0x92, 0x8b, 0x71, 0x44, 0xf5, 0x72, 0x1c,
72 0x29, 0x48, 0x34, 0x15
73 ];
74
75 let digest = 0;
76
77 for (let i = 0; i < 4; i++)
78 {
79 let d = table[(message.charCodeAt(0) + i) % 256];
80 for (let j = 1; j < message.length; j++)
81 d = table[d ^ message.charCodeAt(j)];
82 digest |= (d & 0xff) << i * 8;
83 }
84
85 return digest;
86 }
87
41 function parseDomains(domains, included, excluded) 88 function parseDomains(domains, included, excluded)
42 { 89 {
43 for (let domain in domains) 90 for (let domain in domains)
44 { 91 {
45 if (domain != "") 92 if (domain != "")
46 { 93 {
47 let enabled = domains[domain]; 94 let enabled = domains[domain];
48 domain = punycode.toASCII(domain.toLowerCase()); 95 domain = punycode.toASCII(domain.toLowerCase());
49 96
50 if (!enabled) 97 if (!enabled)
(...skipping 308 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 406 {
360 newSelector.push(selector.substring(i, pos.start)); 407 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 408 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 409 i = pos.end;
363 } 410 }
364 newSelector.push(selector.substring(i)); 411 newSelector.push(selector.substring(i));
365 412
366 return newSelector.join(""); 413 return newSelector.join("");
367 } 414 }
368 415
416 function closeMatch(s, t, singleCharacterOnly)
417 {
418 // This function returns an edit operation, one of "substitute", "delete",
419 // and "insert", along with an index in the source string where the edit must
420 // occur in order to arrive at the target string. If the strings are not a
421 // close match, it returns null.
422
423 // If singleCharacterOnly is false, deletions or insertions of a contiguous
424 // range of characters from one string into the other, at the same index, are
425 // treated as a single edit. For example, "internal" and "international" are
426 // considered to be one edit apart, inserting the substring "tiona" from the
427 // latter into the former.
428
429 // A few things to note:
430 //
431 // 1) This function does not care about how the input strings are treated
432 // by the caller. It only treats them as raw strings. For example, the
433 // caller may treat them as regular expressions, where "[ab]" and "[bc]"
434 // could be considered to have an edit distance of 1, since the order
435 // within the brackets does not matter. This function will still return
436 // null for this set of inputs since they are two edits apart.
437 //
438 // 2) To be friendly to calling code that might be passing in regular
439 // expressions anyway, this function will simply return null if it
440 // encounters a special character (e.g. "\", "?", "+", "*", etc.) in the
441 // delta. For example, given "Hello" and "Hello, how are you?", it will
442 // return null instead of "{type: 'insert', index: 5, endIndex: 19}".
443 //
444 // 3) The calling code within this file does indeed pass in regular
445 // expressions (the strict subset of JavaScript regular expressions
446 // supported by WebKit for content blockers), making the important
447 // assumption that the parts where two such regular expressions may
448 // differ can always be treated as normal strings.
449 //
450 // For example, "^https?://.*/ads" and "^https?://.*/adv" differ only in
451 // the last character, therefore the regular expressions can safely be
452 // merged into "^https?://.*/ad[sv]". If, for example, the characters in
453 // the delta were to appear within square brackets originally in the
454 // input strings (e.g. "^https?://.*/ad[sx]" and "^https?://.*/ad[vx]"),
455 // the calling code would have to do extra work to merge the two regular
456 // expressions correctly. The calling code within this file assumes that
457 // this is never the case.
458
459 let diff = s.length - t.length;
460
461 // If the string lengths differ by more than one character, we cannot arrive
462 // at target from source in a single edit operation.
463 if (singleCharacterOnly && (diff < -1 || diff > 1))
464 return null;
465
466 // If target is longer than source, swap them for the purpose of our
467 // calculation.
468 if (diff < 0)
469 {
470 let tmp = s;
471 s = t;
472 t = tmp;
473 }
474
475 let edit = null;
476
477 // If the string lengths differ by only one character at most, use the simple
478 // algorithm to find a single character edit.
479 if (diff == 0 || diff == 1 || diff == -1)
480 {
481 for (let i = 0, j = 0; i < s.length; i++)
482 {
483 if (s[i] == t[j])
484 {
485 j++;
486 }
487 else if (edit)
488 {
489 // Since we want one and only one edit operation, we must bail here.
490 return null;
491 }
492 else if ((s[i] == "." || s[i] == "+" || s[i] == "$" || s[i] == "?" ||
493 s[i] == "{" || s[i] == "}" || s[i] == "(" || s[i] == ")" ||
494 s[i] == "[" || s[i] == "]" || s[i] == "\\") ||
495 (t[j] == "." || t[j] == "+" || t[j] == "$" || t[j] == "?" ||
496 t[j] == "{" || t[j] == "}" || t[j] == "(" || t[j] == ")" ||
497 t[j] == "[" || t[j] == "]" || t[j] == "\\"))
498 {
499 // We don't deal with special characters for now.
500 return null;
501 }
502 else if (diff == 0)
503 {
504 // If both strings are equal in length, this is a substitution.
505 edit = {type: "substitute", index: i};
506 j++;
507 }
508 else if (diff > 0)
509 {
510 // If the source string is longer, this is a deletion.
511 edit = {type: "delete", index: i};
512 }
513 else
514 {
515 edit = {type: "insert", index: i};
516 }
517 }
518 }
519 else if (!singleCharacterOnly)
520 {
521 // Try another algorithm to find a multiple character deletion or
522 // insertion.
523
524 let i = 0, j = 0;
525
526 for (; i < s.length; i++)
527 {
528 if (s[i] != t[i])
529 break;
530 }
531
532 for (; j < t.length; j++)
533 {
534 if (t.length - j == i ||
535 s[s.length - j - 1] != t[t.length - j - 1])
536 break;
537 }
538
539 if (i != t.length - j)
540 return null;
541
542 for (let k = i; k < s.length - j; k++)
543 {
544 // If there are any special characters in the delta, bail.
545 if (s[k] == "." || s[k] == "+" || s[k] == "$" || s[k] == "?" ||
546 s[k] == "{" || s[k] == "}" || s[k] == "(" || s[k] == ")" ||
547 s[k] == "[" || s[k] == "]" || s[k] == "\\")
548 return null;
549 }
550
551 if (diff > 0)
552 {
553 edit = {type: "delete", index: i, endIndex: s.length - j};
554 }
555 else
556 {
557 edit = {type: "insert", index: i, endIndex: s.length - j};
558 }
559 }
560
561 return edit;
562 }
563
564 function mergeCloselyMatchingRules(rules, options)
565 {
566 const defaultOptions = {advanced: false, exhaustive: false};
567
568 options = Object.assign({}, defaultOptions, options);
569
570 // Closely matching rules are likely to be within a certain range. We only
571 // look for matches within this range. If we increase this value, it can give
572 // us more matches and a smaller resulting rule set, but possibly at a
573 // significant performance cost.
574 const heuristicRange = 100;
575
576 let rulesInfo = new Array(rules.length);
577
578 rules.forEach((rule, index) =>
579 {
580 rulesInfo[index] = {rule};
581
582 if (rule.action.type == "ignore-previous-rules")
583 {
584 rulesInfo[index].skip = true;
585 }
586 else
587 {
588 // Save a hash of the rule but without the URL filter. We use this for
589 // comparison later.
590 let copy = {
591 trigger: Object.assign({}, rule.trigger),
592 action: Object.assign({}, rule.action)
593 };
594
595 delete copy.trigger["url-filter"];
596
597 let stringified = JSON.stringify(copy);
598
599 if (options.exhaustive)
600 {
601 // The Pearson hash function expects all characters to be within the
602 // 8-bit range.
603 stringified = encodeURIComponent(stringified);
604
605 rulesInfo[index].ruleHash = pearsonHash(stringified);
606 }
607 else
608 {
609 rulesInfo[index].ruleHash = stringified;
610 }
611 }
612 });
613
614 for (let i = 0; i < rules.length; i++)
615 {
616 if (rulesInfo[i].skip)
617 continue;
618
619 let limit = options.exhaustive ? rules.length :
620 Math.min(i + heuristicRange, rules.length);
621
622 for (let j = i + 1; j < limit; j++)
623 {
624 if (rulesInfo[j].skip)
625 continue;
626
627 // Check if the rules are identical except for the URL filter.
628 if (rulesInfo[i].ruleHash == rulesInfo[j].ruleHash)
629 {
630 let source = rules[i].trigger["url-filter"];
631 let target = rules[j].trigger["url-filter"];
632
633 let edit = closeMatch(source, target, !options.advanced);
634
635 if (edit)
636 {
637 let urlFilter, ruleInfo, match = {edit};
638
639 if (edit.type == "insert")
640 {
641 // Convert the insertion into a deletion and stick it on the target
642 // rule instead. We can only group deletions and substitutions;
643 // therefore insertions must be treated as deletions on the target
644 // rule.
645 urlFilter = target;
646 ruleInfo = rulesInfo[j];
647 match.index = i;
648 edit.type = "delete";
649 }
650 else
651 {
652 urlFilter = source;
653 ruleInfo = rulesInfo[i];
654 match.index = j;
655 }
656
657 // If the edit has an end index, it represents a multiple character
658 // edit.
659 let multiEdit = !!edit.endIndex;
660
661 if (multiEdit)
662 {
663 // We only care about a single multiple character edit because the
664 // number of characters for such a match doesn't matter, we can
665 // only merge with one other rule.
666 if (!ruleInfo.multiEditMatch)
667 ruleInfo.multiEditMatch = match;
668 }
669 else
670 {
671 // For single character edits, multiple rules can be merged into
672 // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
673 if (!ruleInfo.matches)
674 ruleInfo.matches = new Array(urlFilter.length + 1);
675
676 // Matches at a particular index. For example, for a source string
677 // "ads", both target strings "ad" (deletion) and "adv"
678 // (substitution) match at index 2, hence they are grouped together
679 // to possibly be merged later into "ad[sv]?".
680 let matchesForIndex = ruleInfo.matches[edit.index];
681
682 if (matchesForIndex)
683 {
684 matchesForIndex.push(match);
685 }
686 else
687 {
688 matchesForIndex = [match];
689 ruleInfo.matches[edit.index] = matchesForIndex;
690 }
691
692 // Keep track of the best set of matches. We later sort by this to
693 // get best results.
694 if (!ruleInfo.bestMatches ||
695 matchesForIndex.length > ruleInfo.bestMatches.length)
696 ruleInfo.bestMatches = matchesForIndex;
697 }
698 }
699 }
700 }
701 }
702
703 // Filter out rules that have no matches at all.
704 let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
705 {
706 return ruleInfo.bestMatches || ruleInfo.multiEditMatch
707 });
708
709 // For best results, we have to sort the candidates by the largest set of
710 // matches.
711 //
712 // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
713 // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
714 // "[ab]dx" (3 rules).
715 candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
716 {
717 let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
718 ruleInfo1.multiEditMatch ? 1 : 0;
719 let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
720 ruleInfo2.multiEditMatch ? 1 : 0;
721
722 return weight2 - weight1;
723 });
724
725 for (let ruleInfo of candidateRulesInfo)
726 {
727 let rule = ruleInfo.rule;
728
729 // If this rule has already been merged into another rule, we skip it.
730 if (ruleInfo.merged)
731 continue;
732
733 // Find the best set of rules to group, which is simply the largest set.
734 let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
735 {
736 matchesForIndex = (matchesForIndex || []).filter(match =>
737 {
738 // Filter out rules that have either already been merged into other
739 // rules or have had other rules merged into them.
740 return !rulesInfo[match.index].merged &&
741 !rulesInfo[match.index].mergedInto;
742 });
743
744 return matchesForIndex.length > best.length ? matchesForIndex : best;
745 },
746 []);
747
748 let multiEdit = false;
749
750 // If we couldn't find a single rule to merge with, let's see if we have a
751 // multiple character edit. e.g. we could merge "ad" and "adserver" into
752 // "ad(server)?".
753 if (best.length == 0 && ruleInfo.multiEditMatch &&
754 !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
755 !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
756 {
757 best = [ruleInfo.multiEditMatch];
758 multiEdit = true;
759 }
760
761 if (best.length > 0)
762 {
763 let urlFilter = rule.trigger["url-filter"];
764
765 let editIndex = best[0].edit.index;
766
767 if (!multiEdit)
768 {
769 // Merge all the matching rules into this one.
770
771 let characters = [];
772 let quantifier = "";
773
774 for (let match of best)
775 {
776 if (match.edit.type == "delete")
777 {
778 quantifier = "?";
779 }
780 else
781 {
782 let character = rules[match.index].trigger["url-filter"][editIndex];
783 characters.push(character);
784 }
785
786 // Mark the target rule as merged so other rules don't try to merge
787 // it again.
788 rulesInfo[match.index].merged = true;
789 }
790
791 urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
792 urlFilter.substring(editIndex + 1);
793 if (characters.length > 0)
794 {
795 urlFilter = urlFilter.substring(0, editIndex) + "[" +
796 urlFilter[editIndex] + characters.join("") + "]" +
797 urlFilter.substring(editIndex + 1);
798 }
799 }
800 else
801 {
802 let editEndIndex = best[0].edit.endIndex;
803
804 // Mark the target rule as merged so other rules don't try to merge it
805 // again.
806 rulesInfo[best[0].index].merged = true;
807
808 urlFilter = urlFilter.substring(0, editIndex) + "(" +
809 urlFilter.substring(editIndex, editEndIndex) + ")?" +
810 urlFilter.substring(editEndIndex);
811 }
812
813 rule.trigger["url-filter"] = urlFilter;
814
815 // Mark this rule as one that has had other rules merged into it.
816 ruleInfo.mergedInto = true;
817 }
818 }
819
820 // Filter out rules that have been merged into other rules.
821 return rulesInfo.filter(ruleInfo => !ruleInfo.merged)
822 .map(ruleInfo => ruleInfo.rule);
823 }
824
369 let ContentBlockerList = 825 let ContentBlockerList =
370 /** 826 /**
371 * Create a new Adblock Plus filter to content blocker list converter 827 * Create a new Adblock Plus filter to content blocker list converter
372 * 828 *
373 * @constructor 829 * @constructor
374 */ 830 */
375 exports.ContentBlockerList = function () 831 exports.ContentBlockerList = function ()
376 { 832 {
377 this.requestFilters = []; 833 this.requestFilters = [];
378 this.requestExceptions = []; 834 this.requestExceptions = [];
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 873
418 parseDomains(filter.domains, domains, []); 874 parseDomains(filter.domains, domains, []);
419 } 875 }
420 }; 876 };
421 877
422 /** 878 /**
423 * Generate content blocker list for all filters that were added 879 * Generate content blocker list for all filters that were added
424 * 880 *
425 * @returns {Filter} filter Filter to convert 881 * @returns {Filter} filter Filter to convert
426 */ 882 */
427 ContentBlockerList.prototype.generateRules = function(filter) 883 ContentBlockerList.prototype.generateRules = function(options)
428 { 884 {
885 const defaultOptions = {
886 merge: false,
887 fastMerge: true,
888 advancedMerge: null,
889 exhaustiveMerge: null
890 };
891
892 options = Object.assign({}, defaultOptions, options);
893
429 let rules = []; 894 let rules = [];
430 895
431 let groupedElemhideFilters = new Map(); 896 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 897 for (let filter of this.elemhideFilters)
433 { 898 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 899 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 900 if (!result)
436 continue; 901 continue;
437 902
438 if (result.matchDomains.length == 0) 903 if (result.matchDomains.length == 0)
(...skipping 26 matching lines...) Expand all
465 } 930 }
466 }); 931 });
467 932
468 for (let filter of this.elemhideExceptions) 933 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 934 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 935 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 936 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 937 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 938 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 939
475 return rules.filter(rule => !hasNonASCI(rule)); 940 rules = rules.filter(rule => !hasNonASCI(rule));
941
942 if (options.merge)
943 {
944 // If the more specific options are specified (e.g. "advanced" and
945 // "exhaustive"), they override the more general options (e.g. "fast").
946 let mergeOptions = {
947 advanced: options.advancedMerge ||
948 (!options.fastMerge && options.advancedMerge != false),
949 exhaustive: options.exhaustiveMerge ||
950 (!options.fastMerge && options.exhaustiveMerge != false)
951 };
952
953 rules = mergeCloselyMatchingRules(rules, mergeOptions);
954 }
955
956 return rules;
476 }; 957 };
OLDNEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld