Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Filter out redundant rules before merging Created May 7, 2017, 10:38 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 348 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 359 {
360 newSelector.push(selector.substring(i, pos.start)); 360 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 362 i = pos.end;
363 } 363 }
364 newSelector.push(selector.substring(i)); 364 newSelector.push(selector.substring(i));
365 365
366 return newSelector.join(""); 366 return newSelector.join("");
367 } 367 }
368 368
369 function closeMatch(s, t)
370 {
371 // This function returns an edit operation, one of "substitute", "delete",
kzar 2017/05/08 08:13:03 Mind using the JSDoc syntax for the comment explai
Manish Jethani 2017/05/08 23:12:48 Done.
372 // and "insert", along with an index in the source string where the edit must
373 // occur in order to arrive at the target string. If the strings are not a
374 // close match, it returns null.
375 //
376 // Two strings are considered to be a close match if they are one edit
377 // operation apart.
378 //
379 // Deletions or insertions of a contiguous range of characters from one
380 // string into the other, at the same index, are treated as a single edit.
381 // For example, "internal" and "international" are considered to be one edit
382 // apart and therefore a close match.
383
384 // A few things to note:
385 //
386 // 1) This function does not care about the format of the input strings.
387 // For example, the caller may pass in regular expressions, where "[ab]"
388 // and "[bc]" could be considered to be a close match, since the order
389 // within the brackets doesn't matter. This function will still return null
390 // for this set of inputs since they are two edits apart.
391 //
392 // 2) To be friendly to calling code that might be passing in regular
393 // expressions, this function will simply return null if it encounters a
394 // special character (e.g. "\", "?", "+", etc.) in the delta. For example,
395 // given "Hello" and "Hello, how are you?", it will return null.
396 //
397 // 3) If the caller does indeed pass in regular expressions, it must make
398 // the important assumption that the parts where two such regular
399 // expressions may differ can always be treated as normal strings. For
400 // example, "^https?://.*/ads" and "^https?://.*/adv" differ only in the
401 // last character, therefore the regular expressions can safely be merged
402 // into "^https?://.*/ad[sv]".
403
404 let diff = s.length - t.length;
405
406 // If target is longer than source, swap them for the purpose of our
407 // calculation.
408 if (diff < 0)
409 {
410 let tmp = s;
411 s = t;
412 t = tmp;
413 }
414
415 let edit = null;
416
417 let i = 0, j = 0;
418
419 for (; i < s.length; i++)
420 {
421 if (s[i] != t[i])
422 break;
423 }
424
425 for (; j < t.length; j++)
426 {
427 if (t.length - j == i || s[s.length - j - 1] != t[t.length - j - 1])
kzar 2017/05/08 08:13:02 I find this part hard to grok, could you add a com
Manish Jethani 2017/05/08 23:12:48 Done.
428 break;
429 }
430
431 if (diff == 0)
432 {
433 if (t.length - j - i != 1)
434 return null;
435 }
436 else if (i != t.length - j)
437 {
438 return null;
439 }
440
441 for (let k = i; k < s.length - j; k++)
442 {
443 // If there are any special characters in the delta, bail.
kzar 2017/05/08 08:13:02 Nit: IMO this comment doesn't add much.
Manish Jethani 2017/05/08 23:12:48 I'm slightly rephrasing here, but since I've added
kzar 2017/05/09 10:05:46 Fair enough. Thanks for those extra comments, they
444 if (s[k] == "." || s[k] == "+" || s[k] == "$" || s[k] == "?" ||
445 s[k] == "{" || s[k] == "}" || s[k] == "(" || s[k] == ")" ||
446 s[k] == "[" || s[k] == "]" || s[k] == "\\")
447 return null;
448 }
449
450 if (diff == 0)
451 {
452 edit = {type: "substitute", index: i};
453 }
454 else if (diff > 0)
455 {
456 edit = {type: "delete", index: i};
457
458 if (diff > 1)
459 edit.endIndex = s.length - j;
460 }
461 else
462 {
463 edit = {type: "insert", index: i};
464
465 if (diff < -1)
466 edit.endIndex = s.length - j;
467 }
468
469 return edit;
470 }
471
472 function eliminateRedundantRulesByURLFilter(rulesInfo)
473 {
474 for (let i = 0; i < rulesInfo.length; i++)
475 {
476 // If this rule is already marked as redundant, don't bother comparing it
477 // with other rules.
478 if (rulesInfo[i].redundant)
479 continue;
480
481 for (let j = i + 1; j < rulesInfo.length; j++)
482 {
483 if (rulesInfo[j].redundant)
484 continue;
485
486 let source = rulesInfo[i].rule.trigger["url-filter"];
487 let target = rulesInfo[j].rule.trigger["url-filter"];
488
489 if (source.length >= target.length)
490 {
491 // If one URL filter is a substring of the other starting at the
492 // beginning, the other one is clearly redundant.
493 if (source.substring(0, target.length) == target)
494 {
495 rulesInfo[i].redundant = true;
496 break;
497 }
498 }
499 else if (target.substring(0, source.length) == source)
500 {
501 rulesInfo[j].redundant = true;
502 }
503 }
504 }
505
506 return rulesInfo.filter(ruleInfo => !ruleInfo.redundant);
507 }
508
509 function mergeRulesByURLFilter(rulesInfo, exhaustive)
510 {
511 // Closely matching rules are likely to be within a certain range. We only
512 // look for matches within this range. If we increase this value, it can give
513 // us more matches and a smaller resulting rule set, but possibly at a
514 // significant performance cost.
515 const heuristicRange = 10;
kzar 2017/05/08 08:13:03 Could you mention about the behaviour (or lack the
Manish Jethani 2017/05/08 23:12:48 Done.
516
517 if (exhaustive)
518 // Throw out obviously redundant rules.
kzar 2017/05/08 08:13:02 Nit: Please surround with braces since it spans mu
Manish Jethani 2017/05/08 23:12:48 Done.
519 rulesInfo = eliminateRedundantRulesByURLFilter(rulesInfo);
520
521 if (rulesInfo.length <= 1)
522 return;
523
524 for (let i = 0; i < rulesInfo.length; i++)
525 {
526 let limit = exhaustive ? rulesInfo.length :
527 Math.min(i + heuristicRange, rulesInfo.length);
528
529 for (let j = i + 1; j < limit; j++)
530 {
531 let source = rulesInfo[i].rule.trigger["url-filter"];
532 let target = rulesInfo[j].rule.trigger["url-filter"];
533
534 let edit = closeMatch(source, target);
535
536 if (edit)
537 {
538 let urlFilter, ruleInfo, match = {edit};
539
540 if (edit.type == "insert")
541 {
542 // Convert the insertion into a deletion and stick it on the target
543 // rule instead. We can only group deletions and substitutions;
544 // therefore insertions must be treated as deletions on the target
545 // rule.
546 urlFilter = target;
547 ruleInfo = rulesInfo[j];
548 match.index = i;
549 edit.type = "delete";
550 }
551 else
552 {
553 urlFilter = source;
554 ruleInfo = rulesInfo[i];
555 match.index = j;
556 }
557
558 // If the edit has an end index, it represents a multiple character
559 // edit.
560 let multiEdit = !!edit.endIndex;
561
562 if (multiEdit)
563 {
564 // We only care about a single multiple character edit because the
565 // number of characters for such a match doesn't matter, we can
566 // only merge with one other rule.
567 if (!ruleInfo.multiEditMatch)
568 ruleInfo.multiEditMatch = match;
569 }
570 else
571 {
572 // For single character edits, multiple rules can be merged into
573 // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
574 if (!ruleInfo.matches)
575 ruleInfo.matches = new Array(urlFilter.length);
576
577 // Matches at a particular index. For example, for a source string
578 // "ads", both target strings "ad" (deletion) and "adv"
579 // (substitution) match at index 2, hence they are grouped together
580 // to possibly be merged later into "ad[sv]?".
581 let matchesForIndex = ruleInfo.matches[edit.index];
582
583 if (matchesForIndex)
584 {
585 matchesForIndex.push(match);
586 }
587 else
588 {
589 matchesForIndex = [match];
590 ruleInfo.matches[edit.index] = matchesForIndex;
591 }
592
593 // Keep track of the best set of matches. We later sort by this to
594 // get best results.
595 if (!ruleInfo.bestMatches ||
596 matchesForIndex.length > ruleInfo.bestMatches.length)
597 ruleInfo.bestMatches = matchesForIndex;
598 }
599 }
600 }
601 }
602
603 // Filter out rules that have no matches at all.
604 let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
605 {
606 return ruleInfo.bestMatches || ruleInfo.multiEditMatch
607 });
608
609 // For best results, we have to sort the candidates by the largest set of
610 // matches.
611 //
612 // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
613 // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
614 // "[ab]dx" (3 rules).
615 candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
616 {
617 let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
618 ruleInfo1.multiEditMatch ? 1 : 0;
619 let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
620 ruleInfo2.multiEditMatch ? 1 : 0;
621
622 return weight2 - weight1;
623 });
624
625 for (let ruleInfo of candidateRulesInfo)
626 {
627 let rule = ruleInfo.rule;
628
629 // If this rule has already been merged into another rule, we skip it.
630 if (ruleInfo.merged)
631 continue;
632
633 // Find the best set of rules to group, which is simply the largest set.
634 let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
635 {
636 matchesForIndex = (matchesForIndex || []).filter(match =>
637 {
638 // Filter out rules that have either already been merged into other
639 // rules or have had other rules merged into them.
640 return !rulesInfo[match.index].merged &&
641 !rulesInfo[match.index].mergedInto;
642 });
643
644 return matchesForIndex.length > best.length ? matchesForIndex : best;
645 },
646 []);
647
648 let multiEdit = false;
649
650 // If we couldn't find a single rule to merge with, let's see if we have a
651 // multiple character edit. e.g. we could merge "ad" and "adserver" into
652 // "ad(server)?".
653 if (best.length == 0 && ruleInfo.multiEditMatch &&
654 !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
655 !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
656 {
657 best = [ruleInfo.multiEditMatch];
658 multiEdit = true;
659 }
660
661 if (best.length > 0)
662 {
663 let urlFilter = rule.trigger["url-filter"];
664
665 let editIndex = best[0].edit.index;
666
667 if (!multiEdit)
668 {
669 // Merge all the matching rules into this one.
670
671 let characters = [];
672 let quantifier = "";
673
674 for (let match of best)
675 {
676 if (match.edit.type == "delete")
677 {
678 quantifier = "?";
679 }
680 else
681 {
682 let character = rulesInfo[match.index].rule
683 .trigger["url-filter"][editIndex];
684 characters.push(character);
685 }
686
687 // Mark the target rule as merged so other rules don't try to merge
688 // it again.
689 rulesInfo[match.index].merged = true;
690 }
691
692 urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
693 urlFilter.substring(editIndex + 1);
694 if (characters.length > 0)
695 {
696 urlFilter = urlFilter.substring(0, editIndex) + "[" +
697 urlFilter[editIndex] + characters.join("") + "]" +
698 urlFilter.substring(editIndex + 1);
699 }
700 }
701 else
702 {
703 let editEndIndex = best[0].edit.endIndex;
704
705 // Mark the target rule as merged so other rules don't try to merge it
706 // again.
707 rulesInfo[best[0].index].merged = true;
708
709 urlFilter = urlFilter.substring(0, editIndex) + "(" +
710 urlFilter.substring(editIndex, editEndIndex) + ")?" +
711 urlFilter.substring(editEndIndex);
712 }
713
714 rule.trigger["url-filter"] = urlFilter;
715
716 // Mark this rule as one that has had other rules merged into it.
717 ruleInfo.mergedInto = true;
718 }
719 }
720 }
721
722 function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
723 {
724 if (rulesInfo.length <= 1)
725 return;
726
727 let set = new Set();
728
729 rulesInfo.forEach((ruleInfo, index) =>
730 {
731 if (ruleInfo.rule[propertyType][property])
732 {
733 for (let value of ruleInfo.rule[propertyType][property])
734 set.add(value);
735 }
736
737 if (index > 0)
738 ruleInfo.merged = true;
739 });
740
741 if (set.size > 0)
742 rulesInfo[0].rule[propertyType][property] = Array.from(set);
743
744 rulesInfo[0].mergedInto = true;
745 }
746
747 function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
748 {
749 let mergeableRulesInfoByGroup = new Map();
750
751 rulesInfo.forEach(ruleInfo =>
752 {
753 let copy = {
754 trigger: Object.assign({}, ruleInfo.rule.trigger),
755 action: Object.assign({}, ruleInfo.rule.action)
756 };
757
758 delete copy[propertyType][property];
759
760 let groupKey = JSON.stringify(copy);
761
762 let mergeableRulesInfo = mergeableRulesInfoByGroup.get(groupKey);
763
764 if (mergeableRulesInfo)
765 mergeableRulesInfo.push(ruleInfo);
766 else
767 mergeableRulesInfoByGroup.set(groupKey, [ruleInfo]);
768 });
769
770 return mergeableRulesInfoByGroup;
771 }
772
773 function mergeRules(rules, options)
774 {
775 const defaultOptions = {exhaustive: false};
776
777 options = Object.assign({}, defaultOptions, options);
778
779 let rulesInfo = rules.map(rule => ({rule}));
kzar 2017/05/08 08:13:02 What's the purpose of this line?
Manish Jethani 2017/05/08 14:03:59 The purpose of this line is to create one "ruleInf
kzar 2017/05/09 10:05:46 Argh, sorry that was an especially dumb question.
Manish Jethani 2017/05/09 15:52:46 Haha, no worries :)
780
781 groupRulesByMergeableProperty(rulesInfo, "trigger", "url-filter")
782 .forEach(mergeableRulesInfo =>
kzar 2017/05/08 08:13:03 Any reason you used forEach instead of for ... of
Manish Jethani 2017/05/08 14:04:00 It was for (let value of map.values()) { ... } at
kzar 2017/05/09 10:05:46 Well groupRulesByMergeableProperty creates the arr
Manish Jethani 2017/05/09 15:52:46 The intermediate array here would an array of the
783 {
784 if (mergeableRulesInfo.length > 1)
785 mergeRulesByURLFilter(mergeableRulesInfo, options.exhaustive);
786 });
787
788 // Filter out rules that are redundant or have been merged into other rules.
789 rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.redundant &&
790 !ruleInfo.merged);
791
792 for (let arrayProperty of ["resource-type", "if-domain"])
793 {
794 groupRulesByMergeableProperty(rulesInfo, "trigger", arrayProperty)
795 .forEach(mergeableRulesInfo =>
kzar 2017/05/08 08:13:02 This logic seems pretty much the same as above, I
Manish Jethani 2017/05/08 14:03:59 I suppose we could merge them into one, but then w
kzar 2017/05/09 10:05:46 Acknowledged.
796 {
797 if (mergeableRulesInfo.length > 1)
798 mergeRulesByArrayProperty(mergeableRulesInfo, "trigger", arrayProperty);
799 });
800
801 rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
kzar 2017/05/08 08:13:02 Maybe a dumb question, but instead of setting the
Manish Jethani 2017/05/08 14:03:58 I could try that, but then we'd have to adjust the
kzar 2017/05/09 10:05:46 Acknowledged.
802 }
803
804 return rulesInfo.map(ruleInfo => ruleInfo.rule);
805 }
806
369 let ContentBlockerList = 807 let ContentBlockerList =
370 /** 808 /**
371 * Create a new Adblock Plus filter to content blocker list converter 809 * Create a new Adblock Plus filter to content blocker list converter
372 * 810 *
373 * @constructor 811 * @constructor
374 */ 812 */
375 exports.ContentBlockerList = function () 813 exports.ContentBlockerList = function ()
376 { 814 {
377 this.requestFilters = []; 815 this.requestFilters = [];
378 this.requestExceptions = []; 816 this.requestExceptions = [];
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 855
418 parseDomains(filter.domains, domains, []); 856 parseDomains(filter.domains, domains, []);
419 } 857 }
420 }; 858 };
421 859
422 /** 860 /**
423 * Generate content blocker list for all filters that were added 861 * Generate content blocker list for all filters that were added
424 * 862 *
425 * @returns {Filter} filter Filter to convert 863 * @returns {Filter} filter Filter to convert
426 */ 864 */
427 ContentBlockerList.prototype.generateRules = function(filter) 865 ContentBlockerList.prototype.generateRules = function(options)
428 { 866 {
867 const defaultOptions = {
868 merge: false,
869 exhaustiveMerge: false
870 };
871
872 options = Object.assign({}, defaultOptions, options);
873
429 let rules = []; 874 let rules = [];
430 875
431 let groupedElemhideFilters = new Map(); 876 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 877 for (let filter of this.elemhideFilters)
433 { 878 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 879 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 880 if (!result)
436 continue; 881 continue;
437 882
438 if (result.matchDomains.length == 0) 883 if (result.matchDomains.length == 0)
(...skipping 26 matching lines...) Expand all
465 } 910 }
466 }); 911 });
467 912
468 for (let filter of this.elemhideExceptions) 913 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 914 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 915 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 916 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 917 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 918 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 919
475 return rules.filter(rule => !hasNonASCI(rule)); 920 rules = rules.filter(rule => !hasNonASCI(rule));
921
922 if (options.merge)
923 {
924 let mergeOptions = {
kzar 2017/05/08 08:13:02 Couldn't you just pass options.exhaustiveMerge str
Manish Jethani 2017/05/08 23:12:48 Done.
925 exhaustive: options.exhaustiveMerge
926 };
927
928 rules = mergeRules(rules, mergeOptions);
929 }
930
931 return rules;
476 }; 932 };
OLDNEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld