Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Merge rules by resource-type and if-domain Created May 6, 2017, 5:17 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 348 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 359 {
360 newSelector.push(selector.substring(i, pos.start)); 360 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 362 i = pos.end;
363 } 363 }
364 newSelector.push(selector.substring(i)); 364 newSelector.push(selector.substring(i));
365 365
366 return newSelector.join(""); 366 return newSelector.join("");
367 } 367 }
368 368
369 function closeMatch(s, t)
370 {
371 // This function returns an edit operation, one of "substitute", "delete",
372 // and "insert", along with an index in the source string where the edit must
373 // occur in order to arrive at the target string. If the strings are not a
374 // close match, it returns null.
375 //
376 // Two strings are considered to be a close match if they are one edit
377 // operation apart.
378 //
379 // Deletions or insertions of a contiguous range of characters from one
380 // string into the other, at the same index, are treated as a single edit.
381 // For example, "internal" and "international" are considered to be one edit
382 // apart and therefore a close match.
383
384 // A few things to note:
385 //
386 // 1) This function does not care about the format of the input strings.
387 // For example, the caller may pass in regular expressions, where "[ab]"
388 // and "[bc]" could be considered to be a close match, since the order
389 // within the brackets doesn't matter. This function will still return null
390 // for this set of inputs since they are two edits apart.
391 //
392 // 2) To be friendly to calling code that might be passing in regular
393 // expressions, this function will simply return null if it encounters a
394 // special character (e.g. "\", "?", "+", etc.) in the delta. For example,
395 // given "Hello" and "Hello, how are you?", it will return null.
396 //
397 // 3) If the caller does indeed pass in regular expressions, it must make
398 // the important assumption that the parts where two such regular
399 // expressions may differ can always be treated as normal strings. For
400 // example, "^https?://.*/ads" and "^https?://.*/adv" differ only in the
401 // last character, therefore the regular expressions can safely be merged
402 // into "^https?://.*/ad[sv]".
403
404 let diff = s.length - t.length;
405
406 // If target is longer than source, swap them for the purpose of our
407 // calculation.
408 if (diff < 0)
409 {
410 let tmp = s;
411 s = t;
412 t = tmp;
413 }
414
415 let edit = null;
416
417 let i = 0, j = 0;
418
419 for (; i < s.length; i++)
420 {
421 if (s[i] != t[i])
422 break;
423 }
424
425 for (; j < t.length; j++)
426 {
427 if (t.length - j == i || s[s.length - j - 1] != t[t.length - j - 1])
428 break;
429 }
430
431 if (diff == 0)
432 {
433 if (t.length - j - i != 1)
434 return null;
435 }
436 else if (i != t.length - j)
437 {
438 return null;
439 }
440
441 for (let k = i; k < s.length - j; k++)
442 {
443 // If there are any special characters in the delta, bail.
444 if (s[k] == "." || s[k] == "+" || s[k] == "$" || s[k] == "?" ||
445 s[k] == "{" || s[k] == "}" || s[k] == "(" || s[k] == ")" ||
446 s[k] == "[" || s[k] == "]" || s[k] == "\\")
447 return null;
448 }
449
450 if (diff == 0)
451 {
452 edit = {type: "substitute", index: i};
453 }
454 else if (diff > 0)
455 {
456 edit = {type: "delete", index: i};
457
458 if (diff > 1)
459 edit.endIndex = s.length - j;
460 }
461 else
462 {
463 edit = {type: "insert", index: i};
464
465 if (diff < -1)
466 edit.endIndex = s.length - j;
467 }
468
469 return edit;
470 }
471
472 function mergeRulesByURLFilter(rulesInfo, exhaustive)
473 {
474 // Closely matching rules are likely to be within a certain range. We only
475 // look for matches within this range. If we increase this value, it can give
476 // us more matches and a smaller resulting rule set, but possibly at a
477 // significant performance cost.
478 const heuristicRange = 100;
479
480 for (let i = 0; i < rulesInfo.length; i++)
481 {
482 let limit = exhaustive ? rulesInfo.length :
483 Math.min(i + heuristicRange, rulesInfo.length);
484
485 for (let j = i + 1; j < limit; j++)
486 {
487 let source = rulesInfo[i].rule.trigger["url-filter"];
488 let target = rulesInfo[j].rule.trigger["url-filter"];
489
490 let edit = closeMatch(source, target);
491
492 if (edit)
493 {
494 let urlFilter, ruleInfo, match = {edit};
495
496 if (edit.type == "insert")
497 {
498 // Convert the insertion into a deletion and stick it on the target
499 // rule instead. We can only group deletions and substitutions;
500 // therefore insertions must be treated as deletions on the target
501 // rule.
502 urlFilter = target;
503 ruleInfo = rulesInfo[j];
504 match.index = i;
505 edit.type = "delete";
506 }
507 else
508 {
509 urlFilter = source;
510 ruleInfo = rulesInfo[i];
511 match.index = j;
512 }
513
514 // If the edit has an end index, it represents a multiple character
515 // edit.
516 let multiEdit = !!edit.endIndex;
517
518 if (multiEdit)
519 {
520 // We only care about a single multiple character edit because the
521 // number of characters for such a match doesn't matter, we can
522 // only merge with one other rule.
523 if (!ruleInfo.multiEditMatch)
524 ruleInfo.multiEditMatch = match;
525 }
526 else
527 {
528 // For single character edits, multiple rules can be merged into
529 // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
530 if (!ruleInfo.matches)
531 ruleInfo.matches = new Array(urlFilter.length + 1);
532
533 // Matches at a particular index. For example, for a source string
534 // "ads", both target strings "ad" (deletion) and "adv"
535 // (substitution) match at index 2, hence they are grouped together
536 // to possibly be merged later into "ad[sv]?".
537 let matchesForIndex = ruleInfo.matches[edit.index];
538
539 if (matchesForIndex)
540 {
541 matchesForIndex.push(match);
542 }
543 else
544 {
545 matchesForIndex = [match];
546 ruleInfo.matches[edit.index] = matchesForIndex;
547 }
548
549 // Keep track of the best set of matches. We later sort by this to
550 // get best results.
551 if (!ruleInfo.bestMatches ||
552 matchesForIndex.length > ruleInfo.bestMatches.length)
553 ruleInfo.bestMatches = matchesForIndex;
554 }
555 }
556 }
557 }
558
559 // Filter out rules that have no matches at all.
560 let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
561 {
562 return ruleInfo.bestMatches || ruleInfo.multiEditMatch
563 });
564
565 // For best results, we have to sort the candidates by the largest set of
566 // matches.
567 //
568 // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
569 // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
570 // "[ab]dx" (3 rules).
571 candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
572 {
573 let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
574 ruleInfo1.multiEditMatch ? 1 : 0;
575 let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
576 ruleInfo2.multiEditMatch ? 1 : 0;
577
578 return weight2 - weight1;
579 });
580
581 for (let ruleInfo of candidateRulesInfo)
582 {
583 let rule = ruleInfo.rule;
584
585 // If this rule has already been merged into another rule, we skip it.
586 if (ruleInfo.merged)
587 continue;
588
589 // Find the best set of rules to group, which is simply the largest set.
590 let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
591 {
592 matchesForIndex = (matchesForIndex || []).filter(match =>
593 {
594 // Filter out rules that have either already been merged into other
595 // rules or have had other rules merged into them.
596 return !rulesInfo[match.index].merged &&
597 !rulesInfo[match.index].mergedInto;
598 });
599
600 return matchesForIndex.length > best.length ? matchesForIndex : best;
601 },
602 []);
603
604 let multiEdit = false;
605
606 // If we couldn't find a single rule to merge with, let's see if we have a
607 // multiple character edit. e.g. we could merge "ad" and "adserver" into
608 // "ad(server)?".
609 if (best.length == 0 && ruleInfo.multiEditMatch &&
610 !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
611 !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
612 {
613 best = [ruleInfo.multiEditMatch];
614 multiEdit = true;
615 }
616
617 if (best.length > 0)
618 {
619 let urlFilter = rule.trigger["url-filter"];
620
621 let editIndex = best[0].edit.index;
622
623 if (!multiEdit)
624 {
625 // Merge all the matching rules into this one.
626
627 let characters = [];
628 let quantifier = "";
629
630 for (let match of best)
631 {
632 if (match.edit.type == "delete")
633 {
634 quantifier = "?";
635 }
636 else
637 {
638 let character = rulesInfo[match.index].rule
639 .trigger["url-filter"][editIndex];
640 characters.push(character);
641 }
642
643 // Mark the target rule as merged so other rules don't try to merge
644 // it again.
645 rulesInfo[match.index].merged = true;
646 }
647
648 urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
649 urlFilter.substring(editIndex + 1);
650 if (characters.length > 0)
651 {
652 urlFilter = urlFilter.substring(0, editIndex) + "[" +
653 urlFilter[editIndex] + characters.join("") + "]" +
654 urlFilter.substring(editIndex + 1);
655 }
656 }
657 else
658 {
659 let editEndIndex = best[0].edit.endIndex;
660
661 // Mark the target rule as merged so other rules don't try to merge it
662 // again.
663 rulesInfo[best[0].index].merged = true;
664
665 urlFilter = urlFilter.substring(0, editIndex) + "(" +
666 urlFilter.substring(editIndex, editEndIndex) + ")?" +
667 urlFilter.substring(editEndIndex);
668 }
669
670 rule.trigger["url-filter"] = urlFilter;
671
672 // Mark this rule as one that has had other rules merged into it.
673 ruleInfo.mergedInto = true;
674 }
675 }
676 }
677
678 function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
679 {
680 let set = new Set();
681
682 rulesInfo.forEach((ruleInfo, index) =>
683 {
684 if (ruleInfo.rule[propertyType][property])
685 {
686 for (let value of ruleInfo.rule[propertyType][property])
687 set.add(value);
688 }
689
690 if (index > 0)
691 ruleInfo.merged = true;
692 });
693
694 if (set.size > 0)
695 rulesInfo[0].rule[propertyType][property] = Array.from(set).sort();
696
697 rulesInfo[0].mergedInto = true;
698 }
699
700 function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
701 {
702 let mergeableRulesInfoByGroup = new Map();
703
704 rulesInfo.forEach(ruleInfo =>
705 {
706 let copy = {
707 trigger: Object.assign({}, ruleInfo.rule.trigger),
708 action: Object.assign({}, ruleInfo.rule.action)
709 };
710
711 delete copy[propertyType][property];
712
713 let groupKey = JSON.stringify(copy);
714
715 let mergeableRulesInfo = mergeableRulesInfoByGroup.get(groupKey);
716
717 if (mergeableRulesInfo)
718 mergeableRulesInfo.push(ruleInfo);
719 else
720 mergeableRulesInfoByGroup.set(groupKey, [ruleInfo]);
721 });
722
723 return mergeableRulesInfoByGroup;
724 }
725
726 function mergeRules(rules, options)
727 {
728 const defaultOptions = {exhaustive: false};
729
730 options = Object.assign({}, defaultOptions, options);
731
732 let rulesInfo = rules.map(rule => ({rule}));
733
734 groupRulesByMergeableProperty(rulesInfo, "trigger", "url-filter")
735 .forEach(mergeableRulesInfo =>
736 {
737 if (mergeableRulesInfo.length > 1)
738 mergeRulesByURLFilter(mergeableRulesInfo, options.exhaustive);
739 });
740
741 // Filter out rules that have been merged into other rules.
742 rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
743
744 for (let arrayProperty of["resource-type", "if-domain"])
745 {
746 groupRulesByMergeableProperty(rulesInfo, "trigger", arrayProperty)
747 .forEach(mergeableRulesInfo =>
748 {
749 if (mergeableRulesInfo.length > 1)
750 mergeRulesByArrayProperty(mergeableRulesInfo, "trigger", arrayProperty);
751 });
752
753 rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
754 }
755
756 return rulesInfo.map(ruleInfo => ruleInfo.rule);
757 }
758
369 let ContentBlockerList = 759 let ContentBlockerList =
370 /** 760 /**
371 * Create a new Adblock Plus filter to content blocker list converter 761 * Create a new Adblock Plus filter to content blocker list converter
372 * 762 *
373 * @constructor 763 * @constructor
374 */ 764 */
375 exports.ContentBlockerList = function () 765 exports.ContentBlockerList = function ()
376 { 766 {
377 this.requestFilters = []; 767 this.requestFilters = [];
378 this.requestExceptions = []; 768 this.requestExceptions = [];
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 807
418 parseDomains(filter.domains, domains, []); 808 parseDomains(filter.domains, domains, []);
419 } 809 }
420 }; 810 };
421 811
422 /** 812 /**
423 * Generate content blocker list for all filters that were added 813 * Generate content blocker list for all filters that were added
424 * 814 *
425 * @returns {Filter} filter Filter to convert 815 * @returns {Filter} filter Filter to convert
426 */ 816 */
427 ContentBlockerList.prototype.generateRules = function(filter) 817 ContentBlockerList.prototype.generateRules = function(options)
428 { 818 {
819 const defaultOptions = {
820 merge: false,
821 exhaustiveMerge: false
822 };
823
824 options = Object.assign({}, defaultOptions, options);
825
429 let rules = []; 826 let rules = [];
430 827
431 let groupedElemhideFilters = new Map(); 828 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 829 for (let filter of this.elemhideFilters)
433 { 830 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 831 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 832 if (!result)
436 continue; 833 continue;
437 834
438 if (result.matchDomains.length == 0) 835 if (result.matchDomains.length == 0)
(...skipping 26 matching lines...) Expand all
465 } 862 }
466 }); 863 });
467 864
468 for (let filter of this.elemhideExceptions) 865 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 866 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 867 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 868 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 869 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 870 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 871
475 return rules.filter(rule => !hasNonASCI(rule)); 872 rules = rules.filter(rule => !hasNonASCI(rule));
873
874 if (options.merge)
875 {
876 let mergeOptions = {
877 exhaustive: options.exhaustiveMerge
878 };
879
880 rules = mergeRules(rules, mergeOptions);
881 }
882
883 return rules;
476 }; 884 };
OLDNEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld