Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Simplify and optimize Created May 5, 2017, 9:39 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 348 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 359 {
360 newSelector.push(selector.substring(i, pos.start)); 360 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 362 i = pos.end;
363 } 363 }
364 newSelector.push(selector.substring(i)); 364 newSelector.push(selector.substring(i));
365 365
366 return newSelector.join(""); 366 return newSelector.join("");
367 } 367 }
368 368
369 function closeMatch(s, t, singleCharacterOnly)
370 {
371 // This function returns an edit operation, one of "substitute", "delete",
372 // and "insert", along with an index in the source string where the edit must
373 // occur in order to arrive at the target string. If the strings are not a
374 // close match, it returns null.
375
376 // If singleCharacterOnly is false, deletions or insertions of a contiguous
377 // range of characters from one string into the other, at the same index, are
378 // treated as a single edit. For example, "internal" and "international" are
379 // considered to be one edit apart, inserting the substring "tiona" from the
380 // latter into the former.
381
382 // A few things to note:
383 //
384 // 1) This function does not care about how the input strings are treated
385 // by the caller. It only treats them as raw strings. For example, the
386 // caller may treat them as regular expressions, where "[ab]" and "[bc]"
387 // could be considered to have an edit distance of 1, since the order
388 // within the brackets does not matter. This function will still return
389 // null for this set of inputs since they are two edits apart.
390 //
391 // 2) To be friendly to calling code that might be passing in regular
392 // expressions anyway, this function will simply return null if it
393 // encounters a special character (e.g. "\", "?", "+", "*", etc.) in the
394 // delta. For example, given "Hello" and "Hello, how are you?", it will
395 // return null instead of "{type: 'insert', index: 5, endIndex: 19}".
396 //
397 // 3) The calling code within this file does indeed pass in regular
398 // expressions (the strict subset of JavaScript regular expressions
399 // supported by WebKit for content blockers), making the important
400 // assumption that the parts where two such regular expressions may
401 // differ can always be treated as normal strings.
402 //
403 // For example, "^https?://.*/ads" and "^https?://.*/adv" differ only in
404 // the last character, therefore the regular expressions can safely be
405 // merged into "^https?://.*/ad[sv]". If, for example, the characters in
406 // the delta were to appear within square brackets originally in the
407 // input strings (e.g. "^https?://.*/ad[sx]" and "^https?://.*/ad[vx]"),
408 // the calling code would have to do extra work to merge the two regular
409 // expressions correctly. The calling code within this file assumes that
410 // this is never the case.
411
412 let diff = s.length - t.length;
413
414 // If the string lengths differ by more than one character, we cannot arrive
415 // at target from source in a single edit operation.
416 if (singleCharacterOnly && (diff < -1 || diff > 1))
417 return null;
418
419 // If target is longer than source, swap them for the purpose of our
420 // calculation.
421 if (diff < 0)
422 {
423 let tmp = s;
424 s = t;
425 t = tmp;
426 }
427
428 let edit = null;
429
430 // If the string lengths differ by only one character at most, use the simple
431 // algorithm to find a single character edit.
432 if (diff == 0 || diff == 1 || diff == -1)
433 {
434 for (let i = 0, j = 0; i < s.length; i++)
435 {
436 if (s[i] == t[j])
437 {
438 j++;
439 }
440 else if (edit)
441 {
442 // Since we want one and only one edit operation, we must bail here.
443 return null;
444 }
445 else if ((s[i] == "." || s[i] == "+" || s[i] == "$" || s[i] == "?" ||
446 s[i] == "{" || s[i] == "}" || s[i] == "(" || s[i] == ")" ||
447 s[i] == "[" || s[i] == "]" || s[i] == "\\") ||
448 (t[j] == "." || t[j] == "+" || t[j] == "$" || t[j] == "?" ||
449 t[j] == "{" || t[j] == "}" || t[j] == "(" || t[j] == ")" ||
450 t[j] == "[" || t[j] == "]" || t[j] == "\\"))
451 {
452 // We don't deal with special characters for now.
453 return null;
454 }
455 else if (diff == 0)
456 {
457 // If both strings are equal in length, this is a substitution.
458 edit = {type: "substitute", index: i};
459 j++;
460 }
461 else if (diff > 0)
462 {
463 // If the source string is longer, this is a deletion.
464 edit = {type: "delete", index: i};
465 }
466 else
467 {
468 edit = {type: "insert", index: i};
469 }
470 }
471 }
472 else if (!singleCharacterOnly)
473 {
474 // Try another algorithm to find a multiple character deletion or
475 // insertion.
476
477 let i = 0, j = 0;
478
479 for (; i < s.length; i++)
480 {
481 if (s[i] != t[i])
482 break;
483 }
484
485 for (; j < t.length; j++)
486 {
487 if (t.length - j == i ||
488 s[s.length - j - 1] != t[t.length - j - 1])
489 break;
490 }
491
492 if (i != t.length - j)
493 return null;
494
495 for (let k = i; k < s.length - j; k++)
496 {
497 // If there are any special characters in the delta, bail.
498 if (s[k] == "." || s[k] == "+" || s[k] == "$" || s[k] == "?" ||
499 s[k] == "{" || s[k] == "}" || s[k] == "(" || s[k] == ")" ||
500 s[k] == "[" || s[k] == "]" || s[k] == "\\")
501 return null;
502 }
503
504 if (diff > 0)
505 {
506 edit = {type: "delete", index: i, endIndex: s.length - j};
507 }
508 else
509 {
510 edit = {type: "insert", index: i, endIndex: s.length - j};
511 }
512 }
513
514 return edit;
515 }
516
517 function performRuleMerging(rulesInfo, advanced, exhaustive)
518 {
519 // Closely matching rules are likely to be within a certain range. We only
520 // look for matches within this range. If we increase this value, it can give
521 // us more matches and a smaller resulting rule set, but possibly at a
522 // significant performance cost.
523 const heuristicRange = 100;
524
525 for (let i = 0; i < rulesInfo.length; i++)
526 {
527 let limit = exhaustive ? rulesInfo.length :
528 Math.min(i + heuristicRange, rulesInfo.length);
529
530 for (let j = i + 1; j < limit; j++)
531 {
532 let source = rulesInfo[i].rule.trigger["url-filter"];
533 let target = rulesInfo[j].rule.trigger["url-filter"];
534
535 let edit = closeMatch(source, target, !advanced);
536
537 if (edit)
538 {
539 let urlFilter, ruleInfo, match = {edit};
540
541 if (edit.type == "insert")
542 {
543 // Convert the insertion into a deletion and stick it on the target
544 // rule instead. We can only group deletions and substitutions;
545 // therefore insertions must be treated as deletions on the target
546 // rule.
547 urlFilter = target;
548 ruleInfo = rulesInfo[j];
549 match.index = i;
550 edit.type = "delete";
551 }
552 else
553 {
554 urlFilter = source;
555 ruleInfo = rulesInfo[i];
556 match.index = j;
557 }
558
559 // If the edit has an end index, it represents a multiple character
560 // edit.
561 let multiEdit = !!edit.endIndex;
562
563 if (multiEdit)
564 {
565 // We only care about a single multiple character edit because the
566 // number of characters for such a match doesn't matter, we can
567 // only merge with one other rule.
568 if (!ruleInfo.multiEditMatch)
569 ruleInfo.multiEditMatch = match;
570 }
571 else
572 {
573 // For single character edits, multiple rules can be merged into
574 // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
575 if (!ruleInfo.matches)
576 ruleInfo.matches = new Array(urlFilter.length + 1);
577
578 // Matches at a particular index. For example, for a source string
579 // "ads", both target strings "ad" (deletion) and "adv"
580 // (substitution) match at index 2, hence they are grouped together
581 // to possibly be merged later into "ad[sv]?".
582 let matchesForIndex = ruleInfo.matches[edit.index];
583
584 if (matchesForIndex)
585 {
586 matchesForIndex.push(match);
587 }
588 else
589 {
590 matchesForIndex = [match];
591 ruleInfo.matches[edit.index] = matchesForIndex;
592 }
593
594 // Keep track of the best set of matches. We later sort by this to
595 // get best results.
596 if (!ruleInfo.bestMatches ||
597 matchesForIndex.length > ruleInfo.bestMatches.length)
598 ruleInfo.bestMatches = matchesForIndex;
599 }
600 }
601 }
602 }
603
604 // Filter out rules that have no matches at all.
605 let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
606 {
607 return ruleInfo.bestMatches || ruleInfo.multiEditMatch
608 });
609
610 // For best results, we have to sort the candidates by the largest set of
611 // matches.
612 //
613 // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
614 // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
615 // "[ab]dx" (3 rules).
616 candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
617 {
618 let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
619 ruleInfo1.multiEditMatch ? 1 : 0;
620 let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
621 ruleInfo2.multiEditMatch ? 1 : 0;
622
623 return weight2 - weight1;
624 });
625
626 for (let ruleInfo of candidateRulesInfo)
627 {
628 let rule = ruleInfo.rule;
629
630 // If this rule has already been merged into another rule, we skip it.
631 if (ruleInfo.merged)
632 continue;
633
634 // Find the best set of rules to group, which is simply the largest set.
635 let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
636 {
637 matchesForIndex = (matchesForIndex || []).filter(match =>
638 {
639 // Filter out rules that have either already been merged into other
640 // rules or have had other rules merged into them.
641 return !rulesInfo[match.index].merged &&
642 !rulesInfo[match.index].mergedInto;
643 });
644
645 return matchesForIndex.length > best.length ? matchesForIndex : best;
646 },
647 []);
648
649 let multiEdit = false;
650
651 // If we couldn't find a single rule to merge with, let's see if we have a
652 // multiple character edit. e.g. we could merge "ad" and "adserver" into
653 // "ad(server)?".
654 if (best.length == 0 && ruleInfo.multiEditMatch &&
655 !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
656 !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
657 {
658 best = [ruleInfo.multiEditMatch];
659 multiEdit = true;
660 }
661
662 if (best.length > 0)
663 {
664 let urlFilter = rule.trigger["url-filter"];
665
666 let editIndex = best[0].edit.index;
667
668 if (!multiEdit)
669 {
670 // Merge all the matching rules into this one.
671
672 let characters = [];
673 let quantifier = "";
674
675 for (let match of best)
676 {
677 if (match.edit.type == "delete")
678 {
679 quantifier = "?";
680 }
681 else
682 {
683 let character = rulesInfo[match.index].rule
684 .trigger["url-filter"][editIndex];
685 characters.push(character);
686 }
687
688 // Mark the target rule as merged so other rules don't try to merge
689 // it again.
690 rulesInfo[match.index].merged = true;
691 }
692
693 urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
694 urlFilter.substring(editIndex + 1);
695 if (characters.length > 0)
696 {
697 urlFilter = urlFilter.substring(0, editIndex) + "[" +
698 urlFilter[editIndex] + characters.join("") + "]" +
699 urlFilter.substring(editIndex + 1);
700 }
701 }
702 else
703 {
704 let editEndIndex = best[0].edit.endIndex;
705
706 // Mark the target rule as merged so other rules don't try to merge it
707 // again.
708 rulesInfo[best[0].index].merged = true;
709
710 urlFilter = urlFilter.substring(0, editIndex) + "(" +
711 urlFilter.substring(editIndex, editEndIndex) + ")?" +
712 urlFilter.substring(editEndIndex);
713 }
714
715 rule.trigger["url-filter"] = urlFilter;
716
717 // Mark this rule as one that has had other rules merged into it.
718 ruleInfo.mergedInto = true;
719 }
720 }
721 }
722
723 function mergeCloselyMatchingRules(rules, options)
724 {
725 const defaultOptions = {advanced: false, exhaustive: false};
726
727 options = Object.assign({}, defaultOptions, options);
728
729 let rulesInfo = new Array(rules.length);
730
731 let mergeableRulesInfoByGroup = new Map();
732
733 rules.forEach((rule, index) =>
734 {
735 rulesInfo[index] = {rule};
736
737 // Skip rules we do not want to merge, for whatever reason.
738 if (rule.action.type == "ignore-previous-rules")
739 return;
740
741 // Group rules based on all their properties except the URL filter.
742
743 let copy = {
744 trigger: Object.assign({}, rule.trigger),
745 action: Object.assign({}, rule.action)
746 };
747
748 delete copy.trigger["url-filter"];
749
750 let groupKey = JSON.stringify(copy);
751
752 let mergeableRulesInfo = mergeableRulesInfoByGroup.get(groupKey);
753
754 if (mergeableRulesInfo)
755 mergeableRulesInfo.push(rulesInfo[index]);
756 else
757 mergeableRulesInfoByGroup.set(groupKey, [rulesInfo[index]]);
758 });
759
760 mergeableRulesInfoByGroup.forEach(mergeableRulesInfo =>
761 {
762 if (mergeableRulesInfo.length > 1)
763 {
764 performRuleMerging(mergeableRulesInfo, options.advanced,
765 options.exhaustive);
766 }
767 });
768
769 // Filter out rules that have been merged into other rules.
770 return rulesInfo.filter(ruleInfo => !ruleInfo.merged)
771 .map(ruleInfo => ruleInfo.rule);
772 }
773
369 let ContentBlockerList = 774 let ContentBlockerList =
370 /** 775 /**
371 * Create a new Adblock Plus filter to content blocker list converter 776 * Create a new Adblock Plus filter to content blocker list converter
372 * 777 *
373 * @constructor 778 * @constructor
374 */ 779 */
375 exports.ContentBlockerList = function () 780 exports.ContentBlockerList = function ()
376 { 781 {
377 this.requestFilters = []; 782 this.requestFilters = [];
378 this.requestExceptions = []; 783 this.requestExceptions = [];
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 822
418 parseDomains(filter.domains, domains, []); 823 parseDomains(filter.domains, domains, []);
419 } 824 }
420 }; 825 };
421 826
422 /** 827 /**
423 * Generate content blocker list for all filters that were added 828 * Generate content blocker list for all filters that were added
424 * 829 *
425 * @returns {Filter} filter Filter to convert 830 * @returns {Filter} filter Filter to convert
426 */ 831 */
427 ContentBlockerList.prototype.generateRules = function(filter) 832 ContentBlockerList.prototype.generateRules = function(options)
428 { 833 {
834 const defaultOptions = {
835 merge: false,
836 fastMerge: true,
837 advancedMerge: null,
838 exhaustiveMerge: null
839 };
840
841 options = Object.assign({}, defaultOptions, options);
842
429 let rules = []; 843 let rules = [];
430 844
431 let groupedElemhideFilters = new Map(); 845 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 846 for (let filter of this.elemhideFilters)
433 { 847 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 848 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 849 if (!result)
436 continue; 850 continue;
437 851
438 if (result.matchDomains.length == 0) 852 if (result.matchDomains.length == 0)
(...skipping 26 matching lines...) Expand all
465 } 879 }
466 }); 880 });
467 881
468 for (let filter of this.elemhideExceptions) 882 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 883 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 884 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 885 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 886 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 887 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 888
475 return rules.filter(rule => !hasNonASCI(rule)); 889 rules = rules.filter(rule => !hasNonASCI(rule));
890
891 if (options.merge)
892 {
893 let mergeOptions = {
894 advanced: options.advancedMerge ||
895 (!options.fastMerge && options.advancedMerge != false),
896 exhaustive: options.exhaustiveMerge ||
897 (!options.fastMerge && options.exhaustiveMerge != false)
898 };
899
900 rules = mergeCloselyMatchingRules(rules, mergeOptions);
901 }
902
903 return rules;
476 }; 904 };
OLDNEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld