Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Fix bugs and add unit tests Created May 3, 2017, 12:25 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 348 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 359 {
360 newSelector.push(selector.substring(i, pos.start)); 360 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 362 i = pos.end;
363 } 363 }
364 newSelector.push(selector.substring(i)); 364 newSelector.push(selector.substring(i));
365 365
366 return newSelector.join(""); 366 return newSelector.join("");
367 } 367 }
368 368
/**
 * Checks whether two strings are exactly one single-character edit apart and,
 * if so, describes that edit.
 *
 * @param {string} s The source string
 * @param {string} t The target string
 * @returns {?{type: string, index: number}} The edit needed to get from the
 *   source to the target, or null if the strings are identical, are more than
 *   one edit apart, or the edit touches a regular expression metacharacter.
 *   type is one of "substitute", "delete" and "insert". For "substitute" and
 *   "delete", index is the position of the affected character in the source
 *   string; for "insert", index is the position of the inserted character in
 *   the target string.
 */
function closeMatch(s, t)
{
  // Characters that carry a special meaning in a regular expression. An edit
  // is never reported if either of the two characters at the point where the
  // strings diverge is one of these: wrapping such a character in a character
  // class or making it optional would change what the expression matches
  // (e.g. "a.*c" and "a.bc" must not become "a.[*b]c", "abb*" and "ab*" must
  // not become "abb?*", "^ab" and "xab" must not become "[^x]ab").
  const specialCharacters = ".+$?{}()[]\\*^|";

  let isSpecial = c => typeof c == "string" &&
                       specialCharacters.indexOf(c) != -1;

  let diff = s.length - t.length;

  // If the string lengths differ by more than one character, we cannot arrive
  // at target from source in a single edit operation.
  if (diff < -1 || diff > 1)
    return null;

  // If target is longer than source, swap them for the purpose of our
  // calculation, so that s is always the longer string (or equal in length).
  if (diff == -1)
  {
    let tmp = s;
    s = t;
    t = tmp;
  }

  let edit = null;

  // Walk both strings in parallel; i indexes the (longer) string s and j the
  // other one. j may reach t.length, in which case t[j] is undefined and the
  // comparison below simply fails.
  for (let i = 0, j = 0; i < s.length; i++)
  {
    if (s[i] == t[j])
    {
      j++;
    }
    else if (edit)
    {
      // Since we want one and only one edit operation, we must bail here.
      return null;
    }
    else if (isSpecial(s[i]) || isSpecial(t[j]))
    {
      // We don't deal with special characters for now.
      return null;
    }
    else
    {
      switch (diff)
      {
        case 0:
          // If both strings are equal in length, this is a substitution.
          edit = {type: "substitute", index: i};
          j++;
          break;
        case 1:
          // If the source string is longer, this is a deletion. Only s
          // advances past the extra character.
          edit = {type: "delete", index: i};
          break;
        default:
          // The target string is longer (the strings were swapped above), so
          // this is an insertion; i is an index into the target string.
          edit = {type: "insert", index: i};
      }
    }
  }

  return edit;
}
435
/**
 * Produces a copy of a rule that lacks the "url-filter" trigger property.
 *
 * The trigger and action objects are copied one level deep, so the rule
 * passed in is left untouched.
 *
 * @param {Object} rule A rule with trigger and action properties
 * @returns {Object} A new object with copies of the rule's trigger (minus
 *   "url-filter") and action
 */
function ruleWithoutURLFilter(rule)
{
  let trigger = Object.assign({}, rule.trigger);
  let action = Object.assign({}, rule.action);

  // Remove the property from the copy only, never from the original trigger.
  delete trigger["url-filter"];

  return {trigger, action};
}
447
/**
 * Builds a regular expression character class matching any one of the given
 * literal characters.
 *
 * Duplicates are dropped. A hyphen is always emitted first and a caret never
 * first, so that neither is interpreted as a range operator or as a negation.
 *
 * @param {string[]} characters At least two distinct single characters, none
 *   of which is "]" or "\\"
 * @returns {string} The character class, including the square brackets
 */
function characterClassFor(characters)
{
  let unique = characters.filter(
    (character, index) => characters.indexOf(character) == index
  );

  let plain = unique.filter(character => character != "-" &&
                                         character != "^");

  return "[" + (unique.indexOf("-") != -1 ? "-" : "") + plain.join("") +
         (unique.indexOf("^") != -1 ? "^" : "") + "]";
}

/**
 * Merges rules that are identical except for a single-character difference in
 * their URL filters into one rule, using a character class (substitutions)
 * and/or the "?" quantifier (deletions).
 *
 * The URL filter of a rule that absorbs other rules is rewritten in place.
 * Rules with the "ignore-previous-rules" action and rules without a string
 * URL filter are never merged.
 *
 * @param {Object[]} rules The rules to merge
 * @returns {Object[]} A new array containing the rules that were not merged
 *   into other rules, in their original order
 */
function mergeCloselyMatchingRules(rules)
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range. If we increase this value, it can give
  // us more matches and a smaller resulting rule set, but possibly at a
  // significant performance cost.
  const heuristicRange = 100;

  // All bookkeeping lives in this parallel array rather than on the rule
  // objects themselves, so no private properties leak onto the caller's data.
  let rulesInfo = rules.map(rule =>
  {
    let ruleInfo = {rule};

    if (rule.action.type == "ignore-previous-rules" ||
        typeof rule.trigger["url-filter"] != "string")
    {
      ruleInfo.skip = true;
    }
    else
    {
      // Save a stringified version of the rule, but without the URL filter. We
      // use this for comparison later.
      ruleInfo.stringifiedWithoutURLFilter =
        JSON.stringify(ruleWithoutURLFilter(rule));
    }

    return ruleInfo;
  });

  for (let i = 0; i < rules.length; i++)
  {
    if (rulesInfo[i].skip)
      continue;

    for (let j = i + 1; j < i + heuristicRange && j < rules.length; j++)
    {
      if (rulesInfo[j].skip)
        continue;

      // Only rules that are identical except for the URL filter can be merged.
      if (rulesInfo[i].stringifiedWithoutURLFilter !=
          rulesInfo[j].stringifiedWithoutURLFilter)
        continue;

      let source = rules[i].trigger["url-filter"];
      let target = rules[j].trigger["url-filter"];

      // Find out if the two URL filters are exactly one edit apart.
      let edit = closeMatch(source, target);
      if (!edit)
        continue;

      let urlFilter = source;
      let ruleInfo = rulesInfo[i];
      let match = {edit, index: j};

      if (edit.type == "insert")
      {
        // Convert the insertion into a deletion and stick it on the target
        // rule instead. We can only group deletions and substitutions;
        // therefore insertions must be treated as deletions on the target
        // rule, to be dealt with later.
        urlFilter = target;
        ruleInfo = rulesInfo[j];
        match.index = i;
        edit.type = "delete";
      }

      // matches is indexed by the position of the edit within the URL filter;
      // each entry lists the rules that differ at that position.
      if (!ruleInfo.matches)
        ruleInfo.matches = new Array(urlFilter.length + 1);

      let matchesForIndex = ruleInfo.matches[edit.index];

      if (matchesForIndex)
      {
        matchesForIndex.push(match);
      }
      else
      {
        matchesForIndex = [match];
        ruleInfo.matches[edit.index] = matchesForIndex;
      }

      if (!ruleInfo.bestMatches ||
          matchesForIndex.length > ruleInfo.bestMatches.length)
        ruleInfo.bestMatches = matchesForIndex;
    }
  }

  let candidateRulesInfo = rulesInfo.filter(ruleInfo => ruleInfo.bestMatches);

  // For best results, we have to sort the candidates by the number of matches.
  // For example, we want "ads", "bds", "adv", "bdv", and "bdx" to generate
  // "ad[sv]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and "bdx" (3
  // rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    return ruleInfo2.bestMatches.length - ruleInfo1.bestMatches.length;
  });

  for (let ruleInfo of candidateRulesInfo)
  {
    // This rule has already been absorbed by another one.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    // reduce() skips the holes in the sparse matches array.
    let best = ruleInfo.matches.reduce((best, matchesForIndex) =>
    {
      matchesForIndex = matchesForIndex.filter(match =>
      {
        // Filter out rules that have either already been merged into other
        // rules or have had other rules merged into them.
        return !rulesInfo[match.index].merged &&
               !rulesInfo[match.index].mergedInto;
      });

      return matchesForIndex.length > best.length ? matchesForIndex : best;
    },
    []);

    if (best.length == 0)
      continue;

    // Merge all the matching rules into this one.

    let rule = ruleInfo.rule;
    let urlFilter = rule.trigger["url-filter"];
    let editIndex = best[0].edit.index;

    // The alternatives for the character at editIndex, starting with the one
    // in this rule's own URL filter.
    let characters = [urlFilter[editIndex]];
    let quantifier = "";

    for (let match of best)
    {
      if (match.edit.type == "delete")
        quantifier = "?";
      else
        characters.push(rules[match.index].trigger["url-filter"][editIndex]);

      rulesInfo[match.index].merged = true;
    }

    // With substitutions the character becomes a character class; with
    // deletions the character (or class) becomes optional.
    let atom = characters.length > 1 ? characterClassFor(characters) :
                                       urlFilter[editIndex];

    rule.trigger["url-filter"] = urlFilter.substring(0, editIndex) + atom +
                                 quantifier +
                                 urlFilter.substring(editIndex + 1);

    ruleInfo.mergedInto = true;
  }

  return rules.filter((rule, index) => !rulesInfo[index].merged);
}
611
369 let ContentBlockerList = 612 let ContentBlockerList =
370 /** 613 /**
371 * Create a new Adblock Plus filter to content blocker list converter 614 * Create a new Adblock Plus filter to content blocker list converter
372 * 615 *
373 * @constructor 616 * @constructor
374 */ 617 */
375 exports.ContentBlockerList = function () 618 exports.ContentBlockerList = function ()
376 { 619 {
377 this.requestFilters = []; 620 this.requestFilters = [];
378 this.requestExceptions = []; 621 this.requestExceptions = [];
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 660
418 parseDomains(filter.domains, domains, []); 661 parseDomains(filter.domains, domains, []);
419 } 662 }
420 }; 663 };
421 664
422 /** 665 /**
423 * Generate content blocker list for all filters that were added 666 * Generate content blocker list for all filters that were added
424 * 667 *
425 * @returns {Filter} filter Filter to convert 668 * @returns {Filter} filter Filter to convert
426 */ 669 */
427 ContentBlockerList.prototype.generateRules = function(filter) 670 ContentBlockerList.prototype.generateRules = function({merge = false} = {})
428 { 671 {
429 let rules = []; 672 let rules = [];
430 673
431 let groupedElemhideFilters = new Map(); 674 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 675 for (let filter of this.elemhideFilters)
433 { 676 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 677 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 678 if (!result)
436 continue; 679 continue;
437 680
(...skipping 27 matching lines...) Expand all
465 } 708 }
466 }); 709 });
467 710
468 for (let filter of this.elemhideExceptions) 711 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 712 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 713 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 714 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 715 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 716 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 717
475 return rules.filter(rule => !hasNonASCI(rule)); 718 rules = rules.filter(rule => !hasNonASCI(rule));
719
720 if (merge)
721 rules = mergeCloselyMatchingRules(rules);
722
723 return rules;
476 }; 724 };
OLDNEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld