abp2blocklist.js - Issue 29336753: Issue 3671 - Split out contentBlockerList API

Unified Diff: abp2blocklist.js

Issue 29336753: Issue 3671 - Split out contentBlockerList API (Closed)

Patch Set: Addressed feedback, removed some modules Created Feb. 21, 2016, 11:26 a.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: abp2blocklist.js

diff --git a/abp2blocklist.js b/abp2blocklist.js

index 2a032dd8f666ef4245889721676254e02b2a8501..207788888d907c60085c95ee0c657f455de3e532 100644

--- a/abp2blocklist.js

+++ b/abp2blocklist.js

@@ -1,367 +1,30 @@

"use strict";

Sebastian Noack 2016/02/21 21:21:43 Mind adding the missing license disclaimer here?

kzar 2016/02/22 12:28:05 Done.

let readline = require("readline");

-let punycode = require("punycode");

-let tldjs = require("tldjs");

-let filterClasses = require("./adblockplus.js");

-let typeMap = filterClasses.RegExpFilter.typeMap;

+// Hack to force `require("url")` to return our URL module instead of the

+// one included with Node.js! We manually load it here and then tweak the cache.

+// (After loading our code we must clear it again so that Node.js libraries will

+// be given the URL module that they expect.)

+require("url.js");

+require.cache["url"] = require.cache[require.resolve("url.js")];

Sebastian Noack 2016/02/21 20:34:44 I'd rather rename our url module, here and in Adbl

kzar 2016/02/22 12:28:05 OK I've renamed the module urlHelpers and removed

Sebastian Noack 2016/02/22 17:35:27 I might have got an even better idea. How about

kzar 2016/02/22 18:09:28 Sounds good, done.

-const selectorLimit = 5000;

+let Filter = require("filterClasses").Filter;

+let contentBlockerLists = require("./lib/contentBlockerLists.js");

-let requestFilters = [];

-let requestExceptions = [];

-let elemhideFilters = [];

-let elemhideExceptions = [];

-let elemhideSelectorExceptions = new Map();

+delete require.cache["url"];

-function recordException(filter)

- if (filter.contentType & (typeMap.IMAGE

- | typeMap.STYLESHEET

- | typeMap.SCRIPT

- | typeMap.FONT

- | typeMap.MEDIA

- | typeMap.POPUP

- | typeMap.OBJECT

- | typeMap.OBJECT_SUBREQUEST

- | typeMap.XMLHTTPREQUEST

- | typeMap.PING

- | typeMap.SUBDOCUMENT

- | typeMap.OTHER))

- requestExceptions.push(filter);

- if (filter.contentType & typeMap.ELEMHIDE)

- elemhideExceptions.push(filter);

-function parseDomains(domains, included, excluded)

- for (let domain in domains)

- {

- if (domain != "")

- {

- let enabled = domains[domain];

- domain = punycode.toASCII(domain.toLowerCase());

- if (!enabled)

- excluded.push(domain);

- else if (!domains[""])

- included.push(domain);

- }

-function recordSelectorException(filter)

- let domains = elemhideSelectorExceptions[filter.selector];

- if (!domains)

- domains = elemhideSelectorExceptions[filter.selector] = [];

- parseDomains(filter.domains, domains, []);

-function parseFilter(line)

- if (line.charAt(0) == "[")

- return;

- let filter = filterClasses.Filter.fromText(line);

- if (filter.sitekeys)

- return;

- if (filter instanceof filterClasses.RegExpFilter && !filter.regexpSource)

- return;

- if (filter instanceof filterClasses.BlockingFilter)

- requestFilters.push(filter);

- if (filter instanceof filterClasses.WhitelistFilter)

- recordException(filter);

- if (filter instanceof filterClasses.ElemHideFilter)

- elemhideFilters.push(filter);

- if (filter instanceof filterClasses.ElemHideException)

- recordSelectorException(filter);

-function escapeRegExp(s)

- return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

-function matchDomain(domain)

- return "^https?://([^/:]*\\.)?" + escapeRegExp(domain) + "[/:]";

-function convertElemHideFilter(filter)

- let included = [];

- let excluded = [];

- let rules = [];

- parseDomains(filter.domains, included, excluded);

- if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))

- return {matchDomains: included.map(matchDomain), selector: filter.selector};

-function toRegExp(text)

- let result = "";

- let lastIndex = text.length - 1;

- for (let i = 0; i < text.length; i++)

- {

- let c = text[i];

- switch (c)

- {

- case "*":

- if (result.length > 0 && i < lastIndex && text[i + 1] != "*")

- result += ".*";

- break;

- case "^":

- if (i < lastIndex)

- result += ".";

- break;

- case "|":

- if (i == 0)

- {

- result += "^";

- break;

- }

- if (i == lastIndex)

- {

- result += "$";

- break;

- }

- if (i == 1 && text[0] == "|")

- {

- result += "https?://";

- break;

- }

- case ".": case "+": case "?": case "$":

- case "{": case "}": case "(": case ")":

- case "[": case "]": case "\\":

- result += "\\";

- default:

- result += c;

- }

- return result;

-function getRegExpSource(filter)

- let source = toRegExp(filter.regexpSource.replace(

- // Safari expects punycode, filter lists use unicode

- /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i,

- function (match, prefix, domain)

- {

- return prefix + punycode.toASCII(domain);

- }

- ));

- // Limit rules to to HTTP(S) URLs

- if (!/^(\^|http)/i.test(source))

- source = "^https?://.*" + source;

- return source;

-function getResourceTypes(filter)

- let types = [];

+var rl = readline.createInterface({input: process.stdin, terminal: false});

+var filters = [];

- if (filter.contentType & typeMap.IMAGE)

- types.push("image");

- if (filter.contentType & typeMap.STYLESHEET)

- types.push("style-sheet");

- if (filter.contentType & typeMap.SCRIPT)

- types.push("script");

- if (filter.contentType & typeMap.FONT)

- types.push("font");

- if (filter.contentType & (typeMap.MEDIA | typeMap.OBJECT))

- types.push("media");

- if (filter.contentType & typeMap.POPUP)

- types.push("popup");

- if (filter.contentType & (typeMap.XMLHTTPREQUEST | typeMap.OBJECT_SUBREQUEST

- | typeMap.PING | typeMap.OTHER))

- types.push("raw");

- if (filter.contentType & typeMap.SUBDOCUMENT)

- types.push("document");

- return types;

-function addDomainPrefix(domains)

- let result = [];

- for (let domain of domains)

- {

- result.push(domain);

- if (tldjs.getSubdomain(domain) == "")

- result.push("www." + domain);

- }

- return result;

-function convertFilter(filter, action, withResourceTypes)

- let trigger = {"url-filter": getRegExpSource(filter)};

- let included = [];

- let excluded = [];

- parseDomains(filter.domains, included, excluded);

- if (withResourceTypes)

- trigger["resource-type"] = getResourceTypes(filter);

- if (filter.thirdParty != null)

- trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];

- if (included.length > 0)

- trigger["if-domain"] = addDomainPrefix(included);

- else if (excluded.length > 0)

- trigger["unless-domain"] = addDomainPrefix(excluded);

- return {trigger: trigger, action: {type: action}};

-function hasNonASCI(obj)

- if (typeof obj == "string")

- {

- if (/[^\x00-\x7F]/.test(obj))

- return true;

- }

- if (typeof obj == "object")

- {

- if (obj instanceof Array)

- for (let item of obj)

- if (hasNonASCI(item))

- return true;

- for (let name of Object.getOwnPropertyNames(obj))

- if (hasNonASCI(obj[name]))

- return true;

- }

- return false;

-function convertIDSelectorsToAttributeSelectors(selector)

+rl.on("line", line =>

{

- // First we figure out where all the IDs are

- let sep = "";

- let start = null;

- let positions = [];

- for (let i = 0; i < selector.length; i++)

- {

- let chr = selector[i];

- if (chr == "\\") // ignore escaped characters

- i++;

- else if (chr == sep) // don't match IDs within quoted text

- sep = ""; // e.g. [attr="#Hello"]

- else if (sep == "")

- {

- if (chr == '"' || chr == "'")

- sep = chr;

- else if (start == null) // look for the start of an ID

- {

- if (chr == "#")

- start = i;

- }

- else if (chr != "-" && chr != "_" &&

- (chr < "0" ||

- chr > "9" && chr < "A" ||

- chr > "Z" && chr < "a" ||

- chr > "z" && chr < "\x80")) // look for the end of the ID

- {

- positions.push({start: start, end: i});

- start = null;

- }

- if (start != null)

- positions.push({start: start, end: selector.length});

- // Now replace them all with the [id="someID"] form

- let newSelector = [];

- let i = 0;

- for (let pos of positions)

- {

- newSelector.push(selector.substring(i, pos.start));

- newSelector.push('[id=' + selector.substring(pos.start + 1, pos.end) + ']');

- i = pos.end;

- }

- newSelector.push(selector.substring(i));

+ if (line.charAt(0) != "[")

Sebastian Noack 2016/02/21 20:34:44 What about empty lines? It seems that we didn't

kzar 2016/02/22 12:28:05 It appears that contentBlockerLists.js logic was h

Sebastian Noack 2016/02/22 17:35:27 OK, I looked into it and figured that this case wa

kzar 2016/02/22 18:09:28 Done. As for lines with only white-space, we're c

Sebastian Noack 2016/02/22 18:20:26 Yeah, I already figured that myself. The point is

On 2016/02/22 18:09:28, kzar wrote: > On 2016/02/22 17:35:27, Sebastian Noack wrote: > > On 2016/02/22 12:28:05, kzar wrote: > > > On 2016/02/21 20:34:44, Sebastian Noack wrote: > > > > What's about empty lines? It seems that we didn't handle those before > > either. > > > > But wouldn't an empty line result result into a filter that would block > > > > everything? > > > > > > It appears that contentBlockerLists.js logic was handling that case somehow > > but > > > I can't see exactly why. Anyway I think you're right, better to check here > > > explicitly. Done. > > > > OK, I looked into it and figured that this case was already handled by > following > > check in the original code: > > > > if (filter instanceof filterClasses.RegExpFilter && !filter.regexpSource) > > return; > > > > This check was meant to exclude raw regular expression filters. But it also > > bails if the filter has been parsed from and empty string, in which case > > regexpSource is "" rather than null. > > > > I think that check should be changed to |filter.regexpSource == null|. While > > empty lines should be explicitly ignored here. > > > > Also what is if a line in a filter list contains only whitespace characters? I > > didn't look into it yet, but we should make sure that we treat that case the > > same as when parsing filter lists in Adblock Plus. > > Done. > > As for lines with only white-space, we're currently passing them to > Filter.fromText, which I guess is all that Adblock Plus does. I tried it out and > here's what a rule generated from a filter with a fitertext of " " looks like: > > { > "trigger": { > "url-filter": "^https?://.* ", > "resource-type": [ > "image", > "style-sheet", > "script", > "font", > "media", > "raw", > "document" > ] > }, > "action": { > "type": "block" > } > } > > I'm not entirely sure if we want abp2blocklist to skip over lines with only > whitespace or treat them as any other line. 
I guess probably the former, as it's > more likely a mistake than anything else.

Yeah, I already figured that myself. The point is that we want abp2blocklist to behave exactly like Adblock Plus here. Can you try out what happens to empty lines and lines that contain only whitespace when Adblock Plus parses filter subscriptions?

kzar 2016/02/22 19:46:01 OK add the custom subscription of http://static.kz

On 2016/02/22 18:20:26, Sebastian Noack wrote: > On 2016/02/22 18:09:28, kzar wrote: > > On 2016/02/22 17:35:27, Sebastian Noack wrote: > > > On 2016/02/22 12:28:05, kzar wrote: > > > > On 2016/02/21 20:34:44, Sebastian Noack wrote: > > > > > What's about empty lines? It seems that we didn't handle those before > > > either. > > > > > But wouldn't an empty line result result into a filter that would block > > > > > everything? > > > > > > > > It appears that contentBlockerLists.js logic was handling that case > somehow > > > but > > > > I can't see exactly why. Anyway I think you're right, better to check here > > > > explicitly. Done. > > > > > > OK, I looked into it and figured that this case was already handled by > > following > > > check in the original code: > > > > > > if (filter instanceof filterClasses.RegExpFilter && !filter.regexpSource) > > > return; > > > > > > This check was meant to exclude raw regular expression filters. But it also > > > bails if the filter has been parsed from and empty string, in which case > > > regexpSource is "" rather than null. > > > > > > I think that check should be changed to |filter.regexpSource == null|. While > > > empty lines should be explicitly ignored here. > > > > > > Also what is if a line in a filter list contains only whitespace characters? > I > > > didn't look into it yet, but we should make sure that we treat that case the > > > same as when parsing filter lists in Adblock Plus. > > > > Done. > > > > As for lines with only white-space, we're currently passing them to > > Filter.fromText, which I guess is all that Adblock Plus does. 
I tried it out > and > > here's what a rule generated from a filter with a fitertext of " " looks > like: > > > > { > > "trigger": { > > "url-filter": "^https?://.* ", > > "resource-type": [ > > "image", > > "style-sheet", > > "script", > > "font", > > "media", > > "raw", > > "document" > > ] > > }, > > "action": { > > "type": "block" > > } > > } > > > > I'm not entirely sure if we want abp2blocklist to skip over lines with only > > whitespace or treat them as any other line. I guess probably the former, as > it's > > more likely a mistake than anything else. > > Yeah, I already figured that myself. The point is we want abp2blocklist behave > exactly like Adblock Plus here. Can you try out what happens to empty lines and > lines who only have whitespace when Adblock Plus parses filter subscriptions?

OK, add the custom subscription http://static.kzar.co.uk/dave.txt and then run this in the background console: require("subscriptionClasses").Subscription.knownSubscriptions["http://static.kzar.co.uk/dave.txt"].filters.map(function(f) { return f.text; }).join("\n") It turns out that lines containing only whitespace are stripped, so I've updated abp2blocklist.js to do the same.

+ filters.push(Filter.fromText(line));

+});

- return newSelector.join("");

-function logRules()

+rl.on("close", () =>

{

- let rules = [];

- function addRule(rule)

- {

- if (!hasNonASCI(rule))

- rules.push(rule);

- }

- let groupedElemhideFilters = new Map();

- for (let filter of elemhideFilters)

- {

- let result = convertElemHideFilter(filter);

- if (!result)

- continue;

- if (result.matchDomains.length == 0)

- result.matchDomains = ["^https?://"];

- for (let matchDomain of result.matchDomains)

- {

- let group = groupedElemhideFilters.get(matchDomain) || [];

- group.push(result.selector);

- groupedElemhideFilters.set(matchDomain, group);

- }

- groupedElemhideFilters.forEach((selectors, matchDomain) =>

- {

- while (selectors.length)

- {

- let selector = selectors.splice(0, selectorLimit).join(", ");

- // As of Safari 9.0 element IDs are matched as lowercase. We work around

- // this by converting to the attribute format [id="elementID"]

- selector = convertIDSelectorsToAttributeSelectors(selector);

- addRule({

- trigger: {"url-filter": matchDomain},

- action: {type: "css-display-none",

- selector: selector}

- });

- }

- });

- for (let filter of elemhideExceptions)

- addRule(convertFilter(filter, "ignore-previous-rules", false));

- for (let filter of requestFilters)

- addRule(convertFilter(filter, "block", true));

- for (let filter of requestExceptions)

- addRule(convertFilter(filter, "ignore-previous-rules", true));

- console.log(JSON.stringify(rules, null, "\t"));

-let rl = readline.createInterface({input: process.stdin, terminal: false});

-rl.on("line", parseFilter);

-rl.on("close", logRules);

+ console.log(JSON.stringify(contentBlockerLists.convertFilters(filters),

+ null, "\t"));

+});

« no previous file with comments | « README.md ('k') | adblockplus.js » ('j') | lib/contentBlockerLists.js » ('J')