Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/contentBlockerLists.js

Issue 29336753: Issue 3671 - Split out contentBlockerList API (Closed)
Patch Set: Addressed feedback, removed some modules Created Feb. 21, 2016, 11:26 a.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: lib/contentBlockerLists.js
diff --git a/lib/contentBlockerLists.js b/lib/contentBlockerLists.js
new file mode 100644
index 0000000000000000000000000000000000000000..95b94fc16eda818c6e2f692c0cd5b8f789b0cc32
--- /dev/null
+++ b/lib/contentBlockerLists.js
@@ -0,0 +1,380 @@
+/*
+ * This file is part of Adblock Plus <https://adblockplus.org/>,
+ * Copyright (C) 2006-2016 Eyeo GmbH
+ *
+ * Adblock Plus is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * Adblock Plus is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @module contentBlockerLists */
Sebastian Noack 2016/02/21 20:34:45 I'd rather call that module simply abp2blocklist,
Sebastian Noack 2016/02/21 20:36:15 s/is the module of the project/is the name of the
kzar 2016/02/22 12:28:06 I think abp2blocklist is a fine name for the scrip
Sebastian Noack 2016/02/22 17:35:28 Whether it will be included into adblockpluscore o
kzar 2016/02/22 18:09:29 I'd really rather keep the name lib/contentBlocker
Sebastian Noack 2016/02/22 18:20:26 So I guess we need a third opinion here. fhd, what
kzar 2016/02/22 19:46:01 I mean by the same logic, wouldn't having the libr
Felix Dahlke 2016/02/24 13:06:30 No particularly strong opinion, but since you're a
Sebastian Noack 2016/02/24 19:31:51 Thanks for your feedback. But first of all "abp2b
Sebastian Noack 2016/02/24 22:10:04 Dave and I continued the discussion on IRC: <snoa
Felix Dahlke 2016/02/26 15:40:35 Alright, read up on the discussion again and had a
+
+"use strict";
+
+let filterClasses = require("filterClasses");
+let getBaseDomain = require("url").getBaseDomain;
+let punycode = require("punycode");
+
+
Sebastian Noack 2016/02/21 20:34:45 Nit: One empty line should suffice here. Same belo
kzar 2016/02/22 12:28:05 Done.
+// Maximum number of element hiding selectors joined into the selector of a
+// single css-display-none rule; larger groups are split across several rules.
+const selectorLimit = 5000;
+// Shorthand for the content type bit flags exposed by RegExpFilter.
+const typeMap = filterClasses.RegExpFilter.typeMap;
+
+
+function parseDomains(domains, included, excluded)
+{
+ for (let domain in domains)
+ {
+ if (domain != "")
+ {
+ let enabled = domains[domain];
+ domain = punycode.toASCII(domain.toLowerCase());
+
+ if (!enabled)
+ excluded.push(domain);
+ else if (!domains[""])
+ included.push(domain);
+ }
+ }
+}
+
+function escapeRegExp(s)
+{
+ return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+function matchDomain(domain)
+{
+ return "^https?://([^/:]*\\.)?" + escapeRegExp(domain) + "[/:]";
+}
+
+function convertElemHideFilter(filter, elemhideSelectorExceptions)
+{
+ let included = [];
+ let excluded = [];
+ let rules = [];
+
+ parseDomains(filter.domains, included, excluded);
+
+ if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))
+ return {matchDomains: included.map(matchDomain), selector: filter.selector};
+}
+
+/**
+ * Translates the text of a filter pattern into the source of a regular
+ * expression, character by character.
+ *
+ * @param {string} text  filter pattern
+ * @returns {string} regular expression source
+ */
+function toRegExp(text)
+{
+ let result = "";
Sebastian Noack 2016/02/21 20:34:45 This is unrelated, but we probably should use an a
kzar 2016/02/22 12:28:05 Done.
+ let lastIndex = text.length - 1;
+
+ for (let i = 0; i < text.length; i++)
+ {
+ let c = text[i];
+
+ switch (c)
+ {
+ case "*":
+ // Wildcard: emitted as ".*" only if output precedes it, it isn't the
+ // last character and it isn't followed by another "*". Leading and
+ // trailing wildcards are dropped and runs of "*" collapse into one.
+ if (result.length > 0 && i < lastIndex && text[i + 1] != "*")
+ result += ".*";
+ break;
+ case "^":
+ // Becomes "." (any single character), or nothing at the very end
+ if (i < lastIndex)
+ result += ".";
+ break;
+ case "|":
+ // At the start: anchor to the beginning of the URL
+ if (i == 0)
+ {
+ result += "^";
+ break;
+ }
+ // At the end: anchor to the end of the URL
+ if (i == lastIndex)
+ {
+ result += "$";
+ break;
+ }
+ // Second character of a leading "||": require an HTTP(S) scheme
+ if (i == 1 && text[0] == "|")
+ {
+ result += "https?://";
+ break;
+ }
+ // Any other "|" deliberately falls through and is escaped as a literal
+ case ".": case "+": case "?": case "$":
+ case "{": case "}": case "(": case ")":
+ case "[": case "]": case "\\":
+ // Regular expression special character: prefix a backslash, then fall
+ // through so that the character itself is appended as well
+ result += "\\";
+ default:
+ result += c;
+ }
+ }
+
+ return result;
+}
+
+function getRegExpSource(filter)
+{
+ let source = toRegExp(filter.regexpSource.replace(
+ // Safari expects punycode, filter lists use unicode
+ /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i,
+ function (match, prefix, domain)
+ {
+ return prefix + punycode.toASCII(domain);
+ }
+ ));
+
+ // Limit rules to to HTTP(S) URLs
+ if (!/^(\^|http)/i.test(source))
+ source = "^https?://.*" + source;
+
+ return source;
+}
+
+/**
+ * Maps the content type bit mask of a filter to the list of resource type
+ * names used in the "resource-type" field of a rule trigger.
+ *
+ * @param {Object} filter  filter whose `contentType` bit mask is inspected
+ * @returns {string[]} resource type names, empty if no known bit is set
+ */
+function getResourceTypes(filter)
+{
+ let types = [];
+
+ if (filter.contentType & typeMap.IMAGE)
+ types.push("image");
+ if (filter.contentType & typeMap.STYLESHEET)
+ types.push("style-sheet");
+ if (filter.contentType & typeMap.SCRIPT)
+ types.push("script");
+ if (filter.contentType & typeMap.FONT)
+ types.push("font");
+ // MEDIA and OBJECT both map to "media"
+ if (filter.contentType & (typeMap.MEDIA | typeMap.OBJECT))
+ types.push("media");
+ if (filter.contentType & typeMap.POPUP)
+ types.push("popup");
+ // All of these types are covered by the single "raw" resource type
+ if (filter.contentType & (typeMap.XMLHTTPREQUEST | typeMap.OBJECT_SUBREQUEST
Sebastian Noack 2016/02/21 20:34:45 Nit: This check reads easier if the types are alig
kzar 2016/02/22 12:28:05 Done.
+ | typeMap.PING | typeMap.OTHER))
+ types.push("raw");
+ if (filter.contentType & typeMap.SUBDOCUMENT)
+ types.push("document");
+
+ return types;
+}
+
+function addDomainPrefix(domains)
+{
+ let result = [];
+
+ for (let domain of domains)
+ {
+ result.push(domain);
+
+ if (getBaseDomain(domain) == domain)
+ result.push("www." + domain);
+ }
+
+ return result;
+}
+
+function convertFilter(filter, action, withResourceTypes)
+{
+ let trigger = {"url-filter": getRegExpSource(filter)};
+ let included = [];
+ let excluded = [];
+
+ parseDomains(filter.domains, included, excluded);
+
+ if (withResourceTypes)
+ trigger["resource-type"] = getResourceTypes(filter);
+ if (filter.thirdParty != null)
+ trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];
+
+ if (included.length > 0)
+ trigger["if-domain"] = addDomainPrefix(included);
+ else if (excluded.length > 0)
+ trigger["unless-domain"] = addDomainPrefix(excluded);
+
+ return {trigger: trigger, action: {type: action}};
+}
+
+function hasNonASCI(obj)
+{
+ if (typeof obj == "string")
+ {
+ if (/[^\x00-\x7F]/.test(obj))
+ return true;
+ }
+
+ if (typeof obj == "object")
+ {
+ if (obj instanceof Array)
+ for (let item of obj)
+ if (hasNonASCI(item))
+ return true;
+
+ for (let name of Object.getOwnPropertyNames(obj))
Sebastian Noack 2016/02/21 20:34:45 When we start to transpile the code with jsHydra,
kzar 2016/02/22 12:28:05 Done.
+ if (hasNonASCI(obj[name]))
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * Rewrites every ID selector (#someID) in a CSS selector into the attribute
+ * selector form [id=someID]. IDs inside quoted text are left alone and
+ * backslash-escaped characters are skipped.
+ *
+ * @param {string} selector  CSS selector, possibly a comma separated list
+ * @returns {string} selector with all ID selectors replaced
+ */
+function convertIDSelectorsToAttributeSelectors(selector)
+{
+ // First we figure out where all the IDs are
+ let sep = ""; // quote character of the string we are inside, "" if none
+ let start = null; // index of the "#" of the ID being scanned, if any
+ let positions = [];
+ for (let i = 0; i < selector.length; i++)
+ {
+ let chr = selector[i];
+
+ if (chr == "\\") // ignore escaped characters
+ i++;
+ else if (chr == sep) // don't match IDs within quoted text
+ sep = ""; // e.g. [attr="#Hello"]
+ else if (sep == "")
+ {
+ if (chr == '"' || chr == "'")
+ sep = chr;
+ else if (start == null) // look for the start of an ID
+ {
+ if (chr == "#")
+ start = i;
+ }
+ // The ID continues over "-", "_", ASCII digits and letters and any
+ // character from "\x80" upwards; every other character terminates it
+ else if (chr != "-" && chr != "_" &&
+ (chr < "0" ||
+ chr > "9" && chr < "A" ||
+ chr > "Z" && chr < "a" ||
+ chr > "z" && chr < "\x80")) // look for the end of the ID
+ {
+ positions.push({start: start, end: i});
+ start = null;
+ }
+ }
+ }
+ // An ID that runs up to the end of the selector
+ if (start != null)
+ positions.push({start: start, end: selector.length});
+
+ // Now replace them all with the [id=someID] form (the value is not quoted)
+ let newSelector = [];
+ let i = 0;
+ for (let pos of positions)
+ {
+ // Text between the previous ID and this one is copied unchanged
+ newSelector.push(selector.substring(i, pos.start));
+ // pos.start + 1 skips the "#" itself
+ newSelector.push('[id=' + selector.substring(pos.start + 1, pos.end) + ']');
Sebastian Noack 2016/02/21 20:34:45 Sorry, I missed that one on the previous review; t
kzar 2016/02/22 12:28:05 Done.
+ i = pos.end;
+ }
+ // Remainder after the last ID
+ newSelector.push(selector.substring(i));
+
+ return newSelector.join("");
+}
+
+/**
+ * Converts given Adblock Plus filters into content blocker list.
+ *
+ * @param {Filter[]} filters Sequence of filters to convert
+ * @returns {Object[]} Array of content blocker rules
+ */
+exports.convertFilters = function(filters)
Sebastian Noack 2016/02/21 20:34:44 On the command line we read filters line by line.
kzar 2016/02/22 12:28:05 Done.
+{
+ let requestFilters = [];
+ let requestExceptions = [];
+ let elemhideFilters = [];
+ let elemhideExceptions = [];
+ let elemhideSelectorExceptions = new Map();
+ let rules = [];
+
+ // Group the filters into their various types
+ for (let filter of filters)
+ {
+ if (filter.sitekeys)
+ continue;
+ if (filter instanceof filterClasses.RegExpFilter && !filter.regexpSource)
+ continue;
+
+ if (filter instanceof filterClasses.BlockingFilter)
+ requestFilters.push(filter);
+
+ if (filter instanceof filterClasses.WhitelistFilter)
+ {
+ if (filter.contentType & (typeMap.IMAGE
+ | typeMap.STYLESHEET
+ | typeMap.SCRIPT
+ | typeMap.FONT
+ | typeMap.MEDIA
+ | typeMap.POPUP
+ | typeMap.OBJECT
+ | typeMap.OBJECT_SUBREQUEST
+ | typeMap.XMLHTTPREQUEST
+ | typeMap.PING
+ | typeMap.SUBDOCUMENT
+ | typeMap.OTHER))
+ requestExceptions.push(filter);
+
+ if (filter.contentType & typeMap.ELEMHIDE)
+ elemhideExceptions.push(filter);
+ }
+
+ if (filter instanceof filterClasses.ElemHideFilter)
+ elemhideFilters.push(filter);
+
+ if (filter instanceof filterClasses.ElemHideException)
+ {
+ let domains = elemhideSelectorExceptions[filter.selector];
+ if (!domains)
+ domains = elemhideSelectorExceptions[filter.selector] = [];
+
+ parseDomains(filter.domains, domains, []);
+ }
+ }
+
+ // Create the content blocker rules for those grouped filters
+ function addRule(rule)
+ {
+ if (!hasNonASCI(rule))
+ rules.push(rule);
+ }
+
+ let groupedElemhideFilters = new Map();
+ for (let filter of elemhideFilters)
+ {
+ let result = convertElemHideFilter(filter, elemhideSelectorExceptions);
+ if (!result)
+ continue;
+
+ if (result.matchDomains.length == 0)
+ result.matchDomains = ["^https?://"];
+
+ for (let matchDomain of result.matchDomains)
+ {
+ let group = groupedElemhideFilters.get(matchDomain) || [];
+ group.push(result.selector);
+ groupedElemhideFilters.set(matchDomain, group);
+ }
+ }
+
+ groupedElemhideFilters.forEach((selectors, matchDomain) =>
+ {
+ while (selectors.length)
+ {
+ let selector = selectors.splice(0, selectorLimit).join(", ");
+
+ // As of Safari 9.0 element IDs are matched as lowercase. We work around
+ // this by converting to the attribute format [id="elementID"]
+ selector = convertIDSelectorsToAttributeSelectors(selector);
+
+ addRule({
+ trigger: {"url-filter": matchDomain},
+ action: {type: "css-display-none",
+ selector: selector}
+ });
+ }
+ });
+
+ for (let filter of elemhideExceptions)
+ addRule(convertFilter(filter, "ignore-previous-rules", false));
+ for (let filter of requestFilters)
+ addRule(convertFilter(filter, "block", true));
+ for (let filter of requestExceptions)
+ addRule(convertFilter(filter, "ignore-previous-rules", true));
+
+ return rules;
+};

Powered by Google App Engine
This is Rietveld