Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: abp2blocklist.js

Issue 29336753: Issue 3671 - Split out contentBlockerList API (Closed)
Patch Set: Addressed feedback, removed some modules Created Feb. 21, 2016, 11:26 a.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « README.md ('k') | adblockplus.js » ('j') | lib/contentBlockerLists.js » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: abp2blocklist.js
diff --git a/abp2blocklist.js b/abp2blocklist.js
index 2a032dd8f666ef4245889721676254e02b2a8501..207788888d907c60085c95ee0c657f455de3e532 100644
--- a/abp2blocklist.js
+++ b/abp2blocklist.js
@@ -1,367 +1,30 @@
"use strict";
Sebastian Noack 2016/02/21 21:21:43 Mind adding the missing license disclaimer here?
kzar 2016/02/22 12:28:05 Done.
let readline = require("readline");
-let punycode = require("punycode");
-let tldjs = require("tldjs");
-let filterClasses = require("./adblockplus.js");
-let typeMap = filterClasses.RegExpFilter.typeMap;
+// Hack to force `require("url")` to return our URL module instead of the
+// one included with Node.js! We manually load it here and then tweak the cache.
+// (After loading our code we must clear it again so that Node.js libraries will
+// be given the URL module that they expect.)
+require("url.js");
+require.cache["url"] = require.cache[require.resolve("url.js")];
Sebastian Noack 2016/02/21 20:34:44 I'd rather rename our url module, here and in Adbl
kzar 2016/02/22 12:28:05 OK I've renamed the module urlHelpers and removed
Sebastian Noack 2016/02/22 17:35:27 I might have got an even better idea. How about
kzar 2016/02/22 18:09:28 Sounds good, done.
-const selectorLimit = 5000;
+let Filter = require("filterClasses").Filter;
+let contentBlockerLists = require("./lib/contentBlockerLists.js");
-let requestFilters = [];
-let requestExceptions = [];
-let elemhideFilters = [];
-let elemhideExceptions = [];
-let elemhideSelectorExceptions = new Map();
+delete require.cache["url"];
-function recordException(filter)
-{
- if (filter.contentType & (typeMap.IMAGE
- | typeMap.STYLESHEET
- | typeMap.SCRIPT
- | typeMap.FONT
- | typeMap.MEDIA
- | typeMap.POPUP
- | typeMap.OBJECT
- | typeMap.OBJECT_SUBREQUEST
- | typeMap.XMLHTTPREQUEST
- | typeMap.PING
- | typeMap.SUBDOCUMENT
- | typeMap.OTHER))
- requestExceptions.push(filter);
-
- if (filter.contentType & typeMap.ELEMHIDE)
- elemhideExceptions.push(filter);
-}
-
-function parseDomains(domains, included, excluded)
-{
- for (let domain in domains)
- {
- if (domain != "")
- {
- let enabled = domains[domain];
- domain = punycode.toASCII(domain.toLowerCase());
-
- if (!enabled)
- excluded.push(domain);
- else if (!domains[""])
- included.push(domain);
- }
- }
-}
-
-function recordSelectorException(filter)
-{
- let domains = elemhideSelectorExceptions[filter.selector];
- if (!domains)
- domains = elemhideSelectorExceptions[filter.selector] = [];
-
- parseDomains(filter.domains, domains, []);
-}
-
-function parseFilter(line)
-{
- if (line.charAt(0) == "[")
- return;
-
- let filter = filterClasses.Filter.fromText(line);
-
- if (filter.sitekeys)
- return;
- if (filter instanceof filterClasses.RegExpFilter && !filter.regexpSource)
- return;
-
- if (filter instanceof filterClasses.BlockingFilter)
- requestFilters.push(filter);
- if (filter instanceof filterClasses.WhitelistFilter)
- recordException(filter);
- if (filter instanceof filterClasses.ElemHideFilter)
- elemhideFilters.push(filter);
- if (filter instanceof filterClasses.ElemHideException)
- recordSelectorException(filter);
-}
-
-function escapeRegExp(s)
-{
- return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-}
-
-function matchDomain(domain)
-{
- return "^https?://([^/:]*\\.)?" + escapeRegExp(domain) + "[/:]";
-}
-
-function convertElemHideFilter(filter)
-{
- let included = [];
- let excluded = [];
- let rules = [];
-
- parseDomains(filter.domains, included, excluded);
-
- if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))
- return {matchDomains: included.map(matchDomain), selector: filter.selector};
-}
-
-function toRegExp(text)
-{
- let result = "";
- let lastIndex = text.length - 1;
-
- for (let i = 0; i < text.length; i++)
- {
- let c = text[i];
-
- switch (c)
- {
- case "*":
- if (result.length > 0 && i < lastIndex && text[i + 1] != "*")
- result += ".*";
- break;
- case "^":
- if (i < lastIndex)
- result += ".";
- break;
- case "|":
- if (i == 0)
- {
- result += "^";
- break;
- }
- if (i == lastIndex)
- {
- result += "$";
- break;
- }
- if (i == 1 && text[0] == "|")
- {
- result += "https?://";
- break;
- }
- case ".": case "+": case "?": case "$":
- case "{": case "}": case "(": case ")":
- case "[": case "]": case "\\":
- result += "\\";
- default:
- result += c;
- }
- }
-
- return result;
-}
-
-function getRegExpSource(filter)
-{
- let source = toRegExp(filter.regexpSource.replace(
- // Safari expects punycode, filter lists use unicode
- /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i,
- function (match, prefix, domain)
- {
- return prefix + punycode.toASCII(domain);
- }
- ));
-
- // Limit rules to to HTTP(S) URLs
- if (!/^(\^|http)/i.test(source))
- source = "^https?://.*" + source;
-
- return source;
-}
-
-function getResourceTypes(filter)
-{
- let types = [];
+var rl = readline.createInterface({input: process.stdin, terminal: false});
+var filters = [];
- if (filter.contentType & typeMap.IMAGE)
- types.push("image");
- if (filter.contentType & typeMap.STYLESHEET)
- types.push("style-sheet");
- if (filter.contentType & typeMap.SCRIPT)
- types.push("script");
- if (filter.contentType & typeMap.FONT)
- types.push("font");
- if (filter.contentType & (typeMap.MEDIA | typeMap.OBJECT))
- types.push("media");
- if (filter.contentType & typeMap.POPUP)
- types.push("popup");
- if (filter.contentType & (typeMap.XMLHTTPREQUEST | typeMap.OBJECT_SUBREQUEST
- | typeMap.PING | typeMap.OTHER))
- types.push("raw");
- if (filter.contentType & typeMap.SUBDOCUMENT)
- types.push("document");
-
- return types;
-}
-
-function addDomainPrefix(domains)
-{
- let result = [];
-
- for (let domain of domains)
- {
- result.push(domain);
-
- if (tldjs.getSubdomain(domain) == "")
- result.push("www." + domain);
- }
-
- return result;
-}
-
-function convertFilter(filter, action, withResourceTypes)
-{
- let trigger = {"url-filter": getRegExpSource(filter)};
- let included = [];
- let excluded = [];
-
- parseDomains(filter.domains, included, excluded);
-
- if (withResourceTypes)
- trigger["resource-type"] = getResourceTypes(filter);
- if (filter.thirdParty != null)
- trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];
-
- if (included.length > 0)
- trigger["if-domain"] = addDomainPrefix(included);
- else if (excluded.length > 0)
- trigger["unless-domain"] = addDomainPrefix(excluded);
-
- return {trigger: trigger, action: {type: action}};
-}
-
-function hasNonASCI(obj)
-{
- if (typeof obj == "string")
- {
- if (/[^\x00-\x7F]/.test(obj))
- return true;
- }
-
- if (typeof obj == "object")
- {
- if (obj instanceof Array)
- for (let item of obj)
- if (hasNonASCI(item))
- return true;
-
- for (let name of Object.getOwnPropertyNames(obj))
- if (hasNonASCI(obj[name]))
- return true;
- }
-
- return false;
-}
-
-function convertIDSelectorsToAttributeSelectors(selector)
+rl.on("line", line =>
{
- // First we figure out where all the IDs are
- let sep = "";
- let start = null;
- let positions = [];
- for (let i = 0; i < selector.length; i++)
- {
- let chr = selector[i];
-
- if (chr == "\\") // ignore escaped characters
- i++;
- else if (chr == sep) // don't match IDs within quoted text
- sep = ""; // e.g. [attr="#Hello"]
- else if (sep == "")
- {
- if (chr == '"' || chr == "'")
- sep = chr;
- else if (start == null) // look for the start of an ID
- {
- if (chr == "#")
- start = i;
- }
- else if (chr != "-" && chr != "_" &&
- (chr < "0" ||
- chr > "9" && chr < "A" ||
- chr > "Z" && chr < "a" ||
- chr > "z" && chr < "\x80")) // look for the end of the ID
- {
- positions.push({start: start, end: i});
- start = null;
- }
- }
- }
- if (start != null)
- positions.push({start: start, end: selector.length});
-
- // Now replace them all with the [id="someID"] form
- let newSelector = [];
- let i = 0;
- for (let pos of positions)
- {
- newSelector.push(selector.substring(i, pos.start));
- newSelector.push('[id=' + selector.substring(pos.start + 1, pos.end) + ']');
- i = pos.end;
- }
- newSelector.push(selector.substring(i));
+ if (line.charAt(0) != "[")
Sebastian Noack 2016/02/21 20:34:44 What about empty lines? It seems that we didn't
kzar 2016/02/22 12:28:05 It appears that contentBlockerLists.js logic was h
Sebastian Noack 2016/02/22 17:35:27 OK, I looked into it and figured that this case wa
kzar 2016/02/22 18:09:28 Done. As for lines with only white-space, we're c
Sebastian Noack 2016/02/22 18:20:26 Yeah, I already figured that myself. The point is
kzar 2016/02/22 19:46:01 OK add the custom subscription of http://static.kz
+ filters.push(Filter.fromText(line));
+});
- return newSelector.join("");
-}
-
-function logRules()
+rl.on("close", () =>
{
- let rules = [];
-
- function addRule(rule)
- {
- if (!hasNonASCI(rule))
- rules.push(rule);
- }
-
- let groupedElemhideFilters = new Map();
- for (let filter of elemhideFilters)
- {
- let result = convertElemHideFilter(filter);
- if (!result)
- continue;
-
- if (result.matchDomains.length == 0)
- result.matchDomains = ["^https?://"];
-
- for (let matchDomain of result.matchDomains)
- {
- let group = groupedElemhideFilters.get(matchDomain) || [];
- group.push(result.selector);
- groupedElemhideFilters.set(matchDomain, group);
- }
- }
-
- groupedElemhideFilters.forEach((selectors, matchDomain) =>
- {
- while (selectors.length)
- {
- let selector = selectors.splice(0, selectorLimit).join(", ");
-
- // As of Safari 9.0 element IDs are matched as lowercase. We work around
- // this by converting to the attribute format [id="elementID"]
- selector = convertIDSelectorsToAttributeSelectors(selector);
-
- addRule({
- trigger: {"url-filter": matchDomain},
- action: {type: "css-display-none",
- selector: selector}
- });
- }
- });
-
- for (let filter of elemhideExceptions)
- addRule(convertFilter(filter, "ignore-previous-rules", false));
-
- for (let filter of requestFilters)
- addRule(convertFilter(filter, "block", true));
- for (let filter of requestExceptions)
- addRule(convertFilter(filter, "ignore-previous-rules", true));
-
- console.log(JSON.stringify(rules, null, "\t"));
-}
-
-let rl = readline.createInterface({input: process.stdin, terminal: false});
-rl.on("line", parseFilter);
-rl.on("close", logRules);
+ console.log(JSON.stringify(contentBlockerLists.convertFilters(filters),
+ null, "\t"));
+});
« no previous file with comments | « README.md ('k') | adblockplus.js » ('j') | lib/contentBlockerLists.js » ('J')

Powered by Google App Engine
This is Rietveld