lib/abp2blocklist.js - Issue 29337803: Issue 3710 - Unify hostname logic

Unified Diff: lib/abp2blocklist.js

Issue 29337803: Issue 3710 - Unify hostname logic (Closed)

Patch Set: Created Feb. 27, 2016, 2:23 p.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: lib/abp2blocklist.js

diff --git a/lib/abp2blocklist.js b/lib/abp2blocklist.js

index 1c1ff6cf57607e7344359dc9109a616c793df721..069d5153cea8d02a37076bd0b8509c9136f20aee 100644

--- a/lib/abp2blocklist.js

+++ b/lib/abp2blocklist.js

@@ -66,38 +66,49 @@ function convertElemHideFilter(filter, elemhideSelectorExceptions)

}

/**

- * Convert the given filter "regexpSource" string into a regular expression.

+ * Convert the given filter "regexpSource" string into a regular expression,

+ * handling the conversion of unicode inside hostnames to punycode.

* (Also deciding if the regular expression can be safely converted to and

* matched as lower case or not.)

* @param {string} text regexpSource property of a filter

* @returns {object} An object containing a regular expression string and a bool

* indicating if the filter can be safely matched as lower

- * case: {regexp: "...", caseSenstive: true/false}

+ * case: {regexp: "...", canSafelyMatchAsLowercase: true/false}

function toRegExp(text)

{

let result = [];

let lastIndex = text.length - 1;

- let hostnameStarted = false;

+ let hostnameStart = null;

let hostnameFinished = false;

- let caseSensitive = false;

+ let canSafelyMatchAsLowercase = false;

for (let i = 0; i < text.length; i++)

{

let c = text[i];

+ // If we're currently inside the hostname we have to be careful not to

+ // escape any characters until after we have converted it to punycode.

+ if (hostnameStart != null && !hostnameFinished)

+ {

+ if (c == "*" || c == "^" || c == "?" || c == "/" || i == lastIndex)

Sebastian Noack 2016/02/27 20:30:38 If you turn the logic here the other way around, y

Sebastian Noack 2016/02/27 20:30:38 I'm not entirely sure if the case of last index is

kzar 2016/02/27 21:28:53 Done.

kzar 2016/02/27 21:28:53 Good point but it's even more complicated, what if

+ {

+ hostnameFinished = true;

+ let hostname = text.substring(hostnameStart, i);

+ result.push(escapeRegExp(punycode.toASCII(hostname)));

+ }

+ else

+ continue;

+ }

switch (c)

{

case "*":

- if (hostnameStarted)

- hostnameFinished = true;

if (result.length > 0 && i < lastIndex && text[i + 1] != "*")

result.push(".*");

break;

case "^":

- if (hostnameStarted)

- hostnameFinished = true;

if (i < lastIndex)

result.push(".");

break;

@@ -114,45 +125,41 @@ function toRegExp(text)

}

if (i == 1 && text[0] == "|")

{

- hostnameStarted = caseSensitive = true;

result.push("https?://");

Sebastian Noack 2016/02/27 20:30:38 Nit: Mind moving that line to just above the break

kzar 2016/02/27 21:28:53 Done.

+ hostnameStart = i + 1;

+ canSafelyMatchAsLowercase = true;

break;

}

- result.push("\\", c);

+ result.push("\\|");

break;

- case "?":

- if (hostnameStarted)

- hostnameFinished = true;

- case ".": case "+": case "$": case "{": case "}":

+ case "/":

+ result.push("/");

Sebastian Noack 2016/02/27 20:30:38 Nit: Mind moving that line to just above the break

kzar 2016/02/27 21:28:53 Done.

+ if (!hostnameFinished &&

Sebastian Noack 2016/02/27 20:30:38 Nit: It doesn't matter, but I personally find that

kzar 2016/02/27 21:28:53 I'd rather leave this one as it is.

+ text.charAt(i-2) == ":" && text.charAt(i-1) == "/")

+ {

+ hostnameStart = i + 1;

+ canSafelyMatchAsLowercase = true;

+ }

+ break;

+ case ".": case "+": case "$": case "{": case "}": case "?":

Sebastian Noack 2016/02/27 20:30:38 Nit: I think the way this block was originally wra

kzar 2016/02/27 21:28:53 Done.

case "(": case ")": case "[": case "]": case "\\":

result.push("\\", c);

break;

- case "/":

- if (hostnameStarted)

- hostnameFinished = true;

- else if (text.charAt(i-2) == ":" && text.charAt(i-1) == "/")

- hostnameStarted = caseSensitive = true;

kzar 2016/02/27 14:29:34 (I've switched this around as I decided that sneak

default:

if (hostnameFinished && (c >= "a" && c <= "z" ||

c >= "A" && c <= "Z"))

- caseSensitive = false;

+ canSafelyMatchAsLowercase = false;

result.push(c);

}

- return {regexp: result.join(""), caseSensitive: caseSensitive};

+ return {regexp: result.join(""),

+ canSafelyMatchAsLowercase: canSafelyMatchAsLowercase};

}

function getRegExpTrigger(filter)

{

- let result = toRegExp(filter.regexpSource.replace(

- // Safari expects punycode, filter lists use unicode

- /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i,

- function (match, prefix, domain)

- {

- return prefix + punycode.toASCII(domain);

- }

- ));

+ let result = toRegExp(filter.regexpSource);

let trigger = {"url-filter": result.regexp};

@@ -162,10 +169,10 @@ function getRegExpTrigger(filter)

// For rules containing only a hostname we know that we're matching against

// a lowercase string unless the matchCase option was passed.

- if (result.caseSensitive && !filter.matchCase)

+ if (result.canSafelyMatchAsLowercase && !filter.matchCase)

trigger["url-filter"] = trigger["url-filter"].toLowerCase();

- if (result.caseSensitive || filter.matchCase)

+ if (result.canSafelyMatchAsLowercase || filter.matchCase)

trigger["url-filter-is-case-sensitive"] = true;

return trigger;

« no previous file with comments | « abp2blocklist.js ('k') | no next file » | no next file with comments »