Index: lib/abp2blocklist.js |
diff --git a/lib/abp2blocklist.js b/lib/abp2blocklist.js |
index 1c1ff6cf57607e7344359dc9109a616c793df721..069d5153cea8d02a37076bd0b8509c9136f20aee 100644 |
--- a/lib/abp2blocklist.js |
+++ b/lib/abp2blocklist.js |
@@ -66,38 +66,49 @@ function convertElemHideFilter(filter, elemhideSelectorExceptions) |
} |
/** |
- * Convert the given filter "regexpSource" string into a regular expression. |
+ * Convert the given filter "regexpSource" string into a regular expression, |
+ * handling the conversion of unicode inside hostnames to punycode. |
* (Also deciding if the regular expression can be safely converted to and |
* matched as lower case or not.) |
* |
* @param {string} text regexpSource property of a filter |
* @returns {object} An object containing a regular expression string and a bool |
* indicating if the filter can be safely matched as lower |
- * case: {regexp: "...", caseSenstive: true/false} |
+ * case: {regexp: "...", canSafelyMatchAsLowercase: true/false} |
*/ |
function toRegExp(text) |
{ |
let result = []; |
let lastIndex = text.length - 1; |
- let hostnameStarted = false; |
+ let hostnameStart = null; |
let hostnameFinished = false; |
- let caseSensitive = false; |
+ let canSafelyMatchAsLowercase = false; |
for (let i = 0; i < text.length; i++) |
{ |
let c = text[i]; |
+ // If we're currently inside the hostname we have to be careful not to |
+ // escape any characters until after we have converted it to punycode. |
+ if (hostnameStart != null && !hostnameFinished) |
+ { |
+ if (c == "*" || c == "^" || c == "?" || c == "/" || i == lastIndex) |
Sebastian Noack
2016/02/27 20:30:38
If you turn the logic here the other way around, y
Sebastian Noack
2016/02/27 20:30:38
I'm not entirely sure if the case of last index is
kzar
2016/02/27 21:28:53
Done.
kzar
2016/02/27 21:28:53
Good point but it's even more complicated, what if
|
+ { |
+ hostnameFinished = true; |
+ let hostname = text.substring(hostnameStart, i); |
+ result.push(escapeRegExp(punycode.toASCII(hostname))); |
+ } |
+ else |
+ continue; |
+ } |
+ |
switch (c) |
{ |
case "*": |
- if (hostnameStarted) |
- hostnameFinished = true; |
if (result.length > 0 && i < lastIndex && text[i + 1] != "*") |
result.push(".*"); |
break; |
case "^": |
- if (hostnameStarted) |
- hostnameFinished = true; |
if (i < lastIndex) |
result.push("."); |
break; |
@@ -114,45 +125,41 @@ function toRegExp(text) |
} |
if (i == 1 && text[0] == "|") |
{ |
- hostnameStarted = caseSensitive = true; |
result.push("https?://"); |
Sebastian Noack
2016/02/27 20:30:38
Nit: Mind moving that line to just above the break
kzar
2016/02/27 21:28:53
Done.
|
+ hostnameStart = i + 1; |
+ canSafelyMatchAsLowercase = true; |
break; |
} |
- result.push("\\", c); |
+ result.push("\\|"); |
break; |
- case "?": |
- if (hostnameStarted) |
- hostnameFinished = true; |
- case ".": case "+": case "$": case "{": case "}": |
+ case "/": |
+ result.push("/"); |
Sebastian Noack
2016/02/27 20:30:38
Nit: Mind moving that line to just above the break
kzar
2016/02/27 21:28:53
Done.
|
+ if (!hostnameFinished && |
Sebastian Noack
2016/02/27 20:30:38
Nit: It doesn't matter, but I personally find that
kzar
2016/02/27 21:28:53
I'd rather leave this one as it is.
|
+ text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
+ { |
+ hostnameStart = i + 1; |
+ canSafelyMatchAsLowercase = true; |
+ } |
+ break; |
+ case ".": case "+": case "$": case "{": case "}": case "?": |
Sebastian Noack
2016/02/27 20:30:38
Nit: I think the way this block was originally wra
kzar
2016/02/27 21:28:53
Done.
|
case "(": case ")": case "[": case "]": case "\\": |
result.push("\\", c); |
break; |
- case "/": |
- if (hostnameStarted) |
- hostnameFinished = true; |
- else if (text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
- hostnameStarted = caseSensitive = true; |
kzar
2016/02/27 14:29:34
(I've switched this around as I decided that sneak
|
default: |
if (hostnameFinished && (c >= "a" && c <= "z" || |
c >= "A" && c <= "Z")) |
- caseSensitive = false; |
+ canSafelyMatchAsLowercase = false; |
result.push(c); |
} |
} |
- return {regexp: result.join(""), caseSensitive: caseSensitive}; |
+ return {regexp: result.join(""), |
+ canSafelyMatchAsLowercase: canSafelyMatchAsLowercase}; |
} |
function getRegExpTrigger(filter) |
{ |
- let result = toRegExp(filter.regexpSource.replace( |
- // Safari expects punycode, filter lists use unicode |
- /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i, |
- function (match, prefix, domain) |
- { |
- return prefix + punycode.toASCII(domain); |
- } |
- )); |
+ let result = toRegExp(filter.regexpSource); |
let trigger = {"url-filter": result.regexp}; |
@@ -162,10 +169,10 @@ function getRegExpTrigger(filter) |
// For rules containing only a hostname we know that we're matching against |
// a lowercase string unless the matchCase option was passed. |
- if (result.caseSensitive && !filter.matchCase) |
+ if (result.canSafelyMatchAsLowercase && !filter.matchCase) |
trigger["url-filter"] = trigger["url-filter"].toLowerCase(); |
- if (result.caseSensitive || filter.matchCase) |
+ if (result.canSafelyMatchAsLowercase || filter.matchCase) |
trigger["url-filter-is-case-sensitive"] = true; |
return trigger; |