Index: lib/abp2blocklist.js |
diff --git a/lib/abp2blocklist.js b/lib/abp2blocklist.js |
index 1c1ff6cf57607e7344359dc9109a616c793df721..6723f14ba78ec068815cee8285cd05c38fc9d3e3 100644 |
--- a/lib/abp2blocklist.js |
+++ b/lib/abp2blocklist.js |
@@ -66,38 +66,50 @@ function convertElemHideFilter(filter, elemhideSelectorExceptions) |
} |
/** |
- * Convert the given filter "regexpSource" string into a regular expression. |
+ * Convert the given filter "regexpSource" string into a regular expression, |
+ * handling the conversion of unicode inside hostnames to punycode. |
* (Also deciding if the regular expression can be safely converted to and |
* matched as lower case or not.) |
* |
* @param {string} text regexpSource property of a filter |
* @returns {object} An object containing a regular expression string and a bool |
* indicating if the filter can be safely matched as lower |
- * case: {regexp: "...", caseSenstive: true/false} |
+ * case: {regexp: "...", canSafelyMatchAsLowercase: true/false} |
*/ |
function toRegExp(text) |
{ |
let result = []; |
let lastIndex = text.length - 1; |
- let hostnameStarted = false; |
+ let hostnameStart = null; |
let hostnameFinished = false; |
- let caseSensitive = false; |
+ let canSafelyMatchAsLowercase = false; |
for (let i = 0; i < text.length; i++) |
{ |
let c = text[i]; |
+ // If we're currently inside the hostname we have to be careful not to |
+ // escape any characters until after we have converted it to punycode. |
+ if (hostnameStart != null && !hostnameFinished) |
+ { |
+ let endingChar = (c == "*" || c == "^" || c == "?" || c == "/"); |
Sebastian Noack
2016/02/27 23:06:16
I know we didn't handle it before, but what's if w
kzar
2016/03/07 17:06:48
So do we want to always consider "|" to end the ho
Sebastian Noack
2016/03/08 09:31:22
I guess, for simplicity we can just assume that an
kzar
2016/03/08 12:36:01
Yea, sounds good to me. Also we already know that
|
+ if (!endingChar && i != lastIndex) |
+ continue; |
+ |
+ let hostname = text.substring(hostnameStart, endingChar ? i : i + 1); |
+ hostnameFinished = true; |
+ result.push(escapeRegExp(punycode.toASCII(hostname))); |
+ if (!endingChar) |
+ break; |
+ } |
+ |
switch (c) |
{ |
case "*": |
- if (hostnameStarted) |
- hostnameFinished = true; |
if (result.length > 0 && i < lastIndex && text[i + 1] != "*") |
result.push(".*"); |
break; |
case "^": |
- if (hostnameStarted) |
- hostnameFinished = true; |
if (i < lastIndex) |
result.push("."); |
break; |
@@ -114,45 +126,42 @@ function toRegExp(text) |
} |
if (i == 1 && text[0] == "|") |
{ |
- hostnameStarted = caseSensitive = true; |
+ hostnameStart = i + 1; |
+ canSafelyMatchAsLowercase = true; |
result.push("https?://"); |
break; |
} |
- result.push("\\", c); |
+ result.push("\\|"); |
+ break; |
+ case "/": |
+ if (!hostnameFinished && |
+ text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
+ { |
+ hostnameStart = i + 1; |
+ canSafelyMatchAsLowercase = true; |
+ } |
+ result.push("/"); |
break; |
- case "?": |
- if (hostnameStarted) |
- hostnameFinished = true; |
- case ".": case "+": case "$": case "{": case "}": |
- case "(": case ")": case "[": case "]": case "\\": |
+ case ".": case "+": case "$": case "?": |
+ case "{": case "}": case "(": case ")": |
+ case "[": case "]": case "\\": |
result.push("\\", c); |
break; |
- case "/": |
- if (hostnameStarted) |
- hostnameFinished = true; |
- else if (text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
- hostnameStarted = caseSensitive = true; |
default: |
if (hostnameFinished && (c >= "a" && c <= "z" || |
c >= "A" && c <= "Z")) |
- caseSensitive = false; |
+ canSafelyMatchAsLowercase = false; |
result.push(c); |
} |
} |
- return {regexp: result.join(""), caseSensitive: caseSensitive}; |
+ return {regexp: result.join(""), |
+ canSafelyMatchAsLowercase: canSafelyMatchAsLowercase}; |
} |
function getRegExpTrigger(filter) |
{ |
- let result = toRegExp(filter.regexpSource.replace( |
- // Safari expects punycode, filter lists use unicode |
- /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i, |
- function (match, prefix, domain) |
- { |
- return prefix + punycode.toASCII(domain); |
- } |
- )); |
+ let result = toRegExp(filter.regexpSource); |
let trigger = {"url-filter": result.regexp}; |
@@ -162,10 +171,10 @@ function getRegExpTrigger(filter) |
// For rules containing only a hostname we know that we're matching against |
// a lowercase string unless the matchCase option was passed. |
- if (result.caseSensitive && !filter.matchCase) |
+ if (result.canSafelyMatchAsLowercase && !filter.matchCase) |
trigger["url-filter"] = trigger["url-filter"].toLowerCase(); |
- if (result.caseSensitive || filter.matchCase) |
+ if (result.canSafelyMatchAsLowercase || filter.matchCase) |
trigger["url-filter-is-case-sensitive"] = true; |
return trigger; |