| Index: lib/abp2blocklist.js |
| diff --git a/lib/abp2blocklist.js b/lib/abp2blocklist.js |
| index 1c1ff6cf57607e7344359dc9109a616c793df721..6723f14ba78ec068815cee8285cd05c38fc9d3e3 100644 |
| --- a/lib/abp2blocklist.js |
| +++ b/lib/abp2blocklist.js |
| @@ -66,38 +66,50 @@ function convertElemHideFilter(filter, elemhideSelectorExceptions) |
| } |
| /** |
| - * Convert the given filter "regexpSource" string into a regular expression. |
| + * Convert the given filter "regexpSource" string into a regular expression, |
| + * handling the conversion of unicode inside hostnames to punycode. |
| * (Also deciding if the regular expression can be safely converted to and |
| * matched as lower case or not.) |
| * |
| * @param {string} text regexpSource property of a filter |
| * @returns {object} An object containing a regular expression string and a bool |
| * indicating if the filter can be safely matched as lower |
| - * case: {regexp: "...", caseSenstive: true/false} |
| + * case: {regexp: "...", canSafelyMatchAsLowercase: true/false} |
| */ |
| function toRegExp(text) |
| { |
| let result = []; |
| let lastIndex = text.length - 1; |
| - let hostnameStarted = false; |
| + let hostnameStart = null; |
| let hostnameFinished = false; |
| - let caseSensitive = false; |
| + let canSafelyMatchAsLowercase = false; |
| for (let i = 0; i < text.length; i++) |
| { |
| let c = text[i]; |
| + // If we're currently inside the hostname we have to be careful not to |
| + // escape any characters until after we have converted it to punycode. |
| + if (hostnameStart != null && !hostnameFinished) |
| + { |
| + let endingChar = (c == "*" || c == "^" || c == "?" || c == "/"); |
|
Sebastian Noack
2016/02/27 23:06:16
I know we didn't handle it before, but what's if w
kzar
2016/03/07 17:06:48
So do we want to always consider "|" to end the ho
Sebastian Noack
2016/03/08 09:31:22
I guess, for simplicity we can just assume that an
kzar
2016/03/08 12:36:01
Yea, sounds good to me. Also we already know that
|
| + if (!endingChar && i != lastIndex) |
| + continue; |
| + |
| + let hostname = text.substring(hostnameStart, endingChar ? i : i + 1); |
| + hostnameFinished = true; |
| + result.push(escapeRegExp(punycode.toASCII(hostname))); |
| + if (!endingChar) |
| + break; |
| + } |
| + |
| switch (c) |
| { |
| case "*": |
| - if (hostnameStarted) |
| - hostnameFinished = true; |
| if (result.length > 0 && i < lastIndex && text[i + 1] != "*") |
| result.push(".*"); |
| break; |
| case "^": |
| - if (hostnameStarted) |
| - hostnameFinished = true; |
| if (i < lastIndex) |
| result.push("."); |
| break; |
| @@ -114,45 +126,42 @@ function toRegExp(text) |
| } |
| if (i == 1 && text[0] == "|") |
| { |
| - hostnameStarted = caseSensitive = true; |
| + hostnameStart = i + 1; |
| + canSafelyMatchAsLowercase = true; |
| result.push("https?://"); |
| break; |
| } |
| - result.push("\\", c); |
| + result.push("\\|"); |
| + break; |
| + case "/": |
| + if (!hostnameFinished && |
| + text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
| + { |
| + hostnameStart = i + 1; |
| + canSafelyMatchAsLowercase = true; |
| + } |
| + result.push("/"); |
| break; |
| - case "?": |
| - if (hostnameStarted) |
| - hostnameFinished = true; |
| - case ".": case "+": case "$": case "{": case "}": |
| - case "(": case ")": case "[": case "]": case "\\": |
| + case ".": case "+": case "$": case "?": |
| + case "{": case "}": case "(": case ")": |
| + case "[": case "]": case "\\": |
| result.push("\\", c); |
| break; |
| - case "/": |
| - if (hostnameStarted) |
| - hostnameFinished = true; |
| - else if (text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
| - hostnameStarted = caseSensitive = true; |
| default: |
| if (hostnameFinished && (c >= "a" && c <= "z" || |
| c >= "A" && c <= "Z")) |
| - caseSensitive = false; |
| + canSafelyMatchAsLowercase = false; |
| result.push(c); |
| } |
| } |
| - return {regexp: result.join(""), caseSensitive: caseSensitive}; |
| + return {regexp: result.join(""), |
| + canSafelyMatchAsLowercase: canSafelyMatchAsLowercase}; |
| } |
| function getRegExpTrigger(filter) |
| { |
| - let result = toRegExp(filter.regexpSource.replace( |
| - // Safari expects punycode, filter lists use unicode |
| - /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i, |
| - function (match, prefix, domain) |
| - { |
| - return prefix + punycode.toASCII(domain); |
| - } |
| - )); |
| + let result = toRegExp(filter.regexpSource); |
| let trigger = {"url-filter": result.regexp}; |
| @@ -162,10 +171,10 @@ function getRegExpTrigger(filter) |
| // For rules containing only a hostname we know that we're matching against |
| // a lowercase string unless the matchCase option was passed. |
| - if (result.caseSensitive && !filter.matchCase) |
| + if (result.canSafelyMatchAsLowercase && !filter.matchCase) |
| trigger["url-filter"] = trigger["url-filter"].toLowerCase(); |
| - if (result.caseSensitive || filter.matchCase) |
| + if (result.canSafelyMatchAsLowercase || filter.matchCase) |
| trigger["url-filter-is-case-sensitive"] = true; |
| return trigger; |