Index: lib/abp2blocklist.js |
=================================================================== |
--- a/lib/abp2blocklist.js |
+++ b/lib/abp2blocklist.js |
@@ -104,52 +104,61 @@ |
* {regexp: "...", |
* canSafelyMatchAsLowercase: true/false, |
* hostname: "...", |
* justHostname: true/false} |
*/ |
function parseFilterRegexpSource(text) |
{ |
let regexp = []; |
- let lastIndex = text.length - 1; |
+ |
+ // Convert the text into an array of Unicode characters. |
+ // |
+ // In the case of surrogate pairs (the smiley emoji, for example), one |
+ // Unicode code point is represented by two JavaScript characters together. |
+ // We want to iterate over Unicode code points rather than JavaScript |
+ // characters. |
+ let characters = Array.from(text); |
+ |
+ let lastIndex = characters.length - 1; |
let hostname; |
let hostnameStart = null; |
let hostnameFinished = false; |
let justHostname = false; |
let canSafelyMatchAsLowercase = false; |
- for (let i = 0; i < text.length; i++) |
+ for (let i = 0; i < characters.length; i++) |
{ |
- let c = text[i]; |
+ let c = characters[i]; |
if (hostnameFinished) |
justHostname = false; |
// If we're currently inside the hostname we have to be careful not to |
// escape any characters until after we have converted it to punycode. |
if (hostnameStart != null && !hostnameFinished) |
{ |
let endingChar = (c == "*" || c == "^" || |
c == "?" || c == "/" || c == "|"); |
if (!endingChar && i != lastIndex) |
continue; |
hostname = punycode.toASCII( |
- text.substring(hostnameStart, endingChar ? i : i + 1) |
+ characters.slice(hostnameStart, endingChar ? i : i + 1).join("") |
); |
hostnameFinished = justHostname = true; |
regexp.push(escapeRegExp(hostname)); |
if (!endingChar) |
break; |
} |
switch (c) |
{ |
case "*": |
- if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*") |
+ if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*") |
regexp.push(".*"); |
break; |
case "^": |
if (i < lastIndex) |
regexp.push("."); |
break; |
case "|": |
if (i == 0) |
@@ -157,44 +166,44 @@ |
regexp.push("^"); |
break; |
} |
if (i == lastIndex) |
{ |
regexp.push("$"); |
break; |
} |
- if (i == 1 && text[0] == "|") |
+ if (i == 1 && characters[0] == "|") |
{ |
hostnameStart = i + 1; |
canSafelyMatchAsLowercase = true; |
regexp.push("https?://([^/]+\\.)?"); |
break; |
} |
regexp.push("\\|"); |
break; |
case "/": |
if (!hostnameFinished && |
- text.charAt(i-2) == ":" && text.charAt(i-1) == "/") |
+ characters[i - 2] == ":" && characters[i - 1] == "/") |
{ |
hostnameStart = i + 1; |
canSafelyMatchAsLowercase = true; |
} |
regexp.push("/"); |
break; |
case ".": case "+": case "$": case "?": |
case "{": case "}": case "(": case ")": |
case "[": case "]": case "\\": |
regexp.push("\\", c); |
break; |
default: |
if (hostnameFinished && (c >= "a" && c <= "z" || |
c >= "A" && c <= "Z")) |
canSafelyMatchAsLowercase = false; |
- regexp.push(c); |
+ regexp.push(c == "%" ? c : encodeURI(c)); |
} |
} |
return { |
regexp: regexp.join(""), |
canSafelyMatchAsLowercase: canSafelyMatchAsLowercase, |
hostname: hostname, |
justHostname: justHostname |