Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/abp2blocklist.js

Issue 29337803: Issue 3710 - Unify hostname logic (Closed)
Patch Set: Addressed feedback and considered crazy edge cases Created Feb. 27, 2016, 9:27 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « abp2blocklist.js ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: lib/abp2blocklist.js
diff --git a/lib/abp2blocklist.js b/lib/abp2blocklist.js
index 1c1ff6cf57607e7344359dc9109a616c793df721..6723f14ba78ec068815cee8285cd05c38fc9d3e3 100644
--- a/lib/abp2blocklist.js
+++ b/lib/abp2blocklist.js
@@ -66,38 +66,50 @@ function convertElemHideFilter(filter, elemhideSelectorExceptions)
}
/**
- * Convert the given filter "regexpSource" string into a regular expression.
+ * Convert the given filter "regexpSource" string into a regular expression,
+ * handling the conversion of unicode inside hostnames to punycode.
* (Also deciding if the regular expression can be safely converted to and
* matched as lower case or not.)
*
* @param {string} text regexpSource property of a filter
* @returns {object} An object containing a regular expression string and a bool
* indicating if the filter can be safely matched as lower
- * case: {regexp: "...", caseSenstive: true/false}
+ * case: {regexp: "...", canSafelyMatchAsLowercase: true/false}
*/
function toRegExp(text)
{
let result = [];
let lastIndex = text.length - 1;
- let hostnameStarted = false;
+ let hostnameStart = null;
let hostnameFinished = false;
- let caseSensitive = false;
+ let canSafelyMatchAsLowercase = false;
for (let i = 0; i < text.length; i++)
{
let c = text[i];
+ // If we're currently inside the hostname we have to be careful not to
+ // escape any characters until after we have converted it to punycode.
+ if (hostnameStart != null && !hostnameFinished)
+ {
+ let endingChar = (c == "*" || c == "^" || c == "?" || c == "/");
Sebastian Noack 2016/02/27 23:06:16 I know we didn't handle it before, but what's if w
kzar 2016/03/07 17:06:48 So do we want to always consider "|" to end the ho
Sebastian Noack 2016/03/08 09:31:22 I guess, for simplicity we can just assume that an
kzar 2016/03/08 12:36:01 Yea, sounds good to me. Also we already know that
+ if (!endingChar && i != lastIndex)
+ continue;
+
+ let hostname = text.substring(hostnameStart, endingChar ? i : i + 1);
+ hostnameFinished = true;
+ result.push(escapeRegExp(punycode.toASCII(hostname)));
+ if (!endingChar)
+ break;
+ }
+
switch (c)
{
case "*":
- if (hostnameStarted)
- hostnameFinished = true;
if (result.length > 0 && i < lastIndex && text[i + 1] != "*")
result.push(".*");
break;
case "^":
- if (hostnameStarted)
- hostnameFinished = true;
if (i < lastIndex)
result.push(".");
break;
@@ -114,45 +126,42 @@ function toRegExp(text)
}
if (i == 1 && text[0] == "|")
{
- hostnameStarted = caseSensitive = true;
+ hostnameStart = i + 1;
+ canSafelyMatchAsLowercase = true;
result.push("https?://");
break;
}
- result.push("\\", c);
+ result.push("\\|");
+ break;
+ case "/":
+ if (!hostnameFinished &&
+ text.charAt(i-2) == ":" && text.charAt(i-1) == "/")
+ {
+ hostnameStart = i + 1;
+ canSafelyMatchAsLowercase = true;
+ }
+ result.push("/");
break;
- case "?":
- if (hostnameStarted)
- hostnameFinished = true;
- case ".": case "+": case "$": case "{": case "}":
- case "(": case ")": case "[": case "]": case "\\":
+ case ".": case "+": case "$": case "?":
+ case "{": case "}": case "(": case ")":
+ case "[": case "]": case "\\":
result.push("\\", c);
break;
- case "/":
- if (hostnameStarted)
- hostnameFinished = true;
- else if (text.charAt(i-2) == ":" && text.charAt(i-1) == "/")
- hostnameStarted = caseSensitive = true;
default:
if (hostnameFinished && (c >= "a" && c <= "z" ||
c >= "A" && c <= "Z"))
- caseSensitive = false;
+ canSafelyMatchAsLowercase = false;
result.push(c);
}
}
- return {regexp: result.join(""), caseSensitive: caseSensitive};
+ return {regexp: result.join(""),
+ canSafelyMatchAsLowercase: canSafelyMatchAsLowercase};
}
function getRegExpTrigger(filter)
{
- let result = toRegExp(filter.regexpSource.replace(
- // Safari expects punycode, filter lists use unicode
- /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i,
- function (match, prefix, domain)
- {
- return prefix + punycode.toASCII(domain);
- }
- ));
+ let result = toRegExp(filter.regexpSource);
let trigger = {"url-filter": result.regexp};
@@ -162,10 +171,10 @@ function getRegExpTrigger(filter)
// For rules containing only a hostname we know that we're matching against
// a lowercase string unless the matchCase option was passed.
- if (result.caseSensitive && !filter.matchCase)
+ if (result.canSafelyMatchAsLowercase && !filter.matchCase)
trigger["url-filter"] = trigger["url-filter"].toLowerCase();
- if (result.caseSensitive || filter.matchCase)
+ if (result.canSafelyMatchAsLowercase || filter.matchCase)
trigger["url-filter-is-case-sensitive"] = true;
return trigger;
« no previous file with comments | « abp2blocklist.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld