lib/abp2blocklist.js - Issue 29337803: Issue 3710 - Unify hostname logic

Side by Side Diff: lib/abp2blocklist.js

Issue 29337803: Issue 3710 - Unify hostname logic (Closed)

Patch Set: Addressed feedback and considered crazy edge cases Created Feb. 27, 2016, 9:27 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * This file is part of Adblock Plus <https://adblockplus.org/>,	2 * This file is part of Adblock Plus <https://adblockplus.org/>,

3 * Copyright (C) 2006-2016 Eyeo GmbH	3 * Copyright (C) 2006-2016 Eyeo GmbH

4 *	4 *

5 * Adblock Plus is free software: you can redistribute it and/or modify	5 * Adblock Plus is free software: you can redistribute it and/or modify

6 * it under the terms of the GNU General Public License version 3 as	6 * it under the terms of the GNU General Public License version 3 as

7 * published by the Free Software Foundation.	7 * published by the Free Software Foundation.

8 *	8 *

9 * Adblock Plus is distributed in the hope that it will be useful,	9 * Adblock Plus is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
59 let excluded = [];	59 let excluded = [];

60 let rules = [];	60 let rules = [];

61	61

62 parseDomains(filter.domains, included, excluded);	62 parseDomains(filter.domains, included, excluded);

63	63

64 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))	64 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))

65 return {matchDomains: included.map(matchDomain), selector: filter.selector};	65 return {matchDomains: included.map(matchDomain), selector: filter.selector};

66 }	66 }

67	67

68 /**	68 /**

69 * Convert the given filter "regexpSource" string into a regular expression.	69 * Convert the given filter "regexpSource" string into a regular expression,

	70 * handling the conversion of unicode inside hostnames to punycode.

70 * (Also deciding if the regular expression can be safely converted to and	71 * (Also deciding if the regular expression can be safely converted to and

71 * matched as lower case or not.)	72 * matched as lower case or not.)

72 *	73 *

73 * @param {string} text regexpSource property of a filter	74 * @param {string} text regexpSource property of a filter

74 * @returns {object} An object containing a regular expression string and a bool	75 * @returns {object} An object containing a regular expression string and a bool

75 * indicating if the filter can be safely matched as lower	76 * indicating if the filter can be safely matched as lower

76 * case: {regexp: "...", caseSenstive: true/false}	77 * case: {regexp: "...", canSafelyMatchAsLowercase: true/false }

77 */	78 */

78 function toRegExp(text)	79 function toRegExp(text)

79 {	80 {

80 let result = [];	81 let result = [];

81 let lastIndex = text.length - 1;	82 let lastIndex = text.length - 1;

82 let hostnameStarted = false;	83 let hostnameStart = null;

83 let hostnameFinished = false;	84 let hostnameFinished = false;

84 let caseSensitive = false;	85 let canSafelyMatchAsLowercase = false;

85	86

86 for (let i = 0; i < text.length; i++)	87 for (let i = 0; i < text.length; i++)

87 {	88 {

88 let c = text[i];	89 let c = text[i];

89	90

	91 // If we're currently inside the hostname we have to be careful not to

	92 // escape any characters until after we have converted it to punycode.

	93 if (hostnameStart != null && !hostnameFinished)

	94 {

	95 let endingChar = (c == "*" \|\| c == "^" \|\| c == "?" \|\| c == "/");
	Sebastian Noack 2016/02/27 23:06:16: I know we didn't handle it before, but what if we have a \| in the last position? It implies the end of the URL, and therefore also the end of the hostname.
	kzar 2016/03/07 17:06:48: So do we want to always consider "\|" to end the hostname, or only when it also happens to be the last character?
	Sebastian Noack 2016/03/08 09:31:22: I guess, for simplicity we can just assume that any "\|" (except in the beginning) implies the end of the URL. It cannot occur in a domain, and if it occurs in the path/query it has to be escaped, anyway.
	kzar 2016/03/08 12:36:01: Yea, sounds good to me. Also we already know that we've passed the first character here as hostnameStart isn't null which makes things even easier.
	96 if (!endingChar && i != lastIndex)

	97 continue;

	98

	99 let hostname = text.substring(hostnameStart, endingChar ? i : i + 1);

	100 hostnameFinished = true;

	101 result.push(escapeRegExp(punycode.toASCII(hostname)));

	102 if (!endingChar)

	103 break;

	104 }

	105

90 switch (c)	106 switch (c)

91 {	107 {

92 case "*":	108 case "*":

93 if (hostnameStarted)

94 hostnameFinished = true;

95 if (result.length > 0 && i < lastIndex && text[i + 1] != "*")	109 if (result.length > 0 && i < lastIndex && text[i + 1] != "*")

96 result.push(".*");	110 result.push(".*");

97 break;	111 break;

98 case "^":	112 case "^":

99 if (hostnameStarted)

100 hostnameFinished = true;

101 if (i < lastIndex)	113 if (i < lastIndex)

102 result.push(".");	114 result.push(".");

103 break;	115 break;

104 case "\|":	116 case "\|":

105 if (i == 0)	117 if (i == 0)

106 {	118 {

107 result.push("^");	119 result.push("^");

108 break;	120 break;

109 }	121 }

110 if (i == lastIndex)	122 if (i == lastIndex)

111 {	123 {

112 result.push("$");	124 result.push("$");

113 break;	125 break;

114 }	126 }

115 if (i == 1 && text[0] == "\|")	127 if (i == 1 && text[0] == "\|")

116 {	128 {

117 hostnameStarted = caseSensitive = true;	129 hostnameStart = i + 1;

	130 canSafelyMatchAsLowercase = true;

118 result.push("https?://");	131 result.push("https?://");

119 break;	132 break;

120 }	133 }

	134 result.push("\\\|");

	135 break;

	136 case "/":

	137 if (!hostnameFinished &&

	138 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")

	139 {

	140 hostnameStart = i + 1;

	141 canSafelyMatchAsLowercase = true;

	142 }

	143 result.push("/");

	144 break;

	145 case ".": case "+": case "$": case "?":

	146 case "{": case "}": case "(": case ")":

	147 case "[": case "]": case "\\":

121 result.push("\\", c);	148 result.push("\\", c);

122 break;	149 break;

123 case "?":

124 if (hostnameStarted)

125 hostnameFinished = true;

126 case ".": case "+": case "$": case "{": case "}":

127 case "(": case ")": case "[": case "]": case "\\":

128 result.push("\\", c);

129 break;

130 case "/":

131 if (hostnameStarted)

132 hostnameFinished = true;

133 else if (text.charAt(i-2) == ":" && text.charAt(i-1) == "/")

134 hostnameStarted = caseSensitive = true;

135 default:	150 default:

136 if (hostnameFinished && (c >= "a" && c <= "z" \|\|	151 if (hostnameFinished && (c >= "a" && c <= "z" \|\|

137 c >= "A" && c <= "Z"))	152 c >= "A" && c <= "Z"))

138 caseSensitive = false;	153 canSafelyMatchAsLowercase = false;

139 result.push(c);	154 result.push(c);

140 }	155 }

141 }	156 }

142	157

143 return {regexp: result.join(""), caseSensitive: caseSensitive};	158 return {regexp: result.join(""),

	159 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase};

144 }	160 }

145	161

146 function getRegExpTrigger(filter)	162 function getRegExpTrigger(filter)

147 {	163 {

148 let result = toRegExp(filter.regexpSource.replace(	164 let result = toRegExp(filter.regexpSource);

149 // Safari expects punycode, filter lists use unicode

150 /^(\\|\\|\|\\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i,

151 function (match, prefix, domain)

152 {

153 return prefix + punycode.toASCII(domain);

154 }

155 ));

156	165

157 let trigger = {"url-filter": result.regexp};	166 let trigger = {"url-filter": result.regexp};

158	167

159 // Limit rules to to HTTP(S) URLs	168 // Limit rules to to HTTP(S) URLs

160 if (!/^(\^\|http)/i.test(trigger["url-filter"]))	169 if (!/^(\^\|http)/i.test(trigger["url-filter"]))

161 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"];	170 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"];

162	171

163 // For rules containing only a hostname we know that we're matching against	172 // For rules containing only a hostname we know that we're matching against

164 // a lowercase string unless the matchCase option was passed.	173 // a lowercase string unless the matchCase option was passed.

165 if (result.caseSensitive && !filter.matchCase)	174 if (result.canSafelyMatchAsLowercase && !filter.matchCase)

166 trigger["url-filter"] = trigger["url-filter"].toLowerCase();	175 trigger["url-filter"] = trigger["url-filter"].toLowerCase();

167	176

168 if (result.caseSensitive \|\| filter.matchCase)	177 if (result.canSafelyMatchAsLowercase \|\| filter.matchCase)

169 trigger["url-filter-is-case-sensitive"] = true;	178 trigger["url-filter-is-case-sensitive"] = true;

170	179

171 return trigger;	180 return trigger;

172 }	181 }

173	182

174 function getResourceTypes(filter)	183 function getResourceTypes(filter)

175 {	184 {

176 let types = [];	185 let types = [];

177	186

178 if (filter.contentType & typeMap.IMAGE)	187 if (filter.contentType & typeMap.IMAGE)

(...skipping 248 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
427	436

428 for (let filter of this.elemhideExceptions)	437 for (let filter of this.elemhideExceptions)

429 addRule(convertFilter(filter, "ignore-previous-rules", false));	438 addRule(convertFilter(filter, "ignore-previous-rules", false));

430 for (let filter of this.requestFilters)	439 for (let filter of this.requestFilters)

431 addRule(convertFilter(filter, "block", true));	440 addRule(convertFilter(filter, "block", true));

432 for (let filter of this.requestExceptions)	441 for (let filter of this.requestExceptions)

433 addRule(convertFilter(filter, "ignore-previous-rules", true));	442 addRule(convertFilter(filter, "ignore-previous-rules", true));

434	443

435 return rules;	444 return rules;

436 };	445 };

OLD	NEW

« no previous file with comments | « abp2blocklist.js ('k') | no next file » | no next file with comments »