lib/abp2blocklist.js - Issue 29467595: Issue 5325 - Add support for separator characters

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29467595: Issue 5325 - Add support for separator characters (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist

Left Patch Set: Created June 16, 2017, 5:25 p.m.

Right Patch Set: Rebase Created July 12, 2017, 12:45 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 /*	1 /*

2 * This file is part of Adblock Plus <https://adblockplus.org/>,	2 * This file is part of Adblock Plus <https://adblockplus.org/>,

3 * Copyright (C) 2006-2017 eyeo GmbH	3 * Copyright (C) 2006-2017 eyeo GmbH

4 *	4 *

5 * Adblock Plus is free software: you can redistribute it and/or modify	5 * Adblock Plus is free software: you can redistribute it and/or modify

6 * it under the terms of the GNU General Public License version 3 as	6 * it under the terms of the GNU General Public License version 3 as

7 * published by the Free Software Foundation.	7 * published by the Free Software Foundation.

8 *	8 *

9 * Adblock Plus is distributed in the hope that it will be useful,	9 * Adblock Plus is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

(...skipping 58 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
69 let subdomains = [];	69 let subdomains = [];

70 let suffixLength = domain.length + 1;	70 let suffixLength = domain.length + 1;

71	71

72 for (let name of list)	72 for (let name of list)

73 {	73 {

74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)	74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)

75 subdomains.push(name.slice(0, -suffixLength));	75 subdomains.push(name.slice(0, -suffixLength));

76 }	76 }

77	77

78 return subdomains;	78 return subdomains;

	79 }

	80

	81 function extractFilterDomains(filters)

	82 {

	83 let domains = new Set();

	84 for (let filter of filters)

	85 {

	86 let parsed = parseFilterRegexpSource(filter.regexpSource);

	87 if (parsed.justHostname)

	88 domains.add(parsed.hostname);

	89 }

	90 return domains;

79 }	91 }

80	92

81 function convertElemHideFilter(filter, elemhideSelectorExceptions)	93 function convertElemHideFilter(filter, elemhideSelectorExceptions)

82 {	94 {

83 let included = [];	95 let included = [];

84 let excluded = [];	96 let excluded = [];

85 let rules = [];	97 let rules = [];

86	98

87 parseDomains(filter.domains, included, excluded);	99 parseDomains(filter.domains, included, excluded);

88	100

(...skipping 13 matching lines...) Expand all Loading...
102 * case, a hostname string (or undefined) and a bool	114 * case, a hostname string (or undefined) and a bool

103 * indicating if the source only contains a hostname or not:	115 * indicating if the source only contains a hostname or not:

104 * {regexp: "...",	116 * {regexp: "...",

105 * canSafelyMatchAsLowercase: true/false,	117 * canSafelyMatchAsLowercase: true/false,

106 * hostname: "...",	118 * hostname: "...",

107 * justHostname: true/false}	119 * justHostname: true/false}

108 */	120 */

109 function parseFilterRegexpSource(text)	121 function parseFilterRegexpSource(text)

110 {	122 {

111 let regexp = [];	123 let regexp = [];

112 let lastIndex = text.length - 1;	124

	125 // Convert the text into an array of Unicode characters.

	126 //

	127 // In the case of surrogate pairs (the smiley emoji, for example), one

	128 // Unicode code point is represented by two JavaScript characters together.

	129 // We want to iterate over Unicode code points rather than JavaScript

	130 // characters.

	131 let characters = Array.from(text);

	132

	133 let lastIndex = characters.length - 1;

113 let hostname;	134 let hostname;

114 let hostnameStart = null;	135 let hostnameStart = null;

115 let hostnameFinished = false;	136 let hostnameFinished = false;

116 let justHostname = false;	137 let justHostname = false;

117 let canSafelyMatchAsLowercase = false;	138 let canSafelyMatchAsLowercase = false;

118	139

119 for (let i = 0; i < text.length; i++)	140 for (let i = 0; i < characters.length; i++)

120 {	141 {

121 let c = text[i];	142 let c = characters[i];

122	143

123 if (hostnameFinished)	144 if (hostnameFinished)

124 justHostname = false;	145 justHostname = false;

125	146

126 // If we're currently inside the hostname we have to be careful not to	147 // If we're currently inside the hostname we have to be careful not to

127 // escape any characters until after we have converted it to punycode.	148 // escape any characters until after we have converted it to punycode.

128 if (hostnameStart != null && !hostnameFinished)	149 if (hostnameStart != null && !hostnameFinished)

129 {	150 {

130 let endingChar = (c == "*" \|\| c == "^" \|\|	151 let endingChar = (c == "*" \|\| c == "^" \|\|

131 c == "?" \|\| c == "/" \|\| c == "\|");	152 c == "?" \|\| c == "/" \|\| c == "\|");

132 if (!endingChar && i != lastIndex)	153 if (!endingChar && i != lastIndex)

133 continue;	154 continue;

134	155

135 hostname = punycode.toASCII(	156 hostname = punycode.toASCII(

136 text.substring(hostnameStart, endingChar ? i : i + 1)	157 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")

	158 .toLowerCase()

137 );	159 );

138 hostnameFinished = justHostname = true;	160 hostnameFinished = justHostname = true;

139 regexp.push(escapeRegExp(hostname));	161 regexp.push(escapeRegExp(hostname));

140 if (!endingChar)	162 if (!endingChar)

141 break;	163 break;

142 }	164 }

143	165

144 switch (c)	166 switch (c)

145 {	167 {

146 case "*":	168 case "*":

147 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*")	169 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")

148 regexp.push(".*");	170 regexp.push(".*");

149 break;	171 break;

150 case "^":	172 case "^":

151 if (i < lastIndex)	173 let alphabet = "a-z";

152 regexp.push("[^.%A-Za-z0-9_]");	174 // If justHostname is true and we've encountered a "^", it means we're

	175 // still in the hostname part of the URL. Since hostnames are always

	176 // lower case (Punycode), there's no need to include "A-Z" in the

	177 // pattern. Further, subsequent code may lower-case the entire regular

	178 // expression (if the URL contains only the hostname part), leaving us

	179 // with "a-za-z", which would be redundant.

	180 if (!justHostname)

	181 alphabet = "A-Z" + alphabet;

	182 let digits = "0-9";

	183 // Note that the "-" must appear first here in order to retain its

	184 // literal meaning within the brackets.

	185 let specialCharacters = "-_.%";

	186 let separator = "[^" + specialCharacters + alphabet + digits + "]";

	187 if (i == 0)

	188 regexp.push("^https?://(.*" + separator + ")?");

	189 else if (i == lastIndex)

	190 regexp.push("(" + separator + ".*)?$");

153 else	191 else

154 regexp.push("([^.%A-Za-z0-9_].*)?$");	192 regexp.push(separator);
[Review comment] Sebastian Noack (2017/06/16 21:13:18): Can you put the duplicated part of the regexp in a variable? — Manish Jethani (2017/06/19 10:39:54): Done.
155 canSafelyMatchAsLowercase = false;
[Review comment] Sebastian Noack (2017/06/16 21:13:18): Why is that necessary? — Manish Jethani (2017/06/19 10:39:54): It was converting "A-Z" into "a-z". I thought about this a little more, and it's not necessary. Instead, we just remove the "A-Z" if the separator occurs at the end of the hostname.
156 break;	193 break;

157 case "\|":	194 case "\|":

158 if (i == 0)	195 if (i == 0)

159 {	196 {

160 regexp.push("^");	197 regexp.push("^");

161 break;	198 break;

162 }	199 }

163 if (i == lastIndex)	200 if (i == lastIndex)

164 {	201 {

165 regexp.push("$");	202 regexp.push("$");

166 break;	203 break;

167 }	204 }

168 if (i == 1 && text[0] == "\|")	205 if (i == 1 && characters[0] == "\|")

169 {	206 {

170 hostnameStart = i + 1;	207 hostnameStart = i + 1;

171 canSafelyMatchAsLowercase = true;	208 canSafelyMatchAsLowercase = true;

172 regexp.push("https?://([^/]+\\.)?");	209 regexp.push("https?://([^/]+\\.)?");

173 break;	210 break;

174 }	211 }

175 regexp.push("\\\|");	212 regexp.push("\\\|");

176 break;	213 break;

177 case "/":	214 case "/":

178 if (!hostnameFinished &&	215 if (!hostnameFinished &&

179 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")	216 characters[i - 2] == ":" && characters[i - 1] == "/")

180 {	217 {

181 hostnameStart = i + 1;	218 hostnameStart = i + 1;

182 canSafelyMatchAsLowercase = true;	219 canSafelyMatchAsLowercase = true;

183 }	220 }

184 regexp.push("/");	221 regexp.push("/");

185 break;	222 break;

186 case ".": case "+": case "$": case "?":	223 case ".": case "+": case "$": case "?":

187 case "{": case "}": case "(": case ")":	224 case "{": case "}": case "(": case ")":

188 case "[": case "]": case "\\":	225 case "[": case "]": case "\\":

189 regexp.push("\\", c);	226 regexp.push("\\", c);

190 break;	227 break;

191 default:	228 default:

192 if (hostnameFinished && (c >= "a" && c <= "z" \|\|	229 if (hostnameFinished && (c >= "a" && c <= "z" \|\|

193 c >= "A" && c <= "Z"))	230 c >= "A" && c <= "Z"))

194 canSafelyMatchAsLowercase = false;	231 canSafelyMatchAsLowercase = false;

195 regexp.push(c);	232 regexp.push(c == "%" ? c : encodeURI(c));

196 }	233 }

197 }	234 }

198	235

199 return {	236 return {

200 regexp: regexp.join(""),	237 regexp: regexp.join(""),

201 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,	238 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,

202 hostname: hostname,	239 hostname: hostname,

203 justHostname: justHostname	240 justHostname: justHostname

204 };	241 };

205 }	242 }

(...skipping 66 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
272 let included = [];	309 let included = [];

273 let excluded = [];	310 let excluded = [];

274	311

275 parseDomains(filter.domains, included, excluded);	312 parseDomains(filter.domains, included, excluded);

276	313

277 if (exceptionDomains)	314 if (exceptionDomains)

278 excluded = excluded.concat(exceptionDomains);	315 excluded = excluded.concat(exceptionDomains);

279	316

280 if (withResourceTypes)	317 if (withResourceTypes)

281 {	318 {

282 trigger["resource-type"] = getResourceTypes(filter);	319 let resourceTypes = getResourceTypes(filter);

283	320

284 if (trigger["resource-type"].length == 0)	321 // Content blocker rules can't differentiate between sub-document requests

	322 // (iframes) and top-level document requests. To avoid too many false

	323 // positives, we prevent rules with no hostname part from blocking document

	324 // requests.

	325 //

	326 // Once Safari 11 becomes our minimum supported version, we could change

	327 // our approach here to use the new "unless-top-url" property instead.

	328 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)

	329 resourceTypes = resourceTypes.filter(type => type != "document");

	330

	331 if (resourceTypes.length == 0)

285 return;	332 return;

	333

	334 trigger["resource-type"] = resourceTypes;

286 }	335 }

287	336

288 if (filter.thirdParty != null)	337 if (filter.thirdParty != null)

289 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];	338 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];

290	339

291 if (included.length > 0)	340 if (included.length > 0)

292 {	341 {

293 trigger["if-domain"] = [];	342 trigger["if-domain"] = [];

294	343

295 for (let name of included)	344 for (let name of included)

(...skipping 15 matching lines...) Expand all Loading...
311 {	360 {

312 trigger["if-domain"].push("*" + name);	361 trigger["if-domain"].push("*" + name);

313 }	362 }

314 }	363 }

315 }	364 }

316 else if (excluded.length > 0)	365 else if (excluded.length > 0)

317 {	366 {

318 trigger["unless-domain"] = excluded.map(name => "*" + name);	367 trigger["unless-domain"] = excluded.map(name => "*" + name);

319 }	368 }

320 else if (filter instanceof filterClasses.BlockingFilter &&	369 else if (filter instanceof filterClasses.BlockingFilter &&

321 filter.contentType & typeMap.SUBDOCUMENT)	370 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)

322 {	371 {

	372 // Rules with a hostname part are still allowed to block document requests,

	373 // but we add an exception for top-level documents.

	374 //

	375 // Note that we can only do this if there's no "unless-domain" property for

	376 // now. This also only works in Safari 11 onwards, while older versions

	377 // simply ignore this property. Once Safari 11 becomes our minimum

	378 // supported version, we can merge "unless-domain" into "unless-top-url".

323 trigger["unless-top-url"] = [trigger["url-filter"]];	379 trigger["unless-top-url"] = [trigger["url-filter"]];

324 if (trigger["url-filter-is-case-sensitive"])	380 if (trigger["url-filter-is-case-sensitive"])

325 trigger["top-url-filter-is-case-sensitive"] = true;	381 trigger["top-url-filter-is-case-sensitive"] = true;

326 }	382 }

327	383

328 rules.push({trigger: trigger, action: {type: action}});	384 rules.push({trigger: trigger, action: {type: action}});

329 }

330

331 function hasNonASCI(obj)

332 {

333 if (typeof obj == "string")

334 {

335 if (/[^\x00-\x7F]/.test(obj))

336 return true;

337 }

338

339 if (typeof obj == "object")

340 {

341 if (obj instanceof Array)

342 for (let item of obj)

343 if (hasNonASCI(item))

344 return true;

345

346 let names = Object.getOwnPropertyNames(obj);

347 for (let name of names)

348 if (hasNonASCI(obj[name]))

349 return true;

350 }

351

352 return false;

353 }	385 }

354	386

355 function convertIDSelectorsToAttributeSelectors(selector)	387 function convertIDSelectorsToAttributeSelectors(selector)

356 {	388 {

357 // First we figure out where all the IDs are	389 // First we figure out where all the IDs are

358 let sep = "";	390 let sep = "";

359 let start = null;	391 let start = null;

360 let positions = [];	392 let positions = [];

361 for (let i = 0; i < selector.length; i++)	393 for (let i = 0; i < selector.length; i++)

362 {	394 {

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
396 {	428 {

397 newSelector.push(selector.substring(i, pos.start));	429 newSelector.push(selector.substring(i, pos.start));

398 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');	430 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');

399 i = pos.end;	431 i = pos.end;

400 }	432 }

401 newSelector.push(selector.substring(i));	433 newSelector.push(selector.substring(i));

402	434

403 return newSelector.join("");	435 return newSelector.join("");

404 }	436 }

405	437

406 function addCSSRules(rules, selectors, matchDomain)	438 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)

407 {	439 {

	440 let unlessDomain = exceptionDomains.size > 0 ? [] : null;

	441

	442 exceptionDomains.forEach(name => unlessDomain.push("*" + name));

	443

408 while (selectors.length)	444 while (selectors.length)

409 {	445 {

410 let selector = selectors.splice(0, selectorLimit).join(", ");	446 let selector = selectors.splice(0, selectorLimit).join(", ");

411	447

412 // As of Safari 9.0 element IDs are matched as lowercase. We work around	448 // As of Safari 9.0 element IDs are matched as lowercase. We work around

413 // this by converting to the attribute format [id="elementID"]	449 // this by converting to the attribute format [id="elementID"]

414 selector = convertIDSelectorsToAttributeSelectors(selector);	450 selector = convertIDSelectorsToAttributeSelectors(selector);

415	451

416 rules.push({	452 let rule = {

417 trigger: {"url-filter": matchDomain,	453 trigger: {"url-filter": matchDomain,

418 "url-filter-is-case-sensitive": true},	454 "url-filter-is-case-sensitive": true},

419 action: {type: "css-display-none",	455 action: {type: "css-display-none",

420 selector: selector}	456 selector: selector}

421 });	457 };

	458

	459 if (unlessDomain)

	460 rule.trigger["unless-domain"] = unlessDomain;

	461

	462 rules.push(rule);

422 }	463 }

423 }	464 }

424	465

425 let ContentBlockerList =	466 let ContentBlockerList =

426 /**	467 /**

427 * Create a new Adblock Plus filter to content blocker list converter	468 * Create a new Adblock Plus filter to content blocker list converter

428 *	469 *

429 * @constructor	470 * @constructor

430 */	471 */

431 exports.ContentBlockerList = function ()	472 exports.ContentBlockerList = function ()

(...skipping 76 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
508 {	549 {

509 for (let matchDomain of result.matchDomains)	550 for (let matchDomain of result.matchDomains)

510 {	551 {

511 let group = groupedElemhideFilters.get(matchDomain) \|\| [];	552 let group = groupedElemhideFilters.get(matchDomain) \|\| [];

512 group.push(result.selector);	553 group.push(result.selector);

513 groupedElemhideFilters.set(matchDomain, group);	554 groupedElemhideFilters.set(matchDomain, group);

514 }	555 }

515 }	556 }

516 }	557 }

517	558

518 addCSSRules(rules, genericSelectors, "^https?://");	559 // Separate out the element hiding exceptions that have only a hostname part

519	560 // from the rest. This allows us to implement a workaround for issue #5345

520 // Right after the generic element hiding filters, add the exceptions that	561 // (WebKit bug #167423), but as a bonus it also reduces the number of

521 // should apply only to those filters.	562 // generated rules. The downside is that the exception will only apply to the

522 for (let filter of this.generichideExceptions)	563 // top-level document, not to iframes. We have to live with this until the

523 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);	564 // WebKit bug is fixed in all supported versions of Safari.

	565 // https://bugs.webkit.org/show_bug.cgi?id=167423

	566 //

	567 // Note that as a result of this workaround we end up with a huge rule set in

	568 // terms of the amount of memory used. This can cause Node.js to throw

	569 // "JavaScript heap out of memory". To avoid this, call Node.js with

	570 // --max_old_space_size=4096

	571 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);

	572

	573 let genericSelectorExceptionDomains =

	574 extractFilterDomains(this.generichideExceptions);

	575 elemhideExceptionDomains.forEach(name =>

	576 {

	577 genericSelectorExceptionDomains.add(name);

	578 });

	579

	580 addCSSRules(rules, genericSelectors, "^https?://",

	581 genericSelectorExceptionDomains);

524	582

525 groupedElemhideFilters.forEach((selectors, matchDomain) =>	583 groupedElemhideFilters.forEach((selectors, matchDomain) =>

526 {	584 {

527 addCSSRules(rules, selectors, matchDomain);	585 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);

528 });	586 });

529

530 for (let filter of this.elemhideExceptions)

531 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);

532	587

533 let requestFilterExceptionDomains = [];	588 let requestFilterExceptionDomains = [];

534 for (let filter of this.genericblockExceptions)	589 for (let filter of this.genericblockExceptions)

535 {	590 {

536 let parsed = parseFilterRegexpSource(filter.regexpSource);	591 let parsed = parseFilterRegexpSource(filter.regexpSource);

537 if (parsed.hostname)	592 if (parsed.hostname)

538 requestFilterExceptionDomains.push(parsed.hostname);	593 requestFilterExceptionDomains.push(parsed.hostname);

539 }	594 }

540	595

541 for (let filter of this.requestFilters)	596 for (let filter of this.requestFilters)

542 {	597 {

543 convertFilterAddRules(rules, filter, "block", true,	598 convertFilterAddRules(rules, filter, "block", true,

544 requestFilterExceptionDomains);	599 requestFilterExceptionDomains);

545 }	600 }

546	601

547 for (let filter of this.requestExceptions)	602 for (let filter of this.requestExceptions)

548 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);	603 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);

549	604

550 return rules.filter(rule => !hasNonASCI(rule));	605 return rules;

551 };	606 };

LEFT	RIGHT