lib/abp2blocklist.js - Issue 29467595: Issue 5325 - Add support for separator characters

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29467595: Issue 5325 - Add support for separator characters (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist

Left Patch Set: Wrap long lines Created July 9, 2017, 10:48 a.m.

Right Patch Set: Rebase Created July 12, 2017, 12:45 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 /*	1 /*

2 * This file is part of Adblock Plus <https://adblockplus.org/>,	2 * This file is part of Adblock Plus <https://adblockplus.org/>,

3 * Copyright (C) 2006-2017 eyeo GmbH	3 * Copyright (C) 2006-2017 eyeo GmbH

4 *	4 *

5 * Adblock Plus is free software: you can redistribute it and/or modify	5 * Adblock Plus is free software: you can redistribute it and/or modify

6 * it under the terms of the GNU General Public License version 3 as	6 * it under the terms of the GNU General Public License version 3 as

7 * published by the Free Software Foundation.	7 * published by the Free Software Foundation.

8 *	8 *

9 * Adblock Plus is distributed in the hope that it will be useful,	9 * Adblock Plus is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
69 let subdomains = [];	69 let subdomains = [];

70 let suffixLength = domain.length + 1;	70 let suffixLength = domain.length + 1;

71	71

72 for (let name of list)	72 for (let name of list)

73 {	73 {

74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)	74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)

75 subdomains.push(name.slice(0, -suffixLength));	75 subdomains.push(name.slice(0, -suffixLength));

76 }	76 }

77	77

78 return subdomains;	78 return subdomains;

	79 }

	80

	81 function extractFilterDomains(filters)

	82 {

	83 let domains = new Set();

	84 for (let filter of filters)

	85 {

	86 let parsed = parseFilterRegexpSource(filter.regexpSource);

	87 if (parsed.justHostname)

	88 domains.add(parsed.hostname);

	89 }

	90 return domains;

79 }	91 }

80	92

81 function convertElemHideFilter(filter, elemhideSelectorExceptions)	93 function convertElemHideFilter(filter, elemhideSelectorExceptions)

82 {	94 {

83 let included = [];	95 let included = [];

84 let excluded = [];	96 let excluded = [];

85 let rules = [];	97 let rules = [];

86	98

87 parseDomains(filter.domains, included, excluded);	99 parseDomains(filter.domains, included, excluded);

88	100

(...skipping 13 matching lines...) Expand all Loading...
102 * case, a hostname string (or undefined) and a bool	114 * case, a hostname string (or undefined) and a bool

103 * indicating if the source only contains a hostname or not:	115 * indicating if the source only contains a hostname or not:

104 * {regexp: "...",	116 * {regexp: "...",

105 * canSafelyMatchAsLowercase: true/false,	117 * canSafelyMatchAsLowercase: true/false,

106 * hostname: "...",	118 * hostname: "...",

107 * justHostname: true/false}	119 * justHostname: true/false}

108 */	120 */

109 function parseFilterRegexpSource(text)	121 function parseFilterRegexpSource(text)

110 {	122 {

111 let regexp = [];	123 let regexp = [];

112 let lastIndex = text.length - 1;	124

	125 // Convert the text into an array of Unicode characters.

	126 //

	127 // In the case of surrogate pairs (the smiley emoji, for example), one

	128 // Unicode code point is represented by two JavaScript characters together.

	129 // We want to iterate over Unicode code points rather than JavaScript

	130 // characters.

	131 let characters = Array.from(text);

	132

	133 let lastIndex = characters.length - 1;

113 let hostname;	134 let hostname;

114 let hostnameStart = null;	135 let hostnameStart = null;

115 let hostnameFinished = false;	136 let hostnameFinished = false;

116 let justHostname = false;	137 let justHostname = false;

117 let canSafelyMatchAsLowercase = false;	138 let canSafelyMatchAsLowercase = false;

118	139

119 for (let i = 0; i < text.length; i++)	140 for (let i = 0; i < characters.length; i++)

120 {	141 {

121 let c = text[i];	142 let c = characters[i];

122	143

123 if (hostnameFinished)	144 if (hostnameFinished)

124 justHostname = false;	145 justHostname = false;

125	146

126 // If we're currently inside the hostname we have to be careful not to	147 // If we're currently inside the hostname we have to be careful not to

127 // escape any characters until after we have converted it to punycode.	148 // escape any characters until after we have converted it to punycode.

128 if (hostnameStart != null && !hostnameFinished)	149 if (hostnameStart != null && !hostnameFinished)

129 {	150 {

130 let endingChar = (c == "*" || c == "^" ||	151 let endingChar = (c == "*" || c == "^" ||

131 c == "?" || c == "/" || c == "|");	152 c == "?" || c == "/" || c == "|");

132 if (!endingChar && i != lastIndex)	153 if (!endingChar && i != lastIndex)

133 continue;	154 continue;

134	155

135 hostname = punycode.toASCII(	156 hostname = punycode.toASCII(

136 text.substring(hostnameStart, endingChar ? i : i + 1)	157 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")

	158 .toLowerCase()

137 );	159 );

138 hostnameFinished = justHostname = true;	160 hostnameFinished = justHostname = true;

139 regexp.push(escapeRegExp(hostname));	161 regexp.push(escapeRegExp(hostname));

140 if (!endingChar)	162 if (!endingChar)

141 break;	163 break;

142 }	164 }

143	165

144 switch (c)	166 switch (c)

145 {	167 {

146 case "*":	168 case "*":

147 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*")	169 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")

148 regexp.push(".*");	170 regexp.push(".*");

149 break;	171 break;

150 case "^":	172 case "^":

151 let separator = "[^-_.%" + (justHostname ? "" : "A-Z") + "a-z0-9]";	173 let alphabet = "a-z";

	174 // If justHostname is true and we've encountered a "^", it means we're

	175 // still in the hostname part of the URL. Since hostnames are always

	176 // lower case (Punycode), there's no need to include "A-Z" in the

	177 // pattern. Further, subsequent code may lower-case the entire regular

	178 // expression (if the URL contains only the hostname part), leaving us

	179 // with "a-za-z", which would be redundant.

	180 if (!justHostname)

	181 alphabet = "A-Z" + alphabet;

	182 let digits = "0-9";

	183 // Note that the "-" must appear first here in order to retain its

	184 // literal meaning within the brackets.

	185 let specialCharacters = "-_.%";

	186 let separator = "[^" + specialCharacters + alphabet + digits + "]";

152 if (i == 0)	187 if (i == 0)

153 regexp.push("^https?://(.*" + separator + ")?");	188 regexp.push("^https?://(.*" + separator + ")?");

154 else if (i == lastIndex)	189 else if (i == lastIndex)

155 regexp.push("(" + separator + ".*)?$");	190 regexp.push("(" + separator + ".*)?$");

156 else	191 else

157 regexp.push(separator);	192 regexp.push(separator);

158 break;	193 break;

159 case "|":	194 case "|":

160 if (i == 0)	195 if (i == 0)

161 {	196 {

162 regexp.push("^");	197 regexp.push("^");

163 break;	198 break;

164 }	199 }

165 if (i == lastIndex)	200 if (i == lastIndex)

166 {	201 {

167 regexp.push("$");	202 regexp.push("$");

168 break;	203 break;

169 }	204 }

170 if (i == 1 && text[0] == "|")	205 if (i == 1 && characters[0] == "|")

171 {	206 {

172 hostnameStart = i + 1;	207 hostnameStart = i + 1;

173 canSafelyMatchAsLowercase = true;	208 canSafelyMatchAsLowercase = true;

174 regexp.push("https?://([^/]+\\.)?");	209 regexp.push("https?://([^/]+\\.)?");

175 break;	210 break;

176 }	211 }

177 regexp.push("\\|");	212 regexp.push("\\|");

178 break;	213 break;

179 case "/":	214 case "/":

180 if (!hostnameFinished &&	215 if (!hostnameFinished &&

181 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")	216 characters[i - 2] == ":" && characters[i - 1] == "/")

182 {	217 {

183 hostnameStart = i + 1;	218 hostnameStart = i + 1;

184 canSafelyMatchAsLowercase = true;	219 canSafelyMatchAsLowercase = true;

185 }	220 }

186 regexp.push("/");	221 regexp.push("/");

187 break;	222 break;

188 case ".": case "+": case "$": case "?":	223 case ".": case "+": case "$": case "?":

189 case "{": case "}": case "(": case ")":	224 case "{": case "}": case "(": case ")":

190 case "[": case "]": case "\\":	225 case "[": case "]": case "\\":

191 regexp.push("\\", c);	226 regexp.push("\\", c);

192 break;	227 break;

193 default:	228 default:

194 if (hostnameFinished && (c >= "a" && c <= "z" ||	229 if (hostnameFinished && (c >= "a" && c <= "z" ||

195 c >= "A" && c <= "Z"))	230 c >= "A" && c <= "Z"))

196 canSafelyMatchAsLowercase = false;	231 canSafelyMatchAsLowercase = false;

197 regexp.push(c);	232 regexp.push(c == "%" ? c : encodeURI(c));

198 }	233 }

199 }	234 }

200	235

201 return {	236 return {

202 regexp: regexp.join(""),	237 regexp: regexp.join(""),

203 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,	238 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,

204 hostname: hostname,	239 hostname: hostname,

205 justHostname: justHostname	240 justHostname: justHostname

206 };	241 };

207 }	242 }

(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
274 let included = [];	309 let included = [];

275 let excluded = [];	310 let excluded = [];

276	311

277 parseDomains(filter.domains, included, excluded);	312 parseDomains(filter.domains, included, excluded);

278	313

279 if (exceptionDomains)	314 if (exceptionDomains)

280 excluded = excluded.concat(exceptionDomains);	315 excluded = excluded.concat(exceptionDomains);

281	316

282 if (withResourceTypes)	317 if (withResourceTypes)

283 {	318 {

284 trigger["resource-type"] = getResourceTypes(filter);	319 let resourceTypes = getResourceTypes(filter);

285	320

286 if (trigger["resource-type"].length == 0)	321 // Content blocker rules can't differentiate between sub-document requests

	322 // (iframes) and top-level document requests. To avoid too many false

	323 // positives, we prevent rules with no hostname part from blocking document

	324 // requests.

	325 //

	326 // Once Safari 11 becomes our minimum supported version, we could change

	327 // our approach here to use the new "unless-top-url" property instead.

	328 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)

	329 resourceTypes = resourceTypes.filter(type => type != "document");

	330

	331 if (resourceTypes.length == 0)

287 return;	332 return;

	333

	334 trigger["resource-type"] = resourceTypes;

288 }	335 }

289	336

290 if (filter.thirdParty != null)	337 if (filter.thirdParty != null)

291 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];	338 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];

292	339

293 if (included.length > 0)	340 if (included.length > 0)

294 {	341 {

295 trigger["if-domain"] = [];	342 trigger["if-domain"] = [];

296	343

297 for (let name of included)	344 for (let name of included)

(...skipping 15 matching lines...) Expand all Loading...
313 {	360 {

314 trigger["if-domain"].push("*" + name);	361 trigger["if-domain"].push("*" + name);

315 }	362 }

316 }	363 }

317 }	364 }

318 else if (excluded.length > 0)	365 else if (excluded.length > 0)

319 {	366 {

320 trigger["unless-domain"] = excluded.map(name => "*" + name);	367 trigger["unless-domain"] = excluded.map(name => "*" + name);

321 }	368 }

322 else if (filter instanceof filterClasses.BlockingFilter &&	369 else if (filter instanceof filterClasses.BlockingFilter &&

323 filter.contentType & typeMap.SUBDOCUMENT)	370 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)

324 {	371 {

	372 // Rules with a hostname part are still allowed to block document requests,

	373 // but we add an exception for top-level documents.

	374 //

	375 // Note that we can only do this if there's no "unless-domain" property for

	376 // now. This also only works in Safari 11 onwards, while older versions

	377 // simply ignore this property. Once Safari 11 becomes our minimum

	378 // supported version, we can merge "unless-domain" into "unless-top-url".

325 trigger["unless-top-url"] = [trigger["url-filter"]];	379 trigger["unless-top-url"] = [trigger["url-filter"]];

326 if (trigger["url-filter-is-case-sensitive"])	380 if (trigger["url-filter-is-case-sensitive"])

327 trigger["top-url-filter-is-case-sensitive"] = true;	381 trigger["top-url-filter-is-case-sensitive"] = true;

328 }	382 }

329	383

330 rules.push({trigger: trigger, action: {type: action}});	384 rules.push({trigger: trigger, action: {type: action}});

331 }

332

333 function hasNonASCI(obj)

334 {

335 if (typeof obj == "string")

336 {

337 if (/[^\x00-\x7F]/.test(obj))

338 return true;

339 }

340

341 if (typeof obj == "object")

342 {

343 if (obj instanceof Array)

344 for (let item of obj)

345 if (hasNonASCI(item))

346 return true;

347

348 let names = Object.getOwnPropertyNames(obj);

349 for (let name of names)

350 if (hasNonASCI(obj[name]))

351 return true;

352 }

353

354 return false;

355 }	385 }

356	386

357 function convertIDSelectorsToAttributeSelectors(selector)	387 function convertIDSelectorsToAttributeSelectors(selector)

358 {	388 {

359 // First we figure out where all the IDs are	389 // First we figure out where all the IDs are

360 let sep = "";	390 let sep = "";

361 let start = null;	391 let start = null;

362 let positions = [];	392 let positions = [];

363 for (let i = 0; i < selector.length; i++)	393 for (let i = 0; i < selector.length; i++)

364 {	394 {

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
398 {	428 {

399 newSelector.push(selector.substring(i, pos.start));	429 newSelector.push(selector.substring(i, pos.start));

400 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');	430 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');

401 i = pos.end;	431 i = pos.end;

402 }	432 }

403 newSelector.push(selector.substring(i));	433 newSelector.push(selector.substring(i));

404	434

405 return newSelector.join("");	435 return newSelector.join("");

406 }	436 }

407	437

408 function addCSSRules(rules, selectors, matchDomain)	438 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)

409 {	439 {

	440 let unlessDomain = exceptionDomains.size > 0 ? [] : null;

	441

	442 exceptionDomains.forEach(name => unlessDomain.push("*" + name));

	443

410 while (selectors.length)	444 while (selectors.length)

411 {	445 {

412 let selector = selectors.splice(0, selectorLimit).join(", ");	446 let selector = selectors.splice(0, selectorLimit).join(", ");

413	447

414 // As of Safari 9.0 element IDs are matched as lowercase. We work around	448 // As of Safari 9.0 element IDs are matched as lowercase. We work around

415 // this by converting to the attribute format [id="elementID"]	449 // this by converting to the attribute format [id="elementID"]

416 selector = convertIDSelectorsToAttributeSelectors(selector);	450 selector = convertIDSelectorsToAttributeSelectors(selector);

417	451

418 rules.push({	452 let rule = {

419 trigger: {"url-filter": matchDomain,	453 trigger: {"url-filter": matchDomain,

420 "url-filter-is-case-sensitive": true},	454 "url-filter-is-case-sensitive": true},

421 action: {type: "css-display-none",	455 action: {type: "css-display-none",

422 selector: selector}	456 selector: selector}

423 });	457 };

	458

	459 if (unlessDomain)

	460 rule.trigger["unless-domain"] = unlessDomain;

	461

	462 rules.push(rule);

424 }	463 }

425 }	464 }

426	465

427 let ContentBlockerList =	466 let ContentBlockerList =

428 /**	467 /**

429 * Create a new Adblock Plus filter to content blocker list converter	468 * Create a new Adblock Plus filter to content blocker list converter

430 *	469 *

431 * @constructor	470 * @constructor

432 */	471 */

433 exports.ContentBlockerList = function ()	472 exports.ContentBlockerList = function ()

(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
510 {	549 {

511 for (let matchDomain of result.matchDomains)	550 for (let matchDomain of result.matchDomains)

512 {	551 {

513 let group = groupedElemhideFilters.get(matchDomain) || [];	553 let group = groupedElemhideFilters.get(matchDomain) || [];

514 group.push(result.selector);	553 group.push(result.selector);

515 groupedElemhideFilters.set(matchDomain, group);	554 groupedElemhideFilters.set(matchDomain, group);

516 }	555 }

517 }	556 }

518 }	557 }

519	558

520 addCSSRules(rules, genericSelectors, "^https?://");	559 // Separate out the element hiding exceptions that have only a hostname part

521	560 // from the rest. This allows us to implement a workaround for issue #5345

522 // Right after the generic element hiding filters, add the exceptions that	561 // (WebKit bug #167423), but as a bonus it also reduces the number of

523 // should apply only to those filters.	562 // generated rules. The downside is that the exception will only apply to the

524 for (let filter of this.generichideExceptions)	563 // top-level document, not to iframes. We have to live with this until the

525 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);	564 // WebKit bug is fixed in all supported versions of Safari.

	565 // https://bugs.webkit.org/show_bug.cgi?id=167423

	566 //

	567 // Note that as a result of this workaround we end up with a huge rule set in

	568 // terms of the amount of memory used. This can cause Node.js to throw

	569 // "JavaScript heap out of memory". To avoid this, call Node.js with

	570 // --max_old_space_size=4096

	571 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);

	572

	573 let genericSelectorExceptionDomains =

	574 extractFilterDomains(this.generichideExceptions);

	575 elemhideExceptionDomains.forEach(name =>

	576 {

	577 genericSelectorExceptionDomains.add(name);

	578 });

	579

	580 addCSSRules(rules, genericSelectors, "^https?://",

	581 genericSelectorExceptionDomains);

526	582

527 groupedElemhideFilters.forEach((selectors, matchDomain) =>	583 groupedElemhideFilters.forEach((selectors, matchDomain) =>

528 {	584 {

529 addCSSRules(rules, selectors, matchDomain);	585 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);

530 });	586 });

531

532 for (let filter of this.elemhideExceptions)

533 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);

534	587

535 let requestFilterExceptionDomains = [];	588 let requestFilterExceptionDomains = [];

536 for (let filter of this.genericblockExceptions)	589 for (let filter of this.genericblockExceptions)

537 {	590 {

538 let parsed = parseFilterRegexpSource(filter.regexpSource);	591 let parsed = parseFilterRegexpSource(filter.regexpSource);

539 if (parsed.hostname)	592 if (parsed.hostname)

540 requestFilterExceptionDomains.push(parsed.hostname);	593 requestFilterExceptionDomains.push(parsed.hostname);

541 }	594 }

542	595

543 for (let filter of this.requestFilters)	596 for (let filter of this.requestFilters)

544 {	597 {

545 convertFilterAddRules(rules, filter, "block", true,	598 convertFilterAddRules(rules, filter, "block", true,

546 requestFilterExceptionDomains);	599 requestFilterExceptionDomains);

547 }	600 }

548	601

549 for (let filter of this.requestExceptions)	602 for (let filter of this.requestExceptions)

550 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);	603 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);

551	604

552 return rules.filter(rule => !hasNonASCI(rule));	605 return rules;

553 };	606 };

LEFT	RIGHT