lib/abp2blocklist.js - Issue 29467595: Issue 5325 - Add support for separator characters

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29467595: Issue 5325 - Add support for separator characters (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist

Left Patch Set: Add comment about using only lower case for hostname Created July 11, 2017, 5:06 p.m.

Right Patch Set: Rebase Created July 12, 2017, 12:45 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 /*	1 /*

2 * This file is part of Adblock Plus <https://adblockplus.org/>,	2 * This file is part of Adblock Plus <https://adblockplus.org/>,

3 * Copyright (C) 2006-2017 eyeo GmbH	3 * Copyright (C) 2006-2017 eyeo GmbH

4 *	4 *

5 * Adblock Plus is free software: you can redistribute it and/or modify	5 * Adblock Plus is free software: you can redistribute it and/or modify

6 * it under the terms of the GNU General Public License version 3 as	6 * it under the terms of the GNU General Public License version 3 as

7 * published by the Free Software Foundation.	7 * published by the Free Software Foundation.

8 *	8 *

9 * Adblock Plus is distributed in the hope that it will be useful,	9 * Adblock Plus is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
69 let subdomains = [];	69 let subdomains = [];

70 let suffixLength = domain.length + 1;	70 let suffixLength = domain.length + 1;

71	71

72 for (let name of list)	72 for (let name of list)

73 {	73 {

74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)	74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)

75 subdomains.push(name.slice(0, -suffixLength));	75 subdomains.push(name.slice(0, -suffixLength));

76 }	76 }

77	77

78 return subdomains;	78 return subdomains;

	79 }

	80

	81 function extractFilterDomains(filters)

	82 {

	83 let domains = new Set();

	84 for (let filter of filters)

	85 {

	86 let parsed = parseFilterRegexpSource(filter.regexpSource);

	87 if (parsed.justHostname)

	88 domains.add(parsed.hostname);

	89 }

	90 return domains;

79 }	91 }

80	92

81 function convertElemHideFilter(filter, elemhideSelectorExceptions)	93 function convertElemHideFilter(filter, elemhideSelectorExceptions)

82 {	94 {

83 let included = [];	95 let included = [];

84 let excluded = [];	96 let excluded = [];

85 let rules = [];	97 let rules = [];

86	98

87 parseDomains(filter.domains, included, excluded);	99 parseDomains(filter.domains, included, excluded);

88	100

(...skipping 13 matching lines...) Expand all Loading...
102 * case, a hostname string (or undefined) and a bool	114 * case, a hostname string (or undefined) and a bool

103 * indicating if the source only contains a hostname or not:	115 * indicating if the source only contains a hostname or not:

104 * {regexp: "...",	116 * {regexp: "...",

105 * canSafelyMatchAsLowercase: true/false,	117 * canSafelyMatchAsLowercase: true/false,

106 * hostname: "...",	118 * hostname: "...",

107 * justHostname: true/false}	119 * justHostname: true/false}

108 */	120 */

109 function parseFilterRegexpSource(text)	121 function parseFilterRegexpSource(text)

110 {	122 {

111 let regexp = [];	123 let regexp = [];

112 let lastIndex = text.length - 1;	124

	125 // Convert the text into an array of Unicode characters.

	126 //

	127 // In the case of surrogate pairs (the smiley emoji, for example), one

	128 // Unicode code point is represented by two JavaScript characters together.

	129 // We want to iterate over Unicode code points rather than JavaScript

	130 // characters.

	131 let characters = Array.from(text);

	132

	133 let lastIndex = characters.length - 1;

113 let hostname;	134 let hostname;

114 let hostnameStart = null;	135 let hostnameStart = null;

115 let hostnameFinished = false;	136 let hostnameFinished = false;

116 let justHostname = false;	137 let justHostname = false;

117 let canSafelyMatchAsLowercase = false;	138 let canSafelyMatchAsLowercase = false;

118	139

119 for (let i = 0; i < text.length; i++)	140 for (let i = 0; i < characters.length; i++)

120 {	141 {

121 let c = text[i];	142 let c = characters[i];

122	143

123 if (hostnameFinished)	144 if (hostnameFinished)

124 justHostname = false;	145 justHostname = false;

125	146

126 // If we're currently inside the hostname we have to be careful not to	147 // If we're currently inside the hostname we have to be careful not to

127 // escape any characters until after we have converted it to punycode.	148 // escape any characters until after we have converted it to punycode.

128 if (hostnameStart != null && !hostnameFinished)	149 if (hostnameStart != null && !hostnameFinished)

129 {	150 {

130 let endingChar = (c == "*" || c == "^" ||	151 let endingChar = (c == "*" || c == "^" ||

131 c == "?" || c == "/" || c == "|");	152 c == "?" || c == "/" || c == "|");

132 if (!endingChar && i != lastIndex)	153 if (!endingChar && i != lastIndex)

133 continue;	154 continue;

134	155

135 hostname = punycode.toASCII(	156 hostname = punycode.toASCII(

136 text.substring(hostnameStart, endingChar ? i : i + 1)	157 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")

	158 .toLowerCase()

137 );	159 );

138 hostnameFinished = justHostname = true;	160 hostnameFinished = justHostname = true;

139 regexp.push(escapeRegExp(hostname));	161 regexp.push(escapeRegExp(hostname));

140 if (!endingChar)	162 if (!endingChar)

141 break;	163 break;

142 }	164 }

143	165

144 switch (c)	166 switch (c)

145 {	167 {

146 case "*":	168 case "*":

147 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*")	169 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")

148 regexp.push(".*");	170 regexp.push(".*");

149 break;	171 break;

150 case "^":	172 case "^":

151 let alphabet = "a-z";	173 let alphabet = "a-z";

152 // If justHostname is true and we've encountered a "^", it means we're	174 // If justHostname is true and we've encountered a "^", it means we're

153 // still in the hostname part of the URL. Since hostnames are always	175 // still in the hostname part of the URL. Since hostnames are always

154 // lower case (Punycode), there's no need to include "A-Z" in the	176 // lower case (Punycode), there's no need to include "A-Z" in the

155 // pattern. Further, subsequent code may lower-case the entire regular	177 // pattern. Further, subsequent code may lower-case the entire regular

156 // expression (if the URL contains only the hostname part), leaving us	178 // expression (if the URL contains only the hostname part), leaving us

157 // with "a-za-z", which would be redundant.	179 // with "a-za-z", which would be redundant.

(...skipping 15 matching lines...) Expand all Loading...
173 if (i == 0)	195 if (i == 0)

174 {	196 {

175 regexp.push("^");	197 regexp.push("^");

176 break;	198 break;

177 }	199 }

178 if (i == lastIndex)	200 if (i == lastIndex)

179 {	201 {

180 regexp.push("$");	202 regexp.push("$");

181 break;	203 break;

182 }	204 }

183 if (i == 1 && text[0] == "|")	205 if (i == 1 && characters[0] == "|")

184 {	206 {

185 hostnameStart = i + 1;	207 hostnameStart = i + 1;

186 canSafelyMatchAsLowercase = true;	208 canSafelyMatchAsLowercase = true;

187 regexp.push("https?://([^/]+\\.)?");	209 regexp.push("https?://([^/]+\\.)?");

188 break;	210 break;

189 }	211 }

190 regexp.push("\\|");	212 regexp.push("\\|");

191 break;	213 break;

192 case "/":	214 case "/":

193 if (!hostnameFinished &&	215 if (!hostnameFinished &&

194 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")	216 characters[i - 2] == ":" && characters[i - 1] == "/")

195 {	217 {

196 hostnameStart = i + 1;	218 hostnameStart = i + 1;

197 canSafelyMatchAsLowercase = true;	219 canSafelyMatchAsLowercase = true;

198 }	220 }

199 regexp.push("/");	221 regexp.push("/");

200 break;	222 break;

201 case ".": case "+": case "$": case "?":	223 case ".": case "+": case "$": case "?":

202 case "{": case "}": case "(": case ")":	224 case "{": case "}": case "(": case ")":

203 case "[": case "]": case "\\":	225 case "[": case "]": case "\\":

204 regexp.push("\\", c);	226 regexp.push("\\", c);

205 break;	227 break;

206 default:	228 default:

207 if (hostnameFinished && (c >= "a" && c <= "z" ||	229 if (hostnameFinished && (c >= "a" && c <= "z" ||

208 c >= "A" && c <= "Z"))	230 c >= "A" && c <= "Z"))

209 canSafelyMatchAsLowercase = false;	231 canSafelyMatchAsLowercase = false;

210 regexp.push(c);	232 regexp.push(c == "%" ? c : encodeURI(c));

211 }	233 }

212 }	234 }

213	235

214 return {	236 return {

215 regexp: regexp.join(""),	237 regexp: regexp.join(""),

216 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,	238 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,

217 hostname: hostname,	239 hostname: hostname,

218 justHostname: justHostname	240 justHostname: justHostname

219 };	241 };

220 }	242 }

(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
287 let included = [];	309 let included = [];

288 let excluded = [];	310 let excluded = [];

289	311

290 parseDomains(filter.domains, included, excluded);	312 parseDomains(filter.domains, included, excluded);

291	313

292 if (exceptionDomains)	314 if (exceptionDomains)

293 excluded = excluded.concat(exceptionDomains);	315 excluded = excluded.concat(exceptionDomains);

294	316

295 if (withResourceTypes)	317 if (withResourceTypes)

296 {	318 {

297 trigger["resource-type"] = getResourceTypes(filter);	319 let resourceTypes = getResourceTypes(filter);

298	320

299 if (trigger["resource-type"].length == 0)	321 // Content blocker rules can't differentiate between sub-document requests

	322 // (iframes) and top-level document requests. To avoid too many false

	323 // positives, we prevent rules with no hostname part from blocking document

	324 // requests.

	325 //

	326 // Once Safari 11 becomes our minimum supported version, we could change

	327 // our approach here to use the new "unless-top-url" property instead.

	328 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)

	329 resourceTypes = resourceTypes.filter(type => type != "document");

	330

	331 if (resourceTypes.length == 0)

300 return;	332 return;

	333

	334 trigger["resource-type"] = resourceTypes;

301 }	335 }

302	336

303 if (filter.thirdParty != null)	337 if (filter.thirdParty != null)

304 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];	338 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];

305	339

306 if (included.length > 0)	340 if (included.length > 0)

307 {	341 {

308 trigger["if-domain"] = [];	342 trigger["if-domain"] = [];

309	343

310 for (let name of included)	344 for (let name of included)

(...skipping 15 matching lines...) Expand all Loading...
326 {	360 {

327 trigger["if-domain"].push("*" + name);	361 trigger["if-domain"].push("*" + name);

328 }	362 }

329 }	363 }

330 }	364 }

331 else if (excluded.length > 0)	365 else if (excluded.length > 0)

332 {	366 {

333 trigger["unless-domain"] = excluded.map(name => "*" + name);	367 trigger["unless-domain"] = excluded.map(name => "*" + name);

334 }	368 }

335 else if (filter instanceof filterClasses.BlockingFilter &&	369 else if (filter instanceof filterClasses.BlockingFilter &&

336 filter.contentType & typeMap.SUBDOCUMENT)	370 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)

337 {	371 {

	372 // Rules with a hostname part are still allowed to block document requests,

	373 // but we add an exception for top-level documents.

	374 //

	375 // Note that we can only do this if there's no "unless-domain" property for

	376 // now. This also only works in Safari 11 onwards, while older versions

	377 // simply ignore this property. Once Safari 11 becomes our minimum

	378 // supported version, we can merge "unless-domain" into "unless-top-url".

338 trigger["unless-top-url"] = [trigger["url-filter"]];	379 trigger["unless-top-url"] = [trigger["url-filter"]];

339 if (trigger["url-filter-is-case-sensitive"])	380 if (trigger["url-filter-is-case-sensitive"])

340 trigger["top-url-filter-is-case-sensitive"] = true;	381 trigger["top-url-filter-is-case-sensitive"] = true;

341 }	382 }

342	383

343 rules.push({trigger: trigger, action: {type: action}});	384 rules.push({trigger: trigger, action: {type: action}});

344 }

345

346 function hasNonASCI(obj)

347 {

348 if (typeof obj == "string")

349 {

350 if (/[^\x00-\x7F]/.test(obj))

351 return true;

352 }

353

354 if (typeof obj == "object")

355 {

356 if (obj instanceof Array)

357 for (let item of obj)

358 if (hasNonASCI(item))

359 return true;

360

361 let names = Object.getOwnPropertyNames(obj);

362 for (let name of names)

363 if (hasNonASCI(obj[name]))

364 return true;

365 }

366

367 return false;

368 }	385 }

369	386

370 function convertIDSelectorsToAttributeSelectors(selector)	387 function convertIDSelectorsToAttributeSelectors(selector)

371 {	388 {

372 // First we figure out where all the IDs are	389 // First we figure out where all the IDs are

373 let sep = "";	390 let sep = "";

374 let start = null;	391 let start = null;

375 let positions = [];	392 let positions = [];

376 for (let i = 0; i < selector.length; i++)	393 for (let i = 0; i < selector.length; i++)

377 {	394 {

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
411 {	428 {

412 newSelector.push(selector.substring(i, pos.start));	429 newSelector.push(selector.substring(i, pos.start));

413 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');	430 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');

414 i = pos.end;	431 i = pos.end;

415 }	432 }

416 newSelector.push(selector.substring(i));	433 newSelector.push(selector.substring(i));

417	434

418 return newSelector.join("");	435 return newSelector.join("");

419 }	436 }

420	437

421 function addCSSRules(rules, selectors, matchDomain)	438 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)

422 {	439 {

	440 let unlessDomain = exceptionDomains.size > 0 ? [] : null;

	441

	442 exceptionDomains.forEach(name => unlessDomain.push("*" + name));

	443

423 while (selectors.length)	444 while (selectors.length)

424 {	445 {

425 let selector = selectors.splice(0, selectorLimit).join(", ");	446 let selector = selectors.splice(0, selectorLimit).join(", ");

426	447

427 // As of Safari 9.0 element IDs are matched as lowercase. We work around	448 // As of Safari 9.0 element IDs are matched as lowercase. We work around

428 // this by converting to the attribute format [id="elementID"]	449 // this by converting to the attribute format [id="elementID"]

429 selector = convertIDSelectorsToAttributeSelectors(selector);	450 selector = convertIDSelectorsToAttributeSelectors(selector);

430	451

431 rules.push({	452 let rule = {

432 trigger: {"url-filter": matchDomain,	453 trigger: {"url-filter": matchDomain,

433 "url-filter-is-case-sensitive": true},	454 "url-filter-is-case-sensitive": true},

434 action: {type: "css-display-none",	455 action: {type: "css-display-none",

435 selector: selector}	456 selector: selector}

436 });	457 };

	458

	459 if (unlessDomain)

	460 rule.trigger["unless-domain"] = unlessDomain;

	461

	462 rules.push(rule);

437 }	463 }

438 }	464 }

439	465

440 let ContentBlockerList =	466 let ContentBlockerList =

441 /**	467 /**

442 * Create a new Adblock Plus filter to content blocker list converter	468 * Create a new Adblock Plus filter to content blocker list converter

443 *	469 *

444 * @constructor	470 * @constructor

445 */	471 */

446 exports.ContentBlockerList = function ()	472 exports.ContentBlockerList = function ()

(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
523 {	549 {

524 for (let matchDomain of result.matchDomains)	550 for (let matchDomain of result.matchDomains)

525 {	551 {

526 let group = groupedElemhideFilters.get(matchDomain) || [];	552 let group = groupedElemhideFilters.get(matchDomain) || [];

527 group.push(result.selector);	553 group.push(result.selector);

528 groupedElemhideFilters.set(matchDomain, group);	554 groupedElemhideFilters.set(matchDomain, group);

529 }	555 }

530 }	556 }

531 }	557 }

532	558

533 addCSSRules(rules, genericSelectors, "^https?://");	559 // Separate out the element hiding exceptions that have only a hostname part

534	560 // from the rest. This allows us to implement a workaround for issue #5345

535 // Right after the generic element hiding filters, add the exceptions that	561 // (WebKit bug #167423), but as a bonus it also reduces the number of

536 // should apply only to those filters.	562 // generated rules. The downside is that the exception will only apply to the

537 for (let filter of this.generichideExceptions)	563 // top-level document, not to iframes. We have to live with this until the

538 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);	564 // WebKit bug is fixed in all supported versions of Safari.

	565 // https://bugs.webkit.org/show_bug.cgi?id=167423

	566 //

	567 // Note that as a result of this workaround we end up with a huge rule set in

	568 // terms of the amount of memory used. This can cause Node.js to throw

	569 // "JavaScript heap out of memory". To avoid this, call Node.js with

	570 // --max_old_space_size=4096

	571 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);

	572

	573 let genericSelectorExceptionDomains =

	574 extractFilterDomains(this.generichideExceptions);

	575 elemhideExceptionDomains.forEach(name =>

	576 {

	577 genericSelectorExceptionDomains.add(name);

	578 });

	579

	580 addCSSRules(rules, genericSelectors, "^https?://",

	581 genericSelectorExceptionDomains);

539	582

540 groupedElemhideFilters.forEach((selectors, matchDomain) =>	583 groupedElemhideFilters.forEach((selectors, matchDomain) =>

541 {	584 {

542 addCSSRules(rules, selectors, matchDomain);	585 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);

543 });	586 });

544

545 for (let filter of this.elemhideExceptions)

546 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);

547	587

548 let requestFilterExceptionDomains = [];	588 let requestFilterExceptionDomains = [];

549 for (let filter of this.genericblockExceptions)	589 for (let filter of this.genericblockExceptions)

550 {	590 {

551 let parsed = parseFilterRegexpSource(filter.regexpSource);	591 let parsed = parseFilterRegexpSource(filter.regexpSource);

552 if (parsed.hostname)	592 if (parsed.hostname)

553 requestFilterExceptionDomains.push(parsed.hostname);	593 requestFilterExceptionDomains.push(parsed.hostname);

554 }	594 }

555	595

556 for (let filter of this.requestFilters)	596 for (let filter of this.requestFilters)

557 {	597 {

558 convertFilterAddRules(rules, filter, "block", true,	598 convertFilterAddRules(rules, filter, "block", true,

559 requestFilterExceptionDomains);	599 requestFilterExceptionDomains);

560 }	600 }

561	601

562 for (let filter of this.requestExceptions)	602 for (let filter of this.requestExceptions)

563 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);	603 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);

564	604

565 return rules.filter(rule => !hasNonASCI(rule));	605 return rules;

566 };	606 };

LEFT	RIGHT