lib/abp2blocklist.js - Issue 29473555: Issue 5345 - Whitelist $elemhide and $generichide domains where possible

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29473555: Issue 5345 - Whitelist $elemhide and $generichide domains where possible (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist

Left Patch Set: Address comments to Patch Set 1 Created July 8, 2017, 5:32 a.m.

Right Patch Set: Rebase Created July 11, 2017, 5:28 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 /*	1 /*

2 * This file is part of Adblock Plus <https://adblockplus.org/>,	2 * This file is part of Adblock Plus <https://adblockplus.org/>,

3 * Copyright (C) 2006-2017 eyeo GmbH	3 * Copyright (C) 2006-2017 eyeo GmbH

4 *	4 *

5 * Adblock Plus is free software: you can redistribute it and/or modify	5 * Adblock Plus is free software: you can redistribute it and/or modify

6 * it under the terms of the GNU General Public License version 3 as	6 * it under the terms of the GNU General Public License version 3 as

7 * published by the Free Software Foundation.	7 * published by the Free Software Foundation.

8 *	8 *

9 * Adblock Plus is distributed in the hope that it will be useful,	9 * Adblock Plus is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

(...skipping 103 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
114 * case, a hostname string (or undefined) and a bool	114 * case, a hostname string (or undefined) and a bool

115 * indicating if the source only contains a hostname or not:	115 * indicating if the source only contains a hostname or not:

116 * {regexp: "...",	116 * {regexp: "...",

117 * canSafelyMatchAsLowercase: true/false,	117 * canSafelyMatchAsLowercase: true/false,

118 * hostname: "...",	118 * hostname: "...",

119 * justHostname: true/false}	119 * justHostname: true/false}

120 */	120 */

121 function parseFilterRegexpSource(text)	121 function parseFilterRegexpSource(text)

122 {	122 {

123 let regexp = [];	123 let regexp = [];

124 let lastIndex = text.length - 1;	124

	125 // Convert the text into an array of Unicode characters.

	126 //

	127 // In the case of surrogate pairs (the smiley emoji, for example), one

	128 // Unicode code point is represented by two JavaScript characters together.

	129 // We want to iterate over Unicode code points rather than JavaScript

	130 // characters.

	131 let characters = Array.from(text);

	132

	133 let lastIndex = characters.length - 1;

125 let hostname;	134 let hostname;

126 let hostnameStart = null;	135 let hostnameStart = null;

127 let hostnameFinished = false;	136 let hostnameFinished = false;

128 let justHostname = false;	137 let justHostname = false;

129 let canSafelyMatchAsLowercase = false;	138 let canSafelyMatchAsLowercase = false;

130	139

131 for (let i = 0; i < text.length; i++)	140 for (let i = 0; i < characters.length; i++)

132 {	141 {

133 let c = text[i];	142 let c = characters[i];

134	143

135 if (hostnameFinished)	144 if (hostnameFinished)

136 justHostname = false;	145 justHostname = false;

137	146

138 // If we're currently inside the hostname we have to be careful not to	147 // If we're currently inside the hostname we have to be careful not to

139 // escape any characters until after we have converted it to punycode.	148 // escape any characters until after we have converted it to punycode.

140 if (hostnameStart != null && !hostnameFinished)	149 if (hostnameStart != null && !hostnameFinished)

141 {	150 {

142 let endingChar = (c == "*" \|\| c == "^" \|\|	151 let endingChar = (c == "*" \|\| c == "^" \|\|

143 c == "?" \|\| c == "/" \|\| c == "\|");	152 c == "?" \|\| c == "/" \|\| c == "\|");

144 if (!endingChar && i != lastIndex)	153 if (!endingChar && i != lastIndex)

145 continue;	154 continue;

146	155

147 hostname = punycode.toASCII(	156 hostname = punycode.toASCII(

148 text.substring(hostnameStart, endingChar ? i : i + 1).toLowerCase()	157 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")
	Manish Jethani 2017/07/12 08:59:56 This was the only conflict during rebase, since text (String) changed to characters (Array) in another commit.
	158 .toLowerCase()

149 );	159 );

150 hostnameFinished = justHostname = true;	160 hostnameFinished = justHostname = true;

151 regexp.push(escapeRegExp(hostname));	161 regexp.push(escapeRegExp(hostname));

152 if (!endingChar)	162 if (!endingChar)

153 break;	163 break;

154 }	164 }

155	165

156 switch (c)	166 switch (c)

157 {	167 {

158 case "*":	168 case "*":

159 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*")	169 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")

160 regexp.push(".*");	170 regexp.push(".*");

161 break;	171 break;

162 case "^":	172 case "^":

163 if (i < lastIndex)	173 if (i < lastIndex)

164 regexp.push(".");	174 regexp.push(".");

165 break;	175 break;

166 case "\|":	176 case "\|":

167 if (i == 0)	177 if (i == 0)

168 {	178 {

169 regexp.push("^");	179 regexp.push("^");

170 break;	180 break;

171 }	181 }

172 if (i == lastIndex)	182 if (i == lastIndex)

173 {	183 {

174 regexp.push("$");	184 regexp.push("$");

175 break;	185 break;

176 }	186 }

177 if (i == 1 && text[0] == "\|")	187 if (i == 1 && characters[0] == "\|")

178 {	188 {

179 hostnameStart = i + 1;	189 hostnameStart = i + 1;

180 canSafelyMatchAsLowercase = true;	190 canSafelyMatchAsLowercase = true;

181 regexp.push("https?://([^/]+\\.)?");	191 regexp.push("https?://([^/]+\\.)?");

182 break;	192 break;

183 }	193 }

184 regexp.push("\\\|");	194 regexp.push("\\\|");

185 break;	195 break;

186 case "/":	196 case "/":

187 if (!hostnameFinished &&	197 if (!hostnameFinished &&

188 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")	198 characters[i - 2] == ":" && characters[i - 1] == "/")

189 {	199 {

190 hostnameStart = i + 1;	200 hostnameStart = i + 1;

191 canSafelyMatchAsLowercase = true;	201 canSafelyMatchAsLowercase = true;

192 }	202 }

193 regexp.push("/");	203 regexp.push("/");

194 break;	204 break;

195 case ".": case "+": case "$": case "?":	205 case ".": case "+": case "$": case "?":

196 case "{": case "}": case "(": case ")":	206 case "{": case "}": case "(": case ")":

197 case "[": case "]": case "\\":	207 case "[": case "]": case "\\":

198 regexp.push("\\", c);	208 regexp.push("\\", c);

199 break;	209 break;

200 default:	210 default:

201 if (hostnameFinished && (c >= "a" && c <= "z" \|\|	211 if (hostnameFinished && (c >= "a" && c <= "z" \|\|

202 c >= "A" && c <= "Z"))	212 c >= "A" && c <= "Z"))

203 canSafelyMatchAsLowercase = false;	213 canSafelyMatchAsLowercase = false;

204 regexp.push(c);	214 regexp.push(c == "%" ? c : encodeURI(c));

205 }	215 }

206 }	216 }

207	217

208 return {	218 return {

209 regexp: regexp.join(""),	219 regexp: regexp.join(""),

210 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,	220 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,

211 hostname: hostname,	221 hostname: hostname,

212 justHostname: justHostname	222 justHostname: justHostname

213 };	223 };

214 }	224 }

(...skipping 66 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
281 let included = [];	291 let included = [];

282 let excluded = [];	292 let excluded = [];

283	293

284 parseDomains(filter.domains, included, excluded);	294 parseDomains(filter.domains, included, excluded);

285	295

286 if (exceptionDomains)	296 if (exceptionDomains)

287 excluded = excluded.concat(exceptionDomains);	297 excluded = excluded.concat(exceptionDomains);

288	298

289 if (withResourceTypes)	299 if (withResourceTypes)

290 {	300 {

291 trigger["resource-type"] = getResourceTypes(filter);	301 let resourceTypes = getResourceTypes(filter);

292	302

293 if (trigger["resource-type"].length == 0)	303 // Content blocker rules can't differentiate between sub-document requests

	304 // (iframes) and top-level document requests. To avoid too many false

	305 // positives, we prevent rules with no hostname part from blocking document

	306 // requests.

	307 //

	308 // Once Safari 11 becomes our minimum supported version, we could change

	309 // our approach here to use the new "unless-top-url" property instead.

	310 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)

	311 resourceTypes = resourceTypes.filter(type => type != "document");

	312

	313 if (resourceTypes.length == 0)

294 return;	314 return;

	315

	316 trigger["resource-type"] = resourceTypes;

295 }	317 }

296	318

297 if (filter.thirdParty != null)	319 if (filter.thirdParty != null)

298 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];	320 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];

299	321

300 if (included.length > 0)	322 if (included.length > 0)

301 {	323 {

302 trigger["if-domain"] = [];	324 trigger["if-domain"] = [];

303	325

304 for (let name of included)	326 for (let name of included)

(...skipping 15 matching lines...) Expand all Loading...
320 {	342 {

321 trigger["if-domain"].push("*" + name);	343 trigger["if-domain"].push("*" + name);

322 }	344 }

323 }	345 }

324 }	346 }

325 else if (excluded.length > 0)	347 else if (excluded.length > 0)

326 {	348 {

327 trigger["unless-domain"] = excluded.map(name => "*" + name);	349 trigger["unless-domain"] = excluded.map(name => "*" + name);

328 }	350 }

329 else if (filter instanceof filterClasses.BlockingFilter &&	351 else if (filter instanceof filterClasses.BlockingFilter &&

330 filter.contentType & typeMap.SUBDOCUMENT)	352 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)

331 {	353 {

	354 // Rules with a hostname part are still allowed to block document requests,

	355 // but we add an exception for top-level documents.

	356 //

	357 // Note that we can only do this if there's no "unless-domain" property for

	358 // now. This also only works in Safari 11 onwards, while older versions

	359 // simply ignore this property. Once Safari 11 becomes our minimum

	360 // supported version, we can merge "unless-domain" into "unless-top-url".

332 trigger["unless-top-url"] = [trigger["url-filter"]];	361 trigger["unless-top-url"] = [trigger["url-filter"]];

333 if (trigger["url-filter-is-case-sensitive"])	362 if (trigger["url-filter-is-case-sensitive"])

334 trigger["top-url-filter-is-case-sensitive"] = true;	363 trigger["top-url-filter-is-case-sensitive"] = true;

335 }	364 }

336	365

337 rules.push({trigger: trigger, action: {type: action}});	366 rules.push({trigger: trigger, action: {type: action}});

338 }

339

340 function hasNonASCI(obj)

341 {

342 if (typeof obj == "string")

343 {

344 if (/[^\x00-\x7F]/.test(obj))

345 return true;

346 }

347

348 if (typeof obj == "object")

349 {

350 if (obj instanceof Array)

351 for (let item of obj)

352 if (hasNonASCI(item))

353 return true;

354

355 let names = Object.getOwnPropertyNames(obj);

356 for (let name of names)

357 if (hasNonASCI(obj[name]))

358 return true;

359 }

360

361 return false;

362 }	367 }

363	368

364 function convertIDSelectorsToAttributeSelectors(selector)	369 function convertIDSelectorsToAttributeSelectors(selector)

365 {	370 {

366 // First we figure out where all the IDs are	371 // First we figure out where all the IDs are

367 let sep = "";	372 let sep = "";

368 let start = null;	373 let start = null;

369 let positions = [];	374 let positions = [];

370 for (let i = 0; i < selector.length; i++)	375 for (let i = 0; i < selector.length; i++)

371 {	376 {

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
407 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');	412 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');

408 i = pos.end;	413 i = pos.end;

409 }	414 }

410 newSelector.push(selector.substring(i));	415 newSelector.push(selector.substring(i));

411	416

412 return newSelector.join("");	417 return newSelector.join("");

413 }	418 }

414	419

415 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)	420 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)

416 {	421 {

	422 let unlessDomain = exceptionDomains.size > 0 ? [] : null;

	423

	424 exceptionDomains.forEach(name => unlessDomain.push("*" + name));

	425

417 while (selectors.length)	426 while (selectors.length)

418 {	427 {

419 let selector = selectors.splice(0, selectorLimit).join(", ");	428 let selector = selectors.splice(0, selectorLimit).join(", ");

420	429

421 // As of Safari 9.0 element IDs are matched as lowercase. We work around	430 // As of Safari 9.0 element IDs are matched as lowercase. We work around

422 // this by converting to the attribute format [id="elementID"]	431 // this by converting to the attribute format [id="elementID"]

423 selector = convertIDSelectorsToAttributeSelectors(selector);	432 selector = convertIDSelectorsToAttributeSelectors(selector);

424	433

425 let rule = {	434 let rule = {

426 trigger: {"url-filter": matchDomain,	435 trigger: {"url-filter": matchDomain,

427 "url-filter-is-case-sensitive": true},	436 "url-filter-is-case-sensitive": true},

428 action: {type: "css-display-none",	437 action: {type: "css-display-none",

429 selector: selector}	438 selector: selector}

430 };	439 };

431	440

432 if (exceptionDomains.size > 0)	441 if (unlessDomain)

433 {	442 rule.trigger["unless-domain"] = unlessDomain;

434 rule.trigger["unless-domain"] = [];

435 exceptionDomains.forEach(name =>

436 {

437 rule.trigger["unless-domain"].push("*" + name);

438 });

439 }

440	443

441 rules.push(rule);	444 rules.push(rule);

442 }	445 }

443 }	446 }

444	447

445 let ContentBlockerList =	448 let ContentBlockerList =

446 /**	449 /**

447 * Create a new Adblock Plus filter to content blocker list converter	450 * Create a new Adblock Plus filter to content blocker list converter

448 *	451 *

449 * @constructor	452 * @constructor

(...skipping 96 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
546 // Note that as a result of this workaround we end up with a huge rule set in	549 // Note that as a result of this workaround we end up with a huge rule set in

547 // terms of the amount of memory used. This can cause Node.js to throw	550 // terms of the amount of memory used. This can cause Node.js to throw

548 // "JavaScript heap out of memory". To avoid this, call Node.js with	551 // "JavaScript heap out of memory". To avoid this, call Node.js with

549 // --max_old_space_size=4096	552 // --max_old_space_size=4096

550 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);	553 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);

551	554

552 let genericSelectorExceptionDomains =	555 let genericSelectorExceptionDomains =

553 extractFilterDomains(this.generichideExceptions);	556 extractFilterDomains(this.generichideExceptions);

554 elemhideExceptionDomains.forEach(name =>	557 elemhideExceptionDomains.forEach(name =>

555 {	558 {

556 genericSelectorExceptionDomains.add(name);	559 genericSelectorExceptionDomains.add(name);
kzar 2017/07/10 12:33:08 I wonder if it would be better to pass two Sets of domains to addCSSRules rather than combining them here. I figure we're iterating through those domains twice otherwise.
Manish Jethani 2017/07/11 11:19:18 (In reply to kzar, 2017/07/10 12:33:08.) We'd still have to combine them into one set if we want to filter out duplicates. Also, this code runs only once per rule set generation, so it really doesn't matter. Alternatively, we could use the has method to check if the value is already in the other set while iterating.
kzar 2017/07/11 12:20:03 (In reply to Manish Jethani, 2017/07/11 11:19:18.) Fair enough.
557 });	560 });

558	561

559 addCSSRules(rules, genericSelectors, "^https?://",	562 addCSSRules(rules, genericSelectors, "^https?://",

560 genericSelectorExceptionDomains);	563 genericSelectorExceptionDomains);

561	564

562 groupedElemhideFilters.forEach((selectors, matchDomain) =>	565 groupedElemhideFilters.forEach((selectors, matchDomain) =>

563 {	566 {

564 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);	567 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);

565 });	568 });

566	569

567 let requestFilterExceptionDomains = [];	570 let requestFilterExceptionDomains = [];

568 for (let filter of this.genericblockExceptions)	571 for (let filter of this.genericblockExceptions)

569 {	572 {

570 let parsed = parseFilterRegexpSource(filter.regexpSource);	573 let parsed = parseFilterRegexpSource(filter.regexpSource);

571 if (parsed.hostname)	574 if (parsed.hostname)

572 requestFilterExceptionDomains.push(parsed.hostname);	575 requestFilterExceptionDomains.push(parsed.hostname);

573 }	576 }

574	577

575 for (let filter of this.requestFilters)	578 for (let filter of this.requestFilters)

576 {	579 {

577 convertFilterAddRules(rules, filter, "block", true,	580 convertFilterAddRules(rules, filter, "block", true,

578 requestFilterExceptionDomains);	581 requestFilterExceptionDomains);

579 }	582 }

580	583

581 for (let filter of this.requestExceptions)	584 for (let filter of this.requestExceptions)

582 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);	585 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);

583	586

584 return rules.filter(rule => !hasNonASCI(rule));	587 return rules;

585 };	588 };

LEFT	RIGHT