Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29473555: Issue 5345 - Whitelist $elemhide and $generichide domains where possible (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Left Patch Set: Created June 24, 2017, 2:48 p.m.
Right Patch Set: Rebase Created July 11, 2017, 5:28 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
73 { 73 {
74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain) 74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)
75 subdomains.push(name.slice(0, -suffixLength)); 75 subdomains.push(name.slice(0, -suffixLength));
76 } 76 }
77 77
78 return subdomains; 78 return subdomains;
79 } 79 }
80 80
81 function extractFilterDomains(filters) 81 function extractFilterDomains(filters)
82 { 82 {
83 let domains = []; 83 let domains = new Set();
84 for (let filter of filters) 84 for (let filter of filters)
85 { 85 {
86 let parsed = parseFilterRegexpSource(filter.regexpSource); 86 let parsed = parseFilterRegexpSource(filter.regexpSource);
87 if (parsed.justHostname) 87 if (parsed.justHostname)
88 domains.push(parsed.hostname); 88 domains.add(parsed.hostname);
89 } 89 }
90 return domains; 90 return domains;
kzar 2017/07/07 11:40:13 Why not make domains a Set instead of an Array her…
Manish Jethani 2017/07/08 05:33:59 That's a good point, it does seem to make a huge d…
91 } 91 }
92 92
93 function convertElemHideFilter(filter, elemhideSelectorExceptions) 93 function convertElemHideFilter(filter, elemhideSelectorExceptions)
94 { 94 {
95 let included = []; 95 let included = [];
96 let excluded = []; 96 let excluded = [];
97 let rules = []; 97 let rules = [];
98 98
99 parseDomains(filter.domains, included, excluded); 99 parseDomains(filter.domains, included, excluded);
100 100
(...skipping 13 matching lines...) Expand all
114 * case, a hostname string (or undefined) and a bool 114 * case, a hostname string (or undefined) and a bool
115 * indicating if the source only contains a hostname or not: 115 * indicating if the source only contains a hostname or not:
116 * {regexp: "...", 116 * {regexp: "...",
117 * canSafelyMatchAsLowercase: true/false, 117 * canSafelyMatchAsLowercase: true/false,
118 * hostname: "...", 118 * hostname: "...",
119 * justHostname: true/false} 119 * justHostname: true/false}
120 */ 120 */
121 function parseFilterRegexpSource(text) 121 function parseFilterRegexpSource(text)
122 { 122 {
123 let regexp = []; 123 let regexp = [];
124 let lastIndex = text.length - 1; 124
125 // Convert the text into an array of Unicode characters.
126 //
127 // In the case of surrogate pairs (the smiley emoji, for example), one
128 // Unicode code point is represented by two JavaScript characters together.
129 // We want to iterate over Unicode code points rather than JavaScript
130 // characters.
131 let characters = Array.from(text);
132
133 let lastIndex = characters.length - 1;
125 let hostname; 134 let hostname;
126 let hostnameStart = null; 135 let hostnameStart = null;
127 let hostnameFinished = false; 136 let hostnameFinished = false;
128 let justHostname = false; 137 let justHostname = false;
129 let canSafelyMatchAsLowercase = false; 138 let canSafelyMatchAsLowercase = false;
130 139
131 for (let i = 0; i < text.length; i++) 140 for (let i = 0; i < characters.length; i++)
132 { 141 {
133 let c = text[i]; 142 let c = characters[i];
134 143
135 if (hostnameFinished) 144 if (hostnameFinished)
136 justHostname = false; 145 justHostname = false;
137 146
138 // If we're currently inside the hostname we have to be careful not to 147 // If we're currently inside the hostname we have to be careful not to
139 // escape any characters until after we have converted it to punycode. 148 // escape any characters until after we have converted it to punycode.
140 if (hostnameStart != null && !hostnameFinished) 149 if (hostnameStart != null && !hostnameFinished)
141 { 150 {
142 let endingChar = (c == "*" || c == "^" || 151 let endingChar = (c == "*" || c == "^" ||
143 c == "?" || c == "/" || c == "|"); 152 c == "?" || c == "/" || c == "|");
144 if (!endingChar && i != lastIndex) 153 if (!endingChar && i != lastIndex)
145 continue; 154 continue;
146 155
147 hostname = punycode.toASCII( 156 hostname = punycode.toASCII(
148 text.substring(hostnameStart, endingChar ? i : i + 1).toLowerCase() 157 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")
Manish Jethani 2017/06/24 14:54:15 punycode.toASCII doesn't lower-case the string, we…
Manish Jethani 2017/07/12 08:59:56 This was the only conflict during rebase, since te…
158 .toLowerCase()
149 ); 159 );
150 hostnameFinished = justHostname = true; 160 hostnameFinished = justHostname = true;
151 regexp.push(escapeRegExp(hostname)); 161 regexp.push(escapeRegExp(hostname));
152 if (!endingChar) 162 if (!endingChar)
153 break; 163 break;
154 } 164 }
155 165
156 switch (c) 166 switch (c)
157 { 167 {
158 case "*": 168 case "*":
159 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*") 169 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")
160 regexp.push(".*"); 170 regexp.push(".*");
161 break; 171 break;
162 case "^": 172 case "^":
163 if (i < lastIndex) 173 if (i < lastIndex)
164 regexp.push("."); 174 regexp.push(".");
165 break; 175 break;
166 case "|": 176 case "|":
167 if (i == 0) 177 if (i == 0)
168 { 178 {
169 regexp.push("^"); 179 regexp.push("^");
170 break; 180 break;
171 } 181 }
172 if (i == lastIndex) 182 if (i == lastIndex)
173 { 183 {
174 regexp.push("$"); 184 regexp.push("$");
175 break; 185 break;
176 } 186 }
177 if (i == 1 && text[0] == "|") 187 if (i == 1 && characters[0] == "|")
178 { 188 {
179 hostnameStart = i + 1; 189 hostnameStart = i + 1;
180 canSafelyMatchAsLowercase = true; 190 canSafelyMatchAsLowercase = true;
181 regexp.push("https?://([^/]+\\.)?"); 191 regexp.push("https?://([^/]+\\.)?");
182 break; 192 break;
183 } 193 }
184 regexp.push("\\|"); 194 regexp.push("\\|");
185 break; 195 break;
186 case "/": 196 case "/":
187 if (!hostnameFinished && 197 if (!hostnameFinished &&
188 text.charAt(i-2) == ":" && text.charAt(i-1) == "/") 198 characters[i - 2] == ":" && characters[i - 1] == "/")
189 { 199 {
190 hostnameStart = i + 1; 200 hostnameStart = i + 1;
191 canSafelyMatchAsLowercase = true; 201 canSafelyMatchAsLowercase = true;
192 } 202 }
193 regexp.push("/"); 203 regexp.push("/");
194 break; 204 break;
195 case ".": case "+": case "$": case "?": 205 case ".": case "+": case "$": case "?":
196 case "{": case "}": case "(": case ")": 206 case "{": case "}": case "(": case ")":
197 case "[": case "]": case "\\": 207 case "[": case "]": case "\\":
198 regexp.push("\\", c); 208 regexp.push("\\", c);
199 break; 209 break;
200 default: 210 default:
201 if (hostnameFinished && (c >= "a" && c <= "z" || 211 if (hostnameFinished && (c >= "a" && c <= "z" ||
202 c >= "A" && c <= "Z")) 212 c >= "A" && c <= "Z"))
203 canSafelyMatchAsLowercase = false; 213 canSafelyMatchAsLowercase = false;
204 regexp.push(c); 214 regexp.push(c == "%" ? c : encodeURI(c));
205 } 215 }
206 } 216 }
207 217
208 return { 218 return {
209 regexp: regexp.join(""), 219 regexp: regexp.join(""),
210 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase, 220 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,
211 hostname: hostname, 221 hostname: hostname,
212 justHostname: justHostname 222 justHostname: justHostname
213 }; 223 };
214 } 224 }
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
281 let included = []; 291 let included = [];
282 let excluded = []; 292 let excluded = [];
283 293
284 parseDomains(filter.domains, included, excluded); 294 parseDomains(filter.domains, included, excluded);
285 295
286 if (exceptionDomains) 296 if (exceptionDomains)
287 excluded = excluded.concat(exceptionDomains); 297 excluded = excluded.concat(exceptionDomains);
288 298
289 if (withResourceTypes) 299 if (withResourceTypes)
290 { 300 {
291 trigger["resource-type"] = getResourceTypes(filter); 301 let resourceTypes = getResourceTypes(filter);
292 302
293 if (trigger["resource-type"].length == 0) 303 // Content blocker rules can't differentiate between sub-document requests
304 // (iframes) and top-level document requests. To avoid too many false
305 // positives, we prevent rules with no hostname part from blocking document
306 // requests.
307 //
308 // Once Safari 11 becomes our minimum supported version, we could change
309 // our approach here to use the new "unless-top-url" property instead.
310 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)
311 resourceTypes = resourceTypes.filter(type => type != "document");
312
313 if (resourceTypes.length == 0)
294 return; 314 return;
315
316 trigger["resource-type"] = resourceTypes;
295 } 317 }
296 318
297 if (filter.thirdParty != null) 319 if (filter.thirdParty != null)
298 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"]; 320 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];
299 321
300 if (included.length > 0) 322 if (included.length > 0)
301 { 323 {
302 trigger["if-domain"] = []; 324 trigger["if-domain"] = [];
303 325
304 for (let name of included) 326 for (let name of included)
(...skipping 15 matching lines...) Expand all
320 { 342 {
321 trigger["if-domain"].push("*" + name); 343 trigger["if-domain"].push("*" + name);
322 } 344 }
323 } 345 }
324 } 346 }
325 else if (excluded.length > 0) 347 else if (excluded.length > 0)
326 { 348 {
327 trigger["unless-domain"] = excluded.map(name => "*" + name); 349 trigger["unless-domain"] = excluded.map(name => "*" + name);
328 } 350 }
329 else if (filter instanceof filterClasses.BlockingFilter && 351 else if (filter instanceof filterClasses.BlockingFilter &&
330 filter.contentType & typeMap.SUBDOCUMENT) 352 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)
331 { 353 {
354 // Rules with a hostname part are still allowed to block document requests,
355 // but we add an exception for top-level documents.
356 //
357 // Note that we can only do this if there's no "unless-domain" property for
358 // now. This also only works in Safari 11 onwards, while older versions
359 // simply ignore this property. Once Safari 11 becomes our minimum
360 // supported version, we can merge "unless-domain" into "unless-top-url".
332 trigger["unless-top-url"] = [trigger["url-filter"]]; 361 trigger["unless-top-url"] = [trigger["url-filter"]];
333 if (trigger["url-filter-is-case-sensitive"]) 362 if (trigger["url-filter-is-case-sensitive"])
334 trigger["top-url-filter-is-case-sensitive"] = true; 363 trigger["top-url-filter-is-case-sensitive"] = true;
335 } 364 }
336 365
337 rules.push({trigger: trigger, action: {type: action}}); 366 rules.push({trigger: trigger, action: {type: action}});
338 }
339
340 function hasNonASCI(obj)
341 {
342 if (typeof obj == "string")
343 {
344 if (/[^\x00-\x7F]/.test(obj))
345 return true;
346 }
347
348 if (typeof obj == "object")
349 {
350 if (obj instanceof Array)
351 for (let item of obj)
352 if (hasNonASCI(item))
353 return true;
354
355 let names = Object.getOwnPropertyNames(obj);
356 for (let name of names)
357 if (hasNonASCI(obj[name]))
358 return true;
359 }
360
361 return false;
362 } 367 }
363 368
364 function convertIDSelectorsToAttributeSelectors(selector) 369 function convertIDSelectorsToAttributeSelectors(selector)
365 { 370 {
366 // First we figure out where all the IDs are 371 // First we figure out where all the IDs are
367 let sep = ""; 372 let sep = "";
368 let start = null; 373 let start = null;
369 let positions = []; 374 let positions = [];
370 for (let i = 0; i < selector.length; i++) 375 for (let i = 0; i < selector.length; i++)
371 { 376 {
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
407 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 412 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
408 i = pos.end; 413 i = pos.end;
409 } 414 }
410 newSelector.push(selector.substring(i)); 415 newSelector.push(selector.substring(i));
411 416
412 return newSelector.join(""); 417 return newSelector.join("");
413 } 418 }
414 419
415 function addCSSRules(rules, selectors, matchDomain, exceptionDomains) 420 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)
416 { 421 {
417 exceptionDomains = Array.from(new Set(exceptionDomains)); 422 let unlessDomain = exceptionDomains.size > 0 ? [] : null;
Manish Jethani 2017/06/24 14:54:15 Ensure no duplicates.
423
424 exceptionDomains.forEach(name => unlessDomain.push("*" + name));
418 425
419 while (selectors.length) 426 while (selectors.length)
420 { 427 {
421 let selector = selectors.splice(0, selectorLimit).join(", "); 428 let selector = selectors.splice(0, selectorLimit).join(", ");
422 429
423 // As of Safari 9.0 element IDs are matched as lowercase. We work around 430 // As of Safari 9.0 element IDs are matched as lowercase. We work around
424 // this by converting to the attribute format [id="elementID"] 431 // this by converting to the attribute format [id="elementID"]
425 selector = convertIDSelectorsToAttributeSelectors(selector); 432 selector = convertIDSelectorsToAttributeSelectors(selector);
426 433
427 let rule = { 434 let rule = {
428 trigger: {"url-filter": matchDomain, 435 trigger: {"url-filter": matchDomain,
429 "url-filter-is-case-sensitive": true}, 436 "url-filter-is-case-sensitive": true},
430 action: {type: "css-display-none", 437 action: {type: "css-display-none",
431 selector: selector} 438 selector: selector}
432 }; 439 };
433 440
434 if (exceptionDomains.length > 0) 441 if (unlessDomain)
435 rule.trigger["unless-domain"] = exceptionDomains.map(name => "*" + name); 442 rule.trigger["unless-domain"] = unlessDomain;
kzar 2017/07/07 11:40:13 Maybe we should do this work outside of the while…
Manish Jethani 2017/07/08 05:33:59 We have to make a copy of the array as a rule, bec…
kzar 2017/07/10 12:33:07 I'd rather you did the work outside the loop here
Manish Jethani 2017/07/11 11:19:18 Done.
436 443
437 rules.push(rule); 444 rules.push(rule);
438 } 445 }
439 } 446 }
440 447
441 let ContentBlockerList = 448 let ContentBlockerList =
442 /** 449 /**
443 * Create a new Adblock Plus filter to content blocker list converter 450 * Create a new Adblock Plus filter to content blocker list converter
444 * 451 *
445 * @constructor 452 * @constructor
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
526 { 533 {
527 let group = groupedElemhideFilters.get(matchDomain) || []; 534 let group = groupedElemhideFilters.get(matchDomain) || [];
528 group.push(result.selector); 535 group.push(result.selector);
529 groupedElemhideFilters.set(matchDomain, group); 536 groupedElemhideFilters.set(matchDomain, group);
530 } 537 }
531 } 538 }
532 } 539 }
533 540
534 // Separate out the element hiding exceptions that have only a hostname part 541 // Separate out the element hiding exceptions that have only a hostname part
535 // from the rest. This allows us to implement a workaround for issue #5345 542 // from the rest. This allows us to implement a workaround for issue #5345
536 // (WebKit bug #167423), but as a bonus it also reduces the number of 543 // (WebKit bug #167423), but as a bonus it also reduces the number of
kzar 2017/07/07 11:40:13 Mind giving the full URL for the WebKit bug?
Manish Jethani 2017/07/08 05:33:59 Done. By the way, there are new comments on that…
kzar 2017/07/10 12:33:08 Acknowledged.
537 // generated rules. The downside is that the exception will only apply to the 544 // generated rules. The downside is that the exception will only apply to the
538 // top-level document, not to iframes. We have to live with this until the 545 // top-level document, not to iframes. We have to live with this until the
539 // WebKit bug is fixed in all supported versions of Safari. 546 // WebKit bug is fixed in all supported versions of Safari.
547 // https://bugs.webkit.org/show_bug.cgi?id=167423
540 // 548 //
541 // Note that as a result of this workaround we end up with a huge rule set in 549 // Note that as a result of this workaround we end up with a huge rule set in
542 // terms of the amount of memory used. This can cause Node.js to throw 550 // terms of the amount of memory used. This can cause Node.js to throw
kzar 2017/07/07 11:40:13 Have you tested that rule generation still works O…
Manish Jethani 2017/07/08 05:33:59 I tested it there and it works without problems.
kzar 2017/07/10 12:33:07 Acknowledged.
543 // "JavaScript heap out of memory". To avoid this, call Node.js with 551 // "JavaScript heap out of memory". To avoid this, call Node.js with
544 // --max_old_space_size=4096 552 // --max_old_space_size=4096
545 let generichideExceptionDomains = 553 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);
554
555 let genericSelectorExceptionDomains =
546 extractFilterDomains(this.generichideExceptions); 556 extractFilterDomains(this.generichideExceptions);
547 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions); 557 elemhideExceptionDomains.forEach(name =>
558 {
559 genericSelectorExceptionDomains.add(name);
560 });
548 561
549 addCSSRules(rules, genericSelectors, "^https?://", 562 addCSSRules(rules, genericSelectors, "^https?://",
550 generichideExceptionDomains.concat(elemhideExceptionDomains)); 563 genericSelectorExceptionDomains);
551 564
552 groupedElemhideFilters.forEach((selectors, matchDomain) => 565 groupedElemhideFilters.forEach((selectors, matchDomain) =>
553 { 566 {
554 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains); 567 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);
555 }); 568 });
556 569
557 let requestFilterExceptionDomains = []; 570 let requestFilterExceptionDomains = [];
558 for (let filter of this.genericblockExceptions) 571 for (let filter of this.genericblockExceptions)
559 { 572 {
560 let parsed = parseFilterRegexpSource(filter.regexpSource); 573 let parsed = parseFilterRegexpSource(filter.regexpSource);
561 if (parsed.hostname) 574 if (parsed.hostname)
562 requestFilterExceptionDomains.push(parsed.hostname); 575 requestFilterExceptionDomains.push(parsed.hostname);
563 } 576 }
564 577
565 for (let filter of this.requestFilters) 578 for (let filter of this.requestFilters)
566 { 579 {
567 convertFilterAddRules(rules, filter, "block", true, 580 convertFilterAddRules(rules, filter, "block", true,
568 requestFilterExceptionDomains); 581 requestFilterExceptionDomains);
569 } 582 }
570 583
571 for (let filter of this.requestExceptions) 584 for (let filter of this.requestExceptions)
572 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 585 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
573 586
574 return rules.filter(rule => !hasNonASCI(rule)); 587 return rules;
575 }; 588 };
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld