Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29467595: Issue 5325 - Add support for separator characters (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Left Patch Set: Created June 16, 2017, 5:25 p.m.
Right Patch Set: Rebase Created July 12, 2017, 12:45 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | test/abp2blocklist.js » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
69 let subdomains = []; 69 let subdomains = [];
70 let suffixLength = domain.length + 1; 70 let suffixLength = domain.length + 1;
71 71
72 for (let name of list) 72 for (let name of list)
73 { 73 {
74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain) 74 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)
75 subdomains.push(name.slice(0, -suffixLength)); 75 subdomains.push(name.slice(0, -suffixLength));
76 } 76 }
77 77
78 return subdomains; 78 return subdomains;
79 }
80
81 function extractFilterDomains(filters)
82 {
83 let domains = new Set();
84 for (let filter of filters)
85 {
86 let parsed = parseFilterRegexpSource(filter.regexpSource);
87 if (parsed.justHostname)
88 domains.add(parsed.hostname);
89 }
90 return domains;
79 } 91 }
80 92
81 function convertElemHideFilter(filter, elemhideSelectorExceptions) 93 function convertElemHideFilter(filter, elemhideSelectorExceptions)
82 { 94 {
83 let included = []; 95 let included = [];
84 let excluded = []; 96 let excluded = [];
85 let rules = []; 97 let rules = [];
86 98
87 parseDomains(filter.domains, included, excluded); 99 parseDomains(filter.domains, included, excluded);
88 100
(...skipping 13 matching lines...) Expand all
102 * case, a hostname string (or undefined) and a bool 114 * case, a hostname string (or undefined) and a bool
103 * indicating if the source only contains a hostname or not: 115 * indicating if the source only contains a hostname or not:
104 * {regexp: "...", 116 * {regexp: "...",
105 * canSafelyMatchAsLowercase: true/false, 117 * canSafelyMatchAsLowercase: true/false,
106 * hostname: "...", 118 * hostname: "...",
107 * justHostname: true/false} 119 * justHostname: true/false}
108 */ 120 */
109 function parseFilterRegexpSource(text) 121 function parseFilterRegexpSource(text)
110 { 122 {
111 let regexp = []; 123 let regexp = [];
112 let lastIndex = text.length - 1; 124
125 // Convert the text into an array of Unicode characters.
126 //
127 // In the case of surrogate pairs (the smiley emoji, for example), one
128 // Unicode code point is represented by two JavaScript characters together.
129 // We want to iterate over Unicode code points rather than JavaScript
130 // characters.
131 let characters = Array.from(text);
132
133 let lastIndex = characters.length - 1;
113 let hostname; 134 let hostname;
114 let hostnameStart = null; 135 let hostnameStart = null;
115 let hostnameFinished = false; 136 let hostnameFinished = false;
116 let justHostname = false; 137 let justHostname = false;
117 let canSafelyMatchAsLowercase = false; 138 let canSafelyMatchAsLowercase = false;
118 139
119 for (let i = 0; i < text.length; i++) 140 for (let i = 0; i < characters.length; i++)
120 { 141 {
121 let c = text[i]; 142 let c = characters[i];
122 143
123 if (hostnameFinished) 144 if (hostnameFinished)
124 justHostname = false; 145 justHostname = false;
125 146
126 // If we're currently inside the hostname we have to be careful not to 147 // If we're currently inside the hostname we have to be careful not to
127 // escape any characters until after we have converted it to punycode. 148 // escape any characters until after we have converted it to punycode.
128 if (hostnameStart != null && !hostnameFinished) 149 if (hostnameStart != null && !hostnameFinished)
129 { 150 {
130 let endingChar = (c == "*" || c == "^" || 151 let endingChar = (c == "*" || c == "^" ||
131 c == "?" || c == "/" || c == "|"); 152 c == "?" || c == "/" || c == "|");
132 if (!endingChar && i != lastIndex) 153 if (!endingChar && i != lastIndex)
133 continue; 154 continue;
134 155
135 hostname = punycode.toASCII( 156 hostname = punycode.toASCII(
136 text.substring(hostnameStart, endingChar ? i : i + 1) 157 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")
158 .toLowerCase()
137 ); 159 );
138 hostnameFinished = justHostname = true; 160 hostnameFinished = justHostname = true;
139 regexp.push(escapeRegExp(hostname)); 161 regexp.push(escapeRegExp(hostname));
140 if (!endingChar) 162 if (!endingChar)
141 break; 163 break;
142 } 164 }
143 165
144 switch (c) 166 switch (c)
145 { 167 {
146 case "*": 168 case "*":
147 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*") 169 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")
148 regexp.push(".*"); 170 regexp.push(".*");
149 break; 171 break;
150 case "^": 172 case "^":
151 if (i < lastIndex) 173 let alphabet = "a-z";
152 regexp.push("[^.%A-Za-z0-9_]"); 174 // If justHostname is true and we've encountered a "^", it means we're
175 // still in the hostname part of the URL. Since hostnames are always
176 // lower case (Punycode), there's no need to include "A-Z" in the
177 // pattern. Further, subsequent code may lower-case the entire regular
178 // expression (if the URL contains only the hostname part), leaving us
179 // with "a-za-z", which would be redundant.
180 if (!justHostname)
181 alphabet = "A-Z" + alphabet;
182 let digits = "0-9";
183 // Note that the "-" must appear first here in order to retain its
184 // literal meaning within the brackets.
185 let specialCharacters = "-_.%";
186 let separator = "[^" + specialCharacters + alphabet + digits + "]";
187 if (i == 0)
188 regexp.push("^https?://(.*" + separator + ")?");
189 else if (i == lastIndex)
190 regexp.push("(" + separator + ".*)?$");
153 else 191 else
154 regexp.push("([^.%A-Za-z0-9_].*)?$"); 192 regexp.push(separator);
Sebastian Noack 2017/06/16 21:13:18 Can you put the duplicated part of the regexp in a… [comment truncated]
Manish Jethani 2017/06/19 10:39:54 Done.
155 canSafelyMatchAsLowercase = false;
Sebastian Noack 2017/06/16 21:13:18 Why is that necessary?
Manish Jethani 2017/06/19 10:39:54 It was converting "A-Z" into "a-z". I thought about… [comment truncated]
156 break; 193 break;
157 case "|": 194 case "|":
158 if (i == 0) 195 if (i == 0)
159 { 196 {
160 regexp.push("^"); 197 regexp.push("^");
161 break; 198 break;
162 } 199 }
163 if (i == lastIndex) 200 if (i == lastIndex)
164 { 201 {
165 regexp.push("$"); 202 regexp.push("$");
166 break; 203 break;
167 } 204 }
168 if (i == 1 && text[0] == "|") 205 if (i == 1 && characters[0] == "|")
169 { 206 {
170 hostnameStart = i + 1; 207 hostnameStart = i + 1;
171 canSafelyMatchAsLowercase = true; 208 canSafelyMatchAsLowercase = true;
172 regexp.push("https?://([^/]+\\.)?"); 209 regexp.push("https?://([^/]+\\.)?");
173 break; 210 break;
174 } 211 }
175 regexp.push("\\|"); 212 regexp.push("\\|");
176 break; 213 break;
177 case "/": 214 case "/":
178 if (!hostnameFinished && 215 if (!hostnameFinished &&
179 text.charAt(i-2) == ":" && text.charAt(i-1) == "/") 216 characters[i - 2] == ":" && characters[i - 1] == "/")
180 { 217 {
181 hostnameStart = i + 1; 218 hostnameStart = i + 1;
182 canSafelyMatchAsLowercase = true; 219 canSafelyMatchAsLowercase = true;
183 } 220 }
184 regexp.push("/"); 221 regexp.push("/");
185 break; 222 break;
186 case ".": case "+": case "$": case "?": 223 case ".": case "+": case "$": case "?":
187 case "{": case "}": case "(": case ")": 224 case "{": case "}": case "(": case ")":
188 case "[": case "]": case "\\": 225 case "[": case "]": case "\\":
189 regexp.push("\\", c); 226 regexp.push("\\", c);
190 break; 227 break;
191 default: 228 default:
192 if (hostnameFinished && (c >= "a" && c <= "z" || 229 if (hostnameFinished && (c >= "a" && c <= "z" ||
193 c >= "A" && c <= "Z")) 230 c >= "A" && c <= "Z"))
194 canSafelyMatchAsLowercase = false; 231 canSafelyMatchAsLowercase = false;
195 regexp.push(c); 232 regexp.push(c == "%" ? c : encodeURI(c));
196 } 233 }
197 } 234 }
198 235
199 return { 236 return {
200 regexp: regexp.join(""), 237 regexp: regexp.join(""),
201 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase, 238 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,
202 hostname: hostname, 239 hostname: hostname,
203 justHostname: justHostname 240 justHostname: justHostname
204 }; 241 };
205 } 242 }
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
272 let included = []; 309 let included = [];
273 let excluded = []; 310 let excluded = [];
274 311
275 parseDomains(filter.domains, included, excluded); 312 parseDomains(filter.domains, included, excluded);
276 313
277 if (exceptionDomains) 314 if (exceptionDomains)
278 excluded = excluded.concat(exceptionDomains); 315 excluded = excluded.concat(exceptionDomains);
279 316
280 if (withResourceTypes) 317 if (withResourceTypes)
281 { 318 {
282 trigger["resource-type"] = getResourceTypes(filter); 319 let resourceTypes = getResourceTypes(filter);
283 320
284 if (trigger["resource-type"].length == 0) 321 // Content blocker rules can't differentiate between sub-document requests
322 // (iframes) and top-level document requests. To avoid too many false
323 // positives, we prevent rules with no hostname part from blocking document
324 // requests.
325 //
326 // Once Safari 11 becomes our minimum supported version, we could change
327 // our approach here to use the new "unless-top-url" property instead.
328 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)
329 resourceTypes = resourceTypes.filter(type => type != "document");
330
331 if (resourceTypes.length == 0)
285 return; 332 return;
333
334 trigger["resource-type"] = resourceTypes;
286 } 335 }
287 336
288 if (filter.thirdParty != null) 337 if (filter.thirdParty != null)
289 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"]; 338 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];
290 339
291 if (included.length > 0) 340 if (included.length > 0)
292 { 341 {
293 trigger["if-domain"] = []; 342 trigger["if-domain"] = [];
294 343
295 for (let name of included) 344 for (let name of included)
(...skipping 15 matching lines...) Expand all
311 { 360 {
312 trigger["if-domain"].push("*" + name); 361 trigger["if-domain"].push("*" + name);
313 } 362 }
314 } 363 }
315 } 364 }
316 else if (excluded.length > 0) 365 else if (excluded.length > 0)
317 { 366 {
318 trigger["unless-domain"] = excluded.map(name => "*" + name); 367 trigger["unless-domain"] = excluded.map(name => "*" + name);
319 } 368 }
320 else if (filter instanceof filterClasses.BlockingFilter && 369 else if (filter instanceof filterClasses.BlockingFilter &&
321 filter.contentType & typeMap.SUBDOCUMENT) 370 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)
322 { 371 {
372 // Rules with a hostname part are still allowed to block document requests,
373 // but we add an exception for top-level documents.
374 //
375 // Note that we can only do this if there's no "unless-domain" property for
376 // now. This also only works in Safari 11 onwards, while older versions
377 // simply ignore this property. Once Safari 11 becomes our minimum
378 // supported version, we can merge "unless-domain" into "unless-top-url".
323 trigger["unless-top-url"] = [trigger["url-filter"]]; 379 trigger["unless-top-url"] = [trigger["url-filter"]];
324 if (trigger["url-filter-is-case-sensitive"]) 380 if (trigger["url-filter-is-case-sensitive"])
325 trigger["top-url-filter-is-case-sensitive"] = true; 381 trigger["top-url-filter-is-case-sensitive"] = true;
326 } 382 }
327 383
328 rules.push({trigger: trigger, action: {type: action}}); 384 rules.push({trigger: trigger, action: {type: action}});
329 }
330
331 function hasNonASCI(obj)
332 {
333 if (typeof obj == "string")
334 {
335 if (/[^\x00-\x7F]/.test(obj))
336 return true;
337 }
338
339 if (typeof obj == "object")
340 {
341 if (obj instanceof Array)
342 for (let item of obj)
343 if (hasNonASCI(item))
344 return true;
345
346 let names = Object.getOwnPropertyNames(obj);
347 for (let name of names)
348 if (hasNonASCI(obj[name]))
349 return true;
350 }
351
352 return false;
353 } 385 }
354 386
355 function convertIDSelectorsToAttributeSelectors(selector) 387 function convertIDSelectorsToAttributeSelectors(selector)
356 { 388 {
357 // First we figure out where all the IDs are 389 // First we figure out where all the IDs are
358 let sep = ""; 390 let sep = "";
359 let start = null; 391 let start = null;
360 let positions = []; 392 let positions = [];
361 for (let i = 0; i < selector.length; i++) 393 for (let i = 0; i < selector.length; i++)
362 { 394 {
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
396 { 428 {
397 newSelector.push(selector.substring(i, pos.start)); 429 newSelector.push(selector.substring(i, pos.start));
398 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 430 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
399 i = pos.end; 431 i = pos.end;
400 } 432 }
401 newSelector.push(selector.substring(i)); 433 newSelector.push(selector.substring(i));
402 434
403 return newSelector.join(""); 435 return newSelector.join("");
404 } 436 }
405 437
406 function addCSSRules(rules, selectors, matchDomain) 438 function addCSSRules(rules, selectors, matchDomain, exceptionDomains)
407 { 439 {
440 let unlessDomain = exceptionDomains.size > 0 ? [] : null;
441
442 exceptionDomains.forEach(name => unlessDomain.push("*" + name));
443
408 while (selectors.length) 444 while (selectors.length)
409 { 445 {
410 let selector = selectors.splice(0, selectorLimit).join(", "); 446 let selector = selectors.splice(0, selectorLimit).join(", ");
411 447
412 // As of Safari 9.0 element IDs are matched as lowercase. We work around 448 // As of Safari 9.0 element IDs are matched as lowercase. We work around
413 // this by converting to the attribute format [id="elementID"] 449 // this by converting to the attribute format [id="elementID"]
414 selector = convertIDSelectorsToAttributeSelectors(selector); 450 selector = convertIDSelectorsToAttributeSelectors(selector);
415 451
416 rules.push({ 452 let rule = {
417 trigger: {"url-filter": matchDomain, 453 trigger: {"url-filter": matchDomain,
418 "url-filter-is-case-sensitive": true}, 454 "url-filter-is-case-sensitive": true},
419 action: {type: "css-display-none", 455 action: {type: "css-display-none",
420 selector: selector} 456 selector: selector}
421 }); 457 };
458
459 if (unlessDomain)
460 rule.trigger["unless-domain"] = unlessDomain;
461
462 rules.push(rule);
422 } 463 }
423 } 464 }
424 465
425 let ContentBlockerList = 466 let ContentBlockerList =
426 /** 467 /**
427 * Create a new Adblock Plus filter to content blocker list converter 468 * Create a new Adblock Plus filter to content blocker list converter
428 * 469 *
429 * @constructor 470 * @constructor
430 */ 471 */
431 exports.ContentBlockerList = function () 472 exports.ContentBlockerList = function ()
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
508 { 549 {
509 for (let matchDomain of result.matchDomains) 550 for (let matchDomain of result.matchDomains)
510 { 551 {
511 let group = groupedElemhideFilters.get(matchDomain) || []; 552 let group = groupedElemhideFilters.get(matchDomain) || [];
512 group.push(result.selector); 553 group.push(result.selector);
513 groupedElemhideFilters.set(matchDomain, group); 554 groupedElemhideFilters.set(matchDomain, group);
514 } 555 }
515 } 556 }
516 } 557 }
517 558
518 addCSSRules(rules, genericSelectors, "^https?://"); 559 // Separate out the element hiding exceptions that have only a hostname part
519 560 // from the rest. This allows us to implement a workaround for issue #5345
520 // Right after the generic element hiding filters, add the exceptions that 561 // (WebKit bug #167423), but as a bonus it also reduces the number of
521 // should apply only to those filters. 562 // generated rules. The downside is that the exception will only apply to the
522 for (let filter of this.generichideExceptions) 563 // top-level document, not to iframes. We have to live with this until the
523 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 564 // WebKit bug is fixed in all supported versions of Safari.
565 // https://bugs.webkit.org/show_bug.cgi?id=167423
566 //
567 // Note that as a result of this workaround we end up with a huge rule set in
568 // terms of the amount of memory used. This can cause Node.js to throw
569 // "JavaScript heap out of memory". To avoid this, call Node.js with
570 // --max_old_space_size=4096
571 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);
572
573 let genericSelectorExceptionDomains =
574 extractFilterDomains(this.generichideExceptions);
575 elemhideExceptionDomains.forEach(name =>
576 {
577 genericSelectorExceptionDomains.add(name);
578 });
579
580 addCSSRules(rules, genericSelectors, "^https?://",
581 genericSelectorExceptionDomains);
524 582
525 groupedElemhideFilters.forEach((selectors, matchDomain) => 583 groupedElemhideFilters.forEach((selectors, matchDomain) =>
526 { 584 {
527 addCSSRules(rules, selectors, matchDomain); 585 addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains);
528 }); 586 });
529
530 for (let filter of this.elemhideExceptions)
531 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
532 587
533 let requestFilterExceptionDomains = []; 588 let requestFilterExceptionDomains = [];
534 for (let filter of this.genericblockExceptions) 589 for (let filter of this.genericblockExceptions)
535 { 590 {
536 let parsed = parseFilterRegexpSource(filter.regexpSource); 591 let parsed = parseFilterRegexpSource(filter.regexpSource);
537 if (parsed.hostname) 592 if (parsed.hostname)
538 requestFilterExceptionDomains.push(parsed.hostname); 593 requestFilterExceptionDomains.push(parsed.hostname);
539 } 594 }
540 595
541 for (let filter of this.requestFilters) 596 for (let filter of this.requestFilters)
542 { 597 {
543 convertFilterAddRules(rules, filter, "block", true, 598 convertFilterAddRules(rules, filter, "block", true,
544 requestFilterExceptionDomains); 599 requestFilterExceptionDomains);
545 } 600 }
546 601
547 for (let filter of this.requestExceptions) 602 for (let filter of this.requestExceptions)
548 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 603 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
549 604
550 return rules.filter(rule => !hasNonASCI(rule)); 605 return rules;
551 }; 606 };
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld