Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29340694: Issue 3956 - Convert domain whitelisting filters (Closed)
Patch Set: Fix whitelisting request type logic Created May 17, 2016, 11:22 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2016 Eyeo GmbH 3 * Copyright (C) 2006-2016 Eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
16 */ 16 */
17 17
18 /** @module abp2blocklist */ 18 /** @module abp2blocklist */
19 19
20 "use strict"; 20 "use strict";
21 21
22 let filterClasses = require("filterClasses"); 22 let filterClasses = require("filterClasses");
23 let tldjs = require("tldjs"); 23 let tldjs = require("tldjs");
24 let punycode = require("punycode"); 24 let punycode = require("punycode");
25 25
26 const selectorLimit = 5000; 26 const selectorLimit = 5000;
27 const typeMap = filterClasses.RegExpFilter.typeMap; 27 const typeMap = filterClasses.RegExpFilter.typeMap;
28 const whitelistableRequestTypes = (typeMap.IMAGE
29 | typeMap.STYLESHEET
30 | typeMap.SCRIPT
31 | typeMap.FONT
32 | typeMap.MEDIA
33 | typeMap.POPUP
34 | typeMap.OBJECT
35 | typeMap.OBJECT_SUBREQUEST
36 | typeMap.XMLHTTPREQUEST
37 | typeMap.PING
38 | typeMap.SUBDOCUMENT
39 | typeMap.OTHER);
28 40
29 function parseDomains(domains, included, excluded) 41 function parseDomains(domains, included, excluded)
30 { 42 {
31 for (let domain in domains) 43 for (let domain in domains)
32 { 44 {
33 if (domain != "") 45 if (domain != "")
34 { 46 {
35 let enabled = domains[domain]; 47 let enabled = domains[domain];
36 domain = punycode.toASCII(domain.toLowerCase()); 48 domain = punycode.toASCII(domain.toLowerCase());
37 49
(...skipping 21 matching lines...) Expand all
59 let excluded = []; 71 let excluded = [];
60 let rules = []; 72 let rules = [];
61 73
62 parseDomains(filter.domains, included, excluded); 74 parseDomains(filter.domains, included, excluded);
63 75
64 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions)) 76 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))
65 return {matchDomains: included.map(matchDomain), selector: filter.selector}; 77 return {matchDomains: included.map(matchDomain), selector: filter.selector};
66 } 78 }
67 79
68 /** 80 /**
69 * Convert the given filter "regexpSource" string into a regular expression, 81 * Parse the given filter "regexpSource" string, producing a regular expression,
70 * handling the conversion of unicode inside hostnames to punycode. 82 * extracting the hostname (if any), deciding if the regular expression is safe
71 * (Also deciding if the regular expression can be safely converted to and 83 * to be converted to and matched as lower case, and noting if the source contains
72 * matched as lower case or not.) 84 * anything after the hostname.
73 * 85 *
74 * @param {string} text regexpSource property of a filter 86 * @param {string} text regexpSource property of a filter
75 * @returns {object} An object containing a regular expression string and a bool 87 * @returns {object} An object containing a regular expression string, a bool
76 * indicating if the filter can be safely matched as lower 88 * indicating if the filter can be safely matched as lower
77 * case: {regexp: "...", canSafelyMatchAsLowercase: true/false } 89 * case, a hostname string (or undefined) and a bool
90 * indicating if the source only contains a hostname or not:
91 * {regexp: "...",
92 * canSafelyMatchAsLowercase: true/false,
93 * hostname: "...",
94 * justHostname: true/false}
78 */ 95 */
79 function toRegExp(text) 96 function parseFilterRegexpSource(text)
80 { 97 {
81 let result = []; 98 let regexp = [];
82 let lastIndex = text.length - 1; 99 let lastIndex = text.length - 1;
100 let hostname;
83 let hostnameStart = null; 101 let hostnameStart = null;
84 let hostnameFinished = false; 102 let hostnameFinished = false;
103 let justHostname = false;
85 let canSafelyMatchAsLowercase = false; 104 let canSafelyMatchAsLowercase = false;
86 105
87 for (let i = 0; i < text.length; i++) 106 for (let i = 0; i < text.length; i++)
88 { 107 {
89 let c = text[i]; 108 let c = text[i];
90 109
110 if (hostnameFinished)
111 justHostname = false;
112
91 // If we're currently inside the hostname we have to be careful not to 113 // If we're currently inside the hostname we have to be careful not to
92 // escape any characters until after we have converted it to punycode. 114 // escape any characters until after we have converted it to punycode.
93 if (hostnameStart != null && !hostnameFinished) 115 if (hostnameStart != null && !hostnameFinished)
94 { 116 {
95 let endingChar = (c == "*" || c == "^" || 117 let endingChar = (c == "*" || c == "^" ||
96 c == "?" || c == "/" || c == "|"); 118 c == "?" || c == "/" || c == "|");
97 if (!endingChar && i != lastIndex) 119 if (!endingChar && i != lastIndex)
98 continue; 120 continue;
99 121
100 let hostname = text.substring(hostnameStart, endingChar ? i : i + 1); 122 hostname = punycode.toASCII(
101 hostnameFinished = true; 123 text.substring(hostnameStart, endingChar ? i : i + 1)
102 result.push(escapeRegExp(punycode.toASCII(hostname))); 124 );
125 hostnameFinished = justHostname = true;
126 regexp.push(escapeRegExp(hostname));
103 if (!endingChar) 127 if (!endingChar)
104 break; 128 break;
105 } 129 }
106 130
107 switch (c) 131 switch (c)
108 { 132 {
109 case "*": 133 case "*":
110 if (result.length > 0 && i < lastIndex && text[i + 1] != "*") 134 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*")
111 result.push(".*"); 135 regexp.push(".*");
112 break; 136 break;
113 case "^": 137 case "^":
114 if (i < lastIndex) 138 if (i < lastIndex)
115 result.push("."); 139 regexp.push(".");
116 break; 140 break;
117 case "|": 141 case "|":
118 if (i == 0) 142 if (i == 0)
119 { 143 {
120 result.push("^"); 144 regexp.push("^");
121 break; 145 break;
122 } 146 }
123 if (i == lastIndex) 147 if (i == lastIndex)
124 { 148 {
125 result.push("$"); 149 regexp.push("$");
126 break; 150 break;
127 } 151 }
128 if (i == 1 && text[0] == "|") 152 if (i == 1 && text[0] == "|")
129 { 153 {
130 hostnameStart = i + 1; 154 hostnameStart = i + 1;
131 canSafelyMatchAsLowercase = true; 155 canSafelyMatchAsLowercase = true;
132 result.push("https?://"); 156 regexp.push("https?://");
133 break; 157 break;
134 } 158 }
135 result.push("\\|"); 159 regexp.push("\\|");
136 break; 160 break;
137 case "/": 161 case "/":
138 if (!hostnameFinished && 162 if (!hostnameFinished &&
139 text.charAt(i-2) == ":" && text.charAt(i-1) == "/") 163 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")
140 { 164 {
141 hostnameStart = i + 1; 165 hostnameStart = i + 1;
142 canSafelyMatchAsLowercase = true; 166 canSafelyMatchAsLowercase = true;
143 } 167 }
144 result.push("/"); 168 regexp.push("/");
145 break; 169 break;
146 case ".": case "+": case "$": case "?": 170 case ".": case "+": case "$": case "?":
147 case "{": case "}": case "(": case ")": 171 case "{": case "}": case "(": case ")":
148 case "[": case "]": case "\\": 172 case "[": case "]": case "\\":
149 result.push("\\", c); 173 regexp.push("\\", c);
150 break; 174 break;
151 default: 175 default:
152 if (hostnameFinished && (c >= "a" && c <= "z" || 176 if (hostnameFinished && (c >= "a" && c <= "z" ||
153 c >= "A" && c <= "Z")) 177 c >= "A" && c <= "Z"))
154 canSafelyMatchAsLowercase = false; 178 canSafelyMatchAsLowercase = false;
155 result.push(c); 179 regexp.push(c);
156 } 180 }
157 } 181 }
158 182
159 return {regexp: result.join(""), 183 return {
160 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase}; 184 regexp: regexp.join(""),
161 } 185 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,
162 186 hostname: hostname,
163 function getRegExpTrigger(filter) 187 justHostname: justHostname
164 { 188 };
165 let result = toRegExp(filter.regexpSource);
166
167 let trigger = {"url-filter": result.regexp};
168
169 // Limit rules to HTTP(S) URLs
170 if (!/^(\^|http)/i.test(trigger["url-filter"]))
171 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"];
172
173 // For rules containing only a hostname we know that we're matching against
174 // a lowercase string unless the matchCase option was passed.
175 if (result.canSafelyMatchAsLowercase && !filter.matchCase)
176 trigger["url-filter"] = trigger["url-filter"].toLowerCase();
177
178 if (result.canSafelyMatchAsLowercase || filter.matchCase)
179 trigger["url-filter-is-case-sensitive"] = true;
180
181 return trigger;
182 } 189 }
183 190
184 function getResourceTypes(filter) 191 function getResourceTypes(filter)
185 { 192 {
186 let types = []; 193 let types = [];
187 194
188 if (filter.contentType & typeMap.IMAGE) 195 if (filter.contentType & typeMap.IMAGE)
189 types.push("image"); 196 types.push("image");
190 if (filter.contentType & typeMap.STYLESHEET) 197 if (filter.contentType & typeMap.STYLESHEET)
191 types.push("style-sheet"); 198 types.push("style-sheet");
(...skipping 24 matching lines...) Expand all
216 { 223 {
217 result.push(domain); 224 result.push(domain);
218 225
219 if (tldjs.getDomain(domain) == domain) 226 if (tldjs.getDomain(domain) == domain)
220 result.push("www." + domain); 227 result.push("www." + domain);
221 } 228 }
222 229
223 return result; 230 return result;
224 } 231 }
225 232
226 function convertFilter(filter, action, withResourceTypes) 233 function convertFilterAddRules(rules, filter, action, withResourceTypes)
227 { 234 {
228 let trigger = getRegExpTrigger(filter); 235 let parsed = parseFilterRegexpSource(filter.regexpSource);
236
237 // For the special case of $document whitelisting filters with just a domain
238 // we can generate an equivalent blocking rule exception using if-domain.
239 if (filter instanceof filterClasses.WhitelistFilter &&
240 filter.contentType & typeMap.DOCUMENT &&
241 parsed.justHostname)
242 {
243 rules.push({
244 trigger: {
245 "url-filter": ".*",
246 "if-domain": addDomainPrefix([parsed.hostname])
247 },
248 action: {type: "ignore-previous-rules"}
249 });
250 // If the filter contains other supported options we'll need to generate
251 // further rules for it, but if not we can simply return now.
252 if (!(filter.contentType | whitelistableRequestTypes))
253 return;
254 }
255
256 let trigger = {"url-filter": parsed.regexp};
257
258 // Limit rules to HTTP(S) URLs
259 if (!/^(\^|http)/i.test(trigger["url-filter"]))
260 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"];
261
262 // For rules containing only a hostname we know that we're matching against
263 // a lowercase string unless the matchCase option was passed.
264 if (parsed.canSafelyMatchAsLowercase && !filter.matchCase)
265 trigger["url-filter"] = trigger["url-filter"].toLowerCase();
266
267 if (parsed.canSafelyMatchAsLowercase || filter.matchCase)
268 trigger["url-filter-is-case-sensitive"] = true;
269
229 let included = []; 270 let included = [];
230 let excluded = []; 271 let excluded = [];
231 272
232 parseDomains(filter.domains, included, excluded); 273 parseDomains(filter.domains, included, excluded);
233 274
234 if (withResourceTypes) 275 if (withResourceTypes)
235 trigger["resource-type"] = getResourceTypes(filter); 276 trigger["resource-type"] = getResourceTypes(filter);
236 if (filter.thirdParty != null) 277 if (filter.thirdParty != null)
237 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"]; 278 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];
238 279
239 if (included.length > 0) 280 if (included.length > 0)
240 trigger["if-domain"] = addDomainPrefix(included); 281 trigger["if-domain"] = addDomainPrefix(included);
241 else if (excluded.length > 0) 282 else if (excluded.length > 0)
242 trigger["unless-domain"] = addDomainPrefix(excluded); 283 trigger["unless-domain"] = addDomainPrefix(excluded);
243 284
244 return {trigger: trigger, action: {type: action}}; 285 rules.push({trigger: trigger, action: {type: action}});
245 } 286 }
246 287
247 function hasNonASCI(obj) 288 function hasNonASCI(obj)
248 { 289 {
249 if (typeof obj == "string") 290 if (typeof obj == "string")
250 { 291 {
251 if (/[^\x00-\x7F]/.test(obj)) 292 if (/[^\x00-\x7F]/.test(obj))
252 return true; 293 return true;
253 } 294 }
254 295
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
345 return; 386 return;
346 if (filter instanceof filterClasses.RegExpFilter && 387 if (filter instanceof filterClasses.RegExpFilter &&
347 filter.regexpSource == null) 388 filter.regexpSource == null)
348 return; 389 return;
349 390
350 if (filter instanceof filterClasses.BlockingFilter) 391 if (filter instanceof filterClasses.BlockingFilter)
351 this.requestFilters.push(filter); 392 this.requestFilters.push(filter);
352 393
353 if (filter instanceof filterClasses.WhitelistFilter) 394 if (filter instanceof filterClasses.WhitelistFilter)
354 { 395 {
355 if (filter.contentType & (typeMap.IMAGE 396 if (filter.contentType & (typeMap.DOCUMENT | whitelistableRequestTypes))
356 | typeMap.STYLESHEET
357 | typeMap.SCRIPT
358 | typeMap.FONT
359 | typeMap.MEDIA
360 | typeMap.POPUP
361 | typeMap.OBJECT
362 | typeMap.OBJECT_SUBREQUEST
363 | typeMap.XMLHTTPREQUEST
364 | typeMap.PING
365 | typeMap.SUBDOCUMENT
366 | typeMap.OTHER))
367 this.requestExceptions.push(filter); 397 this.requestExceptions.push(filter);
368 398
369 if (filter.contentType & typeMap.ELEMHIDE) 399 if (filter.contentType & typeMap.ELEMHIDE)
370 this.elemhideExceptions.push(filter); 400 this.elemhideExceptions.push(filter);
371 } 401 }
372 402
373 if (filter instanceof filterClasses.ElemHideFilter) 403 if (filter instanceof filterClasses.ElemHideFilter)
374 this.elemhideFilters.push(filter); 404 this.elemhideFilters.push(filter);
375 405
376 if (filter instanceof filterClasses.ElemHideException) 406 if (filter instanceof filterClasses.ElemHideException)
377 { 407 {
378 let domains = this.elemhideSelectorExceptions[filter.selector]; 408 let domains = this.elemhideSelectorExceptions[filter.selector];
379 if (!domains) 409 if (!domains)
380 domains = this.elemhideSelectorExceptions[filter.selector] = []; 410 domains = this.elemhideSelectorExceptions[filter.selector] = [];
381 411
382 parseDomains(filter.domains, domains, []); 412 parseDomains(filter.domains, domains, []);
383 } 413 }
384 }; 414 };
385 415
386 /** 416 /**
387 * Generate content blocker list for all filters that were added 417 * Generate content blocker list for all filters that were added
388 * 418 *
389 * @returns {Object[]} The generated content blocker rules 419 * @returns {Object[]} The generated content blocker rules
390 */ 420 */
391 ContentBlockerList.prototype.generateRules = function(filter) 421 ContentBlockerList.prototype.generateRules = function(filter)
392 { 422 {
393 let rules = []; 423 let rules = [];
394 424
395 function addRule(rule)
396 {
397 if (!hasNonASCI(rule))
398 rules.push(rule);
399 }
400
401 let groupedElemhideFilters = new Map(); 425 let groupedElemhideFilters = new Map();
402 for (let filter of this.elemhideFilters) 426 for (let filter of this.elemhideFilters)
403 { 427 {
404 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 428 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
405 if (!result) 429 if (!result)
406 continue; 430 continue;
407 431
408 if (result.matchDomains.length == 0) 432 if (result.matchDomains.length == 0)
409 result.matchDomains = ["^https?://"]; 433 result.matchDomains = ["^https?://"];
410 434
411 for (let matchDomain of result.matchDomains) 435 for (let matchDomain of result.matchDomains)
412 { 436 {
413 let group = groupedElemhideFilters.get(matchDomain) || []; 437 let group = groupedElemhideFilters.get(matchDomain) || [];
414 group.push(result.selector); 438 group.push(result.selector);
415 groupedElemhideFilters.set(matchDomain, group); 439 groupedElemhideFilters.set(matchDomain, group);
416 } 440 }
417 } 441 }
418 442
419 groupedElemhideFilters.forEach((selectors, matchDomain) => 443 groupedElemhideFilters.forEach((selectors, matchDomain) =>
420 { 444 {
421 while (selectors.length) 445 while (selectors.length)
422 { 446 {
423 let selector = selectors.splice(0, selectorLimit).join(", "); 447 let selector = selectors.splice(0, selectorLimit).join(", ");
424 448
425 // As of Safari 9.0 element IDs are matched as lowercase. We work around 449 // As of Safari 9.0 element IDs are matched as lowercase. We work around
426 // this by converting to the attribute format [id="elementID"] 450 // this by converting to the attribute format [id="elementID"]
427 selector = convertIDSelectorsToAttributeSelectors(selector); 451 selector = convertIDSelectorsToAttributeSelectors(selector);
428 452
429 addRule({ 453 rules.push({
430 trigger: {"url-filter": matchDomain, 454 trigger: {"url-filter": matchDomain,
431 "url-filter-is-case-sensitive": true}, 455 "url-filter-is-case-sensitive": true},
432 action: {type: "css-display-none", 456 action: {type: "css-display-none",
433 selector: selector} 457 selector: selector}
434 }); 458 });
435 } 459 }
436 }); 460 });
437 461
438 for (let filter of this.elemhideExceptions) 462 for (let filter of this.elemhideExceptions)
439 addRule(convertFilter(filter, "ignore-previous-rules", false)); 463 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
440 for (let filter of this.requestFilters) 464 for (let filter of this.requestFilters)
441 addRule(convertFilter(filter, "block", true)); 465 convertFilterAddRules(rules, filter, "block", true);
442 for (let filter of this.requestExceptions) 466 for (let filter of this.requestExceptions)
443 addRule(convertFilter(filter, "ignore-previous-rules", true)); 467 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
444 468
445 return rules; 469 return rules.filter(rule => !hasNonASCI(rule));
446 }; 470 };
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld