Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Left Patch Set: Make it work in Safari Created May 5, 2017, 4:36 a.m.
Right Patch Set: Rebase Created July 28, 2017, 1:31 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
16 */ 16 */
17 17
18 /** @module abp2blocklist */ 18 /** @module abp2blocklist */
19 19
20 "use strict"; 20 "use strict";
21 21
22 let filterClasses = require("filterClasses"); 22 let filterClasses = require("filterClasses");
23 let tldjs = require("tldjs");
24 let punycode = require("punycode"); 23 let punycode = require("punycode");
25 24
26 const selectorLimit = 5000; 25 const selectorLimit = 5000;
27 const typeMap = filterClasses.RegExpFilter.typeMap; 26 const typeMap = filterClasses.RegExpFilter.typeMap;
28 const whitelistableRequestTypes = (typeMap.IMAGE 27
29 | typeMap.STYLESHEET 28 const httpRequestTypes = typeMap.IMAGE |
30 | typeMap.SCRIPT 29 typeMap.STYLESHEET |
31 | typeMap.FONT 30 typeMap.SCRIPT |
32 | typeMap.MEDIA 31 typeMap.FONT |
33 | typeMap.POPUP 32 typeMap.MEDIA |
34 | typeMap.OBJECT 33 typeMap.POPUP |
35 | typeMap.OBJECT_SUBREQUEST 34 typeMap.OBJECT |
36 | typeMap.XMLHTTPREQUEST 35 typeMap.OBJECT_SUBREQUEST |
37 | typeMap.PING 36 typeMap.XMLHTTPREQUEST |
38 | typeMap.SUBDOCUMENT 37 typeMap.PING |
39 | typeMap.OTHER); 38 typeMap.SUBDOCUMENT |
40 39 typeMap.OTHER;
41 function pearsonHash(message) 40 const rawRequestTypes = typeMap.XMLHTTPREQUEST |
42 { 41 typeMap.WEBSOCKET |
43 // This is an implementation of the Pearson hashing algorithm, where we 42 typeMap.WEBRTC |
44 // generate a 32-bit digest for a given message. 43 typeMap.OBJECT_SUBREQUEST |
45 44 typeMap.PING |
46 // Note that this code will only look at the lowest 8 bits of each character. 45 typeMap.OTHER;
47 // For best results, encode the input as UTF-8. 46 const whitelistableRequestTypes = httpRequestTypes |
48 47 typeMap.WEBSOCKET |
49 // A table of all numbers from 0 through 255 shuffled up. 48 typeMap.WEBRTC;
50 const table = [ 49
51 0xe5, 0x0a, 0x9f, 0x79, 0x99, 0xad, 0x10, 0x85, 0x5d, 0x55, 0x75, 0x2e, 50 function callLater(func)
52 0x04, 0x1a, 0xb5, 0x7d, 0x96, 0xe6, 0xa3, 0xc6, 0x82, 0x87, 0xb2, 0xef, 51 {
53 0x00, 0x64, 0x70, 0x4b, 0xe4, 0x2f, 0x37, 0x52, 0x90, 0x1d, 0x08, 0x68, 52 return new Promise(resolve =>
54 0x3a, 0x26, 0x74, 0xaa, 0xc1, 0x80, 0x17, 0xc2, 0x0f, 0xec, 0xc8, 0x1f, 53 {
55 0xe2, 0x3c, 0xe1, 0xa1, 0x8f, 0xfd, 0x9d, 0x0b, 0xd5, 0xcc, 0xd4, 0xd9, 54 let call = () => resolve(func());
56 0xf0, 0x3d, 0x5e, 0x57, 0xae, 0x12, 0x46, 0xb0, 0x63, 0x94, 0x61, 0x9a, 55
57 0xbb, 0x76, 0x0c, 0x3f, 0xc4, 0x59, 0xdc, 0x5c, 0xb4, 0xc7, 0x73, 0x39, 56 // If this looks like Node.js, call process.nextTick, otherwise call
58 0x65, 0x2c, 0x2a, 0xc3, 0xed, 0x20, 0x54, 0xfe, 0xfb, 0xd0, 0xa6, 0x33, 57 // setTimeout.
59 0x4a, 0x9e, 0xe7, 0x49, 0xea, 0x58, 0xaf, 0x35, 0x30, 0x95, 0x2b, 0x56, 58 if (typeof process != "undefined")
60 0x14, 0xff, 0xb1, 0xd6, 0x27, 0x6a, 0x88, 0x89, 0x43, 0x4c, 0xca, 0xb9, 59 process.nextTick(call);
61 0x21, 0x8a, 0x78, 0xf1, 0x18, 0x1e, 0xd1, 0xe0, 0x60, 0xf8, 0x3e, 0xdd, 60 else
62 0x25, 0x16, 0xde, 0xc0, 0x98, 0x28, 0x7a, 0x3b, 0x1b, 0x45, 0xa5, 0xb3, 61 setTimeout(call, 0);
63 0xe3, 0x84, 0xd3, 0xb8, 0xbd, 0x47, 0xe9, 0xfa, 0xc9, 0xb6, 0xbe, 0x6c, 62 });
64 0x9c, 0xac, 0xda, 0xfc, 0x41, 0x0d, 0x07, 0x91, 0x6b, 0x6f, 0x03, 0xcb, 63 }
65 0xbc, 0x8d, 0x06, 0x01, 0xd2, 0x8c, 0x19, 0x5a, 0x02, 0xba, 0x4f, 0x6e, 64
66 0x2d, 0xf4, 0xcf, 0x7e, 0x6d, 0x42, 0x93, 0x31, 0xbf, 0xdf, 0x5f, 0x67, 65 function async(callees, mapFunction)
67 0x53, 0xf6, 0x38, 0x9b, 0xa7, 0xe8, 0xee, 0x5b, 0x0e, 0x22, 0xdb, 0x51, 66 {
68 0x8e, 0x69, 0x97, 0x32, 0x36, 0xce, 0x77, 0x4d, 0xa4, 0xf2, 0x23, 0xc5, 67 if (!(Symbol.iterator in callees))
69 0x11, 0x05, 0xab, 0xf9, 0x13, 0xd8, 0x7b, 0xa8, 0x40, 0x66, 0xd7, 0x24, 68 callees = [callees];
70 0x86, 0xa0, 0xeb, 0xf3, 0x81, 0x4e, 0x50, 0xf7, 0xb7, 0x7f, 0x83, 0xa2, 69
71 0xa9, 0x09, 0xcd, 0x62, 0x7c, 0x92, 0x8b, 0x71, 0x44, 0xf5, 0x72, 0x1c, 70 let lastPause = Date.now();
72 0x29, 0x48, 0x34, 0x15 71 let index = 0;
73 ]; 72
74 73 let promise = Promise.resolve();
75 let digest = 0; 74
76 75 for (let next of callees)
77 for (let i = 0; i < 4; i++) 76 {
78 { 77 let currentIndex = index;
79 let d = table[(message.charCodeAt(0) + i) % 256]; 78
80 for (let j = 1; j < message.length; j++) 79 promise = promise.then(() =>
81 d = table[d ^ message.charCodeAt(j)]; 80 {
82 digest |= (d & 0xff) << i * 8; 81 if (mapFunction)
83 } 82 next = mapFunction(next, currentIndex);
84 83
85 return digest; 84 // If it has been 100ms or longer since the last call, take a pause. This
85 // keeps the browser from freezing up.
86 let now = Date.now();
87 if (now - lastPause >= 100)
88 {
89 lastPause = now;
90 return callLater(next);
91 }
92
93 return next();
94 });
95
96 index++;
97 }
98
99 return promise;
86 } 100 }
87 101
88 function parseDomains(domains, included, excluded) 102 function parseDomains(domains, included, excluded)
89 { 103 {
90 for (let domain in domains) 104 for (let domain in domains)
91 { 105 {
92 if (domain != "") 106 if (domain != "")
93 { 107 {
94 let enabled = domains[domain]; 108 let enabled = domains[domain];
95 domain = punycode.toASCII(domain.toLowerCase()); 109 domain = punycode.toASCII(domain.toLowerCase());
96 110
97 if (!enabled) 111 if (!enabled)
98 excluded.push(domain); 112 excluded.push(domain);
99 else if (!domains[""]) 113 else if (!domains[""])
100 included.push(domain); 114 included.push(domain);
101 } 115 }
102 } 116 }
103 } 117 }
104 118
105 function escapeRegExp(s) 119 function escapeRegExp(s)
106 { 120 {
107 return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); 121 return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
108 } 122 }
109 123
110 function matchDomain(domain) 124 function matchDomain(domain)
111 { 125 {
126 if (!domain)
127 return "^https?://";
128
112 return "^https?://([^/:]*\\.)?" + escapeRegExp(domain).toLowerCase() + "[/:]"; 129 return "^https?://([^/:]*\\.)?" + escapeRegExp(domain).toLowerCase() + "[/:]";
130 }
131
132 function getURLSchemes(contentType)
133 {
134 // If the given content type includes all supported URL schemes, simply
135 // return a single generic URL scheme pattern. This minimizes the size of the
136 // generated rule set. The downside to this is that it will also match
137 // schemes that we do not want to match (e.g. "ftp://"), but this can be
138 // mitigated by adding exceptions for those schemes.
139 if (contentType & typeMap.WEBSOCKET && contentType & typeMap.WEBRTC &&
140 contentType & httpRequestTypes)
141 return ["[^:]+:(//)?"];
142
143 let urlSchemes = [];
144
145 if (contentType & typeMap.WEBSOCKET)
146 urlSchemes.push("wss?://");
147
148 if (contentType & typeMap.WEBRTC)
149 urlSchemes.push("stuns?:", "turns?:");
150
151 if (contentType & httpRequestTypes)
152 urlSchemes.push("https?://");
153
154 return urlSchemes;
155 }
156
157 function findSubdomainsInList(domain, list)
158 {
159 let subdomains = [];
160 let suffixLength = domain.length + 1;
161
162 for (let name of list)
163 {
164 if (name.length > suffixLength && name.slice(-suffixLength) == "." + domain)
165 subdomains.push(name.slice(0, -suffixLength));
166 }
167
168 return subdomains;
169 }
170
171 function extractFilterDomains(filters)
172 {
173 let domains = new Set();
174 for (let filter of filters)
175 {
176 let parsed = parseFilterRegexpSource(filter.regexpSource);
177 if (parsed.justHostname)
178 domains.add(parsed.hostname);
179 }
180 return domains;
113 } 181 }
114 182
115 function convertElemHideFilter(filter, elemhideSelectorExceptions) 183 function convertElemHideFilter(filter, elemhideSelectorExceptions)
116 { 184 {
117 let included = []; 185 let included = [];
118 let excluded = []; 186 let excluded = [];
119 let rules = [];
120 187
121 parseDomains(filter.domains, included, excluded); 188 parseDomains(filter.domains, included, excluded);
122 189
123 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions)) 190 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))
124 return {matchDomains: included.map(matchDomain), selector: filter.selector}; 191 return {matchDomains: included, selector: filter.selector};
125 } 192 }
126 193
127 /** 194 /**
128 * Parse the given filter "regexpSource" string. Producing a regular expression, 195 * Parse the given filter "regexpSource" string. Producing a regular expression,
129 * extracting the hostname (if any), deciding if the regular expression is safe 196 * extracting the hostname (if any), deciding if the regular expression is safe
130 * to be converted + matched as lower case and noting if the source contains 197 * to be converted + matched as lower case and noting if the source contains
131 * anything after the hostname.) 198 * anything after the hostname.)
132 * 199 *
133 * @param {string} text regexpSource property of a filter 200 * @param {string} text regexpSource property of a filter
201 * @param {string} urlScheme The URL scheme to use in the regular expression
134 * @returns {object} An object containing a regular expression string, a bool 202 * @returns {object} An object containing a regular expression string, a bool
135 * indicating if the filter can be safely matched as lower 203 * indicating if the filter can be safely matched as lower
136 * case, a hostname string (or undefined) and a bool 204 * case, a hostname string (or undefined) and a bool
137 * indicating if the source only contains a hostname or not: 205 * indicating if the source only contains a hostname or not:
138 * {regexp: "...", 206 * {regexp: "...",
139 * canSafelyMatchAsLowercase: true/false, 207 * canSafelyMatchAsLowercase: true/false,
140 * hostname: "...", 208 * hostname: "...",
141 * justHostname: true/false} 209 * justHostname: true/false}
142 */ 210 */
143 function parseFilterRegexpSource(text) 211 function parseFilterRegexpSource(text, urlScheme)
144 { 212 {
145 let regexp = []; 213 let regexp = [];
146 let lastIndex = text.length - 1; 214
215 // Convert the text into an array of Unicode characters.
216 //
217 // In the case of surrogate pairs (the smiley emoji, for example), one
218 // Unicode code point is represented by two JavaScript characters together.
219 // We want to iterate over Unicode code points rather than JavaScript
220 // characters.
221 let characters = Array.from(text);
222
223 let lastIndex = characters.length - 1;
147 let hostname; 224 let hostname;
148 let hostnameStart = null; 225 let hostnameStart = null;
149 let hostnameFinished = false; 226 let hostnameFinished = false;
150 let justHostname = false; 227 let justHostname = false;
151 let canSafelyMatchAsLowercase = false; 228 let canSafelyMatchAsLowercase = false;
152 229
153 for (let i = 0; i < text.length; i++) 230 if (!urlScheme)
154 { 231 urlScheme = getURLSchemes()[0];
155 let c = text[i]; 232
233 for (let i = 0; i < characters.length; i++)
234 {
235 let c = characters[i];
156 236
157 if (hostnameFinished) 237 if (hostnameFinished)
158 justHostname = false; 238 justHostname = false;
159 239
160 // If we're currently inside the hostname we have to be careful not to 240 // If we're currently inside the hostname we have to be careful not to
161 // escape any characters until after we have converted it to punycode. 241 // escape any characters until after we have converted it to punycode.
162 if (hostnameStart != null && !hostnameFinished) 242 if (hostnameStart != null && !hostnameFinished)
163 { 243 {
164 let endingChar = (c == "*" || c == "^" || 244 let endingChar = (c == "*" || c == "^" ||
165 c == "?" || c == "/" || c == "|"); 245 c == "?" || c == "/" || c == "|");
166 if (!endingChar && i != lastIndex) 246 if (!endingChar && i != lastIndex)
167 continue; 247 continue;
168 248
169 hostname = punycode.toASCII( 249 hostname = punycode.toASCII(
170 text.substring(hostnameStart, endingChar ? i : i + 1) 250 characters.slice(hostnameStart, endingChar ? i : i + 1).join("")
251 .toLowerCase()
171 ); 252 );
172 hostnameFinished = justHostname = true; 253 hostnameFinished = justHostname = true;
173 regexp.push(escapeRegExp(hostname)); 254 regexp.push(escapeRegExp(hostname));
174 if (!endingChar) 255 if (!endingChar)
175 break; 256 break;
176 } 257 }
177 258
178 switch (c) 259 switch (c)
179 { 260 {
180 case "*": 261 case "*":
181 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*") 262 if (regexp.length > 0 && i < lastIndex && characters[i + 1] != "*")
182 regexp.push(".*"); 263 regexp.push(".*");
183 break; 264 break;
184 case "^": 265 case "^":
185 if (i < lastIndex) 266 let alphabet = "a-z";
186 regexp.push("."); 267 // If justHostname is true and we've encountered a "^", it means we're
268 // still in the hostname part of the URL. Since hostnames are always
269 // lower case (Punycode), there's no need to include "A-Z" in the
270 // pattern. Further, subsequent code may lower-case the entire regular
271 // expression (if the URL contains only the hostname part), leaving us
272 // with "a-za-z", which would be redundant.
273 if (!justHostname)
274 alphabet = "A-Z" + alphabet;
275 let digits = "0-9";
276 // Note that the "-" must appear first here in order to retain its
277 // literal meaning within the brackets.
278 let specialCharacters = "-_.%";
279 let separator = "[^" + specialCharacters + alphabet + digits + "]";
280 if (i == 0)
281 regexp.push("^" + urlScheme + "(.*" + separator + ")?");
282 else if (i == lastIndex)
283 regexp.push("(" + separator + ".*)?$");
284 else
285 regexp.push(separator);
187 break; 286 break;
188 case "|": 287 case "|":
189 if (i == 0) 288 if (i == 0)
190 { 289 {
191 regexp.push("^"); 290 regexp.push("^");
192 break; 291 break;
193 } 292 }
194 if (i == lastIndex) 293 if (i == lastIndex)
195 { 294 {
196 regexp.push("$"); 295 regexp.push("$");
197 break; 296 break;
198 } 297 }
199 if (i == 1 && text[0] == "|") 298 if (i == 1 && characters[0] == "|")
200 { 299 {
201 hostnameStart = i + 1; 300 hostnameStart = i + 1;
202 canSafelyMatchAsLowercase = true; 301 canSafelyMatchAsLowercase = true;
203 regexp.push("https?://([^/]+\\.)?"); 302 regexp.push(urlScheme + "([^/]+\\.)?");
204 break; 303 break;
205 } 304 }
206 regexp.push("\\|"); 305 regexp.push("\\|");
207 break; 306 break;
208 case "/": 307 case "/":
209 if (!hostnameFinished && 308 if (!hostnameFinished &&
210 text.charAt(i-2) == ":" && text.charAt(i-1) == "/") 309 characters[i - 2] == ":" && characters[i - 1] == "/")
211 { 310 {
212 hostnameStart = i + 1; 311 hostnameStart = i + 1;
213 canSafelyMatchAsLowercase = true; 312 canSafelyMatchAsLowercase = true;
214 } 313 }
215 regexp.push("/"); 314 regexp.push("/");
216 break; 315 break;
217 case ".": case "+": case "$": case "?": 316 case ".": case "+": case "$": case "?":
218 case "{": case "}": case "(": case ")": 317 case "{": case "}": case "(": case ")":
219 case "[": case "]": case "\\": 318 case "[": case "]": case "\\":
220 regexp.push("\\", c); 319 regexp.push("\\", c);
221 break; 320 break;
222 default: 321 default:
223 if (hostnameFinished && (c >= "a" && c <= "z" || 322 if (hostnameFinished && (c >= "a" && c <= "z" ||
224 c >= "A" && c <= "Z")) 323 c >= "A" && c <= "Z"))
225 canSafelyMatchAsLowercase = false; 324 canSafelyMatchAsLowercase = false;
226 regexp.push(c); 325 regexp.push(c == "%" ? c : encodeURI(c));
227 } 326 }
228 } 327 }
229 328
230 return { 329 return {
231 regexp: regexp.join(""), 330 regexp: regexp.join(""),
232 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase, 331 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,
233 hostname: hostname, 332 hostname: hostname,
234 justHostname: justHostname 333 justHostname: justHostname
235 }; 334 };
236 } 335 }
237 336
238 function getResourceTypes(filter) 337 function getResourceTypes(contentType)
239 { 338 {
240 let types = []; 339 let types = [];
241 340
242 if (filter.contentType & typeMap.IMAGE) 341 if (contentType & typeMap.IMAGE)
243 types.push("image"); 342 types.push("image");
244 if (filter.contentType & typeMap.STYLESHEET) 343 if (contentType & typeMap.STYLESHEET)
245 types.push("style-sheet"); 344 types.push("style-sheet");
246 if (filter.contentType & typeMap.SCRIPT) 345 if (contentType & typeMap.SCRIPT)
247 types.push("script"); 346 types.push("script");
248 if (filter.contentType & typeMap.FONT) 347 if (contentType & typeMap.FONT)
249 types.push("font"); 348 types.push("font");
250 if (filter.contentType & (typeMap.MEDIA | typeMap.OBJECT)) 349 if (contentType & (typeMap.MEDIA | typeMap.OBJECT))
251 types.push("media"); 350 types.push("media");
252 if (filter.contentType & typeMap.POPUP) 351 if (contentType & typeMap.POPUP)
253 types.push("popup"); 352 types.push("popup");
254 if (filter.contentType & (typeMap.XMLHTTPREQUEST | 353 if (contentType & rawRequestTypes)
255 typeMap.OBJECT_SUBREQUEST |
256 typeMap.PING |
257 typeMap.OTHER))
258 types.push("raw"); 354 types.push("raw");
259 if (filter.contentType & typeMap.SUBDOCUMENT) 355 if (contentType & typeMap.SUBDOCUMENT)
260 types.push("document"); 356 types.push("document");
261 357
262 return types; 358 return types;
263 } 359 }
264 360
265 function addDomainPrefix(domains) 361 function makeRuleCopies(trigger, action, urlSchemes)
266 { 362 {
267 let result = []; 363 let copies = [];
268 364
269 for (let domain of domains) 365 // Always make a deep copy of the rule, since rules may have to be
270 { 366 // manipulated individually at a later stage.
271 result.push(domain); 367 let stringifiedTrigger = JSON.stringify(trigger);
272 368
273 if (tldjs.getDomain(domain) == domain) 369 let filterPattern = trigger["url-filter"].substring(1);
274 result.push("www." + domain); 370 let startIndex = 0;
275 } 371
276 372 // If the URL filter already begins with the first URL scheme pattern, skip
277 return result; 373 // it.
278 } 374 if (trigger["url-filter"].startsWith("^" + urlSchemes[0]))
279 375 {
280 function convertFilterAddRules(rules, filter, action, withResourceTypes) 376 filterPattern = filterPattern.substring(urlSchemes[0].length);
281 { 377 startIndex = 1;
282 let parsed = parseFilterRegexpSource(filter.regexpSource); 378 }
379 else
380 {
381 filterPattern = ".*" + filterPattern;
382 }
383
384 for (let i = startIndex; i < urlSchemes.length; i++)
385 {
386 let copyTrigger = Object.assign(JSON.parse(stringifiedTrigger), {
387 "url-filter": "^" + urlSchemes[i] + filterPattern
388 });
389 copies.push({trigger: copyTrigger, action});
390 }
391
392 return copies;
393 }
394
395 function excludeTopURLFromTrigger(trigger)
396 {
397 trigger["unless-top-url"] = [trigger["url-filter"]];
398 if (trigger["url-filter-is-case-sensitive"])
399 trigger["top-url-filter-is-case-sensitive"] = true;
400 }
401
402 function convertFilterAddRules(rules, filter, action, withResourceTypes,
403 exceptionDomains, contentType)
404 {
405 if (!contentType)
406 contentType = filter.contentType;
407
408 // If WebSocket or WebRTC are given along with other options but not
409 // including all three of WebSocket, WebRTC, and at least one HTTP raw type,
410 // we must generate multiple rules. For example, for the filter
411 // "foo$websocket,image", we must generate one rule with "^wss?://" and "raw"
412 // and another rule with "^https?://" and "image". If we merge the two, we
413 // end up blocking requests of all HTTP raw types (e.g. XMLHttpRequest)
414 // inadvertently.
415 if ((contentType & typeMap.WEBSOCKET && contentType != typeMap.WEBSOCKET &&
416 !(contentType & typeMap.WEBRTC &&
417 contentType & rawRequestTypes & httpRequestTypes)) ||
418 (contentType & typeMap.WEBRTC && contentType != typeMap.WEBRTC &&
419 !(contentType & typeMap.WEBSOCKET &&
420 contentType & rawRequestTypes & httpRequestTypes)))
421 {
422 if (contentType & typeMap.WEBSOCKET)
423 {
424 convertFilterAddRules(rules, filter, action, withResourceTypes,
425 exceptionDomains, typeMap.WEBSOCKET);
426 }
427
428 if (contentType & typeMap.WEBRTC)
429 {
430 convertFilterAddRules(rules, filter, action, withResourceTypes,
431 exceptionDomains, typeMap.WEBRTC);
432 }
433
434 contentType &= ~(typeMap.WEBSOCKET | typeMap.WEBRTC);
435
436 if (!contentType)
437 return;
438 }
439
440 let urlSchemes = getURLSchemes(contentType);
441 let parsed = parseFilterRegexpSource(filter.regexpSource, urlSchemes[0]);
283 442
284 // For the special case of $document whitelisting filters with just a domain 443 // For the special case of $document whitelisting filters with just a domain
285 // we can generate an equivalent blocking rule exception using if-domain. 444 // we can generate an equivalent blocking rule exception using if-domain.
286 if (filter instanceof filterClasses.WhitelistFilter && 445 if (filter instanceof filterClasses.WhitelistFilter &&
287 filter.contentType & typeMap.DOCUMENT && 446 contentType & typeMap.DOCUMENT &&
288 parsed.justHostname) 447 parsed.justHostname)
289 { 448 {
290 rules.push({ 449 rules.push({
291 trigger: { 450 trigger: {
292 "url-filter": ".*", 451 "url-filter": ".*",
293 "if-domain": addDomainPrefix([parsed.hostname]) 452 "if-domain": ["*" + parsed.hostname]
294 }, 453 },
295 action: {type: "ignore-previous-rules"} 454 action: {type: "ignore-previous-rules"}
296 }); 455 });
297 // If the filter contains other supported options we'll need to generate 456 // If the filter contains other supported options we'll need to generate
298 // further rules for it, but if not we can simply return now. 457 // further rules for it, but if not we can simply return now.
299 if (!(filter.contentType & whitelistableRequestTypes)) 458 if (!(contentType & whitelistableRequestTypes))
300 return; 459 return;
301 } 460 }
302 461
303 let trigger = {"url-filter": parsed.regexp}; 462 let trigger = {"url-filter": parsed.regexp};
304 463
305 // Limit rules to HTTP(S) URLs 464 // If the URL filter begins with one of the URL schemes for this content
306 if (!/^(\^|http)/i.test(trigger["url-filter"])) 465 // type, we generate additional rules for all the URL scheme patterns;
307 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"]; 466 // otherwise, if the start of the URL filter literally matches the first URL
467 // scheme pattern, we just generate additional rules for the remaining URL
468 // scheme patterns.
469 //
470 // For example, "stun:foo$webrtc" will give us "stun:foo", then we add a "^"
471 // in front of this and generate two additional rules for
472 // "^stuns?:.*stun:foo" and "^turns?:.*stun:foo". On the other hand,
473 // "||foo$webrtc" will give us "^stuns?:([^/]+\\.)?foo", so we just generate
474 // "^turns?:([^/]+\\.)?foo" in addition.
475 //
476 // Note that the filter can be already anchored to the beginning
477 // (e.g. "|stun:foo$webrtc"), in which case we do not generate any additional
478 // rules.
479 let needAltRules = trigger["url-filter"][0] != "^" ||
480 trigger["url-filter"].startsWith("^" + urlSchemes[0]);
481
482 if (trigger["url-filter"][0] != "^")
483 {
484 if (!urlSchemes.some(scheme => new RegExp("^" + scheme)
485 .test(trigger["url-filter"])))
486 {
487 trigger["url-filter"] = urlSchemes[0] + ".*" + trigger["url-filter"];
488 }
489
490 trigger["url-filter"] = "^" + trigger["url-filter"];
491 }
308 492
309 // For rules containing only a hostname we know that we're matching against 493 // For rules containing only a hostname we know that we're matching against
310 // a lowercase string unless the matchCase option was passed. 494 // a lowercase string unless the matchCase option was passed.
311 if (parsed.canSafelyMatchAsLowercase && !filter.matchCase) 495 if (parsed.canSafelyMatchAsLowercase && !filter.matchCase)
312 trigger["url-filter"] = trigger["url-filter"].toLowerCase(); 496 trigger["url-filter"] = trigger["url-filter"].toLowerCase();
313 497
314 if (parsed.canSafelyMatchAsLowercase || filter.matchCase) 498 if (parsed.canSafelyMatchAsLowercase || filter.matchCase)
315 trigger["url-filter-is-case-sensitive"] = true; 499 trigger["url-filter-is-case-sensitive"] = true;
316 500
317 let included = []; 501 let included = [];
318 let excluded = []; 502 let excluded = [];
319 503
320 parseDomains(filter.domains, included, excluded); 504 parseDomains(filter.domains, included, excluded);
321 505
506 if (exceptionDomains)
507 excluded = excluded.concat(exceptionDomains);
508
322 if (withResourceTypes) 509 if (withResourceTypes)
323 { 510 {
324 trigger["resource-type"] = getResourceTypes(filter); 511 let resourceTypes = getResourceTypes(contentType);
325 512
326 if (trigger["resource-type"].length == 0) 513 // Content blocker rules can't differentiate between sub-document requests
514 // (iframes) and top-level document requests. To avoid too many false
515 // positives, we prevent rules with no hostname part from blocking document
516 // requests.
517 //
518 // Once Safari 11 becomes our minimum supported version, we could change
519 // our approach here to use the new "unless-top-url" property instead.
520 if (filter instanceof filterClasses.BlockingFilter && !parsed.hostname)
521 resourceTypes = resourceTypes.filter(type => type != "document");
522
523 if (resourceTypes.length == 0)
327 return; 524 return;
525
526 trigger["resource-type"] = resourceTypes;
328 } 527 }
329 528
330 if (filter.thirdParty != null) 529 if (filter.thirdParty != null)
331 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"]; 530 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];
332 531
532 let addTopLevelException = false;
533
333 if (included.length > 0) 534 if (included.length > 0)
334 trigger["if-domain"] = addDomainPrefix(included); 535 {
536 trigger["if-domain"] = [];
537
538 for (let name of included)
539 {
540 // If this is a blocking filter or an element hiding filter, add the
541 // subdomain wildcard only if no subdomains have been excluded.
542 let notSubdomains = null;
543 if ((filter instanceof filterClasses.BlockingFilter ||
544 filter instanceof filterClasses.ElemHideFilter) &&
545 (notSubdomains = findSubdomainsInList(name, excluded)).length > 0)
546 {
547 trigger["if-domain"].push(name);
548
549 // Add the "www" prefix but only if it hasn't been excluded.
550 if (!notSubdomains.includes("www"))
551 trigger["if-domain"].push("www." + name);
552 }
553 else
554 {
555 trigger["if-domain"].push("*" + name);
556 }
557 }
558 }
335 else if (excluded.length > 0) 559 else if (excluded.length > 0)
336 trigger["unless-domain"] = addDomainPrefix(excluded); 560 {
561 trigger["unless-domain"] = excluded.map(name => "*" + name);
562 }
563 else if (filter instanceof filterClasses.BlockingFilter &&
564 filter.contentType & typeMap.SUBDOCUMENT && parsed.hostname)
565 {
566 // Rules with a hostname part are still allowed to block document requests,
567 // but we add an exception for top-level documents.
568 //
569 // Note that we can only do this if there's no "unless-domain" property for
570 // now. This also only works in Safari 11 onwards, while older versions
571 // simply ignore this property. Once Safari 11 becomes our minimum
572 // supported version, we can merge "unless-domain" into "unless-top-url".
573 addTopLevelException = true;
574 excludeTopURLFromTrigger(trigger);
575 }
337 576
338 rules.push({trigger: trigger, action: {type: action}}); 577 rules.push({trigger: trigger, action: {type: action}});
339 } 578
340 579 if (needAltRules)
341 function hasNonASCI(obj) 580 {
342 { 581 // Generate additional rules for any alternative URL schemes.
343 if (typeof obj == "string") 582 for (let altRule of makeRuleCopies(trigger, {type: action}, urlSchemes))
344 { 583 {
345 if (/[^\x00-\x7F]/.test(obj)) 584 if (addTopLevelException)
346 return true; 585 excludeTopURLFromTrigger(altRule.trigger);
347 } 586
348 587 rules.push(altRule);
349 if (typeof obj == "object") 588 }
350 { 589 }
351 if (obj instanceof Array)
352 for (let item of obj)
353 if (hasNonASCI(item))
354 return true;
355
356 let names = Object.getOwnPropertyNames(obj);
357 for (let name of names)
358 if (hasNonASCI(obj[name]))
359 return true;
360 }
361
362 return false;
363 } 590 }
364 591
365 function convertIDSelectorsToAttributeSelectors(selector) 592 function convertIDSelectorsToAttributeSelectors(selector)
366 { 593 {
367 // First we figure out where all the IDs are 594 // First we figure out where all the IDs are
368 let sep = ""; 595 let sep = "";
369 let start = null; 596 let start = null;
370 let positions = []; 597 let positions = [];
371 for (let i = 0; i < selector.length; i++) 598 for (let i = 0; i < selector.length; i++)
372 { 599 {
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
406 { 633 {
407 newSelector.push(selector.substring(i, pos.start)); 634 newSelector.push(selector.substring(i, pos.start));
408 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 635 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
409 i = pos.end; 636 i = pos.end;
410 } 637 }
411 newSelector.push(selector.substring(i)); 638 newSelector.push(selector.substring(i));
412 639
413 return newSelector.join(""); 640 return newSelector.join("");
414 } 641 }
415 642
416 function closeMatch(s, t, singleCharacterOnly) 643 function addCSSRules(rules, selectors, domain, exceptionDomains)
417 { 644 {
418 // This function returns an edit operation, one of "substitute", "delete", 645 let unlessDomain = exceptionDomains.size > 0 ? [] : null;
419 // and "insert", along with an index in the source string where the edit must 646
420 // occur in order to arrive at the target string. If the strings are not a 647 exceptionDomains.forEach(name =>
421 // close match, it returns null. 648 {
422 649 // For domain-specific filters, include the exception domains only if
423 // If singleCharacterOnly is false, deletions or insertions of a contiguous 650 // they're subdomains of the given domain.
424 // range of characters from one string into the other, at the same index, are 651 if (!domain || name.substr(-domain.length - 1) == "." + domain)
425 // treated as a single edit. For example, "internal" and "international" are 652 unlessDomain.push("*" + name);
426 // considered to be one edit apart, inserting the substring "tiona" from the 653 });
427 // latter into the former. 654
428 655 while (selectors.length)
429 // A few things to note: 656 {
430 // 657 let selector = selectors.splice(0, selectorLimit).join(", ");
431 // 1) This function does not care about how the input strings are treated 658
432 // by the caller. It only treats them as raw strings. For example, the 659 // As of Safari 9.0 element IDs are matched as lowercase. We work around
433 // caller may treat them as regular expressions, where "[ab]" and "[bc]" 660 // this by converting to the attribute format [id="elementID"]
434 // could be considered to have an edit distance of 1, since the order 661 selector = convertIDSelectorsToAttributeSelectors(selector);
435 // within the brackets does not matter. This function will still return 662
436 // null for this set of inputs since they are two edits apart. 663 let rule = {
437 // 664 trigger: {"url-filter": matchDomain(domain),
438 // 2) To be friendly to calling code that might be passing in regular 665 "url-filter-is-case-sensitive": true},
439 // expressions anyway, this function will simply return null if it 666 action: {type: "css-display-none",
440 // encounters a special character (e.g. "\", "?", "+", "*", etc.) in the 667 selector: selector}
441 // delta. For example, given "Hello" and "Hello, how are you?", it will 668 };
442 // return null instead of "{type: 'insert', index: 5, endIndex: 19}". 669
443 // 670 if (unlessDomain)
444 // 3) The calling code within this file does indeed pass in regular 671 rule.trigger["unless-domain"] = unlessDomain;
445 // expressions (the strict subset of JavaScript regular expressions 672
446 // supported by WebKit for content blockers), making the important 673 rules.push(rule);
447 // assumption that the parts where two such regular expressions may 674 }
448 // differ can always be treated as normal strings. 675 }
449 // 676
450 // For example, "^https?://.*/ads" and "^https?://.*/adv" differ only in 677 /**
451 // the last character, therefore the regular expressions can safely be 678 * Check if two strings are a close match
452 // merged into "^https?://.*/ad[sv]". If, for example, the characters in 679 *
453 // the delta were to appear within square brackets originally in the 680 * This function returns an edit operation, one of "substitute", "delete", and
454 // input strings (e.g. "^https?://.*/ad[sx]" and "^https?://.*/ad[vx]"), 681 * "insert", along with an index in the source string where the edit must occur
455 // the calling code would have to do extra work to merge the two regular 682 * in order to arrive at the target string. If the strings are not a close
456 // expressions correctly. The calling code within this file assumes that 683 * match, it returns null.
457 // this is never the case. 684 *
458 685 * Two strings are considered to be a close match if they are one edit
686 * operation apart.
687 *
688 * Deletions or insertions of a contiguous range of characters from one string
689 * into the other, at the same index, are treated as a single edit. For
690 * example, "internal" and "international" are considered to be one edit apart
691 * and therefore a close match.
692 *
693 * A few things to note:
694 *
695 * 1) This function does not care about the format of the input strings. For
696 * example, the caller may pass in regular expressions, where "[ab]" and
697 * "[bc]" could be considered to be a close match, since the order within the
698 * brackets doesn't matter. This function will still return null for this set
699 * of inputs since they are two edits apart.
700 *
701 * 2) To be friendly to calling code that might be passing in regular
702 * expressions, this function will simply return null if it encounters a
703 * special character (e.g. "\", "?", "+", etc.) in the delta. For example,
704 * given "Hello" and "Hello, how are you?", it will return null.
705 *
706 * 3) If the caller does indeed pass in regular expressions, it must make the
707 * important assumption that the parts where two such regular expressions may
708 * differ can always be treated as normal strings. For example,
709 * "^https?://example.com/ads" and "^https?://example.com/adv" differ only in
710 * the last character, therefore the regular expressions can safely be merged
711 * into "^https?://example.com/ad[sv]".
712 *
713 * @param {string} s The source string
714 * @param {string} t The target string
715 *
716 * @returns {object} An object describing the single edit operation that must
717 * occur in the source string in order to arrive at the
718 * target string
719 */
720 function closeMatch(s, t)
721 {
459 let diff = s.length - t.length; 722 let diff = s.length - t.length;
460
461 // If the string lengths differ by more than one character, we cannot arrive
462 // at target from source in a single edit operation.
463 if (singleCharacterOnly && (diff < -1 || diff > 1))
464 return null;
465 723
466 // If target is longer than source, swap them for the purpose of our 724 // If target is longer than source, swap them for the purpose of our
467 // calculation. 725 // calculation.
468 if (diff < 0) 726 if (diff < 0)
469 { 727 {
470 let tmp = s; 728 let tmp = s;
471 s = t; 729 s = t;
472 t = tmp; 730 t = tmp;
473 } 731 }
474 732
475 let edit = null; 733 let edit = null;
476 734
477 // If the string lengths differ by only one character at most, use the simple 735 let i = 0;
478 // algorithm to find a single character edit. 736 let j = 0;
479 if (diff == 0 || diff == 1 || diff == -1) 737
480 { 738 // Start from the beginning and keep going until we hit a character that
481 for (let i = 0, j = 0; i < s.length; i++) 739 // doesn't match.
482 { 740 for (; i < s.length; i++)
483 if (s[i] == t[j]) 741 {
484 { 742 if (s[i] != t[i])
485 j++; 743 break;
744 }
745
746 // Now do exactly the same from the end, but also stop if we reach the
747 // position where we terminated the previous loop.
748 for (; j < t.length; j++)
749 {
750 if (t.length - j == i || s[s.length - j - 1] != t[t.length - j - 1])
751 break;
752 }
753
754 if (diff == 0)
755 {
756 // If the strings are equal in length and the delta isn't exactly one
757 // character, it's not a close match.
758 if (t.length - j - i != 1)
759 return null;
760 }
761 else if (i != t.length - j)
762 {
763 // For strings of unequal length, if we haven't found a match for every
764 // single character in the shorter string counting from both the beginning
765 // and the end, it's not a close match.
766 return null;
767 }
768
769 for (let k = i; k < s.length - j; k++)
770 {
771 // If the delta contains any special characters, it's not a close match.
772 if (s[k] == "." || s[k] == "+" || s[k] == "$" || s[k] == "?" ||
773 s[k] == "{" || s[k] == "}" || s[k] == "(" || s[k] == ")" ||
774 s[k] == "[" || s[k] == "]" || s[k] == "\\")
775 return null;
776 }
777
778 if (diff == 0)
779 {
780 edit = {type: "substitute", index: i};
781 }
782 else if (diff > 0)
783 {
784 edit = {type: "delete", index: i};
785
786 if (diff > 1)
787 edit.endIndex = s.length - j;
788 }
789 else
790 {
791 edit = {type: "insert", index: i};
792
793 if (diff < -1)
794 edit.endIndex = s.length - j;
795 }
796
797 return edit;
798 }
799
800 function eliminateRedundantRulesByURLFilter(rulesInfo, exhaustive)
801 {
802 const heuristicRange = 1000;
803
804 let ol = rulesInfo.length;
805
806 // Throw out obviously redundant rules.
807 return async(rulesInfo, (ruleInfo, index) => () =>
808 {
809 // If this rule is already marked as redundant, don't bother comparing it
810 // with other rules.
811 if (rulesInfo[index].redundant)
812 return;
813
814 let limit = exhaustive ? rulesInfo.length :
815 Math.min(index + heuristicRange, rulesInfo.length);
816
817 for (let i = index, j = i + 1; j < limit; j++)
818 {
819 if (rulesInfo[j].redundant)
820 continue;
821
822 let source = rulesInfo[i].rule.trigger["url-filter"];
823 let target = rulesInfo[j].rule.trigger["url-filter"];
824
825 if (source.length >= target.length)
826 {
827 // If one URL filter is a substring of the other starting at the
828 // beginning, the other one is clearly redundant.
829 if (source.substring(0, target.length) == target)
830 {
831 rulesInfo[i].redundant = true;
832 break;
833 }
486 } 834 }
487 else if (edit) 835 else if (target.substring(0, source.length) == source)
488 { 836 {
489 // Since we want one and only one edit operation, we must bail here. 837 rulesInfo[j].redundant = true;
490 return null;
491 } 838 }
492 else if ((s[i] == "." || s[i] == "+" || s[i] == "$" || s[i] == "?" || 839 }
493 s[i] == "{" || s[i] == "}" || s[i] == "(" || s[i] == ")" || 840 })
494 s[i] == "[" || s[i] == "]" || s[i] == "\\") || 841 .then(() => rulesInfo.filter(ruleInfo => !ruleInfo.redundant));
495 (t[j] == "." || t[j] == "+" || t[j] == "$" || t[j] == "?" || 842 }
496 t[j] == "{" || t[j] == "}" || t[j] == "(" || t[j] == ")" || 843
497 t[j] == "[" || t[j] == "]" || t[j] == "\\")) 844 function findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
498 { 845 {
499 // We don't deal with special characters for now. 846 // Closely matching rules are likely to be within a certain range. We only
500 return null; 847 // look for matches within this range by default. If we increase this value,
501 } 848 // it can give us more matches and a smaller resulting rule set, but possibly
502 else if (diff == 0) 849 // at a significant performance cost.
503 { 850 //
504 // If both strings are equal in length, this is a substitution. 851 // If the exhaustive option is true, we simply ignore this value and look for
505 edit = {type: "substitute", index: i}; 852 // matches throughout the rule set.
506 j++; 853 const heuristicRange = 1000;
507 } 854
508 else if (diff > 0) 855 let limit = exhaustive ? rulesInfo.length :
509 { 856 Math.min(index + heuristicRange, rulesInfo.length);
510 // If the source string is longer, this is a deletion. 857
511 edit = {type: "delete", index: i}; 858 for (let i = index, j = i + 1; j < limit; j++)
859 {
860 let source = rulesInfo[i].rule.trigger["url-filter"];
861 let target = rulesInfo[j].rule.trigger["url-filter"];
862
863 let edit = closeMatch(source, target);
864
865 if (edit)
866 {
867 let urlFilter, ruleInfo, match = {edit};
868
869 if (edit.type == "insert")
870 {
871 // Convert the insertion into a deletion and stick it on the target
872 // rule instead. We can only group deletions and substitutions;
873 // therefore insertions must be treated as deletions on the target
874 // rule.
875 urlFilter = target;
876 ruleInfo = rulesInfo[j];
877 match.index = i;
878 edit.type = "delete";
512 } 879 }
513 else 880 else
514 { 881 {
515 edit = {type: "insert", index: i}; 882 urlFilter = source;
883 ruleInfo = rulesInfo[i];
884 match.index = j;
516 } 885 }
517 } 886
518 } 887 // If the edit has an end index, it represents a multiple character
519 else if (!singleCharacterOnly) 888 // edit.
520 { 889 let multiEdit = !!edit.endIndex;
521 // Try another algorithm to find a multiple character deletion or 890
522 // insertion. 891 if (multiEdit)
523 892 {
524 let i = 0, j = 0; 893 // We only care about a single multiple character edit because the
525 894 // number of characters for such a match doesn't matter, we can
526 for (; i < s.length; i++) 895 // only merge with one other rule.
527 { 896 if (!ruleInfo.multiEditMatch)
528 if (s[i] != t[i]) 897 ruleInfo.multiEditMatch = match;
529 break;
530 }
531
532 for (; j < t.length; j++)
533 {
534 if (t.length - j == i ||
535 s[s.length - j - 1] != t[t.length - j - 1])
536 break;
537 }
538
539 if (i != t.length - j)
540 return null;
541
542 for (let k = i; k < s.length - j; k++)
543 {
544 // If there are any special characters in the delta, bail.
545 if (s[k] == "." || s[k] == "+" || s[k] == "$" || s[k] == "?" ||
546 s[k] == "{" || s[k] == "}" || s[k] == "(" || s[k] == ")" ||
547 s[k] == "[" || s[k] == "]" || s[k] == "\\")
548 return null;
549 }
550
551 if (diff > 0)
552 {
553 edit = {type: "delete", index: i, endIndex: s.length - j};
554 }
555 else
556 {
557 edit = {type: "insert", index: i, endIndex: s.length - j};
558 }
559 }
560
561 return edit;
562 }
563
564 function mergeCloselyMatchingRules(rules, options)
565 {
566 const defaultOptions = {advanced: false, exhaustive: false};
567
568 options = Object.assign({}, defaultOptions, options);
569
570 // Closely matching rules are likely to be within a certain range. We only
571 // look for matches within this range. If we increase this value, it can give
572 // us more matches and a smaller resulting rule set, but possibly at a
573 // significant performance cost.
574 const heuristicRange = 100;
575
576 let rulesInfo = new Array(rules.length);
577
578 rules.forEach((rule, index) =>
579 {
580 rulesInfo[index] = {rule};
581
582 if (rule.action.type == "ignore-previous-rules")
583 {
584 rulesInfo[index].skip = true;
585 }
586 else
587 {
588 // Save a hash of the rule but without the URL filter. We use this for
589 // comparison later.
590 let copy = {
591 trigger: Object.assign({}, rule.trigger),
592 action: Object.assign({}, rule.action)
593 };
594
595 delete copy.trigger["url-filter"];
596
597 let stringified = JSON.stringify(copy);
598
599 if (options.exhaustive)
600 {
601 // The Pearson hash function expects all characters to be within the
602 // 8-bit range.
603 stringified = encodeURIComponent(stringified);
604
605 rulesInfo[index].ruleHash = pearsonHash(stringified);
606 } 898 }
607 else 899 else
608 { 900 {
609 rulesInfo[index].ruleHash = stringified; 901 // For single character edits, multiple rules can be merged into
902 // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
903 if (!ruleInfo.matches)
904 ruleInfo.matches = new Array(urlFilter.length);
905
906 // Matches at a particular index. For example, for a source string
907 // "ads", both target strings "ad" (deletion) and "adv"
908 // (substitution) match at index 2, hence they are grouped together
909 // to possibly be merged later into "ad[sv]?".
910 let matchesForIndex = ruleInfo.matches[edit.index];
911
912 if (matchesForIndex)
913 {
914 matchesForIndex.push(match);
915 }
916 else
917 {
918 matchesForIndex = [match];
919 ruleInfo.matches[edit.index] = matchesForIndex;
920 }
921
922 // Keep track of the best set of matches. We later sort by this to
923 // get best results.
924 if (!ruleInfo.bestMatches ||
925 matchesForIndex.length > ruleInfo.bestMatches.length)
926 ruleInfo.bestMatches = matchesForIndex;
610 } 927 }
611 } 928 }
612 }); 929 }
613 930 }
614 for (let i = 0; i < rules.length; i++) 931
615 { 932 function mergeCandidateRulesByURLFilter(rulesInfo)
616 if (rulesInfo[i].skip) 933 {
617 continue;
618
619 let limit = options.exhaustive ? rules.length :
620 Math.min(i + heuristicRange, rules.length);
621
622 for (let j = i + 1; j < limit; j++)
623 {
624 if (rulesInfo[j].skip)
625 continue;
626
627 // Check if the rules are identical except for the URL filter.
628 if (rulesInfo[i].ruleHash == rulesInfo[j].ruleHash)
629 {
630 let source = rules[i].trigger["url-filter"];
631 let target = rules[j].trigger["url-filter"];
632
633 let edit = closeMatch(source, target, !options.advanced);
634
635 if (edit)
636 {
637 let urlFilter, ruleInfo, match = {edit};
638
639 if (edit.type == "insert")
640 {
641 // Convert the insertion into a deletion and stick it on the target
642 // rule instead. We can only group deletions and substitutions;
643 // therefore insertions must be treated as deletions on the target
644 // rule.
645 urlFilter = target;
646 ruleInfo = rulesInfo[j];
647 match.index = i;
648 edit.type = "delete";
649 }
650 else
651 {
652 urlFilter = source;
653 ruleInfo = rulesInfo[i];
654 match.index = j;
655 }
656
657 // If the edit has an end index, it represents a multiple character
658 // edit.
659 let multiEdit = !!edit.endIndex;
660
661 if (multiEdit)
662 {
663 // We only care about a single multiple character edit because the
664 // number of characters for such a match doesn't matter, we can
665 // only merge with one other rule.
666 if (!ruleInfo.multiEditMatch)
667 ruleInfo.multiEditMatch = match;
668 }
669 else
670 {
671 // For single character edits, multiple rules can be merged into
672 // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
673 if (!ruleInfo.matches)
674 ruleInfo.matches = new Array(urlFilter.length + 1);
675
676 // Matches at a particular index. For example, for a source string
677 // "ads", both target strings "ad" (deletion) and "adv"
678 // (substitution) match at index 2, hence they are grouped together
679 // to possibly be merged later into "ad[sv]?".
680 let matchesForIndex = ruleInfo.matches[edit.index];
681
682 if (matchesForIndex)
683 {
684 matchesForIndex.push(match);
685 }
686 else
687 {
688 matchesForIndex = [match];
689 ruleInfo.matches[edit.index] = matchesForIndex;
690 }
691
692 // Keep track of the best set of matches. We later sort by this to
693 // get best results.
694 if (!ruleInfo.bestMatches ||
695 matchesForIndex.length > ruleInfo.bestMatches.length)
696 ruleInfo.bestMatches = matchesForIndex;
697 }
698 }
699 }
700 }
701 }
702
703 // Filter out rules that have no matches at all. 934 // Filter out rules that have no matches at all.
704 let candidateRulesInfo = rulesInfo.filter(ruleInfo => 935 let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
705 { 936 {
706 return ruleInfo.bestMatches || ruleInfo.multiEditMatch 937 return ruleInfo.bestMatches || ruleInfo.multiEditMatch
707 }); 938 });
708 939
709 // For best results, we have to sort the candidates by the largest set of 940 // For best results, we have to sort the candidates by the largest set of
710 // matches. 941 // matches.
711 // 942 //
712 // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to 943 // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
761 if (best.length > 0) 992 if (best.length > 0)
762 { 993 {
763 let urlFilter = rule.trigger["url-filter"]; 994 let urlFilter = rule.trigger["url-filter"];
764 995
765 let editIndex = best[0].edit.index; 996 let editIndex = best[0].edit.index;
766 997
767 if (!multiEdit) 998 if (!multiEdit)
768 { 999 {
769 // Merge all the matching rules into this one. 1000 // Merge all the matching rules into this one.
770 1001
771 let characters = []; 1002 let characters = [urlFilter[editIndex]];
772 let quantifier = ""; 1003 let quantifier = "";
773 1004
774 for (let match of best) 1005 for (let match of best)
775 { 1006 {
776 if (match.edit.type == "delete") 1007 if (match.edit.type == "delete")
777 { 1008 {
778 quantifier = "?"; 1009 quantifier = "?";
779 } 1010 }
780 else 1011 else
781 { 1012 {
782 let character = rules[match.index].trigger["url-filter"][editIndex]; 1013 let character = rulesInfo[match.index].rule
783 characters.push(character); 1014 .trigger["url-filter"][editIndex];
1015
1016 // Insert any hyphen at the beginning so it gets interpreted as a
1017 // literal hyphen.
1018 if (character == "-")
1019 characters.unshift(character);
1020 else
1021 characters.push(character);
784 } 1022 }
785 1023
786 // Mark the target rule as merged so other rules don't try to merge 1024 // Mark the target rule as merged so other rules don't try to merge
787 // it again. 1025 // it again.
788 rulesInfo[match.index].merged = true; 1026 rulesInfo[match.index].merged = true;
789 } 1027 }
790 1028
791 urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier + 1029 urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
792 urlFilter.substring(editIndex + 1); 1030 urlFilter.substring(editIndex + 1);
793 if (characters.length > 0) 1031 if (characters.length > 1)
794 { 1032 {
795 urlFilter = urlFilter.substring(0, editIndex) + "[" + 1033 urlFilter = urlFilter.substring(0, editIndex) + "[" +
796 urlFilter[editIndex] + characters.join("") + "]" + 1034 characters.join("") + "]" +
797 urlFilter.substring(editIndex + 1); 1035 urlFilter.substring(editIndex + 1);
798 } 1036 }
799 } 1037 }
800 else 1038 else
801 { 1039 {
802 let editEndIndex = best[0].edit.endIndex; 1040 let editEndIndex = best[0].edit.endIndex;
803 1041
804 // Mark the target rule as merged so other rules don't try to merge it 1042 // Mark the target rule as merged so other rules don't try to merge it
805 // again. 1043 // again.
806 rulesInfo[best[0].index].merged = true; 1044 rulesInfo[best[0].index].merged = true;
807 1045
808 urlFilter = urlFilter.substring(0, editIndex) + "(" + 1046 urlFilter = urlFilter.substring(0, editIndex) + "(" +
809 urlFilter.substring(editIndex, editEndIndex) + ")?" + 1047 urlFilter.substring(editIndex, editEndIndex) + ")?" +
810 urlFilter.substring(editEndIndex); 1048 urlFilter.substring(editEndIndex);
811 } 1049 }
812 1050
813 rule.trigger["url-filter"] = urlFilter; 1051 rule.trigger["url-filter"] = urlFilter;
814 1052
815 // Mark this rule as one that has had other rules merged into it. 1053 // Mark this rule as one that has had other rules merged into it.
816 ruleInfo.mergedInto = true; 1054 ruleInfo.mergedInto = true;
817 } 1055 }
818 } 1056 }
819 1057 }
820 // Filter out rules that have been merged into other rules. 1058
821 return rulesInfo.filter(ruleInfo => !ruleInfo.merged) 1059 function mergeRulesByURLFilter(rulesInfo, exhaustive)
822 .map(ruleInfo => ruleInfo.rule); 1060 {
1061 return async(rulesInfo, (ruleInfo, index) => () =>
1062 findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
1063 )
1064 .then(() => mergeCandidateRulesByURLFilter(rulesInfo));
1065 }
1066
1067 function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
1068 {
1069 if (rulesInfo.length <= 1)
1070 return;
1071
1072 let valueSet = new Set(rulesInfo[0].rule[propertyType][property]);
1073
1074 for (let i = 1; i < rulesInfo.length; i++)
1075 {
1076 for (let value of rulesInfo[i].rule[propertyType][property] || [])
1077 valueSet.add(value);
1078
1079 rulesInfo[i].merged = true;
1080 }
1081
1082 if (valueSet.size > 0)
1083 rulesInfo[0].rule[propertyType][property] = Array.from(valueSet);
1084
1085 rulesInfo[0].mergedInto = true;
1086 }
1087
1088 function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
1089 {
1090 let mergeableRulesInfoByGroup = new Map();
1091
1092 for (let ruleInfo of rulesInfo)
1093 {
1094 let copy = {
1095 trigger: Object.assign({}, ruleInfo.rule.trigger),
1096 action: Object.assign({}, ruleInfo.rule.action)
1097 };
1098
1099 delete copy[propertyType][property];
1100
1101 let groupKey = JSON.stringify(copy);
1102
1103 let mergeableRulesInfo = mergeableRulesInfoByGroup.get(groupKey);
1104
1105 if (mergeableRulesInfo)
1106 mergeableRulesInfo.push(ruleInfo);
1107 else
1108 mergeableRulesInfoByGroup.set(groupKey, [ruleInfo]);
1109 }
1110
1111 return mergeableRulesInfoByGroup;
1112 }
1113
1114 function mergeRules(rules, exhaustive)
1115 {
1116 let rulesInfo = rules.map(rule => ({rule}));
1117
1118 let arrayPropertiesToMergeBy = ["resource-type", "if-domain"];
1119
1120 return async(() =>
1121 {
1122 let map = groupRulesByMergeableProperty(rulesInfo, "trigger", "url-filter");
1123 return async(map.values(), mergeableRulesInfo => () =>
1124 eliminateRedundantRulesByURLFilter(mergeableRulesInfo, exhaustive)
1125 .then(rulesInfo => mergeRulesByURLFilter(rulesInfo, exhaustive))
1126 )
1127 .then(() =>
1128 {
1129 // Filter out rules that are redundant or have been merged into other
1130 // rules.
1131 rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.redundant &&
1132 !ruleInfo.merged);
1133 });
1134 })
1135 .then(() => async(arrayPropertiesToMergeBy, arrayProperty => () =>
1136 {
1137 let map = groupRulesByMergeableProperty(rulesInfo, "trigger",
1138 arrayProperty);
1139 return async(map.values(), mergeableRulesInfo => () =>
1140 mergeRulesByArrayProperty(mergeableRulesInfo, "trigger", arrayProperty)
1141 )
1142 .then(() =>
1143 {
1144 rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
1145 });
1146 }))
1147 .then(() => rulesInfo.map(ruleInfo => ruleInfo.rule));
823 } 1148 }
824 1149
825 let ContentBlockerList = 1150 let ContentBlockerList =
826 /** 1151 /**
827 * Create a new Adblock Plus filter to content blocker list converter 1152 * Create a new Adblock Plus filter to content blocker list converter
828 * 1153 *
1154 * @param {object} options Options for content blocker list generation
1155 *
829 * @constructor 1156 * @constructor
830 */ 1157 */
831 exports.ContentBlockerList = function () 1158 exports.ContentBlockerList = function (options)
832 { 1159 {
1160 const defaultOptions = {
1161 merge: "auto"
1162 };
1163
1164 this.options = Object.assign({}, defaultOptions, options);
1165
833 this.requestFilters = []; 1166 this.requestFilters = [];
834 this.requestExceptions = []; 1167 this.requestExceptions = [];
835 this.elemhideFilters = []; 1168 this.elemhideFilters = [];
836 this.elemhideExceptions = []; 1169 this.elemhideExceptions = [];
1170 this.genericblockExceptions = [];
1171 this.generichideExceptions = [];
837 this.elemhideSelectorExceptions = new Map(); 1172 this.elemhideSelectorExceptions = new Map();
838 }; 1173 };
839 1174
840 /** 1175 /**
841 * Add Adblock Plus filter to be converted 1176 * Add Adblock Plus filter to be converted
842 * 1177 *
843 * @param {Filter} filter Filter to convert 1178 * @param {Filter} filter Filter to convert
844 */ 1179 */
845 ContentBlockerList.prototype.addFilter = function(filter) 1180 ContentBlockerList.prototype.addFilter = function(filter)
846 { 1181 {
847 if (filter.sitekeys) 1182 if (filter.sitekeys)
848 return; 1183 return;
849 if (filter instanceof filterClasses.RegExpFilter && 1184 if (filter instanceof filterClasses.RegExpFilter &&
850 filter.regexpSource == null) 1185 filter.regexpSource == null)
851 return; 1186 return;
852 1187
853 if (filter instanceof filterClasses.BlockingFilter) 1188 if (filter instanceof filterClasses.BlockingFilter)
854 this.requestFilters.push(filter); 1189 this.requestFilters.push(filter);
855 1190
856 if (filter instanceof filterClasses.WhitelistFilter) 1191 if (filter instanceof filterClasses.WhitelistFilter)
857 { 1192 {
858 if (filter.contentType & (typeMap.DOCUMENT | whitelistableRequestTypes)) 1193 if (filter.contentType & (typeMap.DOCUMENT | whitelistableRequestTypes))
859 this.requestExceptions.push(filter); 1194 this.requestExceptions.push(filter);
860 1195
861 if (filter.contentType & typeMap.ELEMHIDE) 1196 if (filter.contentType & typeMap.GENERICBLOCK)
862 this.elemhideExceptions.push(filter); 1197 this.genericblockExceptions.push(filter);
1198
1199 if (filter.contentType & typeMap.ELEMHIDE)
1200 this.elemhideExceptions.push(filter);
1201 else if (filter.contentType & typeMap.GENERICHIDE)
1202 this.generichideExceptions.push(filter);
863 } 1203 }
864 1204
865 if (filter instanceof filterClasses.ElemHideFilter) 1205 if (filter instanceof filterClasses.ElemHideFilter)
866 this.elemhideFilters.push(filter); 1206 this.elemhideFilters.push(filter);
867 1207
868 if (filter instanceof filterClasses.ElemHideException) 1208 if (filter instanceof filterClasses.ElemHideException)
869 { 1209 {
870 let domains = this.elemhideSelectorExceptions[filter.selector]; 1210 let domains = this.elemhideSelectorExceptions[filter.selector];
871 if (!domains) 1211 if (!domains)
872 domains = this.elemhideSelectorExceptions[filter.selector] = []; 1212 domains = this.elemhideSelectorExceptions[filter.selector] = [];
873 1213
874 parseDomains(filter.domains, domains, []); 1214 parseDomains(filter.domains, domains, []);
875 } 1215 }
876 }; 1216 };
877 1217
878 /** 1218 /**
879 * Generate content blocker list for all filters that were added 1219 * Generate content blocker list for all filters that were added
880 *
881 * @returns {Filter} filter Filter to convert
882 */ 1220 */
883 ContentBlockerList.prototype.generateRules = function(options) 1221 ContentBlockerList.prototype.generateRules = function()
884 { 1222 {
885 const defaultOptions = { 1223 let cssRules = [];
886 merge: false, 1224 let cssExceptionRules = [];
887 fastMerge: true, 1225 let blockingRules = [];
888 advancedMerge: null, 1226 let blockingExceptionRules = [];
889 exhaustiveMerge: null 1227
890 }; 1228 let ruleGroups = [cssRules, cssExceptionRules,
891 1229 blockingRules, blockingExceptionRules];
892 options = Object.assign({}, defaultOptions, options); 1230
893 1231 let genericSelectors = [];
894 let rules = [];
895
896 let groupedElemhideFilters = new Map(); 1232 let groupedElemhideFilters = new Map();
1233
897 for (let filter of this.elemhideFilters) 1234 for (let filter of this.elemhideFilters)
898 { 1235 {
899 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 1236 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
900 if (!result) 1237 if (!result)
901 continue; 1238 continue;
902 1239
903 if (result.matchDomains.length == 0) 1240 if (result.matchDomains.length == 0)
904 result.matchDomains = ["^https?://"]; 1241 {
905 1242 genericSelectors.push(result.selector);
906 for (let matchDomain of result.matchDomains) 1243 }
907 { 1244 else
908 let group = groupedElemhideFilters.get(matchDomain) || []; 1245 {
909 group.push(result.selector); 1246 for (let matchDomain of result.matchDomains)
910 groupedElemhideFilters.set(matchDomain, group); 1247 {
911 } 1248 let group = groupedElemhideFilters.get(matchDomain) || [];
912 } 1249 group.push(result.selector);
1250 groupedElemhideFilters.set(matchDomain, group);
1251 }
1252 }
1253 }
1254
1255 // Separate out the element hiding exceptions that have only a hostname part
1256 // from the rest. This allows us to implement a workaround for issue #5345
1257 // (WebKit bug #167423), but as a bonus it also reduces the number of
1258 // generated rules. The downside is that the exception will only apply to the
1259 // top-level document, not to iframes. We have to live with this until the
1260 // WebKit bug is fixed in all supported versions of Safari.
1261 // https://bugs.webkit.org/show_bug.cgi?id=167423
1262 //
1263 // Note that as a result of this workaround we end up with a huge rule set in
1264 // terms of the amount of memory used. This can cause Node.js to throw
1265 // "JavaScript heap out of memory". To avoid this, call Node.js with
1266 // --max_old_space_size=4096
1267 let elemhideExceptionDomains = extractFilterDomains(this.elemhideExceptions);
1268
1269 let genericSelectorExceptionDomains =
1270 extractFilterDomains(this.generichideExceptions);
1271 elemhideExceptionDomains.forEach(name =>
1272 {
1273 genericSelectorExceptionDomains.add(name);
1274 });
1275
1276 addCSSRules(cssRules, genericSelectors, null,
1277 genericSelectorExceptionDomains);
1278
1279 // Filter out whitelisted domains.
1280 elemhideExceptionDomains.forEach(domain =>
1281 groupedElemhideFilters.delete(domain));
913 1282
914 groupedElemhideFilters.forEach((selectors, matchDomain) => 1283 groupedElemhideFilters.forEach((selectors, matchDomain) =>
915 { 1284 {
916 while (selectors.length) 1285 addCSSRules(cssRules, selectors, matchDomain, elemhideExceptionDomains);
917 { 1286 });
918 let selector = selectors.splice(0, selectorLimit).join(", "); 1287
919 1288 let requestFilterExceptionDomains = [];
920 // As of Safari 9.0 element IDs are matched as lowercase. We work around 1289 for (let filter of this.genericblockExceptions)
921 // this by converting to the attribute format [id="elementID"] 1290 {
922 selector = convertIDSelectorsToAttributeSelectors(selector); 1291 let parsed = parseFilterRegexpSource(filter.regexpSource);
923 1292 if (parsed.hostname)
924 rules.push({ 1293 requestFilterExceptionDomains.push(parsed.hostname);
925 trigger: {"url-filter": matchDomain, 1294 }
926 "url-filter-is-case-sensitive": true}, 1295
927 action: {type: "css-display-none", 1296 for (let filter of this.requestFilters)
928 selector: selector} 1297 {
1298 convertFilterAddRules(blockingRules, filter, "block", true,
1299 requestFilterExceptionDomains);
1300 }
1301
1302 for (let filter of this.requestExceptions)
1303 {
1304 convertFilterAddRules(blockingExceptionRules, filter,
1305 "ignore-previous-rules", true);
1306 }
1307
1308 return async(ruleGroups, (group, index) => () =>
1309 {
1310 let next = () =>
1311 {
1312 if (index == ruleGroups.length - 1)
1313 return ruleGroups.reduce((all, rules) => all.concat(rules), []);
1314 };
1315
1316 if (this.options.merge == "all" ||
1317 (this.options.merge == "auto" &&
1318 ruleGroups.reduce((n, group) => n + group.length, 0) > 50000))
1319 {
1320 return mergeRules(ruleGroups[index], this.options.merge == "all")
1321 .then(rules =>
1322 {
1323 ruleGroups[index] = rules;
1324 return next();
929 }); 1325 });
930 } 1326 }
1327
1328 return next();
931 }); 1329 });
932
933 for (let filter of this.elemhideExceptions)
934 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
935 for (let filter of this.requestFilters)
936 convertFilterAddRules(rules, filter, "block", true);
937 for (let filter of this.requestExceptions)
938 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
939
940 rules = rules.filter(rule => !hasNonASCI(rule));
941
942 if (options.merge)
943 {
944 // If the more specific options are specified (e.g. "advanced" and
945 // "exhaustive"), they override the more general options (e.g. "fast").
946 let mergeOptions = {
947 advanced: options.advancedMerge ||
948 (!options.fastMerge && options.advancedMerge != false),
949 exhaustive: options.exhaustiveMerge ||
950 (!options.fastMerge && options.exhaustiveMerge != false)
951 };
952
953 rules = mergeCloselyMatchingRules(rules, mergeOptions);
954 }
955
956 return rules;
957 }; 1330 };
LEFTRIGHT

Powered by Google App Engine
This is Rietveld