Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/contentBlockerLists.js

Issue 29336753: Issue 3671 - Split out contentBlockerList API (Closed)
Patch Set: Addressed feedback, removed some modules Created Feb. 21, 2016, 11:26 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2016 Eyeo GmbH
4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation.
8 *
9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 /** @module contentBlockerLists */
Sebastian Noack 2016/02/21 20:34:45 I'd rather call that module simply abp2blocklist,
Sebastian Noack 2016/02/21 20:36:15 s/is the module of the project/is the name of the
kzar 2016/02/22 12:28:06 I think abp2blocklist is a fine name for the scrip
Sebastian Noack 2016/02/22 17:35:28 Whether it will be included into adblockpluscore o
kzar 2016/02/22 18:09:29 I'd really rather keep the name lib/contentBlocker
Sebastian Noack 2016/02/22 18:20:26 So I guess we need a third opinion here. fhd, what
kzar 2016/02/22 19:46:01 I mean by the same logic, wouldn't having the libr
Felix Dahlke 2016/02/24 13:06:30 No particularly strong opinion, but since you're a
Sebastian Noack 2016/02/24 19:31:51 Thanks for your feedback. But first of all "abp2b
Sebastian Noack 2016/02/24 22:10:04 Dave and I continued the discussion on IRC: <snoa
Felix Dahlke 2016/02/26 15:40:35 Alright, read up on the discussion again and had a
19
20 "use strict";
21
22 let filterClasses = require("filterClasses");
23 let getBaseDomain = require("url").getBaseDomain;
24 let punycode = require("punycode");
25
26
Sebastian Noack 2016/02/21 20:34:45 Nit: One empty line should suffice here. Same belo
kzar 2016/02/22 12:28:05 Done.
// Upper bound on how many selectors are joined into a single
// "css-display-none" rule.
const selectorLimit = 5000;

// Content type bit flags, taken from RegExpFilter in the filterClasses module.
const {typeMap} = filterClasses.RegExpFilter;
29
30
/**
 * Sorts the keys of a domain map into included and excluded domain lists.
 *
 * Every non-empty key is lower-cased and converted with punycode.toASCII.
 * Keys with a falsy value are appended to `excluded`. Keys with a truthy
 * value are appended to `included`, but only when the map's "" entry is
 * falsy.
 *
 * @param {Object} domains   Map of domain name to boolean
 * @param {string[]} included  Receives the included domains
 * @param {string[]} excluded  Receives the excluded domains
 */
function parseDomains(domains, included, excluded)
{
  for (const name in domains)
  {
    // The "" entry is not a domain itself, it is only consulted below
    if (name === "")
      continue;

    const asciiName = punycode.toASCII(name.toLowerCase());

    if (!domains[name])
    {
      excluded.push(asciiName);
      continue;
    }

    if (!domains[""])
      included.push(asciiName);
  }
}
47
/**
 * Escapes all characters that have a special meaning in regular expressions
 * by prefixing them with a backslash.
 *
 * @param {string} s  Text to escape
 * @returns {string} Text that matches literally when used as regexp source
 */
function escapeRegExp(s)
{
  const specialChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(specialChars, "\\$&");
}
52
/**
 * Builds the source of a regular expression matching HTTP(S) URLs whose
 * host is the given domain, optionally preceded by further labels, and
 * followed by "/" or ":".
 *
 * @param {string} domain  Domain name, taken literally
 * @returns {string} Regular expression source
 */
function matchDomain(domain)
{
  return `^https?://([^/:]*\\.)?${escapeRegExp(domain)}[/:]`;
}
57
/**
 * Converts an element hiding filter into the data needed for a
 * "css-display-none" rule.
 *
 * Nothing is returned for filters that have excluded domains, or whose
 * selector has an element hiding exception registered.
 *
 * @param {Object} filter  Filter providing `domains` and `selector`
 * @param {Object|Map} elemhideSelectorExceptions
 *   Selectors with exceptions, stored either as own properties or as
 *   Map entries
 * @returns {?Object} `{matchDomains, selector}`, or undefined if the filter
 *   cannot be converted
 */
function convertElemHideFilter(filter, elemhideSelectorExceptions)
{
  const included = [];
  const excluded = [];

  parseDomains(filter.domains, included, excluded);

  if (excluded.length > 0)
    return undefined;

  // Only own properties (or real Map entries) count as exceptions. The "in"
  // operator would also find inherited members such as "size", "get" or
  // "constructor" and wrongly drop filters whose selector has such a name.
  const hasException =
    (elemhideSelectorExceptions instanceof Map &&
     elemhideSelectorExceptions.has(filter.selector)) ||
    Object.prototype.hasOwnProperty.call(elemhideSelectorExceptions,
                                         filter.selector);
  if (hasException)
    return undefined;

  return {matchDomains: included.map(matchDomain), selector: filter.selector};
}
69
/**
 * Translates Adblock Plus filter text into regular expression source.
 *
 * - "*" becomes ".*", except at the start of the output, at the end of the
 *   text, or when directly followed by another "*".
 * - "^" becomes "." unless it is the last character, where it is dropped.
 * - "|" anchors the start ("^") at position 0 and the end ("$") at the last
 *   position; a second leading "|" becomes "https?://". Any other "|" is
 *   escaped.
 * - Remaining regexp special characters are escaped with a backslash.
 *
 * @param {string} text  Filter text
 * @returns {string} Regular expression source
 */
function toRegExp(text)
{
  const last = text.length - 1;
  // Characters that always need a backslash in front of them
  const escapedChars = ".+?${}()[]\\";
  let regexp = "";

  for (let pos = 0; pos <= last; pos++)
  {
    const chr = text[pos];

    if (chr === "*")
    {
      // Leading, trailing and repeated wildcards are redundant
      if (regexp.length > 0 && pos < last && text[pos + 1] !== "*")
        regexp += ".*";
    }
    else if (chr === "^")
    {
      // A trailing separator placeholder is dropped entirely
      if (pos < last)
        regexp += ".";
    }
    else if (chr === "|" && pos === 0)
      regexp += "^";
    else if (chr === "|" && pos === last)
      regexp += "$";
    else if (chr === "|" && pos === 1 && text[0] === "|")
      regexp += "https?://";
    else if (chr === "|" || escapedChars.includes(chr))
      regexp += "\\" + chr;
    else
      regexp += chr;
  }

  return regexp;
}
116
/**
 * Produces the regular expression source for a filter's "url-filter".
 *
 * A domain at the start of the filter's regexpSource is converted with
 * punycode.toASCII before the text is passed through toRegExp. Results that
 * do not already start with "^" or "http" are prefixed so that they only
 * match HTTP(S) URLs.
 *
 * @param {Object} filter  Filter providing `regexpSource`
 * @returns {string} Regular expression source
 */
function getRegExpSource(filter)
{
  // Safari expects punycode, filter lists use unicode
  const leadingDomain = /^(\|\||\|?https?:\/\/)([\w\-.*\u0080-\uFFFF]+)/i;
  const asciiText = filter.regexpSource.replace(
    leadingDomain,
    (match, prefix, domain) => prefix + punycode.toASCII(domain)
  );

  const source = toRegExp(asciiText);

  // Limit rules to HTTP(S) URLs
  if (/^(\^|http)/i.test(source))
    return source;

  return "^https?://.*" + source;
}
134
/**
 * Translates the content type bits of a filter into the list of resource
 * type names used for a rule's "resource-type" trigger field.
 *
 * @param {Object} filter  Filter providing the `contentType` bit mask
 * @returns {string[]} Resource type names, in a fixed order
 */
function getResourceTypes(filter)
{
  // Each entry pairs a content type mask with the resulting resource type.
  // The order of the entries determines the order of the output.
  const mapping = [
    [typeMap.IMAGE, "image"],
    [typeMap.STYLESHEET, "style-sheet"],
    [typeMap.SCRIPT, "script"],
    [typeMap.FONT, "font"],
    [typeMap.MEDIA | typeMap.OBJECT, "media"],
    [typeMap.POPUP, "popup"],
    [typeMap.XMLHTTPREQUEST |
     typeMap.OBJECT_SUBREQUEST |
     typeMap.PING |
     typeMap.OTHER, "raw"],
    [typeMap.SUBDOCUMENT, "document"]
  ];

  return mapping
    .filter(([mask]) => filter.contentType & mask)
    .map(([, name]) => name);
}
159
/**
 * Returns the given domains, with a "www." variant added directly after
 * every domain that getBaseDomain reports as being its own base domain.
 *
 * @param {string[]} domains  Domain names
 * @returns {string[]} New array containing the domains and their variants
 */
function addDomainPrefix(domains)
{
  const prefixed = [];

  for (const name of domains)
  {
    if (getBaseDomain(name) == name)
      prefixed.push(name, "www." + name);
    else
      prefixed.push(name);
  }

  return prefixed;
}
174
/**
 * Builds a content blocker rule from a request filter.
 *
 * The trigger always carries a "url-filter". "resource-type" is added on
 * request, "load-type" when the filter's thirdParty property is set, and
 * either "if-domain" or "unless-domain" when the filter is restricted to or
 * excluded from domains. Included domains take precedence over excluded
 * ones.
 *
 * @param {Object} filter  Filter to convert
 * @param {string} action  Value for the rule's action type
 * @param {boolean} withResourceTypes  Whether to add "resource-type"
 * @returns {Object} Rule of the form `{trigger, action}`
 */
function convertFilter(filter, action, withResourceTypes)
{
  const trigger = {"url-filter": getRegExpSource(filter)};
  const includedDomains = [];
  const excludedDomains = [];

  parseDomains(filter.domains, includedDomains, excludedDomains);

  if (withResourceTypes)
    trigger["resource-type"] = getResourceTypes(filter);

  // null and undefined both mean that the load type does not matter
  if (filter.thirdParty != null)
  {
    const loadType = filter.thirdParty ? "third-party" : "first-party";
    trigger["load-type"] = [loadType];
  }

  if (includedDomains.length > 0)
    trigger["if-domain"] = addDomainPrefix(includedDomains);
  else if (excludedDomains.length > 0)
    trigger["unless-domain"] = addDomainPrefix(excludedDomains);

  return {trigger, action: {type: action}};
}
195
/**
 * Checks recursively whether a value contains a string with characters
 * outside of the ASCII range.
 *
 * Strings are tested directly. For objects (including arrays) the values of
 * all own properties are checked; property names themselves are not.
 * Anything else, including null, contains no such characters.
 *
 * @param {*} obj  Value to inspect
 * @returns {boolean} True if a non-ASCII character was found
 */
function hasNonASCI(obj)
{
  if (typeof obj == "string")
    return /[^\x00-\x7F]/.test(obj);

  // typeof null is "object", but Object.getOwnPropertyNames(null) throws
  if (obj === null || typeof obj != "object")
    return false;

  // The own property names of an array include its indices, so arrays need
  // no separate pass over their items.
  for (let name of Object.getOwnPropertyNames(obj))
  {
    if (hasNonASCI(obj[name]))
      return true;
  }

  return false;
}
218
/**
 * Rewrites all ID selectors (#someID) within a selector to attribute
 * selectors of the form [id=someID].
 *
 * Escaped characters are skipped, and "#" within quoted text, e.g.
 * [attr="#Hello"], is left alone.
 *
 * @param {string} selector  CSS selector
 * @returns {string} Selector with the ID selectors replaced
 */
function convertIDSelectorsToAttributeSelectors(selector)
{
  // Characters that continue an ID: "-", "_", ASCII letters and digits,
  // and everything from U+0080 upwards
  const idChar = /[\w\-\u0080-\uFFFF]/;

  // Pass 1: record where every ID selector starts and ends
  const ranges = [];
  let quote = "";
  let idStart = null;

  for (let pos = 0; pos < selector.length; pos++)
  {
    const chr = selector[pos];

    if (chr === "\\")
    {
      // Skip the escaped character as well
      pos++;
      continue;
    }

    if (chr === quote)
    {
      // End of quoted text
      quote = "";
      continue;
    }

    // Nothing within quoted text is treated as an ID
    if (quote !== "")
      continue;

    if (chr === '"' || chr === "'")
      quote = chr;
    else if (idStart === null)
    {
      if (chr === "#")
        idStart = pos;
    }
    else if (!idChar.test(chr))
    {
      ranges.push({start: idStart, end: pos});
      idStart = null;
    }
  }

  // An ID that runs up to the end of the selector
  if (idStart !== null)
    ranges.push({start: idStart, end: selector.length});

  // Pass 2: copy the selector, swapping each "#id" for "[id=id]"
  let converted = "";
  let copiedUpTo = 0;
  for (const range of ranges)
  {
    converted += selector.substring(copiedUpTo, range.start);
    converted += "[id=" + selector.substring(range.start + 1, range.end) + "]";
    copiedUpTo = range.end;
  }

  return converted + selector.substring(copiedUpTo);
}
269
270 /**
271 * Converts given Adblock Plus filters into content blocker list.
272 *
273 * @param {Filter[]} filters Sequence of filters to convert
274 * @returns {Object[]} Array of content blocker rules
275 */
276 exports.convertFilters = function(filters)
Sebastian Noack 2016/02/21 20:34:44 On the command line we read filters line by line.
kzar 2016/02/22 12:28:05 Done.
277 {
278 let requestFilters = [];
279 let requestExceptions = [];
280 let elemhideFilters = [];
281 let elemhideExceptions = [];
282 let elemhideSelectorExceptions = new Map();
283 let rules = [];
284
285 // Group the filters into their various types
286 for (let filter of filters)
287 {
288 if (filter.sitekeys)
289 continue;
290 if (filter instanceof filterClasses.RegExpFilter && !filter.regexpSource)
291 continue;
292
293 if (filter instanceof filterClasses.BlockingFilter)
294 requestFilters.push(filter);
295
296 if (filter instanceof filterClasses.WhitelistFilter)
297 {
298 if (filter.contentType & (typeMap.IMAGE
299 | typeMap.STYLESHEET
300 | typeMap.SCRIPT
301 | typeMap.FONT
302 | typeMap.MEDIA
303 | typeMap.POPUP
304 | typeMap.OBJECT
305 | typeMap.OBJECT_SUBREQUEST
306 | typeMap.XMLHTTPREQUEST
307 | typeMap.PING
308 | typeMap.SUBDOCUMENT
309 | typeMap.OTHER))
310 requestExceptions.push(filter);
311
312 if (filter.contentType & typeMap.ELEMHIDE)
313 elemhideExceptions.push(filter);
314 }
315
316 if (filter instanceof filterClasses.ElemHideFilter)
317 elemhideFilters.push(filter);
318
319 if (filter instanceof filterClasses.ElemHideException)
320 {
321 let domains = elemhideSelectorExceptions[filter.selector];
322 if (!domains)
323 domains = elemhideSelectorExceptions[filter.selector] = [];
324
325 parseDomains(filter.domains, domains, []);
326 }
327 }
328
329 // Create the content blocker rules for those grouped filters
330 function addRule(rule)
331 {
332 if (!hasNonASCI(rule))
333 rules.push(rule);
334 }
335
336 let groupedElemhideFilters = new Map();
337 for (let filter of elemhideFilters)
338 {
339 let result = convertElemHideFilter(filter, elemhideSelectorExceptions);
340 if (!result)
341 continue;
342
343 if (result.matchDomains.length == 0)
344 result.matchDomains = ["^https?://"];
345
346 for (let matchDomain of result.matchDomains)
347 {
348 let group = groupedElemhideFilters.get(matchDomain) || [];
349 group.push(result.selector);
350 groupedElemhideFilters.set(matchDomain, group);
351 }
352 }
353
354 groupedElemhideFilters.forEach((selectors, matchDomain) =>
355 {
356 while (selectors.length)
357 {
358 let selector = selectors.splice(0, selectorLimit).join(", ");
359
360 // As of Safari 9.0 element IDs are matched as lowercase. We work around
361 // this by converting to the attribute format [id="elementID"]
362 selector = convertIDSelectorsToAttributeSelectors(selector);
363
364 addRule({
365 trigger: {"url-filter": matchDomain},
366 action: {type: "css-display-none",
367 selector: selector}
368 });
369 }
370 });
371
372 for (let filter of elemhideExceptions)
373 addRule(convertFilter(filter, "ignore-previous-rules", false));
374 for (let filter of requestFilters)
375 addRule(convertFilter(filter, "block", true));
376 for (let filter of requestExceptions)
377 addRule(convertFilter(filter, "ignore-previous-rules", true));
378
379 return rules;
380 };
OLDNEW

Powered by Google App Engine
This is Rietveld