Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 5288886037118976: Adblock Plus Crawler rewrite (Closed)
Patch Set: Created April 24, 2015, 3:38 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/.
5 */
6
7 /**
8 * @module crawler
saroyanm 2015/05/04 18:13:43 I think this should be file overview, maybe smth l
Wladimir Palant 2015/05/07 00:04:59 As Sebastian noted elsewhere, @module is actually
9 */
10
11 Cu.import("resource://gre/modules/Services.jsm");
12 Cu.import("resource://gre/modules/Task.jsm");
13 Cu.import("resource://gre/modules/Promise.jsm");
14
15 function abprequire(module)
16 {
17 let result = {};
saroyanm 2015/05/04 18:13:43 We can create here Object without Object prototype
Sebastian Noack 2015/05/04 20:39:01 It seems that we don't need to bother about confli
Wladimir Palant 2015/05/07 00:04:59 Yes, this isn't a place where property conflicts c
18 result.wrappedJSObject = result;
19 Services.obs.notifyObservers(result, "adblockplus-require", module);
20 return result.exports;
21 }
22
23 let {Policy} = abprequire("contentPolicy");
24 let {RequestNotifier} = abprequire("requestNotifier");
25 let {Utils} = abprequire("utils");
26
27 let dataForTab = new WeakMap();
28
29 /**
30 * Creates a pool of tabs and allocates them to tasks on request.
31 *
32 * @param {tabbrowser} browser
33 * The tabbed browser where tabs should be created
34 * @param {int} maxtabs
35 * The maximum number of tabs to be allocated
36 * @constructor
37 */
38 function TabAllocator(browser, maxtabs)
saroyanm 2015/05/04 18:13:43 Why don't we use default amount of maxtab equal to
Wladimir Palant 2015/05/07 00:04:59 Because the text file has 50 thousand URLs.
39 {
40 browser.removeAllTabsBut(browser.tabs[0])
41
42 this._tabs = [];
43 for (let i = 0; i < maxtabs; i++)
44 this._tabs.push(browser.addTab("about:blank"));
45
46 browser.removeTab(browser.tabs[0]);
47
48 this._deferred = [];
49 }
50 TabAllocator.prototype = {
51 /**
52 * Returns a promise that will resolve into a tab once a tab can be allocated.
53 * The tab cannot be used by other tasks until releaseTab() is called.
54 *
55 * @result {Promise}
saroyanm 2015/05/04 18:13:43 Nit: @return ?
Wladimir Palant 2015/05/07 00:04:59 Done.
56 */
57 getTab: function()
58 {
59 if (this._tabs.length)
60 return this._tabs.shift();
61 else
62 {
63 let deferred = Promise.defer();
saroyanm 2015/05/04 18:13:43 The Deffered object is obsolete starting from Geck
Wladimir Palant 2015/05/07 00:04:59 Promise constructor in Promise.jsm is actually sup
64 this._deferred.push(deferred);
65 return deferred.promise;
66 }
67 },
68
69 /**
70 * Adds a tab back to the pool so that it can be used by other tasks.
71 *
72 * @param {tab} tab
73 */
74 releaseTab: function(tab)
75 {
76 let browser = tab.parentNode.tabbrowser;
77 browser.removeTab(tab);
78 tab = browser.addTab("about:blank");
79
80 if (this._deferred.length)
81 this._deferred.shift().resolve(tab);
saroyanm 2015/05/04 18:13:43 As mentioned above, the Deffered object is obsolet
Wladimir Palant 2015/05/07 00:04:59 Done.
82 else
83 this._tabs.push(tab);
84 }
85 };
86
87 /**
88 * Observes page loads in a particular tabbed browser.
89 *
90 * @param {tabbrowser} browser
91 * The tabbed browser to be observed
92 * @param {int} timeout
93 * Load timeout in milliseconds
94 * @constructor
95 */
96 function LoadListener(browser, timeout)
97 {
98 this._browser = browser;
99 this._deferred = new Map();
100 this._timeout = timeout;
101 browser.addTabsProgressListener(this);
102 }
103 LoadListener.prototype = {
104 /**
105 * Returns a promise that will be resolved when the page in the specified tab
106 * finishes loading. Loading will be stopped if the timeout is reached.
107 *
108 * @param {tab} tab
109 * @result {Promise}
saroyanm 2015/05/04 18:13:43 Nit: @return
Wladimir Palant 2015/05/07 00:04:59 Done.
110 */
111 waitForLoad: function(tab)
112 {
113 let deferred = Promise.defer();
saroyanm 2015/05/04 18:13:43 As mentioned above this is Obsolete, this comment
Wladimir Palant 2015/05/07 00:04:59 Done.
114 this._deferred.set(tab.linkedBrowser, deferred);
115
116 tab.ownerDocument.defaultView.setTimeout(function()
117 {
118 tab.linkedBrowser.stop();
119 }, this._timeout);
120
121 return deferred.promise;
122 },
123
124 /**
125 * Deactivates this object.
126 */
127 stop: function()
128 {
129 this._browser.removeTabsProgressListener(this);
130 },
131
132 onStateChange: function(browser, progress, request, flags, status)
133 {
134 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
saroyanm 2015/05/04 18:13:43 Nit: This case can be splitted into multiple lines
135 {
136 let deferred = this._deferred.get(browser);
137 if (deferred)
138 {
139 this._deferred.delete(browser);
140
141 let headers = [];
142 if (request instanceof Ci.nsIHttpChannel)
143 {
144 try
145 {
146 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
147 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
148 }
149 catch (e)
150 {
151 // Exceptions are expected here
saroyanm 2015/05/04 18:13:43 Please also handle the exception. reportException
Wladimir Palant 2015/05/07 00:04:59 As the comment explains, exceptions are expected h
152 }
153 }
154 deferred.resolve([status, headers]);
155 }
156 }
157 }
158 };
159
160 /**
161 * Once created, this object will make sure all new windows are dismissed
162 * immediately.
163 *
164 * @constructor
165 */
166 function WindowCloser()
167 {
168 Services.obs.addObserver(this, "xul-window-registered", true)
saroyanm 2015/05/04 18:13:43 Missing semicolon
Wladimir Palant 2015/05/07 00:04:59 Done.
169 }
170 WindowCloser.prototype = {
171 /**
172 * Deactivates this object.
173 */
174 stop: function()
175 {
176 Services.obs.removeObserver(this, "xul-window-registered")
saroyanm 2015/05/04 18:13:43 Missing semicolon
Wladimir Palant 2015/05/07 00:04:59 Done.
177 },
178
179 observe: function(subject, topic, data)
180 {
181 let window = subject.QueryInterface(Ci.nsIInterfaceRequestor)
182 .getInterface(Ci.nsIDOMWindow)
saroyanm 2015/05/04 18:13:43 Missing semicolon
Wladimir Palant 2015/05/07 00:04:59 Done.
183 window.addEventListener("load", function()
184 {
185 if (window.document.documentElement.localName == 'dialog')
saroyanm 2015/05/04 18:13:43 Nit: Please use double quote.
Wladimir Palant 2015/05/07 00:04:59 Done.
186 window.document.documentElement.acceptDialog();
187 else
188 window.close();
189 }, false);
190 },
191
192 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
193 };
194
195 /**
196 * Retrieves crawler results associated with a particular content window.
197 *
198 * @param {Window} window
199 * Content window to retrieve crawler results for
200 * @result {Object}
saroyanm 2015/05/04 18:13:43 Nit: @return
Wladimir Palant 2015/05/07 00:04:59 Done.
201 * Crawler results or undefined if the window wasn't created by the crawler.
202 */
203 function getDataForWindow(window)
204 {
205 let topWindow = window.top;
206 if (!topWindow.document)
207 throw new Error("No document associated with the node's top window");
208 let tabbrowser = Utils.getChromeWindow(topWindow).getBrowser();
209 if (!tabbrowser)
210 throw new Error("Unable to get a tabbrowser reference from the window");
211 let browser = tabbrowser.getBrowserForDocument(topWindow.document);
212 if (!browser)
213 throw new Error("Unable to get browser for the content window");
214 let tab = tabbrowser.getTabForBrowser(browser);
215 if (!tab)
216 throw new Error("Unable to get tab for the browser");
217 return dataForTab.get(tab);
218 };
saroyanm 2015/05/04 18:13:43 Nit: Semicolon is redundant here.
Wladimir Palant 2015/05/07 00:04:59 Done.
219
220 /**
221 * Starts the crawling session. The crawler opens each URL in a tab and stores
222 * the results.
223 *
224 * @param {Window} window
225 * The browser window we're operating in
226 * @param {String[]} urls
227 * URLs to be crawled
228 * @param {int} number_of_tabs
229 * Maximum number of tabs to be opened
230 * @param {String} targetURL
231 * URL that should receive the results
saroyanm 2015/05/04 18:13:43 Nit: Please also document onDone parameter for con
Sebastian Noack 2015/05/04 20:39:01 Also note that JsDoc 3 will mistakenly document th
Wladimir Palant 2015/05/07 00:04:59 Done.
232 */
233 function run(window, urls, timeout, maxtabs, targetURL, onDone)
234 {
235 let requestNotifier = new RequestNotifier(null, function() {});
236
237 let origProcessNode = Policy.processNode;
238 Policy.processNode = processNodeReplacement.bind(null, origProcessNode, requestNotifier);
239
240 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
241 let loadListener = new LoadListener(window.getBrowser(), timeout);
242 let running = 0;
243 let windowCloser = new WindowCloser();
244 let taskDone = function()
245 {
246 running--;
247 if (running <= 0)
248 {
249 Policy.processNode = origProcessNode;
250 requestNotifier.shutdown();
251 loadListener.stop();
252 windowCloser.stop();
253 onDone();
254 }
255 };
256
257 for (let url of urls)
258 {
259 running++;
260 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result)
261 {
262 let request = new XMLHttpRequest();
263 request.open("POST", targetURL);
264 request.addEventListener("load", taskDone, false);
265 request.addEventListener("error", taskDone, false);
266 request.send(JSON.stringify(result));
267 }, function(url, exception)
268 {
269 reportException(exception);
270
271 let request = new XMLHttpRequest();
272 request.open("POST", targetURL);
273 request.addEventListener("load", taskDone, false);
274 request.addEventListener("error", taskDone, false);
275 request.send(JSON.stringify({
276 url: url,
277 startTime: Date.now(),
278 error: String(exception)
279 }));
280 }.bind(null, url));
281 }
282 }
283 exports.run = run;
284
285 /**
286 * Crawls a URL. This is a generator meant to be used via a Task object.
287 *
288 * @param {String} url
289 * @param {TabAllocator} tabAllocator
290 * @param {loadListener} loadListener
291 * @result {Object}
saroyanm 2015/05/04 18:13:43 Nit: @return {Object} Crawling result
Wladimir Palant 2015/05/07 00:04:59 Done.
292 * Crawling result
293 */
294 function* crawl_url(url, tabAllocator, loadListener)
295 {
296 let tab = yield tabAllocator.getTab();
297 let result = {url: url};
298
299 dataForTab.set(tab, result);
300 try
301 {
302 result.startTime = Date.now();
303 tab.linkedBrowser.loadURI(url, null, null);
304 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab);
305 result.endTime = Date.now();
306 result.finalUrl = tab.linkedBrowser.currentURI.spec;
307
308 let document = tab.linkedBrowser.contentDocument;
309 if (document.documentElement)
310 {
311 try
312 {
313 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
314 canvas.width = document.documentElement.scrollWidth;
315 canvas.height = document.documentElement.scrollHeight;
316
317 let context = canvas.getContext("2d");
318 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
319 result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
saroyanm 2015/05/04 18:13:43 Maybe make sense to let user specify the quality o
Wladimir Palant 2015/05/07 00:04:59 Well, changing maxtabs is very useful when testing
saroyanm 2015/05/07 13:19:00 Good point.
320 }
321 catch (e)
322 {
323 reportException(e);
324 result.error = "Capturing screenshot failed: " + e;
saroyanm 2015/05/04 18:13:43 Isn't result.error redundant ?
Wladimir Palant 2015/05/07 00:04:59 No. The exception is merely reported to the consol
saroyanm 2015/05/07 13:19:00 Yes, missed that. Good point.
325 }
326
327 // TODO: Capture frames as well?
saroyanm 2015/05/04 18:13:43 Nit: I think we shouldn't have TODO comments in re
Wladimir Palant 2015/05/07 00:04:59 Why not? This is something we might want to do in
328 let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
329 result.source = serializer.serializeToString(document.documentElement);
330 }
331 }
332 finally
333 {
334 tabAllocator.releaseTab(tab);
335 }
336 return result;
337 }
338
339 function reportException(e)
340 {
341 let stack = "";
342 if (e && typeof e == "object" && "stack" in e)
343 stack = e.stack + "\n";
344
345 Cu.reportError(e);
346 dump(e + "\n" + stack + "\n");
347 }
348
349 /**
350 * Wrapper for the Policy.processNode() function in ABP. Calls the original
351 * function and records all the data.
352 *
353 * @param {Function} origProcessNode
354 * The original processNode function.
355 * @param {RequestNotifier} requestNotifier
356 * The crawler's RequestNotifier object instance.
357 * @param {nsIDOMWindow} wnd
358 * @param {nsIDOMElement} node
359 * @param {Number} contentType
360 * @param {nsIURI} location
361 * @param {Boolean} collapse
362 * @return {Boolean}
363 */
364 function processNodeReplacement(origProcessNode, requestNotifier, wnd, node, contentType, location, collapse)
365 {
366 let filters = [];
367 let origListener = requestNotifier.listener;
368 requestNotifier.listener = function(window, node, entry)
369 {
370 if (entry.filter)
371 filters.push(entry.filter.text);
372 };
373
374 /*
375 * Call the original processNode. If the original throws, then we will too, so this is outside a try clause.
saroyanm 2015/05/04 18:13:43 Nit: the line is too long.
Sebastian Noack 2015/05/04 20:39:01 Also I suggest to use simply // for comments expla
Wladimir Palant 2015/05/07 00:04:59 Done.
376 */
377 let result;
378 try
379 {
380 result = origProcessNode(wnd, node, contentType, location, collapse);
381 }
382 finally
383 {
384 requestNotifier.listener = origListener;
385 }
386
387 try
388 {
389 let data = getDataForWindow(wnd);
390 if (data)
391 {
392 if (!("requests" in data))
393 data.requests = [];
394 data.requests.push({
395 contentType: contentType,
396 location: (contentType == Policy.type.ELEMHIDE ? location.text : location.spec),
397 blocked: result != Ci.nsIContentPolicy.ACCEPT,
398 filters: filters
399 });
400 }
401 }
402 catch (e)
403 {
404 reportException(e);
405 }
406 return result;
407 };
saroyanm 2015/05/04 18:13:43 Nit: Semicolon is redundant here.
Wladimir Palant 2015/05/07 00:04:59 Done.
OLDNEW

Powered by Google App Engine
This is Rietveld