Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 5288886037118976: Adblock Plus Crawler rewrite (Closed)
Patch Set: Addressed comments Created May 7, 2015, 12:04 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/.
5 */
6
7 /**
8 * @module crawler
9 */
10
11 Cu.import("resource://gre/modules/Services.jsm");
12 Cu.import("resource://gre/modules/Task.jsm");
13 Cu.import("resource://gre/modules/Promise.jsm");
14
15 let {reportException} = require("debug");
16
/**
 * Requests a module through the "adblockplus-require" observer notification.
 * The subject of the notification exposes itself via wrappedJSObject so that
 * whoever handles the notification can attach the module to it.
 *
 * @param {String} module
 *   Module name, sent as the notification data
 * @return {Object}
 *   The value found in the subject's "exports" property after the
 *   notification, undefined if nothing was stored there
 */
function abprequire(module)
{
  let carrier = {};
  carrier.wrappedJSObject = carrier;
  Services.obs.notifyObservers(carrier, "adblockplus-require", module);

  let {exports: moduleExports} = carrier;
  return moduleExports;
}
24
25 let {Policy} = abprequire("contentPolicy");
26 let {RequestNotifier} = abprequire("requestNotifier");
27 let {Utils} = abprequire("utils");
28
29 let dataForTab = new WeakMap();
30
/**
 * Creates a pool of tabs and allocates them to tasks on request.
 *
 * @param {tabbrowser} browser
 *   The tabbed browser where tabs should be created
 * @param {int} maxtabs
 *   The maximum number of tabs to be allocated
 * @constructor
 */
function TabAllocator(browser, maxtabs)
{
  // Close everything but the first tab. That tab is only removed after the
  // pool has been created.
  browser.removeAllTabsBut(browser.tabs[0]);

  // Tabs that are currently free
  this._tabs = [];
  for (let i = 0; i < maxtabs; i++)
    this._tabs.push(browser.addTab("about:blank"));

  browser.removeTab(browser.tabs[0]);

  // Resolve callbacks of getTab() calls that are waiting for a free tab
  this._queue = [];
}
TabAllocator.prototype = {
  /**
   * Returns a promise that will resolve into a tab once a tab can be allocated.
   * The tab cannot be used by other tasks until releaseTab() is called.
   *
   * A promise is returned even if a tab is available right away, so that
   * callers always get the same kind of value.
   *
   * @return {Promise}
   */
  getTab: function()
  {
    if (this._tabs.length)
      return Promise.resolve(this._tabs.shift());

    // No free tab, releaseTab() will resolve this promise later
    return new Promise((resolve, reject) => this._queue.push(resolve));
  },

  /**
   * Adds a tab back to the pool so that it can be used by other tasks. The
   * released tab is removed and a new about:blank tab takes its place.
   *
   * @param {tab} tab
   */
  releaseTab: function(tab)
  {
    let browser = tab.parentNode.tabbrowser;
    browser.removeTab(tab);
    tab = browser.addTab("about:blank");

    // Waiting tasks are served first, in the order they asked for a tab
    if (this._queue.length)
      this._queue.shift()(tab);
    else
      this._tabs.push(tab);
  }
};
84
/**
 * Observes page loads in a particular tabbed browser.
 *
 * @param {tabbrowser} browser
 *   The tabbed browser to be observed
 * @param {int} timeout
 *   Load timeout in milliseconds
 * @constructor
 */
function LoadListener(browser, timeout)
{
  this._browser = browser;
  // Maps a tab's linked browser to the callback settling its waitForLoad()
  // promise
  this._queue = new Map();
  this._timeout = timeout;
  browser.addTabsProgressListener(this);
}
LoadListener.prototype = {
  /**
   * Returns a promise that will be resolved when the page in the specified tab
   * finishes loading. Loading will be stopped if the timeout is reached.
   *
   * The promise resolves to a two-element array: the status passed to
   * onStateChange() and the list of collected response header lines.
   *
   * @param {tab} tab
   * @return {Promise}
   */
  waitForLoad: function(tab)
  {
    return new Promise((resolve, reject) =>
    {
      let window = tab.ownerDocument.defaultView;
      let timer = window.setTimeout(function()
      {
        tab.linkedBrowser.stop();
      }, this._timeout);

      this._queue.set(tab.linkedBrowser, function(result)
      {
        // The load is over, don't let the timeout fire for this tab later on
        window.clearTimeout(timer);
        resolve(result);
      });
    });
  },

  /**
   * Deactivates this object.
   */
  stop: function()
  {
    this._browser.removeTabsProgressListener(this);
  },

  /**
   * Tabs progress listener callback. Settles the pending waitForLoad()
   * promise of a browser once a state change carrying both STATE_STOP and
   * STATE_IS_WINDOW is seen for it.
   */
  onStateChange: function(browser, progress, request, flags, status)
  {
    if (!(flags & Ci.nsIWebProgressListener.STATE_STOP)
        || !(flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
    {
      return;
    }

    let resolve = this._queue.get(browser);
    if (!resolve)
      return;

    this._queue.delete(browser);

    let headers = [];
    if (request instanceof Ci.nsIHttpChannel)
    {
      try
      {
        headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
        request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
      }
      catch (e)
      {
        // Exceptions are expected here, whatever was collected before the
        // failure is still reported
      }
    }
    resolve([status, headers]);
  }
};
158
/**
 * Once created, this object will make sure all new windows are dismissed
 * immediately.
 *
 * The observer is registered as a weak reference (third addObserver()
 * argument), which is why nsISupportsWeakReference is listed in
 * QueryInterface below.
 *
 * @constructor
 */
function WindowCloser()
{
  Services.obs.addObserver(this, "xul-window-registered", true);
}
WindowCloser.prototype = {
  /**
   * Deactivates this object.
   */
  stop: function()
  {
    Services.obs.removeObserver(this, "xul-window-registered");
  },

  /**
   * Observer callback for "xul-window-registered". Waits for the new window
   * to load, then accepts it if its root element is a dialog and closes it
   * otherwise.
   */
  observe: function(subject, topic, data)
  {
    let window = subject.QueryInterface(Ci.nsIInterfaceRequestor)
                        .getInterface(Ci.nsIDOMWindow);
    window.addEventListener("load", function()
    {
      let root = window.document.documentElement;
      if (root.localName == "dialog")
        root.acceptDialog();
      else
        window.close();
    }, false);
  },

  // NOTE(review): XPCOMUtils is not among the Cu.import() calls at the top of
  // this file - confirm that the module loader provides it.
  QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
};
193
/**
 * Retrieves crawler results associated with a particular content window.
 * The lookup goes from the window's top window to its tabbrowser, browser and
 * tab; the tab is the key under which results are stored.
 *
 * @param {Window} window
 *   Content window to retrieve crawler results for
 * @return {Object}
 *   Crawler results or undefined if the window wasn't created by the crawler.
 */
function getDataForWindow(window)
{
  // Returns the value if it is truthy, throws the given message otherwise
  function required(value, message)
  {
    if (!value)
      throw new Error(message);
    return value;
  }

  let contentTop = window.top;
  required(contentTop.document,
           "No document associated with the node's top window");

  let tabbrowser = required(Utils.getChromeWindow(contentTop).getBrowser(),
                            "Unable to get a tabbrowser reference from the window");
  let browser = required(tabbrowser.getBrowserForDocument(contentTop.document),
                         "Unable to get browser for the content window");
  let tab = required(tabbrowser.getTabForBrowser(browser),
                     "Unable to get tab for the browser");

  return dataForTab.get(tab);
}
218
/**
 * Starts the crawling session. The crawler opens each URL in a tab and stores
 * the results.
 *
 * While the session is running Policy.processNode is replaced by a recording
 * wrapper. The original function is restored before onDone is called.
 *
 * @param {Window} window
 *   The browser window we're operating in
 * @param {String[]} urls
 *   URLs to be crawled
 * @param {int} timeout
 *   Load timeout in milliseconds
 * @param {int} maxtabs
 *   Maximum number of tabs to be opened
 * @param {String} targetURL
 *   URL that should receive the results
 * @param {Function} onDone
 *   Callback to be executed once the processing finishes
 * @static
 */
function run(window, urls, timeout, maxtabs, targetURL, onDone)
{
  let requestNotifier = new RequestNotifier(null, function() {});

  let origProcessNode = Policy.processNode;
  Policy.processNode = processNodeReplacement.bind(null, origProcessNode, requestNotifier);

  let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
  let loadListener = new LoadListener(window.getBrowser(), timeout);
  let windowCloser = new WindowCloser();

  // Number of URLs whose result hasn't been delivered yet
  let running = 0;

  // Undoes everything set up above and notifies the caller
  let finish = function()
  {
    Policy.processNode = origProcessNode;
    requestNotifier.shutdown();
    loadListener.stop();
    windowCloser.stop();
    onDone();
  };

  let taskDone = function()
  {
    running--;
    if (running <= 0)
      finish();
  };

  // Sends one result to targetURL. The task counts as done when the request
  // ends, no matter whether it succeeded.
  let upload = function(data)
  {
    try
    {
      let request = new XMLHttpRequest();
      request.open("POST", targetURL);
      request.addEventListener("load", taskDone, false);
      request.addEventListener("error", taskDone, false);
      request.send(JSON.stringify(data));
    }
    catch (e)
    {
      // The request never started so neither event will fire. Count the task
      // as done here, otherwise the session would never finish.
      reportException(e);
      taskDone();
    }
  };

  for (let url of urls)
  {
    running++;
    Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(upload, function(url, exception)
    {
      reportException(exception);
      upload({
        url: url,
        startTime: Date.now(),
        error: String(exception)
      });
    }.bind(null, url));
  }

  // Nothing to crawl: no task will ever call taskDone(), so clean up here.
  // This is deferred so that onDone is never called before run() returns.
  if (!running)
    Promise.resolve().then(finish);
}
285 exports.run = run;
286
/**
 * Crawls a URL. This is a generator meant to be used via a Task object: it
 * yields the promise for a tab first and the promise for the page load second.
 *
 * The result object is registered in dataForTab under the allocated tab
 * before the load starts. The tab is released again in all cases, including
 * failures.
 *
 * @param {String} url
 * @param {TabAllocator} tabAllocator
 * @param {loadListener} loadListener
 * @return {Object}
 *   Crawling result
 */
// NOTE(review): reviewers asked for a camel case name here. The name is left
// unchanged because run() refers to this function as crawl_url.
function* crawl_url(url, tabAllocator, loadListener)
{
  let tab = yield tabAllocator.getTab();
  let result = {url: url};

  dataForTab.set(tab, result);
  try
  {
    result.startTime = Date.now();
    tab.linkedBrowser.loadURI(url, null, null);
    [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab);
    result.endTime = Date.now();
    result.finalUrl = tab.linkedBrowser.currentURI.spec;

    let document = tab.linkedBrowser.contentDocument;
    if (document.documentElement)
    {
      try
      {
        // Draw the full scrollable area of the page onto a canvas
        let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
        canvas.width = document.documentElement.scrollWidth;
        canvas.height = document.documentElement.scrollHeight;

        let context = canvas.getContext("2d");
        context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
        result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
      }
      catch (e)
      {
        // A failed screenshot is recorded but doesn't abort the crawl, the
        // page source is still captured below
        reportException(e);
        result.error = "Capturing screenshot failed: " + e;
      }

      // TODO: Capture frames as well?
      let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
      result.source = serializer.serializeToString(document.documentElement);
    }
  }
  finally
  {
    tabAllocator.releaseTab(tab);
  }
  return result;
}
340
/**
 * Wrapper for the Policy.processNode() function in ABP. Calls the original
 * function and records all the data.
 *
 * While the original function runs, the listener of the request notifier is
 * temporarily replaced in order to collect the text of the filters being
 * reported.
 *
 * @param {Function} origProcessNode
 *   The original processNode function.
 * @param {RequestNotifier} requestNotifier
 *   The crawler's RequestNotifier object instance.
 * @param {nsIDOMWindow} wnd
 * @param {nsIDOMElement} node
 * @param {Number} contentType
 * @param {nsIURI} location
 * @param {Boolean} collapse
 * @return {Boolean}
 *   Whatever the original processNode function returned
 */
function processNodeReplacement(origProcessNode, requestNotifier, wnd, node, contentType, location, collapse)
{
  let filters = [];
  let origListener = requestNotifier.listener;
  requestNotifier.listener = function(window, node, entry)
  {
    if (entry.filter)
      filters.push(entry.filter.text);
  };

  // Call the original processNode. If the original throws, then we will too:
  // there is no catch clause here, the finally clause merely makes sure that
  // the original listener is put back.
  let result;
  try
  {
    result = origProcessNode(wnd, node, contentType, location, collapse);
  }
  finally
  {
    requestNotifier.listener = origListener;
  }

  // Recording is best effort, a failure here must not change the outcome of
  // the policy check.
  try
  {
    let data = getDataForWindow(wnd);
    if (data)
    {
      if (!("requests" in data))
        data.requests = [];
      data.requests.push({
        contentType: contentType,
        location: (contentType == Policy.type.ELEMHIDE ? location.text : location.spec),
        // Keep the loose comparison: the result is documented as a Boolean
        // while ACCEPT is an interface constant (presumably numeric - confirm),
        // a strict comparison would then never match.
        blocked: result != Ci.nsIContentPolicy.ACCEPT,
        filters: filters
      });
    }
  }
  catch (e)
  {
    reportException(e);
  }
  return result;
}
OLDNEW

Powered by Google App Engine
This is Rietveld