Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * This Source Code is subject to the terms of the Mozilla Public License | |
3 * version 2.0 (the "License"). You can obtain a copy of the License at | |
4 * http://mozilla.org/MPL/2.0/. | |
5 */ | |
6 | |
7 /** | |
8 * @module crawler | |
9 */ | |
10 | |
11 Cu.import("resource://gre/modules/Services.jsm"); | |
12 Cu.import("resource://gre/modules/Task.jsm"); | |
13 Cu.import("resource://gre/modules/Promise.jsm"); | |
14 | |
15 let {reportException} = require("debug"); | |
16 | |
/**
 * Requests a module from Adblock Plus. An "adblockplus-require" observer
 * notification is sent with the module name as data, and whatever ends up in
 * the subject's "exports" property afterwards is returned.
 *
 * @param {String} module
 *   Name of the module to be retrieved
 * @return {Object}
 *   Value of the "exports" property of the notification subject (undefined
 *   if nothing set it)
 */
function abprequire(module)
{
  // The subject points to itself via wrappedJSObject, presumably so that the
  // observer can reach the plain JS object and attach the exports to it.
  let carrier = {};
  carrier.wrappedJSObject = carrier;
  Services.obs.notifyObservers(carrier, "adblockplus-require", module);

  let {exports: moduleExports} = carrier;
  return moduleExports;
}
24 | |
25 let {Policy} = abprequire("contentPolicy"); | |
26 let {RequestNotifier} = abprequire("requestNotifier"); | |
27 let {Utils} = abprequire("utils"); | |
28 | |
29 let dataForTab = new WeakMap(); | |
30 | |
/**
 * Pool of browser tabs that are handed out to tasks on request.
 *
 * On construction all tabs but the first one are closed, the pool is filled
 * with fresh "about:blank" tabs and finally the remaining original tab is
 * closed as well.
 *
 * @param {tabbrowser} browser
 *   The tabbed browser where tabs should be created
 * @param {int} maxtabs
 *   The maximum number of tabs to be allocated
 * @constructor
 */
function TabAllocator(browser, maxtabs)
{
  browser.removeAllTabsBut(browser.tabs[0]);

  let pool = [];
  for (let created = 0; created < maxtabs; created += 1)
  {
    let blankTab = browser.addTab("about:blank");
    pool.push(blankTab);
  }
  this._tabs = pool;

  // The original first tab is only removed once the pool tabs exist.
  browser.removeTab(browser.tabs[0]);

  // Resolve callbacks of tasks waiting for a tab to become available.
  this._queue = [];
}
TabAllocator.prototype = {
  /**
   * Returns a promise that will resolve into a tab once a tab can be allocated.
   * The tab cannot be used by other tasks until releaseTab() is called.
   *
   * A promise is returned even if a tab is available immediately, so callers
   * can always treat the result the same way.
   *
   * @return {Promise}
   *   Promise resolving into the allocated tab
   */
  getTab: function()
  {
    if (this._tabs.length)
      return Promise.resolve(this._tabs.shift());

    // No free tab: park the resolve callback until releaseTab() serves it.
    return new Promise((resolve, reject) => this._queue.push(resolve));
  },

  /**
   * Adds a tab back to the pool so that it can be used by other tasks. The
   * released tab is closed and replaced by a fresh "about:blank" tab, which is
   * either handed to the longest-waiting task or stored in the pool.
   *
   * @param {tab} tab
   *   Tab previously obtained via getTab()
   */
  releaseTab: function(tab)
  {
    let browser = tab.parentNode.tabbrowser;
    browser.removeTab(tab);
    tab = browser.addTab("about:blank");

    if (this._queue.length)
      this._queue.shift()(tab);
    else
      this._tabs.push(tab);
  }
};
84 | |
/**
 * Watches page loads in a tabbed browser. The instance registers itself as
 * tabs progress listener of that browser right away.
 *
 * @param {tabbrowser} browser
 *   The tabbed browser to be observed
 * @param {int} timeout
 *   Load timeout in milliseconds
 * @constructor
 */
function LoadListener(browser, timeout)
{
  this._timeout = timeout;
  // Pending resolve callbacks, keyed by the browser element being loaded.
  this._queue = new Map();
  this._browser = browser;

  this._browser.addTabsProgressListener(this);
}
LoadListener.prototype = {
  /**
   * Returns a promise that will be resolved when the page in the specified tab
   * finishes loading. Loading will be stopped if the timeout is reached.
   *
   * The promise resolves into a two-element array: the status passed to
   * onStateChange() and the list of response header lines.
   *
   * @param {tab} tab
   * @return {Promise}
   */
  waitForLoad: function(tab)
  {
    let browser = tab.linkedBrowser;
    let window = tab.ownerDocument.defaultView;

    return new Promise((resolve, reject) =>
    {
      let timer = window.setTimeout(function()
      {
        browser.stop();
      }, this._timeout);

      this._queue.set(browser, function(result)
      {
        // Cancel the timeout once the load completed. Otherwise it would
        // still fire later and call stop() on a tab that might have been
        // removed already.
        window.clearTimeout(timer);
        resolve(result);
      });
    });
  },

  /**
   * Deactivates this object.
   */
  stop: function()
  {
    this._browser.removeTabsProgressListener(this);
  },

  /**
   * Progress listener callback. Resolves the pending waitForLoad() promise
   * for the given browser once a window-level stop notification arrives.
   *
   * @param {browser} browser
   *   Browser element the notification belongs to
   * @param {nsIWebProgress} progress
   * @param {nsIRequest} request
   * @param {Number} flags
   *   Combination of nsIWebProgressListener state flags
   * @param {Number} status
   *   Passed through as the first element of the resolved value
   */
  onStateChange: function(browser, progress, request, flags, status)
  {
    let isWindowStop = (flags & Ci.nsIWebProgressListener.STATE_STOP)
                       && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW);
    if (!isWindowStop)
      return;

    let resolve = this._queue.get(browser);
    if (!resolve)
      return;

    this._queue.delete(browser);

    let headers = [];
    if (request instanceof Ci.nsIHttpChannel)
    {
      try
      {
        headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
        request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
      }
      catch (e)
      {
        // Exceptions are expected here
      }
    }
    resolve([status, headers]);
  }
};
158 | |
/**
 * Dismisses every new window as soon as it has loaded. The instance starts
 * observing "xul-window-registered" notifications upon creation.
 *
 * @constructor
 */
function WindowCloser()
{
  // Third argument: the observer service is asked to hold this object weakly
  // (hence nsISupportsWeakReference in QueryInterface below).
  Services.obs.addObserver(this, "xul-window-registered", true);
}
WindowCloser.prototype = {
  /**
   * Deactivates this object.
   */
  stop: function()
  {
    Services.obs.removeObserver(this, "xul-window-registered");
  },

  /**
   * Observer callback. Waits for the newly registered window to load, then
   * accepts it if it is a dialog and closes it otherwise.
   *
   * @param {nsISupports} subject
   *   The window that was registered
   * @param {String} topic
   * @param {String} data
   */
  observe: function(subject, topic, data)
  {
    let requestor = subject.QueryInterface(Ci.nsIInterfaceRequestor);
    let newWindow = requestor.getInterface(Ci.nsIDOMWindow);

    let dismiss = function()
    {
      let root = newWindow.document.documentElement;
      if (root.localName != "dialog")
      {
        newWindow.close();
        return;
      }
      root.acceptDialog();
    };
    newWindow.addEventListener("load", dismiss, false);
  },

  QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
};
193 | |
/**
 * Looks up the crawler results belonging to a content window. The lookup goes
 * from the window's top window to its tabbed browser, browser element and tab.
 *
 * @param {Window} window
 *   Content window to retrieve crawler results for
 * @return {Object}
 *   Crawler results or undefined if the window wasn't created by the crawler.
 * @throws {Error}
 *   If any step of the window-to-tab lookup yields nothing
 */
function getDataForWindow(window)
{
  // Returns the value unchanged, throws with the given message if it's falsy.
  let ensure = function(value, message)
  {
    if (!value)
      throw new Error(message);
    return value;
  };

  let rootWindow = window.top;
  ensure(rootWindow.document,
         "No document associated with the node's top window");

  let tabbedBrowser = ensure(Utils.getChromeWindow(rootWindow).getBrowser(),
                             "Unable to get a tabbrowser reference from the window");
  let contentBrowser = ensure(tabbedBrowser.getBrowserForDocument(rootWindow.document),
                              "Unable to get browser for the content window");
  let ownerTab = ensure(tabbedBrowser.getTabForBrowser(contentBrowser),
                        "Unable to get tab for the browser");

  return dataForTab.get(ownerTab);
}
218 | |
/**
 * Starts the crawling session. The crawler opens each URL in a tab and posts
 * the result (or an error record if crawling failed) to the target URL.
 *
 * While the session is running, Policy.processNode is replaced by a wrapper
 * recording all requests. The original function is restored and all helpers
 * are shut down before onDone is called.
 *
 * @param {Window} window
 *   The browser window we're operating in
 * @param {String[]} urls
 *   URLs to be crawled
 * @param {int} timeout
 *   Load timeout in milliseconds
 * @param {int} maxtabs
 *   Maximum number of tabs to be opened
 * @param {String} targetURL
 *   URL that should receive the results
 * @param {Function} onDone
 *   Callback to be executed once the processing finishes
 * @static
 */
function run(window, urls, timeout, maxtabs, targetURL, onDone)
{
  let requestNotifier = new RequestNotifier(null, function() {});

  let origProcessNode = Policy.processNode;
  Policy.processNode = processNodeReplacement.bind(null, origProcessNode, requestNotifier);

  let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
  let loadListener = new LoadListener(window.getBrowser(), timeout);
  let running = 0;
  let windowCloser = new WindowCloser();

  // Undoes everything set up above and notifies the caller.
  let finish = function()
  {
    Policy.processNode = origProcessNode;
    requestNotifier.shutdown();
    loadListener.stop();
    windowCloser.stop();
    onDone();
  };

  let taskDone = function()
  {
    running--;
    if (running <= 0)
      finish();
  };

  // Uploads one record; the task counts as done once the upload has either
  // succeeded or failed.
  let submit = function(record)
  {
    try
    {
      let request = new XMLHttpRequest();
      request.open("POST", targetURL);
      request.addEventListener("load", taskDone, false);
      request.addEventListener("error", taskDone, false);
      request.send(JSON.stringify(record));
    }
    catch (e)
    {
      // No load/error event will follow a synchronous failure, so the task
      // has to be accounted for here. Otherwise the session would never end.
      reportException(e);
      taskDone();
    }
  };

  // Separate function so that each task's error handler sees its own URL.
  let crawlAndReport = function(url)
  {
    running++;
    Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(submit, function(exception)
    {
      reportException(exception);
      submit({
        url: url,
        startTime: Date.now(),
        error: String(exception)
      });
    });
  };

  for (let url of urls)
    crawlAndReport(url);

  // Nothing to crawl: no task will ever complete, so clean up right away
  // instead of leaving the session (and the processNode wrapper) active.
  if (running <= 0)
    finish();
}
285 exports.run = run; | |
286 | |
/**
 * Crawls a single URL: loads it in a tab from the pool, waits for the load to
 * finish and collects timing, response headers, a screenshot and the
 * serialized page source. This is a generator meant to be used via a Task
 * object. The tab is always released again, even if crawling fails.
 *
 * NOTE(review): Sebastian Noack suggested camel case for this function's name.
 * The name is kept here because run() refers to it.
 *
 * @param {String} url
 * @param {TabAllocator} tabAllocator
 * @param {loadListener} loadListener
 * @return {Object}
 *   Crawling result
 */
function* crawl_url(url, tabAllocator, loadListener)
{
  let tab = yield tabAllocator.getTab();
  let result = {url: url};

  // Makes the result reachable for code that only knows the tab.
  dataForTab.set(tab, result);
  try
  {
    let pageBrowser = tab.linkedBrowser;

    result.startTime = Date.now();
    pageBrowser.loadURI(url, null, null);
    [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab);
    result.endTime = Date.now();
    result.finalUrl = tab.linkedBrowser.currentURI.spec;

    let page = tab.linkedBrowser.contentDocument;
    let root = page.documentElement;
    if (root)
    {
      try
      {
        // Render the entire scrollable area of the page into a canvas.
        let canvas = page.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
        canvas.width = page.documentElement.scrollWidth;
        canvas.height = page.documentElement.scrollHeight;

        let context = canvas.getContext("2d");
        context.drawWindow(page.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
        result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
      }
      catch (e)
      {
        // A failed screenshot is recorded but doesn't fail the crawl.
        reportException(e);
        result.error = "Capturing screenshot failed: " + e;
      }

      // TODO: Capture frames as well?
      let chromeWindow = tab.ownerDocument.defaultView;
      let serializer = new chromeWindow.XMLSerializer();
      result.source = serializer.serializeToString(page.documentElement);
    }
  }
  finally
  {
    tabAllocator.releaseTab(tab);
  }
  return result;
}
340 | |
/**
 * Stand-in for the Policy.processNode() function in ABP. Delegates to the
 * original function, then stores a record of the request (content type,
 * location, whether it was blocked, matching filters) in the crawler results
 * of the window's tab.
 *
 * @param {Function} origProcessNode
 *   The original processNode function.
 * @param {RequestNotifier} requestNotifier
 *   The crawler's RequestNotifier object instance.
 * @param {nsIDOMWindow} wnd
 * @param {nsIDOMElement} node
 * @param {Number} contentType
 * @param {nsIURI} location
 * @param {Boolean} collapse
 * @return {Boolean}
 *   Whatever the original processNode function returned
 */
function processNodeReplacement(origProcessNode, requestNotifier, wnd, node, contentType, location, collapse)
{
  // Temporarily swap in a listener collecting the text of each filter reported
  // while the original function runs.
  let matchedFilters = [];
  let previousListener = requestNotifier.listener;
  requestNotifier.listener = function(window, node, entry)
  {
    if (entry.filter)
      matchedFilters.push(entry.filter.text);
  };

  // Exceptions thrown by the original processNode are deliberately passed on
  // to the caller; only the listener is restored on the way out.
  let verdict;
  try
  {
    verdict = origProcessNode(wnd, node, contentType, location, collapse);
  }
  finally
  {
    requestNotifier.listener = previousListener;
  }

  // Recording must never affect the outcome, so failures are only reported.
  try
  {
    let data = getDataForWindow(wnd);
    if (data)
    {
      if (!("requests" in data))
        data.requests = [];

      let isElemHide = (contentType == Policy.type.ELEMHIDE);
      let record = {
        contentType: contentType,
        location: (isElemHide ? location.text : location.spec),
        blocked: verdict != Ci.nsIContentPolicy.ACCEPT,
        filters: matchedFilters
      };
      data.requests.push(record);
    }
  }
  catch (e)
  {
    reportException(e);
  }
  return verdict;
}
OLD | NEW |