Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * This Source Code is subject to the terms of the Mozilla Public License | |
3 * version 2.0 (the "License"). You can obtain a copy of the License at | |
4 * http://mozilla.org/MPL/2.0/. | |
5 */ | |
6 | |
7 /** | |
8 * @module crawler | |
saroyanm
2015/05/04 18:13:43
I think this should be file overview, maybe smth l
Wladimir Palant
2015/05/07 00:04:59
As Sebastian noted elsewhere, @module is actually
| |
9 */ | |
10 | |
11 Cu.import("resource://gre/modules/Services.jsm"); | |
12 Cu.import("resource://gre/modules/Task.jsm"); | |
13 Cu.import("resource://gre/modules/Promise.jsm"); | |
14 | |
/**
 * Requests a module from Adblock Plus by broadcasting an
 * "adblockplus-require" observer notification.
 *
 * @param {String} module
 *    Name of the module to retrieve
 * @return {Object}
 *    Whatever ended up in the "exports" property of the notification subject;
 *    undefined if nothing set it (presumably when no observer handled the
 *    notification - confirm against the Adblock Plus side).
 */
function abprequire(module)
{
  // The subject points back at itself via wrappedJSObject so that the
  // receiving side can get at the underlying JavaScript object.
  let carrier = {};
  carrier.wrappedJSObject = carrier;

  Services.obs.notifyObservers(carrier, "adblockplus-require", module);

  let {exports} = carrier;
  return exports;
}
22 | |
23 let {Policy} = abprequire("contentPolicy"); | |
24 let {RequestNotifier} = abprequire("requestNotifier"); | |
25 let {Utils} = abprequire("utils"); | |
26 | |
27 let dataForTab = new WeakMap(); | |
28 | |
/**
 * Creates a pool of tabs and allocates them to tasks on request.
 *
 * @param {tabbrowser} browser
 *    The tabbed browser where tabs should be created
 * @param {int} maxtabs
 *    The maximum number of tabs to be allocated
 * @constructor
 */
function TabAllocator(browser, maxtabs)
{
  // Reduce the browser to a single tab, create the pool next to it and only
  // then drop that remaining tab, so only the pool tabs are left.
  browser.removeAllTabsBut(browser.tabs[0]);

  this._tabs = [];
  for (let i = 0; i < maxtabs; i++)
    this._tabs.push(browser.addTab("about:blank"));

  browser.removeTab(browser.tabs[0]);

  // Resolve callbacks of getTab() promises that are still waiting for a tab,
  // oldest request first.
  this._resolvers = [];
}
TabAllocator.prototype = {
  /**
   * Returns a promise that will resolve into a tab once a tab can be allocated.
   * The tab cannot be used by other tasks until releaseTab() is called.
   *
   * A promise is returned even if a tab is available immediately, so that
   * callers can treat both cases the same way.
   *
   * @return {Promise}
   */
  getTab: function()
  {
    if (this._tabs.length)
      return Promise.resolve(this._tabs.shift());

    return new Promise((resolve, reject) =>
    {
      this._resolvers.push(resolve);
    });
  },

  /**
   * Adds a tab back to the pool so that it can be used by other tasks. The
   * released tab itself is removed and replaced by a new about:blank tab; it
   * is that new tab which gets handed to the longest-waiting getTab() caller
   * or, if nobody is waiting, put into the pool.
   *
   * @param {tab} tab
   */
  releaseTab: function(tab)
  {
    let browser = tab.parentNode.tabbrowser;
    browser.removeTab(tab);
    tab = browser.addTab("about:blank");

    if (this._resolvers.length)
      this._resolvers.shift()(tab);
    else
      this._tabs.push(tab);
  }
};
86 | |
/**
 * Observes page loads in a particular tabbed browser.
 *
 * @param {tabbrowser} browser
 *    The tabbed browser to be observed
 * @param {int} timeout
 *    Load timeout in milliseconds
 * @constructor
 */
function LoadListener(browser, timeout)
{
  this._browser = browser;
  // Maps the browser element of a tab to the state of its pending
  // waitForLoad() call: {resolve, cancelTimeout}.
  this._pending = new Map();
  this._timeout = timeout;
  browser.addTabsProgressListener(this);
}
LoadListener.prototype = {
  /**
   * Returns a promise that will be resolved when the page in the specified tab
   * finishes loading. Loading will be stopped if the timeout is reached.
   *
   * The promise resolves to an array [status, headers]: the status passed to
   * onStateChange() and the collected response header lines.
   *
   * @param {tab} tab
   * @return {Promise}
   */
  waitForLoad: function(tab)
  {
    return new Promise((resolve, reject) =>
    {
      let view = tab.ownerDocument.defaultView;
      let timer = view.setTimeout(function()
      {
        tab.linkedBrowser.stop();
      }, this._timeout);

      this._pending.set(tab.linkedBrowser, {
        resolve: resolve,
        // Lets onStateChange() drop the timer once the load is over, so it
        // doesn't linger and stop an unrelated later load.
        cancelTimeout: () => view.clearTimeout(timer)
      });
    });
  },

  /**
   * Deactivates this object.
   */
  stop: function()
  {
    this._browser.removeTabsProgressListener(this);
  },

  /**
   * Tabs progress listener callback. Resolves the pending waitForLoad()
   * promise for the given browser once a state change carrying both the
   * STATE_STOP and STATE_IS_WINDOW flags comes in.
   */
  onStateChange: function(browser, progress, request, flags, status)
  {
    let stateFlags = Ci.nsIWebProgressListener;
    if (!(flags & stateFlags.STATE_STOP) ||
        !(flags & stateFlags.STATE_IS_WINDOW))
    {
      return;
    }

    let pending = this._pending.get(browser);
    if (!pending)
      return;

    this._pending.delete(browser);
    pending.cancelTimeout();

    let headers = [];
    if (request instanceof Ci.nsIHttpChannel)
    {
      try
      {
        headers.push("HTTP/x.x " + request.responseStatus + " " +
                     request.responseStatusText);
        request.visitResponseHeaders((header, value) =>
        {
          headers.push(header + ": " + value);
        });
      }
      catch (e)
      {
        // Exceptions are expected here (presumably when the channel has no
        // response to report), whatever was collected so far is kept.
      }
    }
    pending.resolve([status, headers]);
  }
};
159 | |
/**
 * Once created, this object will make sure all new windows are dismissed
 * immediately.
 *
 * @constructor
 */
function WindowCloser()
{
  // The third argument requests a weak observer reference, which is why the
  // object implements nsISupportsWeakReference below.
  Services.obs.addObserver(this, "xul-window-registered", true);
}
WindowCloser.prototype = {
  /**
   * Deactivates this object.
   */
  stop: function()
  {
    Services.obs.removeObserver(this, "xul-window-registered");
  },

  /**
   * Observer callback for "xul-window-registered" notifications. Waits for
   * the window behind the subject to load, then accepts it if it is a dialog
   * and closes it otherwise.
   */
  observe: function(subject, topic, data)
  {
    let requestor = subject.QueryInterface(Ci.nsIInterfaceRequestor);
    let newWindow = requestor.getInterface(Ci.nsIDOMWindow);

    let dismiss = function()
    {
      if (newWindow.document.documentElement.localName != "dialog")
      {
        newWindow.close();
        return;
      }
      newWindow.document.documentElement.acceptDialog();
    };
    newWindow.addEventListener("load", dismiss, false);
  },

  QueryInterface: XPCOMUtils.generateQI([
    Ci.nsIObserver,
    Ci.nsISupportsWeakReference
  ])
};
194 | |
/**
 * Retrieves crawler results associated with a particular content window by
 * walking from the window's top window to the tab displaying it.
 *
 * @param {Window} window
 *    Content window to retrieve crawler results for
 * @return {Object}
 *    Crawler results or undefined if the window wasn't created by the crawler.
 * @throws {Error}
 *    If the document, tabbrowser, browser or tab for the window cannot be
 *    determined
 */
function getDataForWindow(window)
{
  // Passes the value through unless it is falsy, in which case the lookup
  // chain is aborted with the given message.
  let required = function(value, message)
  {
    if (!value)
      throw new Error(message);
    return value;
  };

  let topWindow = window.top;
  required(topWindow.document,
           "No document associated with the node's top window");

  let tabbrowser = required(Utils.getChromeWindow(topWindow).getBrowser(),
                            "Unable to get a tabbrowser reference from the window");
  let browser = required(tabbrowser.getBrowserForDocument(topWindow.document),
                         "Unable to get browser for the content window");
  let tab = required(tabbrowser.getTabForBrowser(browser),
                     "Unable to get tab for the browser");

  return dataForTab.get(tab);
}
saroyanm
2015/05/04 18:13:43
Nit: Semicolon is redundant here.
Wladimir Palant
2015/05/07 00:04:59
Done.
| |
219 | |
/**
 * Starts the crawling session. The crawler opens each URL in a tab and stores
 * the results.
 *
 * @param {Window} window
 *    The browser window we're operating in
 * @param {String[]} urls
 *    URLs to be crawled
 * @param {int} timeout
 *    Load timeout in milliseconds
 * @param {int} maxtabs
 *    Maximum number of tabs to be opened
 * @param {String} targetURL
 *    URL that should receive the results
 * @param {Function} onDone
 *    The callback which is called after finishing of crawling of all URLs.
 */
function run(window, urls, timeout, maxtabs, targetURL, onDone)
{
  let requestNotifier = new RequestNotifier(null, function() {});

  let origProcessNode = Policy.processNode;
  Policy.processNode = processNodeReplacement.bind(null, origProcessNode, requestNotifier);

  let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
  let loadListener = new LoadListener(window.getBrowser(), timeout);
  let running = 0;
  let windowCloser = new WindowCloser();

  // Undoes everything set up above and notifies the caller.
  let finish = function()
  {
    Policy.processNode = origProcessNode;
    requestNotifier.shutdown();
    loadListener.stop();
    windowCloser.stop();
    onDone();
  };

  let taskDone = function()
  {
    running--;
    if (running <= 0)
      finish();
  };

  // Posts one result object to targetURL. The task counts as done once the
  // request either succeeded or failed; a request that cannot even be sent
  // counts as failed so that the session doesn't wait for it forever.
  let submit = function(data)
  {
    let settled = false;
    let settle = function()
    {
      if (settled)
        return;
      settled = true;
      taskDone();
    };

    try
    {
      let request = new XMLHttpRequest();
      request.open("POST", targetURL);
      request.addEventListener("load", settle, false);
      request.addEventListener("error", settle, false);
      request.send(JSON.stringify(data));
    }
    catch (e)
    {
      reportException(e);
      settle();
    }
  };

  for (let url of urls)
  {
    running++;
    Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(submit, function(url, exception)
    {
      reportException(exception);

      // The crawl failed as a whole, report at least the error for this URL
      submit({
        url: url,
        startTime: Date.now(),
        error: String(exception)
      });
    }.bind(null, url));
  }

  // Nothing to crawl: no task will ever complete, so clean up right away.
  if (running == 0)
    finish();
}
283 exports.run = run; | |
284 | |
/**
 * Crawls a URL. This is a generator meant to be used via a Task object: it
 * yields the tab request and the load request and expects to be resumed with
 * the tab respectively the [channelStatus, headers] pair.
 *
 * @param {String} url
 * @param {TabAllocator} tabAllocator
 * @param {loadListener} loadListener
 * @return {Object}
 *    Crawling result
 */
function* crawl_url(url, tabAllocator, loadListener)
{
  let tab = yield tabAllocator.getTab();
  let result = {url: url};

  // Makes the result reachable for code that only knows the tab
  dataForTab.set(tab, result);
  try
  {
    result.startTime = Date.now();
    tab.linkedBrowser.loadURI(url, null, null);

    let [channelStatus, headers] = yield loadListener.waitForLoad(tab);
    result.channelStatus = channelStatus;
    result.headers = headers;
    result.endTime = Date.now();
    result.finalUrl = tab.linkedBrowser.currentURI.spec;

    let page = tab.linkedBrowser.contentDocument;
    if (!page.documentElement)
      return result;

    // A failing screenshot is recorded in the result but doesn't prevent the
    // page source from being captured below.
    try
    {
      let canvas = page.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
      canvas.width = page.documentElement.scrollWidth;
      canvas.height = page.documentElement.scrollHeight;

      let painter = canvas.getContext("2d");
      painter.drawWindow(page.defaultView, 0, 0, canvas.width, canvas.height,
                         "rgb(255, 255, 255)");
      result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
    }
    catch (e)
    {
      reportException(e);
      result.error = "Capturing screenshot failed: " + e;
    }

    // TODO: Capture frames as well?
    let chromeWindow = tab.ownerDocument.defaultView;
    let serializer = new chromeWindow.XMLSerializer();
    result.source = serializer.serializeToString(page.documentElement);
    return result;
  }
  finally
  {
    // The tab goes back to the allocator no matter how the crawl ended
    tabAllocator.releaseTab(tab);
  }
}
338 | |
/**
 * Reports an exception both via Cu.reportError() and via dump(). The dumped
 * text consists of the stringified exception followed by its stack trace, if
 * it has one.
 *
 * @param {*} e
 *    The exception, not necessarily an Error object
 */
function reportException(e)
{
  let hasStack = e && typeof e == "object" && "stack" in e;
  let stack = hasStack ? e.stack + "\n" : "";

  Cu.reportError(e);
  dump([e + "\n", stack, "\n"].join(""));
}
348 | |
/**
 * Wrapper for the Policy.processNode() function in ABP. Calls the original
 * function and records all the data.
 *
 * @param {Function} origProcessNode
 *    The original processNode function.
 * @param {RequestNotifier} requestNotifier
 *    The crawler's RequestNotifier object instance.
 * @param {nsIDOMWindow} wnd
 * @param {nsIDOMElement} node
 * @param {Number} contentType
 * @param {nsIURI} location
 * @param {Boolean} collapse
 * @return {Boolean}
 *    The unchanged return value of the original function
 */
function processNodeReplacement(origProcessNode, requestNotifier, wnd, node,
                                contentType, location, collapse)
{
  // Temporarily hook the notifier's listener to learn which filters matched
  // while the original function runs.
  let matchedFilters = [];
  let previousListener = requestNotifier.listener;
  requestNotifier.listener = function(window, node, entry)
  {
    if (!entry.filter)
      return;
    matchedFilters.push(entry.filter.text);
  };

  // No catch clause here on purpose: if the original processNode throws then
  // we will too, the finally block merely puts the listener back.
  let result;
  try
  {
    result = origProcessNode(wnd, node, contentType, location, collapse);
  }
  finally
  {
    requestNotifier.listener = previousListener;
  }

  // Recording must never affect the outcome, failures are only reported
  try
  {
    let data = getDataForWindow(wnd);
    if (data)
    {
      if (!("requests" in data))
        data.requests = [];

      let isElemHide = (contentType == Policy.type.ELEMHIDE);
      let entry = {
        contentType: contentType,
        location: (isElemHide ? location.text : location.spec),
        blocked: result != Ci.nsIContentPolicy.ACCEPT,
        filters: matchedFilters
      };
      data.requests.push(entry);
    }
  }
  catch (e)
  {
    reportException(e);
  }
  return result;
}
saroyanm
2015/05/04 18:13:43
Nit: Semicolon is redundant here.
Wladimir Palant
2015/05/07 00:04:59
Done.
| |
OLD | NEW |