Left: | ||
Right: |
OLD | NEW |
---|---|
1 /* | 1 /* |
2 * This Source Code is subject to the terms of the Mozilla Public License | 2 * This Source Code is subject to the terms of the Mozilla Public License |
3 * version 2.0 (the "License"). You can obtain a copy of the License at | 3 * version 2.0 (the "License"). You can obtain a copy of the License at |
4 * http://mozilla.org/MPL/2.0/. | 4 * http://mozilla.org/MPL/2.0/. |
5 */ | 5 */ |
6 | 6 |
7 /** | 7 /** |
8 * @module crawler | 8 * @module crawler |
9 */ | 9 */ |
10 | 10 |
11 Cu.import("resource://gre/modules/Services.jsm"); | 11 Cu.import("resource://gre/modules/Services.jsm"); |
12 Cu.import("resource://gre/modules/Task.jsm"); | 12 Cu.import("resource://gre/modules/Task.jsm"); |
13 Cu.import("resource://gre/modules/Promise.jsm"); | 13 Cu.import("resource://gre/modules/Promise.jsm"); |
Wladimir Palant
2016/09/14 16:13:18
Just realized that this import hasn't been removed
| |
14 Cu.import("resource://gre/modules/Timer.jsm"); | |
Wladimir Palant
2016/09/14 16:13:18
Nit: We don't import like that any more, symbols s
sergei
2016/09/29 09:58:14
Fixed and the rest related to coding style is addr
| |
14 | 15 |
15 function abprequire(module) | 16 function abprequire(module) |
16 { | 17 { |
17 let result = {}; | 18 let result = {}; |
18 result.wrappedJSObject = result; | 19 result.wrappedJSObject = result; |
19 Services.obs.notifyObservers(result, "adblockplus-require", module); | 20 Services.obs.notifyObservers(result, "adblockplus-require", module); |
20 return result.exports; | 21 return result.exports; |
21 } | 22 } |
22 | 23 |
23 let {RequestNotifier} = abprequire("requestNotifier"); | 24 let {RequestNotifier} = abprequire("requestNotifier"); |
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
127 } | 128 } |
128 else if (this._tabs < this._maxtabs) | 129 else if (this._tabs < this._maxtabs) |
129 { | 130 { |
130 this._resolvers.shift()(this._createTab()); | 131 this._resolvers.shift()(this._createTab()); |
131 } | 132 } |
132 } | 133 } |
133 }, | 134 }, |
134 }; | 135 }; |
135 | 136 |
136 /** | 137 /** |
137 * Observes page loads in a particular tabbed browser. | |
138 * | |
139 * @param {tabbrowser} browser | |
140 * The tabbed browser to be observed | |
141 * @param {int} timeout | |
142 * Load timeout in milliseconds | |
143 * @constructor | |
144 */ | |
145 function LoadListener(browser, timeout) | |
146 { | |
147 this._browser = browser; | |
148 this._deferred = new Map(); | |
149 this._timeout = timeout; | |
150 browser.addTabsProgressListener(this); | |
151 } | |
152 LoadListener.prototype = { | |
153 /** | |
154 * Returns a promise that will be resolved when the page in the specified tab | |
155 * finishes loading. Loading will be stopped if the timeout is reached. | |
156 * | |
157 * @param {tab} tab | |
158 * @result {Promise} | |
159 */ | |
160 waitForLoad: function(tab) | |
161 { | |
162 let deferred = Promise.defer(); | |
163 this._deferred.set(tab.linkedBrowser, deferred); | |
164 | |
165 tab.ownerDocument.defaultView.setTimeout(function() | |
166 { | |
167 tab.linkedBrowser.stop(); | |
168 }, this._timeout); | |
169 | |
170 return deferred.promise; | |
171 }, | |
172 | |
173 /** | |
174 * Deactivates this object. | |
175 */ | |
176 stop: function() | |
177 { | |
178 this._browser.removeTabsProgressListener(this); | |
179 }, | |
180 | |
181 onStateChange: function(browser, progress, request, flags, status) | |
182 { | |
183 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW)) | |
184 { | |
185 let deferred = this._deferred.get(browser); | |
186 if (deferred) | |
187 { | |
188 this._deferred.delete(browser); | |
189 | |
190 let headers = []; | |
191 if (request instanceof Ci.nsIHttpChannel) | |
192 { | |
193 try | |
194 { | |
195 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText); | |
196 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value)); | |
197 } | |
198 catch (e) | |
199 { | |
200 // Exceptions are expected here | |
201 } | |
202 } | |
203 deferred.resolve([status, headers]); | |
204 } | |
205 } | |
206 } | |
207 }; | |
208 | |
209 /** | |
210 * Once created, this object will make sure all new windows are dismissed | 138 * Once created, this object will make sure all new windows are dismissed |
211 * immediately. | 139 * immediately. |
212 * | 140 * |
213 * @constructor | 141 * @constructor |
214 */ | 142 */ |
215 function WindowCloser() | 143 function WindowCloser() |
216 { | 144 { |
217 Services.obs.addObserver(this, "xul-window-registered", true) | 145 Services.obs.addObserver(this, "xul-window-registered", true) |
218 } | 146 } |
219 WindowCloser.prototype = { | 147 WindowCloser.prototype = { |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
293 * @param {int} maxtabs | 221 * @param {int} maxtabs |
294 * Maximum number of tabs to be opened | 222 * Maximum number of tabs to be opened |
295 * @param {String} targetURL | 223 * @param {String} targetURL |
296 * URL that should receive the results | 224 * URL that should receive the results |
297 * @param {Function} onDone | 225 * @param {Function} onDone |
298 * The callback which is called after finishing of all tasks. | 226 * The callback which is called after finishing of all tasks. |
299 */ | 227 */ |
300 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) | 228 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) |
301 { | 229 { |
302 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); | 230 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); |
303 let loadListener = new LoadListener(window.getBrowser(), timeout); | 231 |
304 let running = 0; | 232 let running = 0; |
305 let windowCloser = new WindowCloser(); | 233 let windowCloser = new WindowCloser(); |
306 let taskDone = function() | 234 let taskDone = function() |
307 { | 235 { |
308 running--; | 236 running--; |
309 if (running <= 0) | 237 if (running <= 0) |
310 { | 238 { |
311 loadListener.stop(); | |
312 windowCloser.stop(); | 239 windowCloser.stop(); |
313 onDone(); | 240 onDone(); |
314 } | 241 } |
315 }; | 242 }; |
316 | 243 |
317 for (let url of urls) | 244 for (let url of urls) |
318 { | 245 { |
319 running++; | 246 running++; |
320 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) | 247 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result) |
321 { | 248 { |
322 let request = new XMLHttpRequest(); | 249 let request = new XMLHttpRequest(); |
323 request.open("POST", targetURL); | 250 request.open("POST", targetURL); |
324 request.addEventListener("load", taskDone, false); | 251 request.addEventListener("load", taskDone, false); |
325 request.addEventListener("error", taskDone, false); | 252 request.addEventListener("error", taskDone, false); |
326 request.send(JSON.stringify(result)); | 253 request.send(JSON.stringify(result)); |
327 }, function(url, exception) | 254 }, function(url, exception) |
328 { | 255 { |
329 reportException(exception); | 256 reportException(exception); |
330 | 257 |
331 let request = new XMLHttpRequest(); | 258 let request = new XMLHttpRequest(); |
332 request.open("POST", targetURL); | 259 request.open("POST", targetURL); |
333 request.addEventListener("load", taskDone, false); | 260 request.addEventListener("load", taskDone, false); |
334 request.addEventListener("error", taskDone, false); | 261 request.addEventListener("error", taskDone, false); |
335 request.send(JSON.stringify({ | 262 request.send(JSON.stringify({ |
336 url: url, | 263 url: url, |
337 startTime: Date.now(), | 264 startTime: Date.now(), |
338 error: String(exception) | 265 error: String(exception) |
339 })); | 266 })); |
340 }.bind(null, url)); | 267 }.bind(null, url)); |
341 } | 268 } |
342 } | 269 } |
343 | 270 |
344 /** | 271 /** |
272 * Expects to receive page info gathered in a content process for the specified | |
273 * `tab`. If there is no relevant message within the specified `timeout` then | |
274 * the resulting promise is resolved with an error object. | |
275 * @param tab | |
276 * Tab we are interested in | |
277 * @param {int} timeout | |
278 * Timeout in milliseconds | |
279 * @return {Promise} promise which will be resolved with the received page info | |
280 */ | |
281 function getPageInfo(tab, timeout) | |
282 { | |
283 return new Promise((resolve, result) => | |
284 { | |
285 let mm = tab.linkedBrowser.messageManager; | |
286 let timerID; | |
287 let onDone = (pageInfo) => | |
288 { | |
289 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone); | |
290 clearTimeout(timerID); | |
291 resolve(pageInfo); | |
292 } | |
293 mm.addMessageListener("abpcrawler:pageInfoGathered", (msg) => onDone(msg.data));; | |
Wladimir Palant
2016/09/14 16:11:48
That's not the callback you are removing above. Al
sergei
2016/09/29 09:58:13
Fixed. Sorry, overlooked.
| |
294 timerID = setTimeout(onDone.bind(this, {error: "timeout"}), timeout); | |
Wladimir Palant
2016/09/14 16:11:49
How about not using bind() here, for clarity and c
sergei
2016/09/29 09:58:13
Done. Good idea.
| |
295 }); | |
296 } | |
297 | |
298 /** | |
345 * Crawls a URL. This is a generator meant to be used via a Task object. | 299 * Crawls a URL. This is a generator meant to be used via a Task object. |
346 * | 300 * |
347 * @param {String} url | 301 * @param {String} url |
348 * @param {TabAllocator} tabAllocator | 302 * @param {TabAllocator} tabAllocator |
349 * @param {loadListener} loadListener | 303 * @param {int} timeout |
304 * Load timeout in milliseconds | |
350 * @result {Object} | 305 * @result {Object} |
351 * Crawling result | 306 * Crawling result |
352 */ | 307 */ |
353 function* crawl_url(url, tabAllocator, loadListener) | 308 function* crawl_url(url, tabAllocator, timeout) |
354 { | 309 { |
355 let tab = yield tabAllocator.getTab(); | 310 let tab = yield tabAllocator.getTab(); |
356 let result = {url, requests: []}; | 311 let result = {url, requests: []}; |
357 let requestNotifier; | 312 let requestNotifier; |
358 try | 313 try |
359 { | 314 { |
360 result.startTime = Date.now(); | 315 result.startTime = Date.now(); |
361 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, | 316 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, |
362 function(entry, scanComplete) | 317 function(entry, scanComplete) |
363 { | 318 { |
364 if (!entry) | 319 if (!entry) |
365 return; | 320 return; |
366 let {type: contentType, location, filter} = entry; | 321 let {type: contentType, location, filter} = entry; |
367 result.requests.push({location, contentType, filter}); | 322 result.requests.push({location, contentType, filter}); |
368 }); | 323 }); |
369 | 324 |
370 tab.linkedBrowser.loadURI(url, null, null); | 325 tab.linkedBrowser.loadURI(url, null, null); |
371 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); | 326 |
327 Object.assign(result, yield getPageInfo(tab, timeout)); | |
328 result.finalUrl = tab.linkedBrowser.currentURI.spec; | |
372 result.endTime = Date.now(); | 329 result.endTime = Date.now(); |
373 result.finalUrl = tab.linkedBrowser.currentURI.spec; | |
374 | |
375 let document = tab.linkedBrowser.contentDocument; | |
376 if (document.documentElement) | |
377 { | |
378 try | |
379 { | |
380 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas"); | |
381 canvas.width = document.documentElement.scrollWidth; | |
382 canvas.height = document.documentElement.scrollHeight; | |
383 | |
384 let context = canvas.getContext("2d"); | |
385 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)"); | |
386 result.screenshot = canvas.toDataURL("image/jpeg", 0.8); | |
387 } | |
388 catch (e) | |
389 { | |
390 reportException(e); | |
391 result.error = "Capturing screenshot failed: " + e; | |
392 } | |
393 | |
394 // TODO: Capture frames as well? | |
395 let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); | |
396 result.source = serializer.serializeToString(document.documentElement); | |
397 } | |
398 } | 330 } |
399 finally | 331 finally |
400 { | 332 { |
401 if (requestNotifier) | 333 if (requestNotifier) |
402 requestNotifier.shutdown(); | 334 requestNotifier.shutdown(); |
403 tabAllocator.releaseTab(tab); | 335 tabAllocator.releaseTab(tab); |
404 } | 336 } |
405 return result; | 337 return result; |
406 } | 338 } |
407 | 339 |
408 function reportException(e) | 340 function reportException(e) |
409 { | 341 { |
410 let stack = ""; | 342 let stack = ""; |
411 if (e && typeof e == "object" && "stack" in e) | 343 if (e && typeof e == "object" && "stack" in e) |
412 stack = e.stack + "\n"; | 344 stack = e.stack + "\n"; |
413 | 345 |
414 Cu.reportError(e); | 346 Cu.reportError(e); |
415 dump(e + "\n" + stack + "\n"); | 347 dump(e + "\n" + stack + "\n"); |
416 } | 348 } |
349 | |
350 let {addonRoot} = require("info"); | |
351 let frameScriptPath = addonRoot + "/lib/child/frameScript.js"; | |
352 let globalMessageManager = Services.mm; | |
353 globalMessageManager.loadFrameScript(frameScriptPath, true); | |
354 | |
355 onShutdown.add(() => | |
356 { | |
357 globalMessageManager.removeDelayedFrameScript(frameScriptPath); | |
358 }); | |
OLD | NEW |