| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * This Source Code is subject to the terms of the Mozilla Public License | 2 * This Source Code is subject to the terms of the Mozilla Public License |
| 3 * version 2.0 (the "License"). You can obtain a copy of the License at | 3 * version 2.0 (the "License"). You can obtain a copy of the License at |
| 4 * http://mozilla.org/MPL/2.0/. | 4 * http://mozilla.org/MPL/2.0/. |
| 5 */ | 5 */ |
| 6 | 6 |
| 7 /** | 7 /** |
| 8 * @module crawler | 8 * @module crawler |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 Cu.import("resource://gre/modules/Services.jsm"); | 11 Cu.import("resource://gre/modules/Services.jsm"); |
| 12 Cu.import("resource://gre/modules/Task.jsm"); | 12 Cu.import("resource://gre/modules/Task.jsm"); |
| 13 Cu.import("resource://gre/modules/Promise.jsm"); | 13 Cu.import("resource://gre/modules/Promise.jsm"); |
|
Wladimir Palant
2016/09/14 16:13:18
Just realized that this import hasn't been removed
| |
| 14 Cu.import("resource://gre/modules/Timer.jsm"); | |
|
Wladimir Palant
2016/09/14 16:13:18
Nit: We don't import like that any more, symbols s
sergei
2016/09/29 09:58:14
Fixed and the rest related to coding style is addr
| |
| 14 | 15 |
| 15 function abprequire(module) | 16 function abprequire(module) |
| 16 { | 17 { |
| 17 let result = {}; | 18 let result = {}; |
| 18 result.wrappedJSObject = result; | 19 result.wrappedJSObject = result; |
| 19 Services.obs.notifyObservers(result, "adblockplus-require", module); | 20 Services.obs.notifyObservers(result, "adblockplus-require", module); |
| 20 return result.exports; | 21 return result.exports; |
| 21 } | 22 } |
| 22 | 23 |
| 23 let {RequestNotifier} = abprequire("requestNotifier"); | 24 let {RequestNotifier} = abprequire("requestNotifier"); |
| (...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 127 } | 128 } |
| 128 else if (this._tabs < this._maxtabs) | 129 else if (this._tabs < this._maxtabs) |
| 129 { | 130 { |
| 130 this._resolvers.shift()(this._createTab()); | 131 this._resolvers.shift()(this._createTab()); |
| 131 } | 132 } |
| 132 } | 133 } |
| 133 }, | 134 }, |
| 134 }; | 135 }; |
| 135 | 136 |
| 136 /** | 137 /** |
| 137 * Observes page loads in a particular tabbed browser. | |
| 138 * | |
| 139 * @param {tabbrowser} browser | |
| 140 * The tabbed browser to be observed | |
| 141 * @param {int} timeout | |
| 142 * Load timeout in milliseconds | |
| 143 * @constructor | |
| 144 */ | |
| 145 function LoadListener(browser, timeout) | |
| 146 { | |
| 147 this._browser = browser; | |
| 148 this._deferred = new Map(); | |
| 149 this._timeout = timeout; | |
| 150 browser.addTabsProgressListener(this); | |
| 151 } | |
| 152 LoadListener.prototype = { | |
| 153 /** | |
| 154 * Returns a promise that will be resolved when the page in the specified tab | |
| 155 * finishes loading. Loading will be stopped if the timeout is reached. | |
| 156 * | |
| 157 * @param {tab} tab | |
| 158 * @result {Promise} | |
| 159 */ | |
| 160 waitForLoad: function(tab) | |
| 161 { | |
| 162 let deferred = Promise.defer(); | |
| 163 this._deferred.set(tab.linkedBrowser, deferred); | |
| 164 | |
| 165 tab.ownerDocument.defaultView.setTimeout(function() | |
| 166 { | |
| 167 tab.linkedBrowser.stop(); | |
| 168 }, this._timeout); | |
| 169 | |
| 170 return deferred.promise; | |
| 171 }, | |
| 172 | |
| 173 /** | |
| 174 * Deactivates this object. | |
| 175 */ | |
| 176 stop: function() | |
| 177 { | |
| 178 this._browser.removeTabsProgressListener(this); | |
| 179 }, | |
| 180 | |
| 181 onStateChange: function(browser, progress, request, flags, status) | |
| 182 { | |
| 183 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW)) | |
| 184 { | |
| 185 let deferred = this._deferred.get(browser); | |
| 186 if (deferred) | |
| 187 { | |
| 188 this._deferred.delete(browser); | |
| 189 | |
| 190 let headers = []; | |
| 191 if (request instanceof Ci.nsIHttpChannel) | |
| 192 { | |
| 193 try | |
| 194 { | |
| 195 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText); | |
| 196 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value)); | |
| 197 } | |
| 198 catch (e) | |
| 199 { | |
| 200 // Exceptions are expected here | |
| 201 } | |
| 202 } | |
| 203 deferred.resolve([status, headers]); | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 }; | |
| 208 | |
| 209 /** | |
| 210 * Once created, this object will make sure all new windows are dismissed | 138 * Once created, this object will make sure all new windows are dismissed |
| 211 * immediately. | 139 * immediately. |
| 212 * | 140 * |
| 213 * @constructor | 141 * @constructor |
| 214 */ | 142 */ |
| 215 function WindowCloser() | 143 function WindowCloser() |
| 216 { | 144 { |
| 217 Services.obs.addObserver(this, "xul-window-registered", true) | 145 Services.obs.addObserver(this, "xul-window-registered", true) |
| 218 } | 146 } |
| 219 WindowCloser.prototype = { | 147 WindowCloser.prototype = { |
| (...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 293 * @param {int} maxtabs | 221 * @param {int} maxtabs |
| 294 * Maximum number of tabs to be opened | 222 * Maximum number of tabs to be opened |
| 295 * @param {String} targetURL | 223 * @param {String} targetURL |
| 296 * URL that should receive the results | 224 * URL that should receive the results |
| 297 * @param {Function} onDone | 225 * @param {Function} onDone |
| 298 * The callback which is called after finishing of all tasks. | 226 * The callback which is called after finishing of all tasks. |
| 299 */ | 227 */ |
| 300 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) | 228 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) |
| 301 { | 229 { |
| 302 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); | 230 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); |
| 303 let loadListener = new LoadListener(window.getBrowser(), timeout); | 231 |
| 304 let running = 0; | 232 let running = 0; |
| 305 let windowCloser = new WindowCloser(); | 233 let windowCloser = new WindowCloser(); |
| 306 let taskDone = function() | 234 let taskDone = function() |
| 307 { | 235 { |
| 308 running--; | 236 running--; |
| 309 if (running <= 0) | 237 if (running <= 0) |
| 310 { | 238 { |
| 311 loadListener.stop(); | |
| 312 windowCloser.stop(); | 239 windowCloser.stop(); |
| 313 onDone(); | 240 onDone(); |
| 314 } | 241 } |
| 315 }; | 242 }; |
| 316 | 243 |
| 317 for (let url of urls) | 244 for (let url of urls) |
| 318 { | 245 { |
| 319 running++; | 246 running++; |
| 320 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) | 247 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result) |
| 321 { | 248 { |
| 322 let request = new XMLHttpRequest(); | 249 let request = new XMLHttpRequest(); |
| 323 request.open("POST", targetURL); | 250 request.open("POST", targetURL); |
| 324 request.addEventListener("load", taskDone, false); | 251 request.addEventListener("load", taskDone, false); |
| 325 request.addEventListener("error", taskDone, false); | 252 request.addEventListener("error", taskDone, false); |
| 326 request.send(JSON.stringify(result)); | 253 request.send(JSON.stringify(result)); |
| 327 }, function(url, exception) | 254 }, function(url, exception) |
| 328 { | 255 { |
| 329 reportException(exception); | 256 reportException(exception); |
| 330 | 257 |
| 331 let request = new XMLHttpRequest(); | 258 let request = new XMLHttpRequest(); |
| 332 request.open("POST", targetURL); | 259 request.open("POST", targetURL); |
| 333 request.addEventListener("load", taskDone, false); | 260 request.addEventListener("load", taskDone, false); |
| 334 request.addEventListener("error", taskDone, false); | 261 request.addEventListener("error", taskDone, false); |
| 335 request.send(JSON.stringify({ | 262 request.send(JSON.stringify({ |
| 336 url: url, | 263 url: url, |
| 337 startTime: Date.now(), | 264 startTime: Date.now(), |
| 338 error: String(exception) | 265 error: String(exception) |
| 339 })); | 266 })); |
| 340 }.bind(null, url)); | 267 }.bind(null, url)); |
| 341 } | 268 } |
| 342 } | 269 } |
| 343 | 270 |
| 344 /** | 271 /** |
| 272 * Expects to receive page info gathered in a content process for the specified | |
| 273 * `tab`. If there is no relevant message within specified `timeout` then | |
| 274 * the result promise is resolve with error object. | |
| 275 * @param tab | |
| 276 * Tab in which we are interested in | |
| 277 * @param {int} timeout | |
| 278 * Timeout in milliseconds | |
| 279 * @return {Promise} promise which will be resolved with the received page info | |
| 280 */ | |
| 281 function getPageInfo(tab, timeout) | |
| 282 { | |
| 283 return new Promise((resolve, result) => | |
| 284 { | |
| 285 let mm = tab.linkedBrowser.messageManager; | |
| 286 let timerID; | |
| 287 let onDone = (pageInfo) => | |
| 288 { | |
| 289 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone); | |
| 290 clearTimeout(timerID); | |
| 291 resolve(pageInfo); | |
| 292 } | |
| 293 mm.addMessageListener("abpcrawler:pageInfoGathered", (msg) => onDone(msg.data));; | |
|
Wladimir Palant
2016/09/14 16:11:48
That's not the callback you are removing above. Al
sergei
2016/09/29 09:58:13
Fixed. Sorry, overlooked.
| |
| 294 timerID = setTimeout(onDone.bind(this, {error: "timeout"}), timeout); | |
|
Wladimir Palant
2016/09/14 16:11:49
How about not using bind() here, for clarity and c
sergei
2016/09/29 09:58:13
Done. Good idea.
| |
| 295 }); | |
| 296 } | |
| 297 | |
| 298 /** | |
| 345 * Crawls a URL. This is a generator meant to be used via a Task object. | 299 * Crawls a URL. This is a generator meant to be used via a Task object. |
| 346 * | 300 * |
| 347 * @param {String} url | 301 * @param {String} url |
| 348 * @param {TabAllocator} tabAllocator | 302 * @param {TabAllocator} tabAllocator |
| 349 * @param {loadListener} loadListener | 303 * @param {int} timeout |
| 304 * Load timeout in milliseconds | |
| 350 * @result {Object} | 305 * @result {Object} |
| 351 * Crawling result | 306 * Crawling result |
| 352 */ | 307 */ |
| 353 function* crawl_url(url, tabAllocator, loadListener) | 308 function* crawl_url(url, tabAllocator, timeout) |
| 354 { | 309 { |
| 355 let tab = yield tabAllocator.getTab(); | 310 let tab = yield tabAllocator.getTab(); |
| 356 let result = {url, requests: []}; | 311 let result = {url, requests: []}; |
| 357 let requestNotifier; | 312 let requestNotifier; |
| 358 try | 313 try |
| 359 { | 314 { |
| 360 result.startTime = Date.now(); | 315 result.startTime = Date.now(); |
| 361 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, | 316 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, |
| 362 function(entry, scanComplete) | 317 function(entry, scanComplete) |
| 363 { | 318 { |
| 364 if (!entry) | 319 if (!entry) |
| 365 return; | 320 return; |
| 366 let {type: contentType, location, filter} = entry; | 321 let {type: contentType, location, filter} = entry; |
| 367 result.requests.push({location, contentType, filter}); | 322 result.requests.push({location, contentType, filter}); |
| 368 }); | 323 }); |
| 369 | 324 |
| 370 tab.linkedBrowser.loadURI(url, null, null); | 325 tab.linkedBrowser.loadURI(url, null, null); |
| 371 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); | 326 |
| 327 Object.assign(result, yield getPageInfo(tab, timeout)); | |
| 328 result.finalUrl = tab.linkedBrowser.currentURI.spec; | |
| 372 result.endTime = Date.now(); | 329 result.endTime = Date.now(); |
| 373 result.finalUrl = tab.linkedBrowser.currentURI.spec; | |
| 374 | |
| 375 let document = tab.linkedBrowser.contentDocument; | |
| 376 if (document.documentElement) | |
| 377 { | |
| 378 try | |
| 379 { | |
| 380 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas"); | |
| 381 canvas.width = document.documentElement.scrollWidth; | |
| 382 canvas.height = document.documentElement.scrollHeight; | |
| 383 | |
| 384 let context = canvas.getContext("2d"); | |
| 385 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)"); | |
| 386 result.screenshot = canvas.toDataURL("image/jpeg", 0.8); | |
| 387 } | |
| 388 catch (e) | |
| 389 { | |
| 390 reportException(e); | |
| 391 result.error = "Capturing screenshot failed: " + e; | |
| 392 } | |
| 393 | |
| 394 // TODO: Capture frames as well? | |
| 395 let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); | |
| 396 result.source = serializer.serializeToString(document.documentElement); | |
| 397 } | |
| 398 } | 330 } |
| 399 finally | 331 finally |
| 400 { | 332 { |
| 401 if (requestNotifier) | 333 if (requestNotifier) |
| 402 requestNotifier.shutdown(); | 334 requestNotifier.shutdown(); |
| 403 tabAllocator.releaseTab(tab); | 335 tabAllocator.releaseTab(tab); |
| 404 } | 336 } |
| 405 return result; | 337 return result; |
| 406 } | 338 } |
| 407 | 339 |
| 408 function reportException(e) | 340 function reportException(e) |
| 409 { | 341 { |
| 410 let stack = ""; | 342 let stack = ""; |
| 411 if (e && typeof e == "object" && "stack" in e) | 343 if (e && typeof e == "object" && "stack" in e) |
| 412 stack = e.stack + "\n"; | 344 stack = e.stack + "\n"; |
| 413 | 345 |
| 414 Cu.reportError(e); | 346 Cu.reportError(e); |
| 415 dump(e + "\n" + stack + "\n"); | 347 dump(e + "\n" + stack + "\n"); |
| 416 } | 348 } |
| 349 | |
| 350 let {addonRoot} = require("info"); | |
| 351 let frameScriptPath = addonRoot + "/lib/child/frameScript.js"; | |
| 352 let globalMessageManager = Services.mm; | |
| 353 globalMessageManager.loadFrameScript(frameScriptPath, true); | |
| 354 | |
| 355 onShutdown.add(() => | |
| 356 { | |
| 357 globalMessageManager.removeDelayedFrameScript(frameScriptPath); | |
| 358 }); | |
| OLD | NEW |