 Issue 29338153:
  Issue 3780 - wait for the loading of filters and only afterwards start to fetch pages  (Closed)
    
  
    Issue 29338153:
  Issue 3780 - wait for the loading of filters and only afterwards start to fetch pages  (Closed) 
  | Left: | ||
| Right: | 
| LEFT | RIGHT | 
|---|---|
| 1 /* | 1 /* | 
| 2 * This Source Code is subject to the terms of the Mozilla Public License | 2 * This Source Code is subject to the terms of the Mozilla Public License | 
| 3 * version 2.0 (the "License"). You can obtain a copy of the License at | 3 * version 2.0 (the "License"). You can obtain a copy of the License at | 
| 4 * http://mozilla.org/MPL/2.0/. | 4 * http://mozilla.org/MPL/2.0/. | 
| 5 */ | 5 */ | 
| 6 | 6 | 
| 7 /** | 7 /** | 
| 8 * @module crawler | 8 * @module crawler | 
| 9 */ | 9 */ | 
| 10 | 10 | 
| 11 Cu.import("resource://gre/modules/Services.jsm"); | 11 Cu.import("resource://gre/modules/Services.jsm"); | 
| 12 Cu.import("resource://gre/modules/Task.jsm"); | 12 Cu.import("resource://gre/modules/Task.jsm"); | 
| 13 Cu.import("resource://gre/modules/Promise.jsm"); | 13 Cu.import("resource://gre/modules/Promise.jsm"); | 
| 14 | 14 | 
| 15 function abprequire(module) | 15 function abprequire(module) | 
| 16 { | 16 { | 
| 17 let result = {}; | 17 let result = {}; | 
| 18 result.wrappedJSObject = result; | 18 result.wrappedJSObject = result; | 
| 19 Services.obs.notifyObservers(result, "adblockplus-require", module); | 19 Services.obs.notifyObservers(result, "adblockplus-require", module); | 
| 20 return result.exports; | 20 return result.exports; | 
| 21 } | 21 } | 
| 22 | 22 | 
| 23 let {RequestNotifier} = abprequire("requestNotifier"); | 23 let {RequestNotifier} = abprequire("requestNotifier"); | 
| 24 | |
| 25 let {FilterNotifier} = abprequire("filterNotifier"); | 24 let {FilterNotifier} = abprequire("filterNotifier"); | 
| 26 let {FilterStorage} = abprequire("filterStorage"); | 25 let {FilterStorage} = abprequire("filterStorage"); | 
| 27 | 26 | 
| 28 /** | 27 /** | 
| 29 * Creates a pool of tabs and allocates them to tasks on request. | 28 * Creates a pool of tabs and allocates them to tasks on request. | 
| 30 * | 29 * | 
| 31 * @param {tabbrowser} browser | 30 * @param {tabbrowser} browser | 
| 32 * The tabbed browser where tabs should be created | 31 * The tabbed browser where tabs should be created | 
| 33 * @param {int} maxtabs | 32 * @param {int} maxtabs | 
| 34 * The maximum number of tabs to be allocated | 33 * The maximum number of tabs to be allocated | 
| (...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 192 }; | 191 }; | 
| 193 | 192 | 
| 194 /** | 193 /** | 
| 195 * Starts the crawling session. The crawler opens each URL in a tab and stores | 194 * Starts the crawling session. The crawler opens each URL in a tab and stores | 
| 196 * the results. | 195 * the results. | 
| 197 * | 196 * | 
| 198 * @param {Window} window | 197 * @param {Window} window | 
| 199 * The browser window we're operating in | 198 * The browser window we're operating in | 
| 200 * @param {String[]} urls | 199 * @param {String[]} urls | 
| 201 * URLs to be crawled | 200 * URLs to be crawled | 
| 202 * @param {int} number_of_tabs | 201 * @param {int} timeout | 
| 202 * Load timeout in milliseconds | |
| 203 * @param {int} maxtabs | |
| 203 * Maximum number of tabs to be opened | 204 * Maximum number of tabs to be opened | 
| 204 * @param {String} targetURL | 205 * @param {String} targetURL | 
| 205 * URL that should receive the results | 206 * URL that should receive the results | 
| 207 * @param {Function} onDone | |
| 208 * The callback which is called after finishing of crawling of all URLs. | |
| 206 */ | 209 */ | 
| 207 function run(window, urls, timeout, maxtabs, targetURL, onDone) | 210 function run(window, urls, timeout, maxtabs, targetURL, onDone) | 
| 211 { | |
| 212 new Promise((resolve, reject) => | |
| 213 { | |
| 214 if (FilterStorage.subscriptions.length > 0) | |
| 215 { | |
| 216 resolve(); | |
| 217 return; | |
| 218 } | |
| 219 let onFiltersLoaded = (action, item, newValue, oldValue) => | |
| 220 { | |
| 221 if (action == "load") | |
| 222 { | |
| 223 FilterNotifier.removeListener(onFiltersLoaded); | |
| 224 resolve(); | |
| 225 } | |
| 226 }; | |
| 227 FilterNotifier.addListener(onFiltersLoaded); | |
| 228 }).then(() => crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)) | |
| 229 .catch(reportException); | |
| 230 } | |
| 231 exports.run = run; | |
| 232 | |
| 233 /** | |
| 234 * Spawns a {Task} task to crawl each url from `urls` argument and calls | |
| 235 * `onDone` when all tasks are finished. | |
| 236 * @param {Window} window | |
| 237 * The browser window we're operating in | |
| 238 * @param {String[]} urls | |
| 239 * URLs to be crawled | |
| 240 * @param {int} timeout | |
| 241 * Load timeout in milliseconds | |
| 242 * @param {int} maxtabs | |
| 243 * Maximum number of tabs to be opened | |
| 244 * @param {String} targetURL | |
| 245 * URL that should receive the results | |
| 246 * @param {Function} onDone | |
| 247 * The callback which is called after finishing of all tasks. | |
| 248 */ | |
| 249 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) | |
| 208 { | 250 { | 
| 209 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); | 251 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); | 
| 210 let loadListener = new LoadListener(window.getBrowser(), timeout); | 252 let loadListener = new LoadListener(window.getBrowser(), timeout); | 
| 211 let running = 0; | 253 let running = 0; | 
| 212 let windowCloser = new WindowCloser(); | 254 let windowCloser = new WindowCloser(); | 
| 213 let taskDone = function() | 255 let taskDone = function() | 
| 214 { | 256 { | 
| 215 running--; | 257 running--; | 
| 216 if (running <= 0) | 258 if (running <= 0) | 
| 217 { | 259 { | 
| 218 loadListener.stop(); | 260 loadListener.stop(); | 
| 219 windowCloser.stop(); | 261 windowCloser.stop(); | 
| 220 onDone(); | 262 onDone(); | 
| 221 } | 263 } | 
| 222 }; | 264 }; | 
| 223 | 265 | 
| 224 new Promise(function(resolve, reject) | 266 for (let url of urls) | 
| 
Wladimir Palant
2016/03/15 09:13:10
Nit: Why not use arrow functions consistently? (re
 
sergei
2016/03/15 12:16:23
We don't need to capture `this` here, so I have no
 
Wladimir Palant
2016/03/15 13:44:22
We generally use arrow functions for inline callba
 | |
| 225 { | 267 { | 
| 226 if (FilterStorage.subscriptions.length > 0 && !FilterStorage._loading) | 268 running++; | 
| 
Wladimir Palant
2016/03/15 09:13:10
_loading is an internal flag to prevent reentrance
 
Wladimir Palant
2016/03/15 09:18:11
Actually, I think that this is a bad assumption fr
 
sergei
2016/03/15 12:16:23
I thought about it.
It does happen that "load" ev
 
Wladimir Palant
2016/03/15 13:44:22
No, subscriptions are added all at once when loadi
 
sergei
2016/03/15 14:44:25
Acknowledged.
 | |
| 227 { | 269 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) | 
| 228 resolve(); | 270 { | 
| 229 return; | 271 let request = new XMLHttpRequest(); | 
| 230 } | 272 request.open("POST", targetURL); | 
| 231 FilterNotifier.addListener((action, item, newValue, oldValue) => | 273 request.addEventListener("load", taskDone, false); | 
| 232 { | 274 request.addEventListener("error", taskDone, false); | 
| 233 if (action === "load") | 275 request.send(JSON.stringify(result)); | 
| 
Wladimir Palant
2016/03/15 09:13:11
Nit: We don't usually use strict equality, and it
 
sergei
2016/03/15 12:16:23
Fixed.
 | |
| 234 { | 276 }, function(url, exception) | 
| 235 resolve(); | 277 { | 
| 236 } | 278 reportException(exception); | 
| 237 }); | 279 | 
| 238 }).then(_ => | 280 let request = new XMLHttpRequest(); | 
| 
Wladimir Palant
2016/03/15 09:13:11
No pointless parameter please, () =>
 
sergei
2016/03/15 12:16:23
Acknowledged.
 | |
| 239 { | 281 request.open("POST", targetURL); | 
| 240 for (let url of urls) | 282 request.addEventListener("load", taskDone, false); | 
| 241 { | 283 request.addEventListener("error", taskDone, false); | 
| 242 running++; | 284 request.send(JSON.stringify({ | 
| 243 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) | 285 url: url, | 
| 244 { | 286 startTime: Date.now(), | 
| 245 let request = new XMLHttpRequest(); | 287 error: String(exception) | 
| 246 request.open("POST", targetURL); | 288 })); | 
| 247 request.addEventListener("load", taskDone, false); | 289 }.bind(null, url)); | 
| 248 request.addEventListener("error", taskDone, false); | 290 } | 
| 249 request.send(JSON.stringify(result)); | 291 } | 
| 250 }, function(url, exception) | |
| 251 { | |
| 252 reportException(exception); | |
| 253 | |
| 254 let request = new XMLHttpRequest(); | |
| 255 request.open("POST", targetURL); | |
| 256 request.addEventListener("load", taskDone, false); | |
| 257 request.addEventListener("error", taskDone, false); | |
| 258 request.send(JSON.stringify({ | |
| 259 url: url, | |
| 260 startTime: Date.now(), | |
| 261 error: String(exception) | |
| 262 })); | |
| 263 }.bind(null, url)); | |
| 264 } | |
| 265 // Be careful, `catch` does not catch exeptions from any asynchronous calls | |
| 
Wladimir Palant
2016/03/15 09:13:11
exeptions => exceptions.
 
sergei
2016/03/15 12:16:23
Done. Basically, I have removed the comment.
 | |
| 266 // of this `then` handler because the latter one does not return an array of | |
| 267 // promises of asynchrounous tasks and does not contain any waiting code. | |
| 268 }).catch(reportException); | |
| 
Wladimir Palant
2016/03/15 09:13:11
I'm not really happy with the way this is structur
 
sergei
2016/03/15 12:16:23
Done.
 | |
| 269 } | |
| 270 exports.run = run; | |
| 271 | 292 | 
| 272 /** | 293 /** | 
| 273 * Crawls a URL. This is a generator meant to be used via a Task object. | 294 * Crawls a URL. This is a generator meant to be used via a Task object. | 
| 274 * | 295 * | 
| 275 * @param {String} url | 296 * @param {String} url | 
| 276 * @param {TabAllocator} tabAllocator | 297 * @param {TabAllocator} tabAllocator | 
| 277 * @param {loadListener} loadListener | 298 * @param {loadListener} loadListener | 
| 278 * @result {Object} | 299 * @result {Object} | 
| 279 * Crawling result | 300 * Crawling result | 
| 280 */ | 301 */ | 
| 281 function* crawl_url(url, tabAllocator, loadListener) | 302 function* crawl_url(url, tabAllocator, loadListener) | 
| 282 { | 303 { | 
| 283 let tab = yield tabAllocator.getTab(); | 304 let tab = yield tabAllocator.getTab(); | 
| 284 let result = {url, requests: []}; | 305 let result = {url, requests: []}; | 
| 285 | 306 let requestNotifier; | 
| 286 try | 307 try | 
| 287 { | 308 { | 
| 288 result.startTime = Date.now(); | 309 result.startTime = Date.now(); | 
| 289 let requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, function({type, location, filter}, scanComplete) | 310 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, | 
| 290 { | 311 function(entry, scanComplete) | 
| 291 result.requests.push({location, contentType: type, filter}); | 312 { | 
| 313 if (!entry) | |
| 314 return; | |
| 315 let {type: contentType, location, filter} = entry; | |
| 316 result.requests.push({location, contentType, filter}); | |
| 292 }); | 317 }); | 
| 293 | 318 | 
| 294 tab.linkedBrowser.loadURI(url, null, null); | 319 tab.linkedBrowser.loadURI(url, null, null); | 
| 295 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); | 320 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); | 
| 296 result.endTime = Date.now(); | 321 result.endTime = Date.now(); | 
| 297 result.finalUrl = tab.linkedBrowser.currentURI.spec; | 322 result.finalUrl = tab.linkedBrowser.currentURI.spec; | 
| 298 | 323 | 
| 299 let document = tab.linkedBrowser.contentDocument; | 324 let document = tab.linkedBrowser.contentDocument; | 
| 300 if (document.documentElement) | 325 if (document.documentElement) | 
| 301 { | 326 { | 
| (...skipping 13 matching lines...) Expand all Loading... | |
| 315 result.error = "Capturing screenshot failed: " + e; | 340 result.error = "Capturing screenshot failed: " + e; | 
| 316 } | 341 } | 
| 317 | 342 | 
| 318 // TODO: Capture frames as well? | 343 // TODO: Capture frames as well? | 
| 319 let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); | 344 let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); | 
| 320 result.source = serializer.serializeToString(document.documentElement); | 345 result.source = serializer.serializeToString(document.documentElement); | 
| 321 } | 346 } | 
| 322 } | 347 } | 
| 323 finally | 348 finally | 
| 324 { | 349 { | 
| 350 if (requestNotifier) | |
| 351 requestNotifier.shutdown(); | |
| 325 tabAllocator.releaseTab(tab); | 352 tabAllocator.releaseTab(tab); | 
| 326 } | 353 } | 
| 327 return result; | 354 return result; | 
| 328 } | 355 } | 
| 329 | 356 | 
| 330 function reportException(e) | 357 function reportException(e) | 
| 331 { | 358 { | 
| 332 let stack = ""; | 359 let stack = ""; | 
| 333 if (e && typeof e == "object" && "stack" in e) | 360 if (e && typeof e == "object" && "stack" in e) | 
| 334 stack = e.stack + "\n"; | 361 stack = e.stack + "\n"; | 
| 335 | 362 | 
| 336 Cu.reportError(e); | 363 Cu.reportError(e); | 
| 337 dump(e + "\n" + stack + "\n"); | 364 dump(e + "\n" + stack + "\n"); | 
| 338 } | 365 } | 
| LEFT | RIGHT |