Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: lib/crawler.js

Issue 29338153: Issue 3780 - wait for the loading of filters and only afterwards start to fetch pages (Closed)
Left Patch Set: Created March 11, 2016, 3:55 p.m.
Right Patch Set: remove onFiltersLoaded listener and fit 80 chars Created March 15, 2016, 2:57 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT RIGHT
1 /* 1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License 2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at 3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/. 4 * http://mozilla.org/MPL/2.0/.
5 */ 5 */
6 6
7 /** 7 /**
8 * @module crawler 8 * @module crawler
9 */ 9 */
10 10
11 Cu.import("resource://gre/modules/Services.jsm"); 11 Cu.import("resource://gre/modules/Services.jsm");
12 Cu.import("resource://gre/modules/Task.jsm"); 12 Cu.import("resource://gre/modules/Task.jsm");
13 Cu.import("resource://gre/modules/Promise.jsm"); 13 Cu.import("resource://gre/modules/Promise.jsm");
14 14
15 function abprequire(module) 15 function abprequire(module)
16 { 16 {
17 let result = {}; 17 let result = {};
18 result.wrappedJSObject = result; 18 result.wrappedJSObject = result;
19 Services.obs.notifyObservers(result, "adblockplus-require", module); 19 Services.obs.notifyObservers(result, "adblockplus-require", module);
20 return result.exports; 20 return result.exports;
21 } 21 }
22 22
23 let {RequestNotifier} = abprequire("requestNotifier"); 23 let {RequestNotifier} = abprequire("requestNotifier");
24
25 let {FilterNotifier} = abprequire("filterNotifier"); 24 let {FilterNotifier} = abprequire("filterNotifier");
26 let {FilterStorage} = abprequire("filterStorage"); 25 let {FilterStorage} = abprequire("filterStorage");
27 26
28 /** 27 /**
29 * Creates a pool of tabs and allocates them to tasks on request. 28 * Creates a pool of tabs and allocates them to tasks on request.
30 * 29 *
31 * @param {tabbrowser} browser 30 * @param {tabbrowser} browser
32 * The tabbed browser where tabs should be created 31 * The tabbed browser where tabs should be created
33 * @param {int} maxtabs 32 * @param {int} maxtabs
34 * The maximum number of tabs to be allocated 33 * The maximum number of tabs to be allocated
(...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after
192 }; 191 };
193 192
194 /** 193 /**
195 * Starts the crawling session. The crawler opens each URL in a tab and stores 194 * Starts the crawling session. The crawler opens each URL in a tab and stores
196 * the results. 195 * the results.
197 * 196 *
198 * @param {Window} window 197 * @param {Window} window
199 * The browser window we're operating in 198 * The browser window we're operating in
200 * @param {String[]} urls 199 * @param {String[]} urls
201 * URLs to be crawled 200 * URLs to be crawled
202 * @param {int} number_of_tabs 201 * @param {int} timeout
202 * Load timeout in milliseconds
203 * @param {int} maxtabs
203 * Maximum number of tabs to be opened 204 * Maximum number of tabs to be opened
204 * @param {String} targetURL 205 * @param {String} targetURL
205 * URL that should receive the results 206 * URL that should receive the results
207 * @param {Function} onDone
208 * The callback which is called after finishing of crawling of all URLs.
206 */ 209 */
207 function run(window, urls, timeout, maxtabs, targetURL, onDone) 210 function run(window, urls, timeout, maxtabs, targetURL, onDone)
211 {
212 new Promise((resolve, reject) =>
213 {
214 if (FilterStorage.subscriptions.length > 0)
215 {
216 resolve();
217 return;
218 }
219 let onFiltersLoaded = (action, item, newValue, oldValue) =>
220 {
221 if (action == "load")
222 {
223 FilterNotifier.removeListener(onFiltersLoaded);
224 resolve();
225 }
226 };
227 FilterNotifier.addListener(onFiltersLoaded);
228 }).then(() => crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone))
229 .catch(reportException);
230 }
231 exports.run = run;
232
233 /**
234 * Spawns a {Task} task to crawl each url from `urls` argument and calls
235 * `onDone` when all tasks are finished.
236 * @param {Window} window
237 * The browser window we're operating in
238 * @param {String[]} urls
239 * URLs to be crawled
240 * @param {int} timeout
241 * Load timeout in milliseconds
242 * @param {int} maxtabs
243 * Maximum number of tabs to be opened
244 * @param {String} targetURL
245 * URL that should receive the results
246 * @param {Function} onDone
247 * The callback which is called after finishing of all tasks.
248 */
249 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)
208 { 250 {
209 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 251 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
210 let loadListener = new LoadListener(window.getBrowser(), timeout); 252 let loadListener = new LoadListener(window.getBrowser(), timeout);
211 let running = 0; 253 let running = 0;
212 let windowCloser = new WindowCloser(); 254 let windowCloser = new WindowCloser();
213 let taskDone = function() 255 let taskDone = function()
214 { 256 {
215 running--; 257 running--;
216 if (running <= 0) 258 if (running <= 0)
217 { 259 {
218 loadListener.stop(); 260 loadListener.stop();
219 windowCloser.stop(); 261 windowCloser.stop();
220 onDone(); 262 onDone();
221 } 263 }
222 }; 264 };
223 265
224 new Promise(function(resolve, reject) 266 for (let url of urls)
Wladimir Palant 2016/03/15 09:13:10 Nit: Why not use arrow functions consistently? (re
sergei 2016/03/15 12:16:23 We don't need to capture `this` here, so I have no
Wladimir Palant 2016/03/15 13:44:22 We generally use arrow functions for inline callba
225 { 267 {
226 if (FilterStorage.subscriptions.length > 0 && !FilterStorage._loading) 268 running++;
Wladimir Palant 2016/03/15 09:13:10 _loading is an internal flag to prevent reentrance
Wladimir Palant 2016/03/15 09:18:11 Actually, I think that this is a bad assumption fr
sergei 2016/03/15 12:16:23 I thought about it. It does happen that "load" ev
Wladimir Palant 2016/03/15 13:44:22 No, subscriptions are added all at once when loadi
sergei 2016/03/15 14:44:25 Acknowledged.
227 { 269 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result)
228 resolve(); 270 {
229 return; 271 let request = new XMLHttpRequest();
230 } 272 request.open("POST", targetURL);
231 FilterNotifier.addListener((action, item, newValue, oldValue) => 273 request.addEventListener("load", taskDone, false);
232 { 274 request.addEventListener("error", taskDone, false);
233 if (action === "load") 275 request.send(JSON.stringify(result));
Wladimir Palant 2016/03/15 09:13:11 Nit: We don't usually use strict equality, and it
sergei 2016/03/15 12:16:23 Fixed.
234 { 276 }, function(url, exception)
235 resolve(); 277 {
236 } 278 reportException(exception);
237 }); 279
238 }).then(_ => 280 let request = new XMLHttpRequest();
Wladimir Palant 2016/03/15 09:13:11 No pointless parameter please, () =>
sergei 2016/03/15 12:16:23 Acknowledged.
239 { 281 request.open("POST", targetURL);
240 for (let url of urls) 282 request.addEventListener("load", taskDone, false);
241 { 283 request.addEventListener("error", taskDone, false);
242 running++; 284 request.send(JSON.stringify({
243 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) 285 url: url,
244 { 286 startTime: Date.now(),
245 let request = new XMLHttpRequest(); 287 error: String(exception)
246 request.open("POST", targetURL); 288 }));
247 request.addEventListener("load", taskDone, false); 289 }.bind(null, url));
248 request.addEventListener("error", taskDone, false); 290 }
249 request.send(JSON.stringify(result)); 291 }
250 }, function(url, exception)
251 {
252 reportException(exception);
253
254 let request = new XMLHttpRequest();
255 request.open("POST", targetURL);
256 request.addEventListener("load", taskDone, false);
257 request.addEventListener("error", taskDone, false);
258 request.send(JSON.stringify({
259 url: url,
260 startTime: Date.now(),
261 error: String(exception)
262 }));
263 }.bind(null, url));
264 }
265 // Be careful, `catch` does not catch exeptions from any asynchronous calls
Wladimir Palant 2016/03/15 09:13:11 exeptions => exceptions.
sergei 2016/03/15 12:16:23 Done. Basically, I have removed the comment.
266 // of this `then` handler because the latter one does not return an array of
267 // promises of asynchrounous tasks and does not contain any waiting code.
268 }).catch(reportException);
Wladimir Palant 2016/03/15 09:13:11 I'm not really happy with the way this is structur
sergei 2016/03/15 12:16:23 Done.
269 }
270 exports.run = run;
271 292
272 /** 293 /**
273 * Crawls a URL. This is a generator meant to be used via a Task object. 294 * Crawls a URL. This is a generator meant to be used via a Task object.
274 * 295 *
275 * @param {String} url 296 * @param {String} url
276 * @param {TabAllocator} tabAllocator 297 * @param {TabAllocator} tabAllocator
277 * @param {loadListener} loadListener 298 * @param {loadListener} loadListener
278 * @result {Object} 299 * @result {Object}
279 * Crawling result 300 * Crawling result
280 */ 301 */
281 function* crawl_url(url, tabAllocator, loadListener) 302 function* crawl_url(url, tabAllocator, loadListener)
282 { 303 {
283 let tab = yield tabAllocator.getTab(); 304 let tab = yield tabAllocator.getTab();
284 let result = {url, requests: []}; 305 let result = {url, requests: []};
285 306 let requestNotifier;
286 try 307 try
287 { 308 {
288 result.startTime = Date.now(); 309 result.startTime = Date.now();
289 let requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, function({type, location, filter}, scanComplete) 310 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID,
290 { 311 function(entry, scanComplete)
291 result.requests.push({location, contentType: type, filter}); 312 {
313 if (!entry)
314 return;
315 let {type: contentType, location, filter} = entry;
316 result.requests.push({location, contentType, filter});
292 }); 317 });
293 318
294 tab.linkedBrowser.loadURI(url, null, null); 319 tab.linkedBrowser.loadURI(url, null, null);
295 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); 320 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab);
296 result.endTime = Date.now(); 321 result.endTime = Date.now();
297 result.finalUrl = tab.linkedBrowser.currentURI.spec; 322 result.finalUrl = tab.linkedBrowser.currentURI.spec;
298 323
299 let document = tab.linkedBrowser.contentDocument; 324 let document = tab.linkedBrowser.contentDocument;
300 if (document.documentElement) 325 if (document.documentElement)
301 { 326 {
(...skipping 13 matching lines...) Expand all
315 result.error = "Capturing screenshot failed: " + e; 340 result.error = "Capturing screenshot failed: " + e;
316 } 341 }
317 342
318 // TODO: Capture frames as well? 343 // TODO: Capture frames as well?
319 let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); 344 let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
320 result.source = serializer.serializeToString(document.documentElement); 345 result.source = serializer.serializeToString(document.documentElement);
321 } 346 }
322 } 347 }
323 finally 348 finally
324 { 349 {
350 if (requestNotifier)
351 requestNotifier.shutdown();
325 tabAllocator.releaseTab(tab); 352 tabAllocator.releaseTab(tab);
326 } 353 }
327 return result; 354 return result;
328 } 355 }
329 356
330 function reportException(e) 357 function reportException(e)
331 { 358 {
332 let stack = ""; 359 let stack = "";
333 if (e && typeof e == "object" && "stack" in e) 360 if (e && typeof e == "object" && "stack" in e)
334 stack = e.stack + "\n"; 361 stack = e.stack + "\n";
335 362
336 Cu.reportError(e); 363 Cu.reportError(e);
337 dump(e + "\n" + stack + "\n"); 364 dump(e + "\n" + stack + "\n");
338 } 365 }
LEFT RIGHT
« no previous file | no next file » | Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Toggle Comments ('s')

Powered by Google App Engine
This is Rietveld