lib/crawler.js - Issue 10233013: Crawler, second version

Side by Side Diff: lib/crawler.js

Issue 10233013: Crawler, second version (Closed)

Patch Set: Created April 12, 2013, 1:38 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 Cu.import("resource://gre/modules/Services.jsm");	1 Cu.import( "resource://gre/modules/Services.jsm" );

2	2

3 function abprequire(module)	3 function abprequire( module )

4 {	4 {

5 let result = {};	5 let result = {};

6 result.wrappedJSObject = result;	6 result.wrappedJSObject = result;

7 Services.obs.notifyObservers(result, "adblockplus-require", module);	7 Services.obs.notifyObservers( result, "adblockplus-require", module );

8 return result.exports;	8 return result.exports;

9 }	9 }

10	10

11 let {Storage} = require("storage");	11 let { Action } = require( "action" );

12 let {Client} = require("client");	12 let { Browser_Tab, Tabbed_Browser } = require( "browser" );

13	13 let { Observation } = require( "instruction" );

14 let {Policy} = abprequire("contentPolicy");	14 let { Encoding } = require( "encoding" );

15 let {Filter} = abprequire("filterClasses");	15 let { Logger } = require( "logger" );

16 let {Utils} = abprequire("utils");	16

17	17 let { Policy } = abprequire( "contentPolicy" );

18 let origProcessNode = Policy.processNode;	18 let { RequestNotifier } = abprequire( "requestNotifier" );

19	19 let { Filter } = abprequire( "filterClasses" );

20 let siteTabs;	20 let { Utils } = abprequire( "utils" );

21 let currentTabs;	21

22	22 //-------------------------------------------------------

23 function processNode(wnd, node, contentType, location, collapse)	23 // Shim

24 {	24 //-------------------------------------------------------

25 let result = origProcessNode.apply(this, arguments);	25 /**

26 let url = (contentType === Policy.type.ELEMHIDE) ? location.text :	26 * Manager for shim replacement of an external function.

27 location.spec;	27 * <p/>

28	28 * Since there's no lvalue reference type in JavaScript (non-primitives are all reference types, but they are rvalue

29 let topWindow = wnd.top;	29 * references), the arguments here provide a substitute. The reference is the expression 'object[ property ]'.

30 if (!topWindow.document)	30 *

31 {	31 * @param {Object} original_object

32 Cu.reportError("No document associated with the node's top window");	32 * The object that holds, as one of its properties, the function to be surrounded by the shim.

33 return result;	33 * @param {string} original_property

34 }	34 * The name of the property on 'original_object' that holds the function to be surrounded by the shim.

35	35 * @constructor

36 let tabbrowser = Utils.getChromeWindow(topWindow).gBrowser;	36 */

37 if (!tabbrowser)	37 var Shim = function( original_object, original_property )

38 {	38 {

39 Cu.reportError("Unable to get a tabbrowser reference");	39 /**

40 return result;	40 * @type {Object}

41 }	41 */

42	42 this.original_object = original_object;

43 let browser = tabbrowser.getBrowserForDocument(topWindow.document);	43 /**

44 if (!browser)	44 * @type {String}

45 {	45 */

46 Cu.reportError("Unable to get browser for the tab");	46 this.original_property = original_property;

47 return result;	47

48 }	48 /**

49	49 * The original function as it exists at the time of instantiation. This means that generally the Shim instance

50 let site = siteTabs.get(browser);	50 * should be created as soon as possible, such as in module initialization.

51 let filtered = !result;	51 */

52 Storage.write([url, site, filtered]);	52 this.original_function = original_object[ original_property ];

	53 };

	54

	55 /**

	56 * @return {boolean}

	57 */

	58 Shim.prototype.is_original = function()

	59 {

	60 return (this.original_object[ this.original_property ] === this.original_funct ion);

	61 };

	62

	63 /**

	64 *

	65 * @param {Function} replacer

	66 * The replacement function transformer. Takes the original function as an argument and returns its replacement.

	67 */

	68 Shim.prototype.replace = function( replacer )

	69 {

	70 if ( !replacer )

	71 throw "Must supply a function transformer to supply a replacement function." ;

	72 if ( !this.is_original() )

	73 throw "This version of Shim does not support multiple replacement.";

	74 this.original_object[ this.original_property ] = replacer( this.original_funct ion );

	75 return this.original_function;

	76 };

	77

	78 /**

	79 * Reset the original function to a non-replaced state.

	80 * <p/>

	81 * May be called correctly even if the original has never been replaced.

	82 */

	83 Shim.prototype.reset = function()

	84 {

	85 this.original_object[ this.original_property ] = this.original_function;

	86 };

	87

	88 /**

	89 * Close out the shim and release resources.

	90 */

	91 Shim.prototype.close = function()

	92 {

	93 this.reset();

	94 /*

	95 * At present, this class does not use external resources that aren't dealt with by 'reset()'. That could change,

	96 * however, and so we use close() as the substitute-destructor and reset() for ordinary use.

	97 */

	98 };

	99

	100 /**

	101 * Shim instance for 'processNode'. As of this writing it's the only function in ABP we're shimming.

	102 */

	103 var process_node_shim = new Shim( Policy, "processNode" );

	104

	105 //-------------------------------------------------------

	106 // Crawler

	107 //-------------------------------------------------------

	108 /**

	109 * Constructor for a single crawl session. The crawler iterates through each instruction, loading its URL in a tab,

	110 * running the hooks present in the processor, and storing results accordingly.

	111 *

	112 * @param {Instruction_Set} instructions

	113 * Instruction generator yields a sequence of tuples: URL to crawl, a processor, and storage.

	114 * @param {*} outputs

	115 * @param {Window} window

	116 * The top window we're operating in. Must be present as an argument because the module context this class is

	117 * defined in does not have a window. (Or at least should not be relied upon. ) 1

	118 * @param {number} time_limit

	119 * The maximum duration that we will allow a page to try to load.

	120 * @param {boolean} leave_open

	121 * @param {number} number_of_tabs

	122 */

	123 var Crawler = function( instructions, outputs, window, time_limit, leave_open, n umber_of_tabs )

	124 {

	125 /**

	126 * @type {Instruction_Set}

	127 */

	128 this.instructions = instructions;

	129

	130 this.outputs = outputs;

	131

	132 /**

	133 * Browser window in which to open tabs. Required because, as a module, we don 't have a 'Window' object available.

	134 * @type {Window}

	135 */

	136 this.window = window;

	137

	138 this.time_limit = time_limit;

	139

	140 this.leave_open = leave_open;

	141

	142 if ( number_of_tabs <= 0 )

	143 {

	144 /*

	145 * Defensive. The caller should have already validated this argument.

	146 */

	147 number_of_tabs = 1;

	148 }

	149

	150 /**

	151 * Progress object. It's simple enough not to need its own class. Just overrid e the notice() function to receive

	152 * progress notices.

	153 */

	154 this.progress = {

	155 active: 0,

	156 completed: 0,

	157 total: instructions.size,

	158 notice: function()

	159 {

	160 },

	161 status: function()

	162 {

	163 }

	164 };

	165

	166 if ( !process_node_shim.is_original() )

	167 throw "Function 'processNode' is already shimmed. We may not insert a second one.";

	168 process_node_shim.replace(

	169 function( original )

	170 {

	171 return this.node_action.bind( this, original );

	172 }.bind( this )

	173 );

	174

	175 /**

	176 * Logging service.

	177 * @type {Logger}

	178 */

	179 this.logger = new Logger( "Crawler" );

	180

	181 this.tabbed_browser = new Tabbed_Browser( this.window, number_of_tabs );

	182

	183 /**

	184 * Closed flag. Needed to terminate the generator if this object is closed bef ore the generator stops.

	185 * @type {Boolean}

	186 */

	187 this.closed = false;

	188

	189 /**

	190 * The object responsible for gaining access to the call stream for individual entries within each node. This is

	191 * one of two hooks into ABP itself, the other being the shim for 'processNode '.

	192 *

	193 * @type {RequestNotifier}

	194 */

	195 this.requestNotifier = new RequestNotifier( null, this.node_entry_action.bind( this ) );

	196

	197 /**

	198 * The current nodes that are active in a call to 'node_action'. In ordinary c ases, this map has at most the

	199 * maximum number of concurrent loads.

	200 * @type {WeakMap}

	201 */

	202 this.current_nodes = new WeakMap();

	203 };

	204 exports.Crawler = Crawler;

	205

	206 Crawler.prototype.toJSON = function()

	207 {

	208 return {

	209 instructions: this.instructions,

	210 storage: this.storage

	211 };

	212 };

	213

	214 /**

	215 * Close the present instance. This object holds browser resources because of th e browser tabs it holds open.

	216 */

	217 Crawler.prototype.close = function()

	218 {

	219 for ( let j = 0 ; j < this.outputs.length ; ++j )

	220 {

	221 this.outputs[j].storage.close();

	222 }

	223 if ( this.tabbed_browser ) this.tabbed_browser.close();

	224 if ( this.requestNotifier ) this.requestNotifier.shutdown();

	225 process_node_shim.reset();

	226 this.closed = true;

	227 };

	228

	229 /**

	230 * The output encoding for the session as a whole.

	231 * @type {*}

	232 */

	233 Crawler.prototype.__encoding__ = Encoding.as_object( [

	234 // prelude

	235 Encoding.immediate_fields( ["time_start", "instructions"] ),

	236 // observation

	237 Encoding.field( "trials", Encoding.array_stream() ),

	238 // postlude

	239 Encoding.immediate_fields( ["time_finish", "termination"] )

	240 ] );

	241

	242 /**

	243 * Task generator for the crawler

	244 *

	245 * @param {Function} pause

	246 * @param {Function} resume

	247 */

	248 Crawler.prototype.generator = function( pause, resume )

	249 {

	250 /*

	251 * A crawler object represents a single run of the crawler. Thus the pause and resume functions act like object-scope

	252 * variables.

	253 */

	254 this.pause = pause;

	255 this.resume = resume;

	256 var log = this.logger.make_log( "task" );

	257 var tab = null;

	258

	259 var runaway_counter = 0;

	260

	261 try

	262 {

	263 /*

	264 * Preparation code. Ensure that every initialization here can be reversed i n the 'finally' clause whether

	265 * or not it executed, in case some initialization throws an exception.

	266 */

	267 this.time_start = Logger.timestamp();

	268

	269 var multiple = new Encoding.Multiple_Format();

	270 for ( let j = 0 ; j < this.outputs.length ; ++j )

	271 {

	272 let output = this.outputs[j];

	273 let formatter = new Encoding[ output.encode ]( output.storage.writer() );

	274 multiple.add( formatter );

	275 }

	276 this.encoder = new Encoding.Format_stream( multiple );

	277

	278 this.encoder.write( this );

	279 this.encoder.sequence_start();

	280

	281 let gen = this.instructions.generator();

	282 let instruction = null; // Avoid spurious IDEA warning

	283 for ( instruction of gen )

	284 {

	285 if ( this.closed )

	286 {

	287 /*

	288 * Defensive. We only arrive here if some outside code has called our cl ose() method and did not also

	289 * order our cancellation. Regardless, we're done making new tabs.

	290 */

	291 Cu.reportError( "Crawler closed but its enclosing task not cancelled." ) ;

	292 break;

	293 }

	294

	295 if ( this.tabbed_browser.available() )

	296 {

	297 /*

	298 * Since we'll need a variety of browser-tab behaviors, we'll need to ch ange this factory call

	299 * to something dependent upon the instruction.

	300 */

	301 tab = this.tabbed_browser.make_tab( instruction.target, this.leave_open, this._deferred_load_finisher.bind( this ), null );

	302 tab.instruction = instruction;

	303 instruction.begin();

	304 let join = new Action.Join_Timeout( tab, this.time_limit, this._join_fin isher.bind( this ) );

	305 join.go( tab, resume );

	306 /*

	307 * The return value of load is an asynchronous action that could be comb ined with others, if the

	308 * instruction dictates. There's no hook for this yet, although that's t he reason we do not immediately

	309 * execute on calling load.

	310 */

	311 tab.go();

	312 ++this.progress.active;

	313 this.progress.notice();

	314 }

	315 if ( !this.tabbed_browser.available() )

	316 {

	317 pause();

	318 }

	319

	320 var cancelled = yield false;

	321 if ( cancelled )

	322 {

	323 break;

	324 }

	325 }

	326 /*

	327 * At this point in the code, we have launched all the instructions. If we'r e using more than one tab,

	328 * we'll generally have open tabs still. We need to pause until we have no m ore tabs left open.

	329 */

	330 if ( !cancelled )

	331 {

	332 while ( !this.tabbed_browser.quiescent() )

	333 {

	334 // Must yield after pause() for it to take effect

	335 pause();

	336 cancelled = yield false;

	337 if ( cancelled )

	338 {

	339 break;

	340 }

	341 ++runaway_counter;

	342 if ( runaway_counter > 100 )

	343 {

	344 Cu.reportError( "Runaway pause loop. counter = " + runaway_counter );

	345 break;

	346 }

	347 }

	348 }

	349

	350 /*

	351 * OK. Finally done.

	352 */

	353 this.termination = cancelled ? "Cancelled" : "Success";

	354 this.progress.status( cancelled ? "Cancelled" : "Done" );

	355 }

	356 catch ( e if e instanceof Error )

	357 {

	358 log( e.toString() + "\n\n" + e.stack );

	359 this.termination = "Error";

	360 }

	361 catch ( e )

	362 {

	363 log( e.toString() + " - type: " + Object.prototype.toString.call( e ) );

	364 this.termination = "Unknown exception";

	365 }

	366 finally

	367 {

	368 /*

	369 * Finish writing the output before closing ourselves down.

	370 */

	371 this.time_finish = Logger.timestamp();

	372 this.encoder.sequence_stop();

	373

	374 /*

	375 * If everything goes right, this cleanup should not be necessary, as tab in stances are closed as they are used.

	376 * Nonetheless, if there's an error and a landing function is not called, th is line ensures that all the tabs

	377 * are properly destroyed.

	378 */

	379 if ( tab ) tab.close();

	380 // Removes the ABP shim, amongst other things.

	381 this.close();

	382 }

	383 };

	384

	385 /**

	386 * Landing function for the asynchronous action of loading a tab. For some reason, Firefox is delivering the

	387 * STATE_STOP progress message before the last ABP filter is being run. It seems that it's firing events immediately,

	388 * once it knows the request has finished its HTTP transfer, but before it has f ully finished loading the page as a

	389 * whole (the DOM, layout, etc.). Hence we let the browser finish its work in th e current thread and run the actual

	390 * load-end action afterwards.

	391 * <p/>

	392 * The implementation of this function allows it to be defined without arguments . That's not what actually happens.

	393 * Since this function is just a scheduling pass-through, it uses 'arguments' to pass all arguments, no matter what they

	394 * are. (And no matter how they change over time.)

	395 */

	396 Crawler.prototype._deferred_load_finisher = function()

	397 {

	398 /*

	399 * The first argument is the 'this' object when 'apply' runs. The second argument is the 'this' object when

	400 * 'this._load_finisher' runs.

	401 */

	402 Action.dispatch( Function.prototype.apply.bind( this._load_finisher, this, arg uments ) );

	403 };

	404

	405 /**

	406 * Since we're done loading (the cause doesn't matter), we order the instruction to write out its results, be they

	407 * successful or any of the varieties of unsuccessful.

	408 */

	409 Crawler.prototype._load_finisher = function( tab, completion_state, error_code )

	410 {

	411 var instruction = tab.instruction;

	412 if ( tab.completed )

	413 {

	414 if ( tab.completed_well )

	415 {

	416 switch ( completion_state )

	417 {

	418 case Browser_Tab.Completion_State.Success:

	419 instruction.end();

	420 break;

	421 case Browser_Tab.Completion_State.No_Success:

	422 instruction.abort( "unsuccessful load. nsresult = " + error_code );

	423 break;

	424 case Browser_Tab.Completion_State.User_Close:

	425 instruction.abort( "user closed tab" );

	426 break;

	427 case Browser_Tab.Completion_State.External_Cancel:

	428 instruction.abort( "timed out" );

	429 break;

	430 default:

	431 instruction.abort( "WTF?" );

	432 break;

	433 }

	434 }

	435 else

	436 {

	437 instruction.abort( "exception. message = " + tab.exception.message );

	438 }

	439 }

	440 else

	441 {

	442 // Defensive. Should not reach.

	443 instruction.abort( "tab load not completed. Huh?" );

	444 }

	445 this.encoder.sequence_send( instruction );

	446

	447 tab.close();

	448 --this.progress.active;

	449 ++this.progress.completed;

	450 this.progress.notice();

	451 this.resume();

	452 };

	453

	454 /**

	455 * The join finisher merely ensures that the tab load action actually completes, stopping it if it hasn't completed yet.

	456 *

	457 * @param tab

	458 */

	459 Crawler.prototype._join_finisher = function( tab )

	460 {

	461 /*

	462 * If the join timeout caused completion, we must assume that the tab is still loading. If the timeout did not fire,

	463 * then the tab action completed. In all cases, the tab will be complete afterwards. Thus, because Join_Timeout is

	464 * reliable, we have made tab-load reliable also.

	465 */

	466 if ( !tab.complete )

	467 {

	468 tab.stop();

	469 }

	470 };

	471

	472 //----------------------------------

	473 // Data gathering functions

	474 //----------------------------------

	475 /**

	476 * Shim for 'processNode' in ABP. Executes once for each node that ABP processes , whether or not it acts on that node.

	477 *

	478 * @param {Function} original_f

	479 * The original processNode function.

	480 * @param {nsIDOMWindow} wnd

	481 * @param {nsIDOMElement} node

	482 * @param {Number} contentType

	483 * @param {nsIURI} location

	484 * @param {Boolean} collapse

	485 * true to force hiding of the node

	486 * @return {Boolean} false if the node should be blocked

	487 */

	488 Crawler.prototype.node_action = function( original_f, wnd, node, contentType, lo cation, collapse )

	489 {

	490 //var log = this.logger.make_log( "node_action" );

	491

	492 /*

	493 * Set up collecting for node_entry_action(). It should be the case that a nod e matches either 0 or 1 filters.

	494 * The collection array 'entries' allows more than 1 to be recorded, and for s uch activity to be detected and

	495 * reported rather than inducing an observation error.

	496 */

	497 var entries = [];

	498 var entry_hook = function( node, windows, entry )

	499 {

	500 entries.push( { node: node, windows: windows, entry: entry } );

	501 };

	502 this.current_nodes.set( node, entry_hook );

	503

	504 /*

	505 * Call the original processNode. If the original throws, then we will too, so this is outside a try clause.

	506 */

	507 var result = original_f( wnd, node, contentType, location, collapse );

	508

	509 try

	510 {

	511 let instruction = null; // Initialize here in case locate_instruction() throws.

	512 try

	513 {

	514 instruction = this.locate_instruction( wnd );

	515 }

	516 catch ( e )

	517 {

	518 Cu.reportError( "Crawler/node_action: error locating instruction: " + e.to String()

	519 + ( ( "stack" in e ) ? ( "\n\tstack = " + e.stack) : "" )

	520 );

	521 return result;

	522 }

	523 if ( !instruction )

	524 {

	525 /*

	526 * If we don't have an instruction, we don't report this node. This is by design, because reporting is

	527 * the responsibility of the instruction object.

	528 */

	529 return result;

	530 }

	531 if ( entries.length == 0 && !instruction.observing_all_nodes() )

	532 {

	533 // Assert we didn't touch this node and the instruction doesn't want to se e it

	534 return result;

	535 }

	536 try

	537 {

	538 var observation = new Observation(

	539 !result, contentType,

	540 (contentType == Policy.type.ELEMHIDE) ? location.text : location.spec,

	541 entries

	542 );

	543 instruction.observe_node( observation );

	544 }

	545 catch ( e )

	546 {

	547 Cu.reportError( "Crawler/node_action: error recording observation: " + e.t oString() );

	548 return result;

	549 }

	550 }

	551 finally

	552 {

	553 /*

	554 * This 'finally' clause ensures that we remove the node from 'this.current_ nodes'. Even though it's a weak map,

	555 * we need to remove the key so that 'entry_hook' is not called inadvertentl y.

	556 */

	557 this.current_nodes.delete( node );

	558 }

53 return result;	559 return result;

	560 };

	561

	562 /**

	563 * Locate our instruction associated with a window that caused to load. First we find the browser associated with the

	564 * window. There should always be one of these, otherwise we have an error. From the browser, we locate our tab

	565 * associated with it, which need not be present. Finally, we locate the instruc tion as a tab member, which should

	566 * always exist.

	567 * <p/>

	568 * This is called only in node_action(). It's separate to simplify the control f low.

	569 *

	570 * @param window

	571 * @return {Instruction_class}

	572 */

	573 Crawler.prototype.locate_instruction = function( window )

	574 {

	575 let topWindow = window.top;

	576 if ( !topWindow.document )

	577 throw new Error( "No document associated with the node's top window" );

	578 let tabbrowser = Utils.getChromeWindow( topWindow ).gBrowser;

	579 if ( !tabbrowser )

	580 throw new Error( "Unable to get a tabbrowser reference from the window" );

	581 let browser = tabbrowser.getBrowserForDocument( topWindow.document );

	582 if ( !browser )

	583 throw new Error( "Unable to get browser for the tab" );

	584 if ( !this.tabbed_browser.map_browser_to_child.has( browser ) )

	585 {

	586 /*

	587 * It's not an error for the browser not to appear in this map. If the tab remains open past the time

	588 * we are monitoring (either on purpose or as the result of a quirk of timin g), we simply return a null

	589 * instruction. Nevertheless, the code to report this to the console remains in place, commented out, because

	590 * it's likely to be relevant still during development.

	591 */

	592 // Cu.reportError(

	593 // "Crawler.node_action: Browser not found in internal map. " + Logger. timestamp()

	594 // + "\nlocation=" + url_location

	595 // );

	596 // this.logger.stack_trace();

	597 return null;

	598 }

	599 var tab = this.tabbed_browser.map_browser_to_child.get( browser ).child;

	600 if ( !("instruction" in tab) )

	601 throw new Error( "'instruction' not found as member of tab object" );

	602 return tab.instruction;

	603 };

	604

	605 /**

	606 * This function executes solely underneath (in the call stack) 'node_action'. I t receives at least one call per node,

	607 * more if there are matches on rules of any kind.

	608 *

	609 * @param window

	610 * @param node

	611 * @param {RequestEntry} entry

	612 */

	613 Crawler.prototype.node_entry_action = function( window, node, entry )

	614 {

	615 if ( !this.current_nodes.has( node ) )

	616 {

	617 Cu.reportError( "node_entry_action: node not seen in 'current_nodes'" );

	618 return;

	619 }

	620 if ( !entry.filter )

	621 {

	622 /*

	623 * If there's no filter in the entry, then nothing happened to it. We are pr esently ignoring such entries. In

	624 * the future, however, we will likely want a hook here to process entries t hat are not associated with any

	625 * filter, for example, to ensure that necessary content is not blocked inad vertently.

	626 */

	627 return;

	628 }

	629 var windows = [];

	630 var n = 0;

	631 while ( window != null )

	632 {

	633 if ( ++n > 100 )

	634 {

	635 // Houston, we have a problem.

	636 windows = null;

	637 Cu.reportError( "Crawler/node_entry_action: runaway window chain" );

	638 break;

	639 }

	640 windows.push( window );

	641 if ( window === window.parent )

	642 {

	643 // This is the ordinary statement to exit the loop.

	644 break;

	645 }

	646 window = window.parent;

	647 }

	648 this.current_nodes.get( node )( node, windows, entry );

	649 };

	650

	651

	652 function shutdown_crawler()

	653 {

	654 process_node_shim.close();

54 }	655 }

55	656

56 function loadSite(site, window, callback)	657 try

57 {	658 {

58 if (!site)	659 onShutdown.add( shutdown_crawler );

59 return;

60

61 let tabbrowser = window.gBrowser;

62 let tab = tabbrowser.addTab(site);

63 let browser = tabbrowser.getBrowserForTab(tab);

64

65 siteTabs.set(browser, site);

66

67 let progressListener = {

68 onStateChange: function(aBrowser, aWebProgress, aRequest, aStateFlags, aStat us)

69 {

70 if (browser !== aBrowser)

71 return;

72

73 if (!(aStateFlags & Ci.nsIWebProgressListener.STATE_STOP))

74 return;

75

76 tabbrowser.removeTabsProgressListener(progressListener);

77 tabbrowser.removeTab(tab);

78 callback();

79 }

80 };

81 tabbrowser.addTabsProgressListener(progressListener);

82 }	660 }

83	661 catch ( e )

84 function loadSites(backendUrl, parallelTabs, window, sites, callback)	662 {

85 {	663 Cu.reportError( "Failure adding shutdown function. error = \"" + e.message + " \"" );

86 while (currentTabs < parallelTabs && sites.length)

87 {

88 currentTabs++;

89 let site = sites.shift();

90 loadSite(site, window, function()

91 {

92 currentTabs--;

93 if (!sites.length && !currentTabs)

94 {

95 Storage.finish();

96 let dataFilePath = Storage.dataFile.path;

97 Client.sendCrawlerDataFile(backendUrl, dataFilePath, function()

98 {

99 Storage.destroy();

100 callback();

101 });

102 }

103 else

104 loadSites(backendUrl, parallelTabs, window, sites, callback);

105 });

106 }

107 }	664 }

108

109 let Crawler = exports.Crawler = {};

110

111 Crawler.crawl = function(backendUrl, parallelTabs, window, callback)

112 {

113 if (Policy.processNode != origProcessNode)

114 return;

115

116 Policy.processNode = processNode;

117

118 siteTabs = new WeakMap();

119 currentTabs = 0;

120

121 Storage.init();

122

123 Client.fetchCrawlableSites(backendUrl, function(sites)

124 {

125 loadSites(backendUrl, parallelTabs, window, sites, function()

126 {

127 Policy.processNode = origProcessNode;

128 siteTabs = null;

129 callback();

130 });

131 });

132 };

OLD	NEW

« no previous file with comments | « lib/counter_task.js ('k') | lib/encoding.js » ('j') | no next file with comments »