lib/crawler.js - Issue 9615013: Crawler, first version

Side by Side Diff: lib/crawler.js

Issue 9615013: Crawler, first version (Closed)

Patch Set: Created March 6, 2013, 4:05 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 Cu.import("resource://gre/modules/Services.jsm");	1 Cu.import( "resource://gre/modules/Services.jsm" );

2	2

3 function abprequire(module)	3 function abprequire( module )

4 {	4 {

5 let result = {};	5 let result = {};

6 result.wrappedJSObject = result;	6 result.wrappedJSObject = result;

7 Services.obs.notifyObservers(result, "adblockplus-require", module);	7 Services.obs.notifyObservers( result, "adblockplus-require", module );

8 return result.exports;	8 return result.exports;

9 }	9 }

10	10

11 let {Storage} = require("storage");	11 let {Client} = require( "client" );

12 let {Client} = require("client");	12 let {Browser_Tab,Tabbed_Browser} = require( "browser" );

13	13 let {Encoding} = require( "encoding" );

14 let {Policy} = abprequire("contentPolicy");	14 let {Logger} = require( "logger" );

15 let {Filter} = abprequire("filterClasses");	15

16 let {Utils} = abprequire("utils");	16 let {Policy} = abprequire( "contentPolicy" );

17	17 let {RequestNotifier} = abprequire( "requestNotifier" );

18 let origProcessNode = Policy.processNode;	18 let {Filter} = abprequire( "filterClasses" );

19	19 let {Utils} = abprequire( "utils" );

20 let siteTabs;	20 let {Observation} = require( "instruction" );

21 let currentTabs;	21

22	22 //-------------------------------------------------------

23 function processNode(wnd, node, contentType, location, collapse)	23 // Shim

24 {	24 //-------------------------------------------------------

25 let result = origProcessNode.apply(this, arguments);	25 /**

26 let url = (contentType === Policy.type.ELEMHIDE) ? location.text :	26 * Manager for shim replacement of an external function.

27 location.spec;	27 * <p/>

28	28 * Since there's no lvalue reference type in JavaScript (non-primitives are all reference types, but they are rvalue

29 let topWindow = wnd.top;	29 * references), the arguments here provide a substitute. The reference is the expression 'object[ property ]'.

30 if (!topWindow.document)	30 *

31 {	31 * @param {Object} original_object

32 Cu.reportError("No document associated with the node's top window");	32 * The object that holds the function whose call and return are to be surrounded by the shim.

	33 * @param {String} original_property

	34 * The name of the property of 'original_object' that holds the function to be shimmed.

	35 * @constructor

	36 */

	37 var Shim = function( original_object, original_property )

	38 {

	39 /**

	40 * @type {Object}

	41 */

	42 this.original_object = original_object;

	43 /**

	44 * @type {String}

	45 */

	46 this.original_property = original_property;

	47

	48 /**

	49 * The original function as it exists at the time of instantiation. This means that generally the Shim instance

	50 * should be created as soon as possible, such as in module initialization.

	51 */

	52 this.original_function = original_object[ original_property ];

	53 };

	54

	55 /**

	56 * @return {boolean}

	57 */

	58 Shim.prototype.is_original = function()

	59 {

	60 return (this.original_object[ this.original_property ] === this.original_fun ction);

	61 };

	62

	63 /**

	64 *

	65 * @param {Function} replacer

	66 * The replacement function transformer. Takes the original function as an argument and returns its replacement.

	67 */

	68 Shim.prototype.replace = function( replacer )

	69 {

	70 if ( !replacer )

	71 throw "Must supply a function transformer to supply a replacement functi on.";

	72 if ( !this.is_original() )

	73 throw "This version of Shim does not support multiple replacement.";

	74 this.original_object[ this.original_property ] = replacer( this.original_fun ction );

	75 return this.original_function;

	76 };

	77

	78 /**

	79 * Reset the original function to a non-replaced state.

	80 * <p/>

	81 * May be called correctly even if the original has never been replaced.

	82 */

	83 Shim.prototype.reset = function()

	84 {

	85 this.original_object[ this.original_property ] = this.original_function;

	86 };

	87

	88 /**

	89 * Close out the shim and release resources.

	90 */

	91 Shim.prototype.close = function()

	92 {

	93 this.reset();

	94 /*

	95 * At present, this class does not use external resources that aren't dealt with by 'reset()'. That could change,

	96 * however, and so we use close() as the substitute-destructor and reset() for ordinary use.

	97 */

	98 };

	99

	100 /**

	101 * Shim instance for 'processNode'. As of this writing it's the only function in ABP we're shimming.

	102 */

	103 var process_node_shim = new Shim( Policy, "processNode" );

	104

	105 //-------------------------------------------------------

	106 // Crawler

	107 //-------------------------------------------------------

	108 /**

	109 * Constructor for a single crawl session. The crawler iterates through each instruction, loading its URL in a tab,

	110 * running the hooks present in the processor, and storing results accordingly.

	111 *

	112 * @param {Instruction_Set} instructions

	113 * Instruction generator yields a sequence of tuples: URL to crawl, a processor, and storage.

	114 * @param {*} outputs

	115 * @param {*} display

	116 * @param {Window} window

	117 * The top window we're operating in. Must be present as an argument because the module context this class is

	118 * defined in does not have a window. (Or at least should not be relied upon.)

	119 * @param {boolean} leave_open

	120 * @param {number} number_of_tabs

	121 */

	122 var Crawler = function( instructions, outputs, display, window, leave_open, numb er_of_tabs, progress )

	123 {

	124 /**

	125 * @type {Instruction_Set}

	126 */

	127 this.instructions = instructions;

	128

	129 this.outputs = outputs;

	130

	131 if ( !display )

	132 {

	133 throw "No ability to provide a null display object"

	134 }

	135 /**

	136 * Display object for showing progress messages.

	137 * @type {*}

	138 */

	139 this.display = display;

	140

	141 /**

	142 * Browser window in which to open tabs. Required because, as a module, we don't have a 'Window' object available.

	143 * @type {Window}

	144 */

	145 this.window = window;

	146

	147 this.leave_open = leave_open;

	148

	149 if ( number_of_tabs <= 0 )

	150 {

	151 /*

	152 * Defensive. The caller should have already validated this argument.

	153 */

	154 number_of_tabs = 1;

	155 }

	156

	157 this.progress = progress;

	158

	159 if ( !process_node_shim.is_original() )

	160 throw "Function 'processNode' is already shimmed. We may not insert a se cond one.";

	161 process_node_shim.replace(

	162 function( original )

	163 {

	164 return this.node_action.bind( this, original );

	165 }.bind( this )

	166 );

	167

	168 /**

	169 * Logging service.

	170 * @type {Logger}

	171 */

	172 this.logger = new Logger( "Crawler" );

	173

	174 this.tabbed_browser = new Tabbed_Browser( this.window, number_of_tabs );

	175

	176 /**

	177 * Closed flag. Needed to terminate the generator if this object is closed before the generator stops.

	178 * @type {Boolean}

	179 */

	180 this.closed = false;

	181

	182 /**

	183 * @type {RequestNotifier}

	184 */

	185 this.requestNotifier = new RequestNotifier( null, this.node_entry_action.bin d( this ) );

	186

	187 /**

	188 * The current nodes that are active in a call to 'node_action'. In ordinary cases, this map has at most the

	189 * maximum number of concurrent loads.

	190 * @type {WeakMap}

	191 */

	192 this.current_nodes = new WeakMap();

	193

	194 this.progress_stats = {

	195 active: 0,

	196 completed: 0

	197 };

	198 };

	199 exports.Crawler = Crawler;

	200

	201 Crawler.prototype.toJSON = function()

	202 {

	203 return {

	204 instructions: this.instructions,

	205 storage: this.storage

	206 };

	207 };

	208

	209 /**

	210 * Close the present instance. This object holds browser resources because of the browser tabs it holds open.

	211 */

	212 Crawler.prototype.close = function()

	213 {

	214 for ( let j = 0 ; j < this.outputs.length ; ++j )

	215 {

	216 this.outputs[j].storage.close();

	217 }

	218 if ( this.tabbed_browser ) this.tabbed_browser.close();

	219 if ( this.requestNotifier ) this.requestNotifier.shutdown();

	220 process_node_shim.reset();

	221 this.closed = true;

	222 };

	223

	224 /**

	225 * The output encoding for the session as a whole.

	226 * @type {*}

	227 */

	228 Crawler.prototype.__encoding__ = Encoding.as_object( [

	229 // prelude

	230 Encoding.immediate_fields( ["time_start", "instructions"] ),

	231 // observation

	232 Encoding.field( "trials", Encoding.array_stream() ),

	233 // postlude

	234 Encoding.immediate_fields( ["time_finish", "termination"] )

	235 ] );

	236

	237 /**

	238 * Task generator for the crawler

	239 *

	240 * @param {Function} pause

	241 * @param {Function} resume

	242 */

	243 Crawler.prototype.generator = function( pause, resume )

	244 {

	245 var log = this.logger.make_log( "task" );

	246 var tab = null;

	247

	248 var runaway_counter = 0;

	249

	250 try

	251 {

	252 /*

	253 * Preparation code. Ensure that every initialization here can be reversed in the 'finally' clause whether

	254 * or not it executed, in case some initialization throws an exception.

	255 */

	256 this.time_start = Logger.timestamp();

	257

	258 var multiple = new Encoding.Multiple_Format();

	259 for ( let j = 0 ; j < this.outputs.length ; ++j )

	260 {

	261 let output = this.outputs[j];

	262 let formatter = new Encoding[ output.encode ]( output.storage.writer () );

	263 multiple.add( formatter );

	264 }

	265 this.encoder = new Encoding.Format_stream( multiple );

	266

	267 this.encoder.write( this );

	268 this.encoder.sequence_start();

	269

	270 let gen = this.instructions.generator();

	271 let instruction = null; // Avoid spurious IDEA warning

	272 for ( instruction of gen )

	273 {

	274 if ( this.closed )

	275 //noinspection ExceptionCaughtLocallyJS

	276 throw StopIteration;

	277

	278 if ( this.tabbed_browser.available() )

	279 {

	280 /*

	281 * Since we'll need a variety of browser-tab behaviors, we'll need to change this factory call

	282 * to something dependent upon the instruction.

	283 */

	284 tab = this.tabbed_browser.make_tab( this.leave_open );

	285 tab.instruction = instruction;

	286 instruction.begin();

	287 /*

	288 * The return value of load is an asynchronous action that could be combined with others, if the

	289 * instruction dictates. There's no hook for this yet, although that's the reason we do not immediately

	290 * execute on calling load.

	291 */

	292 tab.load( instruction.target ).go( this._land.bind( this, tab, r esume ), null );

	293 ++this.progress_stats.active;

	294 this.progress.notice( this.progress_stats );

	295 }

	296 if ( !this.tabbed_browser.available() )

	297 {

	298 pause();

	299 }

	300

	301 var cancelled = yield false;

	302 if ( cancelled )

	303 {

	304 this.display.log( "Crawler cancelled." );

	305 break;

	306 }

	307 }

	308 //this.alert( "Just finished main instruction loop." );

	309 /*

	310 * At this point in the code, we have launched all the instructions. If we're using more than one tab,

	311 * we'll generally have open tabs still. We need to pause until we have no more tabs left open.

	312 */

	313 while ( !this.tabbed_browser.quiescent() )

	314 {

	315 pause();

	316 // Must yield after pause() for it to take effect

	317 cancelled = yield false;

	318 if ( cancelled )

	319 {

	320 this.display.log( "Crawler cancelled." );

	321 break;

	322 }

	323 ++runaway_counter;

	324 if ( runaway_counter > 100 )

	325 {

	326 Cu.reportError( "Runaway pause loop." );

	327 break;

	328 }

	329 }

	330

	331 /*

	332 * OK. Finally done.

	333 */

	334 this.termination = "ordinary";

	335 }

	336 catch ( e if e instanceof Error )

	337 {

	338 log( e.toString() + "\n\n" + e.stack );

	339 this.termination = "Error";

	340 }

	341 catch ( e )

	342 {

	343 log( e.toString() + " - type: " + Object.prototype.toString.call( e ) ) ;

	344 this.termination = "Unknown exception";

	345 }

	346 finally

	347 {

	348 /*

	349 * Finish writing the output before closing ourselves down.

	350 */

	351 if ( !( "termination" in this) )

	352 {

	353 this.termination = "Success";

	354 }

	355 this.time_finish = Logger.timestamp();

	356 this.encoder.sequence_stop();

	357

	358 /*

	359 * If everything goes right, this cleanup should not be necessary, as tab instances are closed as they are used.

	360 * Nonetheless, if there's an error and a landing function is not called, this line ensures that all the tabs

	361 * are properly destroyed.

	362 */

	363 if ( tab ) tab.close();

	364 // Removes the ABP shim, amongst other things.

	365 this.close();

	366 }

	367 };

	368

	369 /**

	370 * Landing function for the asynchronous action of loading a tab. For some reason, Firefox is delivering the

	371 * STATE_STOP progress message before the last ABP filter is being run. It seems that it's firing events immediately,

	372 * once it knows the request has finished its HTTP transfer, but before it has fully finished loading the page as a

	373 * whole (the DOM, layout, etc.). Hence we let the browser finish its work in the current thread and run the actual

	374 * load-end action afterwards.

	375 * <p/>

	376 * The implementation of this function allows it to be defined without arguments. That's not what actually happens.

	377 * Since this function is just a scheduling pass-through, it uses 'arguments' to pass all arguments, no matter what they

	378 * are. (And no matter how they change over time.)

	379 */

	380 Crawler.prototype._land = function()

	381 {

	382 /*

	383 * The first argument is the 'this' object when 'apply' runs. The second argument is the 'this' object when

	384 * 'this._load_end_action' runs.

	385 */

	386 Utils.threadManager.currentThread.dispatch(

	387 { run: Function.prototype.apply.bind( this._load_end_action, this, argum ents )},

	388 Ci.nsIEventTarget.DISPATCH_NORMAL );

	389 };

	390

	391 /**

	392 * Action at the end of loading a tab.

	393 *

	394 * @param tab

	395 * @param {Function} resume

	396 */

	397 Crawler.prototype._load_end_action = function( tab, resume )

	398 {

	399 var instruction = tab.instruction;

	400 tab.instruction.end();

	401 tab.close();

	402 this.encoder.sequence_send( instruction );

	403 --this.progress_stats.active;

	404 ++this.progress_stats.completed;

	405 this.progress.notice( this.progress_stats );

	406 resume();

	407 };

	408

	409 /**

	410 * Shim for 'processNode' in ABP. Executes once for each node that ABP processes, whether or not it acts on that node.

	411 *

	412 * @param {Function} original_f

	413 * The original processNode function.

	414 * @param {nsIDOMWindow} wnd

	415 * @param {nsIDOMElement} node

	416 * @param {Number} contentType

	417 * @param {nsIURI} location

	418 * @param {Boolean} collapse

	419 * true to force hiding of the node

	420 * @return {Boolean} false if the node should be blocked

	421 */

	422 Crawler.prototype.node_action = function( original_f, wnd, node, contentType, lo cation, collapse )

	423 {

	424 //var log = this.logger.make_log( "node_action" );

	425

	426 /*

	427 * Set up collecting for node_entry_action(). It should be the case that a node matches either 0 or 1 filters.

	428 * The collection array 'entries' allows more than 1 to be recorded, and for such activity to be detected and

	429 * reported rather than inducing an observation error.

	430 */

	431 var entries = [];

	432 var entry_hook = function( node, windows, entry )

	433 {

	434 entries.push( { node: node, windows: windows, entry: entry } );

	435 };

	436 this.current_nodes.set( node, entry_hook );

	437

	438 /*

	439 * Call the original processNode. If the original throws, then we will too, so this is outside a try clause.

	440 */

	441 var result = original_f( wnd, node, contentType, location, collapse );

	442

	443 try

	444 {

	445 let instruction = null; // Initialize here in case locate_instructio n() throws.

	446 try

	447 {

	448 instruction = this.locate_instruction( wnd );

	449 }

	450 catch ( e )

	451 {

	452 Cu.reportError( "Crawler/node_action: error locating instruction: " + e.toString()

	453 + ( ( "stack" in e ) ? ( "\n\tstack = " + e.stack) : "" )

	454 );

	455 return result;

	456 }

	457 if ( !instruction )

	458 {

	459 /*

	460 * If we don't have an instruction, we don't report this node. This is by design, because reporting is

	461 * the responsibility of the instruction object.

	462 */

	463 return result;

	464 }

	465 if ( entries.length == 0 && !instruction.observing_all_nodes() )

	466 {

	467 // Assert we didn't touch this node and the instruction doesn't want to see it

	468 return result;

	469 }

	470 try

	471 {

	472 var observation = new Observation(

	473 !result, contentType,

	474 (contentType == Policy.type.ELEMHIDE) ? location.text : location .spec,

	475 entries

	476 );

	477 instruction.observe_node( observation );

	478 }

	479 catch ( e )

	480 {

	481 Cu.reportError( "Crawler/node_action: error recording observation: " + e.toString() );

	482 return result;

	483 }

	484 }

	485 finally

	486 {

	487 /*

	488 * This 'finally' clause ensures that we remove the node from 'this.current_nodes'. Even though it's a weak map,

	489 * we need to remove the key so that 'entry_hook' is not called inadvertently.

	490 */

	491 this.current_nodes.delete( node );

	492 }

33 return result;	493 return result;

34 }	494 };

35	495

36 let tabbrowser = Utils.getChromeWindow(topWindow).gBrowser;	496 /**

37 if (!tabbrowser)	497 * Locate our instruction associated with a window that caused to load. First we find the browser associated with the

38 {	498 * window. There should always be one of these, otherwise we have an error. From the browser, we locate our tab

39 Cu.reportError("Unable to get a tabbrowser reference");	499 * associated with it, which need not be present. Finally, we locate the instruction as a tab member, which should

40 return result;	500 * always exist.

41 }	501 * <p/>

42	502 * This is called only in node_action(). It's separate to simplify the control flow.

43 let browser = tabbrowser.getBrowserForDocument(topWindow.document);	503 *

44 if (!browser)	504 * @param window

45 {	505 * @return {Instruction_class}

46 Cu.reportError("Unable to get browser for the tab");	506 */

47 return result;	507 Crawler.prototype.locate_instruction = function( window )

48 }	508 {

49	509 let topWindow = window.top;

50 let site = siteTabs.get(browser);	510 if ( !topWindow.document )

51 let filtered = !result;	511 throw new Error( "No document associated with the node's top window" );

52 Storage.write([url, site, filtered]);	512 let tabbrowser = Utils.getChromeWindow( topWindow ).gBrowser;

53 return result;	513 if ( !tabbrowser )

	514 throw new Error( "Unable to get a tabbrowser reference from the window" );

	515 let browser = tabbrowser.getBrowserForDocument( topWindow.document );

	516 if ( !browser )

	517 throw new Error( "Unable to get browser for the tab" );

	518 if ( !this.tabbed_browser.map_browser_to_child.has( browser ) )

	519 {

	520 /*

	521 * It's not an error for the browser not to appear in this map. If the tab remains open past the time

	522 * we are monitoring (either on purpose or as the result of a quirk of timing), we simply return a null

	523 * instruction. Nevertheless, the code to report this to the console remains in place, commented out, because

	524 * it's likely to be relevant still during development.

	525 */

	526 // Cu.reportError(

	527 // "Crawler.node_action: Browser not found in internal map. " + Log ger.timestamp()

	528 // + "\nlocation=" + url_location

	529 // );

	530 // this.logger.stack_trace();

	531 return null;

	532 }

	533 var tab = this.tabbed_browser.map_browser_to_child.get( browser ).child;

	534 if ( !("instruction" in tab) )

	535 throw new Error( "'instruction' not found as member of tab object" );

	536 return tab.instruction;

	537 };

	538

	539 /**

	540 * This function executes solely underneath (in the call stack) 'node_action'. It receives at least one call per node,

	541 * more if there are matches on rules of any kind.

	542 *

	543 * @param window

	544 * @param node

	545 * @param {RequestEntry} entry

	546 */

	547 Crawler.prototype.node_entry_action = function( window, node, entry )

	548 {

	549 if ( !this.current_nodes.has( node ) )

	550 {

	551 Cu.reportError( "node_entry_action: node not seen in 'current_nodes'" );

	552 return;

	553 }

	554 if ( !entry.filter )

	555 {

	556 /*

	557 * If there's no filter in the entry, then nothing happened to it. We are presently ignoring such entries. In

	558 * the future, however, we will likely want a hook here to process entries that are not associated with any

	559 * filter, for example, to ensure that necessary content is not blocked inadvertently.

	560 */

	561 return;

	562 }

	563 var windows = [];

	564 var n = 0;

	565 while ( window != null )

	566 {

	567 if ( ++n > 100 )

	568 {

	569 // Houston, we have a problem.

	570 windows = null;

	571 Cu.reportError( "Crawler/node_entry_action: runaway window chain" );

	572 break;

	573 }

	574 windows.push( window );

	575 if ( window === window.parent )

	576 {

	577 // This is the ordinary statement to exit the loop.

	578 break;

	579 }

	580 window = window.parent;

	581 }

	582 this.current_nodes.get( node )( node, windows, entry );

	583 };

	584

	585

	586 function shutdown_crawler()

	587 {

	588 process_node_shim.close();

54 }	589 }

55	590

56 function loadSite(site, window, callback)	591 try

57 {	592 {

58 if (!site)	593 onShutdown.add( shutdown_crawler );

59 return;

60

61 let tabbrowser = window.gBrowser;

62 let tab = tabbrowser.addTab(site);

63 let browser = tabbrowser.getBrowserForTab(tab);

64

65 siteTabs.set(browser, site);

66

67 let progressListener = {

68 onStateChange: function(aBrowser, aWebProgress, aRequest, aStateFlags, aStat us)

69 {

70 if (browser !== aBrowser)

71 return;

72

73 if (!(aStateFlags & Ci.nsIWebProgressListener.STATE_STOP))

74 return;

75

76 tabbrowser.removeTabsProgressListener(progressListener);

77 tabbrowser.removeTab(tab);

78 callback();

79 }

80 };

81 tabbrowser.addTabsProgressListener(progressListener);

82 }	594 }

83	595 catch ( e )

84 function loadSites(backendUrl, parallelTabs, window, sites, callback)	596 {

85 {	597 Cu.reportError( "Failure adding shutdown function. error = \"" + e.message + "\"" );

86 while (currentTabs < parallelTabs && sites.length)

87 {

88 currentTabs++;

89 let site = sites.shift();

90 loadSite(site, window, function()

91 {

92 currentTabs--;

93 if (!sites.length && !currentTabs)

94 {

95 Storage.finish();

96 let dataFilePath = Storage.dataFile.path;

97 Client.sendCrawlerDataFile(backendUrl, dataFilePath, function()

98 {

99 Storage.destroy();

100 callback();

101 });

102 }

103 else

104 loadSites(backendUrl, parallelTabs, window, sites, callback);

105 });

106 }

107 }	598 }

108

109 let Crawler = exports.Crawler = {};

110

111 Crawler.crawl = function(backendUrl, parallelTabs, window, callback)

112 {

113 if (Policy.processNode != origProcessNode)

114 return;

115

116 Policy.processNode = processNode;

117

118 siteTabs = new WeakMap();

119 currentTabs = 0;

120

121 Storage.init();

122

123 Client.fetchCrawlableSites(backendUrl, function(sites)

124 {

125 loadSites(backendUrl, parallelTabs, window, sites, function()

126 {

127 Policy.processNode = origProcessNode;

128 siteTabs = null;

129 callback();

130 });

131 });

132 };

OLD	NEW

« .hgignore ('K') | « lib/counter_task.js ('k') | lib/encoding.js » ('j') | no next file with comments »