Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 9615013: Crawler, first version (Closed)
Patch Set: Created March 6, 2013, 4:05 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
1 Cu.import("resource://gre/modules/Services.jsm"); 1 Cu.import( "resource://gre/modules/Services.jsm" );
2 2
3 function abprequire(module) 3 function abprequire( module )
4 { 4 {
5 let result = {}; 5 let result = {};
6 result.wrappedJSObject = result; 6 result.wrappedJSObject = result;
7 Services.obs.notifyObservers(result, "adblockplus-require", module); 7 Services.obs.notifyObservers( result, "adblockplus-require", module );
8 return result.exports; 8 return result.exports;
9 } 9 }
10 10
11 let {Storage} = require("storage"); 11 let {Client} = require( "client" );
12 let {Client} = require("client"); 12 let {Browser_Tab,Tabbed_Browser} = require( "browser" );
13 13 let {Encoding} = require( "encoding" );
14 let {Policy} = abprequire("contentPolicy"); 14 let {Logger} = require( "logger" );
15 let {Filter} = abprequire("filterClasses"); 15
16 let {Utils} = abprequire("utils"); 16 let {Policy} = abprequire( "contentPolicy" );
17 17 let {RequestNotifier} = abprequire( "requestNotifier" );
18 let origProcessNode = Policy.processNode; 18 let {Filter} = abprequire( "filterClasses" );
19 19 let {Utils} = abprequire( "utils" );
20 let siteTabs; 20 let {Observation} = require( "instruction" );
21 let currentTabs; 21
22 22 //-------------------------------------------------------
23 function processNode(wnd, node, contentType, location, collapse) 23 // Shim
24 { 24 //-------------------------------------------------------
25 let result = origProcessNode.apply(this, arguments); 25 /**
26 let url = (contentType === Policy.type.ELEMHIDE) ? location.text : 26 * Manager for shim replacement of an external function.
27 location.spec; 27 * <p/>
28 28 * Since there's no lvalue reference type in JavaScript (non-primitives are all reference types, but they are rvalue
29 let topWindow = wnd.top; 29 * references), the arguments here provide a substitute. The reference is the ex pression 'object[ property ]'.
30 if (!topWindow.document) 30 *
31 { 31 * @param {Object} original_object
32 Cu.reportError("No document associated with the node's top window"); 32 * The original function whose call and return are to be surrounded by the shim.
33 * @param {String} original_property
34 * The name of the property of 'original_object' that holds the function to be surrounded by the shim.
35 * @constructor
36 */
var Shim = function( original_object, original_property )
{
    /**
     * The object that owns the function being shimmed.
     * @type {Object}
     */
    this.original_object = original_object;
    /**
     * The name of the property of 'original_object' that holds the function being shimmed.
     * @type {String}
     */
    this.original_property = original_property;

    /**
     * The original function as it exists at the time of instantiation. This means that generally the Shim instance
     * should be created as soon as possible, such as in module initialization.
     * @type {Function}
     */
    this.original_function = original_object[ original_property ];
};

/**
 * Predicate: does the shimmed property currently hold the function captured at instantiation?
 *
 * @return {boolean}
 */
Shim.prototype.is_original = function()
{
    return ( this.original_object[ this.original_property ] === this.original_function );
};

/**
 * Install a replacement for the original function.
 *
 * @param {Function} replacer
 *      The replacement function transformer. Takes the original function as an argument and returns its replacement.
 * @return {Function}
 *      The original function.
 * @throws {Error}
 *      If 'replacer' is not a function, if a replacement is already installed, or if 'replacer' does not return a
 *      function. In all of these cases the shimmed property is left untouched.
 */
Shim.prototype.replace = function( replacer )
{
    if ( typeof replacer !== "function" )
        throw new Error( "Must supply a function transformer to supply a replacement function." );
    if ( !this.is_original() )
        throw new Error( "This version of Shim does not support multiple replacement." );
    /*
     * Validate the replacement before assigning it, so that a faulty transformer cannot leave a non-callable value
     * in the place of the original function.
     */
    var replacement = replacer( this.original_function );
    if ( typeof replacement !== "function" )
        throw new Error( "The function transformer must return a replacement function." );
    this.original_object[ this.original_property ] = replacement;
    return this.original_function;
};

/**
 * Reset the original function to a non-replaced state.
 * <p/>
 * May be called correctly even if the original has never been replaced.
 */
Shim.prototype.reset = function()
{
    this.original_object[ this.original_property ] = this.original_function;
};

/**
 * Close out the shim and release resources.
 */
Shim.prototype.close = function()
{
    this.reset();
    /*
     * At present, this class does not use external resources that aren't dealt with by 'reset()'. That could change,
     * however, and so we use close() as the substitute-destructor and reset() for ordinary use.
     */
};
99
/**
 * Shim instance for 'processNode'. As of this writing it's the only function in ABP we're shimming.
 * <p/>
 * Created at module initialization so that the function captured as "original" is the one present on 'Policy' at
 * load time.
 */
var process_node_shim = new Shim( Policy, "processNode" );
104
105 //-------------------------------------------------------
106 // Crawler
107 //-------------------------------------------------------
/**
 * Constructor for a single crawl session. The crawler iterates through each instruction, loading its URL in a tab,
 * running the hooks present in the processor, and storing results accordingly.
 * <p/>
 * Constructing a crawler installs the 'processNode' shim; only one crawler may hold it at a time. The shim is removed
 * again by 'close()'.
 *
 * @param {Instruction_Set} instructions
 *      Instruction generator yields a sequence of tuples: URL to crawl, a processor, and storage.
 * @param {*} outputs
 *      Array-like of output descriptors. Each is used through its 'storage' member (and, in the task generator, its
 *      'encode' member).
 * @param {*} display
 *      Display object for showing progress messages. Must not be null.
 * @param {Window} window
 *      The top window we're operating in. Must be present as an argument because the module context this class is
 *      defined in does not have a window. (Or at least should not be relied upon.)
 * @param {boolean} leave_open
 *      Passed to the tab factory when a tab is made for an instruction.
 * @param {number} number_of_tabs
 *      Number of tabs handed to the tabbed browser. Anything that is not a positive number is replaced by 1.
 * @param {*} progress
 *      Progress sink; its 'notice()' method receives the progress statistics whenever they change.
 * @throws {Error}
 *      If 'display' is missing or if 'processNode' is already shimmed.
 */
var Crawler = function( instructions, outputs, display, window, leave_open, number_of_tabs, progress )
{
    if ( !display )
    {
        throw new Error( "No ability to provide a null display object" );
    }
    /*
     * Fail fast, before any browser resources are allocated, if another crawler still holds the shim.
     */
    if ( !process_node_shim.is_original() )
    {
        throw new Error( "Function 'processNode' is already shimmed. We may not insert a second one." );
    }
    if ( !( number_of_tabs > 0 ) )
    {
        /*
         * Defensive. The caller should have already validated this argument. Written as a negated '>' so that
         * 'undefined' and NaN are caught as well as zero and negative values.
         */
        number_of_tabs = 1;
    }

    /**
     * @type {Instruction_Set}
     */
    this.instructions = instructions;

    this.outputs = outputs;

    /**
     * Display object for showing progress messages.
     * @type {*}
     */
    this.display = display;

    /**
     * Browser window in which to open tabs. Required because, as a module, we don't have a 'Window' object available.
     * @type {Window}
     */
    this.window = window;

    this.leave_open = leave_open;

    this.progress = progress;

    /**
     * Logging service.
     * @type {Logger}
     */
    this.logger = new Logger( "Crawler" );

    this.tabbed_browser = new Tabbed_Browser( this.window, number_of_tabs );

    /**
     * Closed flag. Needed to terminate the generator if this object is closed before the generator stops.
     * @type {Boolean}
     */
    this.closed = false;

    /**
     * @type {RequestNotifier}
     */
    this.requestNotifier = new RequestNotifier( null, this.node_entry_action.bind( this ) );

    /**
     * The current nodes that are active in a call to 'node_action'. In ordinary cases, this map has at most the
     * maximum number of concurrent loads.
     * @type {WeakMap}
     */
    this.current_nodes = new WeakMap();

    this.progress_stats = {
        active: 0,
        completed: 0
    };

    /*
     * Install the shim last. Everything 'node_action' relies upon exists by now, and an exception thrown by any of
     * the constructors above cannot leave 'processNode' shimmed with no instance left to remove it.
     */
    process_node_shim.replace(
        function( original )
        {
            return this.node_action.bind( this, original );
        }.bind( this )
    );
};
199 exports.Crawler = Crawler;
200
/**
 * JSON representation of the crawl session: its instructions and its storage.
 * <p/>
 * NOTE(review): 'this.storage' is not assigned anywhere in the visible source (the constructor stores 'outputs'), so
 * the 'storage' member is presumably always undefined here. Confirm whether 'outputs' was intended.
 *
 * @return {{instructions: *, storage: *}}
 */
Crawler.prototype.toJSON = function()
{
    return {
        instructions: this.instructions,
        storage: this.storage
    };
};
208
/**
 * Close the present instance. This object holds browser resources because of the browser tabs it holds open.
 * <p/>
 * Closes the storage of every output, the tabbed browser and the request notifier, and removes the 'processNode'
 * shim. Closing is idempotent: the task generator closes the instance in its 'finally' clause, which may happen after
 * somebody else has already closed it. A repeated call must not close resources twice, nor remove a shim that a
 * newer crawler instance has installed in the meantime.
 */
Crawler.prototype.close = function()
{
    if ( this.closed )
    {
        return;
    }
    this.closed = true;
    try
    {
        for ( let j = 0 ; j < this.outputs.length ; ++j )
        {
            this.outputs[j].storage.close();
        }
        if ( this.tabbed_browser ) this.tabbed_browser.close();
        if ( this.requestNotifier ) this.requestNotifier.shutdown();
    }
    finally
    {
        /*
         * The shim must come off even if closing one of the resources above throws, otherwise no further crawler
         * could ever be constructed.
         */
        process_node_shim.reset();
    }
};
223
/**
 * The output encoding for the session as a whole: two immediate fields as a prelude, the stream of trials, and two
 * immediate fields as a postlude.
 * @type {*}
 */
Crawler.prototype.__encoding__ = Encoding.as_object( [
    // prelude
    Encoding.immediate_fields( ["time_start", "instructions"] ),
    // observation
    Encoding.field( "trials", Encoding.array_stream() ),
    // postlude
    Encoding.immediate_fields( ["time_finish", "termination"] )
] );
236
237 /**
238 * Task generator for the crawler
239 *
240 * @param {Function} pause
241 * @param {Function} resume
242 */
243 Crawler.prototype.generator = function( pause, resume )
244 {
245 var log = this.logger.make_log( "task" );
246 var tab = null;
247
248 var runaway_counter = 0;
249
250 try
251 {
252 /*
253 * Preparation code. Ensure that every initialization here can be revers ed in the 'finally' clause whether
254 * or not it executed, in case some initialization throws an exception.
255 */
256 this.time_start = Logger.timestamp();
257
258 var multiple = new Encoding.Multiple_Format();
259 for ( let j = 0 ; j < this.outputs.length ; ++j )
260 {
261 let output = this.outputs[j];
262 let formatter = new Encoding[ output.encode ]( output.storage.writer () );
263 multiple.add( formatter );
264 }
265 this.encoder = new Encoding.Format_stream( multiple );
266
267 this.encoder.write( this );
268 this.encoder.sequence_start();
269
270 let gen = this.instructions.generator();
271 let instruction = null; // Avoid spurious IDEA warning
272 for ( instruction of gen )
273 {
274 if ( this.closed )
275 //noinspection ExceptionCaughtLocallyJS
276 throw StopIteration;
277
278 if ( this.tabbed_browser.available() )
279 {
280 /*
281 * Since we'll need a variety of browser-tab behaviors, we'll ne ed to change this factory call
282 * to something dependent upon the instruction.
283 */
284 tab = this.tabbed_browser.make_tab( this.leave_open );
285 tab.instruction = instruction;
286 instruction.begin();
287 /*
288 * The return value of load is an asynchronous action that could be combined with others, if the
289 * instruction dictates. There's no hook for this yet, although that's the reason we do not immediately
290 * execute on calling load.
291 */
292 tab.load( instruction.target ).go( this._land.bind( this, tab, r esume ), null );
293 ++this.progress_stats.active;
294 this.progress.notice( this.progress_stats );
295 }
296 if ( !this.tabbed_browser.available() )
297 {
298 pause();
299 }
300
301 var cancelled = yield false;
302 if ( cancelled )
303 {
304 this.display.log( "Crawler cancelled." );
305 break;
306 }
307 }
308 //this.alert( "Just finished main instruction loop." );
309 /*
310 * At this point in the code, we have launched all the instructions. If we're using more than one tab,
311 * we'll generally have open tabs still. We need to pause until we have no more tabs left open.
312 */
313 while ( !this.tabbed_browser.quiescent() )
314 {
315 pause();
316 // Must yield after pause() for it to take effect
317 cancelled = yield false;
318 if ( cancelled )
319 {
320 this.display.log( "Crawler cancelled." );
321 break;
322 }
323 ++runaway_counter;
324 if ( runaway_counter > 100 )
325 {
326 Cu.reportError( "Runaway pause loop." );
327 break;
328 }
329 }
330
331 /*
332 * OK. Finally done.
333 */
334 this.termination = "ordinary";
335 }
336 catch ( e if e instanceof Error )
337 {
338 log( e.toString() + "\n\n" + e.stack );
339 this.termination = "Error";
340 }
341 catch ( e )
342 {
343 log( e.toString() + " - type: " + Object.prototype.toString.call( e ) ) ;
344 this.termination = "Unknown exception";
345 }
346 finally
347 {
348 /*
349 * Finish writing the output before closing ourselves down.
350 */
351 if ( !( "termination" in this) )
352 {
353 this.termination = "Success";
354 }
355 this.time_finish = Logger.timestamp();
356 this.encoder.sequence_stop();
357
358 /*
359 * If everything goes right, this cleanup should not be necessary, as ta b instances are closed as they are used.
360 * Nonetheless, if there's an error and a landing function is not called , this line ensures that all the tabs
361 * are properly destroyed.
362 */
363 if ( tab ) tab.close();
364 // Removes the ABP shim, amongst other things.
365 this.close();
366 }
367 };
368
/**
 * Landing function for the asynchronous action of loading a tab. For some reason, Firefox is delivering the
 * STATE_STOP progress message before the last ABP filter is being run. It seems that it's firing events immediately,
 * once it knows the request has finished its HTTP transfer, but before it has fully finished loading the page as a
 * whole (the DOM, layout, etc.). Hence we let the browser finish its work in the current thread and run the actual
 * load-end action afterwards.
 * <p/>
 * The implementation of this function allows it to be defined without arguments. That's not what actually happens.
 * Since this function is just a scheduling pass-through, it uses 'arguments' to pass all arguments, no matter what
 * they are. (And no matter how they change over time.)
 */
Crawler.prototype._land = function()
{
    /*
     * 'apply' is bound so that, when the runnable executes, it invokes 'this._load_end_action' with this crawler as
     * its 'this' object and with the arguments that were passed to '_land'. The first argument to 'bind' is the
     * 'this' object when 'apply' runs. The second argument is the 'this' object when 'this._load_end_action' runs.
     */
    Utils.threadManager.currentThread.dispatch(
        { run: Function.prototype.apply.bind( this._load_end_action, this, arguments ) },
        Ci.nsIEventTarget.DISPATCH_NORMAL );
};
390
/**
 * Action at the end of loading a tab: ends the tab's instruction, closes the tab, sends the instruction to the
 * encoder, updates the progress statistics and resumes the task.
 * <p/>
 * The tab is closed and the task is resumed even if ending or encoding the instruction throws. Otherwise a single
 * failing instruction would leave its tab open and the paused task would never run again.
 *
 * @param tab
 *      The tab whose load has finished. Its 'instruction' member is the instruction it was loaded for.
 * @param {Function} resume
 *      Resumes the (possibly paused) task generator.
 */
Crawler.prototype._load_end_action = function( tab, resume )
{
    var instruction = tab.instruction;
    try
    {
        try
        {
            instruction.end();
        }
        finally
        {
            tab.close();
        }
        this.encoder.sequence_send( instruction );
    }
    finally
    {
        --this.progress_stats.active;
        ++this.progress_stats.completed;
        this.progress.notice( this.progress_stats );
        resume();
    }
};
408
/**
 * Shim for 'processNode' in ABP. Executes once for each node that ABP processes, whether or not it acts on that node.
 * <p/>
 * Failures while locating the instruction or recording the observation are reported to the console and never alter
 * the value returned to ABP. An exception thrown by the original 'processNode' propagates to the caller.
 *
 * @param {Function} original_f
 *      The original processNode function.
 * @param {nsIDOMWindow} wnd
 * @param {nsIDOMElement} node
 * @param {Number} contentType
 * @param {nsIURI} location
 * @param {Boolean} collapse
 *      true to force hiding of the node
 * @return {Boolean} false if the node should be blocked
 */
Crawler.prototype.node_action = function( original_f, wnd, node, contentType, location, collapse )
{
    /*
     * Set up collecting for node_entry_action(). It should be the case that a node matches either 0 or 1 filters.
     * The collection array 'entries' allows more than 1 to be recorded, and for such activity to be detected and
     * reported rather than inducing an observation error.
     */
    var entries = [];
    var entry_hook = function( node, windows, entry )
    {
        entries.push( { node: node, windows: windows, entry: entry } );
    };
    this.current_nodes.set( node, entry_hook );

    var result;
    try
    {
        /*
         * Call the original processNode. If the original throws, then we will too. The call is inside the 'try' only
         * so that the 'finally' clause below removes the node in that case as well; there is no 'catch' for it.
         */
        result = original_f( wnd, node, contentType, location, collapse );

        let instruction = null;     // Initialize here in case locate_instruction() throws.
        try
        {
            instruction = this.locate_instruction( wnd );
        }
        catch ( e )
        {
            /*
             * The 'in' operator throws a TypeError on primitives, and strings do get thrown in this code base, so
             * check that we have an object before probing it for a stack.
             */
            let has_stack = ( e !== null && typeof e === "object" && "stack" in e );
            Cu.reportError( "Crawler/node_action: error locating instruction: " + String( e )
                + ( has_stack ? ( "\n\tstack = " + e.stack ) : "" )
            );
            return result;
        }
        if ( !instruction )
        {
            /*
             * If we don't have an instruction, we don't report this node. This is by design, because reporting is
             * the responsibility of the instruction object.
             */
            return result;
        }
        if ( entries.length === 0 && !instruction.observing_all_nodes() )
        {
            // Assert we didn't touch this node and the instruction doesn't want to see it
            return result;
        }
        try
        {
            var observation = new Observation(
                !result, contentType,
                ( contentType === Policy.type.ELEMHIDE ) ? location.text : location.spec,
                entries
            );
            instruction.observe_node( observation );
        }
        catch ( e )
        {
            Cu.reportError( "Crawler/node_action: error recording observation: " + String( e ) );
            return result;
        }
    }
    finally
    {
        /*
         * This 'finally' clause ensures that we remove the node from 'this.current_nodes'. Even though it's a weak
         * map, we need to remove the key so that 'entry_hook' is not called inadvertently.
         */
        this.current_nodes.delete( node );
    }
    return result;
};
35 495
36 let tabbrowser = Utils.getChromeWindow(topWindow).gBrowser; 496 /**
37 if (!tabbrowser) 497 * Locate our instruction associated with a window that caused to load. First we find the browser associated with the
38 { 498 * window. There should always be one of these, otherwise we have an error. From the browser, we locate our tab
39 Cu.reportError("Unable to get a tabbrowser reference"); 499 * associated with it, which need not be present. Finally, we locate the instruc tion as a tab member, which should
40 return result; 500 * always exist.
41 } 501 * <p/>
42 502 * This is called only in node_action(). It's separate to simplify the control f low.
43 let browser = tabbrowser.getBrowserForDocument(topWindow.document); 503 *
44 if (!browser) 504 * @param window
45 { 505 * @return {Instruction_class}
46 Cu.reportError("Unable to get browser for the tab"); 506 */
47 return result; 507 Crawler.prototype.locate_instruction = function( window )
48 } 508 {
49 509 let topWindow = window.top;
50 let site = siteTabs.get(browser); 510 if ( !topWindow.document )
51 let filtered = !result; 511 throw new Error( "No document associated with the node's top window" );
52 Storage.write([url, site, filtered]); 512 let tabbrowser = Utils.getChromeWindow( topWindow ).gBrowser;
53 return result; 513 if ( !tabbrowser )
514 throw new Error( "Unable to get a tabbrowser reference from the window" );
515 let browser = tabbrowser.getBrowserForDocument( topWindow.document );
516 if ( !browser )
517 throw new Error( "Unable to get browser for the tab" );
518 if ( !this.tabbed_browser.map_browser_to_child.has( browser ) )
519 {
520 /*
521 * It's not an error for the browser not to appear in this map. If the t ab is remains open past the time
522 * we are monitoring (either on purpose or as the result of a quirk of t iming), we simply return a null
523 * instruction. Nevertheless, the code to report this to the console rem ains in place, commented out, because
524 * it's likely to be relevant still during development.
525 */
526 // Cu.reportError(
527 // "Crawler.node_action: Browser not found in internal map. " + Log ger.timestamp()
528 // + "\nlocation=" + url_location
529 // );
530 // this.logger.stack_trace();
531 return null;
532 }
533 var tab = this.tabbed_browser.map_browser_to_child.get( browser ).child;
534 if ( !("instruction" in tab) )
535 throw new Error( "'instruction' not found as member of tab object" );
536 return tab.instruction;
537 };
538
/**
 * This function executes solely underneath (in the call stack) 'node_action'. It receives at least one call per node,
 * more if there are matches on rules of any kind.
 * <p/>
 * It collects the chain of windows from 'window' up to the top window and passes it, with the node and the entry, to
 * the hook that 'node_action' registered for the node in 'current_nodes'.
 *
 * @param window
 *      The window to start the window chain from.
 * @param node
 *      The node being processed. Must currently be registered in 'current_nodes'.
 * @param {RequestEntry} entry
 *      Entries without a 'filter' member are ignored.
 */
Crawler.prototype.node_entry_action = function( window, node, entry )
{
    var entry_hook = this.current_nodes.get( node );
    if ( !entry_hook )
    {
        Cu.reportError( "node_entry_action: node not seen in 'current_nodes'" );
        return;
    }
    if ( !entry.filter )
    {
        /*
         * If there's no filter in the entry, then nothing happened to it. We are presently ignoring such entries. In
         * the future, however, we will likely want a hook here to process entries that are not associated with any
         * filter, for example, to ensure that necessary content is not blocked inadvertently.
         */
        return;
    }
    var windows = [];
    var n = 0;
    while ( window != null )
    {
        if ( ++n > 100 )
        {
            // Houston, we have a problem. Report it and hand 'null' to the hook instead of a partial chain.
            windows = null;
            Cu.reportError( "Crawler/node_entry_action: runaway window chain" );
            break;
        }
        windows.push( window );
        if ( window === window.parent )
        {
            // This is the ordinary statement to exit the loop: the top window is its own parent.
            break;
        }
        window = window.parent;
    }
    entry_hook( node, windows, entry );
};
584
585
/**
 * Shutdown function for this module: closes the 'processNode' shim, which restores the original function.
 */
function shutdown_crawler()
{
    process_node_shim.close();
}
55 590
56 function loadSite(site, window, callback) 591 try
57 { 592 {
58 if (!site) 593 onShutdown.add( shutdown_crawler );
59 return;
60
61 let tabbrowser = window.gBrowser;
62 let tab = tabbrowser.addTab(site);
63 let browser = tabbrowser.getBrowserForTab(tab);
64
65 siteTabs.set(browser, site);
66
67 let progressListener = {
68 onStateChange: function(aBrowser, aWebProgress, aRequest, aStateFlags, aStat us)
69 {
70 if (browser !== aBrowser)
71 return;
72
73 if (!(aStateFlags & Ci.nsIWebProgressListener.STATE_STOP))
74 return;
75
76 tabbrowser.removeTabsProgressListener(progressListener);
77 tabbrowser.removeTab(tab);
78 callback();
79 }
80 };
81 tabbrowser.addTabsProgressListener(progressListener);
82 } 594 }
83 595 catch ( e )
84 function loadSites(backendUrl, parallelTabs, window, sites, callback) 596 {
85 { 597 Cu.reportError( "Failure adding shutdown function. error = \"" + e.message + "\"" );
86 while (currentTabs < parallelTabs && sites.length)
87 {
88 currentTabs++;
89 let site = sites.shift();
90 loadSite(site, window, function()
91 {
92 currentTabs--;
93 if (!sites.length && !currentTabs)
94 {
95 Storage.finish();
96 let dataFilePath = Storage.dataFile.path;
97 Client.sendCrawlerDataFile(backendUrl, dataFilePath, function()
98 {
99 Storage.destroy();
100 callback();
101 });
102 }
103 else
104 loadSites(backendUrl, parallelTabs, window, sites, callback);
105 });
106 }
107 } 598 }
108
109 let Crawler = exports.Crawler = {};
110
111 Crawler.crawl = function(backendUrl, parallelTabs, window, callback)
112 {
113 if (Policy.processNode != origProcessNode)
114 return;
115
116 Policy.processNode = processNode;
117
118 siteTabs = new WeakMap();
119 currentTabs = 0;
120
121 Storage.init();
122
123 Client.fetchCrawlableSites(backendUrl, function(sites)
124 {
125 loadSites(backendUrl, parallelTabs, window, sites, function()
126 {
127 Policy.processNode = origProcessNode;
128 siteTabs = null;
129 callback();
130 });
131 });
132 };
OLDNEW
« .hgignore ('K') | « lib/counter_task.js ('k') | lib/encoding.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld