Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/crawler.js

Issue 10233013: Crawler, second version (Closed)
Patch Set: Created April 12, 2013, 1:38 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « lib/counter_task.js ('k') | lib/encoding.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: lib/crawler.js
===================================================================
--- a/lib/crawler.js
+++ b/lib/crawler.js
@@ -1,132 +1,664 @@
-Cu.import("resource://gre/modules/Services.jsm");
+Cu.import( "resource://gre/modules/Services.jsm" );
-function abprequire(module)
+function abprequire( module )
{
let result = {};
result.wrappedJSObject = result;
- Services.obs.notifyObservers(result, "adblockplus-require", module);
+ Services.obs.notifyObservers( result, "adblockplus-require", module );
return result.exports;
}
-let {Storage} = require("storage");
-let {Client} = require("client");
+let { Action } = require( "action" );
+let { Browser_Tab, Tabbed_Browser } = require( "browser" );
+let { Observation } = require( "instruction" );
+let { Encoding } = require( "encoding" );
+let { Logger } = require( "logger" );
-let {Policy} = abprequire("contentPolicy");
-let {Filter} = abprequire("filterClasses");
-let {Utils} = abprequire("utils");
+let { Policy } = abprequire( "contentPolicy" );
+let { RequestNotifier } = abprequire( "requestNotifier" );
+let { Filter } = abprequire( "filterClasses" );
+let { Utils } = abprequire( "utils" );
-let origProcessNode = Policy.processNode;
+//-------------------------------------------------------
+// Shim
+//-------------------------------------------------------
+/**
+ * Manager for shim replacement of an external function.
+ * <p/>
+ * Since there's no lvalue reference type in JavaScript (non-primitives are all reference types, but they are rvalue
+ * references), the arguments here provide a substitute. The reference is the expression 'object[ property ]'.
+ *
+ * @param {Object} original_object
+ * The object that holds the function whose call and return are to be surrounded by the shim.
+ * @param {string} original_property
+ * The name of the property of 'original_object' whose value is the function to be shimmed.
+ * @constructor
+ */
+var Shim = function( original_object, original_property )
+{
+ /**
+ * @type {Object}
+ */
+ this.original_object = original_object;
+ /**
+ * @type {String}
+ */
+ this.original_property = original_property;
-let siteTabs;
-let currentTabs;
+ /**
+ * The original function as it exists at the time of instantiation. This means that generally the Shim instance
+ * should be created as soon as possible, such as in module initialization.
+ */
+ this.original_function = original_object[ original_property ];
+};
-function processNode(wnd, node, contentType, location, collapse)
+/**
+ * @return {boolean}
+ */
+Shim.prototype.is_original = function()
{
- let result = origProcessNode.apply(this, arguments);
- let url = (contentType === Policy.type.ELEMHIDE) ? location.text :
- location.spec;
+ return (this.original_object[ this.original_property ] === this.original_function);
+};
- let topWindow = wnd.top;
- if (!topWindow.document)
+/**
+ *
+ * @param {Function} replacer
+ * The replacement function transformer. Takes the original function as an argument and returns its replacement.
+ */
+Shim.prototype.replace = function( replacer )
+{
+ if ( !replacer )
+ throw "Must supply a function transformer to supply a replacement function.";
+ if ( !this.is_original() )
+ throw "This version of Shim does not support multiple replacement.";
+ this.original_object[ this.original_property ] = replacer( this.original_function );
+ return this.original_function;
+};
+
+/**
+ * Reset the original function to a non-replaced state.
+ * <p/>
+ * May be called correctly even if the original has never been replaced.
+ */
+Shim.prototype.reset = function()
+{
+ this.original_object[ this.original_property ] = this.original_function;
+};
+
+/**
+ * Close out the shim and release resources.
+ */
+Shim.prototype.close = function()
+{
+ this.reset();
+ /*
+ * At present, this class does not use external resources that aren't dealt with by 'reset()'. That could change,
+ * however, and so we use close() as the substitute-destructor and reset() for ordinary use.
+ */
+};
+
+/**
+ * Shim instance for 'processNode'. As of this writing it's the only function in ABP we're shimming.
+ */
+var process_node_shim = new Shim( Policy, "processNode" );
+
+//-------------------------------------------------------
+// Crawler
+//-------------------------------------------------------
+/**
+ * Constructor for a single crawl session. The crawler iterates through each instruction, loading its URL in a tab,
+ * running the hooks present in the processor, and storing results accordingly.
+ *
+ * @param {Instruction_Set} instructions
+ * Instruction generator yields a sequence of tuples: URL to crawl, a processor, and storage.
+ * @param {*} outputs
+ * @param {Window} window
+ * The top window we're operating in. Must be present as an argument because the module context this class is
+ * defined in does not have a window. (Or at least, one should not be relied upon.)
+ * @param {number} time_limit
+ * The maximum duration that we will allow a page to try to load.
+ * @param {boolean} leave_open
+ * @param {number} number_of_tabs
+ */
+var Crawler = function( instructions, outputs, window, time_limit, leave_open, number_of_tabs )
+{
+ /**
+ * @type {Instruction_Set}
+ */
+ this.instructions = instructions;
+
+ this.outputs = outputs;
+
+ /**
+ * Browser window in which to open tabs. Required because, as a module, we don't have a 'Window' object available.
+ * @type {Window}
+ */
+ this.window = window;
+
+ this.time_limit = time_limit;
+
+ this.leave_open = leave_open;
+
+ if ( number_of_tabs <= 0 )
{
- Cu.reportError("No document associated with the node's top window");
- return result;
+ /*
+ * Defensive. The caller should have already validated this argument.
+ */
+ number_of_tabs = 1;
}
- let tabbrowser = Utils.getChromeWindow(topWindow).gBrowser;
- if (!tabbrowser)
+ /**
+ * Progress object. It's simple enough not to need its own class. Just override the notice() function to receive
+ * progress notices.
+ */
+ this.progress = {
+ active: 0,
+ completed: 0,
+ total: instructions.size,
+ notice: function()
+ {
+ },
+ status: function()
+ {
+ }
+ };
+
+ if ( !process_node_shim.is_original() )
+ throw "Function 'processNode' is already shimmed. We may not insert a second one.";
+ process_node_shim.replace(
+ function( original )
+ {
+ return this.node_action.bind( this, original );
+ }.bind( this )
+ );
+
+ /**
+ * Logging service.
+ * @type {Logger}
+ */
+ this.logger = new Logger( "Crawler" );
+
+ this.tabbed_browser = new Tabbed_Browser( this.window, number_of_tabs );
+
+ /**
+ * Closed flag. Needed to terminate the generator if this object is closed before the generator stops.
+ * @type {Boolean}
+ */
+ this.closed = false;
+
+ /**
+ * The object responsible for gaining access to the call stream for individual entries within each node. This is
+ * one of two hooks into ABP itself, the other being the shim for 'processNode'.
+ *
+ * @type {RequestNotifier}
+ */
+ this.requestNotifier = new RequestNotifier( null, this.node_entry_action.bind( this ) );
+
+ /**
+ * The current nodes that are active in a call to 'node_action'. In ordinary cases, this map has at most the
+ * maximum number of concurrent loads.
+ * @type {WeakMap}
+ */
+ this.current_nodes = new WeakMap();
+};
+exports.Crawler = Crawler;
+
+Crawler.prototype.toJSON = function()
+{
+ return {
+ instructions: this.instructions,
+ storage: this.storage // NOTE(review): not assigned by the constructor shown here (it sets 'outputs') -- confirm
+ };
+};
+
+/**
+ * Close the present instance. This object holds browser resources because of the browser tabs it holds open.
+ */
+Crawler.prototype.close = function()
+{
+ for ( let j = 0 ; j < this.outputs.length ; ++j )
{
- Cu.reportError("Unable to get a tabbrowser reference");
- return result;
+ this.outputs[j].storage.close();
}
+ if ( this.tabbed_browser ) this.tabbed_browser.close();
+ if ( this.requestNotifier ) this.requestNotifier.shutdown();
+ process_node_shim.reset();
+ this.closed = true;
+};
- let browser = tabbrowser.getBrowserForDocument(topWindow.document);
- if (!browser)
+/**
+ * The output encoding for the session as a whole.
+ * @type {*}
+ */
+Crawler.prototype.__encoding__ = Encoding.as_object( [
+ // prelude
+ Encoding.immediate_fields( ["time_start", "instructions"] ),
+ // observation
+ Encoding.field( "trials", Encoding.array_stream() ),
+ // postlude
+ Encoding.immediate_fields( ["time_finish", "termination"] )
+] );
+
+/**
+ * Task generator for the crawler
+ *
+ * @param {Function} pause
+ * @param {Function} resume
+ */
+Crawler.prototype.generator = function( pause, resume )
+{
+ /*
+ * A crawler object represents a single run of the crawler. Thus the pause and resume functions act like
+ * object-scope variables.
+ */
+ this.pause = pause;
+ this.resume = resume;
+ var log = this.logger.make_log( "task" );
+ var tab = null;
+
+ var runaway_counter = 0;
+
+ try
{
- Cu.reportError("Unable to get browser for the tab");
- return result;
+ /*
+ * Preparation code. Ensure that every initialization here can be reversed in the 'finally' clause whether
+ * or not it executed, in case some initialization throws an exception.
+ */
+ this.time_start = Logger.timestamp();
+
+ var multiple = new Encoding.Multiple_Format();
+ for ( let j = 0 ; j < this.outputs.length ; ++j )
+ {
+ let output = this.outputs[j];
+ let formatter = new Encoding[ output.encode ]( output.storage.writer() );
+ multiple.add( formatter );
+ }
+ this.encoder = new Encoding.Format_stream( multiple );
+
+ this.encoder.write( this );
+ this.encoder.sequence_start();
+
+ let gen = this.instructions.generator();
+ let instruction = null; // Avoid spurious IDEA warning
+ for ( instruction of gen )
+ {
+ if ( this.closed )
+ {
+ /*
+ * Defensive. We only arrive here if some outside code has called our close() method and did not also
+ * order our cancellation. Regardless, we're done making new tabs.
+ */
+ Cu.reportError( "Crawler closed but its enclosing task not cancelled." );
+ break;
+ }
+
+ if ( this.tabbed_browser.available() )
+ {
+ /*
+ * Since we'll need a variety of browser-tab behaviors, we'll need to change this factory call
+ * to something dependent upon the instruction.
+ */
+ tab = this.tabbed_browser.make_tab( instruction.target, this.leave_open, this._deferred_load_finisher.bind( this ), null );
+ tab.instruction = instruction;
+ instruction.begin();
+ let join = new Action.Join_Timeout( tab, this.time_limit, this._join_finisher.bind( this ) );
+ join.go( tab, resume );
+ /*
+ * The return value of load is an asynchronous action that could be combined with others, if the
+ * instruction dictates. There's no hook for this yet, although that's the reason we do not immediately
+ * execute on calling load.
+ */
+ tab.go();
+ ++this.progress.active;
+ this.progress.notice();
+ }
+ if ( !this.tabbed_browser.available() )
+ {
+ pause();
+ }
+
+ var cancelled = yield false;
+ if ( cancelled )
+ {
+ break;
+ }
+ }
+ /*
+ * At this point in the code, we have launched all the instructions. If we're using more than one tab,
+ * we'll generally have open tabs still. We need to pause until we have no more tabs left open.
+ */
+ if ( !cancelled )
+ {
+ while ( !this.tabbed_browser.quiescent() )
+ {
+ // Must yield after pause() for it to take effect
+ pause();
+ cancelled = yield false;
+ if ( cancelled )
+ {
+ break;
+ }
+ ++runaway_counter;
+ if ( runaway_counter > 100 )
+ {
+ Cu.reportError( "Runaway pause loop. counter = " + runaway_counter );
+ break;
+ }
+ }
+ }
+
+ /*
+ * OK. Finally done.
+ */
+ this.termination = cancelled ? "Cancelled" : "Success";
+ this.progress.status( cancelled ? "Cancelled" : "Done" );
}
+ catch ( e if e instanceof Error )
+ {
+ log( e.toString() + "\n\n" + e.stack );
+ this.termination = "Error";
+ }
+ catch ( e )
+ {
+ log( e.toString() + " - type: " + Object.prototype.toString.call( e ) );
+ this.termination = "Unknown exception";
+ }
+ finally
+ {
+ /*
+ * Finish writing the output before closing ourselves down.
+ */
+ this.time_finish = Logger.timestamp();
+ this.encoder.sequence_stop();
- let site = siteTabs.get(browser);
- let filtered = !result;
- Storage.write([url, site, filtered]);
+ /*
+ * If everything goes right, this cleanup should not be necessary, as tab instances are closed as they are used.
+ * Nonetheless, if there's an error and a landing function is not called, this line ensures that all the tabs
+ * are properly destroyed.
+ */
+ if ( tab ) tab.close();
+ // Removes the ABP shim, amongst other things.
+ this.close();
+ }
+};
+
+/**
+ * Landing function for the asynchronous action of loading a tab. For some reason, Firefox delivers the
+ * STATE_STOP progress message before the last ABP filter has run. It seems that it fires events immediately,
+ * once it knows the request has finished its HTTP transfer, but before it has fully finished loading the page as a
+ * whole (the DOM, layout, etc.). Hence we let the browser finish its work in the current thread and run the actual
+ * load-end action afterwards.
+ * <p/>
+ * The implementation of this function allows it to be defined without arguments. That's not what actually happens.
+ * Since this function is just a scheduling pass-through, it uses 'arguments' to pass all arguments, no matter what they
+ * are. (And no matter how they change over time.)
+ */
+Crawler.prototype._deferred_load_finisher = function()
+{
+ /*
+ * The first argument to 'bind' is the 'this' object when 'apply' runs. The second argument is the 'this' object when
+ * 'this._load_finisher' runs.
+ */
+ Action.dispatch( Function.prototype.apply.bind( this._load_finisher, this, arguments ) );
+};
+
+/**
+ * Since we're done loading (the cause doesn't matter), we order the instruction to write out its results, be they
+ * successful or any of the varieties of unsuccessful.
+ */
+Crawler.prototype._load_finisher = function( tab, completion_state, error_code )
+{
+ var instruction = tab.instruction;
+ if ( tab.completed )
+ {
+ if ( tab.completed_well )
+ {
+ switch ( completion_state )
+ {
+ case Browser_Tab.Completion_State.Success:
+ instruction.end();
+ break;
+ case Browser_Tab.Completion_State.No_Success:
+ instruction.abort( "unsuccessful load. nsresult = " + error_code );
+ break;
+ case Browser_Tab.Completion_State.User_Close:
+ instruction.abort( "user closed tab" );
+ break;
+ case Browser_Tab.Completion_State.External_Cancel:
+ instruction.abort( "timed out" );
+ break;
+ default:
+ instruction.abort( "WTF?" );
+ break;
+ }
+ }
+ else
+ {
+ instruction.abort( "exception. message = " + tab.exception.message );
+ }
+ }
+ else
+ {
+ // Defensive. Should not reach.
+ instruction.abort( "tab load not completed. Huh?" );
+ }
+ this.encoder.sequence_send( instruction );
+
+ tab.close();
+ --this.progress.active;
+ ++this.progress.completed;
+ this.progress.notice();
+ this.resume();
+};
+
+/**
+ * The join finisher merely ensures that the tab load action actually completes, stopping it if it hasn't completed yet.
+ *
+ * @param tab
+ */
+Crawler.prototype._join_finisher = function( tab )
+{
+ /*
+ * If the join timeout caused completion, we must assume that the tab is still loading. If the timeout did not fire,
+ * then the tab action completed. In all cases, the tab will be complete afterwards. Thus because Join_Timeout is
+ * reliable, means that we have made tab-load reliable also.
+ */
+ if ( !tab.complete )
+ {
+ tab.stop();
+ }
+};
+
+//----------------------------------
+// Data gathering functions
+//----------------------------------
+/**
+ * Shim for 'processNode' in ABP. Executes once for each node that ABP processes, whether or not it acts on that node.
+ *
+ * @param {Function} original_f
+ * The original processNode function.
+ * @param {nsIDOMWindow} wnd
+ * @param {nsIDOMElement} node
+ * @param {Number} contentType
+ * @param {nsIURI} location
+ * @param {Boolean} collapse
+ * true to force hiding of the node
+ * @return {Boolean} false if the node should be blocked
+ */
+Crawler.prototype.node_action = function( original_f, wnd, node, contentType, location, collapse )
+{
+ //var log = this.logger.make_log( "node_action" );
+
+ /*
+ * Set up collecting for node_entry_action(). It should be the case that a node matches either 0 or 1 filters.
+ * The collection array 'entries' allows more than 1 to be recorded, and for such activity to be detected and
+ * reported rather than inducing an observation error.
+ */
+ var entries = [];
+ var entry_hook = function( node, windows, entry )
+ {
+ entries.push( { node: node, windows: windows, entry: entry } );
+ };
+ this.current_nodes.set( node, entry_hook );
+
+ /*
+ * Call the original processNode. If the original throws, then we will too, so this is outside a try clause.
+ */
+ var result = original_f( wnd, node, contentType, location, collapse );
+
+ try
+ {
+ let instruction = null; // Initialize here in case locate_instruction() throws.
+ try
+ {
+ instruction = this.locate_instruction( wnd );
+ }
+ catch ( e )
+ {
+ Cu.reportError( "Crawler/node_action: error locating instruction: " + e.toString()
+ + ( ( "stack" in e ) ? ( "\n\tstack = " + e.stack) : "" )
+ );
+ return result;
+ }
+ if ( !instruction )
+ {
+ /*
+ * If we don't have an instruction, we don't report this node. This is by design, because reporting is
+ * the responsibility of the instruction object.
+ */
+ return result;
+ }
+ if ( entries.length == 0 && !instruction.observing_all_nodes() )
+ {
+ // Assert we didn't touch this node and the instruction doesn't want to see it
+ return result;
+ }
+ try
+ {
+ var observation = new Observation(
+ !result, contentType,
+ (contentType == Policy.type.ELEMHIDE) ? location.text : location.spec,
+ entries
+ );
+ instruction.observe_node( observation );
+ }
+ catch ( e )
+ {
+ Cu.reportError( "Crawler/node_action: error recording observation: " + e.toString() );
+ return result;
+ }
+ }
+ finally
+ {
+ /*
+ * This 'finally' clause ensures that we remove the node from 'this.current_nodes'. Even though it's a weak map,
+ * we need to remove the key so that 'entry_hook' is not called inadvertently.
+ */
+ this.current_nodes.delete( node );
+ }
return result;
+};
+
+/**
+ * Locate the instruction associated with a window that we caused to load. First we find the browser associated with
+ * the window. There should always be one of these, otherwise we have an error. From the browser, we locate our tab
+ * associated with it, which need not be present. Finally, we locate the instruction as a tab member, which should
+ * always exist.
+ * <p/>
+ * This is called only in node_action(). It's separate to simplify the control flow.
+ *
+ * @param window
+ * @return {Instruction_class}
+ */
+Crawler.prototype.locate_instruction = function( window )
+{
+ let topWindow = window.top;
+ if ( !topWindow.document )
+ throw new Error( "No document associated with the node's top window" );
+ let tabbrowser = Utils.getChromeWindow( topWindow ).gBrowser;
+ if ( !tabbrowser )
+ throw new Error( "Unable to get a tabbrowser reference from the window" );
+ let browser = tabbrowser.getBrowserForDocument( topWindow.document );
+ if ( !browser )
+ throw new Error( "Unable to get browser for the tab" );
+ if ( !this.tabbed_browser.map_browser_to_child.has( browser ) )
+ {
+ /*
+ * It's not an error for the browser not to appear in this map. If the tab remains open past the time
+ * we are monitoring (either on purpose or as the result of a quirk of timing), we simply return a null
+ * instruction. Nevertheless, the code to report this to the console remains in place, commented out, because
+ * it's likely to be relevant still during development.
+ */
+ // Cu.reportError(
+ // "Crawler.node_action: Browser not found in internal map. " + Logger.timestamp()
+ // + "\nlocation=" + url_location
+ // );
+ // this.logger.stack_trace();
+ return null;
+ }
+ var tab = this.tabbed_browser.map_browser_to_child.get( browser ).child;
+ if ( !("instruction" in tab) )
+ throw new Error( "'instruction' not found as member of tab object" );
+ return tab.instruction;
+};
+
+/**
+ * This function executes solely underneath (in the call stack) 'node_action'. It receives at least one call per node,
+ * more if there are matches on rules of any kind.
+ *
+ * @param window
+ * @param node
+ * @param {RequestEntry} entry
+ */
+Crawler.prototype.node_entry_action = function( window, node, entry )
+{
+ if ( !this.current_nodes.has( node ) )
+ {
+ Cu.reportError( "node_entry_action: node not seen in 'current_nodes'" );
+ return;
+ }
+ if ( !entry.filter )
+ {
+ /*
+ * If there's no filter in the entry, then nothing happened to it. We are presently ignoring such entries. In
+ * the future, however, we will likely want a hook here to process entries that are not associated with any
+ * filter, for example, to ensure that necessary content is not blocked inadvertently.
+ */
+ return;
+ }
+ var windows = [];
+ var n = 0;
+ while ( window != null )
+ {
+ if ( ++n > 100 )
+ {
+ // Houston, we have a problem.
+ windows = null;
+ Cu.reportError( "Crawler/node_entry_action: runaway window chain" );
+ break;
+ }
+ windows.push( window );
+ if ( window === window.parent )
+ {
+ // This is the ordinary statement to exit the loop.
+ break;
+ }
+ window = window.parent;
+ }
+ this.current_nodes.get( node )( node, windows, entry );
+};
+
+
+function shutdown_crawler()
+{
+ process_node_shim.close();
}
-function loadSite(site, window, callback)
+try
{
- if (!site)
- return;
-
- let tabbrowser = window.gBrowser;
- let tab = tabbrowser.addTab(site);
- let browser = tabbrowser.getBrowserForTab(tab);
-
- siteTabs.set(browser, site);
-
- let progressListener = {
- onStateChange: function(aBrowser, aWebProgress, aRequest, aStateFlags, aStatus)
- {
- if (browser !== aBrowser)
- return;
-
- if (!(aStateFlags & Ci.nsIWebProgressListener.STATE_STOP))
- return;
-
- tabbrowser.removeTabsProgressListener(progressListener);
- tabbrowser.removeTab(tab);
- callback();
- }
- };
- tabbrowser.addTabsProgressListener(progressListener);
+ onShutdown.add( shutdown_crawler );
}
-
-function loadSites(backendUrl, parallelTabs, window, sites, callback)
+catch ( e )
{
- while (currentTabs < parallelTabs && sites.length)
- {
- currentTabs++;
- let site = sites.shift();
- loadSite(site, window, function()
- {
- currentTabs--;
- if (!sites.length && !currentTabs)
- {
- Storage.finish();
- let dataFilePath = Storage.dataFile.path;
- Client.sendCrawlerDataFile(backendUrl, dataFilePath, function()
- {
- Storage.destroy();
- callback();
- });
- }
- else
- loadSites(backendUrl, parallelTabs, window, sites, callback);
- });
- }
+ Cu.reportError( "Failure adding shutdown function. error = \"" + e.message + "\"" );
}
-
-let Crawler = exports.Crawler = {};
-
-Crawler.crawl = function(backendUrl, parallelTabs, window, callback)
-{
- if (Policy.processNode != origProcessNode)
- return;
-
- Policy.processNode = processNode;
-
- siteTabs = new WeakMap();
- currentTabs = 0;
-
- Storage.init();
-
- Client.fetchCrawlableSites(backendUrl, function(sites)
- {
- loadSites(backendUrl, parallelTabs, window, sites, function()
- {
- Policy.processNode = origProcessNode;
- siteTabs = null;
- callback();
- });
- });
-};
« no previous file with comments | « lib/counter_task.js ('k') | lib/encoding.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld