Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/instruction.js

Issue 10233013: Crawler, second version (Closed)
Patch Set: Created April 12, 2013, 1:38 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « lib/encoding.js ('k') | lib/logger.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: lib/instruction.js
===================================================================
new file mode 100644
--- /dev/null
+++ b/lib/instruction.js
@@ -0,0 +1,472 @@
+/**
+ * @fileOverview Instructions are the units of effort for the crawler. This file provides iterators for instructions
+ * that the main crawler loop will then execute.
+ */
+
+let {Logger} = require( "logger" );
+let {Storage} = require( "storage" );
+let {Encoding} = require( "encoding" );
+let {YAML, YamlParseException} = require( "yaml" );
+
+function abprequire( module )
+{
+ let result = {};
+ result.wrappedJSObject = result;
+ Services.obs.notifyObservers( result, "adblockplus-require", module );
+ return result.exports;
+}
+let {Policy} = abprequire( "contentPolicy" );
+
+//-------------------------------------------------------
+// Input
+//-------------------------------------------------------
+/**
+ * Base class for retrieving source code for crawl instructions. Implementations include fixed string and local file.
+ *
+ * @property {Object} value
+ * @property {string} text
+ * @constructor
+ */
+var Input_class = function()
+{
+};
+
+/**
+ * Load the input into memory and parse it.
+ * <p/>
+ * Postcondition: 'this.value' has a parsed copy of the input.
+ */
+Input_class.prototype.load = function()
+{
+ throw new Error( "'Input_class.load' is abstract." );
+};
+
+/**
+ * Reset the internal storage members of this object. Use to release memory and assist the garbage collector.
+ */
+Input_class.prototype.reset = function()
+{
+ throw new Error( "'Input_class.reset' is abstract." );
+};
+
+//----------------------------------
+// Input_String
+//----------------------------------
+/**
+ * Use a fixed text for the input.
+ *
+ * @param text
+ * @constructor
+ * @extends {Input_class}
+ */
+var Input_String = function( text )
+{
+ this.text = text;
+ this.value = null;
+};
+Input_String.prototype = new Input_class();
+
+/**
+ * Parse the input string.
+ */
+Input_String.prototype.load = function()
+{
+ this.value = YAML.parse( this.text );
+};
+
+/**
+ * Reset all the internal members.
+ */
+Input_String.prototype.reset = function()
+{
+ this.text = null;
+ this.value = null;
+};
+
+//----------------------------------
+// Input_File
+//----------------------------------
+/**
+ *
+ * @param {nsIFile} file
+ * @constructor
+ */
+var Input_File = function( file )
+{
+ this.file = file;
+};
+Input_File.prototype = new Input_class();
+
+Input_File.prototype.load = function()
+{
+ var data = "";
+ var fstream = Cc["@mozilla.org/network/file-input-stream;1"].createInstance( Ci.nsIFileInputStream );
+ var cstream = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance( Ci.nsIConverterInputStream );
+ fstream.init( this.file, -1, 0, 0 );
+ cstream.init( fstream, "UTF-8", 0, 0 );
+ let str = {};
+ let read = 0;
+ do {
+ read = cstream.readString( 0xffffffff, str ); // read as much as we can and put it in str.value
+ data += str.value;
+ } while ( read != 0 );
+ cstream.close();
+ this.value = YAML.parse( data );
+};
+
+Input_File.prototype.reset = function()
+{
+ this.file = null;
+ this.value = null;
+};
+
+//-------------------------------------------------------
+// Instruction
+//-------------------------------------------------------
+
+/**
+ * Instruction base class.
+ *
+ * @constructor
+ */
+var Instruction_class = function()
+{
+ /**
+ * The only universal aspect to crawling is that we are crawling other people's web sites. This field is the URL for
+ * a site.
+ * @type {String}
+ */
+ this.target = null;
+
+ /**
+ * The operation to perform at the browse site.
+ */
+ this.operation = {};
+
+ this.operation.toJSON = function()
+ {
+ return "default";
+ };
+
+ this.logger = new Logger( "Instruction_class" );
+ this.log = this.logger.make_log();
+};
+
+/**
+ * Predicate about whether this instruction observes all nodes or only filtered ones.
+ * <p/>
+ * Framework function for the observation system. Intended to be overridden by subclasses.
+ * @return {boolean}
+ */
+Instruction_class.prototype.observing_all_nodes = function()
+{
+ return false;
+};
+
+/**
+ * Record an observation as the crawler sees it.
+ * <p/>
+ * Framework function for the observation system.
+ * @param observation
+ */
+Instruction_class.prototype.observe_node = function( observation )
+{
+ this.observations.push( observation );
+};
+
+/**
+ * Action at start of executing instruction. Run immediately before the tab is loaded.
+ * <p/>
+ * Framework function for the observation system.
+ * <p/>
+ * This function currently has no arguments. The only one that might be relevant is the 'Browser_Tab' instance. It was
+ * not chosen as an argument because there's no apparent reason for it. Altering the load behavior should be done by
+ * specifying a subclass of 'Browser_Tab' in the instruction.
+ */
+Instruction_class.prototype.begin = function()
+{
+ this.time_start = Logger.timestamp();
+};
+
+/**
+ * Action at start of executing instruction. Run immediately before the tab is loaded.
+ * <p/>
+ * Framework function for the observation system.
+ */
+Instruction_class.prototype.end = function()
+{
+ this.ended_well = true;
+ this.time_finish = Logger.timestamp();
+ this.termination = "completed"; // May alter to "cancelled" or "aborted".
+
+ /*
+ * Sort the observation array and merge to remove duplicates.
+ */
+ this.observations.sort( Observation.cmp );
+ if ( this.observations.length >= 2 )
+ {
+ var merged = [];
+ merged.push( this.observations[0] );
+ this.observations.reduce( function( previous, current )
+ {
+ if ( !previous.equals( current ) )
+ {
+ merged.push( current );
+ }
+ return current;
+ } );
+ this.observations = merged;
+ }
+};
+
+/**
+ * Abort the instruction prematurely. This
+ */
+Instruction_class.prototype.abort = function( termination )
+{
+ this.ended_well = false;
+ this.time_finish = Logger.timestamp();
+ this.termination = termination;
+};
+
+//noinspection JSUnusedGlobalSymbols
+/**
+ * The return value of toJSON() defines the result fields are emitted for this instruction.
+ *
+ * @returns {*}
+ */
+Instruction_class.prototype.toJSON = function()
+{
+ var r = {
+ target: this.target,
+ operation: this.operation,
+ time_start: this.time_start,
+ time_finish: this.time_finish
+ };
+ try
+ {
+ if ( this.ended_well )
+ {
+ r.observations = this.observations;
+ } else
+ {
+ r.termination = this.termination;
+ }
+ }
+ catch ( e )
+ {
+ r.termination = "Unexpected exception: " + e.message;
+ }
+ return r;
+};
+
+//-------------------------------------------------------
+// Instruction_Set
+//-------------------------------------------------------
+/**
+ * As-yet unused base class for instruction sets
+ * @constructor
+ */
+var Instruction_Set_class = function()
+{
+};
+Instruction_Set_class.prototype.generator = function()
+{
+ throw new Error( "Must override 'generator' when deriving from Instruction_Set_class" );
+};
+
+var Instruction_Set = {};
+
+//-------------------------------------------------------
+// Instruction_Set.Parsed
+//-------------------------------------------------------
+
+/**
+ * An instruction set constructed from a parsed YAML document.
+ *
+ * @param {Input_class} input
+ * @constructor
+ */
+Instruction_Set.Parsed = function( input )
+{
+ try
+ {
+ input.load();
+ this.source = input.value;
+ }
+ finally
+ {
+ input.reset();
+ }
+
+ this.name = this.source.name;
+ this.instructions = [];
+ let target = this.source.target;
+ let n = target.length;
+ for ( let j = 0 ; j < n ; ++j )
+ {
+ this.instructions.push( new Default_Instruction( target[ j ] ) );
+ }
+ /**
+ * The number of instructions in this set.
+ * @type {number}
+ */
+ this.size = this.instructions.length;
+};
+Instruction_Set.Parsed.prototype = new Instruction_Set_class();
+
+Instruction_Set.Parsed.prototype.generator = function()
+{
+ let n = this.instructions.length;
+ for ( let j = 0 ; j < n ; ++j )
+ {
+ yield this.instructions[ j ];
+ }
+};
+
+Instruction_Set.Parsed.prototype.toJSON = function()
+{
+ return { name: this.name };
+};
+
+//-------------------------------------------------------
+// Default_Instruction
+//-------------------------------------------------------
+/**
+ * The default instruction type.
+ * @param {String} target
+ * @constructor
+ */
+Default_Instruction = function( target )
+{
+ this.target = target;
+
+ /**
+ * Observations array
+ * @type {Array}
+ */
+ this.observations = [];
+};
+Default_Instruction.prototype = new Instruction_class();
+
+//-------------------------------------------------------
+// Observation
+//-------------------------------------------------------
+/**
+ *
+ * @param filtered
+ * @param content_type
+ * @param location
+ * @param entries
+ * @constructor
+ */
+var Observation = function( filtered, content_type, location, entries )
+{
+ this.filtered = filtered;
+ this.content_description = Policy.typeDescr[content_type];
+ this.location = location;
+ this.entries = entries;
+ if ( this.entries.length == 1 )
+ {
+ let x = this.entries[0];
+ this.filter = x.entry.filter.text;
+ let windows = x.windows;
+ this.window_locations = [];
+ // Loop is explicit to ensure array order.
+ for ( let i = 0 ; i < windows.length ; ++i )
+ {
+ this.window_locations.push( windows[i].location.href );
+ }
+ }
+ else
+ {
+ // Figure out something
+ }
+};
+
+//noinspection JSUnusedGlobalSymbols
+Observation.prototype.toJSON = function()
+{
+ return {
+ location: this.location,
+ filtered: this.filtered,
+ content_description: this.content_description,
+ filter: (this.entries.length == 1) ? this.entries[0].entry.filter.text : undefined,
+ window_locations: this.window_locations
+ };
+};
+
+/**
+ * Comparison function
+ *
+ * @param {Observation} x
+ * @return {number}
+ */
+Observation.prototype.compare = function( x )
+{
+ /*
+ * 1. Sort filtered elements before non-filtered ones.
+ */
+ var a = ( this.filtered ? -1 : 0 ) + ( x.filtered ? 1 : 0 );
+ if ( a != 0 ) return a;
+ /*
+ * 2. Sort by location, a URL string.
+ */
+ if ( this.location < x.location ) return -1;
+ if ( this.location > x.location ) return 1;
+ /*
+ * 3. Sort by filter. Because of the way that entries are collected, we check the entry lists as a whole.
+ */
+ var n = Math.min( this.entries.length, x.entries.length );
+ for ( let j = 0 ; j < n ; ++j )
+ {
+ let s1 = this.entries[ j ];
+ let s2 = x.entries[ j ];
+ if ( s1 < s2 ) return -1;
+ if ( s1 > s2 ) return 1;
+ }
+ // Assert all entries are equal up to their common length
+ // The longer element is sorted later
+ a = this.entries.length - x.entries.length;
+ if ( a != 0 ) return a;
+ /*
+ * 4. Sort by window chain.
+ */
+ n = Math.min( this.window_locations.length, x.window_locations.length );
+ for ( let j = 0 ; j < n ; ++j )
+ {
+ let s1 = this.window_locations[ j ];
+ let s2 = x.window_locations[ j ];
+ if ( s1 < s2 ) return -1;
+ if ( s1 > s2 ) return 1;
+ }
+ return this.window_locations.length - x.window_locations.length;
+};
+
+/**
+ * Equality test.
+ * @param x
+ */
+Observation.prototype.equals = function( x )
+{
+ return this.compare( x ) == 0;
+};
+
+/**
+ *
+ * @param {Observation} a
+ * @param {Observation} b
+ * @return {number}
+ */
+Observation.cmp = function( a, b )
+{
+ return a.compare( b );
+};
+
+//-------------------------------------------------------
+// exports
+//-------------------------------------------------------
+exports.Input_String = Input_String;
+exports.Input_File = Input_File;
+exports.Instruction_Set = Instruction_Set;
+exports.Observation = Observation;
« no previous file with comments | « lib/encoding.js ('k') | lib/logger.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld