| Index: lib/instruction.js | 
| =================================================================== | 
| new file mode 100644 | 
| --- /dev/null | 
| +++ b/lib/instruction.js | 
| @@ -0,0 +1,441 @@ | 
| +/** | 
| + * @fileOverview Instructions are the units of effort for the crawler. This file provides iterators for instructions | 
| + * that the main crawler loop will then execute. | 
| + */ | 
| + | 
| +let {Logger} = require( "logger" ); | 
| +let {Storage} = require( "storage" ); | 
| +let {Encoding} = require( "encoding" ); | 
| +let {YAML, YamlParseException} = require( "yaml" ); | 
| + | 
| +function abprequire( module ) | 
| +{ | 
| +    let result = {}; | 
| +    result.wrappedJSObject = result; | 
| +    Services.obs.notifyObservers( result, "adblockplus-require", module ); | 
| +    return result.exports; | 
| +} | 
| +let {Policy} = abprequire( "contentPolicy" ); | 
| + | 
| +//------------------------------------------------------- | 
| +// Input | 
| +//------------------------------------------------------- | 
| +/** | 
| + * Base class for retrieving source code for crawl instructions. Implementations include fixed string and local file. | 
| + * | 
| + * @property {Object} value | 
| + * @property {string} text | 
| + * @constructor | 
| + */ | 
| +var Input_class = function() | 
| +{ | 
| +}; | 
| + | 
| +/** | 
| + * Load the input into memory and parse it. | 
| + * <p/> | 
| + * Postcondition: 'this.value' has a parsed copy of the input. | 
| + */ | 
| +Input_class.prototype.load = function() | 
| +{ | 
| +    throw new Error( "'Input_class.load' is abstract." ); | 
| +}; | 
| + | 
| +/** | 
| + * Reset the internal storage members of this object. Use to release memory and assist the garbage collector. | 
| + */ | 
| +Input_class.prototype.reset = function() | 
| +{ | 
| +    throw new Error( "'Input_class.reset' is abstract." ); | 
| +}; | 
| + | 
| +//---------------------------------- | 
| +// Input_String | 
| +//---------------------------------- | 
| +/** | 
| + * Use a fixed text for the input. | 
| + * | 
| + * @param text | 
| + * @constructor | 
| + * @extends {Input_class} | 
| + */ | 
| +var Input_String = function( text ) | 
| +{ | 
| +    this.text = text; | 
| +    this.value = null; | 
| +}; | 
| +Input_String.prototype = new Input_class(); | 
| + | 
| +/** | 
| + * Parse the input string. | 
| + */ | 
| +Input_String.prototype.load = function() | 
| +{ | 
| +    this.value = YAML.parse( this.text ); | 
| +}; | 
| + | 
| +/** | 
| + * Reset all the internal members. | 
| + */ | 
| +Input_String.prototype.reset = function() | 
| +{ | 
| +    this.text = null; | 
| +    this.value = null; | 
| +}; | 
| + | 
| +//---------------------------------- | 
| +// Input_File | 
| +//---------------------------------- | 
| +/** | 
| + * | 
| + * @param {nsIFile} file | 
| + * @constructor | 
| + */ | 
| +var Input_File = function( file ) | 
| +{ | 
| +    this.file = file; | 
| +}; | 
| +Input_File.prototype = new Input_class(); | 
| + | 
| +Input_File.prototype.load = function() | 
| +{ | 
| +    var data = ""; | 
| +    var fstream = Cc["@mozilla.org/network/file-input-stream;1"].createInstance( Ci.nsIFileInputStream ); | 
| +    var cstream = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance( Ci.nsIConverterInputStream ); | 
| +    fstream.init( this.file, -1, 0, 0 ); | 
| +    cstream.init( fstream, "UTF-8", 0, 0 ); | 
| +    let str = {}; | 
| +    let read = 0; | 
| +    do { | 
| +        read = cstream.readString( 0xffffffff, str ); // read as much as we can and put it in str.value | 
| +        data += str.value; | 
| +    } while ( read != 0 ); | 
| +    cstream.close(); | 
| +    this.value = YAML.parse( data ); | 
| +}; | 
| + | 
| +Input_File.prototype.reset = function() | 
| +{ | 
| +    this.file = null; | 
| +    this.value = null; | 
| +}; | 
| + | 
| +//---------------------------------- | 
| +// exports for Input | 
| +//---------------------------------- | 
| +// Only the two concrete input types are exported here; the abstract base 'Input_class' is not. | 
| +exports.Input_String = Input_String; | 
| +exports.Input_File = Input_File; | 
| + | 
| +//------------------------------------------------------- | 
| +// Instruction | 
| +//------------------------------------------------------- | 
| + | 
| +// Exported namespace object for instructions. It starts out empty. | 
| +var Instruction = exports.Instruction = {}; | 
| + | 
| +/** | 
| + * Instruction base class. | 
| + * | 
| + * @constructor | 
| + */ | 
| +var Instruction_class = function() | 
| +{ | 
| +    /** | 
| +     * The only universal aspect to crawling is that we are crawling other people's web sites. This field is the URL for | 
| +     * a site. | 
| +     * @type {String} | 
| +     */ | 
| +    this.target = null; | 
| + | 
| +    /** | 
| +     * The operation to perform at the browse site. | 
| +     */ | 
| +    this.operation = {}; | 
| + | 
| +    this.operation.toJSON = function() | 
| +    { | 
| +        return "default"; | 
| +    }; | 
| + | 
| +    this.logger = new Logger( "Instruction_class" ); | 
| +    this.log = this.logger.make_log(); | 
| +}; | 
| + | 
| +/** | 
| + * Predicate about whether this instruction observes all nodes or only filtered ones. | 
| + * <p/> | 
| + * Framework function for the observation system. Intended to be overridden by subclasses; this base version always | 
| + * answers 'false'. | 
| + * @return {boolean} | 
| + */ | 
| +Instruction_class.prototype.observing_all_nodes = function() | 
| +{ | 
| +    return false; | 
| +}; | 
| + | 
| +/** | 
| + * Record an observation as the crawler sees it. | 
| + * <p/> | 
| + * Framework function for the observation system. | 
| + * @param observation | 
| + */ | 
| +Instruction_class.prototype.observe_node = function( observation ) | 
| +{ | 
| +    this.observations.push( observation ); | 
| +}; | 
| + | 
| +/** | 
| + * Action at start of executing instruction. Run immediately before the tab is loaded. | 
| + * <p/> | 
| + * Framework function for the observation system. | 
| + * <p/> | 
| + * This function currently has no arguments. The only one that might be relevant is the 'Browser_Tab' instance. It was | 
| + * not chosen as an argument because there's no apparent reason for it. Altering the load behavior should be done by | 
| + * specifying a subclass of 'Browser_Tab' in the instruction. | 
| + */ | 
| +Instruction_class.prototype.begin = function() | 
| +{ | 
| +    this.time_start = Logger.timestamp(); | 
| +}; | 
| + | 
| +/** | 
| + * Action at start of executing instruction. Run immediately before the tab is loaded. | 
| + * <p/> | 
| + * Framework function for the observation system. | 
| + */ | 
| +Instruction_class.prototype.end = function() | 
| +{ | 
| +    this.time_finish = Logger.timestamp(); | 
| +    this.termination = "completed";      // May alter to "cancelled" or "aborted". | 
| + | 
| +    /* | 
| +     * Sort the observation array and merge to remove duplicates. | 
| +     */ | 
| +    this.observations.sort( Observation.cmp ); | 
| +    if ( this.observations.length >= 2 ) | 
| +    { | 
| +        var merged = []; | 
| +        merged.push( this.observations[0] ); | 
| +        this.observations.reduce( function( previous, current ) | 
| +        { | 
| +            if ( !previous.equals( current ) ) | 
| +            { | 
| +                merged.push( current ); | 
| +            } | 
| +            return current; | 
| +        } ); | 
| +        this.observations = merged; | 
| +    } | 
| +}; | 
| + | 
| +//noinspection JSUnusedGlobalSymbols | 
| +Instruction_class.prototype.toJSON = function() | 
| +{ | 
| +    return { | 
| +        target: this.target, | 
| +        operation: this.operation, | 
| +        time_start: this.time_start, | 
| +        observations: this.observations, | 
| +        time_finish: this.time_finish | 
| +    }; | 
| +}; | 
| + | 
| +//------------------------------------------------------- | 
| +// Instruction_Set | 
| +//------------------------------------------------------- | 
| +/** | 
| + * As-yet unused base class for instruction sets | 
| + * @constructor | 
| + */ | 
| +var Instruction_Set_class = function() | 
| +{ | 
| +}; | 
| +Instruction_Set_class.prototype.generator = function() | 
| +{ | 
| +    throw new Error( "Must override 'generator' when deriving from Instruction_Set_class" ); | 
| +}; | 
| + | 
| +var Instruction_Set = {}; | 
| +exports.Instruction_Set = Instruction_Set; | 
| + | 
| +//------------------------------------------------------- | 
| +// Instruction_Set.Parsed | 
| +//------------------------------------------------------- | 
| + | 
| +/** | 
| + * An instruction set constructed from a parsed YAML document. | 
| + * | 
| + * @param {Input_class} input | 
| + * @constructor | 
| + */ | 
| +Instruction_Set.Parsed = function( input ) | 
| +{ | 
| +    try | 
| +    { | 
| +        input.load(); | 
| +        this.source = input.value; | 
| +    } | 
| +    finally | 
| +    { | 
| +        input.reset(); | 
| +    } | 
| + | 
| +    this.name = this.source.name; | 
| +    this.instructions = []; | 
| +    let target = this.source.target; | 
| +    let n = target.length; | 
| +    for ( let j = 0 ; j < n ; ++j ) | 
| +    { | 
| +        this.instructions.push( new Default_Instruction( target[ j ] ) ); | 
| +    } | 
| +    this.size = this.instructions.length; | 
| +}; | 
| +Instruction_Set.Parsed.prototype = new Instruction_Set_class(); | 
| + | 
| +/** | 
| + * Yield the instructions of this set one at a time, in the order they were created. | 
| + * <p/> | 
| + * NOTE(review): 'yield' inside a plain 'function' is not a standard ES6 generator; this looks like it relies on | 
| + * SpiderMonkey's legacy generator support. Confirm against the targeted engine. | 
| + */ | 
| +Instruction_Set.Parsed.prototype.generator = function() | 
| +{ | 
| +    // The length is read once, before the first instruction is yielded. | 
| +    let n = this.instructions.length; | 
| +    for ( let j = 0 ; j < n ; ++j ) | 
| +    { | 
| +        yield this.instructions[ j ]; | 
| +    } | 
| +}; | 
| + | 
| +Instruction_Set.Parsed.prototype.toJSON = function() | 
| +{ | 
| +    return { name: this.name }; | 
| +}; | 
| + | 
| +//------------------------------------------------------- | 
| +// Default_Instruction | 
| +//------------------------------------------------------- | 
| +/** | 
| + * The default instruction type. | 
| + * @param {String} target | 
| + * @constructor | 
| + */ | 
| +Default_Instruction = function( target ) | 
| +{ | 
| +    this.target = target; | 
| + | 
| +    /** | 
| +     * Observations array | 
| +     * @type {Array} | 
| +     */ | 
| +    this.observations = []; | 
| +}; | 
| +Default_Instruction.prototype = new Instruction_class(); | 
| + | 
| +//------------------------------------------------------- | 
| +// Observation | 
| +//------------------------------------------------------- | 
| +/** | 
| + * | 
| + * @param filtered | 
| + * @param content_type | 
| + * @param location | 
| + * @param entries | 
| + * @constructor | 
| + */ | 
| +var Observation = function( filtered, content_type, location, entries ) | 
| +{ | 
| +    this.filtered = filtered; | 
| +    this.content_description = Policy.typeDescr[content_type]; | 
| +    this.location = location; | 
| +    this.entries = entries; | 
| +    if ( this.entries.length == 1 ) | 
| +    { | 
| +        let x = this.entries[0]; | 
| +        this.filter = x.entry.filter.text; | 
| +        let windows = x.windows; | 
| +        this.window_locations = []; | 
| +        // Loop is explicit to ensure array order. | 
| +        for ( let i = 0 ; i < windows.length ; ++i ) | 
| +        { | 
| +            this.window_locations.push( windows[i].location.href ); | 
| +        } | 
| +    } | 
| +    else | 
| +    { | 
| +        // Figure out something | 
| +    } | 
| +}; | 
| + | 
| +//noinspection JSUnusedGlobalSymbols | 
| +Observation.prototype.toJSON = function() | 
| +{ | 
| +    return { | 
| +        location: this.location, | 
| +        filtered: this.filtered, | 
| +        content_description: this.content_description, | 
| +        filter: (this.entries.length == 1) ? this.entries[0].entry.filter.text : undefined, | 
| +        window_locations: this.window_locations | 
| +    }; | 
| +}; | 
| + | 
| +/** | 
| + * Comparison function | 
| + * | 
| + * @param {Observation} x | 
| + * @return {number} | 
| + */ | 
| +Observation.prototype.compare = function( x ) | 
| +{ | 
| +    /* | 
| +     * 1. Sort filtered elements before non-filtered ones. | 
| +     */ | 
| +    var a = ( this.filtered ? -1 : 0 ) + ( x.filtered ? 1 : 0 ); | 
| +    if ( a != 0 ) return a; | 
| +    /* | 
| +     * 2. Sort by location, a URL string. | 
| +     */ | 
| +    if ( this.location < x.location ) return -1; | 
| +    if ( this.location > x.location ) return 1; | 
| +    /* | 
| +     * 3. Sort by filter. Because of the way that entries are collected, we check the entry lists as a whole. | 
| +     */ | 
| +    var n = Math.min( this.entries.length, x.entries.length ); | 
| +    for ( let j = 0 ; j < n ; ++j ) | 
| +    { | 
| +        let s1 = this.entries[ j ]; | 
| +        let s2 = x.entries[ j ]; | 
| +        if ( s1 < s2 ) return -1; | 
| +        if ( s1 > s2 ) return 1; | 
| +    } | 
| +    // Assert all entries are equal up to their common length | 
| +    // The longer element is sorted later | 
| +    a = this.entries.length - x.entries.length; | 
| +    if ( a != 0 ) return a; | 
| +    /* | 
| +     * 4. Sort by window chain. | 
| +     */ | 
| +    n = Math.min( this.window_locations.length, x.window_locations.length ); | 
| +    for ( let j = 0 ; j < n ; ++j ) | 
| +    { | 
| +        let s1 = this.window_locations[ j ]; | 
| +        let s2 = x.window_locations[ j ]; | 
| +        if ( s1 < s2 ) return -1; | 
| +        if ( s1 > s2 ) return 1; | 
| +    } | 
| +    return this.window_locations.length - x.window_locations.length; | 
| +}; | 
| + | 
| +/** | 
| + * Equality test. | 
| + * @param x | 
| + */ | 
| +Observation.prototype.equals = function( x ) | 
| +{ | 
| +    return this.compare( x ) == 0; | 
| +}; | 
| + | 
| +/** | 
| + * | 
| + * @param {Observation} a | 
| + * @param {Observation} b | 
| + * @return {number} | 
| + */ | 
| +Observation.cmp = function( a, b ) | 
| +{ | 
| +    return a.compare( b ); | 
| +}; | 
| + | 
| +// Make the Observation constructor available to other modules. | 
| +exports.Observation = Observation; | 
|  |