OLD | NEW |
(Empty) | |
| 1 /** |
| 2 * @fileOverview Instructions are the units of effort for the crawler. This file
provides iterators for instructions |
| 3 * that the main crawler loop will then execute. |
| 4 */ |
| 5 |
| 6 let {Logger} = require( "logger" ); |
| 7 let {Storage} = require( "storage" ); |
| 8 let {Encoding} = require( "encoding" ); |
| 9 let {YAML, YamlParseException} = require( "yaml" ); |
| 10 |
/**
 * Request a module from Adblock Plus through the observer service.
 * <p/>
 * A carrier object is sent with the "adblockplus-require" notification, together with the module name. The function
 * then hands back whatever was left in the carrier's 'exports' property.
 *
 * @param {string} module
 *    Name of the module to request.
 * @return {*}
 *    The value found in 'exports' after the notification.
 * @throws {Error}
 *    If 'exports' was not populated, so that the failure names the missing module instead of surfacing later as a
 *    destructuring error on 'undefined'.
 */
function abprequire( module )
{
  let result = {};
  // NOTE(review): presumably lets the receiving observer unwrap the carrier to the plain JS object - confirm.
  result.wrappedJSObject = result;
  Services.obs.notifyObservers( result, "adblockplus-require", module );
  if ( typeof result.exports === "undefined" )
  {
    throw new Error( "Adblock Plus did not provide the module '" + module + "'." );
  }
  return result.exports;
}
| 18 let {Policy} = abprequire( "contentPolicy" ); |
| 19 |
| 20 //------------------------------------------------------- |
| 21 // Input |
| 22 //------------------------------------------------------- |
/**
 * Abstract source of crawl-instruction text. Concrete subclasses in this file read from a fixed string or from a
 * local file.
 *
 * @property {Object} value
 *    Parsed form of the input, available once 'load' has run.
 * @property {string} text
 *    Raw text of the input.
 * @constructor
 */
var Input_class = function()
{
};

/**
 * Abstract. Implementations bring the input into memory and parse it.
 * <p/>
 * Postcondition: 'this.value' has a parsed copy of the input.
 */
Input_class.prototype.load = function()
{
  throw new Error( "'Input_class.load' is abstract." );
};

/**
 * Abstract. Implementations clear their internal storage members, releasing memory and helping the garbage
 * collector.
 */
Input_class.prototype.reset = function()
{
  throw new Error( "'Input_class.reset' is abstract." );
};
| 51 |
| 52 //---------------------------------- |
| 53 // Input_String |
| 54 //---------------------------------- |
/**
 * Input whose source is a fixed text supplied by the caller.
 *
 * @param text
 *    The text that 'load' will parse.
 * @constructor
 * @extends {Input_class}
 */
var Input_String = function( text )
{
  this.value = null;
  this.text = text;
};
Input_String.prototype = new Input_class();

/**
 * Parse the stored text as YAML and keep the result in 'this.value'.
 */
Input_String.prototype.load = function()
{
  var parsed = YAML.parse( this.text );
  this.value = parsed;
};

/**
 * Drop both the raw text and the parsed value.
 */
Input_String.prototype.reset = function()
{
  this.value = null;
  this.text = null;
};
| 85 |
| 86 //---------------------------------- |
| 87 // Input_File |
| 88 //---------------------------------- |
/**
 * Input whose source is a local file.
 *
 * @param {nsIFile} file
 *    The file that 'load' will read.
 * @constructor
 * @extends {Input_class}
 */
var Input_File = function( file )
{
  this.file = file;
  // Start with no parsed value, in the same way that 'Input_String' does.
  this.value = null;
};
Input_File.prototype = new Input_class();

/**
 * Read the whole file as UTF-8 text and parse it as YAML.
 * <p/>
 * Postcondition: 'this.value' has a parsed copy of the file contents.
 * <p/>
 * The stream is closed even when reading fails; the original error still propagates to the caller.
 */
Input_File.prototype.load = function()
{
  var data = "";
  var fstream = Cc["@mozilla.org/network/file-input-stream;1"].createInstance( Ci.nsIFileInputStream );
  var cstream = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance( Ci.nsIConverterInputStream );
  fstream.init( this.file, -1, 0, 0 );
  var attached = false;
  try
  {
    cstream.init( fstream, "UTF-8", 0, 0 );
    attached = true;
    let str = {};
    let read = 0;
    do
    {
      read = cstream.readString( 0xffffffff, str );   // read as much as we can and put it in str.value
      data += str.value;
    } while ( read !== 0 );
  }
  finally
  {
    // NOTE(review): closing the converter is assumed to also close the file stream it wraps, which the previous
    // version of this function relied upon - confirm. If the converter never got attached, close the file stream.
    if ( attached )
    {
      cstream.close();
    }
    else
    {
      fstream.close();
    }
  }
  this.value = YAML.parse( data );
};

/**
 * Drop the file reference and the parsed value.
 */
Input_File.prototype.reset = function()
{
  this.file = null;
  this.value = null;
};
| 122 |
| 123 //---------------------------------- |
| 124 // exports for Input |
| 125 //---------------------------------- |
| 126 exports.Input_String = Input_String; |
| 127 exports.Input_File = Input_File; |
| 128 |
| 129 //------------------------------------------------------- |
| 130 // Instruction |
| 131 //------------------------------------------------------- |
| 132 |
| 133 var Instruction = exports.Instruction = {}; |
| 134 |
/**
 * Instruction base class.
 * <p/>
 * The constructor does not create 'this.observations'; the methods 'observe_node', 'end' and 'toJSON' use it, so a
 * derived constructor has to provide it.
 *
 * @constructor
 */
var Instruction_class = function()
{
  /**
   * The only universal aspect to crawling is that we are crawling other people's web sites. This field is the URL
   * for a site.
   * @type {String}
   */
  this.target = null;

  /**
   * The operation to perform at the browse site.
   */
  this.operation = {};

  // The operation serializes as the plain string "default".
  this.operation.toJSON = function()
  {
    return "default";
  };

  this.logger = new Logger( "Instruction_class" );
  this.log = this.logger.make_log();
};

/**
 * Predicate about whether this instruction observes all nodes or only filtered ones.
 * <p/>
 * Framework function for the observation system. Intended to be overridden by subclasses.
 * @return {boolean}
 *    Always false in this base implementation.
 */
Instruction_class.prototype.observing_all_nodes = function()
{
  return false;
};

/**
 * Record an observation as the crawler sees it.
 * <p/>
 * Framework function for the observation system.
 * @param observation
 *    The observation to append to 'this.observations'.
 */
Instruction_class.prototype.observe_node = function( observation )
{
  this.observations.push( observation );
};

/**
 * Action at start of executing instruction. Run immediately before the tab is loaded.
 * <p/>
 * Framework function for the observation system. Records the start time in 'this.time_start'.
 * <p/>
 * This function currently has no arguments. The only one that might be relevant is the 'Browser_Tab' instance. It
 * was not chosen as an argument because there's no apparent reason for it. Altering the load behavior should be done
 * by specifying a subclass of 'Browser_Tab' in the instruction.
 */
Instruction_class.prototype.begin = function()
{
  this.time_start = Logger.timestamp();
};

/**
 * Action at the end of executing an instruction.
 * <p/>
 * Framework function for the observation system. Records the finish time, marks the instruction as completed, and
 * then sorts the observations and removes duplicates.
 */
Instruction_class.prototype.end = function()
{
  this.time_finish = Logger.timestamp();
  this.termination = "completed";     // May alter to "cancelled" or "aborted".

  /*
   * Sort the observation array and merge to remove duplicates. After sorting, equal observations are adjacent, so
   * it suffices to keep each element that differs from its immediate predecessor.
   */
  this.observations.sort( Observation.cmp );
  if ( this.observations.length >= 2 )
  {
    this.observations = this.observations.filter( function( current, index, sorted )
    {
      return index === 0 || !sorted[ index - 1 ].equals( current );
    } );
  }
};

/**
 * Serialization hook for JSON. Note that 'termination' is not part of the output.
 *
 * @return {Object}
 *    An object with the target, operation, start time, observations and finish time.
 */
//noinspection JSUnusedGlobalSymbols
Instruction_class.prototype.toJSON = function()
{
  return {
    target: this.target,
    operation: this.operation,
    time_start: this.time_start,
    observations: this.observations,
    time_finish: this.time_finish
  };
};
| 240 |
| 241 //------------------------------------------------------- |
| 242 // Instruction_Set |
| 243 //------------------------------------------------------- |
/**
 * Base class for instruction sets. Derived classes supply the 'generator' method.
 * @constructor
 */
var Instruction_Set_class = function()
{
};

/**
 * Placeholder that reports a missing override.
 */
Instruction_Set_class.prototype.generator = function()
{
  var message = "Must override 'generator' when deriving from Instruction_Set_class";
  throw new Error( message );
};
| 255 |
| 256 var Instruction_Set = {}; |
| 257 exports.Instruction_Set = Instruction_Set; |
| 258 |
| 259 //------------------------------------------------------- |
| 260 // Instruction_Set.Parsed |
| 261 //------------------------------------------------------- |
| 262 |
| 263 /** |
| 264 * An instruction set constructed from a parsed YAML document. |
| 265 * |
| 266 * @param {Input_class} input |
| 267 * @constructor |
| 268 */ |
| 269 Instruction_Set.Parsed = function( input ) |
| 270 { |
| 271 try |
| 272 { |
| 273 input.load(); |
| 274 this.source = input.value; |
| 275 } |
| 276 finally |
| 277 { |
| 278 input.reset(); |
| 279 } |
| 280 |
| 281 this.name = this.source.name; |
| 282 this.instructions = []; |
| 283 let target = this.source.target; |
| 284 let n = target.length; |
| 285 for ( let j = 0 ; j < n ; ++j ) |
| 286 { |
| 287 this.instructions.push( new Default_Instruction( target[ j ] ) ); |
| 288 } |
| 289 this.size = this.instructions.length; |
| 290 }; |
| 291 Instruction_Set.Parsed.prototype = new Instruction_Set_class(); |
| 292 |
| 293 Instruction_Set.Parsed.prototype.generator = function() |
| 294 { |
| 295 let n = this.instructions.length; |
| 296 for ( let j = 0 ; j < n ; ++j ) |
| 297 { |
| 298 yield this.instructions[ j ]; |
| 299 } |
| 300 }; |
| 301 |
| 302 Instruction_Set.Parsed.prototype.toJSON = function() |
| 303 { |
| 304 return { name: this.name }; |
| 305 }; |
| 306 |
| 307 //------------------------------------------------------- |
| 308 // Default_Instruction |
| 309 //------------------------------------------------------- |
/**
 * The default instruction type.
 * <p/>
 * Declared with 'var' so that the name is a binding of this file rather than an implicit global, which would be an
 * error in strict code.
 *
 * @param {String} target
 *    URL of the site to crawl.
 * @constructor
 * @extends {Instruction_class}
 */
var Default_Instruction = function( target )
{
  this.target = target;

  /**
   * Observations array
   * @type {Array}
   */
  this.observations = [];
};
Default_Instruction.prototype = new Instruction_class();
| 326 |
| 327 //------------------------------------------------------- |
| 328 // Observation |
| 329 //------------------------------------------------------- |
/**
 * Extract the filter text of a single entry for use as a sort key.
 *
 * @param entry
 *    An element of an observation's 'entries' array.
 * @return {string}
 *    The filter text, or the empty string if the entry carries no filter.
 */
function observation_entry_text( entry )
{
  var filter = entry && entry.entry && entry.entry.filter;
  return filter ? String( filter.text ) : "";
}

/**
 * Compare two arrays of strings element by element; if one is a prefix of the other, the longer one sorts later.
 *
 * @param {Array} a
 * @param {Array} b
 * @return {number}
 *    Negative, zero or positive, in the manner of a sort comparison function.
 */
function compare_string_sequences( a, b )
{
  var n = Math.min( a.length, b.length );
  for ( let j = 0 ; j < n ; ++j )
  {
    if ( a[ j ] < b[ j ] ) return -1;
    if ( a[ j ] > b[ j ] ) return 1;
  }
  return a.length - b.length;
}

/**
 * A single observation made by the crawler.
 * <p/>
 * When there is exactly one entry, its filter text and the locations of its windows are copied into 'filter' and
 * 'window_locations'. For any other number of entries those two members are left unset.
 *
 * @param filtered
 *    Whether the observed node was filtered.
 * @param content_type
 *    Index into 'Policy.typeDescr', used to obtain the content description.
 * @param location
 *    Location of the observed node; compared as a URL string when sorting.
 * @param entries
 *    Array of entries; each one is read as '{ entry: { filter: { text } }, windows: [ ... ] }'.
 * @constructor
 */
var Observation = function( filtered, content_type, location, entries )
{
  this.filtered = filtered;
  this.content_description = Policy.typeDescr[content_type];
  this.location = location;
  this.entries = entries;
  if ( this.entries.length == 1 )
  {
    let x = this.entries[0];
    this.filter = x.entry.filter.text;
    let windows = x.windows;
    this.window_locations = [];
    // Loop is explicit to ensure array order.
    for ( let i = 0 ; i < windows.length ; ++i )
    {
      this.window_locations.push( windows[i].location.href );
    }
  }
  else
  {
    // Figure out something
  }
};

/**
 * Serialization hook for JSON.
 *
 * @return {Object}
 *    Location, filtered flag, content description, filter text (only when there is exactly one entry) and window
 *    locations.
 */
//noinspection JSUnusedGlobalSymbols
Observation.prototype.toJSON = function()
{
  return {
    location: this.location,
    filtered: this.filtered,
    content_description: this.content_description,
    filter: (this.entries.length == 1) ? this.entries[0].entry.filter.text : undefined,
    window_locations: this.window_locations
  };
};

/**
 * Comparison function
 *
 * @param {Observation} x
 * @return {number}
 *    Negative if this observation sorts before 'x', positive if after, zero if the two are equivalent.
 */
Observation.prototype.compare = function( x )
{
  /*
   * 1. Sort filtered elements before non-filtered ones.
   */
  var a = ( this.filtered ? -1 : 0 ) + ( x.filtered ? 1 : 0 );
  if ( a != 0 ) return a;
  /*
   * 2. Sort by location, a URL string.
   */
  if ( this.location < x.location ) return -1;
  if ( this.location > x.location ) return 1;
  /*
   * 3. Sort by filter. Because of the way that entries are collected, we check the entry lists as a whole. The
   * entries are objects, which relational operators cannot order, so the comparison is made on their filter texts.
   */
  a = compare_string_sequences(
    this.entries.map( observation_entry_text ),
    x.entries.map( observation_entry_text )
  );
  if ( a != 0 ) return a;
  /*
   * 4. Sort by window chain. The constructor only sets 'window_locations' for single-entry observations, so a
   * missing chain is treated as an empty one.
   */
  return compare_string_sequences( this.window_locations || [], x.window_locations || [] );
};

/**
 * Equality test.
 * @param x
 * @return {boolean}
 *    True when 'compare' finds no difference between this observation and 'x'.
 */
Observation.prototype.equals = function( x )
{
  return this.compare( x ) == 0;
};

/**
 * Comparison in the two-argument form expected by 'Array.prototype.sort'.
 *
 * @param {Observation} a
 * @param {Observation} b
 * @return {number}
 */
Observation.cmp = function( a, b )
{
  return a.compare( b );
};
| 440 |
| 441 exports.Observation = Observation; |
OLD | NEW |