OLD | NEW |
(Empty) | |
| 1 /** |
| 2 * @fileOverview Instructions are the units of effort for the crawler. This file
provides iterators for instructions |
| 3 * that the main crawler loop will then execute. |
| 4 */ |
| 5 |
| 6 let {Logger} = require( "logger" ); |
| 7 let {Storage} = require( "storage" ); |
| 8 let {Encoding} = require( "encoding" ); |
| 9 let {YAML, YamlParseException} = require( "yaml" ); |
| 10 |
/**
 * Ask for a module by broadcasting an "adblockplus-require" notification.
 * <p/>
 * A fresh request object is passed as the notification subject and the module name as its data. Whatever ends up in
 * the 'exports' property of the request object after the notification returns is handed back to the caller.
 * Presumably an Adblock Plus observer fills in 'exports' -- confirm against the ABP side.
 *
 * @param {string} module
 *    Name of the module to request.
 * @return {*}
 *    The 'exports' property of the request object; undefined if no observer set it.
 */
function abprequire( module )
{
  var request = {};
  // The request refers to itself through 'wrappedJSObject' before it is handed to the observer service.
  request.wrappedJSObject = request;
  Services.obs.notifyObservers( request, "adblockplus-require", module );
  var exported = request.exports;
  return exported;
}
| 18 let {Policy} = abprequire( "contentPolicy" ); |
| 19 |
| 20 //------------------------------------------------------- |
| 21 // Input |
| 22 //------------------------------------------------------- |
/**
 * Abstract base for objects that supply the source text of crawl instructions. The concrete implementations in this
 * file read from a fixed string and from a local file.
 *
 * @property {Object} value
 * @property {string} text
 * @constructor
 */
var Input_class = function()
{
};

/**
 * Bring the input into memory and parse it. Abstract here; this base version always throws.
 * <p/>
 * Postcondition (for implementations): 'this.value' holds the parsed input.
 */
Input_class.prototype.load = function()
{
  var message = "'Input_class.load' is abstract.";
  throw new Error( message );
};

/**
 * Clear the internal storage members so that memory can be released and the garbage collector assisted. Abstract
 * here; this base version always throws.
 */
Input_class.prototype.reset = function()
{
  var message = "'Input_class.reset' is abstract.";
  throw new Error( message );
};
| 51 |
| 52 //---------------------------------- |
| 53 // Input_String |
| 54 //---------------------------------- |
/**
 * Input whose source is a fixed text held in memory.
 *
 * @param text
 *    The YAML source text.
 * @constructor
 * @extends {Input_class}
 */
var Input_String = function( text )
{
  this.text = text;
  // Nothing has been parsed yet.
  this.value = null;
};
Input_String.prototype = new Input_class();

/**
 * Parse the stored text as YAML and keep the result in 'this.value'.
 */
Input_String.prototype.load = function()
{
  var parsed = YAML.parse( this.text );
  this.value = parsed;
};

/**
 * Drop both the source text and the parsed value.
 */
Input_String.prototype.reset = function()
{
  this.value = null;
  this.text = null;
};
| 85 |
| 86 //---------------------------------- |
| 87 // Input_File |
| 88 //---------------------------------- |
/**
 * Input whose source is a local file containing YAML text.
 *
 * @param {nsIFile} file
 *    The file to read when 'load' is called.
 * @constructor
 * @extends {Input_class}
 */
var Input_File = function( file )
{
  this.file = file;
  // Nothing has been parsed yet; mirrors the initial state of 'Input_String'.
  this.value = null;
};
Input_File.prototype = new Input_class();

/**
 * Read the whole file as UTF-8 text and parse it as YAML.
 * <p/>
 * Postcondition: 'this.value' has a parsed copy of the file contents.
 * <p/>
 * The stream is closed even when reading fails, so an I/O error does not leave the file open. Any exception
 * raised while opening, reading or parsing propagates to the caller.
 */
Input_File.prototype.load = function()
{
  var fstream = Cc["@mozilla.org/network/file-input-stream;1"].createInstance( Ci.nsIFileInputStream );
  var cstream = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance( Ci.nsIConverterInputStream );
  var data = "";
  var converter_ready = false;

  fstream.init( this.file, -1, 0, 0 );
  try
  {
    cstream.init( fstream, "UTF-8", 0, 0 );
    converter_ready = true;

    let str = {};
    let read = 0;
    do {
      // Read as much as we can; the text read lands in 'str.value'.
      read = cstream.readString( 0xffffffff, str );
      data += str.value;
    } while ( read != 0 );
  }
  finally
  {
    if ( converter_ready )
    {
      // Same single close as before, now also reached on failure.
      // NOTE(review): relies on the converter's close() also closing the wrapped file stream -- confirm.
      cstream.close();
    }
    else
    {
      // The converter never took over the file stream, so close the file stream directly.
      fstream.close();
    }
  }
  this.value = YAML.parse( data );
};

/**
 * Drop the file reference and the parsed value.
 */
Input_File.prototype.reset = function()
{
  this.file = null;
  this.value = null;
};
| 122 |
| 123 //------------------------------------------------------- |
| 124 // Instruction |
| 125 //------------------------------------------------------- |
| 126 |
/**
 * Instruction base class.
 * <p/>
 * Note that the constructor does not create an 'observations' array; 'observe_node' and 'end' expect the subclass
 * to have provided one.
 *
 * @constructor
 */
var Instruction_class = function()
{
  /**
   * The only universal aspect to crawling is that we are crawling other people's web sites. This field is the URL
   * for a site.
   * @type {String}
   */
  this.target = null;

  /**
   * The operation to perform at the browse site.
   */
  this.operation = {};

  // The operation serializes to the plain string "default" in JSON output.
  this.operation.toJSON = function()
  {
    return "default";
  };

  this.logger = new Logger( "Instruction_class" );
  this.log = this.logger.make_log();
};

/**
 * Predicate about whether this instruction observes all nodes or only filtered ones.
 * <p/>
 * Framework function for the observation system. Intended to be overridden by subclasses.
 * @return {boolean}
 *    Always false in this base implementation.
 */
Instruction_class.prototype.observing_all_nodes = function()
{
  return false;
};

/**
 * Record an observation as the crawler sees it.
 * <p/>
 * Framework function for the observation system.
 * @param observation
 *    Appended to 'this.observations'.
 */
Instruction_class.prototype.observe_node = function( observation )
{
  this.observations.push( observation );
};

/**
 * Action at start of executing instruction. Run immediately before the tab is loaded.
 * <p/>
 * Framework function for the observation system.
 * <p/>
 * This function currently has no arguments. The only one that might be relevant is the 'Browser_Tab' instance. It
 * was not chosen as an argument because there's no apparent reason for it. Altering the load behavior should be
 * done by specifying a subclass of 'Browser_Tab' in the instruction.
 */
Instruction_class.prototype.begin = function()
{
  this.time_start = Logger.timestamp();
};

/**
 * Action at the normal end of executing an instruction.
 * <p/>
 * Framework function for the observation system. Marks the instruction as having ended well, records the finish
 * time, and leaves 'this.observations' sorted by 'Observation.cmp' with duplicates removed.
 */
Instruction_class.prototype.end = function()
{
  this.ended_well = true;
  this.time_finish = Logger.timestamp();
  this.termination = "completed"; // May alter to "cancelled" or "aborted".

  /*
   * Sort the observation array, then keep only the first of each run of equal neighbors. Sorting first is what
   * makes duplicates adjacent.
   */
  this.observations.sort( Observation.cmp );
  this.observations = this.observations.filter( function( current, index, sorted )
  {
    return index == 0 || !sorted[ index - 1 ].equals( current );
  } );
};

/**
 * Abort the instruction prematurely. Marks the instruction as not having ended well and records the finish time.
 *
 * @param termination
 *    Description of why the instruction stopped; reported by 'toJSON' in place of the observations.
 */
Instruction_class.prototype.abort = function( termination )
{
  this.ended_well = false;
  this.time_finish = Logger.timestamp();
  this.termination = termination;
};

//noinspection JSUnusedGlobalSymbols
/**
 * The return value of toJSON() defines which result fields are emitted for this instruction.
 * <p/>
 * Target, operation and both timestamps are always present. An instruction that ended well additionally reports
 * its observations; any other instruction reports its termination instead.
 *
 * @returns {*}
 */
Instruction_class.prototype.toJSON = function()
{
  var r = {
    target: this.target,
    operation: this.operation,
    time_start: this.time_start,
    time_finish: this.time_finish
  };
  try
  {
    if ( this.ended_well )
    {
      r.observations = this.observations;
    }
    else
    {
      r.termination = this.termination;
    }
  }
  catch ( e )
  {
    // Defensive: serialization reports the problem instead of throwing.
    r.termination = "Unexpected exception: " + e.message;
  }
  return r;
};
| 262 |
| 263 //------------------------------------------------------- |
| 264 // Instruction_Set |
| 265 //------------------------------------------------------- |
/**
 * Base class for instruction sets. Not yet used for anything beyond supplying the abstract 'generator' member.
 * @constructor
 */
var Instruction_Set_class = function()
{
};

/**
 * Abstract. Derived classes must replace this; the base version always throws.
 */
Instruction_Set_class.prototype.generator = function()
{
  var message = "Must override 'generator' when deriving from Instruction_Set_class";
  throw new Error( message );
};

/**
 * Namespace object that holds the concrete instruction-set constructors.
 */
var Instruction_Set = {};
| 279 |
| 280 //------------------------------------------------------- |
| 281 // Instruction_Set.Parsed |
| 282 //------------------------------------------------------- |
| 283 |
/**
 * An instruction set built from a parsed YAML document.
 * <p/>
 * The input is loaded, its parsed value is kept as 'this.source', and the input is reset again whether or not
 * loading succeeded. One 'Default_Instruction' is then created for each element of the document's 'target' member.
 *
 * @param {Input_class} input
 *    Source of the document; it is always reset before the constructor returns or throws.
 * @constructor
 */
Instruction_Set.Parsed = function( input )
{
  try
  {
    input.load();
    this.source = input.value;
  }
  finally
  {
    // Release whatever the input holds, even when loading failed.
    input.reset();
  }

  this.name = this.source.name;
  this.instructions = [];

  let urls = this.source.target;
  for ( let index = 0, count = urls.length ; index < count ; index += 1 )
  {
    let instruction = new Default_Instruction( urls[ index ] );
    this.instructions.push( instruction );
  }

  /**
   * The number of instructions in this set.
   * @type {number}
   */
  this.size = this.instructions.length;
};
Instruction_Set.Parsed.prototype = new Instruction_Set_class();
| 317 |
| 318 Instruction_Set.Parsed.prototype.generator = function() |
| 319 { |
| 320 let n = this.instructions.length; |
| 321 for ( let j = 0 ; j < n ; ++j ) |
| 322 { |
| 323 yield this.instructions[ j ]; |
| 324 } |
| 325 }; |
| 326 |
/**
 * JSON representation of the instruction set: only its name is emitted.
 *
 * @return {Object}
 */
Instruction_Set.Parsed.prototype.toJSON = function()
{
  var summary = {};
  summary.name = this.name;
  return summary;
};
| 331 |
| 332 //------------------------------------------------------- |
| 333 // Default_Instruction |
| 334 //------------------------------------------------------- |
/**
 * The default instruction type.
 * <p/>
 * The base constructor is not invoked per instance. Members set up by 'Instruction_class' (such as 'operation',
 * 'logger' and 'log') are therefore inherited from the single prototype object created below and are shared by
 * all instances. 'target' and 'observations' are per-instance.
 *
 * @param {String} target
 *    URL of the site this instruction crawls.
 * @constructor
 * @extends {Instruction_class}
 */
// Declared with 'var' so that the definition does not depend on implicit global creation, which is an error in
// strict-mode code.
var Default_Instruction = function( target )
{
  this.target = target;

  /**
   * Observations array
   * @type {Array}
   */
  this.observations = [];
};
Default_Instruction.prototype = new Instruction_class();
| 351 |
| 352 //------------------------------------------------------- |
| 353 // Observation |
| 354 //------------------------------------------------------- |
/**
 * A single observation made by the crawler.
 * <p/>
 * When there is exactly one entry, its filter text is stored as 'this.filter' and the 'location.href' of each of
 * its windows is stored, in order, in 'this.window_locations'. For any other number of entries those two members
 * are left unset.
 *
 * @param filtered
 *    Truthy if the observed node was filtered; filtered observations sort first.
 * @param content_type
 *    Key into 'Policy.typeDescr'; the looked-up description is stored as 'content_description'.
 * @param location
 *    Location of the observed node; compared as a string when sorting.
 * @param entries
 *    Array of entry records. The single-entry case reads 'entry.filter.text' and 'windows' from the record.
 * @constructor
 */
var Observation = function( filtered, content_type, location, entries )
{
  this.filtered = filtered;
  this.content_description = Policy.typeDescr[content_type];
  this.location = location;
  this.entries = entries;
  if ( this.entries.length == 1 )
  {
    let x = this.entries[0];
    this.filter = x.entry.filter.text;
    let windows = x.windows;
    this.window_locations = [];
    // Loop is explicit to ensure array order.
    for ( let i = 0 ; i < windows.length ; ++i )
    {
      this.window_locations.push( windows[i].location.href );
    }
  }
  else
  {
    // Figure out something
  }
};

//noinspection JSUnusedGlobalSymbols
/**
 * JSON representation. 'filter' is only defined for single-entry observations; 'window_locations' is whatever the
 * constructor stored (unset unless there was exactly one entry).
 *
 * @return {Object}
 */
Observation.prototype.toJSON = function()
{
  return {
    location: this.location,
    filtered: this.filtered,
    content_description: this.content_description,
    filter: (this.entries.length == 1) ? this.entries[0].entry.filter.text : undefined,
    window_locations: this.window_locations
  };
};

/**
 * Comparison function. Orders by, in turn: filtered before non-filtered, location, filter texts of the entry
 * lists, and window chain.
 *
 * @param {Observation} x
 * @return {number}
 *    Negative if this observation sorts before 'x', positive if after, zero if they are equivalent.
 */
Observation.prototype.compare = function( x )
{
  /*
   * The entry records are objects, so relational operators applied to them directly compare nothing useful.
   * Compare the filter text instead; a record without one counts as the empty string.
   */
  var filter_text = function( item )
  {
    return ( item && item.entry && item.entry.filter ) ? item.entry.filter.text : "";
  };

  /*
   * 1. Sort filtered elements before non-filtered ones.
   */
  var a = ( this.filtered ? -1 : 0 ) + ( x.filtered ? 1 : 0 );
  if ( a != 0 ) return a;
  /*
   * 2. Sort by location, a URL string.
   */
  if ( this.location < x.location ) return -1;
  if ( this.location > x.location ) return 1;
  /*
   * 3. Sort by filter. Because of the way that entries are collected, we check the entry lists as a whole.
   */
  var n = Math.min( this.entries.length, x.entries.length );
  for ( let j = 0 ; j < n ; ++j )
  {
    let s1 = filter_text( this.entries[ j ] );
    let s2 = filter_text( x.entries[ j ] );
    if ( s1 < s2 ) return -1;
    if ( s1 > s2 ) return 1;
  }
  // Assert all entries are equal up to their common length
  // The longer element is sorted later
  a = this.entries.length - x.entries.length;
  if ( a != 0 ) return a;
  /*
   * 4. Sort by window chain. The constructor only records window locations for single-entry observations, so a
   * missing chain is treated as an empty one.
   */
  var mine = this.window_locations || [];
  var theirs = x.window_locations || [];
  n = Math.min( mine.length, theirs.length );
  for ( let j = 0 ; j < n ; ++j )
  {
    let s1 = mine[ j ];
    let s2 = theirs[ j ];
    if ( s1 < s2 ) return -1;
    if ( s1 > s2 ) return 1;
  }
  return mine.length - theirs.length;
};

/**
 * Equality test: two observations are equal when 'compare' reports no ordering between them.
 * @param x
 * @return {boolean}
 */
Observation.prototype.equals = function( x )
{
  return this.compare( x ) == 0;
};

/**
 * Two-argument form of 'compare', suitable for passing to 'Array.prototype.sort'.
 *
 * @param {Observation} a
 * @param {Observation} b
 * @return {number}
 */
Observation.cmp = function( a, b )
{
  return a.compare( b );
};
| 465 |
| 466 //------------------------------------------------------- |
| 467 // exports |
| 468 //------------------------------------------------------- |
| 469 exports.Input_String = Input_String; |
| 470 exports.Input_File = Input_File; |
| 471 exports.Instruction_Set = Instruction_Set; |
| 472 exports.Observation = Observation; |
OLD | NEW |