Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/instruction.js

Issue 10233013: Crawler, second version (Closed)
Patch Set: Created April 12, 2013, 1:38 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « lib/encoding.js ('k') | lib/logger.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /**
2 * @fileOverview Instructions are the units of effort for the crawler. This file provides iterators for instructions
3 * that the main crawler loop will then execute.
4 */
5
6 let {Logger} = require( "logger" );
7 let {Storage} = require( "storage" );
8 let {Encoding} = require( "encoding" );
9 let {YAML, YamlParseException} = require( "yaml" );
10
/**
 * Request a module through the "adblockplus-require" observer notification.
 * <p/>
 * A fresh carrier object, exposing itself as 'wrappedJSObject', is sent as the notification subject together with the
 * module name. Whatever ends up in the carrier's 'exports' property is returned. Presumably Adblock Plus fills it in
 * while handling the notification; this is not visible from this file.
 *
 * @param {string} module
 *    Name of the module to request.
 * @return {*}
 *    The 'exports' value left on the carrier; undefined when no observer set one.
 */
function abprequire( module )
{
  let carrier = {};
  carrier.wrappedJSObject = carrier;
  let topic = "adblockplus-require";
  Services.obs.notifyObservers( carrier, topic, module );
  return carrier.exports;
}
18 let {Policy} = abprequire( "contentPolicy" );
19
20 //-------------------------------------------------------
21 // Input
22 //-------------------------------------------------------
/**
 * Abstract base for objects that supply the source text of crawl instructions. Concrete implementations in this file
 * read from a fixed string and from a local file.
 *
 * @property {Object} value
 * @property {string} text
 * @constructor
 */
var Input_class = function()
{
};

/**
 * Bring the input into memory and parse it. Subclasses must override this.
 * <p/>
 * Postcondition: 'this.value' has a parsed copy of the input.
 */
Input_class.prototype.load = function()
{
  var message = "'Input_class.load' is abstract.";
  throw new Error( message );
};

/**
 * Clear the internal storage members so that memory can be reclaimed. Subclasses must override this.
 */
Input_class.prototype.reset = function()
{
  var message = "'Input_class.reset' is abstract.";
  throw new Error( message );
};
51
52 //----------------------------------
53 // Input_String
54 //----------------------------------
/**
 * Input whose source is a fixed, in-memory text.
 *
 * @param text
 *    The YAML source text to parse on load().
 * @constructor
 * @extends {Input_class}
 */
var Input_String = function( text )
{
  this.text = text;
  // Nothing is parsed until load() is called.
  this.value = null;
};
Input_String.prototype = new Input_class();

/**
 * Parse the stored text as YAML and keep the result in 'this.value'.
 */
Input_String.prototype.load = function()
{
  var parsed = YAML.parse( this.text );
  this.value = parsed;
};

/**
 * Drop both the source text and the parsed value.
 */
Input_String.prototype.reset = function()
{
  this.text = null;
  this.value = null;
};
85
86 //----------------------------------
87 // Input_File
88 //----------------------------------
/**
 * Input whose source is a local file containing YAML text encoded as UTF-8.
 *
 * @param {nsIFile} file
 *    The file to read when load() is called.
 * @constructor
 * @extends {Input_class}
 */
var Input_File = function( file )
{
  this.file = file;
  // Initialized for consistency with Input_String; nothing is parsed until load() is called.
  this.value = null;
};
Input_File.prototype = new Input_class();

/**
 * Read the whole file as UTF-8 text and parse it as YAML.
 * <p/>
 * Postcondition: 'this.value' has a parsed copy of the file contents.
 * <p/>
 * The streams are closed even when reading fails; the original exception is still propagated to the caller.
 */
Input_File.prototype.load = function()
{
  var data = "";
  var fstream = Cc["@mozilla.org/network/file-input-stream;1"].createInstance( Ci.nsIFileInputStream );
  var cstream = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance( Ci.nsIConverterInputStream );
  fstream.init( this.file, -1, 0, 0 );
  try
  {
    cstream.init( fstream, "UTF-8", 0, 0 );
    try
    {
      let str = {};
      let read = 0;
      do {
        // Read as much as we can; the text read arrives in str.value. A return of zero means end of input.
        read = cstream.readString( 0xffffffff, str );
        data += str.value;
      } while ( read != 0 );
    }
    finally
    {
      cstream.close();
    }
  }
  finally
  {
    // Covers the case where the converter stream could not be initialized and so never took over the file stream.
    fstream.close();
  }
  this.value = YAML.parse( data );
};

/**
 * Drop the file reference and the parsed value.
 */
Input_File.prototype.reset = function()
{
  this.file = null;
  this.value = null;
};
122
123 //-------------------------------------------------------
124 // Instruction
125 //-------------------------------------------------------
126
/**
 * Base class for crawl instructions.
 *
 * @constructor
 */
var Instruction_class = function()
{
  /**
   * The only universal aspect to crawling is that we are crawling other people's web sites. This field is the URL for
   * a site.
   * @type {String}
   */
  this.target = null;

  /**
   * The operation to perform at the browse site. It serializes to the string "default".
   */
  this.operation = {
    toJSON: function()
    {
      return "default";
    }
  };

  var logger = new Logger( "Instruction_class" );
  this.logger = logger;
  this.log = logger.make_log();
};
154
/**
 * Tell whether this instruction observes every node or only the filtered ones. The base implementation answers
 * "only filtered ones".
 * <p/>
 * Framework function for the observation system. Intended to be overridden by subclasses.
 * @return {boolean}
 */
Instruction_class.prototype.observing_all_nodes = function()
{
  var observes_everything = false;
  return observes_everything;
};
165
/**
 * Append one observation, as the crawler sees it, to this instruction's 'observations' array.
 * <p/>
 * Framework function for the observation system.
 * @param observation
 *    The observation to record.
 */
Instruction_class.prototype.observe_node = function( observation )
{
  var recorded = this.observations;
  recorded.push( observation );
};
176
/**
 * Action at start of executing instruction. Run immediately before the tab is loaded. Records the start timestamp.
 * <p/>
 * Framework function for the observation system.
 * <p/>
 * This function currently has no arguments. The only one that might be relevant is the 'Browser_Tab' instance. It was
 * not chosen as an argument because there's no apparent reason for it. Altering the load behavior should be done by
 * specifying a subclass of 'Browser_Tab' in the instruction.
 */
Instruction_class.prototype.begin = function()
{
  var started_at = Logger.timestamp();
  this.time_start = started_at;
};
190
/**
 * Action at the end of executing an instruction that ran to completion. Marks the instruction as ended well, records
 * the finish timestamp and normalizes the observation list.
 * <p/>
 * Framework function for the observation system.
 */
Instruction_class.prototype.end = function()
{
  this.ended_well = true;
  this.time_finish = Logger.timestamp();
  this.termination = "completed";     // May alter to "cancelled" or "aborted".

  /*
   * Sort the observation array in place. Equal observations are then adjacent, so dropping every element that
   * equals its predecessor removes the duplicates.
   */
  var sorted = this.observations;
  sorted.sort( Observation.cmp );
  if ( sorted.length < 2 )
  {
    // Nothing to merge; the sorted array stays as it is.
    return;
  }
  this.observations = sorted.filter( function( current, index )
  {
    return index == 0 || !sorted[ index - 1 ].equals( current );
  } );
};
221
/**
 * Abort the instruction prematurely. This marks the instruction as not having ended well, records the finish
 * timestamp and stores the reason it stopped.
 *
 * @param termination
 *    Description of why the instruction was terminated; reported by toJSON() in place of the observations.
 */
Instruction_class.prototype.abort = function( termination )
{
  this.ended_well = false;
  var stopped_at = Logger.timestamp();
  this.time_finish = stopped_at;
  this.termination = termination;
};
231
//noinspection JSUnusedGlobalSymbols
/**
 * The return value of toJSON() defines which result fields are emitted for this instruction.
 * <p/>
 * The target, operation and both timestamps are always present. An instruction that ended well additionally reports
 * its observations; any other instruction reports its termination reason instead.
 *
 * @returns {*}
 */
Instruction_class.prototype.toJSON = function()
{
  var summary = {};
  summary.target = this.target;
  summary.operation = this.operation;
  summary.time_start = this.time_start;
  summary.time_finish = this.time_finish;
  try
  {
    if ( !this.ended_well )
    {
      summary.termination = this.termination;
    }
    else
    {
      summary.observations = this.observations;
    }
  }
  catch ( e )
  {
    // Report the failure in the output rather than letting serialization fail.
    summary.termination = "Unexpected exception: " + e.message;
  }
  return summary;
};
262
263 //-------------------------------------------------------
264 // Instruction_Set
265 //-------------------------------------------------------
/**
 * As-yet unused base class for instruction sets.
 * @constructor
 */
var Instruction_Set_class = function()
{
};

/**
 * Produce an iterator over the instructions in the set. Abstract; derived classes must override it.
 */
Instruction_Set_class.prototype.generator = function()
{
  throw new Error( "Must override 'generator' when deriving from Instruction_Set_class" );
};

/**
 * Namespace object that holds the concrete instruction set classes.
 */
var Instruction_Set = {};
279
280 //-------------------------------------------------------
281 // Instruction_Set.Parsed
282 //-------------------------------------------------------
283
/**
 * An instruction set constructed from a parsed YAML document.
 * <p/>
 * The input is loaded, its parsed value is kept as 'this.source', and the input is reset whether or not loading
 * succeeded. The set's name comes from the document's 'name' member. Each element of its 'target' member becomes one
 * default instruction.
 *
 * @param {Input_class} input
 * @constructor
 */
Instruction_Set.Parsed = function( input )
{
  try
  {
    input.load();
    this.source = input.value;
  }
  finally
  {
    // Release the input's storage regardless of whether the load worked.
    input.reset();
  }

  this.name = this.source.name;
  this.instructions = [];
  let targets = this.source.target;
  let count = targets.length;
  let index = 0;
  while ( index < count )
  {
    this.instructions.push( new Default_Instruction( targets[ index ] ) );
    ++index;
  }
  /**
   * The number of instructions in this set.
   * @type {number}
   */
  this.size = this.instructions.length;
};
Instruction_Set.Parsed.prototype = new Instruction_Set_class();
317
318 Instruction_Set.Parsed.prototype.generator = function()
319 {
320 let n = this.instructions.length;
321 for ( let j = 0 ; j < n ; ++j )
322 {
323 yield this.instructions[ j ];
324 }
325 };
326
/**
 * Serialize the instruction set. Only its name is emitted.
 *
 * @return {{name: *}}
 */
Instruction_Set.Parsed.prototype.toJSON = function()
{
  var summary = {};
  summary.name = this.name;
  return summary;
};
331
332 //-------------------------------------------------------
333 // Default_Instruction
334 //-------------------------------------------------------
/**
 * The default instruction type.
 * <p/>
 * The base constructor is not invoked for each instance. The prototype is a single 'Instruction_class' instance, so
 * members that constructor sets up are inherited from that shared prototype object.
 *
 * @param {String} target
 *    URL of the site to crawl.
 * @constructor
 * @extends {Instruction_class}
 */
var Default_Instruction = function( target )
{
  this.target = target;

  /**
   * Observations array
   * @type {Array}
   */
  this.observations = [];
};
Default_Instruction.prototype = new Instruction_class();
351
352 //-------------------------------------------------------
353 // Observation
354 //-------------------------------------------------------
/**
 * A single observation made while crawling.
 * <p/>
 * When there is exactly one entry, its filter text and the locations of its windows are copied into 'filter' and
 * 'window_locations'. For any other number of entries those two members are left unset.
 *
 * @param filtered
 *    Whether the observed node was filtered.
 * @param content_type
 *    Key into 'Policy.typeDescr'; the description found there is stored as 'content_description'.
 * @param location
 *    Location of the observed node.
 * @param entries
 *    List of entries; each one is read as { entry: { filter: { text } }, windows: [...] }.
 * @constructor
 */
var Observation = function( filtered, content_type, location, entries )
{
  this.filtered = filtered;
  this.content_description = Policy.typeDescr[content_type];
  this.location = location;
  this.entries = entries;
  if ( this.entries.length != 1 )
  {
    // Figure out something
    return;
  }

  let only = this.entries[0];
  this.filter = only.entry.filter.text;
  let chain = only.windows;
  this.window_locations = [];
  // Loop is explicit to ensure array order.
  let index = 0;
  while ( index < chain.length )
  {
    this.window_locations.push( chain[index].location.href );
    ++index;
  }
};
386
//noinspection JSUnusedGlobalSymbols
/**
 * Define the fields emitted when an observation is serialized.
 * <p/>
 * The 'filter' field is the filter text of the sole entry. It is undefined when the observation does not have
 * exactly one entry.
 *
 * @return {Object}
 */
Observation.prototype.toJSON = function()
{
  var filter_text;
  if ( this.entries.length == 1 )
  {
    filter_text = this.entries[0].entry.filter.text;
  }
  return {
    location: this.location,
    filtered: this.filtered,
    content_description: this.content_description,
    filter: filter_text,
    window_locations: this.window_locations
  };
};
398
/**
 * Comparison function. Orders observations by, in turn:
 * <ol>
 * <li>filtered observations before non-filtered ones,</li>
 * <li>location, a URL string,</li>
 * <li>the entry lists, element by element on filter text, with the shorter list first on a tie,</li>
 * <li>the window chains, element by element, with the shorter chain first on a tie.</li>
 * </ol>
 * Entries are compared on their filter text because applying '<' or '>' to two plain objects never distinguishes
 * them. A missing 'window_locations' member is treated as an empty chain.
 *
 * @param {Observation} x
 * @return {number}
 *    Negative when this observation sorts before 'x', positive when after, zero when they are equivalent.
 */
Observation.prototype.compare = function( x )
{
  /** Three-way comparison of two primitive keys. */
  function three_way( left, right )
  {
    if ( left < right ) return -1;
    if ( left > right ) return 1;
    return 0;
  }

  /** Sort key of an entry: its filter text when it has the expected shape, otherwise the entry itself. */
  function entry_key( item )
  {
    return ( item && item.entry && item.entry.filter ) ? item.entry.filter.text : item;
  }

  /** Sort key of a window location: the value itself. */
  function plain_key( item )
  {
    return item;
  }

  /** Compare two lists on the keys of their elements; when one is a prefix of the other, the longer sorts later. */
  function compare_lists( left, right, key )
  {
    var n = Math.min( left.length, right.length );
    for ( let j = 0 ; j < n ; ++j )
    {
      let c = three_way( key( left[ j ] ), key( right[ j ] ) );
      if ( c != 0 ) return c;
    }
    return left.length - right.length;
  }

  /*
   * 1. Sort filtered elements before non-filtered ones.
   */
  var a = ( this.filtered ? -1 : 0 ) + ( x.filtered ? 1 : 0 );
  if ( a != 0 ) return a;
  /*
   * 2. Sort by location, a URL string.
   */
  a = three_way( this.location, x.location );
  if ( a != 0 ) return a;
  /*
   * 3. Sort by filter. Because of the way that entries are collected, we check the entry lists as a whole.
   */
  a = compare_lists( this.entries, x.entries, entry_key );
  if ( a != 0 ) return a;
  /*
   * 4. Sort by window chain.
   */
  return compare_lists( this.window_locations || [], x.window_locations || [], plain_key );
};
445
/**
 * Equality test. Two observations are equal when compare() finds no ordering difference between them.
 * @param x
 *    The observation to compare against.
 * @return {boolean}
 */
Observation.prototype.equals = function( x )
{
  var difference = this.compare( x );
  return difference == 0;
};
454
/**
 * Two-argument form of the comparison, suitable for passing to Array.prototype.sort().
 *
 * @param {Observation} a
 * @param {Observation} b
 * @return {number}
 *    The result of a.compare( b ).
 */
Observation.cmp = function( a, b )
{
  var ordering = a.compare( b );
  return ordering;
};
465
466 //-------------------------------------------------------
467 // exports
468 //-------------------------------------------------------
469 exports.Input_String = Input_String;
470 exports.Input_File = Input_File;
471 exports.Instruction_Set = Instruction_Set;
472 exports.Observation = Observation;
OLDNEW
« no previous file with comments | « lib/encoding.js ('k') | lib/logger.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld