Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 29338242: Issue 3792 - Fix to support multiprocess firefox (Closed)
Patch Set: address comments Created Sept. 29, 2016, 3:33 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License 2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at 3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/. 4 * http://mozilla.org/MPL/2.0/.
5 */ 5 */
6 6
7 "use strict"; 7 "use strict";
8 8
9 /** 9 /**
10 * @module crawler 10 * @module crawler
11 */ 11 */
12 12
13 const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); 13 const {Services} = Cu.import("resource://gre/modules/Services.jsm", {});
14 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 14 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {});
15 const {Task} = Cu.import("resource://gre/modules/Task.jsm", {}); 15 const {Task} = Cu.import("resource://gre/modules/Task.jsm", {});
16 const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {});
16 17
17 function abprequire(module) 18 function abprequire(module)
18 { 19 {
19 let result = {}; 20 let result = {};
20 result.wrappedJSObject = result; 21 result.wrappedJSObject = result;
21 Services.obs.notifyObservers(result, "adblockplus-require", module); 22 Services.obs.notifyObservers(result, "adblockplus-require", module);
22 return result.exports; 23 return result.exports;
23 } 24 }
24 25
25 let {RequestNotifier} = abprequire("requestNotifier"); 26 let {RequestNotifier} = abprequire("requestNotifier");
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
102 * @param {tab} tab 103 * @param {tab} tab
103 */ 104 */
104 releaseTab: function(tab) 105 releaseTab: function(tab)
105 { 106 {
106 // If we are about to close last tab don't close it immediately to keep 107 // If we are about to close last tab don't close it immediately to keep
107 // the window alive. It will be closed when a new tab is created. 108 // the window alive. It will be closed when a new tab is created.
108 if (this._tabs > 1) 109 if (this._tabs > 1)
109 this._browser.removeTab(tab); 110 this._browser.removeTab(tab);
110 else 111 else
111 { 112 {
112 // navigate away from early opened URL 113 // navigate away from previously opened URL
113 tab.linkedBrowser.loadURI('about:blank', null, null); 114 tab.linkedBrowser.loadURI("about:blank", null, null);
114 this._tabKeepingWindowAlive = tab; 115 this._tabKeepingWindowAlive = tab;
115 } 116 }
116 117
117 this._tabs--; 118 this._tabs--;
118 if (this._resolvers.length && this._tabs < this._maxtabs) 119 if (this._resolvers.length && this._tabs < this._maxtabs)
119 { 120 {
120 this._resolvers.shift()(this._createTab()); 121 this._resolvers.shift()(this._createTab());
121 } 122 }
122 }, 123 },
123 }; 124 };
124 125
125 /** 126 /**
126 * Observes page loads in a particular tabbed browser.
127 *
128 * @param {tabbrowser} browser
129 * The tabbed browser to be observed
130 * @param {int} timeout
131 * Load timeout in milliseconds
132 * @constructor
133 */
134 function LoadListener(browser, timeout)
135 {
136 this._browser = browser;
137 this._deferred = new Map();
138 this._timeout = timeout;
139 browser.addTabsProgressListener(this);
140 }
141 LoadListener.prototype = {
142 /**
143 * Returns a promise that will be resolved when the page in the specified tab
144 * finishes loading. Loading will be stopped if the timeout is reached.
145 *
146 * @param {tab} tab
147 * @result {Promise}
148 */
149 waitForLoad: function(tab)
150 {
151 let deferred = Promise.defer();
152 this._deferred.set(tab.linkedBrowser, deferred);
153
154 tab.ownerDocument.defaultView.setTimeout(function()
155 {
156 tab.linkedBrowser.stop();
157 }, this._timeout);
158
159 return deferred.promise;
160 },
161
162 /**
163 * Deactivates this object.
164 */
165 stop: function()
166 {
167 this._browser.removeTabsProgressListener(this);
168 },
169
170 onStateChange: function(browser, progress, request, flags, status)
171 {
172 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
173 {
174 let deferred = this._deferred.get(browser);
175 if (deferred)
176 {
177 this._deferred.delete(browser);
178
179 let headers = [];
180 if (request instanceof Ci.nsIHttpChannel)
181 {
182 try
183 {
184 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
185 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
186 }
187 catch (e)
188 {
189 // Exceptions are expected here
190 }
191 }
192 deferred.resolve([status, headers]);
193 }
194 }
195 }
196 };
197
198 /**
199 * Once created, this object will make sure all new windows are dismissed 127 * Once created, this object will make sure all new windows are dismissed
200 * immediately. 128 * immediately.
201 * 129 *
202 * @constructor 130 * @constructor
203 */ 131 */
204 function WindowCloser() 132 function WindowCloser()
205 { 133 {
206 Services.obs.addObserver(this, "xul-window-registered", true) 134 Services.obs.addObserver(this, "xul-window-registered", true)
207 } 135 }
208 WindowCloser.prototype = { 136 WindowCloser.prototype = {
(...skipping 14 matching lines...) Expand all
223 if (window.document.documentElement.localName == 'dialog') 151 if (window.document.documentElement.localName == 'dialog')
224 window.document.documentElement.acceptDialog(); 152 window.document.documentElement.acceptDialog();
225 else 153 else
226 window.close(); 154 window.close();
227 }, false); 155 }, false);
228 }, 156 },
229 157
230 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 158 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
231 }; 159 };
232 160
161 function configureFrameScript()
162 {
163 const frameScriptPath = info.addonRoot + "/lib/child/frameScript.js";
164 Services.mm.loadFrameScript(frameScriptPath, true);
165
166 onShutdown.add(() =>
167 {
168 Services.mm.removeDelayedFrameScript(frameScriptPath);
169 });
170 }
171
233 /** 172 /**
234 * Starts the crawling session. The crawler opens each URL in a tab and stores 173 * Starts the crawling session. The crawler opens each URL in a tab and stores
235 * the results. 174 * the results.
236 * 175 *
237 * @param {Window} window 176 * @param {Window} window
238 * The browser window we're operating in 177 * The browser window we're operating in
239 * @param {String[]} urls 178 * @param {String[]} urls
240 * URLs to be crawled 179 * URLs to be crawled
241 * @param {int} timeout 180 * @param {int} timeout
242 * Load timeout in milliseconds 181 * Load timeout in milliseconds
243 * @param {int} maxtabs 182 * @param {int} maxtabs
244 * Maximum number of tabs to be opened 183 * Maximum number of tabs to be opened
245 * @param {String} targetURL 184 * @param {String} targetURL
246 * URL that should receive the results 185 * URL that should receive the results
247 * @param {Function} onDone 186 * @param {Function} onDone
248 * The callback which is called after finishing of crawling of all URLs. 187 * The callback which is called after finishing of crawling of all URLs.
249 */ 188 */
250 function run(window, urls, timeout, maxtabs, targetURL, onDone) 189 function run(window, urls, timeout, maxtabs, targetURL, onDone)
251 { 190 {
191 configureFrameScript();
252 new Promise((resolve, reject) => 192 new Promise((resolve, reject) =>
253 { 193 {
254 if (FilterStorage.subscriptions.length > 0) 194 if (FilterStorage.subscriptions.length > 0)
255 { 195 {
256 resolve(); 196 resolve();
257 return; 197 return;
258 } 198 }
259 let onFiltersLoaded = (action, item, newValue, oldValue) => 199 let onFiltersLoaded = (action, item, newValue, oldValue) =>
260 { 200 {
261 if (action == "load") 201 if (action == "load")
(...skipping 20 matching lines...) Expand all
282 * @param {int} maxtabs 222 * @param {int} maxtabs
283 * Maximum number of tabs to be opened 223 * Maximum number of tabs to be opened
284 * @param {String} targetURL 224 * @param {String} targetURL
285 * URL that should receive the results 225 * URL that should receive the results
286 * @param {Function} onDone 226 * @param {Function} onDone
287 * The callback which is called after finishing of all tasks. 227 * The callback which is called after finishing of all tasks.
288 */ 228 */
289 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) 229 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)
290 { 230 {
291 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 231 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
292 let loadListener = new LoadListener(window.getBrowser(), timeout); 232
293 let running = 0; 233 let running = 0;
294 let windowCloser = new WindowCloser(); 234 let windowCloser = new WindowCloser();
295 let taskDone = function() 235 let taskDone = function()
296 { 236 {
297 running--; 237 running--;
298 if (running <= 0) 238 if (running <= 0)
299 { 239 {
300 loadListener.stop();
301 windowCloser.stop(); 240 windowCloser.stop();
302 onDone(); 241 onDone();
303 } 242 }
304 }; 243 };
305 244
306 for (let url of urls) 245 for (let url of urls)
307 { 246 {
308 running++; 247 running++;
309 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) 248 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result)
310 { 249 {
311 let request = new XMLHttpRequest(); 250 let request = new XMLHttpRequest();
312 request.open("POST", targetURL); 251 request.open("POST", targetURL);
313 request.addEventListener("load", taskDone, false); 252 request.addEventListener("load", taskDone, false);
314 request.addEventListener("error", taskDone, false); 253 request.addEventListener("error", taskDone, false);
315 request.send(JSON.stringify(result)); 254 request.send(JSON.stringify(result));
316 }, function(url, exception) 255 }, function(url, exception)
317 { 256 {
318 reportException(exception); 257 reportException(exception);
319 258
320 let request = new XMLHttpRequest(); 259 let request = new XMLHttpRequest();
321 request.open("POST", targetURL); 260 request.open("POST", targetURL);
322 request.addEventListener("load", taskDone, false); 261 request.addEventListener("load", taskDone, false);
323 request.addEventListener("error", taskDone, false); 262 request.addEventListener("error", taskDone, false);
324 request.send(JSON.stringify({ 263 request.send(JSON.stringify({
325 url: url, 264 url: url,
326 startTime: Date.now(), 265 startTime: Date.now(),
327 error: String(exception) 266 error: String(exception)
328 })); 267 }));
329 }.bind(null, url)); 268 }.bind(null, url));
330 } 269 }
331 } 270 }
332 271
333 /** 272 /**
273 * Expects to receive page info gathered in a content process for the specified
274 * `tab`. If there is no relevant message within specified `timeout` then
275 * the result promise is resolved with error object.
276 * @param tab
277 * Tab in which we are interested in
278 * @param {int} timeout
279 * Timeout in milliseconds
280 * @return {Promise} promise which will be resolved with the received page info
281 */
282 function getPageInfo(tab, timeout)
283 {
284 return new Promise((resolve, result) =>
285 {
286 const mm = tab.linkedBrowser.messageManager;
287 let timerID;
288 let onDone = (msg) =>
289 {
290 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone);
291 clearTimeout(timerID);
292 resolve(msg.data);
293 }
294 mm.addMessageListener("abpcrawler:pageInfoGathered", onDone);
295 timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout);
296 });
297 }
298
299 /**
334 * Crawls a URL. This is a generator meant to be used via a Task object. 300 * Crawls a URL. This is a generator meant to be used via a Task object.
335 * 301 *
336 * @param {String} url 302 * @param {String} url
337 * @param {TabAllocator} tabAllocator 303 * @param {TabAllocator} tabAllocator
338 * @param {loadListener} loadListener 304 * @param {int} timeout
305 * Load timeout in milliseconds
339 * @result {Object} 306 * @result {Object}
340 * Crawling result 307 * Crawling result
341 */ 308 */
342 function* crawl_url(url, tabAllocator, loadListener) 309 function* crawl_url(url, tabAllocator, timeout)
343 { 310 {
344 let tab = yield tabAllocator.getTab(); 311 let tab = yield tabAllocator.getTab();
345 let result = {url, requests: []}; 312 let result = {url, requests: []};
346 let requestNotifier; 313 let requestNotifier;
347 try 314 try
348 { 315 {
349 result.startTime = Date.now(); 316 result.startTime = Date.now();
350 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, 317 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID,
351 function(entry, scanComplete) 318 function(entry, scanComplete)
352 { 319 {
353 if (!entry) 320 if (!entry)
354 return; 321 return;
355 let {type: contentType, location, filter} = entry; 322 let {type: contentType, location, filter} = entry;
356 result.requests.push({location, contentType, filter}); 323 result.requests.push({location, contentType, filter});
357 }); 324 });
358 325
359 tab.linkedBrowser.loadURI(url, null, null); 326 tab.linkedBrowser.loadURI(url, null, null);
360 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); 327
328 Object.assign(result, yield getPageInfo(tab, timeout));
329 result.finalUrl = tab.linkedBrowser.currentURI.spec;
361 result.endTime = Date.now(); 330 result.endTime = Date.now();
362 result.finalUrl = tab.linkedBrowser.currentURI.spec;
363
364 let document = tab.linkedBrowser.contentDocument;
365 if (document.documentElement)
366 {
367 try
368 {
369 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
370 canvas.width = document.documentElement.scrollWidth;
371 canvas.height = document.documentElement.scrollHeight;
372
373 let context = canvas.getContext("2d");
374 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
375 result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
376 }
377 catch (e)
378 {
379 reportException(e);
380 result.error = "Capturing screenshot failed: " + e;
381 }
382
383 // TODO: Capture frames as well?
384 let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
385 result.source = serializer.serializeToString(document.documentElement);
386 }
387 } 331 }
388 finally 332 finally
389 { 333 {
390 if (requestNotifier) 334 if (requestNotifier)
391 requestNotifier.shutdown(); 335 requestNotifier.shutdown();
392 tabAllocator.releaseTab(tab); 336 tabAllocator.releaseTab(tab);
393 } 337 }
394 return result; 338 return result;
395 } 339 }
396 340
397 function reportException(e) 341 function reportException(e)
398 { 342 {
399 let stack = ""; 343 let stack = "";
400 if (e && typeof e == "object" && "stack" in e) 344 if (e && typeof e == "object" && "stack" in e)
401 stack = e.stack + "\n"; 345 stack = e.stack + "\n";
402 346
403 Cu.reportError(e); 347 Cu.reportError(e);
404 dump(e + "\n" + stack + "\n"); 348 dump(e + "\n" + stack + "\n");
405 } 349 }
OLDNEW
« no previous file with comments | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld