Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 29338242: Issue 3792 - Fix to support multiprocess firefox (Closed)
Patch Set: fix Created Sept. 29, 2016, 8:59 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« lib/child/frameScript.js ('K') | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License 2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at 3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/. 4 * http://mozilla.org/MPL/2.0/.
5 */ 5 */
6 6
7 "use strict"; 7 "use strict";
8 8
9 /** 9 /**
10 * @module crawler 10 * @module crawler
11 */ 11 */
12 12
13 const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); 13 const {Services} = Cu.import("resource://gre/modules/Services.jsm", {});
14 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 14 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {});
15 const {Task} = Cu.import("resource://gre/modules/Task.jsm", {}); 15 const {Task} = Cu.import("resource://gre/modules/Task.jsm", {});
16 const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {});
16 17
17 function abprequire(module) 18 function abprequire(module)
18 { 19 {
19 let result = {}; 20 let result = {};
20 result.wrappedJSObject = result; 21 result.wrappedJSObject = result;
21 Services.obs.notifyObservers(result, "adblockplus-require", module); 22 Services.obs.notifyObservers(result, "adblockplus-require", module);
22 return result.exports; 23 return result.exports;
23 } 24 }
24 25
25 let {RequestNotifier} = abprequire("requestNotifier"); 26 let {RequestNotifier} = abprequire("requestNotifier");
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
102 * @param {tab} tab 103 * @param {tab} tab
103 */ 104 */
104 releaseTab: function(tab) 105 releaseTab: function(tab)
105 { 106 {
106 // If we are about to close last tab don't close it immediately to keep 107 // If we are about to close last tab don't close it immediately to keep
107 // the window alive. It will be closed when a new tab is created. 108 // the window alive. It will be closed when a new tab is created.
108 if (this._tabs > 1) 109 if (this._tabs > 1)
109 this._browser.removeTab(tab); 110 this._browser.removeTab(tab);
110 else 111 else
111 { 112 {
112 // navigate away from early opened URL 113 // navigate away from previously opened URL
113 tab.linkedBrowser.loadURI('about:blank', null, null); 114 tab.linkedBrowser.loadURI("about:blank", null, null);
114 this._tabKeepingWindowAlive = tab; 115 this._tabKeepingWindowAlive = tab;
115 } 116 }
116 117
117 this._tabs--; 118 this._tabs--;
118 if (this._resolvers.length && this._tabs < this._maxtabs) 119 if (this._resolvers.length && this._tabs < this._maxtabs)
119 { 120 {
120 this._resolvers.shift()(this._createTab()); 121 this._resolvers.shift()(this._createTab());
121 } 122 }
122 }, 123 },
123 }; 124 };
124 125
125 /** 126 /**
126 * Observes page loads in a particular tabbed browser.
127 *
128 * @param {tabbrowser} browser
129 * The tabbed browser to be observed
130 * @param {int} timeout
131 * Load timeout in milliseconds
132 * @constructor
133 */
134 function LoadListener(browser, timeout)
135 {
136 this._browser = browser;
137 this._deferred = new Map();
138 this._timeout = timeout;
139 browser.addTabsProgressListener(this);
140 }
141 LoadListener.prototype = {
142 /**
143 * Returns a promise that will be resolved when the page in the specified tab
144 * finishes loading. Loading will be stopped if the timeout is reached.
145 *
146 * @param {tab} tab
147 * @result {Promise}
148 */
149 waitForLoad: function(tab)
150 {
151 return new Promise((resolve, reject) =>
152 {
153 this._deferred.set(tab.linkedBrowser, resolve);
154
155 tab.ownerDocument.defaultView.setTimeout(function()
156 {
157 tab.linkedBrowser.stop();
158 }, this._timeout);
159 });
160 },
161
162 /**
163 * Deactivates this object.
164 */
165 stop: function()
166 {
167 this._browser.removeTabsProgressListener(this);
168 },
169
170 onStateChange: function(browser, progress, request, flags, status)
171 {
172 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
173 {
174 let resolve = this._deferred.get(browser);
175 if (resolve)
176 {
177 this._deferred.delete(browser);
178
179 let headers = [];
180 if (request instanceof Ci.nsIHttpChannel)
181 {
182 try
183 {
184 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
185 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
186 }
187 catch (e)
188 {
189 // Exceptions are expected here
190 }
191 }
192 resolve([status, headers]);
193 }
194 }
195 }
196 };
197
198 /**
199 * Once created, this object will make sure all new windows are dismissed 127 * Once created, this object will make sure all new windows are dismissed
200 * immediately. 128 * immediately.
201 * 129 *
202 * @constructor 130 * @constructor
203 */ 131 */
204 function WindowCloser() 132 function WindowCloser()
205 { 133 {
206 Services.obs.addObserver(this, "xul-window-registered", true) 134 Services.obs.addObserver(this, "xul-window-registered", true)
207 } 135 }
208 WindowCloser.prototype = { 136 WindowCloser.prototype = {
(...skipping 14 matching lines...) Expand all
223 if (window.document.documentElement.localName == 'dialog') 151 if (window.document.documentElement.localName == 'dialog')
224 window.document.documentElement.acceptDialog(); 152 window.document.documentElement.acceptDialog();
225 else 153 else
226 window.close(); 154 window.close();
227 }, false); 155 }, false);
228 }, 156 },
229 157
230 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 158 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
231 }; 159 };
232 160
161 function configureFrameScript()
162 {
163 const info = require("info");
164 let frameScriptPath = info.addonRoot + "/lib/child/frameScript.js";
165 Services.mm.loadFrameScript(frameScriptPath, true);
166
167 onShutdown.add(() =>
168 {
169 Services.mm.removeDelayedFrameScript(frameScriptPath);
170 });
171 }
172
233 /** 173 /**
234 * Starts the crawling session. The crawler opens each URL in a tab and stores 174 * Starts the crawling session. The crawler opens each URL in a tab and stores
235 * the results. 175 * the results.
236 * 176 *
237 * @param {Window} window 177 * @param {Window} window
238 * The browser window we're operating in 178 * The browser window we're operating in
239 * @param {String[]} urls 179 * @param {String[]} urls
240 * URLs to be crawled 180 * URLs to be crawled
241 * @param {int} timeout 181 * @param {int} timeout
242 * Load timeout in milliseconds 182 * Load timeout in milliseconds
243 * @param {int} maxtabs 183 * @param {int} maxtabs
244 * Maximum number of tabs to be opened 184 * Maximum number of tabs to be opened
245 * @param {String} targetURL 185 * @param {String} targetURL
246 * URL that should receive the results 186 * URL that should receive the results
247 * @param {Function} onDone 187 * @param {Function} onDone
248 * The callback which is called after finishing of crawling of all URLs. 188 * The callback which is called after finishing of crawling of all URLs.
249 */ 189 */
250 function run(window, urls, timeout, maxtabs, targetURL, onDone) 190 function run(window, urls, timeout, maxtabs, targetURL, onDone)
251 { 191 {
192 configureFrameScript();
252 new Promise((resolve, reject) => 193 new Promise((resolve, reject) =>
253 { 194 {
254 if (FilterStorage.subscriptions.length > 0) 195 if (FilterStorage.subscriptions.length > 0)
255 { 196 {
256 resolve(); 197 resolve();
257 return; 198 return;
258 } 199 }
259 let onFiltersLoaded = (action, item, newValue, oldValue) => 200 let onFiltersLoaded = (action, item, newValue, oldValue) =>
260 { 201 {
261 if (action == "load") 202 if (action == "load")
(...skipping 20 matching lines...) Expand all
282 * @param {int} maxtabs 223 * @param {int} maxtabs
283 * Maximum number of tabs to be opened 224 * Maximum number of tabs to be opened
284 * @param {String} targetURL 225 * @param {String} targetURL
285 * URL that should receive the results 226 * URL that should receive the results
286 * @param {Function} onDone 227 * @param {Function} onDone
287 * The callback which is called after finishing of all tasks. 228 * The callback which is called after finishing of all tasks.
288 */ 229 */
289 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) 230 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)
290 { 231 {
291 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 232 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
292 let loadListener = new LoadListener(window.getBrowser(), timeout); 233
293 let running = 0; 234 let running = 0;
294 let windowCloser = new WindowCloser(); 235 let windowCloser = new WindowCloser();
295 let taskDone = function() 236 let taskDone = function()
296 { 237 {
297 running--; 238 running--;
298 if (running <= 0) 239 if (running <= 0)
299 { 240 {
300 loadListener.stop();
301 windowCloser.stop(); 241 windowCloser.stop();
302 onDone(); 242 onDone();
303 } 243 }
304 }; 244 };
305 245
306 for (let url of urls) 246 for (let url of urls)
307 { 247 {
308 running++; 248 running++;
309 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) 249 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result)
310 { 250 {
311 let request = new XMLHttpRequest(); 251 let request = new XMLHttpRequest();
312 request.open("POST", targetURL); 252 request.open("POST", targetURL);
313 request.addEventListener("load", taskDone, false); 253 request.addEventListener("load", taskDone, false);
314 request.addEventListener("error", taskDone, false); 254 request.addEventListener("error", taskDone, false);
315 request.send(JSON.stringify(result)); 255 request.send(JSON.stringify(result));
316 }, function(url, exception) 256 }, function(url, exception)
317 { 257 {
318 reportException(exception); 258 reportException(exception);
319 259
320 let request = new XMLHttpRequest(); 260 let request = new XMLHttpRequest();
321 request.open("POST", targetURL); 261 request.open("POST", targetURL);
322 request.addEventListener("load", taskDone, false); 262 request.addEventListener("load", taskDone, false);
323 request.addEventListener("error", taskDone, false); 263 request.addEventListener("error", taskDone, false);
324 request.send(JSON.stringify({ 264 request.send(JSON.stringify({
325 url: url, 265 url: url,
326 startTime: Date.now(), 266 startTime: Date.now(),
327 error: String(exception) 267 error: String(exception)
328 })); 268 }));
329 }.bind(null, url)); 269 }.bind(null, url));
330 } 270 }
331 } 271 }
332 272
333 /** 273 /**
274 * Expects to receive page info gathered in a content process for the specified
275 * `tab`. If there is no relevant message within specified `timeout` then
276 * the result promise is resolved with error object.
277 * @param tab
278 * Tab in which we are interested in
279 * @param {int} timeout
280 * Timeout in milliseconds
281 * @return {Promise} promise which will be resolved with the received page info
282 */
283 function getPageInfo(tab, timeout)
284 {
285 return new Promise((resolve, result) =>
286 {
287 let mm = tab.linkedBrowser.messageManager;
288 let timerID;
289 let onDone = (msg) =>
290 {
291 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone);
292 clearTimeout(timerID);
293 resolve(msg.data);
294 }
295 mm.addMessageListener("abpcrawler:pageInfoGathered", onDone);
296 timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout);
297 });
298 }
299
300 /**
334 * Crawls a URL. This is a generator meant to be used via a Task object. 301 * Crawls a URL. This is a generator meant to be used via a Task object.
335 * 302 *
336 * @param {String} url 303 * @param {String} url
337 * @param {TabAllocator} tabAllocator 304 * @param {TabAllocator} tabAllocator
338 * @param {loadListener} loadListener 305 * @param {int} timeout
306 * Load timeout in milliseconds
339 * @result {Object} 307 * @result {Object}
340 * Crawling result 308 * Crawling result
341 */ 309 */
342 function* crawl_url(url, tabAllocator, loadListener) 310 function* crawl_url(url, tabAllocator, timeout)
343 { 311 {
344 let tab = yield tabAllocator.getTab(); 312 let tab = yield tabAllocator.getTab();
345 let result = {url, requests: []}; 313 let result = {url, requests: []};
346 let requestNotifier; 314 let requestNotifier;
347 try 315 try
348 { 316 {
349 result.startTime = Date.now(); 317 result.startTime = Date.now();
350 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, 318 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID,
351 function(entry, scanComplete) 319 function(entry, scanComplete)
352 { 320 {
353 if (!entry) 321 if (!entry)
354 return; 322 return;
355 let {type: contentType, location, filter} = entry; 323 let {type: contentType, location, filter} = entry;
356 result.requests.push({location, contentType, filter}); 324 result.requests.push({location, contentType, filter});
357 }); 325 });
358 326
359 tab.linkedBrowser.loadURI(url, null, null); 327 tab.linkedBrowser.loadURI(url, null, null);
360 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); 328
329 Object.assign(result, yield getPageInfo(tab, timeout));
330 result.finalUrl = tab.linkedBrowser.currentURI.spec;
361 result.endTime = Date.now(); 331 result.endTime = Date.now();
362 result.finalUrl = tab.linkedBrowser.currentURI.spec;
363
364 let document = tab.linkedBrowser.contentDocument;
365 if (document.documentElement)
366 {
367 try
368 {
369 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
370 canvas.width = document.documentElement.scrollWidth;
371 canvas.height = document.documentElement.scrollHeight;
372
373 let context = canvas.getContext("2d");
374 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
375 result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
376 }
377 catch (e)
378 {
379 reportException(e);
380 result.error = "Capturing screenshot failed: " + e;
381 }
382
383 // TODO: Capture frames as well?
384 let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
385 result.source = serializer.serializeToString(document.documentElement);
386 }
387 } 332 }
388 finally 333 finally
389 { 334 {
390 if (requestNotifier) 335 if (requestNotifier)
391 requestNotifier.shutdown(); 336 requestNotifier.shutdown();
392 tabAllocator.releaseTab(tab); 337 tabAllocator.releaseTab(tab);
393 } 338 }
394 return result; 339 return result;
395 } 340 }
396 341
397 function reportException(e) 342 function reportException(e)
398 { 343 {
399 let stack = ""; 344 let stack = "";
400 if (e && typeof e == "object" && "stack" in e) 345 if (e && typeof e == "object" && "stack" in e)
401 stack = e.stack + "\n"; 346 stack = e.stack + "\n";
402 347
403 Cu.reportError(e); 348 Cu.reportError(e);
404 dump(e + "\n" + stack + "\n"); 349 dump(e + "\n" + stack + "\n");
405 } 350 }
OLDNEW
« lib/child/frameScript.js ('K') | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld