Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 29338242: Issue 3792 - Fix to support multiprocess firefox (Closed)
Patch Set: address comments Created Sept. 29, 2016, 9:52 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« lib/child/frameScript.js ('K') | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License 2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at 3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/. 4 * http://mozilla.org/MPL/2.0/.
5 */ 5 */
6 6
7 'use strict'; 7 'use strict';
8 8
9 /** 9 /**
10 * @module crawler 10 * @module crawler
11 */ 11 */
12 12
13 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 13 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {});
14 const {Task} = Cu.import("resource://gre/modules/Task.jsm"); 14 const {Task} = Cu.import("resource://gre/modules/Task.jsm");
15 const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {});
15 16
16 function abprequire(module) 17 function abprequire(module)
17 { 18 {
18 let result = {}; 19 let result = {};
19 result.wrappedJSObject = result; 20 result.wrappedJSObject = result;
20 Services.obs.notifyObservers(result, "adblockplus-require", module); 21 Services.obs.notifyObservers(result, "adblockplus-require", module);
21 return result.exports; 22 return result.exports;
22 } 23 }
23 24
24 let {RequestNotifier} = abprequire("requestNotifier"); 25 let {RequestNotifier} = abprequire("requestNotifier");
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
101 * @param {tab} tab 102 * @param {tab} tab
102 */ 103 */
103 releaseTab: function(tab) 104 releaseTab: function(tab)
104 { 105 {
105 // If we are about to close last tab don't close it immediately to keep 106 // If we are about to close last tab don't close it immediately to keep
106 // the window alive. It will be closed when a new tab is created. 107 // the window alive. It will be closed when a new tab is created.
107 if (this._tabs > 1) 108 if (this._tabs > 1)
108 this._browser.removeTab(tab); 109 this._browser.removeTab(tab);
109 else 110 else
110 { 111 {
111 // navigate away from early opened URL 112 // navigate away from early opened URL
Wladimir Palant 2016/09/29 11:44:59 early => previously
sergei 2016/09/29 15:36:22 Done.
112 tab.linkedBrowser.loadURI('about:blank', null, null); 113 tab.linkedBrowser.loadURI('about:blank', null, null);
Wladimir Palant 2016/09/29 11:44:59 Double quotation marks please.
sergei 2016/09/29 15:36:22 Done.
113 this._tabKeepingWindowAlive = tab; 114 this._tabKeepingWindowAlive = tab;
114 } 115 }
115 116
116 this._tabs--; 117 this._tabs--;
117 if (this._resolvers.length && this._tabs < this._maxtabs) 118 if (this._resolvers.length && this._tabs < this._maxtabs)
118 { 119 {
119 this._resolvers.shift()(this._createTab()); 120 this._resolvers.shift()(this._createTab());
120 } 121 }
121 }, 122 },
122 }; 123 };
123 124
124 /** 125 /**
125 * Observes page loads in a particular tabbed browser.
126 *
127 * @param {tabbrowser} browser
128 * The tabbed browser to be observed
129 * @param {int} timeout
130 * Load timeout in milliseconds
131 * @constructor
132 */
133 function LoadListener(browser, timeout)
134 {
135 this._browser = browser;
136 this._deferred = new Map();
137 this._timeout = timeout;
138 browser.addTabsProgressListener(this);
139 }
140 LoadListener.prototype = {
141 /**
142 * Returns a promise that will be resolved when the page in the specified tab
143 * finishes loading. Loading will be stopped if the timeout is reached.
144 *
145 * @param {tab} tab
146 * @result {Promise}
147 */
148 waitForLoad: function(tab)
149 {
150 let deferred = Promise.defer();
151 this._deferred.set(tab.linkedBrowser, deferred);
152
153 tab.ownerDocument.defaultView.setTimeout(function()
154 {
155 tab.linkedBrowser.stop();
156 }, this._timeout);
157
158 return deferred.promise;
159 },
160
161 /**
162 * Deactivates this object.
163 */
164 stop: function()
165 {
166 this._browser.removeTabsProgressListener(this);
167 },
168
169 onStateChange: function(browser, progress, request, flags, status)
170 {
171 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
172 {
173 let deferred = this._deferred.get(browser);
174 if (deferred)
175 {
176 this._deferred.delete(browser);
177
178 let headers = [];
179 if (request instanceof Ci.nsIHttpChannel)
180 {
181 try
182 {
183 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
184 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
185 }
186 catch (e)
187 {
188 // Exceptions are expected here
189 }
190 }
191 deferred.resolve([status, headers]);
192 }
193 }
194 }
195 };
196
197 /**
198 * Once created, this object will make sure all new windows are dismissed 126 * Once created, this object will make sure all new windows are dismissed
199 * immediately. 127 * immediately.
200 * 128 *
201 * @constructor 129 * @constructor
202 */ 130 */
203 function WindowCloser() 131 function WindowCloser()
204 { 132 {
205 Services.obs.addObserver(this, "xul-window-registered", true) 133 Services.obs.addObserver(this, "xul-window-registered", true)
206 } 134 }
207 WindowCloser.prototype = { 135 WindowCloser.prototype = {
(...skipping 14 matching lines...) Expand all
222 if (window.document.documentElement.localName == 'dialog') 150 if (window.document.documentElement.localName == 'dialog')
223 window.document.documentElement.acceptDialog(); 151 window.document.documentElement.acceptDialog();
224 else 152 else
225 window.close(); 153 window.close();
226 }, false); 154 }, false);
227 }, 155 },
228 156
229 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 157 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
230 }; 158 };
231 159
160 function configureFrameScript()
161 {
162 const info = require("info");
163 const frameScriptPath = info.addonRoot + '/lib/child/frameScript.js?' + Math.random() +
164 '&info=' + encodeURIComponent(JSON.stringify(info));
Wladimir Palant 2016/09/29 11:44:59 Double quotation marks please. What's the point o
sergei 2016/09/29 15:36:22 Done.
165 Services.mm.loadFrameScript(frameScriptPath, true);
166
167 onShutdown.add(() =>
168 {
169 Services.mm.removeDelayedFrameScript(frameScriptPath);
170 });
171 }
172
232 /** 173 /**
233 * Starts the crawling session. The crawler opens each URL in a tab and stores 174 * Starts the crawling session. The crawler opens each URL in a tab and stores
234 * the results. 175 * the results.
235 * 176 *
236 * @param {Window} window 177 * @param {Window} window
237 * The browser window we're operating in 178 * The browser window we're operating in
238 * @param {String[]} urls 179 * @param {String[]} urls
239 * URLs to be crawled 180 * URLs to be crawled
240 * @param {int} timeout 181 * @param {int} timeout
241 * Load timeout in milliseconds 182 * Load timeout in milliseconds
242 * @param {int} maxtabs 183 * @param {int} maxtabs
243 * Maximum number of tabs to be opened 184 * Maximum number of tabs to be opened
244 * @param {String} targetURL 185 * @param {String} targetURL
245 * URL that should receive the results 186 * URL that should receive the results
246 * @param {Function} onDone 187 * @param {Function} onDone
247 * The callback which is called after finishing of crawling of all URLs. 188 * The callback which is called after finishing of crawling of all URLs.
248 */ 189 */
249 function run(window, urls, timeout, maxtabs, targetURL, onDone) 190 function run(window, urls, timeout, maxtabs, targetURL, onDone)
250 { 191 {
192 configureFrameScript();
251 new Promise((resolve, reject) => 193 new Promise((resolve, reject) =>
252 { 194 {
253 if (FilterStorage.subscriptions.length > 0) 195 if (FilterStorage.subscriptions.length > 0)
254 { 196 {
255 resolve(); 197 resolve();
256 return; 198 return;
257 } 199 }
258 let onFiltersLoaded = (action, item, newValue, oldValue) => 200 let onFiltersLoaded = (action, item, newValue, oldValue) =>
259 { 201 {
260 if (action == "load") 202 if (action == "load")
(...skipping 20 matching lines...) Expand all
281 * @param {int} maxtabs 223 * @param {int} maxtabs
282 * Maximum number of tabs to be opened 224 * Maximum number of tabs to be opened
283 * @param {String} targetURL 225 * @param {String} targetURL
284 * URL that should receive the results 226 * URL that should receive the results
285 * @param {Function} onDone 227 * @param {Function} onDone
286 * The callback which is called after finishing of all tasks. 228 * The callback which is called after finishing of all tasks.
287 */ 229 */
288 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) 230 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)
289 { 231 {
290 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 232 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
291 let loadListener = new LoadListener(window.getBrowser(), timeout); 233
292 let running = 0; 234 let running = 0;
293 let windowCloser = new WindowCloser(); 235 let windowCloser = new WindowCloser();
294 let taskDone = function() 236 let taskDone = function()
295 { 237 {
296 running--; 238 running--;
297 if (running <= 0) 239 if (running <= 0)
298 { 240 {
299 loadListener.stop();
300 windowCloser.stop(); 241 windowCloser.stop();
301 onDone(); 242 onDone();
302 } 243 }
303 }; 244 };
304 245
305 for (let url of urls) 246 for (let url of urls)
306 { 247 {
307 running++; 248 running++;
308 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) 249 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result)
309 { 250 {
310 let request = new XMLHttpRequest(); 251 let request = new XMLHttpRequest();
311 request.open("POST", targetURL); 252 request.open("POST", targetURL);
312 request.addEventListener("load", taskDone, false); 253 request.addEventListener("load", taskDone, false);
313 request.addEventListener("error", taskDone, false); 254 request.addEventListener("error", taskDone, false);
314 request.send(JSON.stringify(result)); 255 request.send(JSON.stringify(result));
315 }, function(url, exception) 256 }, function(url, exception)
316 { 257 {
317 reportException(exception); 258 reportException(exception);
318 259
319 let request = new XMLHttpRequest(); 260 let request = new XMLHttpRequest();
320 request.open("POST", targetURL); 261 request.open("POST", targetURL);
321 request.addEventListener("load", taskDone, false); 262 request.addEventListener("load", taskDone, false);
322 request.addEventListener("error", taskDone, false); 263 request.addEventListener("error", taskDone, false);
323 request.send(JSON.stringify({ 264 request.send(JSON.stringify({
324 url: url, 265 url: url,
325 startTime: Date.now(), 266 startTime: Date.now(),
326 error: String(exception) 267 error: String(exception)
327 })); 268 }));
328 }.bind(null, url)); 269 }.bind(null, url));
329 } 270 }
330 } 271 }
331 272
332 /** 273 /**
274 * Expects to receive page info gathered in a content process for the specified
275 * `tab`. If there is no relevant message within specified `timeout` then
276 * the result promise is resolved with error object.
277 * @param tab
278 * Tab in which we are interested in
279 * @param {int} timeout
280 * Timeout in milliseconds
281 * @return {Promise} promise which will be resolved with the received page info
282 */
283 function getPageInfo(tab, timeout)
284 {
285 return new Promise((resolve, result) =>
286 {
287 const mm = tab.linkedBrowser.messageManager;
288 let timerID;
289 let onDone = (msg) =>
290 {
291 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone);
292 clearTimeout(timerID);
293 resolve(msg.data);
294 }
295 mm.addMessageListener("abpcrawler:pageInfoGathered", onDone);
296 timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout);
297 });
298 }
299
300 /**
333 * Crawls a URL. This is a generator meant to be used via a Task object. 301 * Crawls a URL. This is a generator meant to be used via a Task object.
334 * 302 *
335 * @param {String} url 303 * @param {String} url
336 * @param {TabAllocator} tabAllocator 304 * @param {TabAllocator} tabAllocator
337 * @param {loadListener} loadListener 305 * @param {int} timeout
306 * Load timeout in milliseconds
338 * @result {Object} 307 * @result {Object}
339 * Crawling result 308 * Crawling result
340 */ 309 */
341 function* crawl_url(url, tabAllocator, loadListener) 310 function* crawl_url(url, tabAllocator, timeout)
342 { 311 {
343 let tab = yield tabAllocator.getTab(); 312 let tab = yield tabAllocator.getTab();
344 let result = {url, requests: []}; 313 let result = {url, requests: []};
345 let requestNotifier; 314 let requestNotifier;
346 try 315 try
347 { 316 {
348 result.startTime = Date.now(); 317 result.startTime = Date.now();
349 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, 318 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID,
350 function(entry, scanComplete) 319 function(entry, scanComplete)
351 { 320 {
352 if (!entry) 321 if (!entry)
353 return; 322 return;
354 let {type: contentType, location, filter} = entry; 323 let {type: contentType, location, filter} = entry;
355 result.requests.push({location, contentType, filter}); 324 result.requests.push({location, contentType, filter});
356 }); 325 });
357 326
358 tab.linkedBrowser.loadURI(url, null, null); 327 tab.linkedBrowser.loadURI(url, null, null);
359 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); 328
329 Object.assign(result, yield getPageInfo(tab, timeout));
330 result.finalUrl = tab.linkedBrowser.currentURI.spec;
360 result.endTime = Date.now(); 331 result.endTime = Date.now();
361 result.finalUrl = tab.linkedBrowser.currentURI.spec;
362
363 let document = tab.linkedBrowser.contentDocument;
364 if (document.documentElement)
365 {
366 try
367 {
368 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
369 canvas.width = document.documentElement.scrollWidth;
370 canvas.height = document.documentElement.scrollHeight;
371
372 let context = canvas.getContext("2d");
373 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
374 result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
375 }
376 catch (e)
377 {
378 reportException(e);
379 result.error = "Capturing screenshot failed: " + e;
380 }
381
382 // TODO: Capture frames as well?
383 let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
384 result.source = serializer.serializeToString(document.documentElement);
385 }
386 } 332 }
387 finally 333 finally
388 { 334 {
389 if (requestNotifier) 335 if (requestNotifier)
390 requestNotifier.shutdown(); 336 requestNotifier.shutdown();
391 tabAllocator.releaseTab(tab); 337 tabAllocator.releaseTab(tab);
392 } 338 }
393 return result; 339 return result;
394 } 340 }
395 341
396 function reportException(e) 342 function reportException(e)
397 { 343 {
398 let stack = ""; 344 let stack = "";
399 if (e && typeof e == "object" && "stack" in e) 345 if (e && typeof e == "object" && "stack" in e)
400 stack = e.stack + "\n"; 346 stack = e.stack + "\n";
401 347
402 Cu.reportError(e); 348 Cu.reportError(e);
403 dump(e + "\n" + stack + "\n"); 349 dump(e + "\n" + stack + "\n");
404 } 350 }
OLDNEW
« lib/child/frameScript.js ('K') | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld