Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/crawler.js

Issue 29338242: Issue 3792 - Fix to support multiprocess firefox (Closed)
Patch Set: Created March 14, 2016, 2:41 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« lib/child/frameScript.js ('K') | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License 2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at 3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/. 4 * http://mozilla.org/MPL/2.0/.
5 */ 5 */
6 6
7 /** 7 /**
8 * @module crawler 8 * @module crawler
9 */ 9 */
10 10
11 Cu.import("resource://gre/modules/Services.jsm"); 11 Cu.import("resource://gre/modules/Services.jsm");
12 Cu.import("resource://gre/modules/Task.jsm"); 12 Cu.import("resource://gre/modules/Task.jsm");
13 Cu.import("resource://gre/modules/Promise.jsm"); 13 Cu.import("resource://gre/modules/Promise.jsm");
14 Cu.import("resource://gre/modules/Timer.jsm");
14 15
15 function abprequire(module) 16 function abprequire(module)
16 { 17 {
17 let result = {}; 18 let result = {};
18 result.wrappedJSObject = result; 19 result.wrappedJSObject = result;
19 Services.obs.notifyObservers(result, "adblockplus-require", module); 20 Services.obs.notifyObservers(result, "adblockplus-require", module);
20 return result.exports; 21 return result.exports;
21 } 22 }
22 23
23 let {RequestNotifier} = abprequire("requestNotifier"); 24 let {RequestNotifier} = abprequire("requestNotifier");
24
25 let {FilterNotifier} = abprequire("filterNotifier"); 25 let {FilterNotifier} = abprequire("filterNotifier");
26 let {FilterStorage} = abprequire("filterStorage"); 26 let {FilterStorage} = abprequire("filterStorage");
27 27
28 /** 28 /**
29 * Creates a pool of tabs and allocates them to tasks on request. 29 * Creates a pool of tabs and allocates them to tasks on request.
30 * 30 *
31 * @param {tabbrowser} browser 31 * @param {tabbrowser} browser
32 * The tabbed browser where tabs should be created 32 * The tabbed browser where tabs should be created
33 * @param {int} maxtabs 33 * @param {int} maxtabs
34 * The maximum number of tabs to be allocated 34 * The maximum number of tabs to be allocated
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
77 tab = browser.addTab("about:blank"); 77 tab = browser.addTab("about:blank");
78 78
79 if (this._deferred.length) 79 if (this._deferred.length)
80 this._deferred.shift().resolve(tab); 80 this._deferred.shift().resolve(tab);
81 else 81 else
82 this._tabs.push(tab); 82 this._tabs.push(tab);
83 } 83 }
84 }; 84 };
85 85
86 /** 86 /**
87 * Observes page loads in a particular tabbed browser.
88 *
89 * @param {tabbrowser} browser
90 * The tabbed browser to be observed
91 * @param {int} timeout
92 * Load timeout in milliseconds
93 * @constructor
94 */
95 function LoadListener(browser, timeout)
96 {
97 this._browser = browser;
98 this._deferred = new Map();
99 this._timeout = timeout;
100 browser.addTabsProgressListener(this);
101 }
102 LoadListener.prototype = {
103 /**
104 * Returns a promise that will be resolved when the page in the specified tab
105 * finishes loading. Loading will be stopped if the timeout is reached.
106 *
107 * @param {tab} tab
108 * @result {Promise}
109 */
110 waitForLoad: function(tab)
111 {
112 let deferred = Promise.defer();
113 this._deferred.set(tab.linkedBrowser, deferred);
114
115 tab.ownerDocument.defaultView.setTimeout(function()
116 {
117 tab.linkedBrowser.stop();
118 }, this._timeout);
119
120 return deferred.promise;
121 },
122
123 /**
124 * Deactivates this object.
125 */
126 stop: function()
127 {
128 this._browser.removeTabsProgressListener(this);
129 },
130
131 onStateChange: function(browser, progress, request, flags, status)
132 {
133 if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
134 {
135 let deferred = this._deferred.get(browser);
136 if (deferred)
137 {
138 this._deferred.delete(browser);
139
140 let headers = [];
141 if (request instanceof Ci.nsIHttpChannel)
142 {
143 try
144 {
145 headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
146 request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
147 }
148 catch (e)
149 {
150 // Exceptions are expected here
151 }
152 }
153 deferred.resolve([status, headers]);
154 }
155 }
156 }
157 };
Wladimir Palant 2016/03/15 10:07:10 Why did you move this functionality into the conte
sergei 2016/03/15 16:40:10 In e10s it does not work in chrome process, we can
158
159 /**
160 * Once created, this object will make sure all new windows are dismissed 87 * Once created, this object will make sure all new windows are dismissed
161 * immediately. 88 * immediately.
162 * 89 *
163 * @constructor 90 * @constructor
164 */ 91 */
165 function WindowCloser() 92 function WindowCloser()
166 { 93 {
167 Services.obs.addObserver(this, "xul-window-registered", true) 94 Services.obs.addObserver(this, "xul-window-registered", true)
168 } 95 }
169 WindowCloser.prototype = { 96 WindowCloser.prototype = {
(...skipping 30 matching lines...) Expand all
200 * @param {String[]} urls 127 * @param {String[]} urls
201 * URLs to be crawled 128 * URLs to be crawled
202 * @param {int} number_of_tabs 129 * @param {int} number_of_tabs
203 * Maximum number of tabs to be opened 130 * Maximum number of tabs to be opened
204 * @param {String} targetURL 131 * @param {String} targetURL
205 * URL that should receive the results 132 * URL that should receive the results
206 */ 133 */
207 function run(window, urls, timeout, maxtabs, targetURL, onDone) 134 function run(window, urls, timeout, maxtabs, targetURL, onDone)
208 { 135 {
209 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 136 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
210 let loadListener = new LoadListener(window.getBrowser(), timeout); 137
211 let running = 0; 138 let running = 0;
212 let windowCloser = new WindowCloser(); 139 let windowCloser = new WindowCloser();
213 let taskDone = function() 140 let taskDone = function()
214 { 141 {
215 running--; 142 running--;
216 if (running <= 0) 143 if (running <= 0)
217 { 144 {
218 loadListener.stop();
219 windowCloser.stop(); 145 windowCloser.stop();
220 onDone(); 146 onDone();
221 } 147 }
222 }; 148 };
223 149
224 new Promise(function(resolve, reject) 150 new Promise(function(resolve, reject)
225 { 151 {
226 if (FilterStorage.subscriptions.length > 0 && !FilterStorage._loading) 152 if (FilterStorage.subscriptions.length > 0 && !FilterStorage._loading)
227 { 153 {
228 resolve(); 154 resolve();
229 return; 155 return;
230 } 156 }
231 FilterNotifier.addListener((action, item, newValue, oldValue) => 157 FilterNotifier.addListener((action, item, newValue, oldValue) =>
232 { 158 {
233 if (action === "load") 159 if (action === "load")
234 { 160 {
235 resolve(); 161 resolve();
236 } 162 }
237 }); 163 });
238 }).then(_ => 164 }).then(_ =>
239 { 165 {
240 for (let url of urls) 166 for (let url of urls)
241 { 167 {
242 running++; 168 running++;
243 Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) 169 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result)
244 { 170 {
245 let request = new XMLHttpRequest(); 171 let request = new XMLHttpRequest();
246 request.open("POST", targetURL); 172 request.open("POST", targetURL);
247 request.addEventListener("load", taskDone, false); 173 request.addEventListener("load", taskDone, false);
248 request.addEventListener("error", taskDone, false); 174 request.addEventListener("error", taskDone, false);
249 request.send(JSON.stringify(result)); 175 request.send(JSON.stringify(result));
250 }, function(url, exception) 176 }, function(url, exception)
251 { 177 {
252 reportException(exception); 178 reportException(exception);
253 179
(...skipping 13 matching lines...) Expand all
267 // and does not contain any waiting code. 193 // and does not contain any waiting code.
268 }).catch(reportException); 194 }).catch(reportException);
269 } 195 }
270 exports.run = run; 196 exports.run = run;
271 197
272 /** 198 /**
273 * Crawls a URL. This is a generator meant to be used via a Task object. 199 * Crawls a URL. This is a generator meant to be used via a Task object.
274 * 200 *
275 * @param {String} url 201 * @param {String} url
276 * @param {TabAllocator} tabAllocator 202 * @param {TabAllocator} tabAllocator
277 * @param {loadListener} loadListener
278 * @result {Object} 203 * @result {Object}
279 * Crawling result 204 * Crawling result
280 */ 205 */
281 function* crawl_url(url, tabAllocator, loadListener) 206 function* crawl_url(url, tabAllocator, timeout)
282 { 207 {
283 let tab = yield tabAllocator.getTab(); 208 let tab = yield tabAllocator.getTab();
284 let result = {url, requests: []}; 209 let result = {url, requests: []};
285 210
286 try 211 try
287 { 212 {
288 result.startTime = Date.now(); 213 result.startTime = Date.now();
289 let requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, function({type, location, filter}, scanComplete) 214 let requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, function({type, location, filter}, scanComplete)
sergei 2016/03/15 16:40:10 BTW, in addition, this part stops to work, I have
sergei 2016/03/16 14:44:23 https://issues.adblockplus.org/ticket/3815
290 { 215 {
291 result.requests.push({location, contentType: type, filter}); 216 result.requests.push({location, contentType: type, filter});
292 }); 217 });
293 218
294 tab.linkedBrowser.loadURI(url, null, null); 219 tab.linkedBrowser.loadURI(url, null, null);
295 [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); 220
221 let mm = tab.linkedBrowser.messageManager;
222 let pageInfoFuture = new Promise((resolve, result) =>
223 {
224 let timerID;
225 let onDone = (pageInfo) =>
226 {
227 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone);
Wladimir Palant 2016/03/15 10:07:10 So, which tab did you get the page info for? The
sergei 2016/03/16 14:44:23 For the `tab`, it's "browser message manager" it a
228 clearTimeout(timerID);
229 resolve(pageInfo);
230 }
231 mm.addMessageListener("abpcrawler:pageInfoGathered", (msg) => onDone(msg.data));;
232 timerID = setTimeout(onDone.bind(this, {error: "timeout"}), timeout);
233 });
Wladimir Palant 2016/03/15 10:07:10 Please move this functionality into a separate fun
sergei 2016/03/16 14:44:23 Done.
234
235 let pageInfo = yield pageInfoFuture;
236
237 result.finalUrl = tab.linkedBrowser.currentURI.spec;
238 Object.assign(result, pageInfo);
296 result.endTime = Date.now(); 239 result.endTime = Date.now();
297 result.finalUrl = tab.linkedBrowser.currentURI.spec;
298
299 let document = tab.linkedBrowser.contentDocument;
300 if (document.documentElement)
301 {
302 try
303 {
304 let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
305 canvas.width = document.documentElement.scrollWidth;
306 canvas.height = document.documentElement.scrollHeight;
307
308 let context = canvas.getContext("2d");
309 context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)");
310 result.screenshot = canvas.toDataURL("image/jpeg", 0.8);
311 }
312 catch (e)
313 {
314 reportException(e);
315 result.error = "Capturing screenshot failed: " + e;
316 }
317
318 // TODO: Capture frames as well?
319 let serializer = new tab.ownerDocument.defaultView.XMLSerializer();
320 result.source = serializer.serializeToString(document.documentElement);
321 }
322 } 240 }
323 finally 241 finally
324 { 242 {
325 tabAllocator.releaseTab(tab); 243 tabAllocator.releaseTab(tab);
326 } 244 }
327 return result; 245 return result;
328 } 246 }
329 247
330 function reportException(e) 248 function reportException(e)
331 { 249 {
332 let stack = ""; 250 let stack = "";
333 if (e && typeof e == "object" && "stack" in e) 251 if (e && typeof e == "object" && "stack" in e)
334 stack = e.stack + "\n"; 252 stack = e.stack + "\n";
335 253
336 Cu.reportError(e); 254 Cu.reportError(e);
337 dump(e + "\n" + stack + "\n"); 255 dump(e + "\n" + stack + "\n");
338 } 256 }
257
258 let {addonRoot} = require("info");
259 let frameScriptPath = addonRoot + "/lib/child/frameScript.js";
260 let globalMessageManager = Services.mm;
261 globalMessageManager.loadFrameScript(frameScriptPath, true);
Wladimir Palant 2016/03/15 10:07:10 This should be a process script, no point using a
262
263 let onReportException = function(msg)
264 {
265 reportException(msg.objects);
Wladimir Palant 2016/03/15 10:07:10 Please don't use msg.objects - ever. That's a wrap
sergei 2016/03/16 14:44:23 Acknowledged. Actually, I wanted to avoid duplicat
sergei 2016/03/16 14:44:23 I know, I used `msg.objects` because we don't know
Wladimir Palant 2016/09/14 16:11:46 Worst-case scenario: deadlocks because all of that
266 }
267 globalMessageManager.addMessageListener("abpcrawler:reportException", onReportException);
268
269 onShutdown.add(() =>
270 {
271 globalMessageManager.removeMessageListener("abpcrawler:reportException", onReportException);
272 globalMessageManager.removeDelayedFrameScript(frameScriptPath);
273 });
OLDNEW
« lib/child/frameScript.js ('K') | « lib/child/frameScript.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld