Index: lib/crawler.js |
=================================================================== |
new file mode 100644 |
--- /dev/null |
+++ b/lib/crawler.js |
@@ -0,0 +1,407 @@ |
+/* |
+ * This Source Code is subject to the terms of the Mozilla Public License |
+ * version 2.0 (the "License"). You can obtain a copy of the License at |
+ * http://mozilla.org/MPL/2.0/. |
+ */ |
+ |
+/** |
+ * @module crawler |
saroyanm
2015/05/04 18:13:43
I think this should be file overview, maybe smth l
Wladimir Palant
2015/05/07 00:04:59
As Sebastian noted elsewhere, @module is actually
|
+ */ |
+ |
+Cu.import("resource://gre/modules/Services.jsm"); |
+Cu.import("resource://gre/modules/Task.jsm"); |
+Cu.import("resource://gre/modules/Promise.jsm"); |
+ |
+function abprequire(module) |
+{ |
+ let result = {}; |
saroyanm
2015/05/04 18:13:43
We can create here Object without Object prototype
Sebastian Noack
2015/05/04 20:39:01
It seems that we don't need to bother about confli
Wladimir Palant
2015/05/07 00:04:59
Yes, this isn't a place where property conflicts c
|
+ result.wrappedJSObject = result; |
+ Services.obs.notifyObservers(result, "adblockplus-require", module); |
+ return result.exports; |
+} |
+ |
+let {Policy} = abprequire("contentPolicy"); |
+let {RequestNotifier} = abprequire("requestNotifier"); |
+let {Utils} = abprequire("utils"); |
+ |
+let dataForTab = new WeakMap(); |
+ |
+/** |
+ * Creates a pool of tabs and allocates them to tasks on request. |
+ * |
+ * @param {tabbrowser} browser |
+ * The tabbed browser where tabs should be created |
+ * @param {int} maxtabs |
+ * The maximum number of tabs to be allocated |
+ * @constructor |
+ */ |
+function TabAllocator(browser, maxtabs) |
saroyanm
2015/05/04 18:13:43
Why don't we use default amount of maxtab equal to
Wladimir Palant
2015/05/07 00:04:59
Because the text file has 50 thousand URLs.
|
+{ |
+ browser.removeAllTabsBut(browser.tabs[0]) |
+ |
+ this._tabs = []; |
+ for (let i = 0; i < maxtabs; i++) |
+ this._tabs.push(browser.addTab("about:blank")); |
+ |
+ browser.removeTab(browser.tabs[0]); |
+ |
+ this._deferred = []; |
+} |
+TabAllocator.prototype = { |
+ /** |
+ * Returns a promise that will resolve into a tab once a tab can be allocated. |
+ * The tab cannot be used by other tasks until releaseTab() is called. |
+ * |
+ * @result {Promise} |
saroyanm
2015/05/04 18:13:43
Nit: @return ?
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ */ |
+ getTab: function() |
+ { |
+ if (this._tabs.length) |
+ return this._tabs.shift(); |
+ else |
+ { |
+ let deferred = Promise.defer(); |
saroyanm
2015/05/04 18:13:43
The Deffered object is obsolete starting from Geck
Wladimir Palant
2015/05/07 00:04:59
Promise constructor in Promise.jsm is actually sup
|
+ this._deferred.push(deferred); |
+ return deferred.promise; |
+ } |
+ }, |
+ |
+ /** |
+ * Adds a tab back to the pool so that it can be used by other tasks. |
+ * |
+ * @param {tab} tab |
+ */ |
+ releaseTab: function(tab) |
+ { |
+ let browser = tab.parentNode.tabbrowser; |
+ browser.removeTab(tab); |
+ tab = browser.addTab("about:blank"); |
+ |
+ if (this._deferred.length) |
+ this._deferred.shift().resolve(tab); |
saroyanm
2015/05/04 18:13:43
As mentioned above, the Deffered object is obsolet
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ else |
+ this._tabs.push(tab); |
+ } |
+}; |
+ |
+/** |
+ * Observes page loads in a particular tabbed browser. |
+ * |
+ * @param {tabbrowser} browser |
+ * The tabbed browser to be observed |
+ * @param {int} timeout |
+ * Load timeout in milliseconds |
+ * @constructor |
+ */ |
+function LoadListener(browser, timeout) |
+{ |
+ this._browser = browser; |
+ this._deferred = new Map(); |
+ this._timeout = timeout; |
+ browser.addTabsProgressListener(this); |
+} |
+LoadListener.prototype = { |
+ /** |
+ * Returns a promise that will be resolved when the page in the specified tab |
+ * finishes loading. Loading will be stopped if the timeout is reached. |
+ * |
+ * @param {tab} tab |
+ * @result {Promise} |
saroyanm
2015/05/04 18:13:43
Nit: @return
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ */ |
+ waitForLoad: function(tab) |
+ { |
+ let deferred = Promise.defer(); |
saroyanm
2015/05/04 18:13:43
As mentioned above this is Obsolete, this comment
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ this._deferred.set(tab.linkedBrowser, deferred); |
+ |
+ tab.ownerDocument.defaultView.setTimeout(function() |
+ { |
+ tab.linkedBrowser.stop(); |
+ }, this._timeout); |
+ |
+ return deferred.promise; |
+ }, |
+ |
+ /** |
+ * Deactivates this object. |
+ */ |
+ stop: function() |
+ { |
+ this._browser.removeTabsProgressListener(this); |
+ }, |
+ |
+ onStateChange: function(browser, progress, request, flags, status) |
+ { |
+ if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW)) |
saroyanm
2015/05/04 18:13:43
Nit: This case can be splitted into multiple lines
|
+ { |
+ let deferred = this._deferred.get(browser); |
+ if (deferred) |
+ { |
+ this._deferred.delete(browser); |
+ |
+ let headers = []; |
+ if (request instanceof Ci.nsIHttpChannel) |
+ { |
+ try |
+ { |
+ headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText); |
+ request.visitResponseHeaders((header, value) => headers.push(header + ": " + value)); |
+ } |
+ catch (e) |
+ { |
+ // Exceptions are expected here |
saroyanm
2015/05/04 18:13:43
Please also handle the exception.
reportException
Wladimir Palant
2015/05/07 00:04:59
As the comment explains, exceptions are expected h
|
+ } |
+ } |
+ deferred.resolve([status, headers]); |
+ } |
+ } |
+ } |
+}; |
+ |
+/** |
+ * Once created, this object will make sure all new windows are dismissed |
+ * immediately. |
+ * |
+ * @constructor |
+ */ |
+function WindowCloser() |
+{ |
+ Services.obs.addObserver(this, "xul-window-registered", true) |
saroyanm
2015/05/04 18:13:43
Missing semicolon
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+} |
+WindowCloser.prototype = { |
+ /** |
+ * Deactivates this object. |
+ */ |
+ stop: function() |
+ { |
+ Services.obs.removeObserver(this, "xul-window-registered") |
saroyanm
2015/05/04 18:13:43
Missing semicolon
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ }, |
+ |
+ observe: function(subject, topic, data) |
+ { |
+ let window = subject.QueryInterface(Ci.nsIInterfaceRequestor) |
+ .getInterface(Ci.nsIDOMWindow) |
saroyanm
2015/05/04 18:13:43
Missing semicolon
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ window.addEventListener("load", function() |
+ { |
+ if (window.document.documentElement.localName == 'dialog') |
saroyanm
2015/05/04 18:13:43
Nit: Please use double quote.
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ window.document.documentElement.acceptDialog(); |
+ else |
+ window.close(); |
+ }, false); |
+ }, |
+ |
+ QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) |
+}; |
+ |
+/** |
+ * Retrieves crawler results associated with a particular content window. |
+ * |
+ * @param {Window} window |
+ * Content window to retrieve crawler results for |
+ * @result {Object} |
saroyanm
2015/05/04 18:13:43
Nit: @return
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ * Crawler results or undefined if the window wasn't created by the crawler. |
+ */ |
+function getDataForWindow(window) |
+{ |
+ let topWindow = window.top; |
+ if (!topWindow.document) |
+ throw new Error("No document associated with the node's top window"); |
+ let tabbrowser = Utils.getChromeWindow(topWindow).getBrowser(); |
+ if (!tabbrowser) |
+ throw new Error("Unable to get a tabbrowser reference from the window"); |
+ let browser = tabbrowser.getBrowserForDocument(topWindow.document); |
+ if (!browser) |
+ throw new Error("Unable to get browser for the content window"); |
+ let tab = tabbrowser.getTabForBrowser(browser); |
+ if (!tab) |
+ throw new Error("Unable to get tab for the browser"); |
+ return dataForTab.get(tab); |
+}; |
saroyanm
2015/05/04 18:13:43
Nit: Semicolon is redundant here.
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ |
+/** |
+ * Starts the crawling session. The crawler opens each URL in a tab and stores |
+ * the results. |
+ * |
+ * @param {Window} window |
+ * The browser window we're operating in |
+ * @param {String[]} urls |
+ * URLs to be crawled |
+ * @param {int} number_of_tabs |
+ * Maximum number of tabs to be opened |
+ * @param {String} targetURL |
+ * URL that should receive the results |
saroyanm
2015/05/04 18:13:43
Nit: Please also document onDone parameter for con
Sebastian Noack
2015/05/04 20:39:01
Also note that JsDoc 3 will mistakenly document th
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ */ |
+function run(window, urls, timeout, maxtabs, targetURL, onDone) |
+{ |
+ let requestNotifier = new RequestNotifier(null, function() {}); |
+ |
+ let origProcessNode = Policy.processNode; |
+ Policy.processNode = processNodeReplacement.bind(null, origProcessNode, requestNotifier); |
+ |
+ let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); |
+ let loadListener = new LoadListener(window.getBrowser(), timeout); |
+ let running = 0; |
+ let windowCloser = new WindowCloser(); |
+ let taskDone = function() |
+ { |
+ running--; |
+ if (running <= 0) |
+ { |
+ Policy.processNode = origProcessNode; |
+ requestNotifier.shutdown(); |
+ loadListener.stop(); |
+ windowCloser.stop(); |
+ onDone(); |
+ } |
+ }; |
+ |
+ for (let url of urls) |
+ { |
+ running++; |
+ Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result) |
+ { |
+ let request = new XMLHttpRequest(); |
+ request.open("POST", targetURL); |
+ request.addEventListener("load", taskDone, false); |
+ request.addEventListener("error", taskDone, false); |
+ request.send(JSON.stringify(result)); |
+ }, function(url, exception) |
+ { |
+ reportException(exception); |
+ |
+ let request = new XMLHttpRequest(); |
+ request.open("POST", targetURL); |
+ request.addEventListener("load", taskDone, false); |
+ request.addEventListener("error", taskDone, false); |
+ request.send(JSON.stringify({ |
+ url: url, |
+ startTime: Date.now(), |
+ error: String(exception) |
+ })); |
+ }.bind(null, url)); |
+ } |
+} |
+exports.run = run; |
+ |
+/** |
+ * Crawls a URL. This is a generator meant to be used via a Task object. |
+ * |
+ * The result always has url and startTime set. After a load was reported it |
+ * also has channelStatus, headers, endTime and finalUrl, and - if the |
+ * document has a root element - screenshot (or error if capturing failed) |
+ * and source. |
+ * |
+ * @param {String} url |
+ * @param {TabAllocator} tabAllocator |
+ * @param {LoadListener} loadListener |
+ * @return {Object} |
saroyanm
2015/05/04 18:13:43
Nit: @return {Object} Crawling result
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ * Crawling result |
+ */ |
+function* crawl_url(url, tabAllocator, loadListener) |
+{ |
+ // Wait until the allocator hands out a free tab |
+ let tab = yield tabAllocator.getTab(); |
+ let result = {url: url}; |
+ |
+ // Make the result reachable via getDataForWindow() while the tab is in use |
+ dataForTab.set(tab, result); |
+ try |
+ { |
+ result.startTime = Date.now(); |
+ tab.linkedBrowser.loadURI(url, null, null); |
+ [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); |
+ result.endTime = Date.now(); |
+ result.finalUrl = tab.linkedBrowser.currentURI.spec; |
+ |
+ let document = tab.linkedBrowser.contentDocument; |
+ if (document.documentElement) |
+ { |
+ try |
+ { |
+ // The canvas covers the whole scrollable area of the root element |
+ let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas"); |
+ canvas.width = document.documentElement.scrollWidth; |
+ canvas.height = document.documentElement.scrollHeight; |
+ |
+ let context = canvas.getContext("2d"); |
+ context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)"); |
+ result.screenshot = canvas.toDataURL("image/jpeg", 0.8); |
saroyanm
2015/05/04 18:13:43
Maybe make sense to let user specify the quality o
Wladimir Palant
2015/05/07 00:04:59
Well, changing maxtabs is very useful when testing
saroyanm
2015/05/07 13:19:00
Good point.
|
+ } |
+ catch (e) |
+ { |
+ // A failed screenshot is recorded in the result, the page source is |
+ // still captured below |
+ reportException(e); |
+ result.error = "Capturing screenshot failed: " + e; |
saroyanm
2015/05/04 18:13:43
Isn't result.error redundant ?
Wladimir Palant
2015/05/07 00:04:59
No. The exception is merely reported to the consol
saroyanm
2015/05/07 13:19:00
Yes, missed that.
Good point.
|
+ } |
+ |
+ // TODO: Capture frames as well? |
saroyanm
2015/05/04 18:13:43
Nit: I think we shouldn't have TODO comments in re
Wladimir Palant
2015/05/07 00:04:59
Why not? This is something we might want to do in
|
+ let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); |
+ result.source = serializer.serializeToString(document.documentElement); |
+ } |
+ } |
+ finally |
+ { |
+ // Give the tab back even if loading or capturing threw |
+ tabAllocator.releaseTab(tab); |
+ } |
+ return result; |
+} |
+ |
+function reportException(e) |
+{ |
+ let stack = ""; |
+ if (e && typeof e == "object" && "stack" in e) |
+ stack = e.stack + "\n"; |
+ |
+ Cu.reportError(e); |
+ dump(e + "\n" + stack + "\n"); |
+} |
+ |
+/** |
+ * Wrapper for the Policy.processNode() function in ABP. Calls the original |
+ * function and records all the data. |
+ * |
+ * While the original function runs, the RequestNotifier's listener is |
+ * replaced temporarily in order to collect the text of the filters reported |
+ * for this call. If getDataForWindow() returns crawler results for the |
+ * window, a record with contentType, location, blocked and filters is |
+ * appended to their requests array. |
+ * |
+ * @param {Function} origProcessNode |
+ * The original processNode function. |
+ * @param {RequestNotifier} requestNotifier |
+ * The crawler's RequestNotifier object instance. |
+ * @param {nsIDOMWindow} wnd |
+ * @param {nsIDOMElement} node |
+ * @param {Number} contentType |
+ * @param {nsIURI} location |
+ * @param {Boolean} collapse |
+ * @return {Boolean} |
+ * Whatever the original processNode function returned |
+ */ |
+function processNodeReplacement(origProcessNode, requestNotifier, wnd, node, contentType, location, collapse) |
+{ |
+ let filters = []; |
+ let origListener = requestNotifier.listener; |
+ requestNotifier.listener = function(window, node, entry) |
+ { |
+ if (entry.filter) |
+ filters.push(entry.filter.text); |
+ }; |
+ |
+ /* |
+ * Call the original processNode. There is no catch clause here: if the |
+ * original throws, then we will too. The finally clause merely makes sure |
+ * that the original listener is restored in either case. |
saroyanm
2015/05/04 18:13:43
Nit: the line is too long.
Sebastian Noack
2015/05/04 20:39:01
Also I suggest to use simply // for comments expla
Wladimir Palant
2015/05/07 00:04:59
Done.
|
+ */ |
+ let result; |
+ try |
+ { |
+ result = origProcessNode(wnd, node, contentType, location, collapse); |
+ } |
+ finally |
+ { |
+ requestNotifier.listener = origListener; |
+ } |
+ |
+ // Recording is best-effort: failures are reported but never change the |
+ // value returned to the caller |
+ try |
+ { |
+ let data = getDataForWindow(wnd); |
+ if (data) |
+ { |
+ if (!("requests" in data)) |
+ data.requests = []; |
+ data.requests.push({ |
+ contentType: contentType, |
+ location: (contentType == Policy.type.ELEMHIDE ? location.text : location.spec), |
+ blocked: result != Ci.nsIContentPolicy.ACCEPT, |
+ filters: filters |
+ }); |
+ } |
+ } |
+ catch (e) |
+ { |
+ reportException(e); |
+ } |
+ return result; |
+}; |
saroyanm
2015/05/04 18:13:43
Nit: Semicolon is redundant here.
Wladimir Palant
2015/05/07 00:04:59
Done.
|