Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/crawler.js

Issue 5288886037118976: Adblock Plus Crawler rewrite (Closed)
Patch Set: Created April 24, 2015, 3:38 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: lib/crawler.js
===================================================================
new file mode 100644
--- /dev/null
+++ b/lib/crawler.js
@@ -0,0 +1,407 @@
+/*
+ * This Source Code is subject to the terms of the Mozilla Public License
+ * version 2.0 (the "License"). You can obtain a copy of the License at
+ * http://mozilla.org/MPL/2.0/.
+ */
+
+/**
+ * @module crawler
saroyanm 2015/05/04 18:13:43 I think this should be file overview, maybe smth l
Wladimir Palant 2015/05/07 00:04:59 As Sebastian noted elsewhere, @module is actually
+ */
+
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource://gre/modules/Task.jsm");
+Cu.import("resource://gre/modules/Promise.jsm");
+
+function abprequire(module)
+{
+ let result = {};
saroyanm 2015/05/04 18:13:43 We can create here Object without Object prototype
Sebastian Noack 2015/05/04 20:39:01 It seems that we don't need to bother about confli
Wladimir Palant 2015/05/07 00:04:59 Yes, this isn't a place where property conflicts c
+ result.wrappedJSObject = result;
+ Services.obs.notifyObservers(result, "adblockplus-require", module);
+ return result.exports;
+}
+
+let {Policy} = abprequire("contentPolicy"); // run() temporarily replaces Policy.processNode
+let {RequestNotifier} = abprequire("requestNotifier"); // instantiated by run()
+let {Utils} = abprequire("utils"); // getDataForWindow() uses Utils.getChromeWindow()
+
+let dataForTab = new WeakMap(); // tab => result object; written by crawl_url(), read by getDataForWindow()
+
+/**
+ * Creates a pool of tabs and allocates them to tasks on request.
+ *
+ * @param {tabbrowser} browser
+ * The tabbed browser where tabs should be created
+ * @param {int} maxtabs
+ * The maximum number of tabs to be allocated
+ * @constructor
+ */
+function TabAllocator(browser, maxtabs)
saroyanm 2015/05/04 18:13:43 Why don't we use default amount of maxtab equal to
Wladimir Palant 2015/05/07 00:04:59 Because the text file has 50 thousand URLs.
+{
+ browser.removeAllTabsBut(browser.tabs[0])
+
+ this._tabs = [];
+ for (let i = 0; i < maxtabs; i++)
+ this._tabs.push(browser.addTab("about:blank"));
+
+ browser.removeTab(browser.tabs[0]);
+
+ this._deferred = [];
+}
+TabAllocator.prototype = {
+ /**
+ * Returns a promise that will resolve into a tab once a tab can be allocated.
+ * The tab cannot be used by other tasks until releaseTab() is called.
+ *
+ * @result {Promise}
saroyanm 2015/05/04 18:13:43 Nit: @return ?
Wladimir Palant 2015/05/07 00:04:59 Done.
+ */
+ getTab: function()
+ {
+ if (this._tabs.length)
+ return this._tabs.shift();
+ else
+ {
+ let deferred = Promise.defer();
saroyanm 2015/05/04 18:13:43 The Deffered object is obsolete starting from Geck
Wladimir Palant 2015/05/07 00:04:59 Promise constructor in Promise.jsm is actually sup
+ this._deferred.push(deferred);
+ return deferred.promise;
+ }
+ },
+
+ /**
+ * Adds a tab back to the pool so that it can be used by other tasks.
+ *
+ * @param {tab} tab
+ */
+ releaseTab: function(tab)
+ {
+ let browser = tab.parentNode.tabbrowser;
+ browser.removeTab(tab);
+ tab = browser.addTab("about:blank");
+
+ if (this._deferred.length)
+ this._deferred.shift().resolve(tab);
saroyanm 2015/05/04 18:13:43 As mentioned above, the Deffered object is obsolet
Wladimir Palant 2015/05/07 00:04:59 Done.
+ else
+ this._tabs.push(tab);
+ }
+};
+
+/**
+ * Observes page loads in a particular tabbed browser.
+ *
+ * @param {tabbrowser} browser
+ * The tabbed browser to be observed
+ * @param {int} timeout
+ * Load timeout in milliseconds
+ * @constructor
+ */
+function LoadListener(browser, timeout)
+{
+ this._browser = browser;
+ this._deferred = new Map();
+ this._timeout = timeout;
+ browser.addTabsProgressListener(this);
+}
+LoadListener.prototype = {
+ /**
+ * Returns a promise that will be resolved when the page in the specified tab
+ * finishes loading. Loading will be stopped if the timeout is reached.
+ *
+ * @param {tab} tab
+ * @result {Promise}
saroyanm 2015/05/04 18:13:43 Nit: @return
Wladimir Palant 2015/05/07 00:04:59 Done.
+ */
+ waitForLoad: function(tab)
+ {
+ let deferred = Promise.defer();
saroyanm 2015/05/04 18:13:43 As mentioned above this is Obsolete, this comment
Wladimir Palant 2015/05/07 00:04:59 Done.
+ this._deferred.set(tab.linkedBrowser, deferred);
+
+ tab.ownerDocument.defaultView.setTimeout(function()
+ {
+ tab.linkedBrowser.stop();
+ }, this._timeout);
+
+ return deferred.promise;
+ },
+
+ /**
+ * Deactivates this object.
+ */
+ stop: function()
+ {
+ this._browser.removeTabsProgressListener(this);
+ },
+
+ onStateChange: function(browser, progress, request, flags, status)
+ {
+ if ((flags & Ci.nsIWebProgressListener.STATE_STOP) && (flags & Ci.nsIWebProgressListener.STATE_IS_WINDOW))
saroyanm 2015/05/04 18:13:43 Nit: This case can be splitted into multiple lines
+ {
+ let deferred = this._deferred.get(browser);
+ if (deferred)
+ {
+ this._deferred.delete(browser);
+
+ let headers = [];
+ if (request instanceof Ci.nsIHttpChannel)
+ {
+ try
+ {
+ headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText);
+ request.visitResponseHeaders((header, value) => headers.push(header + ": " + value));
+ }
+ catch (e)
+ {
+ // Exceptions are expected here
saroyanm 2015/05/04 18:13:43 Please also handle the exception. reportException
Wladimir Palant 2015/05/07 00:04:59 As the comment explains, exceptions are expected h
+ }
+ }
+ deferred.resolve([status, headers]);
+ }
+ }
+ }
+};
+
+/**
+ * Once created, this object will make sure all new windows are dismissed
+ * immediately.
+ *
+ * @constructor
+ */
+function WindowCloser()
+{
+ Services.obs.addObserver(this, "xul-window-registered", true)
saroyanm 2015/05/04 18:13:43 Missing semicolon
Wladimir Palant 2015/05/07 00:04:59 Done.
+}
+WindowCloser.prototype = {
+ /**
+ * Deactivates this object.
+ */
+ stop: function()
+ {
+ Services.obs.removeObserver(this, "xul-window-registered")
saroyanm 2015/05/04 18:13:43 Missing semicolon
Wladimir Palant 2015/05/07 00:04:59 Done.
+ },
+
+ observe: function(subject, topic, data)
+ {
+ let window = subject.QueryInterface(Ci.nsIInterfaceRequestor)
+ .getInterface(Ci.nsIDOMWindow)
saroyanm 2015/05/04 18:13:43 Missing semicolon
Wladimir Palant 2015/05/07 00:04:59 Done.
+ window.addEventListener("load", function()
+ {
+ if (window.document.documentElement.localName == 'dialog')
saroyanm 2015/05/04 18:13:43 Nit: Please use double quote.
Wladimir Palant 2015/05/07 00:04:59 Done.
+ window.document.documentElement.acceptDialog();
+ else
+ window.close();
+ }, false);
+ },
+
+ QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
+};
+
+/**
+ * Retrieves crawler results associated with a particular content window.
+ *
+ * @param {Window} window
+ * Content window to retrieve crawler results for
+ * @result {Object}
saroyanm 2015/05/04 18:13:43 Nit: @return
Wladimir Palant 2015/05/07 00:04:59 Done.
+ * Crawler results or undefined if the window wasn't created by the crawler.
+ */
+function getDataForWindow(window)
+{
+ let topWindow = window.top;
+ if (!topWindow.document)
+ throw new Error("No document associated with the node's top window");
+ let tabbrowser = Utils.getChromeWindow(topWindow).getBrowser();
+ if (!tabbrowser)
+ throw new Error("Unable to get a tabbrowser reference from the window");
+ let browser = tabbrowser.getBrowserForDocument(topWindow.document);
+ if (!browser)
+ throw new Error("Unable to get browser for the content window");
+ let tab = tabbrowser.getTabForBrowser(browser);
+ if (!tab)
+ throw new Error("Unable to get tab for the browser");
+ return dataForTab.get(tab);
+};
saroyanm 2015/05/04 18:13:43 Nit: Semicolon is redundant here.
Wladimir Palant 2015/05/07 00:04:59 Done.
+
+/**
+ * Starts the crawling session. The crawler opens each URL in a tab and stores
+ * the results.
+ *
+ * @param {Window} window
+ * The browser window we're operating in
+ * @param {String[]} urls
+ * URLs to be crawled
+ * @param {int} number_of_tabs
+ * Maximum number of tabs to be opened
+ * @param {String} targetURL
+ * URL that should receive the results
saroyanm 2015/05/04 18:13:43 Nit: Please also document onDone parameter for con
Sebastian Noack 2015/05/04 20:39:01 Also note that JsDoc 3 will mistakenly document th
Wladimir Palant 2015/05/07 00:04:59 Done.
+ */
+function run(window, urls, timeout, maxtabs, targetURL, onDone)
+{
+ let requestNotifier = new RequestNotifier(null, function() {});
+
+ let origProcessNode = Policy.processNode;
+ Policy.processNode = processNodeReplacement.bind(null, origProcessNode, requestNotifier);
+
+ let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
+ let loadListener = new LoadListener(window.getBrowser(), timeout);
+ let running = 0;
+ let windowCloser = new WindowCloser();
+ let taskDone = function()
+ {
+ running--;
+ if (running <= 0)
+ {
+ Policy.processNode = origProcessNode;
+ requestNotifier.shutdown();
+ loadListener.stop();
+ windowCloser.stop();
+ onDone();
+ }
+ };
+
+ for (let url of urls)
+ {
+ running++;
+ Task.spawn(crawl_url.bind(null, url, tabAllocator, loadListener)).then(function(result)
+ {
+ let request = new XMLHttpRequest();
+ request.open("POST", targetURL);
+ request.addEventListener("load", taskDone, false);
+ request.addEventListener("error", taskDone, false);
+ request.send(JSON.stringify(result));
+ }, function(url, exception)
+ {
+ reportException(exception);
+
+ let request = new XMLHttpRequest();
+ request.open("POST", targetURL);
+ request.addEventListener("load", taskDone, false);
+ request.addEventListener("error", taskDone, false);
+ request.send(JSON.stringify({
+ url: url,
+ startTime: Date.now(),
+ error: String(exception)
+ }));
+ }.bind(null, url));
+ }
+}
+exports.run = run;
+
+/**
+ * Crawls a URL. This is a generator meant to be used via a Task object.
+ *
+ * A tab is requested from the tab allocator, the URL is loaded in it and,
+ * once the load listener reports the load as finished, a screenshot and the
+ * serialized document are recorded. The tab is released again in any case,
+ * even if an exception is thrown.
+ *
+ * Properties set on the result here: url, startTime, channelStatus, headers,
+ * endTime, finalUrl and - only if the document has a root element -
+ * screenshot (or error if capturing failed) and source. In addition,
+ * processNodeReplacement() adds a requests property to the same object while
+ * it is registered for the tab.
+ *
+ * @param {String} url
+ * @param {TabAllocator} tabAllocator
+ * @param {LoadListener} loadListener
+ * @return {Object}
saroyanm 2015/05/04 18:13:43 Nit: @return {Object} Crawling result
Wladimir Palant 2015/05/07 00:04:59 Done.
+ * Crawling result
+ */
+function* crawl_url(url, tabAllocator, loadListener)
+{
+ let tab = yield tabAllocator.getTab(); // execution continues once a tab has been allocated
+ let result = {url: url};
+
+ dataForTab.set(tab, result); // makes the result available to getDataForWindow()
+ try
+ {
+ result.startTime = Date.now();
+ tab.linkedBrowser.loadURI(url, null, null);
+ [result.channelStatus, result.headers] = yield loadListener.waitForLoad(tab); // load finished or stopped on timeout
+ result.endTime = Date.now();
+ result.finalUrl = tab.linkedBrowser.currentURI.spec; // URI the browser ended up at
+
+ let document = tab.linkedBrowser.contentDocument;
+ if (document.documentElement)
+ {
+ try
+ {
+ let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas");
+ canvas.width = document.documentElement.scrollWidth; // canvas takes the scroll size of the root element
+ canvas.height = document.documentElement.scrollHeight;
+
+ let context = canvas.getContext("2d");
+ context.drawWindow(document.defaultView, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)"); // white background
+ result.screenshot = canvas.toDataURL("image/jpeg", 0.8); // JPEG data URL, quality 0.8
saroyanm 2015/05/04 18:13:43 Maybe make sense to let user specify the quality o
Wladimir Palant 2015/05/07 00:04:59 Well, changing maxtabs is very useful when testing
saroyanm 2015/05/07 13:19:00 Good point.
+ }
+ catch (e)
+ {
+ reportException(e);
+ result.error = "Capturing screenshot failed: " + e; // a failed screenshot doesn't abort the crawl, the source is still recorded
saroyanm 2015/05/04 18:13:43 Isn't result.error redundant ?
Wladimir Palant 2015/05/07 00:04:59 No. The exception is merely reported to the consol
saroyanm 2015/05/07 13:19:00 Yes, missed that. Good point.
+ }
+
+ // TODO: Capture frames as well?
saroyanm 2015/05/04 18:13:43 Nit: I think we shouldn't have TODO comments in re
Wladimir Palant 2015/05/07 00:04:59 Why not? This is something we might want to do in
+ let serializer = new tab.ownerDocument.defaultView.XMLSerializer(); // XMLSerializer is taken from the window owning the tab
+ result.source = serializer.serializeToString(document.documentElement);
+ }
+ }
+ finally
+ {
+ tabAllocator.releaseTab(tab); // always give the tab back, also if loading or processing failed
+ }
+ return result;
+}
+
+function reportException(e)
+{
+ let stack = "";
+ if (e && typeof e == "object" && "stack" in e)
+ stack = e.stack + "\n";
+
+ Cu.reportError(e);
+ dump(e + "\n" + stack + "\n");
+}
+
+/**
+ * Wrapper for the Policy.processNode() function in ABP. Calls the original
+ * function and records all the data.
+ *
+ * While the original function runs, the listener of the request notifier is
+ * replaced in order to collect the text of the filters reported. Afterwards
+ * an entry is added to the requests list of the crawler result belonging to
+ * the window, if there is one. Errors while recording are reported and do not
+ * affect the return value.
+ *
+ * NOTE(review): the return value is documented as Boolean but compared to
+ * Ci.nsIContentPolicy.ACCEPT below - confirm what the original returns.
+ *
+ * @param {Function} origProcessNode
+ * The original processNode function.
+ * @param {RequestNotifier} requestNotifier
+ * The crawler's RequestNotifier object instance.
+ * @param {nsIDOMWindow} wnd
+ * @param {nsIDOMElement} node
+ * @param {Number} contentType
+ * @param {nsIURI} location
+ * @param {Boolean} collapse
+ * @return {Boolean}
+ * The unchanged return value of the original function
+ */
+function processNodeReplacement(origProcessNode, requestNotifier, wnd, node, contentType, location, collapse)
+{
+ let filters = []; // text of the filters reported while the original function runs
+ let origListener = requestNotifier.listener;
+ requestNotifier.listener = function(window, node, entry)
+ {
+ if (entry.filter)
+ filters.push(entry.filter.text);
+ };
+
+ /*
+ * Call the original processNode. If the original throws, then we will too:
+ * the try statement below has no catch clause, its finally block merely
+ * makes sure that the original listener is restored in any case.
saroyanm 2015/05/04 18:13:43 Nit: the line is too long.
Sebastian Noack 2015/05/04 20:39:01 Also I suggest to use simply // for comments expla
Wladimir Palant 2015/05/07 00:04:59 Done.
+ */
+ let result;
+ try
+ {
+ result = origProcessNode(wnd, node, contentType, location, collapse);
+ }
+ finally
+ {
+ requestNotifier.listener = origListener;
+ }
+
+ try
+ {
+ let data = getDataForWindow(wnd); // undefined if no crawler result is registered for the window's tab
+ if (data)
+ {
+ if (!("requests" in data))
+ data.requests = []; // created on first use
+ data.requests.push({
+ contentType: contentType,
+ location: (contentType == Policy.type.ELEMHIDE ? location.text : location.spec), // for ELEMHIDE the text property is recorded instead of the spec
+ blocked: result != Ci.nsIContentPolicy.ACCEPT,
+ filters: filters
+ });
+ }
+ }
+ catch (e)
+ {
+ reportException(e); // recording problems must not change the outcome of the policy call
+ }
+ return result;
+};
saroyanm 2015/05/04 18:13:43 Nit: Semicolon is redundant here.
Wladimir Palant 2015/05/07 00:04:59 Done.

Powered by Google App Engine
This is Rietveld