lib/typoRules.js - Issue 8559070: Integrated URL Fixer into Adblock Plus

Unified Diff: lib/typoRules.js

Issue 8559070: Integrated URL Fixer into Adblock Plus (Closed)

Patch Set: Created Nov. 9, 2012, 3:21 p.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: lib/typoRules.js

===================================================================

new file mode 100644

--- /dev/null

+++ b/lib/typoRules.js

@@ -0,0 +1,409 @@

+/* This Source Code Form is subject to the terms of the Mozilla Public

+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,

+ * You can obtain one at http://mozilla.org/MPL/2.0/. */

+Cu.import("resource://gre/modules/Services.jsm");

+Cu.import("resource://gre/modules/FileUtils.jsm");

+let {Prefs} = require("prefs");

+let RULES_VERSION = 2;

+let CUSTOM_RULE_PRIORITY = 0x7FFFFFFF;

+let rules = {expressions: []};

+loadRules();

+// Make first attempt to update rules after five minutes

+let updateTimer = null;

+updateTimer = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer);

+updateTimer.initWithCallback(onTimer, 1000 * 60 * 5, Ci.nsITimer.TYPE_REPEATING_SLACK);

+onShutdown.add(function() updateTimer.cancel());

+function loadRules()

+ loadRulesFrom(Services.io.newFileURI(getRuleFile()).spec, false, function(success)

+ {

+ if (!success)

+ loadRulesFrom(require("info").addonRoot + "defaults/typoRules.json", true);

+ });

+function loadRulesFrom(url, ignoreVersion, callback)

+ if (typeof callback != "function")

+ callback = function() {};

+ let request = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(Ci.nsIXMLHttpRequest);

+ request.open("GET", url);

+ request.overrideMimeType("text/plain");

+ request.addEventListener("load", function()

+ {

+ try

+ {

+ // Remove comments from the file if any

+ let data = JSON.parse(request.responseText.replace(/^\s*\/\/.*/mg, ""));

+ if (ignoreVersion || data.version == RULES_VERSION)

+ {

+ rules = data;

+ callback(true);

+ // Add user-defined rules after calling the callback - if the callback

+ // saves the rules then the custom rules won't be included.

+ addCustomRules();

+ }

+ else

+ callback(false);

+ }

+ catch (e)

+ {

+ Cu.reportError(e);

+ callback(false);

+ }

+ }, false);

+ request.addEventListener("error", function()

+ {

+ callback(false);

+ }, false);

+ try

+ {

+ request.send(null);

+ }

+ catch (e)

+ {

+ if (e.result != Cr.NS_ERROR_FILE_NOT_FOUND)

+ Cu.reportError(e);

+ callback(false);

+ }

+function getRuleFile()

+ let result = FileUtils.getFile("ProfD", [require("info").addonName + "-rules.json"]);

+ getRuleFile = function() result;

+ return getRuleFile();

+function addCustomRules()

+ for (let domain in Prefs.whitelist)

+ onWhitelistEntryAdded(domain);

+function onWhitelistEntryAdded(domain)

+ let reverse = domain.split("").reverse().join("");

+ addSuffix(rules.domain, reverse, CUSTOM_RULE_PRIORITY);

+exports.onWhitelistEntryAdded = onWhitelistEntryAdded;

+function onWhitelistEntryRemoved(domain)

+ let reverse = domain.split("").reverse().join("");

+ removeSuffix(rules.domain, reverse, CUSTOM_RULE_PRIORITY);

+exports.onWhitelistEntryRemoved = onWhitelistEntryRemoved;

+function addSuffix(tree, suffix, priority)

+ if (suffix.length == 0)

+ {

+ // We are at the last character, just put our priority here

+ tree[""] = " " + priority;

+ return;

+ }

+ let c = suffix[0];

+ if (c in tree)

+ {

+ let existing = tree[c];

+ if (typeof existing == "string")

+ {

+ // Single choice for this suffix, maybe the same entry?

+ if (existing.substr(0, suffix.length - 1) == suffix.substr(1) && existing[suffix.length - 1] == " ")

+ {

+ // Same entry, simply replace it by new priority

+ tree[c] = suffix.substr(1) + " " + priority;

+ }

+ else

+ {

+ // Different entry, need to add a new branching point and go deeper

+ if (existing[0] == " ")

+ tree[c] = {"": existing};

+ else

+ {

+ tree[c] = {};

+ tree[c][existing[0]] = existing.substr(1);

+ }

+ addSuffix(tree[c], suffix.substr(1), priority);

+ }

+ else

+ {

+ // Multiple choices for this suffix - go deeper

+ addSuffix(existing, suffix.substr(1), priority);

+ }

+ else

+ {

+ // No existing entry yet, just add ours

+ tree[c] = suffix.substr(1) + " " + priority;

+ }

+function removeSuffix(tree, suffix, priority)

+ if (suffix.length == 0)

+ {

+ // We are at the last character, check whether there is an entry with

+ // matching priority

+ if ("" in tree && tree[""] == " " + priority)

+ delete tree[""];

+ return;

+ }

+ let c = suffix[0];

+ if (!(c in tree))

+ return;

+ if (typeof tree[c] == "string")

+ {

+ // Single entry - check whether it is the right one

+ if (tree[c] == suffix.substr(1) + " " + priority)

+ delete tree[c];

+ }

+ else

+ {

+ // Multiple entries, need to go deeper

+ removeSuffix(tree[c], suffix.substr(1), priority);

+ }

+function onTimer()

+ // Next check in 1 hour

+ updateTimer.delay = 1000 * 60 * 60;

+ // Only download rules every three days

+ let nextUpdate = Prefs.lastRuleUpdate + 60 * 60 * 24 * 3;

+ if (nextUpdate > Date.now() / 1000)

+ return;

+ loadRulesFrom("http://urlfixer.org/download/rules.json?version=" + RULES_VERSION, false, function(success)

+ {

+ if (success)

+ {

+ rules.timestamp = Date.now();

+ try

+ {

+ // Save the rules to file.

+ let rulesText = JSON.stringify(rules);

+ let fileStream = FileUtils.openSafeFileOutputStream(getRuleFile());

+ let stream = Cc["@mozilla.org/intl/converter-output-stream;1"].createInstance(Ci.nsIConverterOutputStream);

+ stream.init(fileStream, "UTF-8", 16384, Ci.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);

+ stream.writeString(rulesText);

+ stream.flush();

+ FileUtils.closeSafeFileOutputStream(fileStream);

+ }

+ catch(e)

+ {

+ Cu.reportError(e);

+ }

+ });

+ Prefs.lastRuleUpdate = Date.now() / 1000;

+exports.getSchemeCorrection = getSchemeCorrection;

+function getSchemeCorrection(scheme)

+ return getBestMatch(scheme, rules.scheme, 1, null);

+exports.isKnownScheme = isKnownScheme;

+function isKnownScheme(scheme)

+ return (getSimilarStrings(scheme, rules.scheme, 0, null).length > 0);

+exports.getDomainCorrection = getDomainCorrection;

+function getDomainCorrection(domain)

+ // Apply user's custom changes first

+ let customRules = Prefs.custom_replace;

+ for (let searchString in customRules)

+ {

+ let replacement = customRules[searchString];

+ searchString = searchString.toLowerCase();

+ if (/^re:+/.test(searchString))

+ domain = domain.replace(new RegExp(RegExp.rightContext, "g"), replacement);

+ else

+ domain = domain.replace(searchString, replacement);

+ }

+ // Now apply our rules on the domain name

+ for (let i = 0, l = rules.expressions.length; i < l; i++)

+ domain = applyExpression(domain, rules.expressions[i]);

+ // Find similar known domains, test domains without the www. prefix

+ if (domain.substr(0, 4) == "www.")

+ domain = "www." + getBestMatch(domain.substr(4), rules.domain, 1, ".");

+ else

+ domain = getBestMatch(domain, rules.domain, 1, ".");

+ return domain;

+exports.getDomainReferral = getDomainReferral;

+function getDomainReferral(domain)

+ if ("domainReferrals" in rules && domain in rules.domainReferrals)

+ return rules.domainReferrals[domain];

+ else

+ return null;

+function applyExpression(string, expression)

+ if (expression.nomatch && new RegExp(expression.nomatch).test(string))

+ return string;

+ return string.replace(new RegExp(expression.find, "g"), expression.replace);

+function getSimilarStrings(input, dictionary, maxDistance, separator)

+ // We use a non-deterministic final automaton to perform a search on all

+ // strings in the dictionary simultaneously (see

+ // http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata

+ // for the basic algorithm). However, we use Damerau-Levenshtein distance

+ // (transposition of two adjacent letters counts as one operation), meaning

+ // one additional transition for the automaton. The number of automaton states

+ // can theoretically be extremely large, the maxDistance parameter limits the

+ // number of states we need to process however. We process the input string

+ // backwards to allow matching domain names for a longer host name.

+ let results = [];

+ function processState(entry, distance, position, result)

+ {

+ let isString = (typeof entry == "string");

+ // Do we have a result?

+ if (position < 0 || input[position] == separator)

+ {

+ if (!isString && "" in entry)

+ results.push([result, input.substr(position + 1), distance, parseInt(entry[""], 10)]);

+ else if (isString && entry[0] == " ")

+ results.push([result, input.substr(position + 1), distance, parseInt(entry, 10)]);

+ }

+ // Maybe there is a match

+ if (position >= 0)

+ {

+ let nextChar = input[position];

+ if (!isString && nextChar in entry)

+ processState(entry[nextChar], distance, position - 1, nextChar + result);

+ else if (isString && entry[0] == nextChar)

+ processState(entry.substr(1), distance, position - 1, nextChar + result);

+ }

+ // Mistakes

+ if (distance < maxDistance)

+ {

+ // Deletion and substitution

+ if (!isString)

+ {

+ for (let c in entry)

+ {

+ if (c != "")

+ processState(entry[c], distance + 1, position, c + result);

+ if (c != "" && position >= 0)

+ processState(entry[c], distance + 1, position - 1, c + result);

+ }

+ else if (entry[0] != " ")

+ {

+ processState(entry.substr(1), distance + 1, position, entry[0] + result);

+ if (position >= 0)

+ processState(entry.substr(1), distance + 1, position - 1, entry[0] + result);

+ }

+ // Insertion

+ if (position >= 0)

+ processState(entry, distance + 1, position - 1, result);

+ // Transposition

+ if (position >= 1)

+ {

+ let nextChar1 = input[position];

+ let nextChar2 = input[position - 1];

+ if (isString)

+ {

+ if (entry[0] == nextChar2 && entry[1] == nextChar1)

+ processState(entry.substr(2), distance + 1, position - 2, nextChar1 + nextChar2 + result);

+ }

+ else if (nextChar2 in entry)

+ {

+ let nextEntry = entry[nextChar2];

+ if (typeof nextEntry != "string")

+ {

+ if (nextChar1 in nextEntry)

+ processState(nextEntry[nextChar1], distance + 1, position - 2, nextChar1 + nextChar2 + result);

+ }

+ else

+ {

+ if (nextEntry[0] == nextChar1)

+ processState(nextEntry.substr(1), distance + 1, position - 2, nextChar1 + nextChar2 + result);

+ }

+ processState(dictionary, 0, input.length - 1, "");

+ return results;

+function getBestMatch(input, dictionary, maxDistance, separator)

+ let suggestions = getSimilarStrings(input, dictionary, maxDistance, separator);

+ let bestSuggestion = null;

+ let bestSuggestionDistance;

+ let bestSuggestionMatched;

+ let bestSuggestionPriority;

+ for (let i = 0; i < suggestions.length; i++)

+ {

+ let [suggestion, matchedString, distance, priority] = suggestions[i];

+ if (suggestion == input)

+ return suggestion;

+ let matchedLen = matchedString.length;

+ if (priority < 0 && matchedLen == input.length)

+ {

+ // TLDs should never be proposed as a replacement for the entire host name

+ continue;

+ }

+ if (!bestSuggestion ||

+ bestSuggestionMatched < matchedLen ||

+ (bestSuggestionMatched == matchedLen && bestSuggestionDistance > distance) ||

+ (bestSuggestionMatched == matchedLen && bestSuggestionDistance == distance && bestSuggestionPriority < priority))

+ {

+ bestSuggestion = suggestion;

+ bestSuggestionDistance = distance;

+ bestSuggestionMatched = matchedLen;

+ bestSuggestionPriority = priority;

+ }

+ if (bestSuggestion)

+ return input.substr(0, input.length - bestSuggestionMatched) + bestSuggestion;

+ else

+ return input;

« no previous file with comments | « lib/typoNetError.js ('k') | lib/typoSurvey.js » ('j') | no next file with comments »