Commit 7a00a2e2 authored by Marco Bonardo's avatar Marco Bonardo
Browse files

Bug 1623383 - Cleanup URIFixup.jsm. r=harry

parent 1943cf3d
Loading
Loading
Loading
Loading
+169 −242
Original line number Diff line number Diff line
@@ -7,11 +7,14 @@
/**
 * This component handles fixing up URIs, by correcting obvious typos and adding
 * missing schemes.
 * URI references:
 *   http://www.faqs.org/rfcs/rfc1738.html
 *   http://www.faqs.org/rfcs/rfc2396.html
 */

// TODO (Bug 1623383): This is a direct conversion from .cpp, as such a few
// functions are overly complex and should be cleaned up.
/* eslint complexity: ["error", 63] */
// getFixupURIInfo has a complex logic, that likely could be simplified, but
// the risk of regressions is high, thus that should be done with care.
/* eslint complexity: ["error", 39] */

var EXPORTED_SYMBOLS = ["URIFixup", "URIFixupInfo"];

@@ -74,36 +77,25 @@ const {
const COMMON_PROTOCOLS = ["http", "https", "ftp", "file"];

// Regex used to identify user:password tokens in url strings.
// This is not a strict character check, because we try to fixup this part of
// a the url too.
// This is not a strict valid characters check, because we try to fixup this
// part of the url too.
XPCOMUtils.defineLazyGetter(
  this,
  "userPasswordRegex",
  () => /^([a-z+.-]+:\/{0,3})*[^\/@]+@.+/i
);

// Regex used to identify ascii alphabetical characters.
XPCOMUtils.defineLazyGetter(this, "asciiAlphaRegex", () => /^[a-z]$/i);
// Regex used to look for ascii alphabetical characters.
XPCOMUtils.defineLazyGetter(this, "asciiAlphaRegex", () => /[a-z]/i);

// Regex used to identify ascii a-f characters.
XPCOMUtils.defineLazyGetter(this, "asciiHexRegex", () => /^[a-f0-9]$/i);

// Regex used to identify ascii numbers.
XPCOMUtils.defineLazyGetter(
  this,
  "asciiNumberRegex",
  () => /^[0-9]+(\.[0-9]+)?$/
);
// Regex used to identify numbers.
XPCOMUtils.defineLazyGetter(this, "numberRegex", () => /^[0-9]+(\.[0-9]+)?$/);

// Regex used to identify tab separated content (having at least 2 tabs).
XPCOMUtils.defineLazyGetter(this, "maxOneTabRegex", () => /^[^\t]*\t?[^\t]*$/);

// Regex to test if the protocol might actually be a url without a protocol:
//
//   http://www.faqs.org/rfcs/rfc1738.html
//   http://www.faqs.org/rfcs/rfc2396.html
//
// e.g. Anything of the form:
// Regex used to test if a string with a protocol might instead be a url
// without a protocol but with a port:
//
//   <hostname>:<port> or
//   <hostname>:<port>/
@@ -138,18 +130,34 @@ XPCOMUtils.defineLazyGetter(
  () => /^([a-z][a-z0-9.+\t-]*):/i
);

// Regex used to match IPs. Note that these are not made to validate IPs, but
// just to detect strings that look like an IP. They also skip protocol.
// For IPv4 this also accepts a shorthand format with just 2 dots.
XPCOMUtils.defineLazyGetter(
  this,
  "IPv4LikeRegex",
  () => /^(?:[a-z+.-]+:\/*(?!\/))?(?:\d{1,3}\.){2,3}\d{1,3}(?::\d+|\/)?/i
);
XPCOMUtils.defineLazyGetter(
  this,
  "IPv6LikeRegex",
  () =>
    /^(?:[a-z+.-]+:\/*(?!\/))?\[(?:[0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?(?::\d+|\/)?/i
);

// Cache of whitelisted domains.
XPCOMUtils.defineLazyGetter(this, "domainsWhitelist", () => {
  const branch = "browser.fixup.domainwhitelist.";
  let domains = new Set(
    Services.prefs
      .getChildList("browser.fixup.domainwhitelist.")
      .getChildList(branch)
      .filter(p => Services.prefs.getBoolPref(p, false))
      .map(p => p.substring(30))
      .map(p => p.substring(branch.length))
  );
  // Hold onto the observer.
  // Hold onto the observer to avoid it being GC-ed.
  domains._observer = {
    observe(subject, topic, data) {
      let domain = data.substring(30);
      let domain = data.substring(branch.length);
      if (Services.prefs.getBoolPref(data, false)) {
        domains.add(domain);
      } else {
@@ -161,11 +169,7 @@ XPCOMUtils.defineLazyGetter(this, "domainsWhitelist", () => {
      Ci.nsISupportsWeakReference,
    ]),
  };
  Services.prefs.addObserver(
    "browser.fixup.domainwhitelist.",
    domains._observer,
    true
  );
  Services.prefs.addObserver(branch, domains._observer, true);
  return domains;
});

@@ -209,40 +213,12 @@ URIFixup.prototype = {
    let info = new URIFixupInfo(uriString);

    let scheme = extractScheme(uriString);
    let isCommonProtocol = COMMON_PROTOCOLS.includes(scheme);
    // View-source is a pseudo scheme. We're interested in fixing up the stuff
    // after it. The easiest way to do that is to call this method again with
    // the "view-source:" lopped off and then prepend it again afterwards.
    if (scheme == "view-source") {
      // We disable keyword lookup and alternate URIs so that small typos don't
      // cause us to look at very different domains.
      let newFixupFlags =
        fixupFlags &
        ~FIXUP_FLAG_ALLOW_KEYWORD_LOOKUP &
        ~FIXUP_FLAGS_MAKE_ALTERNATE_URI;

      let innerURIString = uriString.substring(12).trim();

      // Prevent recursion.
      let innerScheme = extractScheme(innerURIString);
      if (innerScheme == "view-source") {
        throw new Components.Exception(
          "Prevent view-source recursion",
          Cr.NS_ERROR_FAILURE
        );
      }

      let innerInfo = this.getFixupURIInfo(
        innerURIString,
        newFixupFlags,
      info.preferredURI = info.fixedURI = fixupViewSource(
        uriString,
        fixupFlags,
        postData
      );
      if (!innerInfo.preferredURI) {
        throw new Components.Exception("No preferredURI", Cr.NS_ERROR_FAILURE);
      }
      info.preferredURI = info.fixedURI = Services.io.newURI(
        "view-source:" + innerInfo.preferredURI.spec
      );
      return info;
    }

@@ -259,13 +235,14 @@ URIFixup.prototype = {

    // Fix up common scheme typos.
    // TODO: Use levenshtein distance here?
    let isCommonProtocol = COMMON_PROTOCOLS.includes(scheme);
    if (
      fixupSchemeTypos &&
      fixupFlags & FIXUP_FLAG_FIX_SCHEME_TYPOS &&
      scheme &&
      !isCommonProtocol
    ) {
      for (let [typo, fixed] of [
      info.fixupChangedProtocol = [
        ["ttp", "http"],
        ["htp", "http"],
        ["ttps", "https"],
@@ -274,19 +251,18 @@ URIFixup.prototype = {
        ["htps", "https"],
        ["ile", "file"],
        ["le", "file"],
      ]) {
      ].some(([typo, fixed]) => {
        if (uriString.startsWith(typo)) {
          scheme = fixed;
          uriString = scheme + uriString.substring(typo.length);
          info.fixupChangedProtocol = true;
          isCommonProtocol = true;
          break;
        }
          return true;
        }
        return false;
      });
    }

    // Now we need to check whether "scheme" is something we don't
    // really know about.
    // Now check whether "scheme" is something we don't know about.
    let isDefaultProtocolHandler =
      !isCommonProtocol &&
      scheme &&
@@ -329,9 +305,7 @@ URIFixup.prototype = {

    if (info.fixedURI) {
      if (!info.preferredURI) {
        if (fixupFlags & FIXUP_FLAGS_MAKE_ALTERNATE_URI) {
          info.fixupCreatedAlternateURI = makeAlternateFixedURI(info);
        }
        maybeSetAlternateFixedURI(info, fixupFlags);
        info.preferredURI = info.fixedURI;
      }
      return info;
@@ -342,15 +316,10 @@ URIFixup.prototype = {
    // Prune duff protocol schemes:
    //   ://totallybroken.url.com
    //   //shorthand.url.com
    let inputHadDuffProtocol = false;
    if (!scheme) {
      if (uriString.startsWith("://")) {
        uriString = uriString.substring(3);
        inputHadDuffProtocol = true;
      } else if (uriString.startsWith("//")) {
        uriString = uriString.substring(2);
        inputHadDuffProtocol = true;
      }
    let inputHadDuffProtocol =
      uriString.startsWith("://") || uriString.startsWith("//");
    if (inputHadDuffProtocol) {
      uriString = uriString.replace(/^:?\/\//, "");
    }

    // Avoid fixing up content that looks like tab-separated values.
@@ -361,11 +330,7 @@ URIFixup.prototype = {
      if (uriWithProtocol) {
        info.fixedURI = uriWithProtocol;
        info.fixupChangedProtocol = true;
        // Did the caller want us to try an alternative URI?
        // If so, attempt to fixup http://foo into http://www.foo.com
        if (fixupFlags & FIXUP_FLAGS_MAKE_ALTERNATE_URI) {
          info.fixupCreatedAlternateURI = makeAlternateFixedURI(info);
        }
        maybeSetAlternateFixedURI(info, fixupFlags);
        info.preferredURI = info.fixedURI;
      }
    }
@@ -374,12 +339,11 @@ URIFixup.prototype = {
    if (
      keywordEnabled &&
      fixupFlags & FIXUP_FLAG_ALLOW_KEYWORD_LOOKUP &&
      !inputHadDuffProtocol
      !inputHadDuffProtocol &&
      keywordURIFixup(uriString, info, isPrivateContext, postData)
    ) {
      if (keywordURIFixup(uriString, info, isPrivateContext, postData)) {
      return info;
    }
    }

    if (
      info.fixedURI &&
@@ -400,7 +364,7 @@ URIFixup.prototype = {
    }

    if (!info.preferredURI) {
      // we couldn't salvage anything.
      // We couldn't salvage anything.
      throw new Components.Exception(
        "Couldn't build a valid uri",
        Cr.NS_ERROR_MALFORMED_URI
@@ -575,6 +539,7 @@ URIFixupInfo.prototype = {
 * Implementation of isDomainWhitelisted, so we don't have to go through the
 * service.
 * @param {string} asciiHost
 * @returns {boolean} whether the domain is whitelisted
 */
function isDomainWhitelisted(asciiHost) {
  if (dnsFirstForSingleWords) {
@@ -595,8 +560,8 @@ function isDomainWhitelisted(asciiHost) {
 * If the suffix is unknown due to a typo this will try to fix it up.
 * @param {URIFixupInfo} info about the uri to check.
 * @note this may modify the public suffix of info.fixedURI.
 * @returns false if the public suffix is unknown, true in any other case,
 *          included if it's not present.
 * @returns {boolean} false if the public suffix is unknown, true in any other
 *          case, included if it's not present.
 */
function checkAndFixPublicSuffix(info) {
  let uri = info.fixedURI;
@@ -625,7 +590,7 @@ function checkAndFixPublicSuffix(info) {
  // Suffix is unknown, try to fix most common 3 chars TLDs typos.
  // .com is the most commonly mistyped tld, so it has more cases.
  let suffix = Services.eTLD.getPublicSuffix(uri);
  if (!suffix || asciiNumberRegex.test(suffix)) {
  if (!suffix || numberRegex.test(suffix)) {
    return true;
  }
  for (let [typo, fixed] of [
@@ -685,12 +650,14 @@ function tryKeywordFixupForURIInfo(
 * This generates an alternate fixedURI, by adding a prefix and a suffix to
 * the fixedURI host, if and only if the protocol is http. It should _never_
 * modify URIs with other protocols.
 * @param info an URIInfo object
 * @returns whether an alternate uri was generated.
 * @param {URIFixupInfo} info an URIInfo object
 * @param {integer} fixupFlags the fixup flags
 * @returns {boolean} Whether an alternate uri was generated
 */
function makeAlternateFixedURI(info) {
function maybeSetAlternateFixedURI(info, fixupFlags) {
  let uri = info.fixedURI;
  if (
    !(fixupFlags & FIXUP_FLAGS_MAKE_ALTERNATE_URI) ||
    !alternateEnabled ||
    // Code only works for http. Not for any other protocol including https!
    !uri.schemeIs("http") ||
@@ -701,14 +668,13 @@ function makeAlternateFixedURI(info) {
  ) {
    return false;
  }

  let oldHost = uri.host;
  // Don't create an alternate uri for localhost, because it would be confusing.
  if (oldHost == "localhost") {
    return false;
  }

  let numDots = (oldHost.match(/\./g) || []).length;

  // Get the prefix and suffix to stick onto the new hostname. By default these
  // are www. & .com but they could be any other value, e.g. www. & .org
  let prefix = Services.prefs.getCharPref(
@@ -721,6 +687,7 @@ function makeAlternateFixedURI(info) {
  );

  let newHost = "";
  let numDots = (oldHost.match(/\./g) || []).length;
  if (numDots == 0) {
    newHost = prefix + oldHost + suffix;
  } else if (numDots == 1) {
@@ -746,12 +713,13 @@ function makeAlternateFixedURI(info) {
    }
    return false;
  }
  info.fixupCreatedAlternateURI = true;
  return true;
}

/**
 * Try to fixup a file URI.
 * @param uriString The file URI to fix.
 * @param {string} uriString The file URI to fix.
 * @returns {nsIURI} a fixed uri or null.
 * @note FileURIFixup only returns a URI if it has to add the file: protocol.
 */
@@ -791,7 +759,7 @@ function fileURIFixup(uriString) {
 *    user:pass@no-scheme.com
 *
 * @param {string} uriString The string to fixup.
 * @returns an nsIURI built adding the default protocol to the string,
 * @returns {nsIURI} an nsIURI built adding the default protocol to the string,
 *          or null if fixing was not possible.
 */
function fixupURIProtocol(uriString) {
@@ -837,132 +805,41 @@ function keywordURIFixup(uriString, fixupInfo, isPrivateContext, postData) {
  // "nonQualifiedHost?some args"
  // "blah.com."

  // Note: -1 is greater than any actual location
  // in practice.  So if we cast all locations to uint32_t, then a <
  // b guarantees that either b is kNotFound and a is found, or both
  // are found and a found before b.

  // TODO: the cpp code used to assign -1 to an unsigned int, causing overflow.
  // We should replace most of this with better js code.
  const NOT_FOUND = Infinity;
  let firstDotLoc = NOT_FOUND;
  let lastDotLoc = NOT_FOUND;
  let firstColonLoc = NOT_FOUND;
  let firstQuoteLoc = NOT_FOUND;
  let firstSpaceLoc = NOT_FOUND;
  let firstQMarkLoc = NOT_FOUND;
  let lastLSBracketLoc = NOT_FOUND;
  let lastSlashLoc = NOT_FOUND;
  let foundDots = 0;
  let foundColons = 0;
  let foundDigits = 0;
  let foundRSBrackets = 0;
  let looksLikeIpv6 = true;
  let hasAsciiAlpha = false;
  for (let i = 0, pos = 0; i < uriString.length; ++i, ++pos) {
    let iter = uriString[i];
    if (pos >= 1 && foundRSBrackets == 0) {
      if (
        !(
          lastLSBracketLoc == 0 &&
          (iter == ":" ||
            iter == "." ||
            iter == "]" ||
            asciiHexRegex.test(iter))
        )
      ) {
        looksLikeIpv6 = false;
      }
    }

    // If we're at the end of the string or this is the first slash,
    // check if the thing before the slash looks like ipv4:
    if (
      (i == uriString.length - 1 ||
        (lastSlashLoc == NOT_FOUND && iter == "/")) &&
      // Need 2 or 3 dots + only digits
      (foundDots == 2 || foundDots == 3) &&
      // and they should be all that came before now:
      (foundDots + foundDigits == pos ||
        // or maybe there was also exactly 1 colon that came after the last
        // dot, and the digits, dots and colon were all that came before now:
        (foundColons == 1 &&
          firstColonLoc > lastDotLoc &&
          foundDots + foundDigits + foundColons == pos))
    ) {
      // Hurray, we got ourselves some ipv4!
      // At this point, there's no way we will do a keyword lookup, so just bail
      // immediately:
  // Check for IPs.
  if (IPv4LikeRegex.test(uriString) || IPv6LikeRegex.test(uriString)) {
    return false;
  }

    if (iter == ".") {
      foundDots++;
      lastDotLoc = pos;
      if (firstDotLoc == NOT_FOUND) {
        firstDotLoc = pos;
      }
    } else if (iter == ":") {
      foundColons++;
      if (firstColonLoc == NOT_FOUND) {
        firstColonLoc = pos;
      }
    } else if (iter == " " && firstSpaceLoc == NOT_FOUND) {
      firstSpaceLoc = pos;
    } else if (iter == "?" && firstQMarkLoc == NOT_FOUND) {
      firstQMarkLoc = pos;
    } else if ((iter == "'" || iter == '"') && firstQuoteLoc == NOT_FOUND) {
      firstQuoteLoc = pos;
    } else if (iter == "[") {
      lastLSBracketLoc = pos;
    } else if (iter == "]") {
      foundRSBrackets++;
    } else if (iter == "/") {
      lastSlashLoc = pos;
    } else if (asciiAlphaRegex.test(iter)) {
      hasAsciiAlpha = true;
    } else if (asciiNumberRegex.test(iter)) {
      foundDigits++;
    }
  }

  if (lastLSBracketLoc > 0 || foundRSBrackets != 1) {
    looksLikeIpv6 = false;
  // We do keyword lookups if the input starts with a question mark, or if it
  // contains a space or quote, provided they don't come after a dot, colon or
  // question mark.
  if (uriString.startsWith("?") || /^[^.:?]*[\s"']/.test(uriString)) {
    return tryKeywordFixupForURIInfo(
      fixupInfo.originalInput,
      fixupInfo,
      isPrivateContext,
      postData
    );
  }

  // If there are only colons and only hexadecimal characters ([a-z][0-9])
  // enclosed in [], then don't do a keyword lookup
  if (looksLikeIpv6) {
  // Avoid lookup if we can identify an host and it's whitelisted.
  // Note that if dnsFirstForSingleWords is true isDomainWhitelisted will always
  // return true, so we can avoid checking dnsFirstForSingleWords after this.
  let asciiHost = fixupInfo.fixedURI?.asciiHost;
  if (asciiHost && isDomainWhitelisted(asciiHost)) {
    return false;
  }

  let asciiHost = fixupInfo.fixedURI && fixupInfo.fixedURI.asciiHost;
  // Or when the asciiHost is the same as displayHost and there are no
  // alphabetical characters
  let displayHost = fixupInfo.fixedURI && fixupInfo.fixedURI.displayHost;

  // We do keyword lookups if a space or quote preceded the dot, colon
  // or question mark (or if the latter is not found, or if the input starts
  // with a question mark)
  let hasAsciiAlpha = asciiAlphaRegex.test(uriString);
  if (
    ((firstSpaceLoc < firstDotLoc || firstQuoteLoc < firstDotLoc) &&
      (firstSpaceLoc < firstColonLoc || firstQuoteLoc < firstColonLoc) &&
      (firstSpaceLoc < firstQMarkLoc || firstQuoteLoc < firstQMarkLoc)) ||
    firstQMarkLoc == 0
  ) {
    return tryKeywordFixupForURIInfo(
      fixupInfo.originalInput,
      fixupInfo,
      isPrivateContext,
      postData
    );
    // ... or when the asciiHost is the same as displayHost and there are no
    // characters from [a-z][A-Z]
  } else if (
    asciiHost &&
    displayHost &&
    !hasAsciiAlpha &&
    asciiHost.toLowerCase() == displayHost.toLowerCase()
  ) {
    if (!dnsFirstForSingleWords) {
    return tryKeywordFixupForURIInfo(
      fixupInfo.originalInput,
      fixupInfo,
@@ -970,32 +847,33 @@ function keywordURIFixup(uriString, fixupInfo, isPrivateContext, postData) {
      postData
    );
  }
    // ... or if there is no question mark or colon, and there is either no
    // dot, or exactly 1 and it is the first or last character of the input:
  } else if (
    (firstDotLoc == NOT_FOUND ||
      (foundDots == 1 &&
        (firstDotLoc == 0 || firstDotLoc == uriString.length - 1))) &&
    firstColonLoc == NOT_FOUND &&
    firstQMarkLoc == NOT_FOUND
  ) {
    if (asciiHost && isDomainWhitelisted(asciiHost)) {

  // Avoid lookup if we reached this point and there is a question mark or colon.
  if (uriString.includes(":") || uriString.includes("?")) {
    return false;
  }

    // ... unless there are no dots, and a slash, and alpha characters, and
    // this is a valid host:
  // Keyword lookup if there is exactly one dot and it is the first or last
  // character of the input.
  let firstDotIndex = uriString.indexOf(".");
  if (
      firstDotLoc == NOT_FOUND &&
      lastSlashLoc != NOT_FOUND &&
      hasAsciiAlpha &&
      asciiHost
    firstDotIndex == uriString.length - 1 ||
    (firstDotIndex == 0 && firstDotIndex == uriString.lastIndexOf("."))
  ) {
      return false;
    return tryKeywordFixupForURIInfo(
      fixupInfo.originalInput,
      fixupInfo,
      isPrivateContext,
      postData
    );
  }

    // If we get here, we don't have a valid URI, or we did but the
    // host is not whitelisted, so we do a keyword search *anyway*:
  // Keyword lookup if there is no dot and the string doesn't include a slash,
  // or any alphabetical character or an host.
  if (
    firstDotIndex == -1 &&
    (!uriString.includes("/") || !hasAsciiAlpha || !asciiHost)
  ) {
    return tryKeywordFixupForURIInfo(
      fixupInfo.originalInput,
      fixupInfo,
@@ -1003,10 +881,59 @@ function keywordURIFixup(uriString, fixupInfo, isPrivateContext, postData) {
      postData
    );
  }

  return false;
}

/**
 * Mimics the logic in Services.io.extractScheme, but avoids crossing XPConnect.
 * @param {string} uriString the string to examine
 * @returns {string} a scheme or empty string if one could not be identified
 */
function extractScheme(uriString) {
  let matches = uriString.match(possibleProtocolRegex);
  return matches ? matches[1].replace("\t", "").toLowerCase() : "";
}

/**
 * View-source is a pseudo scheme. We're interested in fixing up the stuff
 * after it. The easiest way to do that is to call this method again with
 * the "view-source:" lopped off and then prepend it again afterwards.
 * @param {string} uriString The original string to fixup
 * @param {integer} fixupFlags The original fixup flags
 * @param {nsIInputStream} postData Optional POST data for the search
 * @returns {nsIURI} a fixed URI
 * @throws if it's not possible to fixup the url
 */
function fixupViewSource(uriString, fixupFlags, postData) {
  // We disable keyword lookup and alternate URIs so that small typos don't
  // cause us to look at very different domains.
  let newFixupFlags =
    fixupFlags &
    ~FIXUP_FLAG_ALLOW_KEYWORD_LOOKUP &
    ~FIXUP_FLAGS_MAKE_ALTERNATE_URI;

  let innerURIString = uriString.substring(12).trim();

  // Prevent recursion.
  let innerScheme = extractScheme(innerURIString);
  if (innerScheme == "view-source") {
    throw new Components.Exception(
      "Prevent view-source recursion",
      Cr.NS_ERROR_FAILURE
    );
  }

  let info = Services.uriFixup.getFixupURIInfo(
    innerURIString,
    newFixupFlags,
    postData
  );
  if (!info.preferredURI) {
    throw new Components.Exception(
      "Couldn't build a valid uri",
      Cr.NS_ERROR_MALFORMED_URI
    );
  }
  return Services.io.newURI("view-source:" + info.preferredURI.spec);
}