Commit 4af24d9b authored by Will Hawkins's avatar Will Hawkins
Browse files

Issue #5376: Lenient and strict methods for determing whether a string is a URL

From only one method to determine whether a string is a url, create
two -- one for a strict check and one for a lenient check. The strict
check will guarantee that the URL precisely conforms to ICANN standards
(e.g., the TLD in the URL is valid). The lenient check will classify any
string that contains ://, ., or : as a URL.
parent 8be7eceb
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -28,6 +28,15 @@ private val re = object {
 */
fun String.isUrl() = URLStringUtils.isURLLike(this)

/**
 * Checks if this string is a URL.
 *
 * This method performs a strict check to determine whether the string is a URL. It takes longer
 * to execute than isUrl() but checks whether, e.g., the TLD is ICANN-recognized. Consider
 * using isUrl() unless these guarantees are required.
 */
fun String.isUrlStrict() = URLStringUtils.isURLLikeStrict(this)

fun String.toNormalizedUrl() = URLStringUtils.toNormalizedURL(this)

fun String.isPhone() = re.phoneish.matches(this)
+28 −0
Original line number Diff line number Diff line
@@ -43,6 +43,34 @@ class StringTest {
        assertNotEquals("Tweet:", url)
    }

    @Test
    fun isUrlStrict() {
        assertTrue("mozilla.org".isUrlStrict())
        assertTrue(" mozilla.org ".isUrlStrict())
        assertTrue("http://mozilla.org".isUrlStrict())
        assertTrue("https://mozilla.org".isUrlStrict())
        assertTrue("file://somefile.txt".isUrlStrict())
        assertTrue("http://mozilla".isUrlStrict())
        assertTrue("http://192.168.255.255".isUrlStrict())
        assertTrue("about:crashcontent".isUrlStrict())
        assertTrue(" about:crashcontent ".isUrlStrict())
        assertTrue("sample:about ".isUrlStrict())

        assertFalse("sample.notatld".isUrlStrict())
        assertTrue("sample.rocks".isUrlStrict())

        assertFalse("mozilla".isUrlStrict())
        assertFalse("mozilla android".isUrlStrict())
        assertFalse(" mozilla android ".isUrlStrict())
        assertFalse("Tweet:".isUrlStrict())
        assertFalse("inurl:mozilla.org advanced search".isUrlStrict())
        assertFalse("what is about:crashes".isUrlStrict())

        val extraText = "Check out @asa’s Tweet: https://twitter.com/asa/status/123456789?s=09"
        val url = extraText.split(" ").find { it.isUrlStrict() }
        assertNotEquals("Tweet:", url)
    }

    @Test
    fun toNormalizedUrl() {
        val expectedUrl = "http://mozilla.org"
+60 −2
Original line number Diff line number Diff line
@@ -6,16 +6,39 @@ package mozilla.components.support.utils

import android.net.Uri
import android.text.TextUtils
import androidx.annotation.VisibleForTesting
import java.util.regex.Pattern

object URLStringUtils {
    fun isURLLike(string: String, safe: Boolean = false) =
    /**
     * Determine whether a string is a URL.
     *
     * This method performs a strict check to determine whether a string is a URL. It takes longer
     * to execute than isURLLike() but checks whether, e.g., the TLD is ICANN-recognized. Consider
     * using isURLLike() unless these guarantees are required.
     */
    fun isURLLikeStrict(string: String, safe: Boolean = false) =
        if (safe) {
            string.matches(WebURLFinder.fuzzyUrlRegex)
        } else {
            string.matches(WebURLFinder.fuzzyUrlNonWebRegex)
        }

    fun isSearchTerm(string: String) = !isURLLike(string, false)
    /**
     * Determine whether a string is a URL.
     *
     * This method performs a lenient check to determine whether a string is a URL. Anything that
     * contains a :, ://, or . and has no internal spaces is potentially a URL. If you need a
     * stricter check, consider using isURLLikeStrict().
     */
    fun isURLLike(string: String) = isURLLenient.matcher(string).matches()

    /**
     * Determine whether a string is a search term.
     *
     * This method recognizes a string as a search term as anything other than a URL.
     */
    fun isSearchTerm(string: String) = !isURLLike(string)

    /**
     * Normalizes a URL String.
@@ -28,4 +51,39 @@ object URLStringUtils {
        }
        return uri.toString()
    }

    private val isURLLenient by lazy {
        // Be lenient about what is classified as potentially a URL. Anything that contains a :,
        // ://, or . and has no internal spaces is potentially a URL.
        //
        // Use java.util.regex because it is always unicode aware on Android.
        // https://developer.android.com/reference/java/util/regex/Pattern.html

        // Use both the \w+ and \S* after the punctuation because they seem to match slightly
        // different things. The \S matches any non-whitespace character (e.g., '~') and \w
        // matches only word characters. In other words, the regex is requiring that there be a
        // non-symbol character somewhere after the ., : or :// and before any other character
        // or the end of the string. For example, match
        // mozilla.com/~userdir
        // and not
        // mozilla./~ or mozilla:/
        // Without the [/]* after the :// in the alternation of the characters required to be a
        // valid URL,
        // file:///home/user/myfile.html
        // is considered a search term; it is clearly a URL.
        Pattern.compile("^\\s*\\w+(://[/]*|:|\\.)\\w+\\S*\\s*$", flags)
    }

    @VisibleForTesting(otherwise = VisibleForTesting.PRIVATE)
    internal const val UNICODE_CHARACTER_CLASS: Int = 0x100

    // To run tests on a non-Android device (like a computer), Pattern.compile
    // requires a flag to enable unicode support. Set a value like flags here with a local
    // copy of UNICODE_CHARACTER_CLASS. Use a local copy because that constant is not
    // available on Android platforms < 24 (Fenix targets 21). At runtime this is not an issue
    // because, again, Android REs are always unicode compliant.
    // NB: The value has to go through an intermediate variable; otherwise, the linter will
    // complain that this value is not one of the predefined enums that are allowed.
    @VisibleForTesting(otherwise = VisibleForTesting.PRIVATE)
    internal var flags = 0
}
+83 −31
Original line number Diff line number Diff line
@@ -7,17 +7,24 @@ package mozilla.components.support.utils
import androidx.test.ext.junit.runners.AndroidJUnit4
import mozilla.components.support.utils.URLStringUtils.isSearchTerm
import mozilla.components.support.utils.URLStringUtils.isURLLike
import mozilla.components.support.utils.URLStringUtils.isURLLikeStrict
import mozilla.components.support.utils.URLStringUtils.toNormalizedURL
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
import org.junit.Assert.assertNotEquals
import org.junit.Assert.assertTrue
import org.junit.Before
import org.junit.Test
import org.junit.runner.RunWith

@RunWith(AndroidJUnit4::class)
class URLStringUtilsTest {

    @Before
    fun configurePatternFlags() {
        URLStringUtils.flags = URLStringUtils.UNICODE_CHARACTER_CLASS
    }

    @Test
    fun toNormalizedURL() {
        val expectedUrl = "http://mozilla.org"
@@ -27,49 +34,94 @@ class URLStringUtilsTest {
    }

    @Test
    fun isURLLike() {
        assertTrue(isURLLike("mozilla.org"))
        assertTrue(isURLLike(" mozilla.org "))
        assertTrue(isURLLike("http://mozilla.org"))
        assertTrue(isURLLike("https://mozilla.org"))
        assertTrue(isURLLike("file://somefile.txt"))
        assertFalse(isURLLike("file://somefile.txt", true))

        assertTrue(isURLLike("http://mozilla"))
        assertTrue(isURLLike("http://192.168.255.255"))
        assertTrue(isURLLike("192.167.255.255"))
        assertTrue(isURLLike("about:crashcontent"))
        assertTrue(isURLLike(" about:crashcontent "))
        assertTrue(isURLLike("sample:about "))
        assertTrue(isURLLike("https://mozilla-mobile.com"))

        assertTrue(isURLLike("link.info"))
        assertFalse(isURLLike("link.unknown"))

        assertFalse(isURLLike("mozilla"))
        assertFalse(isURLLike("mozilla android"))
        assertFalse(isURLLike(" mozilla android "))
        assertFalse(isURLLike("Tweet:"))
        assertFalse(isURLLike("inurl:mozilla.org advanced search"))
        assertFalse(isURLLike("what is about:crashes"))
    fun isURLLikeStrict() {
        assertTrue(isURLLikeStrict("mozilla.org"))
        assertTrue(isURLLikeStrict(" mozilla.org "))
        assertTrue(isURLLikeStrict("http://mozilla.org"))
        assertTrue(isURLLikeStrict("https://mozilla.org"))
        assertTrue(isURLLikeStrict("file://somefile.txt"))
        assertFalse(isURLLikeStrict("file://somefile.txt", true))

        assertTrue(isURLLikeStrict("http://mozilla"))
        assertTrue(isURLLikeStrict("http://192.168.255.255"))
        assertTrue(isURLLikeStrict("192.167.255.255"))
        assertTrue(isURLLikeStrict("about:crashcontent"))
        assertTrue(isURLLikeStrict(" about:crashcontent "))
        assertTrue(isURLLikeStrict("sample:about "))
        assertTrue(isURLLikeStrict("https://mozilla-mobile.com"))

        assertTrue(isURLLikeStrict("link.info"))
        assertFalse(isURLLikeStrict("link.unknown"))

        assertFalse(isURLLikeStrict("mozilla"))
        assertFalse(isURLLikeStrict("mozilla android"))
        assertFalse(isURLLikeStrict(" mozilla android "))
        assertFalse(isURLLikeStrict("Tweet:"))
        assertFalse(isURLLikeStrict("inurl:mozilla.org advanced search"))
        assertFalse(isURLLikeStrict("what is about:crashes"))

        val extraText = "Check out @asa’s Tweet: https://twitter.com/asa/status/123456789?s=09"
        val url = extraText.split(" ").find { isURLLike(it) }
        val url = extraText.split(" ").find { isURLLikeStrict(it) }
        assertNotEquals("Tweet:", url)

        assertFalse(isURLLike("3.14"))
        assertFalse(isURLLike("3.14.2019"))
        assertFalse(isURLLikeStrict("3.14"))
        assertFalse(isURLLikeStrict("3.14.2019"))

        assertFalse(isURLLikeStrict("file://somefile.txt", true))
        assertFalse(isURLLikeStrict("about:config", true))
    }

    @Test
    fun isUrlLike() {
        assertFalse(isURLLike("inurl:mozilla.org advanced search"))
        assertFalse(isURLLike("sf: help"))
        assertFalse(isURLLike("mozilla./~"))
        assertFalse(isURLLike("cnn.com politics"))

        assertTrue(isURLLike("about:config"))
        assertTrue(isURLLike("about:config:8000"))
        assertTrue(isURLLike("file:///home/user/myfile.html"))
        assertTrue(isURLLike("file://////////////home//user/myfile.html"))
        assertTrue(isURLLike("file://C:\\Users\\user\\myfile.html"))
        assertTrue(isURLLike("http://192.168.255.255"))

        assertTrue(isURLLike("link.unknown"))

        // Per https://bugs.chromium.org/p/chromium/issues/detail?id=31405, ICANN will accept
        // purely numeric gTLDs.
        assertTrue(isURLLike("3.14.2019"))
        assertTrue(isURLLike(" cnn.com "))
        assertTrue(isURLLike(" cnn.com"))
        assertTrue(isURLLike("cnn.com "))
        assertTrue(isURLLike("mozilla.com/~userdir"))

        assertFalse(isURLLike("file://somefile.txt", true))
        assertFalse(isURLLike("about:config", true))
        assertTrue(isURLLike("http://faß.de//"))
        assertTrue(isURLLike("cnn.cơḿ"))
        assertTrue(isURLLike("cnn.çơḿ"))
    }

    @Test
    fun isSearchTerm() {
        assertTrue(isSearchTerm("inurl:mozilla.org advanced search"))
        assertTrue(isSearchTerm("3.14.2019"))
        assertTrue(isSearchTerm("sf: help"))
        assertTrue(isSearchTerm("mozilla./~"))
        assertTrue(isSearchTerm("cnn.com politics"))

        assertFalse(isSearchTerm("about:config"))
        assertFalse(isSearchTerm("about:config:8000"))
        assertFalse(isSearchTerm("file:///home/user/myfile.html"))
        assertFalse(isSearchTerm("file://////////////home//user/myfile.html"))
        assertFalse(isSearchTerm("file://C:\\Users\\user\\myfile.html"))
        assertFalse(isSearchTerm("http://192.168.255.255"))
        // Per https://bugs.chromium.org/p/chromium/issues/detail?id=31405, ICANN will accept
        // purely numeric gTLDs.
        assertFalse(isSearchTerm("3.14.2019"))
        assertFalse(isSearchTerm(" cnn.com "))
        assertFalse(isSearchTerm(" cnn.com"))
        assertFalse(isSearchTerm("cnn.com "))
        assertFalse(isSearchTerm("mozilla.com/~userdir"))
        assertFalse(isSearchTerm("http://faß.de//"))
        assertFalse(isSearchTerm("cnn.cơḿ"))
        assertFalse(isSearchTerm("cnn.çơḿ"))
    }
}
+1 −0
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ permalink: /changelog/

* **support-ktx**
  * Added `Context.getDrawableWithTint` extension method to get a drawable resource with a tint applied.
	* `String.isUrl` is now using a more lenient check for improved performance. Strictly checking whether a string is a URL or not is supported through the new `String.isUrlStrict` method.

* **support-base**
  * ⚠️ **This is a breaking change**: