Commit 359f27ab authored by Michael Comella's avatar Michael Comella Committed by Michael Comella
Browse files

For #17972: split frameworkStart into two metrics.

We do this in order to make it easier to analyze in GLAM: see the metric
descriptions for more details.

Additionally, we change the time unit to milliseconds to make it easier
to analyze in GLAM.
parent 012cc215
Loading
Loading
Loading
Loading
+34 −6
Original line number Diff line number Diff line
@@ -3878,14 +3878,21 @@ addons:
    expires: "2021-04-01"

startup.timeline:
  framework_start:
  framework_primary:
    send_in_pings:
      - startup-timeline
    type: timespan
    time_unit: nanosecond
    time_unit: millisecond
    description: |
      The duration the Android framework takes to start before letting us run
      code in `*Application.init`. This is calculated from `appInitTimestamp -
      code in `*Application.init` when this device has `clock_ticks_per_second`
      equal to 100: if it's not equal to 100, then this value is captured in
      `framework_secondary`. We split this into two metrics to make it easier
      to analyze in GLAM. We split on 100 because when we did our initial brief
      analysis - https://sql.telemetry.mozilla.org/queries/75591 - the results
      for clocks ticks were overwhelmingly 100.

      The duration is calculated from `appInitTimestamp -
      processStartTimestamp`. `processStartTimestamp` is derived from the clock
      tick time unit, which is expected to be less granular than nanoseconds.
      Therefore, we convert and round our timestamps to clock ticks before
@@ -3895,9 +3902,30 @@ startup.timeline:
      devices, is also reported as a metric
    bugs:
      - https://github.com/mozilla-mobile/fenix/issues/8803
      - https://github.com/mozilla-mobile/fenix/issues/17972
    data_reviews:
      - https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626
      - https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068
      - https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284
    data_sensitivity:
      - technical
    notification_emails:
      - perf-android-fe@mozilla.com
      - mcomella@mozilla.com
    expires: "2021-08-01"
  framework_secondary:
    send_in_pings:
      - startup-timeline
    type: timespan
    time_unit: millisecond
    description: |
      The duration the Android framework takes to start before letting us run
      code in `*Application.init` when this device has `clock_ticks_per_second`
      not equal to 100. For more details on this metric, see `framework_primary`
    bugs:
      - https://github.com/mozilla-mobile/fenix/issues/17972
    data_reviews:
      - https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284
    data_sensitivity:
      - technical
    notification_emails:
@@ -3909,9 +3937,9 @@ startup.timeline:
      - startup-timeline
    type: boolean
    description: |
      An error when attempting to record `framework_start` - the application
      init timestamp returned a negative value - which is likely indicative of a
      bug in the implementation.
      An error when attempting to record `framework_primary/secondary` - the
      application init timestamp returned a negative value - which is likely
      indicative of a bug in the implementation.
    bugs:
      - https://github.com/mozilla-mobile/fenix/issues/8803
    data_reviews:
+1 −0
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ startup-timeline:
    - https://github.com/mozilla-mobile/fenix/issues/17972
  data_reviews:
    - https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626
    - https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284
  notification_emails:
    - perf-android-fe@mozilla.com
    - mcomella@mozilla.com
+15 −5
Original line number Diff line number Diff line
@@ -54,20 +54,30 @@ internal class StartupFrameworkStartMeasurement(
        if (applicationInitNanos < 0) {
            telemetry.frameworkStartError.set(true)
        } else {
            val clockTicksPerSecond = stat.clockTicksPerSecond.also {
                // framework* is derived from the number of clock ticks per second. To ensure this
                // value does not throw off our result, we capture it too.
                telemetry.clockTicksPerSecond.add(it.toInt())
            }

            // In our brief analysis, clock ticks per second was overwhelmingly equal to 100. To make
            // analysis easier in GLAM, we split the results into two separate metrics. See the
            // metric descriptions for more details.
            @Suppress("MagicNumber") // it's more confusing to separate the comment above from the value declaration.
            val durationMetric =
                if (clockTicksPerSecond == 100L) telemetry.frameworkPrimary else telemetry.frameworkSecondary

            try {
                telemetry.frameworkStart.setRawNanos(getFrameworkStartNanos())
                durationMetric.setRawNanos(getFrameworkStartNanos())
            } catch (e: FileNotFoundException) {
                // Privacy managers can add hooks that block access to reading system /proc files.
                // We want to catch these exception and report an error on accessing the file
                // rather than an implementation error.
                telemetry.frameworkStartReadError.set(true)
            }

            // frameworkStart is derived from the number of clock ticks per second. To ensure this
            // value does not throw off our result, we capture it too.
            telemetry.clockTicksPerSecond.add(stat.clockTicksPerSecond.toInt())
        }
    }

    /**
     * @throws [java.io.FileNotFoundException]
     */
+73 −14
Original line number Diff line number Diff line
@@ -17,7 +17,8 @@ import org.junit.Before
import org.junit.Test
import org.mozilla.fenix.GleanMetrics.StartupTimeline as Telemetry

private const val CLOCK_TICKS_PER_SECOND = 100L
private const val PRIMARY_TICKS = 100L
private const val SECONDARY_TICKS = 50L

class StartupFrameworkStartMeasurementTest {

@@ -27,10 +28,13 @@ class StartupFrameworkStartMeasurementTest {
    // We'd prefer to use the Glean test methods over these mocks but they require us to add
    // Robolectric and it's not worth the impact on test duration.
    @MockK private lateinit var telemetry: Telemetry
    @MockK(relaxed = true) private lateinit var frameworkStart: TimespanMetricType
    @MockK(relaxed = true) private lateinit var frameworkPrimary: TimespanMetricType
    @MockK(relaxed = true) private lateinit var frameworkSecondary: TimespanMetricType
    @MockK(relaxed = true) private lateinit var frameworkStartError: BooleanMetricType
    @MockK(relaxed = true) private lateinit var clockTicksPerSecond: CounterMetricType

    private var clockTicksPerSecondValue = -1L

    private var elapsedRealtimeNanos = -1L
    private var processStartTimeTicks = -1L

@@ -41,13 +45,17 @@ class StartupFrameworkStartMeasurementTest {
        elapsedRealtimeNanos = -1
        processStartTimeTicks = -1

        // This value is hard-coded in the OS so we default to it as it's the expected value.
        clockTicksPerSecondValue = PRIMARY_TICKS

        stat = spyk(object : Stat() {
            override val clockTicksPerSecond: Long get() = CLOCK_TICKS_PER_SECOND
            override val clockTicksPerSecond: Long get() = clockTicksPerSecondValue
        })
        every { stat.getProcessStartTimeTicks(any()) } answers { processStartTimeTicks }
        val getElapsedRealtimeNanos = { elapsedRealtimeNanos }

        every { telemetry.frameworkStart } returns frameworkStart
        every { telemetry.frameworkPrimary } returns frameworkPrimary
        every { telemetry.frameworkSecondary } returns frameworkSecondary
        every { telemetry.frameworkStartError } returns frameworkStartError
        every { telemetry.clockTicksPerSecond } returns clockTicksPerSecond

@@ -67,13 +75,32 @@ class StartupFrameworkStartMeasurementTest {
    }

    @Test
    fun `GIVEN app init is set to valid values WHEN metrics are set THEN frameworkStart is set with the correct value`() {
    fun `GIVEN app init is set to valid values and clock ticks per second is 100 WHEN metrics are set THEN frameworkPrimary is set with the correct value`() {
        clockTicksPerSecondValue = PRIMARY_TICKS
        setProcessAppInitAndMetrics(processStart = 166_636_813, appInit = 1_845_312_345_673_925)
        verifyFrameworkStartSuccess(178_944_220_000_000, isPrimary = true) // calculated by hand.
    }

    @Test
    fun `GIVEN app init is set to valid values and clock ticks per second is not 100 WHEN metrics are set THEN frameworkSecondary is set with the correct value`() {
        clockTicksPerSecondValue = SECONDARY_TICKS
        setProcessAppInitAndMetrics(processStart = 166_636_813, appInit = 1_845_312_345_673_925)
        verifyFrameworkStartSuccess(178_944_220_000_000) // calculated by hand.
        verifyFrameworkStartSuccess(178_944_220_000_000, isPrimary = false) // calculated by hand.
    }

    @Test // this overlaps with the success case test.
    fun `GIVEN app init has valid values WHEN onAppInit is called twice and metrics are set THEN frameworkStart uses the first app init value`() {
    fun `GIVEN app init has valid values and clock ticks per second is 100 WHEN onAppInit is called twice and metrics are set THEN frameworkPrimary uses the first app init value`() {
        clockTicksPerSecondValue = PRIMARY_TICKS
        testAppInitCalledTwice(isPrimary = true)
    }

    @Test // this overlaps with the success case test.
    fun `GIVEN app init has valid values and clock ticks per second is not 100 WHEN onAppInit is called twice and metrics are set THEN frameworkSecondary uses the first app init value`() {
        clockTicksPerSecondValue = SECONDARY_TICKS
        testAppInitCalledTwice(isPrimary = false)
    }

    private fun testAppInitCalledTwice(isPrimary: Boolean) {
        processStartTimeTicks = 166_636_813

        elapsedRealtimeNanos = 1_845_312_345_673_925
@@ -82,14 +109,28 @@ class StartupFrameworkStartMeasurementTest {
        metrics.onApplicationInit()

        metrics.setExpensiveMetric()
        verifyFrameworkStartSuccess(178_944_220_000_000) // calculated by hand.
        verifyFrameworkStartSuccess(178_944_220_000_000, isPrimary) // calculated by hand.
    }

    @Test
    fun `GIVEN app init have valid values WHEN metrics are set twice THEN frameworkStart is only set once`() {
    fun `GIVEN app init have valid values and clock ticks per second is 100 WHEN metrics are set twice THEN frameworkPrimary is only set once`() {
        clockTicksPerSecondValue = PRIMARY_TICKS
        testMetricsSetTwice(isPrimary = true)
    }

    @Test
    fun `GIVEN app init have valid values and clock ticks per second is not 100 WHEN metrics are set twice THEN frameworkSecondary is only set once`() {
        clockTicksPerSecondValue = SECONDARY_TICKS
        testMetricsSetTwice(isPrimary = false)
    }

    private fun testMetricsSetTwice(isPrimary: Boolean) {
        setProcessAppInitAndMetrics(10, 100)
        metrics.setExpensiveMetric()
        verify(exactly = 1) { frameworkStart.setRawNanos(any()) }

        val (setMetric, unsetMetric) = getSetAndUnsetMetric(isPrimary)
        verify(exactly = 1) { setMetric.setRawNanos(any()) }
        verify { unsetMetric wasNot Called }
        verify(exactly = 1) { clockTicksPerSecond.add(any()) }
        verify { frameworkStartError wasNot Called }
    }
@@ -103,15 +144,33 @@ class StartupFrameworkStartMeasurementTest {
        metrics.setExpensiveMetric()
    }

    private fun verifyFrameworkStartSuccess(nanos: Long) {
        verify { frameworkStart.setRawNanos(nanos) }
        verify { clockTicksPerSecond.add(CLOCK_TICKS_PER_SECOND.toInt()) }
    private fun verifyFrameworkStartSuccess(nanos: Long, isPrimary: Boolean) {
        val (setMetric, unsetMetric) = getSetAndUnsetMetric(isPrimary)
        verify { setMetric.setRawNanos(nanos) }
        verify { unsetMetric wasNot Called }

        val expectedClockTicksPerSecond = getExpectedClockTicksPerSecond(isPrimary)
        verify { clockTicksPerSecond.add(expectedClockTicksPerSecond.toInt()) }
        verify { frameworkStartError wasNot Called }
    }

    private fun verifyFrameworkStartError() {
        verify { frameworkStartError.set(true) }
        verify { frameworkStart wasNot Called }
        verify { frameworkPrimary wasNot Called }
        verify { frameworkSecondary wasNot Called }
        verify { clockTicksPerSecond wasNot Called }
    }

    private fun getSetAndUnsetMetric(isPrimary: Boolean): Pair<TimespanMetricType, TimespanMetricType> {
        return if (isPrimary) {
            Pair(frameworkPrimary, frameworkSecondary)
        } else {
            Pair(frameworkSecondary, frameworkPrimary)
        }
    }

    // This hard-codes some data that's passed into the test but I don't want to spend more time
    // so I don't bother cleaning it up now.
    private fun getExpectedClockTicksPerSecond(isPrimary: Boolean): Long =
        if (isPrimary) PRIMARY_TICKS else SECONDARY_TICKS
}
+13 −5
Original line number Diff line number Diff line
@@ -346,26 +346,34 @@ The following metrics are added to the ping:

This ping is intended to provide an understanding of startup performance.

The ping is intended to be captured by performance testing automation to
report results there, in addition to user telemetry. We place these metrics
into their own ping in order to isolate them and make this process easier.
In addition to being captured on real devices, the ping data was prematurely
optimized into this separate ping to be isolated from other metrics to be
more easily captured by performance testing automation but that hasn't
happened in practice. We would have removed it but implementation
details don't make that possible:
https://github.com/mozilla-mobile/fenix/issues/17972#issuecomment-781002987


This ping includes the [client id](https://mozilla.github.io/glean/book/user/pings/index.html#the-client_info-section).

**Data reviews for this ping:**

- <https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626>
- <https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284>

**Bugs related to this ping:**

- <https://github.com/mozilla-mobile/fenix/issues/8803>
- <https://github.com/mozilla-mobile/fenix/issues/17972>

The following metrics are added to the ping:

| Name | Type | Description | Data reviews | Extras | Expiration | [Data Sensitivity](https://wiki.mozilla.org/Firefox/Data_Collection) |
| --- | --- | --- | --- | --- | --- | --- |
| startup.timeline.clock_ticks_per_second |[counter](https://mozilla.github.io/glean/book/user/metrics/counter.html) |The number of clock tick time units that occur in one second on this particular device. This value is expected to be used in conjunction with the `framework_start` metric.  |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 |
| startup.timeline.framework_start |[timespan](https://mozilla.github.io/glean/book/user/metrics/timespan.html) |The duration the Android framework takes to start before letting us run code in `*Application.init`. This is calculated from `appInitTimestamp - processStartTimestamp`. `processStartTimestamp` is derived from the clock tick time unit, which is expected to be less granular than nanoseconds. Therefore, we convert and round our timestamps to clock ticks before computing the difference and convert back to nanoseconds to report.  For debugging purposes, `clock_ticks_per_second`, which may vary between devices, is also reported as a metric  |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 |
| startup.timeline.framework_start_error |[boolean](https://mozilla.github.io/glean/book/user/metrics/boolean.html) |An error when attempting to record `framework_start` - the application init timestamp returned a negative value - which is likely indicative of a bug in the implementation.  |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 |
| startup.timeline.framework_primary |[timespan](https://mozilla.github.io/glean/book/user/metrics/timespan.html) |The duration the Android framework takes to start before letting us run code in `*Application.init` when this device has `clock_ticks_per_second` equal to 100: if it's not equal to 100, then this value is captured in `framework_secondary`. We split this into two metrics to make it easier to analyze in GLAM. We split on 100 because when we did our initial brief analysis - https://sql.telemetry.mozilla.org/queries/75591 - the results for clocks ticks were overwhelmingly 100.  The duration is calculated from `appInitTimestamp - processStartTimestamp`. `processStartTimestamp` is derived from the clock tick time unit, which is expected to be less granular than nanoseconds. Therefore, we convert and round our timestamps to clock ticks before computing the difference and convert back to nanoseconds to report.  For debugging purposes, `clock_ticks_per_second`, which may vary between devices, is also reported as a metric  |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068), [3](https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284)||2021-08-01 |1 |
| startup.timeline.framework_secondary |[timespan](https://mozilla.github.io/glean/book/user/metrics/timespan.html) |The duration the Android framework takes to start before letting us run code in `*Application.init` when this device has `clock_ticks_per_second` not equal to 100. For more details on this metric, see `framework_primary`  |[1](https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284)||2021-08-01 |1 |
| startup.timeline.framework_start_error |[boolean](https://mozilla.github.io/glean/book/user/metrics/boolean.html) |An error when attempting to record `framework_primary/secondary` - the application init timestamp returned a negative value - which is likely indicative of a bug in the implementation.  |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 |
| startup.timeline.framework_start_read_error |[boolean](https://mozilla.github.io/glean/book/user/metrics/boolean.html) |An error when attempting to read stats from /proc pseudo-filesystem - privacy managers can block access to reading these files - the application will catch a file reading exception.  |[1](https://github.com/mozilla-mobile/fenix/pull/10481), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 |