Commit 2c44721c authored by Karsten Loesing's avatar Karsten Loesing
Browse files

Use readr to speed up drawing graphs.

Over two years ago, in commit 1f90b723 from October 2016, we made our
user graphs faster by no longer reading the large .csv file on demand.
Instead we read it once as part of the daily update, saved it to disk
as .RData file using R's save() function, and loaded it back to memory
using R's load() function when drawing a graph.

This approach worked okay. It just had two disadvantages:

 1. We had to write a small amount of R code for each graph type,
    which is why we only did it for graphs with large .csv files.
 2. Running these small R scripts as part of the daily update made it
    harder to move away from Ant towards a Java-only execution model.

The new approach implemented in this commit uses read_csv() from the
readr package which reads CSV files several times faster than
read.csv().

Requires installing the readr package from CRAN, which is available on
Debian in stretch-backports and later as r-cran-readr.

Implements #28799.
parent ffaab885
Loading
Loading
Loading
Loading
+0 −14
Original line number Diff line number Diff line
@@ -362,8 +362,6 @@
    <property name="module.name" value="clients" />
    <property name="localmoddir" value="${modulebase}/${module.name}" />

    <property name="rdatadir" value="${localmoddir}/RData" />
    <mkdir dir="${rdatadir}" />
    <property name="statsdir"
              value="${localmoddir}/stats" />
    <mkdir dir="${statsdir}" />
@@ -410,10 +408,6 @@

    <copy file="${localmoddir}/clients.csv" todir="${statsdir}" />
    <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" />

    <antcall target="run-R" >
      <param name="module.Rscript" value="split-clients.R" />
    </antcall>
  </target>

  <target name="servers" >
@@ -426,13 +420,7 @@

  <target name="webstats" >
    <property name="module.name" value="webstats" />
    <property name="rdatadir" value="${modulebase}/${module.name}/RData" />
    <mkdir dir="${rdatadir}" />

    <antcall target="run-java" />
    <antcall target="run-R" >
      <param name="module.Rscript" value="write-RData.R" />
    </antcall>
  </target>

  <target name="totalcw" >
@@ -482,8 +470,6 @@
      <fileset dir="${modulebase}/totalcw/stats" includes="totalcw.csv" />
    </copy>
    <copy todir="${rdatadir}" >
      <fileset dir="${modulebase}/clients/RData" includes="*.RData" />
      <fileset dir="${modulebase}/webstats/RData" includes="*.RData" />
      <fileset dir="${resources}/web/images/" includes="no-data-available.*" />
    </copy>
  </target>
+0 −12
Original line number Diff line number Diff line
# Make sure the output directory exists; an already existing directory is
# not reported as a problem.
dir.create("RData", showWarnings = FALSE)

# Write one file per node type from clients.csv, leaving out the "node"
# column. The object has to be called "data" each time, because save()
# stores objects under their variable name.
clients <- read.csv("clients.csv", stringsAsFactors = FALSE)
keep_columns <- names(clients) != "node"
is_relay <- clients$node == "relay"
data <- clients[is_relay, keep_columns]
save(data, file = "RData/clients-relay.RData")
is_bridge <- clients$node == "bridge"
data <- clients[is_bridge, keep_columns]
save(data, file = "RData/clients-bridge.RData")

# Write userstats-combined.csv without its "node" and "version" columns.
combined <- read.csv("userstats-combined.csv", stringsAsFactors = FALSE)
dropped_columns <- c("node", "version")
data <- combined[, !(names(combined) %in% dropped_columns)]
save(data, file = "RData/userstats-bridge-combined.RData")
+135 −34
Original line number Diff line number Diff line
@@ -348,6 +348,9 @@ robust_call <- function(wrappee, filename) {
       })
}

# Disable readr's automatic progress bar.
options(readr.show_progress = FALSE)

prepare_networksize <- function(start_p, end_p) {
  read.csv(paste(stats_dir, "networksize.csv", sep = ""),
    colClasses = c("date" = "Date")) %>%
@@ -863,8 +866,19 @@ write_bandwidth_flags <- function(start_p = NULL, end_p = NULL, path_p) {

plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
    events_p, path_p) {
  load(paste(rdata_dir, "clients-", node_p, ".RData", sep = ""))
  c <- data
  c <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
      col_types = cols(
        date = col_date(format = ""),
        node = col_character(),
        country = col_character(),
        transport = col_character(),
        version = col_character(),
        lower = col_double(),
        upper = col_double(),
        clients = col_double(),
        frac = col_skip()),
      na = character()) %>%
    filter(node == node_p)
  u <- c[c$date >= start_p & c$date <= end_p, c("date", "country", "transport",
      "version", "lower", "upper", "clients")]
  u <- rbind(u, data.frame(date = start_p,
@@ -1011,14 +1025,24 @@ plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {

write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
    country_p = NULL, events_p = NULL, path_p) {
  load(paste(rdata_dir, "clients-relay.RData", sep = ""))
  u <- data %>%
  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
      col_types = cols(
        date = col_date(format = ""),
        node = col_character(),
        country = col_character(),
        transport = col_character(),
        version = col_character(),
        lower = col_double(),
        upper = col_double(),
        clients = col_double(),
        frac = col_double())) %>%
    filter(node == "relay") %>%
    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
    filter(if (!is.null(country_p))
      country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
    filter(transport == "") %>%
    filter(version == "") %>%
    filter(is.na(transport)) %>%
    filter(is.na(version)) %>%
    select(date, country, clients, lower, upper, frac) %>%
    rename(users = clients) %>%
    write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
@@ -1026,14 +1050,24 @@ write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,

write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
    country_p = NULL, path_p) {
  load(paste(rdata_dir, "clients-bridge.RData", sep = ""))
  data %>%
  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
      col_types = cols(
        date = col_date(format = ""),
        node = col_character(),
        country = col_character(),
        transport = col_character(),
        version = col_character(),
        lower = col_double(),
        upper = col_double(),
        clients = col_double(),
        frac = col_double())) %>%
    filter(node == "bridge") %>%
    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
    filter(if (!is.null(country_p))
      country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
    filter(transport == "") %>%
    filter(version == "") %>%
    filter(is.na(transport)) %>%
    filter(is.na(version)) %>%
    select(date, country, clients, frac) %>%
    rename(users = clients) %>%
    write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
@@ -1041,13 +1075,23 @@ write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,

write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
    transport_p = NULL, path_p) {
  load(paste(rdata_dir, "clients-bridge.RData", sep = ""))
  u <- data %>%
  u <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
      col_types = cols(
        date = col_date(format = ""),
        node = col_character(),
        country = col_character(),
        transport = col_character(),
        version = col_character(),
        lower = col_double(),
        upper = col_double(),
        clients = col_double(),
        frac = col_double())) %>%
    filter(node == "bridge") %>%
    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
    filter(country == "") %>%
    filter(version == "") %>%
    filter(transport != "") %>%
    filter(is.na(country)) %>%
    filter(is.na(version)) %>%
    filter(!is.na(transport)) %>%
    select(date, transport, clients, frac)
  if (is.null(transport_p) || "!<OR>" %in% transport_p) {
    n <- u %>%
@@ -1068,12 +1112,22 @@ write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,

write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
    version_p = NULL, path_p) {
  load(paste(rdata_dir, "clients-bridge.RData", sep = ""))
  data %>%
  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
      col_types = cols(
        date = col_date(format = ""),
        node = col_character(),
        country = col_character(),
        transport = col_character(),
        version = col_character(),
        lower = col_double(),
        upper = col_double(),
        clients = col_double(),
        frac = col_double())) %>%
    filter(node == "bridge") %>%
    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
    filter(country == "") %>%
    filter(transport == "") %>%
    filter(is.na(country)) %>%
    filter(is.na(transport)) %>%
    filter(if (!is.null(version_p)) version == version_p else TRUE) %>%
    select(date, version, clients, frac) %>%
    rename(users = clients) %>%
@@ -1081,8 +1135,16 @@ write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
}

prepare_userstats_bridge_combined <- function(start_p, end_p, country_p) {
  load(paste(rdata_dir, "userstats-bridge-combined.RData", sep = ""))
  data %>%
  read_csv(file = paste(stats_dir, "userstats-combined.csv", sep = ""),
      col_types = cols(
        date = col_date(format = ""),
        node = col_skip(),
        country = col_character(),
        transport = col_character(),
        version = col_skip(),
        frac = col_double(),
        low = col_double(),
        high = col_double())) %>%
    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
    filter(if (!is.null(country_p)) country == country_p else TRUE)
@@ -1135,7 +1197,7 @@ prepare_advbwdist_perc <- function(start_p, end_p, p_p) {
    filter(if (!is.null(p_p)) percentile %in% as.numeric(p_p) else
      percentile != "") %>%
    transmute(date, percentile = as.factor(percentile),
      variable = ifelse(isexit != "t", "all", "exits"),
      variable = ifelse(is.na(isexit), "all", "exits"),
      advbw = advbw * 8 / 1e9)
}

@@ -1258,11 +1320,20 @@ write_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL,
}

prepare_webstats_tb <- function(start_p, end_p) {
  load(paste(rdata_dir, "webstats-tb.RData", sep = ""))
  data %>%
  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
      col_types = cols(
        log_date = col_date(format = ""),
        request_type = col_factor(),
        platform = col_skip(),
        channel = col_skip(),
        locale = col_skip(),
        incremental = col_skip(),
        count = col_double())) %>%
    filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
    mutate(request_type = factor(request_type))
    filter(request_type %in% c("tbid", "tbsd", "tbup", "tbur")) %>%
    group_by(log_date, request_type) %>%
    summarize(count = sum(count))
}

plot_webstats_tb <- function(start_p, end_p, path_p) {
@@ -1296,8 +1367,15 @@ write_webstats_tb <- function(start_p = NULL, end_p = NULL, path_p) {
}

prepare_webstats_tb_platform <- function(start_p, end_p) {
  read.csv(paste(stats_dir, "webstats.csv", sep = ""),
    colClasses = c("log_date" = "Date")) %>%
  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
      col_types = cols(
        log_date = col_date(format = ""),
        request_type = col_factor(),
        platform = col_factor(),
        channel = col_skip(),
        locale = col_skip(),
        incremental = col_skip(),
        count = col_double())) %>%
    filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
    filter(request_type %in% c("tbid", "tbup")) %>%
@@ -1337,8 +1415,15 @@ write_webstats_tb_platform <- function(start_p = NULL, end_p = NULL, path_p) {
}

plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
  d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""),
    colClasses = c("log_date" = "Date", "locale" = "character"))
  d <- read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
      col_types = cols(
        log_date = col_date(format = ""),
        request_type = col_factor(),
        platform = col_skip(),
        channel = col_skip(),
        locale = col_factor(),
        incremental = col_skip(),
        count = col_double()))
  d <- d[d$log_date >= start_p & d$log_date <= end_p &
         d$request_type %in% c("tbid", "tbup"), ]
  levels(d$request_type) <- list(
@@ -1375,8 +1460,15 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
# plot_webstats_tb_locale needs the preliminary data frame e for its
# breaks and labels. Left as future work.
write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) {
  read.csv(paste(stats_dir, "webstats.csv", sep = ""),
    colClasses = c("log_date" = "Date", "locale" = "character")) %>%
  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
      col_types = cols(
        log_date = col_date(format = ""),
        request_type = col_factor(),
        platform = col_skip(),
        channel = col_skip(),
        locale = col_factor(),
        incremental = col_skip(),
        count = col_double())) %>%
    filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
    filter(request_type %in% c("tbid", "tbup")) %>%
@@ -1390,11 +1482,20 @@ write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) {
}

prepare_webstats_tm <- function(start_p, end_p) {
  load(paste(rdata_dir, "webstats-tm.RData", sep = ""))
  data %>%
  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
      col_types = cols(
        log_date = col_date(format = ""),
        request_type = col_factor(),
        platform = col_skip(),
        channel = col_skip(),
        locale = col_skip(),
        incremental = col_skip(),
        count = col_double())) %>%
    filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
    mutate(request_type = factor(request_type))
    filter(request_type %in% c("tmid", "tmup")) %>%
    group_by(log_date, request_type) %>%
    summarize(count = sum(count))
}

plot_webstats_tm <- function(start_p, end_p, path_p) {
+1 −0
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ library("RColorBrewer")
library("scales")
library(dplyr)
library(tidyr)
library(readr)

source('graphs.R')
source('tables.R')

src/main/R/webstats/write-RData.R

deleted100644 → 0
+0 −16
Original line number Diff line number Diff line
# Aggregate the counts in stats/webstats.csv by log date and request type
# and store the results as .RData files in the RData directory.
dir.create("RData", showWarnings = FALSE)

# Parse the input only once; both output files are derived from it.
webstats <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE)

# Sum up counts per log date and request type, considering only rows whose
# request type is contained in request_types.
sum_counts <- function(d, request_types) {
  d <- d[d$request_type %in% request_types, ]
  aggregate(list(count = d$count),
      by = list(log_date = as.Date(d$log_date),
        request_type = d$request_type),
      FUN = sum)
}

# The object has to be called "data" each time, because save() stores
# objects under their variable name.
data <- sum_counts(webstats, c("tbid", "tbsd", "tbup", "tbur"))
save(data, file = "RData/webstats-tb.RData")

data <- sum_counts(webstats, c("tmid", "tmup"))
save(data, file = "RData/webstats-tm.RData")