From 715fce27f9894f12fa43494bf24ac906148faa05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= <anarcat@debian.org>
Date: Sat, 16 Nov 2019 10:34:42 -0500
Subject: [PATCH] publish parts of the nginx Puppet source code publicly

---
 tsa/howto/cache.mdwn | 157 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/tsa/howto/cache.mdwn b/tsa/howto/cache.mdwn
index 72326815..fad06d64 100644
--- a/tsa/howto/cache.mdwn
+++ b/tsa/howto/cache.mdwn
@@ -109,6 +109,161 @@ various counters and exposes those as metrics that are then scraped by
 [[prometheus]]. We use [[grafana]] to display that hit ratio which, at
 the time of writing, is about 88% for the blog.
 
+## Puppet architecture
+
+Because the Puppet code isn't public yet ([ticket #29387](https://trac.torproject.org/projects/tor/ticket/29387)), here's a
+quick overview of how we set things up for others to follow.
+
+The entry point in Puppet is the `roles::cache` class, which
+configures an "Nginx server" (like an Apache vhost) to do the caching
+of the backend. It also includes our common Nginx configuration in
+`profile::nginx` which in turn delegates most of the configuration to
+the Voxpupuli Nginx Module.
+
+The role essentially consists of:
+
+    include profile::nginx
+
+    nginx::resource::server { 'blog.torproject.org':
+      ssl_cert              => '/etc/ssl/torproject/certs/blog.torproject.org.crt-chained',
+      ssl_key               => '/etc/ssl/private/blog.torproject.org.key',
+      proxy                 => 'https://live-tor-blog-8.pantheonsite.io',
+      # no serviceable parts below
+      ipv6_enable           => true,
+      ipv6_listen_options   => '',
+      ssl                   => true,
+      # part of HSTS configuration, the other bit is in add_header below
+      ssl_redirect          => true,
+      # proxy configuration
+      #
+      # pass the Host header to the backend (otherwise the proxy URL above is used)
+      proxy_set_header      => ['Host $host'],
+      # should map to a cache zone defined in the nginx profile
+      proxy_cache           => 'default',
+      # start caching redirects and 404s. this code is taken from the
+      # upstream documentation in
+      # https://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_cache_valid
+      proxy_cache_valid     => [
+        '200 302 10m',
+        '301      1h',
+        'any 1m',
+      ],
+      # allow serving stale content on error, timeout, or refresh
+      proxy_cache_use_stale => 'error timeout updating',
+      # allow only first request through backend
+      proxy_cache_lock      => 'on',
+      # purge headers from backend we will override. X-Served-By and Via
+      # are merged into the Via header, as per rfc7230 section 5.7.1
+      proxy_hide_header     => ['Strict-Transport-Security', 'Via', 'X-Served-By'],
+      add_header            => {
+        # this is a rough equivalent to Varnish's Age header: it reports
+        # when the page was cached, instead of its age
+        'X-Cache-Date'              => '$upstream_http_date',
+        # if this was served from cache
+        'X-Cache-Status'            => '$upstream_cache_status',
+        # replace the Via header with ours
+        'Via'                       => '$server_protocol $server_name',
+        # cargo-culted from Apache's configuration
+        'Strict-Transport-Security' => 'max-age=15768000; preload',
+      },
+      # cache 304 not modified entries
+      raw_append            => "proxy_cache_revalidate on;\n",
+      # caches shouldn't log, because it is too slow
+      #access_log            => 'off',
+      format_log            => 'cacheprivacy',
+    }
+
+There are also firewall (to open the monitoring, HTTP and HTTPS ports)
+and mtail (to read the log files for hit ratios) configurations but
+those are not essential to get Nginx itself working.
+
+The `profile::nginx` class is our common Nginx configuration that also
+covers non-caching setups:
+
+    # common nginx configuration
+    #
+    # @param client_max_body_size max upload size on this server. upstream
+    #                             default is 1m, see:
+    #                             https://nginx.org/en/docs/http/ngx_http_core_module.html#client_max_body_size
+    class profile::nginx(
+      Optional[String] $client_max_body_size = '1m',
+    ) {
+      include webserver
+      class { 'nginx':
+        confd_purge           => true,
+        server_purge          => true,
+        manage_repo           => false,
+        http2                 => 'on',
+        server_tokens         => 'off',
+        package_flavor        => 'light',
+        log_format            => {
+          # built-in, according to: http://nginx.org/en/docs/http/ngx_http_log_module.html#log_format
+          # 'combined' => '$remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"'
+
+          # "privacy" censors the client IP address from logs, taken from
+          # the Apache config, minus the "day" granularity because of
+          # limitations in nginx. we remove the IP address and user agent
+          # but keep the original request time, in other words.
+          'privacy'      => '0.0.0.0 - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "-"',
+
+          # the "cache" formats add information about the backend, namely:
+          # upstream_addr - address and port of upstream server (string)
+          # upstream_response_time - total time spent talking to the backend server, in seconds (float)
+          # upstream_cache_status - state of the cache (MISS, HIT, UPDATING, etc)
+          # request_time - total time spent answering this query, in seconds (float)
+          'cache'        => '$server_name:$server_port $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $upstream_addr $upstream_response_time $upstream_cache_status $request_time',  #lint:ignore:140chars
+          'cacheprivacy' => '$server_name:$server_port 0.0.0.0 - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "-" $upstream_addr $upstream_response_time $upstream_cache_status $request_time',  #lint:ignore:140chars
+        },
+        # XXX: doesn't work because a default is specified in the
+        # class. doesn't matter much because the puppet module reuses
+        # upstream default.
+        worker_rlimit_nofile  => undef,
+        accept_mutex          => 'off',
+        # XXX: doesn't work because a default is specified in the
+        # class. but that doesn't matter because accept_mutex is off so
+        # this has no effect
+        accept_mutex_delay    => undef,
+        http_tcp_nopush       => 'on',
+        gzip                  => 'on',
+        client_max_body_size  => $client_max_body_size,
+        run_dir               => '/run/nginx',
+        client_body_temp_path => '/run/nginx/client_body_temp',
+        proxy_temp_path       => '/run/nginx/proxy_temp',
+        proxy_connect_timeout => '60s',
+        proxy_read_timeout    => '60s',
+        proxy_send_timeout    => '60s',
+        proxy_cache_path      => '/var/cache/nginx/',
+        proxy_cache_levels    => '1:2',
+        proxy_cache_keys_zone => 'default:10m',
+        # XXX: hardcoded, should just let nginx figure it out
+        proxy_cache_max_size  => '15g',
+        proxy_cache_inactive  => '24h',
+        ssl_protocols         => 'TLSv1 TLSv1.1 TLSv1.2 TLSv1.3',
+        # XXX: from the apache module see also https://trac.torproject.org/projects/tor/ticket/32351
+        ssl_ciphers           => 'ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-RSA-AES256-SHA256:DHE-RSA-AES256-SHA:ECDHE-ECDSA-DES-CBC3-SHA:ECDHE-RSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA:!DSS', # lint:ignore:140chars
+      }
+      # recreate the default vhost
+      nginx::resource::server { 'default':
+        server_name         => ['_'],
+        www_root            => "/srv/www/${webserver::defaultpage::defaultdomain}/htdocs/",
+        listen_options      => 'default_server',
+        ipv6_enable         => true,
+        ipv6_listen_options => 'default_server',
+        # XXX: until we have an anonymous log format
+        access_log          => 'off',
+        ssl                 => true,
+        ssl_redirect        => true,
+        ssl_cert            => '/etc/ssl/torproject-auto/servercerts/thishost.crt',
+        ssl_key             => '/etc/ssl/torproject-auto/serverkeys/thishost.key';
+      }
+    }
+
+There are *lots* of config settings there, but they are provided to
+reduce the diff between the upstream Debian package and the [Nginx
+module from the forge](https://forge.puppet.com/puppet/nginx). This was [filed upstream as a bug][puppet-nginx-1359].
+
+[puppet-nginx-1359]: https://github.com/voxpupuli/puppet-nginx/issues/1359
+
 ## Issues
 
  * logs should not be written to disk, but instead piped directly into
@@ -118,6 +273,8 @@ the time of writing, is about 88% for the blog.
    see [ticket #32462](https://trac.torproject.org/projects/tor/ticket/32462) for the varnish conversion
  * the cipher suite is an old hardcoded copy derived from Apache, see
    [ticket #32351](https://trac.torproject.org/projects/tor/ticket/32351)
+ * the Nginx puppet module diverges needlessly from upstream and
+   Debian package configuration, see [puppet-nginx-1359][]
 
 There is no issue tracker specifically for this project, file and
 serch for issues in [internal services](https://trac.torproject.org/projects/tor/query?status=!closed&component=Internal+Services%2FTor+Sysadmin+Team).
-- 
GitLab