Commit cd56247a authored by David Fifield
Browse files

amp package.

This package contains a CacheURL function that modifies a URL to be
accessed through an AMP cache, and the "AMP armor" data encoding scheme
for encoding data into the AMP subset of HTML.
parent b706e9c7
package amp
import (
"bufio"
"bytes"
"encoding/base64"
"fmt"
"io"
"golang.org/x/net/html"
)
// ErrUnknownVersion is the error returned when the first character inside the
// element encoding (but outside the base64 encoding) is not '0'. The value of
// the error is the unrecognized version byte itself.
type ErrUnknownVersion byte

// Error implements the error interface. The offending byte is rendered as an
// ASCII-only quoted character literal.
func (e ErrUnknownVersion) Error() string {
	indicator := byte(e)
	return fmt.Sprintf("unknown armor version indicator %+q", indicator)
}
// isASCIIWhitespace reports whether b is one of the five bytes that the WHATWG
// Infra standard classifies as ASCII whitespace: tab, line feed, form feed,
// carriage return, or space.
//
// https://infra.spec.whatwg.org/#ascii-whitespace
func isASCIIWhitespace(b byte) bool {
	return b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
}
// splitASCIIWhitespace is a bufio.SplitFunc that yields maximal runs of
// non-whitespace bytes, where whitespace is defined by isASCIIWhitespace.
//
// Leading whitespace is always consumed. A token is returned as soon as it is
// followed by a whitespace byte (which is consumed along with it). A trailing
// run with no whitespace after it is only returned as a token when atEOF is
// true; otherwise the function asks for more data, having advanced past the
// leading whitespace only. The returned error is always nil.
func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Locate the first byte of the token.
	start := 0
	for start < len(data) && isASCIIWhitespace(data[start]) {
		start++
	}

	// Locate the first whitespace byte after the token, if any.
	end := start
	for end < len(data) && !isASCIIWhitespace(data[end]) {
		end++
	}

	switch {
	case end < len(data):
		// The token is terminated by whitespace; consume the terminator too.
		return end + 1, data[start:end], nil
	case atEOF && start < end:
		// No terminator, but there is no more input coming: the rest of
		// the data is the final token.
		return end, data[start:end], nil
	default:
		// Drop the leading whitespace and request more data.
		return start, nil, nil
	}
}
// decodeToWriter tokenizes the HTML document read from r and copies to w the
// text found inside <pre> elements, with all ASCII whitespace removed. Text
// outside <pre> elements is ignored. Tags other than <pre> and </pre> are
// skipped, so text nested in other elements inside a <pre> is still copied.
//
// It returns the number of bytes written to w. It returns an error if the
// tokenizer fails for a reason other than io.EOF, if a write to w fails, if a
// <pre> is nested inside another <pre>, if a </pre> appears without a matching
// <pre>, or if the input ends while a <pre> is still open.
func decodeToWriter(w io.Writer, r io.Reader) (int64, error) {
	tokenizer := html.NewTokenizer(r)
	// Set a memory limit on token sizes, otherwise the tokenizer will
	// buffer text indefinitely if it is not broken up by other token types.
	tokenizer.SetMaxBuf(elementSizeLimit)

	active := false // true while we are inside a <pre> element
	total := int64(0)
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input is the normal way to finish.
				err = nil
			}
			if err == nil && active {
				return total, fmt.Errorf("missing </pre> tag")
			}
			return total, err
		case html.TextToken:
			if active {
				// Re-join the separate chunks of text and
				// feed them to the decoder.
				scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
				scanner.Split(splitASCIIWhitespace)
				for scanner.Scan() {
					n, err := w.Write(scanner.Bytes())
					total += int64(n)
					if err != nil {
						return total, err
					}
				}
				if err := scanner.Err(); err != nil {
					return total, err
				}
			}
		case html.StartTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "pre" {
				if active {
					// Nesting is not allowed. The message is built
					// from tn because TagName has already consumed
					// the name; asking the tokenizer for the whole
					// token at this point would no longer carry it.
					return total, fmt.Errorf("unexpected <%s>", tn)
				}
				active = true
			}
		case html.EndTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "pre" {
				if !active {
					// Stray end tag. See the start-tag case for
					// why the message is built from tn.
					return total, fmt.Errorf("unexpected </%s>", tn)
				}
				active = false
			}
		}
	}
}
// NewArmorDecoder returns a new AMP armor decoder. Reading from the returned
// io.Reader yields the data that was armored into the HTML document read from
// r.
//
// The HTML is processed by decodeToWriter in a separate goroutine that feeds a
// pipe. Before returning, NewArmorDecoder reads the first decoded byte, which
// is the version indicator. It returns an error if that byte cannot be read
// (io.EOF when the input contains no armored text at all) or an
// ErrUnknownVersion if the byte is not '0'. On those error paths the read side
// of the pipe is closed so that the decoding goroutine is not left blocked on
// a write.
func NewArmorDecoder(r io.Reader) (io.Reader, error) {
	pr, pw := io.Pipe()
	go func() {
		_, err := decodeToWriter(pw, r)
		// A nil err makes the read side see io.EOF.
		pw.CloseWithError(err)
	}()

	// The first byte inside the element encoding is a server–client
	// protocol version indicator. io.ReadFull is used rather than a bare
	// Read because the io.Reader contract permits a read to return zero
	// bytes with a nil error, which would leave version unset.
	var version [1]byte
	if _, err := io.ReadFull(pr, version[:]); err != nil {
		pr.CloseWithError(err)
		return nil, err
	}

	switch version[0] {
	case '0':
		return base64.NewDecoder(base64.StdEncoding, pr), nil
	default:
		err := ErrUnknownVersion(version[0])
		pr.CloseWithError(err)
		return nil, err
	}
}
package amp
import (
"encoding/base64"
"io"
)
// https://amp.dev/boilerplate/
// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amp-boilerplate/?format=websites
// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/?format=websites#the-amp-html-format
const (
// boilerplateStart is the fixed AMP HTML header that NewArmorEncoder writes
// before any encoded data. It ends just after the opening <body> tag, so
// the encoded <pre> elements that follow become the body of the document.
boilerplateStart = `<!doctype html>
<html amp>
<head>
<meta charset="utf-8">
<script async src="https://cdn.ampproject.org/v0.js"></script>
<link rel="canonical" href="#">
<meta name="viewport" content="width=device-width">
<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
</head>
<body>
`
// boilerplateEnd is the fixed trailer written when the armor encoder is
// closed. It closes the <body> and <html> elements opened by
// boilerplateStart.
boilerplateEnd = `</body>
</html>`
)
const (
// We restrict the amount of text that may go inside an HTML element, in
// order to limit the amount a decoder may have to buffer. The decoder
// passes this same value to the HTML tokenizer as its maximum buffer
// size.
elementSizeLimit = 32 * 1024
// The payload is conceptually a long base64-encoded string, but we
// break the string into short chunks separated by whitespace. This is
// to protect against modification by AMP caches, which reportedly may
// truncate long words in text:
// https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348
bytesPerChunk = 32
// We set the number of chunks per element so as to stay under
// elementSizeLimit. Here, we assume that there is 1 byte of whitespace
// after each chunk (with an additional whitespace byte at the beginning
// of the element). With the values above this evaluates to 992.
chunksPerElement = (elementSizeLimit - 1) / (bytesPerChunk + 1)
)
// The AMP armor encoder is a chain of a base64 encoder (base64.NewEncoder) and
// an HTML element encoder (elementEncoder). A top-level encoder (armorEncoder)
// coordinates these two, and handles prepending and appending the AMP
// boilerplate. armorEncoder's Write method writes data into the base64 encoder,
// where it makes its way through the chain.
// NewArmorEncoder returns a new AMP armor encoder. Anything written to the
// returned io.WriteCloser will be encoded and written to w. The caller must
// call Close to flush any partially written data and output the AMP boilerplate
// trailer.
//
// The AMP boilerplate header and the version indicator are written to w
// before NewArmorEncoder returns; if either write fails, the error is returned
// and no encoder is created.
func NewArmorEncoder(w io.Writer) (io.WriteCloser, error) {
	// The AMP boilerplate header goes out immediately.
	if _, err := w.Write([]byte(boilerplateStart)); err != nil {
		return nil, err
	}

	// The server–client protocol version indicator is written directly
	// into the element layer, so that it sits outside the base64 layer.
	elem := &elementEncoder{w: w}
	if _, err := elem.Write([]byte{'0'}); err != nil {
		return nil, err
	}

	enc := &armorEncoder{
		w:       w,
		element: elem,
		base64:  base64.NewEncoder(base64.StdEncoding, elem),
	}
	return enc, nil
}
// armorEncoder is the top-level AMP armor encoder created by NewArmorEncoder.
// It ties together the chain base64 | element | w and writes the trailing
// boilerplate when closed.
type armorEncoder struct {
// base64 is the head of the chain: a base64 encoder whose output goes into
// element. Write sends caller data here.
base64 io.WriteCloser
// element arranges the base64 output into <pre> elements and writes them
// to w.
element *elementEncoder
// w is the final destination. Close also writes the boilerplate trailer
// to it directly.
w io.Writer
}
// Write encodes p by handing it to the head of the chain
// base64 | element | w. It returns whatever the base64 layer reports.
func (enc *armorEncoder) Write(p []byte) (int, error) {
	n, err := enc.base64.Write(p)
	return n, err
}
// Close finishes the armored document. The layers are shut down from the
// inside out, and the first error encountered aborts the sequence:
//
//  1. the base64 encoder, which flushes buffered data and the final padding;
//  2. the element encoder, which closes any element that is still open;
//  3. the AMP boilerplate trailer, written straight to the underlying writer.
func (enc *armorEncoder) Close() error {
	if err := enc.base64.Close(); err != nil {
		return err
	}
	if err := enc.element.Close(); err != nil {
		return err
	}
	_, err := enc.w.Write([]byte(boilerplateEnd))
	return err
}
// elementEncoder arranges written data into pre elements, with the text within
// separated into chunks. It does no HTML encoding, so data written must not
// contain any bytes that are meaningful in HTML.
//
// An element is considered open whenever either counter is nonzero; both
// counters are zero before the first write and again right after an element
// has been closed by Write.
type elementEncoder struct {
// w is the destination for the generated HTML.
w io.Writer
// chunkCounter is the number of bytes written so far into the current,
// not yet complete, chunk. It is reset to zero each time a chunk reaches
// bytesPerChunk bytes.
chunkCounter int
// elementCounter is, despite its name, the number of complete chunks
// written so far into the current element. It is reset to zero each time
// it reaches chunksPerElement and the element is closed.
elementCounter int
}
// Write copies p to the underlying writer, framing it as whitespace-separated
// chunks of at most bytesPerChunk bytes inside <pre> elements of at most
// chunksPerElement chunks each. A new <pre> element is opened when needed, a
// newline is written after each full chunk, and the element is closed once it
// holds chunksPerElement chunks.
//
// In keeping with the io.Writer contract, the returned count n is the number
// of bytes of p that were written, so 0 <= n <= len(p). The framing bytes
// ("<pre>\n", "\n", "</pre>\n") are not included in the count. If the
// underlying writer fails, Write returns the number of bytes of p written so
// far together with that error.
func (enc *elementEncoder) Write(p []byte) (n int, err error) {
	for len(p) > 0 {
		// Both counters at zero means that no element is currently open.
		if enc.elementCounter == 0 && enc.chunkCounter == 0 {
			if _, werr := enc.w.Write([]byte("<pre>\n")); werr != nil {
				return n, werr
			}
		}

		// Write as much of p as fits in the remainder of the current chunk.
		size := bytesPerChunk - enc.chunkCounter
		if size > len(p) {
			size = len(p)
		}
		nn, werr := enc.w.Write(p[:size])
		n += nn
		if werr != nil {
			return n, werr
		}
		p = p[size:]
		enc.chunkCounter += size

		// Terminate a chunk that has become full.
		if enc.chunkCounter >= bytesPerChunk {
			enc.chunkCounter = 0
			enc.elementCounter++
			if _, werr := enc.w.Write([]byte("\n")); werr != nil {
				return n, werr
			}
		}

		// Close an element that has become full.
		if enc.elementCounter >= chunksPerElement {
			enc.elementCounter = 0
			if _, werr := enc.w.Write([]byte("</pre>\n")); werr != nil {
				return n, werr
			}
		}
	}
	return n, nil
}
// Close terminates the element that is currently open, if any. When both
// counters are zero there is nothing to close and nothing is written. If the
// last chunk is incomplete, a newline is written to terminate it before the
// closing tag. The counters are left untouched.
func (enc *elementEncoder) Close() error {
	if enc.elementCounter == 0 && enc.chunkCounter == 0 {
		return nil
	}
	trailer := "\n</pre>\n"
	if enc.chunkCounter == 0 {
		// The last chunk was already terminated by Write.
		trailer = "</pre>\n"
	}
	_, err := enc.w.Write([]byte(trailer))
	return err
}
package amp
import (
"crypto/rand"
"io"
"io/ioutil"
"strings"
"testing"
)
// armorDecodeToString runs src through an armor decoder and returns everything
// the decoder produces as a string. An error from creating the decoder yields
// an empty string; an error while reading yields whatever was decoded before
// the error, together with the error.
func armorDecodeToString(src string) (string, error) {
	decoder, err := NewArmorDecoder(strings.NewReader(src))
	if err != nil {
		return "", err
	}
	var decoded strings.Builder
	_, err = io.Copy(&decoded, decoder)
	return decoded.String(), err
}
// TestArmorDecoder feeds a table of HTML inputs to the armor decoder and checks
// for each one either that decoding fails (expectedErr) or that it succeeds
// with exactly expectedOutput. The output of a case that is expected to fail
// is not checked.
func TestArmorDecoder(t *testing.T) {
for _, test := range []struct {
input string
expectedOutput string
expectedErr bool
}{
// version indicator only, with an empty payload
{`
<pre>
0
</pre>
`,
"",
false,
},
// version indicator and payload in a single element
{`
<pre>
0aGVsbG8gd29ybGQK
</pre>
`,
"hello world\n",
false,
},
// bad version indicator
{`
<pre>
1aGVsbG8gd29ybGQK
</pre>
`,
"",
true,
},
// text outside <pre> elements
{`
0aGVsbG8gd29ybGQK
blah blah blah
<pre>
0aGVsbG8gd29ybGQK
</pre>
0aGVsbG8gd29ybGQK
blah blah blah
`,
"hello world\n",
false,
},
// payload and padding spread over several <pre> elements, with junk
// text in between
{`
<pre>
0QUJDREV
GR0hJSkt
MTU5PUFF
SU1RVVld
</pre>
junk
<pre>
YWVowMTI
zNDU2Nzg
5Cg
=
</pre>
<pre>
=
</pre>
`,
"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n",
false,
},
// no <pre> elements, hence no version indicator
{`
aGVsbG8gd29ybGQK
blah blah blah
aGVsbG8gd29ybGQK
aGVsbG8gd29ybGQK
blah blah blah
`,
"",
true,
},
// empty <pre> elements, hence no version indicator
{`
aGVsbG8gd29ybGQK
blah blah blah
<pre> </pre>
aGVsbG8gd29ybGQK
aGVsbG8gd29ybGQK<pre></pre>
blah blah blah
`,
"",
true,
},
// other elements inside <pre>
{
"blah <pre>0aGVsb<p>G8gd29</p>ybGQK</pre>",
"hello world\n",
false,
},
// HTML comment
{
"blah <!-- <pre>aGVsbG8gd29ybGQK</pre> -->",
"",
true,
},
// all kinds of ASCII whitespace
{
"blah <pre>\x200\x09aG\x0aV\x0csb\x0dG8\x20gd29ybGQK</pre>",
"hello world\n",
false,
},
// bad padding
{`
<pre>
0QUJDREV
GR0hJSkt
MTU5PUFF
SU1RVVld
</pre>
junk
<pre>
YWVowMTI
zNDU2Nzg
5Cg
=
</pre>
`,
"",
true,
},
/*
// per-chunk base64
// test disabled because Go stdlib handles this incorrectly:
// https://github.com/golang/go/issues/31626
{
"<pre>QQ==</pre><pre>Qg==</pre>",
"",
true,
},
*/
// missing </pre>
{
"blah <pre></pre><pre>0aGVsbG8gd29ybGQK",
"",
true,
},
// nested <pre>
{
"blah <pre>0aGVsb<pre>G8gd29</pre>ybGQK</pre>",
"",
true,
},
} {
output, err := armorDecodeToString(test.input)
// An error was expected but decoding succeeded.
if test.expectedErr && err == nil {
t.Errorf("%+q → (%+q, %v), expected error", test.input, output, err)
continue
}
// Decoding was expected to succeed but failed.
if !test.expectedErr && err != nil {
t.Errorf("%+q → (%+q, %v), expected no error", test.input, output, err)
continue
}
// Decoding succeeded as expected; the output must match exactly.
if !test.expectedErr && output != test.expectedOutput {
t.Errorf("%+q → (%+q, %v), expected (%+q, %v)",
test.input, output, err, test.expectedOutput, nil)
continue
}
}
}
// armorRoundTrip armors s into an in-memory AMP document, decodes that
// document again, and returns the decoded result. A failure at any stage of
// encoding is reported with an empty string.
func armorRoundTrip(s string) (string, error) {
	var encoded strings.Builder

	enc, err := NewArmorEncoder(&encoded)
	if err != nil {
		return "", err
	}
	if _, err := io.Copy(enc, strings.NewReader(s)); err != nil {
		return "", err
	}
	// Close flushes the encoder and writes the boilerplate trailer.
	if err := enc.Close(); err != nil {
		return "", err
	}

	return armorDecodeToString(encoded.String())
}
// TestArmorRoundTrip checks that random data of various lengths survives being
// armored and then decoded again. The lengths cover every value below
// 2*bytesPerChunk, plus the 10 values on either side of elementSizeLimit and
// of 2*elementSizeLimit.
func TestArmorRoundTrip(t *testing.T) {
	var lengths []int
	// Test short strings and lengths around elementSizeLimit thresholds.
	for i := 0; i < bytesPerChunk*2; i++ {
		lengths = append(lengths, i)
	}
	for i := -10; i < +10; i++ {
		lengths = append(lengths, elementSizeLimit+i)
		lengths = append(lengths, 2*elementSizeLimit+i)
	}
	for _, n := range lengths {
		buf := make([]byte, n)
		// Without random input there is nothing meaningful to test, so a
		// failure here aborts the test rather than being ignored.
		if _, err := rand.Read(buf); err != nil {
			t.Fatalf("length %d → rand.Read: %v", n, err)
		}
		input := string(buf)
		output, err := armorRoundTrip(input)
		if err != nil {
			t.Errorf("length %d → error %v", n, err)
			continue
		}
		if output != input {
			t.Errorf("length %d → %+q", n, output)
			continue
		}
	}
}
package amp
import (
"crypto/sha256"
"encoding/base32"
"fmt"
"net"
"net/url"
"path"
"strings"
"golang.org/x/net/idna"
)
// domainPrefixBasic does the basic domain prefix conversion. Does not do any
// IDNA mapping, such as https://www.unicode.org/reports/tr46/.
//
// The numbered comments in the body quote the steps of the algorithm. An error
// from either the Punycode decoding or the final Punycode encoding is
// returned to the caller.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
func domainPrefixBasic(domain string) (string, error) {
	// 1. Punycode Decode the publisher domain.
	decoded, err := idna.ToUnicode(domain)
	if err != nil {
		return "", err
	}

	// 2. Replace any "-" (hyphen) character in the output of step 1 with
	//    "--" (two hyphens).
	doubled := strings.ReplaceAll(decoded, "-", "--")

	// 3. Replace any "." (dot) character in the output of step 2 with "-"
	//    (hyphen).
	label := strings.ReplaceAll(doubled, ".", "-")

	// 4. If the output of step 3 has a "-" (hyphen) at both positions 3 and
	//    4, then to the output of step 3, add a prefix of "0-" and add a
	//    suffix of "-0". The positions are checked as byte offsets.
	if len(label) >= 4 && label[2:4] == "--" {
		label = "0-" + label + "-0"
	}

	// 5. Punycode Encode the output of step 3. (What is encoded here is the
	//    label after step 4 has been applied.)
	return idna.ToASCII(label)
}
// Lower-case base32 without padding.
var fallbackBase32Encoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding)

// domainPrefixFallback does the fallback domain prefix conversion. The returned
// base32 domain uses lower-case letters. The result is the unpadded base32
// encoding of a 32-byte SHA-256 digest, so it is always 52 characters long.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#fallback-algorithm
func domainPrefixFallback(domain string) string {
	// The algorithm specification does not say what, exactly, we are to
	// take the SHA-256 of. domain is notionally an abstract Unicode
	// string, not a byte sequence. While
	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L62
	// says "Take the SHA256 of the punycode view of the domain," in reality
	// it hashes the UTF-8 encoding of the domain, without Punycode:
	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L141
	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/browser/Sha256.js#L24
	// We do the same here, hashing the raw bytes of domain, presumed to be
	// UTF-8.

	// 1. Hash the publisher's domain using SHA256.
	hasher := sha256.New()
	hasher.Write([]byte(domain))
	digest := hasher.Sum(nil)

	// 2. Base32 Escape the output of step 1.
	// 3. Remove the last 4 characters from the output of step 2, which are
	//    always "=" (equals) characters.
	// Both steps happen in one call, because fallbackBase32Encoding is
	// configured not to emit padding.
	return fallbackBase32Encoding.EncodeToString(digest)
}
// domainPrefix computes the domain prefix of an AMP cache URL.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#domain-name-prefix
func domainPrefix(domain string) string {
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#combined-algorithm
// 1. Run the Basic Algorithm. If the output is a valid DNS label,
// [append the Cache domain suffix and] return. Otherwise continue to
// step 2.
prefix, err := domainPrefixBasic(domain)
// "A domain prefix is not a valid DNS label if it is longer than 63
// characters"
if err == nil && len(prefix) <= 63 {
return prefix