adding git hook test, simplified structure using vendoring

This commit is contained in:
2023-01-09 05:50:26 -05:00
parent a51f35966d
commit 5fb3085848
93 changed files with 3906 additions and 7551 deletions

27
vendor/golang.org/x/crypto/LICENSE generated vendored Normal file
View File

@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

22
vendor/golang.org/x/crypto/PATENTS generated vendored Normal file
View File

@@ -0,0 +1,22 @@
Additional IP Rights Grant (Patents)
"This implementation" means the copyrightable works distributed by
Google as part of the Go project.
Google hereby grants to You a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer and otherwise run, modify and propagate the contents of this
implementation of Go, where such license applies only to those patent
claims, both currently owned or controlled by Google and acquired in
the future, licensable by Google that are necessarily infringed by this
implementation of Go. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of Go or any code incorporated within this
implementation of Go constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of Go
shall terminate as of the date such litigation is filed.

17
vendor/golang.org/x/crypto/chacha20/chacha_arm64.go generated vendored Normal file
View File

@@ -0,0 +1,17 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.11 && gc && !purego
// +build go1.11,gc,!purego
package chacha20
// bufSize is the number of bytes handled per call granule on arm64: the
// assembly loop consumes 256 bytes, i.e. four blockSize-byte blocks, per pass.
const bufSize = 256
// xorKeyStreamVX is implemented in chacha_arm64.s. It XORs src with key stream
// derived from key, nonce and *counter, writes the result to dst and stores
// the advanced block counter back through counter.
//go:noescape
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
// xorKeyStreamBlocks is the arm64 implementation of the per-platform block
// function: it forwards dst and src to the assembly routine together with
// pointers to the cipher's key, nonce and counter.
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
}

308
vendor/golang.org/x/crypto/chacha20/chacha_arm64.s generated vendored Normal file
View File

@@ -0,0 +1,308 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.11 && gc && !purego
// +build go1.11,gc,!purego
#include "textflag.h"
#define NUM_ROUNDS 10
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha20 key stream and writes the result to
// dst. Each pass of the outer loop produces four 64-byte blocks (256 bytes)
// and then advances *counter by 4. The loop body runs before the end-of-input
// test, so at least one full 256-byte pass is always executed; the end of the
// input is src + (len(src) &^ 255). The length of dst is never read.
//
// Register use:
// R1 dst pointer, post-incremented by the stores
// R2 src pointer, post-incremented by the loads
// R3 len(src)
// R4 key pointer, R6 nonce pointer, R7 counter pointer
// R10 address of ·constants, R11 address of ·incRotMatrix
// R12 end of the input to process
// R20 running block counter, R21 double-round countdown
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
MOVD dst+0(FP), R1
MOVD src+24(FP), R2
MOVD src_len+32(FP), R3
MOVD key+48(FP), R4
MOVD nonce+56(FP), R6
MOVD counter+64(FP), R7
MOVD $·constants(SB), R10
MOVD $·incRotMatrix(SB), R11
MOVW (R7), R20
AND $~255, R3, R13
ADD R2, R13, R12 // R12 for block end
// R13 now holds len(src) & 255; it is not read again in this routine.
AND $255, R3, R13
loop:
MOVD $NUM_ROUNDS, R21
// V30 = first 16 bytes of incRotMatrix (lane offsets 0,1,2,3),
// V31 = second 16 bytes (byte-shuffle table used by VTBL below).
VLD1 (R11), [V30.S4, V31.S4]
// The WORD directives below hand-encode the instruction named in the comment
// directly above each of them.
// load constants
// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x4D60E940
// load keys
// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
WORD $0x4DFFE884
// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
WORD $0x4DFFE888
// The two post-indexed key loads advanced R4 by 16 each; undo that.
SUB $32, R4
// load counter + nonce
// VLD1R (R7), [V12.S4]
WORD $0x4D40C8EC
// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
WORD $0x4D40E8CD
// update counter
// (each of the four lanes gets counter+0 .. counter+3)
VADD V30.S4, V12.S4, V12.S4
// One pass of the chacha loop is a double round: four column quarter rounds
// followed by four diagonal quarter rounds, executed NUM_ROUNDS times.
chacha:
// V0..V3 += V4..V7
// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
VADD V0.S4, V4.S4, V0.S4
VADD V1.S4, V5.S4, V1.S4
VADD V2.S4, V6.S4, V2.S4
VADD V3.S4, V7.S4, V3.S4
VEOR V12.B16, V0.B16, V12.B16
VEOR V13.B16, V1.B16, V13.B16
VEOR V14.B16, V2.B16, V14.B16
VEOR V15.B16, V3.B16, V15.B16
// Swapping the 16-bit halves of every 32-bit word is the rotate by 16.
VREV32 V12.H8, V12.H8
VREV32 V13.H8, V13.H8
VREV32 V14.H8, V14.H8
VREV32 V15.H8, V15.H8
// V8..V11 += V12..V15
// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
VADD V8.S4, V12.S4, V8.S4
VADD V9.S4, V13.S4, V9.S4
VADD V10.S4, V14.S4, V10.S4
VADD V11.S4, V15.S4, V11.S4
VEOR V8.B16, V4.B16, V16.B16
VEOR V9.B16, V5.B16, V17.B16
VEOR V10.B16, V6.B16, V18.B16
VEOR V11.B16, V7.B16, V19.B16
// Rotate left by 12 as shift-left 12 then shift-right-insert 20.
VSHL $12, V16.S4, V4.S4
VSHL $12, V17.S4, V5.S4
VSHL $12, V18.S4, V6.S4
VSHL $12, V19.S4, V7.S4
VSRI $20, V16.S4, V4.S4
VSRI $20, V17.S4, V5.S4
VSRI $20, V18.S4, V6.S4
VSRI $20, V19.S4, V7.S4
// V0..V3 += V4..V7
// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
VADD V0.S4, V4.S4, V0.S4
VADD V1.S4, V5.S4, V1.S4
VADD V2.S4, V6.S4, V2.S4
VADD V3.S4, V7.S4, V3.S4
VEOR V12.B16, V0.B16, V12.B16
VEOR V13.B16, V1.B16, V13.B16
VEOR V14.B16, V2.B16, V14.B16
VEOR V15.B16, V3.B16, V15.B16
// The rotate by 8 is a byte shuffle through the table in V31.
VTBL V31.B16, [V12.B16], V12.B16
VTBL V31.B16, [V13.B16], V13.B16
VTBL V31.B16, [V14.B16], V14.B16
VTBL V31.B16, [V15.B16], V15.B16
// V8..V11 += V12..V15
// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
VADD V12.S4, V8.S4, V8.S4
VADD V13.S4, V9.S4, V9.S4
VADD V14.S4, V10.S4, V10.S4
VADD V15.S4, V11.S4, V11.S4
VEOR V8.B16, V4.B16, V16.B16
VEOR V9.B16, V5.B16, V17.B16
VEOR V10.B16, V6.B16, V18.B16
VEOR V11.B16, V7.B16, V19.B16
// Rotate left by 7 as shift-left 7 then shift-right-insert 25.
VSHL $7, V16.S4, V4.S4
VSHL $7, V17.S4, V5.S4
VSHL $7, V18.S4, V6.S4
VSHL $7, V19.S4, V7.S4
VSRI $25, V16.S4, V4.S4
VSRI $25, V17.S4, V5.S4
VSRI $25, V18.S4, V6.S4
VSRI $25, V19.S4, V7.S4
// Diagonal half of the double round: the same four steps with the register
// pairing shifted, as listed in the comments below.
// V0..V3 += V5..V7, V4
// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
VADD V0.S4, V5.S4, V0.S4
VADD V1.S4, V6.S4, V1.S4
VADD V2.S4, V7.S4, V2.S4
VADD V3.S4, V4.S4, V3.S4
VEOR V15.B16, V0.B16, V15.B16
VEOR V12.B16, V1.B16, V12.B16
VEOR V13.B16, V2.B16, V13.B16
VEOR V14.B16, V3.B16, V14.B16
VREV32 V12.H8, V12.H8
VREV32 V13.H8, V13.H8
VREV32 V14.H8, V14.H8
VREV32 V15.H8, V15.H8
// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
// ...
VADD V15.S4, V10.S4, V10.S4
VADD V12.S4, V11.S4, V11.S4
VADD V13.S4, V8.S4, V8.S4
VADD V14.S4, V9.S4, V9.S4
VEOR V10.B16, V5.B16, V16.B16
VEOR V11.B16, V6.B16, V17.B16
VEOR V8.B16, V7.B16, V18.B16
VEOR V9.B16, V4.B16, V19.B16
VSHL $12, V16.S4, V5.S4
VSHL $12, V17.S4, V6.S4
VSHL $12, V18.S4, V7.S4
VSHL $12, V19.S4, V4.S4
VSRI $20, V16.S4, V5.S4
VSRI $20, V17.S4, V6.S4
VSRI $20, V18.S4, V7.S4
VSRI $20, V19.S4, V4.S4
// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
// ...
VADD V5.S4, V0.S4, V0.S4
VADD V6.S4, V1.S4, V1.S4
VADD V7.S4, V2.S4, V2.S4
VADD V4.S4, V3.S4, V3.S4
VEOR V0.B16, V15.B16, V15.B16
VEOR V1.B16, V12.B16, V12.B16
VEOR V2.B16, V13.B16, V13.B16
VEOR V3.B16, V14.B16, V14.B16
VTBL V31.B16, [V12.B16], V12.B16
VTBL V31.B16, [V13.B16], V13.B16
VTBL V31.B16, [V14.B16], V14.B16
VTBL V31.B16, [V15.B16], V15.B16
// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
// ...
VADD V15.S4, V10.S4, V10.S4
VADD V12.S4, V11.S4, V11.S4
VADD V13.S4, V8.S4, V8.S4
VADD V14.S4, V9.S4, V9.S4
VEOR V10.B16, V5.B16, V16.B16
VEOR V11.B16, V6.B16, V17.B16
VEOR V8.B16, V7.B16, V18.B16
VEOR V9.B16, V4.B16, V19.B16
VSHL $7, V16.S4, V5.S4
VSHL $7, V17.S4, V6.S4
VSHL $7, V18.S4, V7.S4
VSHL $7, V19.S4, V4.S4
VSRI $25, V16.S4, V5.S4
VSRI $25, V17.S4, V6.S4
VSRI $25, V18.S4, V7.S4
VSRI $25, V19.S4, V4.S4
SUB $1, R21
CBNZ R21, chacha
// Rounds done. Reload the initial state (constants, key, counter, nonce) into
// V16..V31 and add it back to the round output.
// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
WORD $0x4D60E950
// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
WORD $0x4DFFE894
// Add the lane offsets 0..3 to the counter word now, while V30 still holds
// them; V30 is overwritten by the nonce reload below.
VADD V30.S4, V12.S4, V12.S4
VADD V16.S4, V0.S4, V0.S4
VADD V17.S4, V1.S4, V1.S4
VADD V18.S4, V2.S4, V2.S4
VADD V19.S4, V3.S4, V3.S4
// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
WORD $0x4DFFE898
// restore R4
SUB $32, R4
// load counter + nonce
// VLD1R (R7), [V28.S4]
WORD $0x4D40C8FC
// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
WORD $0x4D40E8DD
VADD V20.S4, V4.S4, V4.S4
VADD V21.S4, V5.S4, V5.S4
VADD V22.S4, V6.S4, V6.S4
VADD V23.S4, V7.S4, V7.S4
VADD V24.S4, V8.S4, V8.S4
VADD V25.S4, V9.S4, V9.S4
VADD V26.S4, V10.S4, V10.S4
VADD V27.S4, V11.S4, V11.S4
VADD V28.S4, V12.S4, V12.S4
VADD V29.S4, V13.S4, V13.S4
VADD V30.S4, V14.S4, V14.S4
VADD V31.S4, V15.S4, V15.S4
// Transpose. Until here register Vn holds state word n of all four blocks,
// one block per lane. The 32-bit and 64-bit zips regroup the words so that
// V0..V3, V4..V7, V8..V11 and V12..V15 each hold one complete 64-byte block,
// in the order in which they are XORed and stored below.
VZIP1 V1.S4, V0.S4, V16.S4
VZIP2 V1.S4, V0.S4, V17.S4
VZIP1 V3.S4, V2.S4, V18.S4
VZIP2 V3.S4, V2.S4, V19.S4
VZIP1 V5.S4, V4.S4, V20.S4
VZIP2 V5.S4, V4.S4, V21.S4
VZIP1 V7.S4, V6.S4, V22.S4
VZIP2 V7.S4, V6.S4, V23.S4
VZIP1 V9.S4, V8.S4, V24.S4
VZIP2 V9.S4, V8.S4, V25.S4
VZIP1 V11.S4, V10.S4, V26.S4
VZIP2 V11.S4, V10.S4, V27.S4
VZIP1 V13.S4, V12.S4, V28.S4
VZIP2 V13.S4, V12.S4, V29.S4
VZIP1 V15.S4, V14.S4, V30.S4
VZIP2 V15.S4, V14.S4, V31.S4
// The 64-byte source loads are interleaved with the 64-bit zips; each load
// reuses scratch registers whose contents the preceding zips have consumed.
VZIP1 V18.D2, V16.D2, V0.D2
VZIP2 V18.D2, V16.D2, V4.D2
VZIP1 V19.D2, V17.D2, V8.D2
VZIP2 V19.D2, V17.D2, V12.D2
VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
VZIP1 V22.D2, V20.D2, V1.D2
VZIP2 V22.D2, V20.D2, V5.D2
VZIP1 V23.D2, V21.D2, V9.D2
VZIP2 V23.D2, V21.D2, V13.D2
VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
VZIP1 V26.D2, V24.D2, V2.D2
VZIP2 V26.D2, V24.D2, V6.D2
VZIP1 V27.D2, V25.D2, V10.D2
VZIP2 V27.D2, V25.D2, V14.D2
VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
VZIP1 V30.D2, V28.D2, V3.D2
VZIP2 V30.D2, V28.D2, V7.D2
VZIP1 V31.D2, V29.D2, V11.D2
VZIP2 V31.D2, V29.D2, V15.D2
VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
// XOR each 64-byte block of key stream with the source and store it to dst.
VEOR V0.B16, V16.B16, V16.B16
VEOR V1.B16, V17.B16, V17.B16
VEOR V2.B16, V18.B16, V18.B16
VEOR V3.B16, V19.B16, V19.B16
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
VEOR V4.B16, V20.B16, V20.B16
VEOR V5.B16, V21.B16, V21.B16
VEOR V6.B16, V22.B16, V22.B16
VEOR V7.B16, V23.B16, V23.B16
VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
VEOR V8.B16, V24.B16, V24.B16
VEOR V9.B16, V25.B16, V25.B16
VEOR V10.B16, V26.B16, V26.B16
VEOR V11.B16, V27.B16, V27.B16
VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
VEOR V12.B16, V28.B16, V28.B16
VEOR V13.B16, V29.B16, V29.B16
VEOR V14.B16, V30.B16, V30.B16
VEOR V15.B16, V31.B16, V31.B16
VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
// Four blocks were produced: advance the counter and write it back.
ADD $4, R20
MOVW R20, (R7) // update counter
// Go round again until the source pointer reaches the block end in R12.
CMP R2, R12
BGT loop
RET
// ·constants holds the four constant ChaCha20 state words, "expand 32-byte k"
// when read as little-endian bytes. The symbol is declared with a size of 32
// bytes although only the first 16 are initialised here.
DATA ·constants+0x00(SB)/4, $0x61707865
DATA ·constants+0x04(SB)/4, $0x3320646e
DATA ·constants+0x08(SB)/4, $0x79622d32
DATA ·constants+0x0c(SB)/4, $0x6b206574
GLOBL ·constants(SB), NOPTR|RODATA, $32
// ·incRotMatrix is loaded as two 16-byte vectors by xorKeyStreamVX.
// Bytes 0x00-0x0f: the words 0,1,2,3, added to the broadcast counter so each
// lane works on its own block number.
DATA ·incRotMatrix+0x00(SB)/4, $0x00000000
DATA ·incRotMatrix+0x04(SB)/4, $0x00000001
DATA ·incRotMatrix+0x08(SB)/4, $0x00000002
DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003
// Bytes 0x10-0x1f: byte indices for VTBL that rotate every 32-bit word left
// by 8 bits.
DATA ·incRotMatrix+0x10(SB)/4, $0x02010003
DATA ·incRotMatrix+0x14(SB)/4, $0x06050407
DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32

400
vendor/golang.org/x/crypto/chacha20/chacha_generic.go generated vendored Normal file
View File

@@ -0,0 +1,400 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package chacha20 implements the ChaCha20 and XChaCha20 encryption algorithms
// as specified in RFC 8439 and draft-irtf-cfrg-xchacha-01.
package chacha20
import (
"crypto/cipher"
"encoding/binary"
"errors"
"math/bits"
"golang.org/x/crypto/internal/alias"
)
const (
	// KeySize is the size of the key used by this cipher, in bytes.
	KeySize = 32

	// NonceSize is the size of the nonce used with the standard variant of this
	// cipher, in bytes.
	//
	// Note that this is too short to be safely generated at random if the same
	// key is reused more than 2³² times.
	//
	// The value has to be 12: Cipher stores the nonce in three 32-bit words and
	// newUnauthenticatedCipher reads exactly nonce[0:12]. A larger value makes
	// the constructor accept extra nonce bytes that are then silently ignored
	// (so nonces differing only in those bytes yield the same key stream) and
	// makes it reject the standard 12-byte nonce.
	NonceSize = 12

	// NonceSizeX is the size of the nonce used with the XChaCha20 variant of
	// this cipher, in bytes.
	NonceSizeX = 24
)
// Cipher holds the running state of one ChaCha20 or XChaCha20 key stream for a
// fixed key and nonce. *Cipher satisfies the cipher.Stream interface.
type Cipher struct {
	// Variable part of the 16-word ChaCha20 state: eight key words, one block
	// counter (advanced once per generated block) and three nonce words. The
	// other four words are constants.
	//
	// Keep counter directly in front of nonce: the ppc64le assembly is handed
	// only &counter and loads the nonce from the words that follow it.
	key     [8]uint32
	counter uint32
	nonce   [3]uint32

	// buf caches key stream that was generated but not consumed by the last
	// XORKeyStream call; the unread bytes are the final len bytes of buf. Its
	// size is the number of bytes xorKeyStreamBlocks produces at a time.
	buf [bufSize]byte
	len int

	// overflow records that the counter has wrapped: no further blocks may be
	// generated and the next XORKeyStream call must panic.
	overflow bool

	// Cached results of the first-round quarter rounds that do not involve
	// the counter; valid once precompDone is set.
	precompDone      bool
	p1, p5, p9, p13  uint32
	p2, p6, p10, p14 uint32
	p3, p7, p11, p15 uint32
}

// Compile-time check that *Cipher implements cipher.Stream.
var _ cipher.Stream = (*Cipher)(nil)
// NewUnauthenticatedCipher returns a ChaCha20 stream cipher for the given
// KeySize-byte key. The nonce must be NonceSize bytes long, or NonceSizeX
// bytes long to select the XChaCha20 construction. Any other key or nonce
// length results in an error.
//
// ChaCha20 by itself offers no authentication: like every stream cipher it
// lets an attacker modify the plaintext without detection. Treat it as a
// building block rather than a complete encryption scheme, and prefer package
// golang.org/x/crypto/chacha20poly1305 where possible.
func NewUnauthenticatedCipher(key, nonce []byte) (*Cipher, error) {
	// Deliberately a trivial wrapper: it can be inlined, so the Cipher
	// allocation happens in the caller and need not escape to the heap if the
	// caller's use of the result allows it.
	return newUnauthenticatedCipher(&Cipher{}, key, nonce)
}
// newUnauthenticatedCipher validates key and nonce and loads them into c. It
// returns c on success, or a nil *Cipher and an error if either length is
// wrong; c is left untouched in that case.
func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
	if len(key) != KeySize {
		return nil, errors.New("chacha20: wrong key size")
	}

	switch len(nonce) {
	case NonceSizeX:
		// XChaCha20: the first 16 nonce bytes are mixed into a derived key
		// with the ChaCha20 core, the remaining 8 become bytes 4..11 of an
		// otherwise zero nonce. See draft-irtf-cfrg-xchacha-01, Section 2.3.
		//
		// The error is ignored because both lengths HChaCha20 checks are
		// already known to be correct here.
		subKey, _ := HChaCha20(key, nonce[0:16])
		short := make([]byte, NonceSize)
		copy(short[4:12], nonce[16:24])
		key, nonce = subKey, short
	case NonceSize:
		// Standard variant, nonce is used as given.
	default:
		return nil, errors.New("chacha20: wrong nonce size")
	}

	key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
	for i := range c.key {
		c.key[i] = binary.LittleEndian.Uint32(key[4*i : 4*i+4])
	}
	for i := range c.nonce {
		c.nonce[i] = binary.LittleEndian.Uint32(nonce[4*i : 4*i+4])
	}
	return c, nil
}
// The constant first 4 words of the ChaCha20 state. Read as little-endian
// bytes they spell the ASCII string "expand 32-byte k".
const (
j0 uint32 = 0x61707865 // expa
j1 uint32 = 0x3320646e // nd 3
j2 uint32 = 0x79622d32 // 2-by
j3 uint32 = 0x6b206574 // te k
)
// blockSize is the size in bytes of one ChaCha20 key stream block, i.e. the
// 16 32-bit state words.
const blockSize = 64
// quarterRound is the ChaCha20 mixing primitive. It takes four state words and
// returns them after four add / xor / rotate steps using the rotation amounts
// 16, 12, 8 and 7. Every ChaCha20 round applies it four times, covering all 16
// state words either column by column or diagonal by diagonal.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16)

	c += d
	b = bits.RotateLeft32(b^c, 12)

	a += b
	d = bits.RotateLeft32(d^a, 8)

	c += d
	b = bits.RotateLeft32(b^c, 7)

	return a, b, c, d
}
// SetCounter moves the Cipher to the given block counter. Afterwards
// XORKeyStream continues as though (64 * counter) bytes had already been
// encrypted.
//
// As a guard against accidental key stream reuse, SetCounter panics when
// counter lies before the current position, or when the counter has already
// overflowed.
//
// Note that the execution time of XORKeyStream is not independent of the
// counter value.
func (s *Cipher) SetCounter(counter uint32) {
	// s.counter runs ahead of what has actually been output by the number of
	// whole blocks still waiting in the buffer, so the rollback check has to
	// take s.len into account.
	buffered := uint32(s.len) / blockSize
	if oldest := s.counter - buffered; s.overflow || counter < oldest {
		panic("chacha20: SetCounter attempted to rollback counter")
	}

	if counter >= s.counter {
		// Moving to or past the end of the buffer: drop the buffer so that
		// the next XORKeyStream call refills it from the new counter.
		s.counter, s.len = counter, 0
		return
	}

	// The target block is already buffered; just skip ahead inside the buffer.
	s.len = int(s.counter-counter) * blockSize
}
// XORKeyStream XORs every byte of src with the next byte of the cipher's key
// stream and stores the result in dst. dst and src must either overlap
// entirely or not at all.
//
// XORKeyStream panics if len(dst) < len(src). A dst longer than src is
// accepted; only dst[:len(src)] is written.
//
// The Cipher keeps its position between calls, so a sequence of calls produces
// the same output as a single call on the concatenated src buffers. It panics
// once continuing would require the block counter to overflow.
func (s *Cipher) XORKeyStream(dst, src []byte) {
	if len(src) == 0 {
		return
	}
	if len(dst) < len(src) {
		panic("chacha20: output smaller than input")
	}
	dst = dst[:len(src)]
	if alias.InexactOverlap(dst, src) {
		panic("chacha20: invalid buffer overlap")
	}

	// Use up key stream left over from an earlier call before making more.
	if s.len != 0 {
		pending := s.buf[bufSize-s.len:]
		if len(pending) > len(src) {
			pending = pending[:len(src)]
		}
		n := len(pending)
		_ = src[n-1] // bounds check elimination hint
		for i := 0; i < n; i++ {
			dst[i] = src[i] ^ pending[i]
		}
		s.len -= n
		dst, src = dst[n:], src[n:]
	}
	if len(src) == 0 {
		return
	}

	// Refuse to run the counter past its last value. Landing exactly on the
	// last block is fine, but nothing may be generated after the buffer has
	// been drained, which is what the overflow flag records.
	needed := (uint64(len(src)) + blockSize - 1) / blockSize
	switch last := uint64(s.counter) + needed; {
	case s.overflow || last > 1<<32:
		panic("chacha20: counter overflow")
	case last == 1<<32:
		s.overflow = true
	}

	// xorKeyStreamBlocks wants lengths that are a multiple of bufSize, which
	// on platforms with a multi-block implementation is itself a multiple of
	// blockSize. Process that part of the input directly.
	whole := len(src) - len(src)%bufSize
	if whole > 0 {
		s.xorKeyStreamBlocks(dst[:whole], src[:whole])
	}
	dst, src = dst[whole:], src[whole:]

	// A full multi-block call would overflow the counter this close to the
	// end, so fall back to the generic code, which advances block by block.
	const blocksPerBuf = bufSize / blockSize
	if uint64(s.counter)+blocksPerBuf > 1<<32 {
		s.buf = [bufSize]byte{}
		tailBlocks := (len(src) + blockSize - 1) / blockSize
		tail := s.buf[bufSize-tailBlocks*blockSize:]
		copy(tail, src)
		s.xorKeyStreamBlocksGeneric(tail, tail)
		s.len = len(tail) - copy(dst, tail)
		return
	}

	if len(src) == 0 {
		return
	}

	// Partial (multi-)block: pad it to bufSize inside s.buf, encrypt in place
	// and remember how much unused key stream is left for the next call.
	s.buf = [bufSize]byte{}
	copy(s.buf[:], src)
	s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
	s.len = bufSize - copy(dst, s.buf[:])
}
// xorKeyStreamBlocksGeneric is the portable block function. It XORs src with
// key stream into dst one 64-byte block at a time, incrementing s.counter
// after every block. dst and src must have the same length, and that length
// must be a multiple of blockSize; otherwise it panics.
func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
	if len(dst) != len(src) || len(dst)%blockSize != 0 {
		panic("chacha20: internal error: wrong dst and/or src length")
	}

	// Each key stream block is the initial state below after 20 rounds of
	// mixing, alternating between quarterRounds on the columns (like
	// 1, 5, 9, 13) and on the diagonals (like 1, 6, 11, 12).
	//
	//      0:cccccccc   1:cccccccc   2:cccccccc   3:cccccccc
	//      4:kkkkkkkk   5:kkkkkkkk   6:kkkkkkkk   7:kkkkkkkk
	//      8:kkkkkkkk   9:kkkkkkkk  10:kkkkkkkk  11:kkkkkkkk
	//     12:bbbbbbbb  13:nnnnnnnn  14:nnnnnnnn  15:nnnnnnnn
	//
	//            c=constant k=key b=blockcount n=nonce
	//
	// Word 12 is read from s.counter where needed because it changes from
	// block to block.
	c0, c1, c2, c3 := j0, j1, j2, j3
	c4, c5, c6, c7 := s.key[0], s.key[1], s.key[2], s.key[3]
	c8, c9, c10, c11 := s.key[4], s.key[5], s.key[6], s.key[7]
	c13, c14, c15 := s.nonce[0], s.nonce[1], s.nonce[2]

	// Columns 1-3 of the first round never see the counter. Compute them once
	// and keep them for all later blocks and XORKeyStream calls.
	if !s.precompDone {
		s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13)
		s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14)
		s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15)
		s.precompDone = true
	}

	// len(src) > 0 would do as the loop condition; spelling out both lengths
	// doubles as a bounds check elimination hint.
	for len(src) >= 64 && len(dst) >= 64 {
		// Round 1, column 0: the only column that depends on the counter.
		col0, col4, col8, col12 := quarterRound(c0, c4, c8, s.counter)

		// Round 2: diagonals, fed from the cached and the fresh column.
		x0, x5, x10, x15 := quarterRound(col0, s.p5, s.p10, s.p15)
		x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, col12)
		x2, x7, x8, x13 := quarterRound(s.p2, s.p7, col8, s.p13)
		x3, x4, x9, x14 := quarterRound(s.p3, col4, s.p9, s.p14)

		// Rounds 3 to 20, two per iteration.
		for round := 2; round < 20; round += 2 {
			// Column round.
			x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
			x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
			x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
			x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)

			// Diagonal round.
			x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
			x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
			x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
		}

		// The key stream word is the mixed word plus the matching initial
		// state word; addXor is handed both, along with the four source and
		// destination bytes for that word.
		addXor(dst[0:4], src[0:4], x0, c0)
		addXor(dst[4:8], src[4:8], x1, c1)
		addXor(dst[8:12], src[8:12], x2, c2)
		addXor(dst[12:16], src[12:16], x3, c3)
		addXor(dst[16:20], src[16:20], x4, c4)
		addXor(dst[20:24], src[20:24], x5, c5)
		addXor(dst[24:28], src[24:28], x6, c6)
		addXor(dst[28:32], src[28:32], x7, c7)
		addXor(dst[32:36], src[32:36], x8, c8)
		addXor(dst[36:40], src[36:40], x9, c9)
		addXor(dst[40:44], src[40:44], x10, c10)
		addXor(dst[44:48], src[44:48], x11, c11)
		addXor(dst[48:52], src[48:52], x12, s.counter)
		addXor(dst[52:56], src[52:56], x13, c13)
		addXor(dst[56:60], src[56:60], x14, c14)
		addXor(dst[60:64], src[60:64], x15, c15)

		s.counter++
		src, dst = src[blockSize:], dst[blockSize:]
	}
}
// HChaCha20 derives a 32-byte key from a 32-byte key and a 16-byte nonce with
// the ChaCha20 core, as needed by the XChaCha20 construction. It returns an
// error if key or nonce has any other length.
func HChaCha20(key, nonce []byte) ([]byte, error) {
	// Deliberately a trivial wrapper: it can be inlined, so the output slice
	// is allocated in the caller and need not escape to the heap if the
	// caller's use of the result allows it.
	return hChaCha20(make([]byte, 32), key, nonce)
}
// hChaCha20 does the work for HChaCha20. It builds a ChaCha20 state from the
// constants, the 32-byte key and the 16-byte nonce, runs 20 rounds over it
// and, without adding the initial state back, writes state words 0-3 and
// 12-15 to out[0:32] in little-endian order. out must hold at least 32 bytes.
// It returns out, or nil and an error if key or nonce has the wrong length.
func hChaCha20(out, key, nonce []byte) ([]byte, error) {
	switch {
	case len(key) != KeySize:
		return nil, errors.New("chacha20: wrong HChaCha20 key size")
	case len(nonce) != 16:
		return nil, errors.New("chacha20: wrong HChaCha20 nonce size")
	}

	le := binary.LittleEndian
	x0, x1, x2, x3 := j0, j1, j2, j3
	x4, x5, x6, x7 := le.Uint32(key[0:4]), le.Uint32(key[4:8]), le.Uint32(key[8:12]), le.Uint32(key[12:16])
	x8, x9, x10, x11 := le.Uint32(key[16:20]), le.Uint32(key[20:24]), le.Uint32(key[24:28]), le.Uint32(key[28:32])
	x12, x13, x14, x15 := le.Uint32(nonce[0:4]), le.Uint32(nonce[4:8]), le.Uint32(nonce[8:12]), le.Uint32(nonce[12:16])

	// 20 rounds, two per iteration.
	for round := 0; round < 20; round += 2 {
		// Column round.
		x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
		x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
		x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
		x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)

		// Diagonal round.
		x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
		x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
		x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
		x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
	}

	_ = out[31] // bounds check elimination hint
	for i, w := range [8]uint32{x0, x1, x2, x3, x12, x13, x14, x15} {
		le.PutUint32(out[4*i:4*i+4], w)
	}
	return out, nil
}

14
vendor/golang.org/x/crypto/chacha20/chacha_noasm.go generated vendored Normal file
View File

@@ -0,0 +1,14 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!arm64 && !s390x && !ppc64le) || (arm64 && !go1.11) || !gc || purego
// +build !arm64,!s390x,!ppc64le arm64,!go1.11 !gc purego
package chacha20
// bufSize equals blockSize in this build: without an assembly implementation
// key stream is buffered a single block at a time.
const bufSize = blockSize
// xorKeyStreamBlocks is the block function for builds without an assembly
// implementation; it simply delegates to the pure Go xorKeyStreamBlocksGeneric.
func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
s.xorKeyStreamBlocksGeneric(dst, src)
}

17
vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go generated vendored Normal file
View File

@@ -0,0 +1,17 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
package chacha20
// bufSize is 256 bytes on ppc64le, i.e. four blockSize-byte blocks.
const bufSize = 256
// chaCha20_ctr32_vsx is implemented in chacha_ppc64le.s. It is given raw
// pointers to the output and input, the input length in bytes, the key and
// the block counter. No nonce pointer is passed: the assembly's "Load CNT,
// NONCE" step reads four 32-bit words starting at counter, so it relies on
// the nonce being stored directly after the counter.
//go:noescape
func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
// xorKeyStreamBlocks is the ppc64le implementation of the per-platform block
// function; it forwards to the VSX assembly routine.
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
// &dst[0] and &src[0] panic on empty slices, so both must be non-empty.
chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
}

450
vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s generated vendored Normal file
View File

@@ -0,0 +1,450 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"
// Register aliases used by chaCha20_ctr32_vsx.
// OUT, INP, LEN, KEY and CNT receive the five arguments, in that order.
#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
// TMP is not referenced in the part of the routine reviewed here.
#define TMP R15
// CONSTBASE holds the address of the consts<> table.
#define CONSTBASE R16
// BLOCKS is LEN >> 6, the number of 64-byte blocks in the request.
#define BLOCKS R17
// consts<> is the read-only constant table of chaCha20_ctr32_vsx.
//
// 0x00-0x0f: the four ChaCha20 constant words ("expand 32-byte k"); loaded
// once into V16.
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
// 0x10-0x4f: the values 1 and 4 followed by two 16-byte index patterns. None
// of these entries is referenced in the part of the routine reviewed here.
// NOTE(review): presumably carried over from the CRYPTOGAMS original; confirm
// against the rest of the file before changing or removing them.
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
// 0x50-0x8f: each constant word replicated into all four 32-bit lanes; loaded
// into V0..V3 at the top of loop_outer_vsx (CONSTBASE points at 0x50 by then).
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
// 0x90-0x9f: the words 0,1,2,3; loaded into V28 and added to the splatted
// counter so that each lane works on its own block number.
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
MOVD out+0(FP), OUT
MOVD inp+8(FP), INP
MOVD len+16(FP), LEN
MOVD key+24(FP), KEY
MOVD counter+32(FP), CNT
// Addressing for constants
MOVD $consts<>+0x00(SB), CONSTBASE
MOVD $16, R8
MOVD $32, R9
MOVD $48, R10
MOVD $64, R11
SRD $6, LEN, BLOCKS
// V16
LXVW4X (CONSTBASE)(R0), VS48
ADD $80,CONSTBASE
// Load key into V17,V18
LXVW4X (KEY)(R0), VS49
LXVW4X (KEY)(R8), VS50
// Load CNT, NONCE into V19
LXVW4X (CNT)(R0), VS51
// Clear V27
VXOR V27, V27, V27
// V28
LXVW4X (CONSTBASE)(R11), VS60
// splat slot from V19 -> V26
VSPLTW $0, V19, V26
VSLDOI $4, V19, V27, V19
VSLDOI $12, V27, V19, V19
VADDUWM V26, V28, V26
MOVD $10, R14
MOVD R14, CTR
loop_outer_vsx:
// V0, V1, V2, V3
LXVW4X (R0)(CONSTBASE), VS32
LXVW4X (R8)(CONSTBASE), VS33
LXVW4X (R9)(CONSTBASE), VS34
LXVW4X (R10)(CONSTBASE), VS35
// splat values from V17, V18 into V4-V11
VSPLTW $0, V17, V4
VSPLTW $1, V17, V5
VSPLTW $2, V17, V6
VSPLTW $3, V17, V7
VSPLTW $0, V18, V8
VSPLTW $1, V18, V9
VSPLTW $2, V18, V10
VSPLTW $3, V18, V11
// VOR
VOR V26, V26, V12
// splat values from V19 -> V13, V14, V15
VSPLTW $1, V19, V13
VSPLTW $2, V19, V14
VSPLTW $3, V19, V15
// splat const values
VSPLTISW $-16, V27
VSPLTISW $12, V28
VSPLTISW $8, V29
VSPLTISW $7, V30
loop_vsx:
VADDUWM V0, V4, V0
VADDUWM V1, V5, V1
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3
VXOR V12, V0, V12
VXOR V13, V1, V13
VXOR V14, V2, V14
VXOR V15, V3, V15
VRLW V12, V27, V12
VRLW V13, V27, V13
VRLW V14, V27, V14
VRLW V15, V27, V15
VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
VADDUWM V10, V14, V10
VADDUWM V11, V15, V11
VXOR V4, V8, V4
VXOR V5, V9, V5
VXOR V6, V10, V6
VXOR V7, V11, V7
VRLW V4, V28, V4
VRLW V5, V28, V5
VRLW V6, V28, V6
VRLW V7, V28, V7
VADDUWM V0, V4, V0
VADDUWM V1, V5, V1
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3
VXOR V12, V0, V12
VXOR V13, V1, V13
VXOR V14, V2, V14
VXOR V15, V3, V15
VRLW V12, V29, V12
VRLW V13, V29, V13
VRLW V14, V29, V14
VRLW V15, V29, V15
VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
VADDUWM V10, V14, V10
VADDUWM V11, V15, V11
VXOR V4, V8, V4
VXOR V5, V9, V5
VXOR V6, V10, V6
VXOR V7, V11, V7
VRLW V4, V30, V4
VRLW V5, V30, V5
VRLW V6, V30, V6
VRLW V7, V30, V7
VADDUWM V0, V5, V0
VADDUWM V1, V6, V1
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3
VXOR V15, V0, V15
VXOR V12, V1, V12
VXOR V13, V2, V13
VXOR V14, V3, V14
VRLW V15, V27, V15
VRLW V12, V27, V12
VRLW V13, V27, V13
VRLW V14, V27, V14
VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
VADDUWM V8, V13, V8
VADDUWM V9, V14, V9
VXOR V5, V10, V5
VXOR V6, V11, V6
VXOR V7, V8, V7
VXOR V4, V9, V4
VRLW V5, V28, V5
VRLW V6, V28, V6
VRLW V7, V28, V7
VRLW V4, V28, V4
VADDUWM V0, V5, V0
VADDUWM V1, V6, V1
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3
VXOR V15, V0, V15
VXOR V12, V1, V12
VXOR V13, V2, V13
VXOR V14, V3, V14
VRLW V15, V29, V15
VRLW V12, V29, V12
VRLW V13, V29, V13
VRLW V14, V29, V14
VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
VADDUWM V8, V13, V8
VADDUWM V9, V14, V9
VXOR V5, V10, V5
VXOR V6, V11, V6
VXOR V7, V8, V7
VXOR V4, V9, V4
VRLW V5, V30, V5
VRLW V6, V30, V6
VRLW V7, V30, V7
VRLW V4, V30, V4
BC 16, LT, loop_vsx
VADDUWM V12, V26, V12
WORD $0x13600F8C // VMRGEW V0, V1, V27
WORD $0x13821F8C // VMRGEW V2, V3, V28
WORD $0x10000E8C // VMRGOW V0, V1, V0
WORD $0x10421E8C // VMRGOW V2, V3, V2
WORD $0x13A42F8C // VMRGEW V4, V5, V29
WORD $0x13C63F8C // VMRGEW V6, V7, V30
XXPERMDI VS32, VS34, $0, VS33
XXPERMDI VS32, VS34, $3, VS35
XXPERMDI VS59, VS60, $0, VS32
XXPERMDI VS59, VS60, $3, VS34
WORD $0x10842E8C // VMRGOW V4, V5, V4
WORD $0x10C63E8C // VMRGOW V6, V7, V6
WORD $0x13684F8C // VMRGEW V8, V9, V27
WORD $0x138A5F8C // VMRGEW V10, V11, V28
XXPERMDI VS36, VS38, $0, VS37
XXPERMDI VS36, VS38, $3, VS39
XXPERMDI VS61, VS62, $0, VS36
XXPERMDI VS61, VS62, $3, VS38
WORD $0x11084E8C // VMRGOW V8, V9, V8
WORD $0x114A5E8C // VMRGOW V10, V11, V10
WORD $0x13AC6F8C // VMRGEW V12, V13, V29
WORD $0x13CE7F8C // VMRGEW V14, V15, V30
XXPERMDI VS40, VS42, $0, VS41
XXPERMDI VS40, VS42, $3, VS43
XXPERMDI VS59, VS60, $0, VS40
XXPERMDI VS59, VS60, $3, VS42
WORD $0x118C6E8C // VMRGOW V12, V13, V12
WORD $0x11CE7E8C // VMRGOW V14, V15, V14
VSPLTISW $4, V27
VADDUWM V26, V27, V26
XXPERMDI VS44, VS46, $0, VS45
XXPERMDI VS44, VS46, $3, VS47
XXPERMDI VS61, VS62, $0, VS44
XXPERMDI VS61, VS62, $3, VS46
VADDUWM V0, V16, V0
VADDUWM V4, V17, V4
VADDUWM V8, V18, V8
VADDUWM V12, V19, V12
CMPU LEN, $64
BLT tail_vsx
// Bottom of loop
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
BEQ done_vsx
VADDUWM V1, V16, V0
VADDUWM V5, V17, V4
VADDUWM V9, V18, V8
VADDUWM V13, V19, V12
CMPU LEN, $64
BLT tail_vsx
// Load the next 64 bytes of input, XOR them with the values prepared in
// V0/V4/V8/V12, and store the result at OUT. VS59-VS62 are the VSX names of
// V27-V30, which is why the XORs operate on V27-V30.
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
// The index must be the general purpose register R10, matching the load of
// VS62 above and the other store sequences in this loop. V10 is a vector
// register and cannot be used to form an address.
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
BEQ done_vsx
VADDUWM V2, V16, V0
VADDUWM V6, V17, V4
VADDUWM V10, V18, V8
VADDUWM V14, V19, V12
CMPU LEN, $64
BLT tail_vsx
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
BEQ done_vsx
VADDUWM V3, V16, V0
VADDUWM V7, V17, V4
VADDUWM V11, V18, V8
VADDUWM V15, V19, V12
CMPU LEN, $64
BLT tail_vsx
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
MOVD $10, R14
MOVD R14, CTR
BNE loop_outer_vsx
done_vsx:
// Increment counter by number of 64 byte blocks
MOVD (CNT), R14
ADD BLOCKS, R14
MOVD R14, (CNT)
RET
tail_vsx:
ADD $32, R1, R11
MOVD LEN, CTR
// Save values on stack to copy from
STXVW4X VS32, (R11)(R0)
STXVW4X VS36, (R11)(R8)
STXVW4X VS40, (R11)(R9)
STXVW4X VS44, (R11)(R10)
ADD $-1, R11, R12
ADD $-1, INP
ADD $-1, OUT
looptail_vsx:
// Copying the result to OUT
// in bytes.
MOVBZU 1(R12), KEY
MOVBZU 1(INP), TMP
XOR KEY, TMP, KEY
MOVBU KEY, 1(OUT)
BC 16, LT, looptail_vsx
// Clear the stack values
STXVW4X VS48, (R11)(R0)
STXVW4X VS48, (R11)(R8)
STXVW4X VS48, (R11)(R9)
STXVW4X VS48, (R11)(R10)
BR done_vsx

28
vendor/golang.org/x/crypto/chacha20/chacha_s390x.go generated vendored Normal file
View File

@@ -0,0 +1,28 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
package chacha20
import "golang.org/x/sys/cpu"
// haveAsm records whether the s390x vector facility (VX) was detected by
// golang.org/x/sys/cpu.
var haveAsm = cpu.S390X.HasVX
// bufSize is 256 bytes, the amount of data the vector implementation in
// chacha_s390x.s handles per loop iteration (4 blocks of 64 bytes).
// NOTE(review): how bufSize is consumed is not visible in this file; confirm
// against the generic Cipher code.
const bufSize = 256
// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
// be called when the vector facility is available. Implementation in
// chacha_s390x.s.
//
// The assembly works through src in 256-byte steps and only stops once the
// remaining length is exactly zero, so len(src) must be a non-zero multiple
// of 256. len(dst) is never read, so dst must be at least as long as src.
// The block counter is read from *counter and the updated value is stored
// back before returning.
//
//go:noescape
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
// xorKeyStreamBlocks XORs src with keystream into dst. It dispatches to the
// vector assembly when the CPU reports the vector facility and to the generic
// implementation otherwise.
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
	if !cpu.S390X.HasVX {
		c.xorKeyStreamBlocksGeneric(dst, src)
		return
	}
	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
}

225
vendor/golang.org/x/crypto/chacha20/chacha_s390x.s generated vendored Normal file
View File

@@ -0,0 +1,225 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "go_asm.h"
#include "textflag.h"
// This is an implementation of the ChaCha20 encryption algorithm as
// specified in RFC 7539. It uses vector instructions to compute
// 4 keystream blocks in parallel (256 bytes) which are then XORed
// with the bytes in the input slice.
// constants holds two 16-byte vectors that xorKeyStreamVX loads together
// into BSWAP and J0.
GLOBL ·constants<>(SB), RODATA|NOPTR, $32
// BSWAP: swap bytes in each 4-byte element
// (used as the VPERM mask by PERMUTE/XORV).
DATA ·constants<>+0x00(SB)/4, $0x03020100
DATA ·constants<>+0x04(SB)/4, $0x07060504
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
// J0: [j0, j1, j2, j3]
// These four words are the little-endian encoding of the ASCII string
// "expand 32-byte k", the ChaCha20 constants.
DATA ·constants<>+0x10(SB)/4, $0x61707865
DATA ·constants<>+0x14(SB)/4, $0x3320646e
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574
// Vector register assignments used by xorKeyStreamVX.
// BSWAP and J0 receive the two constant vectors, KEY0/KEY1 the eight key
// words, NONCE the nonce and CTR the per-lane block counters.
#define BSWAP V5
#define J0 V6
#define KEY0 V7
#define KEY1 V8
#define NONCE V9
#define CTR V10
// M0-M3 are scratch registers: input data in XORV, temporaries in SHUFFLE.
#define M0 V11
#define M1 V12
#define M2 V13
#define M3 V14
// INC is the increment added to CTR.
#define INC V15
// X0-X15 hold the 16 words of the ChaCha20 state. Each register carries the
// same state word for four blocks, one block per 32-bit lane.
#define X0 V16
#define X1 V17
#define X2 V18
#define X3 V19
#define X4 V20
#define X5 V21
#define X6 V22
#define X7 V23
#define X8 V24
#define X9 V25
#define X10 V26
#define X11 V27
#define X12 V28
#define X13 V29
#define X14 V30
#define X15 V31
// NUM_ROUNDS is the number of ChaCha rounds; the round loop runs
// NUM_ROUNDS/2 times with two ROUND4 invocations per iteration.
#define NUM_ROUNDS 20
// ROUND4 performs four ChaCha quarter rounds, one on each of the register
// quadruples (a0..a3), (b0..b3), (c0..c3) and (d0..d3), with the instructions
// of the four quarter rounds interleaved. For a quadruple (x0, x1, x2, x3)
// the steps are:
//
//	x0 += x1; x2 ^= x0; x2 <<<= 16
//	x3 += x2; x1 ^= x3; x1 <<<= 12
//	x0 += x1; x2 ^= x0; x2 <<<= 8
//	x3 += x2; x1 ^= x3; x1 <<<= 7
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
VAF a1, a0, a0 \
VAF b1, b0, b0 \
VAF c1, c0, c0 \
VAF d1, d0, d0 \
VX a0, a2, a2 \
VX b0, b2, b2 \
VX c0, c2, c2 \
VX d0, d2, d2 \
VERLLF $16, a2, a2 \
VERLLF $16, b2, b2 \
VERLLF $16, c2, c2 \
VERLLF $16, d2, d2 \
VAF a2, a3, a3 \
VAF b2, b3, b3 \
VAF c2, c3, c3 \
VAF d2, d3, d3 \
VX a3, a1, a1 \
VX b3, b1, b1 \
VX c3, c1, c1 \
VX d3, d1, d1 \
VERLLF $12, a1, a1 \
VERLLF $12, b1, b1 \
VERLLF $12, c1, c1 \
VERLLF $12, d1, d1 \
VAF a1, a0, a0 \
VAF b1, b0, b0 \
VAF c1, c0, c0 \
VAF d1, d0, d0 \
VX a0, a2, a2 \
VX b0, b2, b2 \
VX c0, c2, c2 \
VX d0, d2, d2 \
VERLLF $8, a2, a2 \
VERLLF $8, b2, b2 \
VERLLF $8, c2, c2 \
VERLLF $8, d2, d2 \
VAF a2, a3, a3 \
VAF b2, b3, b3 \
VAF c2, c3, c3 \
VAF d2, d3, d3 \
VX a3, a1, a1 \
VX b3, b1, b1 \
VX c3, c1, c1 \
VX d3, d1, d1 \
VERLLF $7, a1, a1 \
VERLLF $7, b1, b1 \
VERLLF $7, c1, c1 \
VERLLF $7, d1, d1
// PERMUTE applies the byte permutation described by mask to each of
// v0..v3 in place.
#define PERMUTE(mask, v0, v1, v2, v3) \
VPERM v0, v0, mask, v0 \
VPERM v1, v1, mask, v1 \
VPERM v2, v2, mask, v2 \
VPERM v3, v3, mask, v3
// ADDV adds the vector x to each of v0..v3 in place, lane by lane on 32-bit
// words.
#define ADDV(x, v0, v1, v2, v3) \
VAF x, v0, v0 \
VAF x, v1, v1 \
VAF x, v2, v2 \
VAF x, v3, v3
// XORV loads 64 bytes from off(src) into M0-M3, byte-swaps the words of
// v0..v3 with the BSWAP mask, XORs them into the loaded data and stores the
// 64 result bytes at off(dst). v0..v3 are left in their byte-swapped form and
// M0-M3 are overwritten.
#define XORV(off, dst, src, v0, v1, v2, v3) \
VLM off(src), M0, M3 \
PERMUTE(BSWAP, v0, v1, v2, v3) \
VX v0, M0, M0 \
VX v1, M1, M1 \
VX v2, M2, M2 \
VX v3, M3, M3 \
VSTM M0, M3, off(dst)
// SHUFFLE transposes the 4x4 matrix of 32-bit words whose rows are a, b, c
// and d, so that afterwards a holds element 0 of every input, b element 1,
// and so on. t, u, v and w are overwritten as temporaries.
#define SHUFFLE(a, b, c, d, t, u, v, w) \
VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// Each pass through the chacha loop handles 256 bytes: it computes four
// 64-byte blocks in parallel, XORs them with src and stores the result in
// dst. The loop exits only when the remaining length in R4 is exactly zero,
// so len(src) must be a non-zero multiple of 256.
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
MOVD $·constants<>(SB), R1
MOVD dst+0(FP), R2 // R2=&dst[0]
LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
MOVD key+48(FP), R5 // R5=key
MOVD nonce+56(FP), R6 // R6=nonce
MOVD counter+64(FP), R7 // R7=counter
// load BSWAP and J0
VLM (R1), BSWAP, J0
// setup
MOVD $95, R0
VLM (R5), KEY0, KEY1
VLL R0, (R6), NONCE
VZERO M0
VLEIB $7, $32, M0
// NOTE(review): this appears to shift NONCE right by 4 bytes so that the
// three nonce words sit in elements 1-3, which is where the VREPF
// instructions below read them; confirm against the VSRLB definition.
VSRLB M0, NONCE, NONCE
// initialize counter values
// CTR = {n, n+1, n+2, n+3} where n is *counter; INC then becomes {4, 4, 4, 4}.
VLREPF (R7), CTR
VZERO INC
VLEIF $1, $1, INC
VLEIF $2, $2, INC
VLEIF $3, $3, INC
VAF INC, CTR, CTR
VREPIF $4, INC
chacha:
// Build the initial state: each X register gets one state word replicated
// across all four lanes, except X12 which takes the per-lane counters.
VREPF $0, J0, X0
VREPF $1, J0, X1
VREPF $2, J0, X2
VREPF $3, J0, X3
VREPF $0, KEY0, X4
VREPF $1, KEY0, X5
VREPF $2, KEY0, X6
VREPF $3, KEY0, X7
VREPF $0, KEY1, X8
VREPF $1, KEY1, X9
VREPF $2, KEY1, X10
VREPF $3, KEY1, X11
VLR CTR, X12
VREPF $1, NONCE, X13
VREPF $2, NONCE, X14
VREPF $3, NONCE, X15
MOVD $(NUM_ROUNDS/2), R1
loop:
// One column round followed by one diagonal round per iteration.
ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
ADD $-1, R1
BNE loop
// decrement length
ADD $-256, R4
// rearrange vectors
// Each SHUFFLE transposes four state words so that a register holds four
// consecutive words of one block; ADDV then adds the matching words of the
// initial state. The per-lane counters are added to X12 before its transpose.
SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
ADDV(J0, X0, X1, X2, X3)
SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
ADDV(KEY0, X4, X5, X6, X7)
SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
ADDV(KEY1, X8, X9, X10, X11)
VAF CTR, X12, X12
SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
ADDV(NONCE, X12, X13, X14, X15)
// increment counters
VAF INC, CTR, CTR
// xor keystream with plaintext
XORV(0*64, R2, R3, X0, X4, X8, X12)
XORV(1*64, R2, R3, X1, X5, X9, X13)
XORV(2*64, R2, R3, X2, X6, X10, X14)
XORV(3*64, R2, R3, X3, X7, X11, X15)
// increment pointers
MOVD $256(R2), R2
MOVD $256(R3), R3
CMPBNE R4, $0, chacha
// Store element 0 of CTR, the next unused counter value, back to *counter.
VSTEF $0, CTR, (R7)
RET

42
vendor/golang.org/x/crypto/chacha20/xor.go generated vendored Normal file
View File

@@ -0,0 +1,42 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package chacha20
import "runtime"
// unaligned is true on platforms with fast unaligned 32-bit little endian
// memory accesses.
const unaligned = runtime.GOARCH == "386" ||
	runtime.GOARCH == "amd64" ||
	runtime.GOARCH == "arm64" ||
	runtime.GOARCH == "ppc64le" ||
	runtime.GOARCH == "s390x"

// addXor XORs the little endian uint32 stored in src[0:4] with a + b and
// writes the result, in little endian byte order, to dst[0:4].
func addXor(dst, src []byte, a, b uint32) {
	_, _ = src[3], dst[3] // bounds check elimination hint
	sum := a + b
	if !unaligned {
		dst[0] = src[0] ^ byte(sum)
		dst[1] = src[1] ^ byte(sum>>8)
		dst[2] = src[2] ^ byte(sum>>16)
		dst[3] = src[3] ^ byte(sum>>24)
		return
	}
	// The compiler should optimize this code into
	// 32-bit unaligned little endian loads and stores.
	// TODO: delete once the compiler does a reliably
	// good job with the generic code above.
	// See issue #25111 for more details.
	word := uint32(src[0])
	word |= uint32(src[1]) << 8
	word |= uint32(src[2]) << 16
	word |= uint32(src[3]) << 24
	word ^= sum
	dst[0] = byte(word)
	dst[1] = byte(word >> 8)
	dst[2] = byte(word >> 16)
	dst[3] = byte(word >> 24)
}

32
vendor/golang.org/x/crypto/internal/alias/alias.go generated vendored Normal file
View File

@@ -0,0 +1,32 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !purego
// +build !purego
// Package alias implements memory aliasing tests.
package alias
import "unsafe"
// AnyOverlap reports whether x and y share memory at any (not necessarily
// corresponding) index. Only the first len(x) and len(y) bytes are
// considered; capacity beyond the length is ignored.
func AnyOverlap(x, y []byte) bool {
	if len(x) == 0 || len(y) == 0 {
		return false
	}
	xFirst := uintptr(unsafe.Pointer(&x[0]))
	xLast := uintptr(unsafe.Pointer(&x[len(x)-1]))
	yFirst := uintptr(unsafe.Pointer(&y[0]))
	yLast := uintptr(unsafe.Pointer(&y[len(y)-1]))
	return xFirst <= yLast && yFirst <= xLast
}
// InexactOverlap reports whether x and y share memory at any
// non-corresponding index. Capacity beyond the slice length is ignored, and
// slices of different lengths that start at the same address do not count as
// an inexact overlap.
//
// InexactOverlap can be used to implement the requirements of the crypto/cipher
// AEAD, Block, BlockMode and Stream interfaces.
func InexactOverlap(x, y []byte) bool {
	switch {
	case len(x) == 0, len(y) == 0:
		// An empty slice overlaps nothing.
		return false
	case &x[0] == &y[0]:
		// Same starting address: every shared index corresponds.
		return false
	}
	return AnyOverlap(x, y)
}

View File

@@ -0,0 +1,35 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build purego
// +build purego
// Package alias implements memory aliasing tests.
package alias
// This is the Google App Engine standard variant based on reflect
// because the unsafe package and cgo are disallowed.
import "reflect"
// AnyOverlap reports whether x and y share memory at any (not necessarily
// corresponding) index. Only the first len(x) and len(y) bytes are
// considered; capacity beyond the length is ignored.
func AnyOverlap(x, y []byte) bool {
	if len(x) == 0 || len(y) == 0 {
		return false
	}
	xFirst := reflect.ValueOf(&x[0]).Pointer()
	xLast := reflect.ValueOf(&x[len(x)-1]).Pointer()
	yFirst := reflect.ValueOf(&y[0]).Pointer()
	yLast := reflect.ValueOf(&y[len(y)-1]).Pointer()
	return xFirst <= yLast && yFirst <= xLast
}
// InexactOverlap reports whether x and y share memory at any
// non-corresponding index. Capacity beyond the slice length is ignored, and
// slices of different lengths that start at the same address do not count as
// an inexact overlap.
//
// InexactOverlap can be used to implement the requirements of the crypto/cipher
// AEAD, Block, BlockMode and Stream interfaces.
func InexactOverlap(x, y []byte) bool {
	switch {
	case len(x) == 0, len(y) == 0:
		// An empty slice overlaps nothing.
		return false
	case &x[0] == &y[0]:
		// Same starting address: every shared index corresponds.
		return false
	}
	return AnyOverlap(x, y)
}

View File

@@ -0,0 +1,40 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !go1.13
// +build !go1.13
package poly1305
// Generic fallbacks for the math/bits intrinsics, copied from
// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had
// variable time fallbacks until Go 1.13.
// bitsAdd64 returns x + y + carry together with the carry out of bit 63.
// The carry is computed with bitwise operations only, without branches.
func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
	total := x + y + carry
	// A carry out occurs when both top bits are set, or when one is set and
	// the top bit of the sum is clear.
	overflow := (x & y) | ((x | y) &^ total)
	return total, overflow >> 63
}
// bitsSub64 returns x - y - borrow together with the borrow out of bit 63.
// The borrow is computed with bitwise operations only, without branches.
func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
	result := x - y - borrow
	underflow := (^x & y) | (^(x ^ y) & result)
	return result, underflow >> 63
}
// bitsMul64 returns the 128-bit product of x and y as (hi, lo). The high
// half is assembled from 32-bit partial products.
func bitsMul64(x, y uint64) (hi, lo uint64) {
	const lowHalf = 1<<32 - 1
	xLo, xHi := x&lowHalf, x>>32
	yLo, yHi := y&lowHalf, y>>32

	lowProd := xLo * yLo
	mid := xHi*yLo + lowProd>>32
	midLo, midHi := mid&lowHalf, mid>>32
	midLo += xLo * yHi

	return xHi*yHi + midHi + midLo>>32, x * y
}

View File

@@ -0,0 +1,22 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.13
// +build go1.13
package poly1305
import "math/bits"
// bitsAdd64 returns x + y + carry and the carry out, via math/bits.Add64.
func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
	sum, carryOut = bits.Add64(x, y, carry)
	return sum, carryOut
}
// bitsSub64 returns x - y - borrow and the borrow out, via math/bits.Sub64.
func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
	diff, borrowOut = bits.Sub64(x, y, borrow)
	return diff, borrowOut
}
// bitsMul64 returns the 128-bit product of x and y as (hi, lo), via
// math/bits.Mul64.
func bitsMul64(x, y uint64) (hi, lo uint64) {
	hi, lo = bits.Mul64(x, y)
	return hi, lo
}

View File

@@ -0,0 +1,10 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
// +build !amd64,!ppc64le,!s390x !gc purego
package poly1305
// mac is the MAC implementation selected by this file's build constraints
// (no assembly available); it is macGeneric with nothing overridden.
type mac struct{ macGeneric }

View File

@@ -0,0 +1,99 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package poly1305 implements Poly1305 one-time message authentication code as
// specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
//
// Poly1305 is a fast, one-time authentication function. It is infeasible for an
// attacker to generate an authenticator for a message without the key. However, a
// key must only be used for a single message. Authenticating two different
// messages with the same key allows an attacker to forge authenticators for other
// messages with the same key.
//
// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
// used with a fixed key in order to generate one-time keys from a nonce.
// However, in this package AES isn't used and the one-time key is specified
// directly.
package poly1305
import "crypto/subtle"
// TagSize is the size, in bytes, of a poly1305 authenticator.
const TagSize = 16
// Sum computes the Poly1305 authenticator of m under the one-time key and
// stores the 16-byte result in out. A key must never be reused: authenticating
// two different messages with the same key allows an attacker to forge
// messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
	auth := New(key)
	auth.Write(m)
	// out[:0] is empty but backed by *out, so the appended tag is written
	// into out itself.
	auth.Sum(out[:0])
}
// Verify reports whether mac is a valid authenticator for m under the given
// key. The comparison is done in constant time.
func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
	var computed [16]byte
	Sum(&computed, m, key)
	match := subtle.ConstantTimeCompare(computed[:], mac[:])
	return match == 1
}
// New returns a MAC that computes the authentication tag of everything
// written to it under the given key. It lets callers feed the message
// incrementally instead of as one slice; most users should simply call Sum.
//
// The key must be unique for each message, as authenticating
// two different messages with the same key allows an attacker
// to forge messages at will.
func New(key *[32]byte) *MAC {
	m := new(MAC)
	initialize(key, &m.macState)
	return m
}
// MAC is an io.Writer computing an authentication tag
// of the data written to it.
//
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
// Sum or Verify causes it to panic.
type MAC struct {
mac // platform-dependent implementation
// finalized is set by Sum and Verify; once it is true, Write panics.
finalized bool
}
// Size returns the length in bytes of the tag produced by Sum.
func (h *MAC) Size() int {
	return TagSize
}
// Write absorbs p into the running message authentication code.
// It never returns an error.
//
// It panics if called after the first call of Sum or Verify.
func (h *MAC) Write(p []byte) (n int, err error) {
	if !h.finalized {
		return h.mac.Write(p)
	}
	panic("poly1305: write to MAC after Sum or Verify")
}
// Sum appends the authenticator of all data written so far to b and returns
// the extended slice. It marks the MAC as finalized, so later Writes panic.
func (h *MAC) Sum(b []byte) []byte {
	var tag [TagSize]byte
	h.mac.Sum(&tag)
	h.finalized = true
	b = append(b, tag[:]...)
	return b
}
// Verify reports whether the authenticator of all data written so far equals
// expected, comparing in constant time. It marks the MAC as finalized, so
// later Writes panic.
func (h *MAC) Verify(expected []byte) bool {
	var tag [TagSize]byte
	h.mac.Sum(&tag)
	h.finalized = true
	match := subtle.ConstantTimeCompare(expected, tag[:])
	return match == 1
}

View File

@@ -0,0 +1,48 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
package poly1305
// update absorbs msg into state. It has no Go body; it is the
// platform-specific replacement for updateGeneric.
//
//go:noescape
func update(state *macState, msg []byte)
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
// using function pointers would carry a major performance cost.
type mac struct{ macGeneric }
// Write splits p into TagSize chunks and passes them to update, buffering
// any incomplete chunk. It always reports len(p) bytes written and a nil
// error.
func (h *mac) Write(p []byte) (int, error) {
	written := len(p)

	// Top up a partially filled buffer before touching whole chunks.
	if h.offset > 0 {
		filled := copy(h.buffer[h.offset:], p)
		h.offset += filled
		if h.offset < TagSize {
			// Still incomplete: all of p has been buffered.
			return written, nil
		}
		p = p[filled:]
		h.offset = 0
		update(&h.macState, h.buffer[:])
	}

	// Absorb every complete chunk straight from p.
	if whole := len(p) / TagSize * TagSize; whole > 0 {
		update(&h.macState, p[:whole])
		p = p[whole:]
	}

	// Keep the incomplete tail for a later Write or Sum.
	if len(p) != 0 {
		h.offset += copy(h.buffer[h.offset:], p)
	}
	return written, nil
}
// Sum writes the tag for the data absorbed so far into out. It works on a
// copy of the state, so the receiver is left unmodified.
func (h *mac) Sum(out *[16]byte) {
	snapshot := h.macState
	if h.offset > 0 {
		// Absorb the buffered, incomplete final chunk.
		pending := h.buffer[:h.offset]
		update(&snapshot, pending)
	}
	finalize(out, &snapshot.h, &snapshot.s)
}

View File

@@ -0,0 +1,109 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"
// POLY1305_ADD adds the 16 bytes at msg to the accumulator h0:h1:h2, adds 1
// (plus the carry) to h2, and advances msg by 16.
#define POLY1305_ADD(msg, h0, h1, h2) \
ADDQ 0(msg), h0; \
ADCQ 8(msg), h1; \
ADCQ $1, h2; \
LEAQ 16(msg), msg
// POLY1305_MUL multiplies the accumulator h0:h1:h2 by r0:r1 and folds the
// bits above the low two bits of the third limb back in: it adds them once
// still shifted left by two and once shifted right by two. The result is
// left in h0:h1:h2. t0-t3 are scratch; AX and DX are overwritten by MULQ.
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
MOVQ r0, AX; \
MULQ h0; \
MOVQ AX, t0; \
MOVQ DX, t1; \
MOVQ r0, AX; \
MULQ h1; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ r0, t2; \
IMULQ h2, t2; \
ADDQ DX, t2; \
\
MOVQ r1, AX; \
MULQ h0; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ DX, h0; \
MOVQ r1, t3; \
IMULQ h2, t3; \
MOVQ r1, AX; \
MULQ h1; \
ADDQ AX, t2; \
ADCQ DX, t3; \
ADDQ h0, t2; \
ADCQ $0, t3; \
\
MOVQ t0, h0; \
MOVQ t1, h1; \
MOVQ t2, h2; \
ANDQ $3, h2; \
MOVQ t2, t0; \
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
ADDQ t0, h0; \
ADCQ t3, h1; \
ADCQ $0, h2; \
SHRQ $2, t3, t2; \
SHRQ $2, t3; \
ADDQ t2, h0; \
ADCQ t3, h1; \
ADCQ $0, h2
// func update(state *[7]uint64, msg []byte)
//
// update absorbs msg into the accumulator stored in the first three words of
// state, using r from words 3 and 4. Only h0-h2 are written back. A trailing
// chunk shorter than 16 bytes is padded with a single 1 byte and absorbed as
// the final chunk.
TEXT ·update(SB), $0-32
MOVQ state+0(FP), DI
MOVQ msg_base+8(FP), SI
MOVQ msg_len+16(FP), R15
MOVQ 0(DI), R8 // h0
MOVQ 8(DI), R9 // h1
MOVQ 16(DI), R10 // h2
MOVQ 24(DI), R11 // r0
MOVQ 32(DI), R12 // r1
CMPQ R15, $16
JB bytes_between_0_and_15
loop:
POLY1305_ADD(SI, R8, R9, R10)
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
SUBQ $16, R15
CMPQ R15, $16
JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
// Build the padded final chunk in CX:BX. BX starts as 1 (the padding byte)
// and the remaining message bytes are shifted in from last to first.
MOVQ $1, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
SHLQ $8, BX, CX
SHLQ $8, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
ADDQ BX, R8
ADCQ CX, R9
ADCQ $0, R10
// Set the remaining length to 16 so that the SUBQ after the multiply brings
// it to zero and the routine falls through to done.
MOVQ $16, R15
JMP multiply
done:
MOVQ R8, 0(DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
RET

View File

@@ -0,0 +1,309 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file provides the generic implementation of Sum and MAC. Other files
// might provide optimized assembly implementations of some of this code.
package poly1305
import "encoding/binary"
// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
// for a 64 bytes message is approximately
//
// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
//
// for some secret r and s. It can be computed sequentially like
//
// for len(msg) > 0:
// h += read(msg, 16)
// h *= r
// h %= 2¹³⁰ - 5
// return h + s
//
// All the complexity is about doing performant constant-time math on numbers
// larger than any available numeric type.
// sumGeneric computes the Poly1305 tag of msg under key with the generic
// implementation and stores it in out.
func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
	m := newMACGeneric(key)
	m.Write(msg)
	m.Sum(out)
}
// newMACGeneric returns a macGeneric whose state has been initialized from
// key.
func newMACGeneric(key *[32]byte) macGeneric {
	var m macGeneric
	initialize(key, &m.macState)
	return m
}
// macState holds numbers in saturated 64-bit little-endian limbs. That is,
// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
type macState struct {
// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
// can grow larger during and after rounds. It must, however, remain below
// 2 * (2¹³⁰ - 5).
h [3]uint64
// r and s are the private key components.
r [2]uint64
s [2]uint64
}
// macGeneric is the portable MAC implementation: the accumulator and key in
// macState, plus a buffer for a chunk that is not yet TagSize bytes long.
type macGeneric struct {
macState
// buffer holds the bytes of the current incomplete chunk.
buffer [TagSize]byte
// offset is the number of valid bytes at the start of buffer.
offset int
}
// Write splits p into TagSize chunks and passes them to updateGeneric,
// buffering any incomplete chunk. It always reports len(p) bytes written and
// a nil error.
func (h *macGeneric) Write(p []byte) (int, error) {
	written := len(p)

	// Top up a partially filled buffer before touching whole chunks.
	if h.offset > 0 {
		filled := copy(h.buffer[h.offset:], p)
		h.offset += filled
		if h.offset < TagSize {
			// Still incomplete: all of p has been buffered.
			return written, nil
		}
		p = p[filled:]
		h.offset = 0
		updateGeneric(&h.macState, h.buffer[:])
	}

	// Absorb every complete chunk straight from p.
	if whole := len(p) / TagSize * TagSize; whole > 0 {
		updateGeneric(&h.macState, p[:whole])
		p = p[whole:]
	}

	// Keep the incomplete tail for a later Write or Sum.
	if len(p) != 0 {
		h.offset += copy(h.buffer[h.offset:], p)
	}
	return written, nil
}
// Sum absorbs the buffered incomplete chunk, if any, and writes the MAC to
// out. It operates on a copy of the state, so the receiver is unchanged and
// Sum may be called more than once, even though no Write is allowed after
// Sum.
func (h *macGeneric) Sum(out *[TagSize]byte) {
	snapshot := h.macState
	if h.offset > 0 {
		pending := h.buffer[:h.offset]
		updateGeneric(&snapshot, pending)
	}
	finalize(out, &snapshot.h, &snapshot.s)
}
// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
// clears some bits of the secret coefficient to make it possible to implement
// multiplication more efficiently.
const (
rMask0 = 0x0FFFFFFC0FFFFFFF
rMask1 = 0x0FFFFFFC0FFFFFFC
)
// initialize splits the 256-bit key into the two 128-bit secrets: r, taken
// from the first 16 bytes and clamped with rMask0/rMask1, and s, taken from
// the last 16 bytes unchanged. Both are read as little-endian 64-bit limbs.
func initialize(key *[32]byte, m *macState) {
	le := binary.LittleEndian
	m.r = [2]uint64{
		le.Uint64(key[0:8]) & rMask0,
		le.Uint64(key[8:16]) & rMask1,
	}
	m.s = [2]uint64{
		le.Uint64(key[16:24]),
		le.Uint64(key[24:32]),
	}
}
// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
// bits.Mul64 and bits.Add64 intrinsics.
type uint128 struct {
lo, hi uint64
}
// mul64 returns the full 128-bit product of a and b.
func mul64(a, b uint64) uint128 {
	var prod uint128
	prod.hi, prod.lo = bitsMul64(a, b)
	return prod
}
// add128 returns a + b. It panics if the sum does not fit in 128 bits.
func add128(a, b uint128) uint128 {
	var (
		sum   uint128
		carry uint64
	)
	sum.lo, carry = bitsAdd64(a.lo, b.lo, 0)
	sum.hi, carry = bitsAdd64(a.hi, b.hi, carry)
	if carry != 0 {
		panic("poly1305: unexpected overflow")
	}
	return sum
}
// shiftRightBy2 returns a >> 2, moving the two low bits of the high limb
// into the top of the low limb.
func shiftRightBy2(a uint128) uint128 {
	return uint128{
		lo: a.lo>>2 | (a.hi&3)<<62,
		hi: a.hi >> 2,
	}
}
// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
// 128 bits of message, it computes
//
// h₊ = (h + m) * r mod 2¹³⁰ - 5
//
// If the msg length is not a multiple of TagSize, it assumes the last
// incomplete chunk is the final one.
//
// Only state.h is modified; state.r is read and state.s is not used.
func updateGeneric(state *macState, msg []byte) {
h0, h1, h2 := state.h[0], state.h[1], state.h[2]
r0, r1 := state.r[0], state.r[1]
for len(msg) > 0 {
var c uint64
// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
// reduced at the end of the multiplication below.
//
// The spec requires us to set a bit just above the message size, not to
// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
// add 1 to the most significant (2¹²⁸) limb, h2.
if len(msg) >= TagSize {
h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
h2 += c + 1
msg = msg[TagSize:]
} else {
// Incomplete final chunk: copy it into a zeroed buffer and place the
// marker as a single 1 byte directly after the message bytes, so
// nothing extra is added to h2 beyond the carry.
var buf [TagSize]byte
copy(buf[:], msg)
buf[len(msg)] = 1
h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
h2 += c
msg = nil
}
// Multiplication of big number limbs is similar to elementary school
// columnar multiplication. Instead of digits, there are 64-bit limbs.
//
// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
//
//                        h2    h1    h0  x
//                              r1    r0  =
//                       ----------------
//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
//            +   h2r1  h1r1  h0r1
//               ------------------------
//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
//               ------------------------
//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
//     +         m3.lo m2.lo m1.lo m0.lo
//        -------------------------------
//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
//
// The main difference from pen-and-paper multiplication is that we do
// carry propagation in a separate step, as if we wrote two digit sums
// at first (the 128-bit limbs), and then carried the tens all at once.
h0r0 := mul64(h0, r0)
h1r0 := mul64(h1, r0)
h2r0 := mul64(h2, r0)
h0r1 := mul64(h0, r1)
h1r1 := mul64(h1, r1)
h2r1 := mul64(h2, r1)
// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
// top 4 bits cleared by rMask{0,1}, we know that their product is not going
// to overflow 64 bits, so we can ignore the high part of the products.
//
// This also means that the product doesn't have a fifth limb (t4).
if h2r0.hi != 0 {
panic("poly1305: unexpected overflow")
}
if h2r1.hi != 0 {
panic("poly1305: unexpected overflow")
}
m0 := h0r0
m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
m3 := h2r1
t0 := m0.lo
t1, c := bitsAdd64(m1.lo, m0.hi, 0)
t2, c := bitsAdd64(m2.lo, m1.hi, c)
t3, _ := bitsAdd64(m3.lo, m2.hi, c)
// Now we have the result as 4 64-bit limbs, and we need to reduce it
// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
// a cheap partial reduction according to the reduction identity
//
// c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5
//
// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
// assumptions we make about h in the rest of the code.
//
// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
// Note that the carry bits are effectively shifted left by 2, in other
// words, cc = c * 4 for the c in the reduction identity.
h0, h1, h2 = t0, t1, t2&maskLow2Bits
cc := uint128{t2 & maskNotLow2Bits, t3}
// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
h0, c = bitsAdd64(h0, cc.lo, 0)
h1, c = bitsAdd64(h1, cc.hi, c)
h2 += c
cc = shiftRightBy2(cc)
h0, c = bitsAdd64(h0, cc.lo, 0)
h1, c = bitsAdd64(h1, cc.hi, c)
h2 += c
// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
//
// 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
}
state.h[0], state.h[1], state.h[2] = h0, h1, h2
}
// maskLow2Bits selects the two low bits of a limb and maskNotLow2Bits
// everything above them. updateGeneric uses them to split the third limb of
// the product at the 2¹³⁰ mark.
const (
maskLow2Bits uint64 = 0x0000000000000003
maskNotLow2Bits uint64 = ^maskLow2Bits
)
// select64 returns x if v == 1 and y if v == 0, in constant time.
// v must be 0 or 1.
func select64(v, x, y uint64) uint64 {
	// mask is all zeroes when v == 1 and all ones when v == 0.
	mask := v - 1
	return ^mask&x | mask&y
}
// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
const (
p0 = 0xFFFFFFFFFFFFFFFB
p1 = 0xFFFFFFFFFFFFFFFF
p2 = 0x0000000000000003
)
// finalize completes the modular reduction of h and computes
//
//	out = h + s mod 2¹²⁸
//
// writing the result to out as two little-endian 64-bit words.
func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
	lo, hi, top := h[0], h[1], h[2]

	// After the partial reduction in updateGeneric, h might be more than
	// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To finish the reduction
	// in constant time, compute t = h - (2¹³⁰ - 5) and keep h if that
	// subtraction underflows, t otherwise.
	var (
		reducedLo, reducedHi uint64
		borrow               uint64
	)
	reducedLo, borrow = bitsSub64(lo, p0, 0)
	reducedHi, borrow = bitsSub64(hi, p1, borrow)
	_, borrow = bitsSub64(top, p2, borrow)

	// h = h if h < p else h - p
	lo = select64(borrow, lo, reducedLo)
	hi = select64(borrow, hi, reducedHi)

	// Last Poly1305 step: tag = h + s mod 2¹²⁸, done as a wide addition on
	// the low 128 bits of h with the final carry discarded.
	var carry uint64
	lo, carry = bitsAdd64(lo, s[0], 0)
	hi, _ = bitsAdd64(hi, s[1], carry)

	binary.LittleEndian.PutUint64(out[0:8], lo)
	binary.LittleEndian.PutUint64(out[8:16], hi)
}

View File

@@ -0,0 +1,48 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
package poly1305
// update absorbs msg into state. It has no Go body; it is the
// platform-specific replacement for updateGeneric.
//
//go:noescape
func update(state *macState, msg []byte)
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
// using function pointers would carry a major performance cost.
type mac struct{ macGeneric }
// Write splits p into TagSize chunks and passes them to update, buffering
// any incomplete chunk. It always reports len(p) bytes written and a nil
// error.
func (h *mac) Write(p []byte) (int, error) {
	written := len(p)

	// Top up a partially filled buffer before touching whole chunks.
	if h.offset > 0 {
		filled := copy(h.buffer[h.offset:], p)
		h.offset += filled
		if h.offset < TagSize {
			// Still incomplete: all of p has been buffered.
			return written, nil
		}
		p = p[filled:]
		h.offset = 0
		update(&h.macState, h.buffer[:])
	}

	// Absorb every complete chunk straight from p.
	if whole := len(p) / TagSize * TagSize; whole > 0 {
		update(&h.macState, p[:whole])
		p = p[whole:]
	}

	// Keep the incomplete tail for a later Write or Sum.
	if len(p) != 0 {
		h.offset += copy(h.buffer[h.offset:], p)
	}
	return written, nil
}
// Sum writes the tag for the data absorbed so far into out. It works on a
// copy of the state, so the receiver is left unmodified.
func (h *mac) Sum(out *[16]byte) {
	snapshot := h.macState
	if h.offset > 0 {
		// Absorb the buffered, incomplete final chunk.
		pending := h.buffer[:h.offset]
		update(&snapshot, pending)
	}
	finalize(out, &snapshot.h, &snapshot.s)
}

View File

@@ -0,0 +1,182 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"
// This was ported from the amd64 implementation.
// POLY1305_ADD loads the 16 bytes at msg, adds them to the accumulator
// h0:h1:h2, adds 1 (plus the carry) to h2, and advances msg by 16.
// t0-t2 are scratch.
#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
MOVD (msg), t0; \
MOVD 8(msg), t1; \
MOVD $1, t2; \
ADDC t0, h0, h0; \
ADDE t1, h1, h1; \
ADDE t2, h2; \
ADD $16, msg
// POLY1305_MUL multiplies the accumulator h0:h1:h2 by r0:r1 and folds the
// bits above the low two bits of the third limb back in: it adds them once
// still shifted left by two and once shifted right by two. The result is
// left in h0:h1:h2. t0-t5 are scratch.
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
MULLD r0, h0, t0; \
MULLD r0, h1, t4; \
MULHDU r0, h0, t1; \
MULHDU r0, h1, t5; \
ADDC t4, t1, t1; \
MULLD r0, h2, t2; \
ADDZE t5; \
MULHDU r1, h0, t4; \
MULLD r1, h0, h0; \
ADD t5, t2, t2; \
ADDC h0, t1, t1; \
MULLD h2, r1, t3; \
ADDZE t4, h0; \
MULHDU r1, h1, t5; \
MULLD r1, h1, t4; \
ADDC t4, t2, t2; \
ADDE t5, t3, t3; \
ADDC h0, t2, t2; \
MOVD $-4, t4; \
MOVD t0, h0; \
MOVD t1, h1; \
ADDZE t3; \
ANDCC $3, t2, h2; \
AND t2, t4, t0; \
ADDC t0, h0, h0; \
ADDE t3, h1, h1; \
SLD $62, t3, t4; \
SRD $2, t2; \
ADDZE h2; \
OR t4, t2, t2; \
SRD $2, t3; \
ADDC t2, h0, h0; \
ADDE t3, h1, h1; \
ADDZE h2
// poly1305Mask holds two 64-bit mask words.
// NOTE(review): the values equal the r clamping mask of the generic Go
// implementation, and the symbol is not referenced by the update routine in
// this file; confirm whether it is still needed.
DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
GLOBL ·poly1305Mask<>(SB), RODATA, $16
// func update(state *[7]uint64, msg []byte)
//
// update absorbs msg into the accumulator stored in the first three words of
// state, using r from words 3 and 4. Only h0-h2 are written back. A trailing
// chunk shorter than 16 bytes is assembled in R16 (low 8 bytes) and R17
// (high 8 bytes) with a single 1 bit inserted directly above the message
// bytes, and is then absorbed as the final chunk.
TEXT ·update(SB), $0-32
MOVD state+0(FP), R3
MOVD msg_base+8(FP), R4
MOVD msg_len+16(FP), R5
MOVD 0(R3), R8 // h0
MOVD 8(R3), R9 // h1
MOVD 16(R3), R10 // h2
MOVD 24(R3), R11 // r0
MOVD 32(R3), R12 // r1
CMP R5, $16
BLT bytes_between_0_and_15
loop:
POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
ADD $-16, R5
CMP R5, $16
BGE loop
bytes_between_0_and_15:
CMP R5, $0
BEQ done
MOVD $0, R16 // h0
MOVD $0, R17 // h1
flush_buffer:
CMP R5, $8
BLE just1
MOVD $8, R21
SUB R21, R5, R21
// Greater than 8 -- load the rightmost remaining bytes in msg
// and put into R17 (h1)
MOVD (R4)(R21), R17
MOVD $16, R22
// Find the offset to those bytes
SUB R5, R22, R22
SLD $3, R22
// Shift to get only the bytes in msg
SRD R22, R17, R17
// Put 1 at high end
MOVD $1, R23
SLD $3, R21
SLD R21, R23, R23
OR R23, R17, R17
// Remainder is 8
MOVD $8, R5
just1:
CMP R5, $8
BLT less8
// Exactly 8
MOVD (R4), R16
CMP R17, $0
// Check if we've already set R17; if not
// set 1 to indicate end of msg.
BNE carry
MOVD $1, R17
BR carry
less8:
// Fewer than 8 bytes remain: gather them into R16 in pieces of 4, 2 and 1
// bytes, tracking in R22 the bit position for the next piece.
MOVD $0, R16 // h0
MOVD $0, R22 // shift count
CMP R5, $4
BLT less4
MOVWZ (R4), R16
ADD $4, R4
ADD $-4, R5
MOVD $32, R22
less4:
CMP R5, $2
BLT less2
MOVHZ (R4), R21
SLD R22, R21, R21
OR R16, R21, R16
ADD $16, R22
ADD $-2, R5
ADD $2, R4
less2:
CMP R5, $0
BEQ insert1
MOVBZ (R4), R21
SLD R22, R21, R21
OR R16, R21, R16
ADD $8, R22
insert1:
// Insert 1 at end of msg
MOVD $1, R21
SLD R22, R21, R21
OR R16, R21, R16
carry:
// Add new values to h0, h1, h2
ADDC R16, R8
ADDE R17, R9
ADDZE R10, R10
// Set the remaining length to 16 so that the ADD $-16 after the multiply
// brings it to zero and the routine falls through to done.
MOVD $16, R5
ADD R5, R4
BR multiply
done:
// Save h0, h1, h2 in state
MOVD R8, 0(R3)
MOVD R9, 8(R3)
MOVD R10, 16(R3)
RET

View File

@@ -0,0 +1,77 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
package poly1305
import (
"golang.org/x/sys/cpu"
)
// updateVX is an assembly implementation of Poly1305 that uses vector
// instructions. It must only be called if the vector facility (vx) is
// available.
//
// It absorbs msg into state. The declaration has no Go body, and the
// go:noescape directive below records that the function does not let its
// pointer arguments escape.
//
//go:noescape
func updateVX(state *macState, msg []byte)
// mac is a replacement for macGeneric that uses a larger buffer and redirects
// calls that would have gone to updateGeneric to updateVX if the vector
// facility is installed.
//
// A larger buffer is required for good performance because the vector
// implementation has a higher fixed cost per call than the generic
// implementation.
type mac struct {
// macState is the running state handed to updateVX/updateGeneric; its h
// and s fields are what Sum passes to finalize.
macState
// buffer stages input that has not yet been handed to an update routine.
buffer [16 * TagSize]byte // size must be a multiple of block size (16)
// offset is the number of valid bytes currently held in buffer.
offset int
}
// Write absorbs p into the running MAC. It always returns len(p) and a nil
// error.
//
// Input is staged through h.buffer so that, within Write, the update routines
// only ever see whole multiples of len(h.buffer): a partially filled buffer
// is topped up and flushed first, then the largest buffer-multiple prefix of
// what remains is processed directly from p, and the leftover bytes are kept
// in h.buffer for the next call.
func (h *mac) Write(p []byte) (int, error) {
	written := len(p)

	if h.offset > 0 {
		copied := copy(h.buffer[h.offset:], p)
		if filled := h.offset + copied; filled < len(h.buffer) {
			// Still short of a full buffer: keep accumulating.
			h.offset = filled
			return written, nil
		}
		// The buffer is full: flush it and continue with the rest of p.
		p, h.offset = p[copied:], 0
		if cpu.S390X.HasVX {
			updateVX(&h.macState, h.buffer[:])
		} else {
			updateGeneric(&h.macState, h.buffer[:])
		}
	}

	leftover := len(p) % len(h.buffer) // bytes to keep for the next call
	direct := p[:len(p)-leftover]      // whole buffers' worth, processed now
	if len(direct) > 0 {
		if cpu.S390X.HasVX {
			updateVX(&h.macState, direct)
		} else {
			updateGeneric(&h.macState, direct)
		}
	}

	// Stash the tail; this copies 0 bytes when p was a whole multiple.
	h.offset = copy(h.buffer[:], p[len(direct):])
	return written, nil
}
// Sum finalizes the MAC over everything written so far and stores the tag in
// out. The buffered bytes are absorbed into a local copy of h.macState, not
// into h itself.
func (h *mac) Sum(out *[TagSize]byte) {
	snapshot := h.macState
	pending := h.buffer[:h.offset]

	switch n := len(pending); {
	case n == 0:
		// Nothing buffered; finalize the state as it stands.
	case n > 2*TagSize && cpu.S390X.HasVX:
		// More than two blocks left: the vector routine is worth its
		// higher startup cost.
		updateVX(&snapshot, pending)
	default:
		// Two or fewer blocks, or no vector facility.
		updateGeneric(&snapshot, pending)
	}

	finalize(out, &snapshot.h, &snapshot.s)
}

View File

@@ -0,0 +1,504 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"
// This implementation of Poly1305 uses the vector facility (vx)
// to process up to 2 blocks (32 bytes) per iteration using an
// algorithm based on the one described in:
//
// NEON crypto, Daniel J. Bernstein & Peter Schwabe
// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
//
// This algorithm uses 5 26-bit limbs to represent a 130-bit
// value. These limbs are, for the most part, zero extended and
// placed into 64-bit vector register elements. Each vector
// register is 128-bits wide and so holds 2 of these elements.
// Using 26-bit limbs allows us plenty of headroom to accommodate
// accumulations before and after multiplication without
// overflowing either 32-bits (before multiplication) or 64-bits
// (after multiplication).
//
// In order to parallelise the operations required to calculate
// the sum we use two separate accumulators and then sum those
// in an extra final step. For compatibility with the generic
// implementation we perform this summation at the end of every
// updateVX call.
//
// To use two accumulators we must multiply the message blocks
// by r² rather than r. Only the final message block should be
// multiplied by r.
//
// Example:
//
// We want to calculate the sum (h) for a 64 byte message (m):
//
//   h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
//
// To do this we split the calculation into the even indices
// and odd indices of the message. These form our SIMD 'lanes':
//
//   h = m[ 0:16]r⁴ + m[32:48]r² +   <- lane 0
//       m[16:32]r³ + m[48:64]r      <- lane 1
//
// To calculate this iteratively we refactor so that both lanes
// are written in terms of r² and r:
//
//   h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
//       (m[16:32]r² + m[48:64])r    <- lane 1
//                ^             ^
//                |             coefficients for second iteration
//                coefficients for first iteration
//
// So in this case we would have two iterations. In the first
// both lanes are multiplied by r². In the second only the
// first lane is multiplied by r² and the second lane is
// instead multiplied by r. This gives us the odd and even
// powers of r that we need from the original equation.
//
// Notation:
//
// h - accumulator
// r - key
// m - message
//
// [a, b] - SIMD register holding two 64-bit values
// [a, b, c, d] - SIMD register holding four 32-bit values
//   x[n]         - limb n of variable x; the limb width (64-bit state
//                  words or 26-bit limbs) is given by context
//
// Limbs are expressed in little endian order, so for 26-bit
// limbs x[4] will be the most significant limb and x[0]
// will be the least significant limb.
// Register aliases. Every vector register used by updateVX and by the
// MULTIPLY, REDUCE and EXPAND macros is referred to through one of the
// names below (V0-V28).
// masking constants
#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
// expansion constants (see EXPAND macro)
#define EX0 V2
#define EX1 V3
#define EX2 V4
// key (r², r or 1 depending on context)
#define R_0 V5
#define R_1 V6
#define R_2 V7
#define R_3 V8
#define R_4 V9
// precalculated coefficients (5r², 5r or 0 depending on context)
#define R5_1 V10
#define R5_2 V11
#define R5_3 V12
#define R5_4 V13
// message block (m)
#define M_0 V14
#define M_1 V15
#define M_2 V16
#define M_3 V17
#define M_4 V18
// accumulator (h)
#define H_0 V19
#define H_1 V20
#define H_2 V21
#define H_3 V22
#define H_4 V23
// temporary registers (for short-lived values)
#define T_0 V24
#define T_1 V25
#define T_2 V26
#define T_3 V27
#define T_4 V28
// constants holds the three 16-byte permutation selectors EX0, EX1 and EX2
// back to back (0x30 bytes in total). updateVX loads all three with a single
// VLM and EXPAND uses them as the index operand of VPERM.
//
// In each selector the first 8 bytes use indices below 0x10 and the last 8
// bytes use indices of 0x10 and above, which matches EXPAND placing in0 in
// the first lane and in1 in the second lane of its outputs.
GLOBL ·constants<>(SB), RODATA, $0x30
// EX0
DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
// EX1
DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
// EX2
DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
// MULTIPLY multiplies each lane of f and g, partially reduced
// modulo 2¹³⁰ - 5. The result, h, consists of partial products
// in each lane that need to be reduced further to produce the
// final result.
//
//   h = (fg) % 2¹³⁰ + (5fg) / 2¹³⁰
//
// Note that the multiplication by 5 of the high bits is
// achieved by precalculating the multiplication of four of the
// g coefficients by 5. These are g51-g54.
//
// The products involving f0, f2 and f4 are accumulated directly
// in h0-h4 and those involving f1 and f3 in T_0-T_4; the two
// sets are added together at the end. T_0-T_4 are therefore
// overwritten by this macro.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
VMLOF f0, g0, h0 \
VMLOF f0, g3, h3 \
VMLOF f0, g1, h1 \
VMLOF f0, g4, h4 \
VMLOF f0, g2, h2 \
VMLOF f1, g54, T_0 \
VMLOF f1, g2, T_3 \
VMLOF f1, g0, T_1 \
VMLOF f1, g3, T_4 \
VMLOF f1, g1, T_2 \
VMALOF f2, g53, h0, h0 \
VMALOF f2, g1, h3, h3 \
VMALOF f2, g54, h1, h1 \
VMALOF f2, g2, h4, h4 \
VMALOF f2, g0, h2, h2 \
VMALOF f3, g52, T_0, T_0 \
VMALOF f3, g0, T_3, T_3 \
VMALOF f3, g53, T_1, T_1 \
VMALOF f3, g1, T_4, T_4 \
VMALOF f3, g54, T_2, T_2 \
VMALOF f4, g51, h0, h0 \
VMALOF f4, g54, h3, h3 \
VMALOF f4, g52, h1, h1 \
VMALOF f4, g0, h4, h4 \
VMALOF f4, g53, h2, h2 \
VAG T_0, h0, h0 \
VAG T_3, h3, h3 \
VAG T_1, h1, h1 \
VAG T_4, h4, h4 \
VAG T_2, h2, h2
// REDUCE performs the following carry operations in four
// stages, as specified in Bernstein & Schwabe:
//
//   1: h[0]->h[1] h[3]->h[4]
//   2: h[1]->h[2] h[4]->h[0]
//   3: h[0]->h[1] h[2]->h[3]
//   4: h[3]->h[4]
//
// The result is that all of the limbs are limited to 26-bits
// except for h[1] and h[4] which are limited to 27-bits.
//
// Note that although each limb is aligned at 26-bit intervals
// they may contain values that exceed 2²⁶ - 1, hence the need
// to carry the excess bits in each limb.
//
// The carry out of h[4] wraps around to h[0] multiplied by 5
// (computed as carry + carry<<2). The macro reads MOD26 and
// overwrites T_0-T_4.
#define REDUCE(h0, h1, h2, h3, h4) \
VESRLG $26, h0, T_0 \
VESRLG $26, h3, T_1 \
VN MOD26, h0, h0 \
VN MOD26, h3, h3 \
VAG T_0, h1, h1 \
VAG T_1, h4, h4 \
VESRLG $26, h1, T_2 \
VESRLG $26, h4, T_3 \
VN MOD26, h1, h1 \
VN MOD26, h4, h4 \
VESLG $2, T_3, T_4 \
VAG T_3, T_4, T_4 \
VAG T_2, h2, h2 \
VAG T_4, h0, h0 \
VESRLG $26, h2, T_0 \
VESRLG $26, h0, T_1 \
VN MOD26, h2, h2 \
VN MOD26, h0, h0 \
VAG T_0, h3, h3 \
VAG T_1, h1, h1 \
VESRLG $26, h3, T_2 \
VN MOD26, h3, h3 \
VAG T_2, h4, h4
// EXPAND splits the 128-bit little-endian values in0 and in1
// into 26-bit big-endian limbs and places the results into
// the first and second lane of d[0:4] respectively.
//
// The EX0, EX1 and EX2 constants are arrays of byte indices
// for permutation. The permutation both reverses the bytes
// in the input and ensures the bytes are copied into the
// destination limb ready to be shifted into their final
// position.
//
// The macro reads EX0-EX2, MOD24 and MOD26, so these must be
// loaded beforehand. The top limb d4 is masked with MOD24, so it
// carries only 24 bits of input; callers set the bit above the
// block separately after expansion where it is required.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
VPERM in0, in1, EX0, d0 \
VPERM in0, in1, EX1, d2 \
VPERM in0, in1, EX2, d4 \
VESRLG $26, d0, d1 \
VESRLG $30, d2, d3 \
VESRLG $4, d2, d2 \
VN MOD26, d0, d0 \ // [in0[0], in1[0]]
VN MOD26, d3, d3 \ // [in0[3], in1[3]]
VN MOD26, d1, d1 \ // [in0[1], in1[1]]
VN MOD24, d4, d4 \ // [in0[4], in1[4]]
VN MOD26, d2, d2 // [in0[2], in1[2]]
// func updateVX(state *macState, msg []byte)
//
// updateVX absorbs msg into the accumulator stored in state using vector
// instructions. h is read from offsets 0, 8 and 16 of state as 64-bit
// words and r from offsets 24 and 32; only the three h words are written
// back.
//
// Control flow: while more than 32 bytes remain, the loop consumes two
// blocks per iteration. The last 17-32 bytes are handled at b2 and a final
// 1-16 bytes at b1; both set up the key coefficients for the last
// multiplication, set R3 to 0 and jump back to multiply, after which
// execution falls through to finish. Messages of at most 16 bytes skip the
// r² calculation and reach b1 (or finish, when empty) through skip.
//
// General registers: R1 = state, R2 = msg pointer, R3 = bytes remaining,
// R0/R5/R12 = scratch.
TEXT ·updateVX(SB), NOSPLIT, $0
MOVD state+0(FP), R1
LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
// load EX0, EX1 and EX2
MOVD $·constants<>(SB), R5
VLM (R5), EX0, EX2
// generate masks
VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
// load h (accumulator) and r (key) from state
VZERO T_1 // [0, 0]
VL 0(R1), T_0 // [h[0], h[1]]
VLEG $0, 16(R1), T_1 // [h[2], 0]
VL 24(R1), T_2 // [r[0], r[1]]
VPDI $0, T_0, T_2, T_3 // [h[0], r[0]]
VPDI $5, T_0, T_2, T_4 // [h[1], r[1]]
// unpack h and r into 26-bit limbs
// note: the 64-bit word h[2] may have the low 3 bits set, so the unpacked
// limb h[4] (nominally 26 bits) is a 27-bit value
VN MOD26, T_3, H_0 // [h[0], r[0]]
VZERO H_1 // [0, 0]
VZERO H_3 // [0, 0]
VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
VESLG $24, T_1, T_1 // [h[2]<<24, 0]
VERIMG $-26&63, T_3, MOD26, H_1 // [h[1], r[1]]
VESRLG $+52&63, T_3, H_2 // [h[2], r[2]] - low 12 bits only
VERIMG $-14&63, T_4, MOD26, H_3 // [h[3], r[3]]
VESRLG $40, T_4, H_4 // [h[4], r[4]] - low 24 bits only
VERIMG $+12&63, T_4, T_0, H_2 // [h[2], r[2]] - complete
VO T_1, H_4, H_4 // [h[4], r[4]] - complete
// replicate r across all 4 vector elements
VREPF $3, H_0, R_0 // [r[0], r[0], r[0], r[0]]
VREPF $3, H_1, R_1 // [r[1], r[1], r[1], r[1]]
VREPF $3, H_2, R_2 // [r[2], r[2], r[2], r[2]]
VREPF $3, H_3, R_3 // [r[3], r[3], r[3], r[3]]
VREPF $3, H_4, R_4 // [r[4], r[4], r[4], r[4]]
// zero out lane 1 of h
VLEIG $1, $0, H_0 // [h[0], 0]
VLEIG $1, $0, H_1 // [h[1], 0]
VLEIG $1, $0, H_2 // [h[2], 0]
VLEIG $1, $0, H_3 // [h[3], 0]
VLEIG $1, $0, H_4 // [h[4], 0]
// calculate 5r (ignore least significant limb)
VREPIF $5, T_0
VMLF T_0, R_1, R5_1 // [5r[1], 5r[1], 5r[1], 5r[1]]
VMLF T_0, R_2, R5_2 // [5r[2], 5r[2], 5r[2], 5r[2]]
VMLF T_0, R_3, R5_3 // [5r[3], 5r[3], 5r[3], 5r[3]]
VMLF T_0, R_4, R5_4 // [5r[4], 5r[4], 5r[4], 5r[4]]
// skip r² calculation if we are only calculating one block
CMPBLE R3, $16, skip
// calculate r²
MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
REDUCE(M_0, M_1, M_2, M_3, M_4)
VGBM $0x0f0f, T_0
VERIMG $0, M_0, T_0, R_0 // [r[0], r²[0], r[0], r²[0]]
VERIMG $0, M_1, T_0, R_1 // [r[1], r²[1], r[1], r²[1]]
VERIMG $0, M_2, T_0, R_2 // [r[2], r²[2], r[2], r²[2]]
VERIMG $0, M_3, T_0, R_3 // [r[3], r²[3], r[3], r²[3]]
VERIMG $0, M_4, T_0, R_4 // [r[4], r²[4], r[4], r²[4]]
// calculate 5r² (ignore least significant limb)
VREPIF $5, T_0
VMLF T_0, R_1, R5_1 // [5r[1], 5r²[1], 5r[1], 5r²[1]]
VMLF T_0, R_2, R5_2 // [5r[2], 5r²[2], 5r[2], 5r²[2]]
VMLF T_0, R_3, R5_3 // [5r[3], 5r²[3], 5r[3], 5r²[3]]
VMLF T_0, R_4, R5_4 // [5r[4], 5r²[4], 5r[4], 5r²[4]]
loop:
CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
// load next 2 blocks from message
VLM (R2), T_0, T_1
// update message slice
SUB $32, R3
MOVD $32(R2), R2
// unpack message blocks into 26-bit big-endian limbs
EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
// add 2¹²⁸ to each message block value
VLEIB $4, $1, M_4
VLEIB $12, $1, M_4
multiply:
// accumulate the incoming message
VAG H_0, M_0, M_0
VAG H_3, M_3, M_3
VAG H_1, M_1, M_1
VAG H_4, M_4, M_4
VAG H_2, M_2, M_2
// multiply the accumulator by the key coefficient
MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
// carry and partially reduce the partial products
REDUCE(H_0, H_1, H_2, H_3, H_4)
CMPBNE R3, $0, loop
finish:
// sum lane 0 and lane 1 and put the result in lane 1
VZERO T_0
VSUMQG H_0, T_0, H_0
VSUMQG H_3, T_0, H_3
VSUMQG H_1, T_0, H_1
VSUMQG H_4, T_0, H_4
VSUMQG H_2, T_0, H_2
// reduce again after summation
// TODO(mundaym): there might be a more efficient way to do this
// now that we only have 1 active lane. For example, we could
// simultaneously pack the values as we reduce them.
REDUCE(H_0, H_1, H_2, H_3, H_4)
// carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1
// TODO(mundaym): in testing this final carry was unnecessary.
// Needs a proof before it can be removed though.
VESRLG $26, H_1, T_1
VN MOD26, H_1, H_1
VAQ T_1, H_2, H_2
VESRLG $26, H_2, T_2
VN MOD26, H_2, H_2
VAQ T_2, H_3, H_3
VESRLG $26, H_3, T_3
VN MOD26, H_3, H_3
VAQ T_3, H_4, H_4
// h is now < 2(2¹³⁰ - 5)
// Pack each lane in h[0:4] into h[0:1].
VESLG $26, H_1, H_1
VESLG $26, H_3, H_3
VO H_0, H_1, H_0
VO H_2, H_3, H_2
VESLG $4, H_2, H_2
VLEIB $7, $48, H_1
VSLB H_1, H_2, H_2
VO H_0, H_2, H_0
VLEIB $7, $104, H_1
VSLB H_1, H_4, H_3
VO H_3, H_0, H_0
VLEIB $7, $24, H_1
VSRLB H_1, H_4, H_1
// update state
VSTEG $1, H_0, 0(R1)
VSTEG $0, H_0, 8(R1)
VSTEG $1, H_1, 16(R1)
RET
b2: // 2 or fewer blocks remaining
CMPBLE R3, $16, b1
// Load the 2 remaining blocks (17-32 bytes remaining).
MOVD $-17(R3), R0 // index of final byte to load modulo 16
VL (R2), T_0 // load full 16 byte block
VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
// The Poly1305 algorithm requires that a 1 bit be appended to
// each message block. If the final block is less than 16 bytes
// long then it is easiest to insert the 1 before the message
// block is split into 26-bit limbs. If, on the other hand, the
// final message block is 16 bytes long then we append the 1 bit
// after expansion as normal.
MOVBZ $1, R0
MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16)
CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
VLVGB R3, R0, T_1 // insert 1 into the byte at index R3
// Split both blocks into 26-bit limbs in the appropriate lanes.
EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
// Append a 1 byte to the end of the second to last block.
VLEIB $4, $1, M_4
// Append a 1 byte to the end of the last block only if it is a
// full 16 byte block.
CMPBNE R3, $16, 2(PC)
VLEIB $12, $1, M_4
// Finally, set up the coefficients for the final multiplication.
// We have previously saved r and 5r in the 32-bit even indexes
// of the R_[0-4] and R5_[1-4] coefficient registers.
//
// We want lane 0 to be multiplied by r² so that can be kept the
// same. We want lane 1 to be multiplied by r so we need to move
// the saved r value into the 32-bit odd index in lane 1 by
// rotating the 64-bit lane by 32.
VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only
VERIMG $32, R_0, T_0, R_0 // [_, r²[0], _, r[0]]
VERIMG $32, R_1, T_0, R_1 // [_, r²[1], _, r[1]]
VERIMG $32, R_2, T_0, R_2 // [_, r²[2], _, r[2]]
VERIMG $32, R_3, T_0, R_3 // [_, r²[3], _, r[3]]
VERIMG $32, R_4, T_0, R_4 // [_, r²[4], _, r[4]]
VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²[1], _, 5r[1]]
VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²[2], _, 5r[2]]
VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²[3], _, 5r[3]]
VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²[4], _, 5r[4]]
// R3 = 0 makes the check after the multiply fall through to finish.
MOVD $0, R3
BR multiply
skip:
// At most one block in total: nothing to do for an empty message,
// otherwise fall through to b1.
CMPBEQ R3, $0, finish
b1: // 1 block remaining
// Load the final block (1-16 bytes). This will be placed into
// lane 0.
MOVD $-1(R3), R0
VLL R0, (R2), T_0 // pad to 16 bytes with zeros
// The Poly1305 algorithm requires that a 1 bit be appended to
// each message block. If the final block is less than 16 bytes
// long then it is easiest to insert the 1 before the message
// block is split into 26-bit limbs. If, on the other hand, the
// final message block is 16 bytes long then we append the 1 bit
// after expansion as normal.
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, T_0
// Set the message block in lane 1 to the value 0 so that it
// can be accumulated without affecting the final result.
VZERO T_1
// Split the final message block into 26-bit limbs in lane 0.
// Lane 1 will contain 0.
EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
// Append a 1 byte to the end of the last block only if it is a
// full 16 byte block.
CMPBNE R3, $16, 2(PC)
VLEIB $4, $1, M_4
// We have previously saved r and 5r in the 32-bit even indexes
// of the R_[0-4] and R5_[1-4] coefficient registers.
//
// We want lane 0 to be multiplied by r so we need to move the
// saved r value into the 32-bit odd index in lane 0. We want
// lane 1 to be set to the value 1. This makes multiplication
// a no-op. We do this by setting lane 1 in every register to 0
// and then just setting the 32-bit index 3 in R_0 to 1.
VZERO T_0
MOVD $0, R0
MOVD $0x10111213, R12
VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000]
VPERM T_0, R_0, T_1, R_0 // [_, r[0], _, 0]
VPERM T_0, R_1, T_1, R_1 // [_, r[1], _, 0]
VPERM T_0, R_2, T_1, R_2 // [_, r[2], _, 0]
VPERM T_0, R_3, T_1, R_3 // [_, r[3], _, 0]
VPERM T_0, R_4, T_1, R_4 // [_, r[4], _, 0]
VPERM T_0, R5_1, T_1, R5_1 // [_, 5r[1], _, 0]
VPERM T_0, R5_2, T_1, R5_2 // [_, 5r[2], _, 0]
VPERM T_0, R5_3, T_1, R5_3 // [_, 5r[3], _, 0]
VPERM T_0, R5_4, T_1, R5_4 // [_, 5r[4], _, 0]
// Set the value of lane 1 to be 1.
VLEIF $3, $1, R_0 // [_, r[0], _, 1]
// R3 = 0 makes the check after the multiply fall through to finish.
MOVD $0, R3
BR multiply

91
vendor/golang.org/x/crypto/poly1305/poly1305_compat.go generated vendored Normal file
View File

@@ -0,0 +1,91 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package poly1305 implements Poly1305 one-time message authentication code as
// specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
//
// Poly1305 is a fast, one-time authentication function. It is infeasible for an
// attacker to generate an authenticator for a message without the key. However, a
// key must only be used for a single message. Authenticating two different
// messages with the same key allows an attacker to forge authenticators for other
// messages with the same key.
//
// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
// used with a fixed key in order to generate one-time keys from a nonce.
// However, in this package AES isn't used and the one-time key is specified
// directly.
//
// Deprecated: Poly1305 as implemented by this package is a cryptographic
// building block that is not safe for general purpose use.
// For encryption, use the full ChaCha20-Poly1305 construction implemented by
// golang.org/x/crypto/chacha20poly1305. For authentication, use a general
// purpose MAC such as HMAC implemented by crypto/hmac.
package poly1305 // import "golang.org/x/crypto/poly1305"
import "golang.org/x/crypto/internal/poly1305"
// TagSize is the size, in bytes, of a poly1305 authenticator. It is the
// length of the tag arrays taken by Sum and Verify and the value reported
// by MAC.Size.
//
// For use with golang.org/x/crypto/chacha20poly1305, chacha20poly1305.Overhead
// can be used instead.
const TagSize = 16
// Sum generates an authenticator for m using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
//
// It forwards its arguments unchanged to
// golang.org/x/crypto/internal/poly1305.Sum.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
poly1305.Sum(out, m, key)
}
// Verify returns true if mac is a valid authenticator for m with the given key.
//
// It forwards its arguments unchanged to
// golang.org/x/crypto/internal/poly1305.Verify and returns its result.
func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
return poly1305.Verify(mac, m, key)
}
// New returns a new MAC computing an authentication
// tag of all data written to it with the given key.
// This allows writing the message progressively instead
// of passing it as a single slice. Common users should use
// the Sum function instead.
//
// The key must be unique for each message, as authenticating
// two different messages with the same key allows an attacker
// to forge messages at will.
//
// The returned MAC wraps the value produced by
// golang.org/x/crypto/internal/poly1305.New for the same key.
func New(key *[32]byte) *MAC {
return &MAC{mac: poly1305.New(key)}
}
// MAC is an io.Writer computing an authentication tag
// of the data written to it.
//
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
// Sum or Verify causes it to panic.
type MAC struct {
// mac is the golang.org/x/crypto/internal/poly1305 implementation that
// Write, Sum and Verify delegate to.
mac *poly1305.MAC
}
// Size returns the number of bytes Sum will return. It is always TagSize.
func (h *MAC) Size() int { return TagSize }
// Write adds more data to the running message authentication code.
// It never returns an error.
//
// It must not be called after the first call of Sum or Verify.
//
// The call is forwarded to the wrapped internal MAC, whose results are
// returned unchanged.
func (h *MAC) Write(p []byte) (n int, err error) {
return h.mac.Write(p)
}
// Sum computes the authenticator of all data written to the
// message authentication code.
//
// b is passed through to the wrapped internal MAC's Sum and that call's
// result is returned. NOTE(review): presumably the tag is appended to b, as
// with hash.Hash -- confirm against internal/poly1305.
func (h *MAC) Sum(b []byte) []byte {
return h.mac.Sum(b)
}
// Verify returns whether the authenticator of all data written to
// the message authentication code matches the expected value.
//
// The comparison itself is performed by the wrapped internal MAC's Verify.
func (h *MAC) Verify(expected []byte) bool {
return h.mac.Verify(expected)
}