@@ -34,18 +34,26 @@ CTR4BL: .octa 0x00000000000000000000000000000002
.text
+/**
+ * chacha_2block_xor_avx2 - Encrypt 2 blocks using the x86 AVX2 feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 2 data blocks output, o (%rsi)
+ * @src: address of up to 2 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts two ChaCha blocks by loading the state
+ * matrix twice across four AVX registers. It performs matrix operations
+ * on four words in each matrix in parallel, but requires shuffling to
+ * rearrange the words after each round.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_2block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
vzeroupper
# x0..3[0-2] = s0..3
@@ -226,20 +234,28 @@ SYM_FUNC_START(chacha_2block_xor_avx2)
SYM_FUNC_END(chacha_2block_xor_avx2)
+/**
+ * chacha_4block_xor_avx2 - Encrypt 4 blocks using the x86 AVX2 feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 4 data blocks output, o (%rsi)
+ * @src: address of up to 4 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts four ChaCha blocks by loading the state
+ * matrix four times across eight AVX registers. It performs matrix
+ * operations on four words in two matrices in parallel, sequentially
+ * to the operations on the four words of the other two matrices. The
+ * required word shuffling has a rather high latency, we can do the
+ * arithmetic on two matrix-pairs without much slowdown.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_4block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
vzeroupper
# x0..3[0-4] = s0..3
@@ -531,12 +547,28 @@ SYM_FUNC_START(chacha_4block_xor_avx2)
SYM_FUNC_END(chacha_4block_xor_avx2)
+/**
+ * chacha_8block_xor_avx2 - Encrypt 8 blocks using the x86 AVX2 feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 8 data blocks output, o (%rsi)
+ * @src: address of up to 8 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts four ChaCha blocks by loading the state
+ * matrix four times across eight AVX registers. It performs matrix
+ * operations on four words in two matrices in parallel, sequentially
+ * to the operations on the four words of the other two matrices. The
+ * required word shuffling has a rather high latency, we can do the
+ * arithmetic on two matrix-pairs without much slowdown.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
# This function encrypts eight consecutive ChaCha blocks by loading
# the state matrix in AVX registers eight times. As we need some
@@ -24,18 +24,26 @@ CTR8BL: .octa 0x00000003000000020000000100000000
.text
+/**
+ * chacha_2block_xor_avx512vl - Encrypt 2 blocks using the x86 AVX512VL feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 2 data blocks output, o (%rsi)
+ * @src: address of up to 2 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts two ChaCha blocks by loading the state
+ * matrix twice across four AVX registers. It performs matrix operations
+ * on four words in each matrix in parallel, but requires shuffling to
+ * rearrange the words after each round.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_2block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
vzeroupper
# x0..3[0-2] = s0..3
@@ -189,20 +197,28 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl)
SYM_FUNC_END(chacha_2block_xor_avx512vl)
+/**
+ * chacha_4block_xor_avx512vl - Encrypt 4 blocks using the x86 AVX512VL feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 4 data blocks output, o (%rsi)
+ * @src: address of up to 4 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts four ChaCha blocks by loading the state
+ * matrix four times across eight AVX registers. It performs matrix
+ * operations on four words in two matrices in parallel, sequentially
+ * to the operations on the four words of the other two matrices. The
+ * required word shuffling has a rather high latency, we can do the
+ * arithmetic on two matrix-pairs without much slowdown.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_4block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
vzeroupper
# x0..3[0-4] = s0..3
@@ -455,18 +471,26 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl)
SYM_FUNC_END(chacha_4block_xor_avx512vl)
+/**
+ * chacha_8block_xor_avx512vl - Encrypt 8 blocks using the x86 AVX512VL feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 8 data blocks output, o (%rsi)
+ * @src: address of up to 8 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts eight consecutive ChaCha blocks by loading
+ * the state matrix in AVX registers eight times. Compared to AVX2, this
+ * mostly benefits from the new rotate instructions in VL and the
+ * additional registers.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_8block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. Compared to AVX2, this
- # mostly benefits from the new rotate instructions in VL and the
- # additional registers.
-
vzeroupper
# x0..15[0-7] = s[0..15]
@@ -34,7 +34,6 @@ CTRINC: .octa 0x00000003000000020000000100000000
* Clobbers: %r8d, %xmm4-%xmm7
*/
SYM_FUNC_START_LOCAL(chacha_permute)
-
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
@@ -111,12 +110,21 @@ SYM_FUNC_START_LOCAL(chacha_permute)
RET
SYM_FUNC_END(chacha_permute)
+/**
+ * chacha_block_xor_ssse3 - Encrypt 1 block using the x86 SSSE3 feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 1 data block output, o (%rsi)
+ * @src: address of up to 1 data block input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 1 data block output, o
- # %rdx: up to 1 data block input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
FRAME_BEGIN
# x0..3 = s0..3
@@ -199,10 +207,19 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
SYM_FUNC_END(chacha_block_xor_ssse3)
+/**
+ * hchacha_block_ssse3 - Encrypt 1 block using the x86 SSSE3 feature set
+ * @state: address of input state matrix, s (%rdu)
+ * @out: address of output (8 32-bit words)(%rsi)
+ * @nrounds: number of rounds (%edx);
+ * only uses lower 32 bits
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * Return: none
+ * Prototype: asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+ */
SYM_FUNC_START(hchacha_block_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: output (8 32-bit words)
- # %edx: nrounds
FRAME_BEGIN
movdqu 0x00(%rdi),%xmm0
@@ -220,23 +237,31 @@ SYM_FUNC_START(hchacha_block_ssse3)
RET
SYM_FUNC_END(hchacha_block_ssse3)
+/**
+ * chacha_4block_xor_ssse3 - Encrypt 4 blocks using the x86 SSSE3 feature set
+ * @state: address of input state matrix, s (%rdi)
+ * @dst: address of up to 4 data blocks output, o (%rsi)
+ * @src: address of up to 4 data blocks input, i (%rdx)
+ * @len: input/output length in bytes (%rcx)
+ * @nrounds: number of rounds (%r8d)
+ *
+ * This function supports 64-bit CPUs.
+ *
+ * This function encrypts four consecutive ChaCha blocks by loading the
+ * state matrix in SSE registers four times. As we need some scratch
+ * registers, we save the first four registers on the stack. The
+ * algorithm performs each operation on the corresponding word of each
+ * state matrix, hence requires no word shuffling. For final XORing step
+ * we transpose the matrix by interleaving 32- and then 64-bit words,
+ * which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling,
+ * 7/12-bit word rotation uses traditional shift+OR.
+ *
+ * Return: none
+ * Prototype: asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ * unsigned int len, int nrounds);
+ */
SYM_FUNC_START(chacha_4block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four consecutive ChaCha blocks by loading the
- # the state matrix in SSE registers four times. As we need some scratch
- # registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32- and then 64-bit words,
- # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
- # done with the slightly better performing SSSE3 byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
Add kernel-doc comments for assembly language functions exported to C glue code. Signed-off-by: Robert Elliott <elliott@hpe.com> --- arch/x86/crypto/chacha-avx2-x86_64.S | 90 +++++++++++++++-------- arch/x86/crypto/chacha-avx512vl-x86_64.S | 94 +++++++++++++++--------- arch/x86/crypto/chacha-ssse3-x86_64.S | 75 ++++++++++++------- 3 files changed, 170 insertions(+), 89 deletions(-)