From patchwork Thu Dec 12 21:28:38 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 849841 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 74FB81D6DC8 for ; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; cv=none; b=n4Rrk4jVE+8zUddYYZphKW9i3bzCohFecpPdONHJwXnK8B2JEzk7/RQoWde923L0mXf1E8YX+OVFIPlcepZEZbJFFHSEvqjcj5MsTWcJQniHpbCRf3pJt39OxnJq6JI0DirAKvOdjqHzQwT9T0iKSiF9VAP4ctwFkbUzuSO+2zw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; c=relaxed/simple; bh=Deqd9Udw9e4M29yPDW2Z4eSvU6e0HfZyfV/z7xPajj4=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=CHDZ+K6jHohLHB22cD2F+Ottm1+YOrkaHS+cwfk/KMcKshoIPHmPvLNS9giXHVSy5XasHs4MGErjaKgblO01yE3HQd/PHXWbNiwhyR/C88d6J718Yo2pnWg5RcqAmJh+wSCuUkHZqXBwn1kDY7Wp08rYGSryeMIY0pjtT/GXsK4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=atdpu3/Y; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="atdpu3/Y" Received: by smtp.kernel.org (Postfix) with ESMTPSA id DFCF1C4CED3; Thu, 12 Dec 2024 21:29:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038955; bh=Deqd9Udw9e4M29yPDW2Z4eSvU6e0HfZyfV/z7xPajj4=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=atdpu3/YK52uFuqVJkG3M9CuCClaDBovS/JDBt7PnezTkDELgWIpmliR5301c0pa2 OdsR9t4dHOim6aJEpjytAiqXdow9nKribi66VJk5YJoXYYqw/Zr94wjPCMuqnJPd+F l5ieQf1Qbp1YlwFKgFEomm4UXj0X/LT+JuFrGaCSLKyo+iMMNMmC0lnExxWPfFHl0O RR2fPjhZkFlE0d0spqiNv00BrGQTNXksUNwVez/2rUMjvxdBNr0WXb/nD5MloGS0V6 gM8ONZ7UFjWg+tG9U42z7ufg49X/S+cemX829kelo8nsnyA0+OJEaAJbj5d0nbdWZ/ ykizcfymVOPhQ== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 1/8] crypto: x86/aes-gcm - code size optimization Date: Thu, 12 Dec 2024 13:28:38 -0800 Message-ID: <20241212212845.40333-2-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Prefer immediates of -128 to 128, since the former fits in a signed byte, saving 3 bytes per instruction. Also replace a vpand and vpxor with a vpternlogd. Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-gcm-avx10-x86_64.S | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S index 97e0ee515fc5..8989bf9b8384 100644 --- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -382,12 +382,12 @@ // wide shift instruction, so instead double each of the two 64-bit // halves and incorporate the internal carry bit into the value XOR'd. 
vpshufd $0xd3, H_CUR_XMM, %xmm0 vpsrad $31, %xmm0, %xmm0 vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM - vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 - vpxor %xmm0, H_CUR_XMM, H_CUR_XMM + // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit + vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM // Load the gfpoly constant. vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY // Square H^1 to get H^2. @@ -711,11 +711,11 @@ // that processes 4*VL bytes of data at a time. Otherwise skip it. // // Pre-subtracting 4*VL from DATALEN saves an instruction from the main // loop and also ensures that at least one write always occurs to // DATALEN, zero-extending it and allowing DATALEN64 to be used later. - sub $4*VL, DATALEN + add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 jl .Lcrypt_loop_4x_done\@ // Load powers of the hash key. vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 @@ -758,13 +758,13 @@ vaesenclast RNDKEYLAST3, V3, GHASHDATA3 vmovdqu8 GHASHDATA0, 0*VL(DST) vmovdqu8 GHASHDATA1, 1*VL(DST) vmovdqu8 GHASHDATA2, 2*VL(DST) vmovdqu8 GHASHDATA3, 3*VL(DST) - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, DATALEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN jl .Lghash_last_ciphertext_4x\@ .endif // Cache as many additional AES round keys as possible. .irp i, 9,8,7,6,5 @@ -838,13 +838,13 @@ vmovdqu8 GHASHDATA0, 0*VL(DST) vmovdqu8 GHASHDATA1, 1*VL(DST) vmovdqu8 GHASHDATA2, 2*VL(DST) vmovdqu8 GHASHDATA3, 3*VL(DST) - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, DATALEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN jge .Lcrypt_loop_4x\@ .if \enc .Lghash_last_ciphertext_4x\@: // Update GHASH with the last set of ciphertext blocks. @@ -854,11 +854,11 @@ .endif .Lcrypt_loop_4x_done\@: // Undo the extra subtraction by 4*VL and check whether data remains. - add $4*VL, DATALEN + sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 jz .Ldone\@ // The data length isn't a multiple of 4*VL. Process the remaining data // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. 
// Going one vector at a time may seem inefficient compared to having From patchwork Thu Dec 12 21:28:39 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 849840 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A3B4A1D88A6 for ; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; cv=none; b=Oq+zoKqqZsDNOEw7hUn25a9XGEGJhhv61Ru55yoyvOkQjMnFRpO3IA1xlPJdUdCumaFmKagrtCyENWsAYTHvD/FVrWFupZWUSAMxKDnmHDrnGDC+V+Az5mIvaaWx9jfCgp5mw9Isuy7+fD2DU2AL3be5RhgqM/8+Rafg6cTsArU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; c=relaxed/simple; bh=71v8x5crsm+t+a6BSRSvg+wasb7ypc/4XZLm5Grh3n8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=BB2AnhDGvSjlRMvmLNDV66bUBM8Gs3sJzwRVNsEv8/yMlWCdkLXcn28hUe2i8FsTSpuz3oaNQxoSkO0B1GI9nNtGto26y5xXafht5I7v+CT0gpAGbfsy+pyYB0MpFhlldqif1ZkkwgLs+cg7IjxL5fidbF+Dz8cwprhO3etugzA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=EDfgoRYG; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="EDfgoRYG" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2237AC4CEDE; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038955; bh=71v8x5crsm+t+a6BSRSvg+wasb7ypc/4XZLm5Grh3n8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=EDfgoRYGyBR5ZortZZBzLiYk2mddfIyvHINDj3P0XdcCvuOvKHB9IgkoU8I0h8O6j 2d22AUINO/WLHAwJ1flY7X+LNDD4EEG3r9O7yC6AEOoRedi3qOWN0EjaHaypwhfaTC 6h00dajwVpbiG6DFIUInSC0SQ4mCNekFKoPGG1do0UgV3dO8/qnjbiFz4fJ7H83E9W kIBHfoD7CYUzoo3FXcP202+7iq7I50UqAFa1yS6YKYH+eroeq6/H4lyhJUIxx607sZ Cao+1zl7S6bF50HJpF2vQdfJ8dT3WjhkVQ7nacoVP9Y7zbw8fEa5JrKiNDjwoaWoNS RsdBEFzjb4qhA== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 2/8] crypto: x86/aes-gcm - tune better for AMD CPUs Date: Thu, 12 Dec 2024 13:28:39 -0800 Message-ID: <20241212212845.40333-3-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Reorganize the main loop to free up the RNDKEYLAST[0-3] registers and use them for more cached round keys. This improves performance by about 2% on AMD Zen 4 and Zen 5. Intel performance remains about the same. 
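The reorganization still relies on the identity the existing code already exploits,
vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a): the last AES round ends with an
XOR of the round key, so extra data can be folded into the round key ahead of time.
For illustration only (not part of the patch), a minimal standalone check of that
identity using AES-NI intrinsics, built with e.g. 'gcc -maes', could look like:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	__m128i state = _mm_set_epi32(0x01234567, 0x12345678, 0x23456789, 0x3456789a);
	__m128i key   = _mm_set_epi32(0x0f1e2d3c, 0x4b5a6978, 0x18293a4b, 0x5c6d7e0f);
	__m128i data  = _mm_set_epi32(0x11223344, 0x55667788, 0x0aabbccd, 0x7eeff001);

	/* vaesenclast(key, state) ^ data */
	__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(state, key), data);

	/* vaesenclast(key ^ data, state) */
	__m128i rhs = _mm_aesenclast_si128(state, _mm_xor_si128(key, data));

	printf("identity holds: %s\n",
	       memcmp(&lhs, &rhs, sizeof(lhs)) == 0 ? "yes" : "no");
	return 0;
}

Folding the XOR with the source data into the round key this way is what allows the
per-block RNDKEYLAST[0-3] temporaries to be dropped and their registers reused for
cached round keys.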
Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-gcm-avx10-x86_64.S | 99 ++++++++++---------------- 1 file changed, 38 insertions(+), 61 deletions(-) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S index 8989bf9b8384..02ee11083d4f 100644 --- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -86,11 +86,11 @@ .section .rodata .p2align 6 // A shuffle mask that reflects the bytes of 16-byte blocks .Lbswap_mask: - .octa 0x000102030405060708090a0b0c0d0e0f + .octa 0x000102030405060708090a0b0c0d0e0f // This is the GHASH reducing polynomial without its constant term, i.e. // x^128 + x^7 + x^2 + x, represented using the backwards mapping // between bits and polynomial coefficients. // @@ -560,10 +560,36 @@ vpxord RNDKEY0, V1, V1 vpxord RNDKEY0, V2, V2 vpxord RNDKEY0, V3, V3 .endm +// Do the last AES round for four vectors of counter blocks V0-V3, XOR source +// data with the resulting keystream, and write the result to DST and +// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) +.macro _aesenclast_and_xor_4x + // XOR the source data with the last round key, saving the result in + // GHASHDATA[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). + vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 + vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 + vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 + vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. + vaesenclast GHASHDATA0, V0, GHASHDATA0 + vaesenclast GHASHDATA1, V1, GHASHDATA1 + vaesenclast GHASHDATA2, V2, GHASHDATA2 + vaesenclast GHASHDATA3, V3, GHASHDATA3 + + // Store the en/decrypted data to DST. + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) +.endm + // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, // const u32 le_ctr[4], u8 ghash_acc[16], // const u8 *src, u8 *dst, int datalen); // // This macro generates a GCM encryption or decryption update function with the @@ -638,29 +664,24 @@ .set LE_CTR_INC, V11 // LE_CTR contains the next set of little-endian counter blocks. .set LE_CTR, V12 - // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. .set RNDKEY0, V13 .set RNDKEYLAST, V14 .set RNDKEY_M9, V15 .set RNDKEY_M8, V16 .set RNDKEY_M7, V17 .set RNDKEY_M6, V18 .set RNDKEY_M5, V19 - - // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with - // the corresponding block of source data. This is useful because - // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can - // be computed in parallel with the AES rounds. - .set RNDKEYLAST0, V20 - .set RNDKEYLAST1, V21 - .set RNDKEYLAST2, V22 - .set RNDKEYLAST3, V23 + .set RNDKEY_M4, V20 + .set RNDKEY_M3, V21 + .set RNDKEY_M2, V22 + .set RNDKEY_M1, V23 // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These // cannot coincide with anything used for AES encryption, since for // performance reasons GHASH and AES encryption are interleaved. 
.set GHASHTMP0, V24 @@ -746,30 +767,19 @@ vbroadcasti32x4 (%rax), RNDKEY _vaesenc_4x RNDKEY add $16, %rax cmp %rax, RNDKEYLAST_PTR jne 1b - vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 - vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 - vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 - vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 - vaesenclast RNDKEYLAST0, V0, GHASHDATA0 - vaesenclast RNDKEYLAST1, V1, GHASHDATA1 - vaesenclast RNDKEYLAST2, V2, GHASHDATA2 - vaesenclast RNDKEYLAST3, V3, GHASHDATA3 - vmovdqu8 GHASHDATA0, 0*VL(DST) - vmovdqu8 GHASHDATA1, 1*VL(DST) - vmovdqu8 GHASHDATA2, 2*VL(DST) - vmovdqu8 GHASHDATA3, 3*VL(DST) + _aesenclast_and_xor_4x sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 sub $-4*VL, DST add $-4*VL, DATALEN jl .Lghash_last_ciphertext_4x\@ .endif // Cache as many additional AES round keys as possible. -.irp i, 9,8,7,6,5 +.irp i, 9,8,7,6,5,4,3,2,1 vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i .endr .Lcrypt_loop_4x\@: @@ -797,51 +807,18 @@ _vaesenc_4x RNDKEY vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY _vaesenc_4x RNDKEY 128: - // XOR the source data with the last round key, saving the result in - // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the - // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). -.if \enc - vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 - vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 - vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 - vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 -.else - vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 - vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 - vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 - vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 -.endif - // Finish the AES encryption of the counter blocks in V0-V3, interleaved // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. -.irp i, 9,8,7,6,5 +.irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i) _vaesenc_4x RNDKEY_M\i - _ghash_step_4x (9 - \i) -.endr -.irp i, 4,3,2,1 - vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY - _vaesenc_4x RNDKEY - _ghash_step_4x (9 - \i) .endr _ghash_step_4x 9 - - // Do the last AES round. This handles the XOR with the source data - // too, as per the optimization described above. - vaesenclast RNDKEYLAST0, V0, GHASHDATA0 - vaesenclast RNDKEYLAST1, V1, GHASHDATA1 - vaesenclast RNDKEYLAST2, V2, GHASHDATA2 - vaesenclast RNDKEYLAST3, V3, GHASHDATA3 - - // Store the en/decrypted data to DST. - vmovdqu8 GHASHDATA0, 0*VL(DST) - vmovdqu8 GHASHDATA1, 1*VL(DST) - vmovdqu8 GHASHDATA2, 2*VL(DST) - vmovdqu8 GHASHDATA3, 3*VL(DST) - + _aesenclast_and_xor_4x sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 sub $-4*VL, DST add $-4*VL, DATALEN jge .Lcrypt_loop_4x\@ @@ -938,11 +915,11 @@ // be whole block(s) that get processed by the GHASH multiplication and // reduction instructions but should not actually be included in the // GHASH. However, any such blocks are all-zeroes, and the values that // they're multiplied with are also all-zeroes. Therefore they just add // 0 * 0 = 0 to the final GHASH result, which makes no difference. 
- vmovdqu8 (POWERS_PTR), H_POW1 + vmovdqu8 (POWERS_PTR), H_POW1 .if \enc vmovdqu8 V0, V1{%k1}{z} .endif vpshufb BSWAP_MASK, V1, V0 vpxord GHASH_ACC, V0, V0 From patchwork Thu Dec 12 21:28:40 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 851074 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8A4251D79B4 for ; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; cv=none; b=IIbAztC20Dh2EawNQDTKXKiWmMMdWQG9O85aE0Go1hP/LspJC0HIbA5bf7w1H0IJcnmc7cy72YlCYuVR/OVhHrYQQwuDMWOYM3qUINuWHqez2F+KdkV09/Y06npD8+Y6HQJt/3Kfv3G3OWxv6gmPQPC4bhatj4Bs3CMhQcBZsEw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; c=relaxed/simple; bh=FFfyPoD79Mg8fUB5soc2AZToAwkrs7/exNRNK6bG6ws=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=X6lV4xrROnxl9FxAsfjPso+bGmABakgOyPFiScbEsR4MTjrVayI2v489c/JuyFmbWLl3Q6VWTBofo7fipk3YSkS+7U7Eo73X3MXua0G8bD0YXqrtu3UDDh4ZMSzPQTIEV2S16K7hRP2JQTUJGqRJbSbwynq6JHtRzt6lKj3Ck8U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=cbDuwCQy; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="cbDuwCQy" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 58AE9C4CED0; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038955; bh=FFfyPoD79Mg8fUB5soc2AZToAwkrs7/exNRNK6bG6ws=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=cbDuwCQyGBodWU2+KS76E+TWZM/qRKosrR4MchQrVbarB29ZhxZ540Qhp0b2Z22OE /5e2NYFd73nCV5iI4baaEhUEjmAuDAsjNWJCHDuQSVIRbCep3qg6PSXVT0z1q0VFwl lbzI66I1QJSCYmhlod/CsKN2mboK7Fa/0ZA4oYd1tMTxVIbowhoK8Utnn5RhJ+LTBJ 0PnN6bXEd4T402E+zjSMoizlCMJnGCYTT/k8r1SRrlTpFmMwfNWNN5HeB7cip4EmGX 6Suf0bprPN64Lfip3l16TzGwydRD/ueKxNY4plQlXtIyiQ5DDUG3BjCqrKCfyNQ0ya Kp3aFsA7y9GOg== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 3/8] crypto: x86/aes-xts - use .irp when useful Date: Thu, 12 Dec 2024 13:28:40 -0800 Message-ID: <20241212212845.40333-4-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Use .irp instead of repeating code. No change in the generated code. 
Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 74 ++++++---------------------- 1 file changed, 15 insertions(+), 59 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 48f97b79f7a9..580e73396052 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -110,43 +110,17 @@ .macro _define_aliases // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers // are available, that map to the xmm, ymm, or zmm registers according // to the selected Vector Length (VL). - _define_Vi 0 - _define_Vi 1 - _define_Vi 2 - _define_Vi 3 - _define_Vi 4 - _define_Vi 5 - _define_Vi 6 - _define_Vi 7 - _define_Vi 8 - _define_Vi 9 - _define_Vi 10 - _define_Vi 11 - _define_Vi 12 - _define_Vi 13 - _define_Vi 14 - _define_Vi 15 +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + _define_Vi \i +.endr .if USE_AVX10 - _define_Vi 16 - _define_Vi 17 - _define_Vi 18 - _define_Vi 19 - _define_Vi 20 - _define_Vi 21 - _define_Vi 22 - _define_Vi 23 - _define_Vi 24 - _define_Vi 25 - _define_Vi 26 - _define_Vi 27 - _define_Vi 28 - _define_Vi 29 - _define_Vi 30 - _define_Vi 31 +.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + _define_Vi \i +.endr .endif // V0-V3 hold the data blocks during the main loop, or temporary values // otherwise. V4-V5 hold temporary values. @@ -543,19 +517,13 @@ _vaes_1x \enc, 0, 2, \xmm_suffix, \data .Laes192\@: _vaes_1x \enc, 0, 3, \xmm_suffix, \data _vaes_1x \enc, 0, 4, \xmm_suffix, \data .Laes128\@: - _vaes_1x \enc, 0, 5, \xmm_suffix, \data - _vaes_1x \enc, 0, 6, \xmm_suffix, \data - _vaes_1x \enc, 0, 7, \xmm_suffix, \data - _vaes_1x \enc, 0, 8, \xmm_suffix, \data - _vaes_1x \enc, 0, 9, \xmm_suffix, \data - _vaes_1x \enc, 0, 10, \xmm_suffix, \data - _vaes_1x \enc, 0, 11, \xmm_suffix, \data - _vaes_1x \enc, 0, 12, \xmm_suffix, \data - _vaes_1x \enc, 0, 13, \xmm_suffix, \data +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_1x \enc, 0, \i, \xmm_suffix, \data +.endr _vaes_1x \enc, 1, 14, \xmm_suffix, \data _vpxor \tweak, \data, \data .endm .macro _aes_xts_crypt enc @@ -616,19 +584,13 @@ _vaes_4x \enc, 0, 2 .Laes192\@: _vaes_4x \enc, 0, 3 _vaes_4x \enc, 0, 4 .Laes128\@: - _vaes_4x \enc, 0, 5 - _vaes_4x \enc, 0, 6 - _vaes_4x \enc, 0, 7 - _vaes_4x \enc, 0, 8 - _vaes_4x \enc, 0, 9 - _vaes_4x \enc, 0, 10 - _vaes_4x \enc, 0, 11 - _vaes_4x \enc, 0, 12 - _vaes_4x \enc, 0, 13 +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_4x \enc, 0, \i +.endr _vaes_4x \enc, 1, 14 // XOR in the tweaks again. 
_vpxor TWEAK0, V0, V0 _vpxor TWEAK1, V1, V1 @@ -777,19 +739,13 @@ SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) vaesenc -5*16(%rdi), %xmm0, %xmm0 .Lencrypt_iv_aes192: vaesenc -4*16(%rdi), %xmm0, %xmm0 vaesenc -3*16(%rdi), %xmm0, %xmm0 .Lencrypt_iv_aes128: - vaesenc -2*16(%rdi), %xmm0, %xmm0 - vaesenc -1*16(%rdi), %xmm0, %xmm0 - vaesenc 0*16(%rdi), %xmm0, %xmm0 - vaesenc 1*16(%rdi), %xmm0, %xmm0 - vaesenc 2*16(%rdi), %xmm0, %xmm0 - vaesenc 3*16(%rdi), %xmm0, %xmm0 - vaesenc 4*16(%rdi), %xmm0, %xmm0 - vaesenc 5*16(%rdi), %xmm0, %xmm0 - vaesenc 6*16(%rdi), %xmm0, %xmm0 +.irp i, -2,-1,0,1,2,3,4,5,6 + vaesenc \i*16(%rdi), %xmm0, %xmm0 +.endr vaesenclast 7*16(%rdi), %xmm0, %xmm0 vmovdqu %xmm0, (%rsi) RET SYM_FUNC_END(aes_xts_encrypt_iv) From patchwork Thu Dec 12 21:28:41 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 851073 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BEEFB1D88DD for ; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; cv=none; b=mYKiyB0SVujq2gvzF/q9ddTVMsSXoJqm8iu0UBspMFsZCJsPN5Y7YVl9fq38mZuKGhPRPwqBkC1TZaX0zjRPSrTmrQwTBjOXD0zbT5L1rX75lPY7DoEOpWkWva4L6u2R575LXZ71CzAJDaFi30Edl1uXmcLP8nyeK/PkcIMd3lo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038955; c=relaxed/simple; bh=yzWMmgWBSarP35vlDb1i18NSU9mHYw/ZyjnDbn+PkMU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=HpoGZpVu8HhPVPrjy30YQNskzbK5+rE3oBaD7fXuOMXcVZdFSDQ0sQkyIinviIDbO353JwghnB4NkxAHVevdVH2WDnbuTcPHOV7q3xUkiURGZ/TB7tzaTYcDQWxn79/ke2yWZ6fnJA6ptZA9J/DLNlKS22FBUrwgGJioTqjQdes= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ONoxceoa; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ONoxceoa" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8F2A7C4CED1; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038955; bh=yzWMmgWBSarP35vlDb1i18NSU9mHYw/ZyjnDbn+PkMU=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ONoxceoaZWymjULE9EJP1DnE8d2Z6OGusfHroXM80DZB6euiHFMyikmC8rbfq75Cl GuL2C37gWkO7HGL1UbdxgTyD5kU9lX71stofR0dpwQS/UC+2W8pKt091nNy1HCrMiV h4ePGv7n13I2HcQ+dDS/Q/1LZi3Ygh2zsZwE4Nscmh9HnmUVK8DuYS9cwUHehxkTvi +CuSEvjzfMZcbsWuHvSqWrnPl/5aybVAxzDnK8J1i7lO4FJ+OYdqGGhghX6YDeJV1I FAwKkOI2BP3rHPJ/WUA1slFKFfJVVA7Ki3OwkTxsrN4O9i7326coMrduLNCrD/6iHG XGmPfZsLYS7Rw== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 4/8] crypto: x86/aes-xts - make the register aliases per-function Date: Thu, 12 Dec 2024 13:28:41 -0800 Message-ID: <20241212212845.40333-5-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Since aes-xts-avx-x86_64.S contains 
multiple functions, move the register aliases for the parameters and local variables of the XTS update function into the macro that generates that function. Then add register aliases to aes_xts_encrypt_iv() to improve readability there. This makes aes-xts-avx-x86_64.S consistent with the GCM assembly files. No change in the generated code. Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 61 +++++++++++++++------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 580e73396052..ca69e6480cb6 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -78,26 +78,10 @@ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .text -// Function parameters -.set KEY, %rdi // Initially points to crypto_aes_ctx, then is - // advanced to point to 7th-from-last round key -.set SRC, %rsi // Pointer to next source data -.set DST, %rdx // Pointer to next destination data -.set LEN, %ecx // Remaining length in bytes -.set LEN8, %cl -.set LEN64, %rcx -.set TWEAK, %r8 // Pointer to next tweak - -// %rax holds the AES key length in bytes. -.set KEYLEN, %eax -.set KEYLEN64, %rax - -// %r9-r11 are available as temporaries. - .macro _define_Vi i .if VL == 16 .set V\i, %xmm\i .elseif VL == 32 .set V\i, %ymm\i @@ -119,10 +103,26 @@ .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 _define_Vi \i .endr .endif + // Function parameters + .set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes + .set LEN8, %cl + .set LEN64, %rcx + .set TWEAK, %r8 // Pointer to next tweak + + // %rax holds the AES key length in bytes. + .set KEYLEN, %eax + .set KEYLEN64, %rax + + // %r9-r11 are available as temporaries. + // V0-V3 hold the data blocks during the main loop, or temporary values // otherwise. V4-V5 hold temporary values. // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. 
.set TWEAK0_XMM, %xmm6 @@ -726,28 +726,33 @@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, // u8 iv[AES_BLOCK_SIZE]); SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) - vmovdqu (%rsi), %xmm0 - vpxor (%rdi), %xmm0, %xmm0 - movl 480(%rdi), %eax // AES key length - lea -16(%rdi, %rax, 4), %rdi - cmp $24, %eax + .set TWEAK_KEY, %rdi + .set IV, %rsi + .set KEYLEN, %eax + .set KEYLEN64, %rax + + vmovdqu (IV), %xmm0 + vpxor (TWEAK_KEY), %xmm0, %xmm0 + movl 480(TWEAK_KEY), KEYLEN + lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY + cmp $24, KEYLEN jl .Lencrypt_iv_aes128 je .Lencrypt_iv_aes192 - vaesenc -6*16(%rdi), %xmm0, %xmm0 - vaesenc -5*16(%rdi), %xmm0, %xmm0 + vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes192: - vaesenc -4*16(%rdi), %xmm0, %xmm0 - vaesenc -3*16(%rdi), %xmm0, %xmm0 + vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes128: .irp i, -2,-1,0,1,2,3,4,5,6 - vaesenc \i*16(%rdi), %xmm0, %xmm0 + vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 .endr - vaesenclast 7*16(%rdi), %xmm0, %xmm0 - vmovdqu %xmm0, (%rsi) + vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 + vmovdqu %xmm0, (IV) RET SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: From patchwork Thu Dec 12 21:28:42 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 851071 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 525A51D9A63 for ; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038957; cv=none; b=DYf2QgSATGBW3Ax+0sy7HR9zK6YZFVH8dmCjPneECTFD64JyP8UIBCdZRkQVhk5m6okqNYM66bZYOV+jMq2JcT0Q5K5siKQ0rEjiHEnB4l07FkoQWs7RBTa9+XpA9tfr24tqg2hdI9vm3sC3YPD7x3I5I8BxsdrocZdHGSsxCjo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038957; c=relaxed/simple; bh=O+QDgYdLqxtymmElnvWAzhHkq2WeU2TcPO63cefZkw0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=fUllaQQAcYd8MYqqOHayW+vPebir3G/DLQoY75x57ydjbxUPwT1tpV+Sy7EK2PJY/5UM7f7He71t0a67d0WXKgqx/JM0SiF3wPTNF2bOEVF9QvvMb+EElyZxIgEBcpYfUOu34wb06uUSBzc9kbQP4d85YhCYkjY0Y+/QsPrkXvs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=sVGhh4n7; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="sVGhh4n7" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C76FBC4CED0; Thu, 12 Dec 2024 21:29:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038955; bh=O+QDgYdLqxtymmElnvWAzhHkq2WeU2TcPO63cefZkw0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=sVGhh4n7j9gpegCVvtk0RmuHla2JjoRLDMsKGboEGhMWOOyb6mlbvMqD3bnTTdJ/9 Fxun92adT9Lq35I84LBUJ66kv6VDRkyStaqJom/3yz/XfP2JkHW6Z9w3sD/FNI078B kB0FQTKKyTBc82vcKiIRfYA7rIt/t4T2UX/QN8C/XESxyj3bNGQffwzkTI1MidRD6o Zxl0x3kgqI2Bll8OvgbmgMmL21DaYhX8G6AXwQDSLK2tm8wHBCs2/+uGkGgGIryarK 
CIPb3fkymhqXBcO+s6zEkvp1HxFWZqspJi/LTPWy+yyieWjMX30QLoPjY6s6enakAh 5Usl3ulG0YR9w== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 5/8] crypto: x86/aes-xts - improve some comments Date: Thu, 12 Dec 2024 13:28:42 -0800 Message-ID: <20241212212845.40333-6-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Improve some of the comments in aes-xts-avx-x86_64.S. Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 31 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index ca69e6480cb6..903b894e5f48 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -341,13 +341,18 @@ .endif .endm // Do one step in computing the next set of tweaks using the VPCLMULQDQ method // (the same method _next_tweakvec uses for VL > 16). This means multiplying -// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 -// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, -// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. +// each tweak by x^(4*VL/16) independently. +// +// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed +// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq +// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves +// instructions directly. The 128-bit right shift (vpsrldq) performs better +// than a 64-bit right shift on Intel CPUs in the context where it is used here, +// because it runs on a different execution port from the AES instructions. .macro _tweak_step_pclmul i .if \i == 0 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 .elseif \i == 2 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 @@ -378,11 +383,11 @@ // _tweak_step does one step of the computation of the next set of tweaks from // TWEAK[0-3]. To complete all steps, this is invoked with increasing values of // \i that include at least 0 through 19, then 1000 which signals the last step. // // This is used to interleave the computation of the next set of tweaks with the -// AES en/decryptions, which increases performance in some cases. +// AES en/decryptions, which increases performance in some cases. Clobbers V5. .macro _tweak_step i .if VL == 16 _tweak_step_mulx \i .else _tweak_step_pclmul \i @@ -415,13 +420,14 @@ // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the // beginning. Skipping rounds at the end doesn't work as well because // the last round needs different instructions. // // An alternative approach would be to roll up all the round loops. We - // don't do that because it isn't compatible with caching the round keys - // in registers which we do when possible (see below), and also because - // it seems unwise to rely *too* heavily on the CPU's branch predictor. + // don't do that because (a) it isn't compatible with caching the round + // keys in registers which we do when possible (see below), (b) we + // interleave the AES rounds with the XTS tweak computation, and (c) it + // seems unwise to rely *too* heavily on the CPU's branch predictor. 
lea OFFS-16(KEY, KEYLEN64, 4), KEY // If all 32 SIMD registers are available, cache all the round keys. .if USE_AVX10 cmp $24, KEYLEN @@ -482,11 +488,11 @@ .endm // Do a single round of AES en/decryption on the blocks in registers V0-V3, // using the same key for all blocks. The round key is loaded from the // appropriate register or memory location for round \i. In addition, does two -// steps of the computation of the next set of tweaks. May clobber V4. +// steps of the computation of the next set of tweaks. May clobber V4 and V5. .macro _vaes_4x enc, last, i .if USE_AVX10 _tweak_step (2*(\i-5)) _vaes \enc, \last, KEY\i, V0 _vaes \enc, \last, KEY\i, V1 @@ -725,10 +731,13 @@ jmp .Ldone\@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, // u8 iv[AES_BLOCK_SIZE]); +// +// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) .set TWEAK_KEY, %rdi .set IV, %rsi .set KEYLEN, %eax .set KEYLEN64, %rax @@ -755,13 +764,13 @@ SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: // -// void (*xts_asm_func)(const struct crypto_aes_ctx *key, -// const u8 *src, u8 *dst, unsigned int len, -// u8 tweak[AES_BLOCK_SIZE]); +// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, unsigned int len, +// u8 tweak[AES_BLOCK_SIZE]); // // |key| is the data key. |tweak| contains the next tweak; the encryption of // the original IV with the tweak key was already done. This function supports // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and // |len| must be a multiple of 16 except on the last call. 
If |len| is a From patchwork Thu Dec 12 21:28:43 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 849839 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 41CF31D79B4 for ; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038956; cv=none; b=jfj2kukbPtm1WdOSMi3fhgU1A5ZRTIwjEi+c8gnWNk9JZoY+q2M1lJY1r2F8uKZyX77YHJhg6uignXmENNIzHUkLSUVvfW4bE6l62+zCsuJj1QMnor8aUqhHIaegApcBfrCtpikCtWTvz5/oSprF84oKU9EZmzwGlroVgTg3PnU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038956; c=relaxed/simple; bh=irQS/qqvaIZjrLrFc9t6oh6P4EtcoJVwxiXnHqisG2U=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=OQdBLVuPdt10FskgAb8FxjDydfw0LnTMF8xAfHneu/mbn5uPtfXJwh0JpzLN/TarwkL0TcRjpqpIhEFRrwQtBcM723Ltn+CYkpLs4+k1ZRibk2bAQH6g1AGTK7fwW3ZeXW+e2sSg+vWLOWec+3FkdjDPhRSLqP/LyvlJZ5divVk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=slWTMyhm; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="slWTMyhm" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 0AE8BC4CED3; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038956; bh=irQS/qqvaIZjrLrFc9t6oh6P4EtcoJVwxiXnHqisG2U=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=slWTMyhmhrX+evPy+y5P3BlUjmJzDRB4u2Ww/EhLp97PUN2NXfzBCF7/xftYFJsU8 5CpQyfbPPuFOrNRrQE4If9PJCT577o9SMl8bVDPca+fo8CxXtDF9l4l66JxqkjUylL VUdlpBVNEtWbNtmcvIU2Gl2EAx0NMe2UAEMOVAL6Nz5fR5BI/RdrLBd8tQsUAmTf6O AK0UNvfxBaEyF0N4ybs0wx+j/CartZkPWW4K6YDtQ2z4RNKg0L2MtSzDQ0uZ7zkjiQ wls/QiFKL9JqeXCCMaprBwph9hYEXm25Pk8TvtNYLQ8fB3YMlDjo+LAmm1O14mnTpf 8UCez49v6rFsA== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 6/8] crypto: x86/aes-xts - change len parameter to int Date: Thu, 12 Dec 2024 13:28:43 -0800 Message-ID: <20241212212845.40333-7-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers The AES-XTS assembly code currently treats the length as signed, since this saves a few instructions in the loop compared to treating it as unsigned. Therefore update the type to make this clear. (It is not actually passed any values larger than PAGE_SIZE.) 
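The signed type matches how the assembly drives its loops: it subtracts 4*VL up
front and then branches on the result being negative ('jl' on entry, 'jge' at the
bottom of the main loop) instead of doing a separate compare. A rough C sketch of
that control flow, with VL assumed to be 32 and the actual en/decryption stood in
for by memcpy, is:

#include <string.h>

#define VL 32	/* bytes per vector; 32 when ymm registers are used */

void crypt_sketch(const unsigned char *src, unsigned char *dst, int len)
{
	len -= 4 * VL;			/* may go negative, so len must be signed */
	while (len >= 0) {		/* main loop: 4*VL bytes per iteration */
		memcpy(dst, src, 4 * VL);
		src += 4 * VL;
		dst += 4 * VL;
		len -= 4 * VL;
	}
	len += 4 * VL;			/* undo the extra subtraction */
	if (len > 0)			/* 1 <= len < 4*VL remains */
		memcpy(dst, src, len);
}

With an unsigned length the pre-subtraction would wrap instead of going negative,
and since callers never pass more than PAGE_SIZE, the value always fits easily in a
signed int.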
Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 2 +- arch/x86/crypto/aesni-intel_glue.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 903b894e5f48..c4e8ba6ed61d 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -765,11 +765,11 @@ SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: // // void (*xts_crypt_func)(const struct crypto_aes_ctx *key, -// const u8 *src, u8 *dst, unsigned int len, +// const u8 *src, u8 *dst, int len, // u8 tweak[AES_BLOCK_SIZE]); // // |key| is the data key. |tweak| contains the next tweak; the encryption of // the original IV with the tweak key was already done. This function supports // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index fbf43482e1f5..11e95fc62636 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -503,11 +503,11 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, } typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, - const u8 *src, u8 *dst, unsigned int len, + const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); /* This handles cases where the source and/or destination span pages. */ static noinline int xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func) @@ -622,18 +622,18 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, { aesni_enc(tweak_key, iv, iv); } static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, - const u8 *src, u8 *dst, unsigned int len, + const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]) { aesni_xts_enc(key, dst, src, len, tweak); } static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, - const u8 *src, u8 *dst, unsigned int len, + const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]) { aesni_xts_dec(key, dst, src, len, tweak); } @@ -788,14 +788,14 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, #define DEFINE_XTS_ALG(suffix, driver_name, priority) \ \ asmlinkage void \ aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ - u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ asmlinkage void \ aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ - u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ \ static int xts_encrypt_##suffix(struct skcipher_request *req) \ { \ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \ } \ From patchwork Thu Dec 12 21:28:44 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 851072 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 731111D88DD for ; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) Authentication-Results: 
smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038956; cv=none; b=mchiRQAI9EE+fkqk4EH4UiQxXxyuUJLTCWskjv+r/31JZUaxJz9zRipxfEyzYr+YXlIIc7FZ46oOoUfqhF/q01DjDRXIDWupvkhnUnUairPhTiuwgZKyixW70do7O2rtRg4s5a8sHicdQC4vISdayfACRoA6hX4/A5RgZpJpQh4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038956; c=relaxed/simple; bh=k2wxlfkrPJzpwypg33wSL3idasfAIk8ZbsJ9FQV2tN4=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=CPwlWDFG/DctKji7AjdO03D/VdL/65oiMIB7Ok9rrciu0RQTjtv1/t80qlknyJxLK+nlYBubg5I5lCz528yP3k/k1UQXRozzIAL9ETcmKeZl2FnSFqwUnBGIedgIepip/gA63HIMmJZ6G+bfGUcUCzk/VVGxftebM7fc3ZYwGm4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=fAJI5dIw; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="fAJI5dIw" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 455E8C4CEDF; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038956; bh=k2wxlfkrPJzpwypg33wSL3idasfAIk8ZbsJ9FQV2tN4=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=fAJI5dIwADW97o04XD0Yp1TLndPHlnPuiO3YEThU5LN25gghS8QuHtjOHu9lztHd0 7N8sXQ2nZs4Ibh/XSB42rPQ25ThbyfR8YIk7EpDJiIAQFOUzjMFksfFjaUfa84p8Vs tspHuiRkJfE/dxal475jpxx7bZkA2oLf89l7RYPCadx+B/kFQ3lbcIgETuFDAnpmzh 4qaZwvmwe5uO4kfVD4oPCkZ2x3m79TRyhtTz8+bM7ybAlS+TnLWPGZOZfcxbUeGSsX QPpboz5pmZsR7eFaJtfvkTqVoDwCFtc808y3kh6cUdJjxMoZL4CdVcBvccoqLl8uZE mCcQS+gnkfKPA== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 7/8] crypto: x86/aes-xts - more code size optimizations Date: Thu, 12 Dec 2024 13:28:44 -0800 Message-ID: <20241212212845.40333-8-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Prefer immediates of -128 to 128, since the former fits in a signed byte, saving 3 bytes per instruction. Also prefer VEX-coded instructions to EVEX where this is easy to do. Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index c4e8ba6ed61d..0e6b9ae12e95 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -186,10 +186,11 @@ .endif // V30-V31 are currently unused. .endm // Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst .else vmovdqu8 \src, \dst @@ -206,15 +207,16 @@ vbroadcasti32x4 \src, \dst .endif .endm // XOR two vectors together. +// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst -.if USE_AVX10 - vpxord \src1, \src2, \dst -.else +.if VL < 64 vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst .endif .endm // XOR three vectors together. .macro _xor3 src1, src2, src3_and_dst @@ -553,22 +555,22 @@ _setup_round_keys \enc // Compute the first set of tweaks TWEAK[0-3]. 
_compute_first_set_of_tweaks - sub $4*VL, LEN + add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 jl .Lhandle_remainder\@ .Lmain_loop\@: // This is the main loop, en/decrypting 4*VL bytes per iteration. // XOR each source block with its tweak and the zero-th round key. .if USE_AVX10 - vmovdqu8 0*VL(SRC), V0 - vmovdqu8 1*VL(SRC), V1 - vmovdqu8 2*VL(SRC), V2 - vmovdqu8 3*VL(SRC), V3 + _vmovdqu 0*VL(SRC), V0 + _vmovdqu 1*VL(SRC), V1 + _vmovdqu 2*VL(SRC), V2 + _vmovdqu 3*VL(SRC), V3 vpternlogd $0x96, TWEAK0, KEY0, V0 vpternlogd $0x96, TWEAK1, KEY0, V1 vpternlogd $0x96, TWEAK2, KEY0, V2 vpternlogd $0x96, TWEAK3, KEY0, V3 .else @@ -610,13 +612,13 @@ _vmovdqu V3, 3*VL(DST) // Finish computing the next set of tweaks. _tweak_step 1000 - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, LEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, LEN jge .Lmain_loop\@ // Check for the uncommon case where the data length isn't a multiple of // 4*VL. Handle it out-of-line in order to optimize for the common // case. In the common case, just fall through to the ret. From patchwork Thu Dec 12 21:28:45 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 849838 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AA3581D9A63 for ; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038956; cv=none; b=B+tqI5D9mRrFCGVe9PVEcT9cT/6ShCXIFK07VkUJb6LV06wJW+UUI6DanEJv1LDGKTcpOD2KEcmVBkhwTDdlf/O2yjMVH/PHUz23uGggmMwWTdTTeGvEsiwZ4RL8qQ7uDTP115BgmquV7ZfwUd5d+NQxf2gZArZ3UcGO01+GQ1U= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734038956; c=relaxed/simple; bh=y9KTABvcRIhwuqNg/VsbNreS5DFEXsz8n6JyaAlYXyA=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=RL+Y++cLvGadStcsmsHEV+nUo0GNJGqgoeZT+/R2HmzOc1gmEp7hQmNnsezm50tbpzYmWUZfuH9SDli3sln0wzdXPfItlU/Of8ju97XeebT9D4Hyx+BnVnybYcW2kagYG28nUsKS0NYda/NOnhrbZ84Rwfz+mSEomDjvVKDTpGE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=g95eL9wy; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="g95eL9wy" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7C9CCC4CED1; Thu, 12 Dec 2024 21:29:16 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734038956; bh=y9KTABvcRIhwuqNg/VsbNreS5DFEXsz8n6JyaAlYXyA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=g95eL9wyMetVbuiD6h5STAyMgsBqvScp3kgopWWmMLXlgdBow/W7RCI5TriWNEBdq sasHdyA6AW0G12+tXd+s9j0k2b8ehaFcMuq6rvzyK8+TD3aDrflgwimXvqzoXiGp2n uM4V5az8QlLJkI393cPkxjITTLMMC/zXYGVSSL7OMRfTQxukfGzb513NCbmzDGt82W xeettV3x3G2JbnxVcbaOWijzFKx68R6gpdSjLmivVzqDJeH/g1J8alhzxHcw0dkIse 4K11IyDIBSV3Obx+X9NtZc9ugxeO6GtPH5q7iwmbOIiI/0CnugqJfgRPDZTpvCSy9l bIiXrSNciZwOA== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: x86@kernel.org Subject: [PATCH v2 8/8] crypto: x86/aes-xts - additional optimizations Date: Thu, 12 Dec 2024 13:28:45 -0800 Message-ID: 
<20241212212845.40333-9-ebiggers@kernel.org> X-Mailer: git-send-email 2.47.1 In-Reply-To: <20241212212845.40333-1-ebiggers@kernel.org> References: <20241212212845.40333-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers Reduce latency by taking advantage of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), like I did in the AES-GCM code. Also replace a vpand and vpxor with a vpternlogd. On AMD Zen 5 this improves performance by about 3%. Intel performance remains about the same, with a 0.1% improvement being seen on Icelake. Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 145 +++++++++++++++++---------- 1 file changed, 90 insertions(+), 55 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 0e6b9ae12e95..8a3e23fbcf85 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -233,12 +233,16 @@ // (by multiplying by the polynomial 'x') and write it to \dst. .macro _next_tweak src, tmp, dst vpshufd $0x13, \src, \tmp vpaddq \src, \src, \dst vpsrad $31, \tmp, \tmp +.if USE_AVX10 + vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst +.else vpand GF_POLY_XMM, \tmp, \tmp vpxor \tmp, \dst, \dst +.endif .endm // Given the XTS tweak(s) in the vector \src, compute the next vector of // tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. // @@ -452,88 +456,98 @@ _vbroadcast128 6*16(KEY), KEY13 _vbroadcast128 7*16(KEY), KEY14 .endif .endm -// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) -// on the block(s) in \data using the round key(s) in \key. The register length -// determines the number of AES blocks en/decrypted. -.macro _vaes enc, last, key, data +// Do a single non-last round of AES encryption (if \enc==1) or decryption (if +// \enc==0) on the block(s) in \data using the round key(s) in \key. The +// register length determines the number of AES blocks en/decrypted. +.macro _vaes enc, key, data .if \enc -.if \last - vaesenclast \key, \data, \data -.else vaesenc \key, \data, \data -.endif -.else -.if \last - vaesdeclast \key, \data, \data .else vaesdec \key, \data, \data .endif +.endm + +// Same as _vaes, but does the last round. +.macro _vaeslast enc, key, data +.if \enc + vaesenclast \key, \data, \data +.else + vaesdeclast \key, \data, \data .endif .endm -// Do a single round of AES en/decryption on the block(s) in \data, using the -// same key for all block(s). The round key is loaded from the appropriate -// register or memory location for round \i. May clobber V4. -.macro _vaes_1x enc, last, i, xmm_suffix, data +// Do a single non-last round of AES en/decryption on the block(s) in \data, +// using the same key for all block(s). The round key is loaded from the +// appropriate register or memory location for round \i. May clobber \tmp. +.macro _vaes_1x enc, i, xmm_suffix, data, tmp .if USE_AVX10 - _vaes \enc, \last, KEY\i\xmm_suffix, \data + _vaes \enc, KEY\i\xmm_suffix, \data .else .ifnb \xmm_suffix - _vaes \enc, \last, (\i-7)*16(KEY), \data + _vaes \enc, (\i-7)*16(KEY), \data .else - _vbroadcast128 (\i-7)*16(KEY), V4 - _vaes \enc, \last, V4, \data + _vbroadcast128 (\i-7)*16(KEY), \tmp + _vaes \enc, \tmp, \data .endif .endif .endm -// Do a single round of AES en/decryption on the blocks in registers V0-V3, -// using the same key for all blocks. 
The round key is loaded from the +// Do a single non-last round of AES en/decryption on the blocks in registers +// V0-V3, using the same key for all blocks. The round key is loaded from the // appropriate register or memory location for round \i. In addition, does two // steps of the computation of the next set of tweaks. May clobber V4 and V5. -.macro _vaes_4x enc, last, i +.macro _vaes_4x enc, i .if USE_AVX10 _tweak_step (2*(\i-5)) - _vaes \enc, \last, KEY\i, V0 - _vaes \enc, \last, KEY\i, V1 + _vaes \enc, KEY\i, V0 + _vaes \enc, KEY\i, V1 _tweak_step (2*(\i-5) + 1) - _vaes \enc, \last, KEY\i, V2 - _vaes \enc, \last, KEY\i, V3 + _vaes \enc, KEY\i, V2 + _vaes \enc, KEY\i, V3 .else _vbroadcast128 (\i-7)*16(KEY), V4 _tweak_step (2*(\i-5)) - _vaes \enc, \last, V4, V0 - _vaes \enc, \last, V4, V1 + _vaes \enc, V4, V0 + _vaes \enc, V4, V1 _tweak_step (2*(\i-5) + 1) - _vaes \enc, \last, V4, V2 - _vaes \enc, \last, V4, V3 + _vaes \enc, V4, V2 + _vaes \enc, V4, V3 .endif .endm // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, // then XOR with \tweak again) of the block(s) in \data. To process a single // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of -// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. -.macro _aes_crypt enc, xmm_suffix, tweak, data +// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. +.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp _xor3 KEY0\xmm_suffix, \tweak, \data cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ - _vaes_1x \enc, 0, 1, \xmm_suffix, \data - _vaes_1x \enc, 0, 2, \xmm_suffix, \data + _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp .Laes192\@: - _vaes_1x \enc, 0, 3, \xmm_suffix, \data - _vaes_1x \enc, 0, 4, \xmm_suffix, \data + _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp .Laes128\@: .irp i, 5,6,7,8,9,10,11,12,13 - _vaes_1x \enc, 0, \i, \xmm_suffix, \data + _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp .endr - _vaes_1x \enc, 1, 14, \xmm_suffix, \data - _vpxor \tweak, \data, \data +.if USE_AVX10 + vpxord KEY14\xmm_suffix, \tweak, \tmp +.else +.ifnb \xmm_suffix + vpxor 7*16(KEY), \tweak, \tmp +.else + _vbroadcast128 7*16(KEY), \tmp + vpxor \tweak, \tmp, \tmp +.endif +.endif + _vaeslast \enc, \tmp, \data .endm .macro _aes_xts_crypt enc _define_aliases @@ -586,26 +600,47 @@ cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ // Do all the AES rounds on the data blocks, interleaved with // the computation of the next set of tweaks. - _vaes_4x \enc, 0, 1 - _vaes_4x \enc, 0, 2 + _vaes_4x \enc, 1 + _vaes_4x \enc, 2 .Laes192\@: - _vaes_4x \enc, 0, 3 - _vaes_4x \enc, 0, 4 + _vaes_4x \enc, 3 + _vaes_4x \enc, 4 .Laes128\@: .irp i, 5,6,7,8,9,10,11,12,13 - _vaes_4x \enc, 0, \i + _vaes_4x \enc, \i .endr - _vaes_4x \enc, 1, 14 - - // XOR in the tweaks again. - _vpxor TWEAK0, V0, V0 - _vpxor TWEAK1, V1, V1 - _vpxor TWEAK2, V2, V2 - _vpxor TWEAK3, V3, V3 + // Do the last AES round, then XOR the results with the tweaks again. + // Reduce latency by doing the XOR before the vaesenclast, utilizing the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) + // (and likewise for vaesdeclast). 
+.if USE_AVX10 + _tweak_step 18 + _tweak_step 19 + vpxord TWEAK0, KEY14, V4 + vpxord TWEAK1, KEY14, V5 + _vaeslast \enc, V4, V0 + _vaeslast \enc, V5, V1 + vpxord TWEAK2, KEY14, V4 + vpxord TWEAK3, KEY14, V5 + _vaeslast \enc, V4, V2 + _vaeslast \enc, V5, V3 +.else + _vbroadcast128 7*16(KEY), V4 + _tweak_step 18 // uses V5 + _tweak_step 19 // uses V5 + vpxor TWEAK0, V4, V5 + _vaeslast \enc, V5, V0 + vpxor TWEAK1, V4, V5 + _vaeslast \enc, V5, V1 + vpxor TWEAK2, V4, V5 + vpxor TWEAK3, V4, V4 + _vaeslast \enc, V5, V2 + _vaeslast \enc, V4, V3 +.endif // Store the destination blocks. _vmovdqu V0, 0*VL(DST) _vmovdqu V1, 1*VL(DST) _vmovdqu V2, 2*VL(DST) @@ -638,11 +673,11 @@ .if VL > 16 add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL. jl .Lvec_at_a_time_done\@ .Lvec_at_a_time\@: _vmovdqu (SRC), V0 - _aes_crypt \enc, , TWEAK0, V0 + _aes_crypt \enc, , TWEAK0, V0, tmp=V1 _vmovdqu V0, (DST) _next_tweakvec TWEAK0, V0, V1, TWEAK0 add $VL, SRC add $VL, DST sub $VL, LEN @@ -655,11 +690,11 @@ // En/decrypt any remaining full blocks, one at a time. jl .Lblock_at_a_time_done\@ .Lblock_at_a_time\@: vmovdqu (SRC), %xmm0 - _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 vmovdqu %xmm0, (DST) _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM add $16, SRC add $16, DST sub $16, LEN @@ -683,11 +718,11 @@ // If decrypting, the main loop didn't decrypt the last full block // because CTS decryption uses the last two tweaks in reverse order. // Do it now by advancing the tweak and decrypting the last full block. _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM vmovdqu (SRC), %xmm0 - _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 .endif .if USE_AVX10 // Create a mask that has the first LEN bits set. mov $-1, %r9d @@ -726,11 +761,11 @@ // Do a blend to generate the src partial block followed by the second // part of the en/decryption of the last full block. vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 .endif // En/decrypt again and store the last full block. - _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 vmovdqu %xmm0, (DST) jmp .Ldone\@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,