diff mbox series

[v2,1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

Message ID 20240408140852.RskXwnod7hCkdtYE902ZkKih8SiQytD_B0tynIjrABE@z
State New
Headers show
Series [v2,1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S | expand

Commit Message

Stefan Kanthak April 8, 2024, 2:08 p.m. UTC
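Use shorter SSE2 instructions (punpcklqdq/punpckhqdq) in place of some SSSE3/SSE4.1 ones (palignr/pblendw), and in place of pshufd where only the low quadword of the result is consumed.
Bias the K256 base pointer by 8*16 so that every round-constant load uses a short (8-bit) displacement.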
diff mbox series

Patch

--- -/arch/x86/crypto/sha256_ni_asm.S
+++ +/arch/x86/crypto/sha256_ni_asm.S
@@ -108,17 +108,17 @@ 
 	 * Need to reorder these appropriately
 	 * DCBA, HGFE -> ABEF, CDGH
 	 */
-	movdqu		0*16(DIGEST_PTR), STATE0
-	movdqu		1*16(DIGEST_PTR), STATE1
+	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
+	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */
 
-	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
-	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
 	movdqa		STATE0, MSGTMP4
-	palignr		$8, STATE1,  STATE0		/* ABEF */
-	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
+	punpcklqdq	STATE1, STATE0			/* FEBA */
+	punpckhqdq	MSGTMP4, STATE1			/* DCHG */
+	pshufd		$0x1B, STATE0,  STATE0		/* ABEF */
+	pshufd		$0xB1, STATE1,  STATE1		/* CDGH */
 
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-	lea		K256(%rip), SHA256CONSTANTS
+	lea		K256+8*16(%rip), SHA256CONSTANTS	/* biased base: round offsets span -8*16..7*16 */
 
.Lloop0:
 	/* Save hash values for addition after rounds */
@@ -129,18 +129,18 @@ 
 	movdqu		0*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, MSGTMP0
-		paddd		0*16(SHA256CONSTANTS), MSG
+		paddd		-8*16(SHA256CONSTANTS), MSG	/* K256+0*16 */
 		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG		/* high qword -> low qword, same low half as pshufd $0x0E */
 		sha256rnds2	STATE1, STATE0
 
 	/* Rounds 4-7 */
 	movdqu		1*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, MSGTMP1
-		paddd		1*16(SHA256CONSTANTS), MSG
+		paddd		-7*16(SHA256CONSTANTS), MSG	/* K256+1*16 */
 		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 
@@ -148,9 +148,9 @@ 
 	movdqu		2*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, MSGTMP2
-		paddd		2*16(SHA256CONSTANTS), MSG
+		paddd		-6*16(SHA256CONSTANTS), MSG	/* K256+2*16 */
 		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 
@@ -158,151 +158,151 @@ 
 	movdqu		3*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, MSGTMP3
-		paddd		3*16(SHA256CONSTANTS), MSG
+		paddd		-5*16(SHA256CONSTANTS), MSG	/* K256+3*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 
 	/* Rounds 16-19 */
 	movdqa		MSGTMP0, MSG
-		paddd		4*16(SHA256CONSTANTS), MSG
+		paddd		-4*16(SHA256CONSTANTS), MSG	/* K256+4*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 
 	/* Rounds 20-23 */
 	movdqa		MSGTMP1, MSG
-		paddd		5*16(SHA256CONSTANTS), MSG
+		paddd		-3*16(SHA256CONSTANTS), MSG	/* K256+5*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 
 	/* Rounds 24-27 */
 	movdqa		MSGTMP2, MSG
-		paddd		6*16(SHA256CONSTANTS), MSG
+		paddd		-2*16(SHA256CONSTANTS), MSG	/* K256+6*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 
 	/* Rounds 28-31 */
 	movdqa		MSGTMP3, MSG
-		paddd		7*16(SHA256CONSTANTS), MSG
+		paddd		-1*16(SHA256CONSTANTS), MSG	/* K256+7*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 
 	/* Rounds 32-35 */
 	movdqa		MSGTMP0, MSG
-		paddd		8*16(SHA256CONSTANTS), MSG
+		paddd		0*16(SHA256CONSTANTS), MSG	/* K256+8*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 
 	/* Rounds 36-39 */
 	movdqa		MSGTMP1, MSG
-		paddd		9*16(SHA256CONSTANTS), MSG
+		paddd		1*16(SHA256CONSTANTS), MSG	/* K256+9*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 
 	/* Rounds 40-43 */
 	movdqa		MSGTMP2, MSG
-		paddd		10*16(SHA256CONSTANTS), MSG
+		paddd		2*16(SHA256CONSTANTS), MSG	/* K256+10*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 
 	/* Rounds 44-47 */
 	movdqa		MSGTMP3, MSG
-		paddd		11*16(SHA256CONSTANTS), MSG
+		paddd		3*16(SHA256CONSTANTS), MSG	/* K256+11*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 
 	/* Rounds 48-51 */
 	movdqa		MSGTMP0, MSG
-		paddd		12*16(SHA256CONSTANTS), MSG
+		paddd		4*16(SHA256CONSTANTS), MSG	/* K256+12*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 
 	/* Rounds 52-55 */
 	movdqa		MSGTMP1, MSG
-		paddd		13*16(SHA256CONSTANTS), MSG
+		paddd		5*16(SHA256CONSTANTS), MSG	/* K256+13*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 
 	/* Rounds 56-59 */
 	movdqa		MSGTMP2, MSG
-		paddd		14*16(SHA256CONSTANTS), MSG
+		paddd		6*16(SHA256CONSTANTS), MSG	/* K256+14*16 */
 		sha256rnds2	STATE0, STATE1
 	movdqa		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 
 	/* Rounds 60-63 */
 	movdqa		MSGTMP3, MSG
-		paddd		15*16(SHA256CONSTANTS), MSG
+		paddd		7*16(SHA256CONSTANTS), MSG	/* K256+15*16 */
 		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
+		punpckhqdq	MSG, MSG
 		sha256rnds2	STATE1, STATE0
 
 	/* Add current hash values with previously saved */
@@ -315,11 +315,11 @@ 
 	jne		.Lloop0
 
 	/* Write hash values back in the correct order */
-	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
-	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
-	movdqa		STATE0, MSGTMP4
-	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
-	palignr		$8, MSGTMP4, STATE1		/* HGFE */
+	movdqa		STATE1, MSGTMP4			/* CDGH (STATE0 is ABEF) */
+	punpcklqdq	STATE0, STATE1			/* EFGH */
+	punpckhqdq	MSGTMP4, STATE0			/* CDAB */
+	pshufd		$0xB1, STATE0,  STATE0		/* DCBA: swap dwords within each qword */
+	pshufd		$0x1B, STATE1,  STATE1		/* HGFE: reverse all four dwords */
 
 	movdqu		STATE0, 0*16(DIGEST_PTR)
 	movdqu		STATE1, 1*16(DIGEST_PTR)