@@ -72,15 +72,15 @@
.text
/**
* Calculate crc32
- * BUF - buffer (16 bytes aligned)
- * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
+ * BUF - buffer - must be 16 bytes aligned
+ * LEN - sizeof buffer - must be multiple of 16 bytes and greater than 63
* CRC - initial crc32
* return %eax crc32
* uint crc32_pclmul_le_16(unsigned char const *buffer,
* size_t len, uint crc32)
*/
-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
+SYM_FUNC_START(crc32_pclmul_le_16)
movdqa (BUF), %xmm1
movdqa 0x10(BUF), %xmm2
movdqa 0x20(BUF), %xmm3
@@ -46,6 +46,9 @@
#define SCALE_F 16L /* size of xmm register */
#define SCALE_F_MASK (SCALE_F - 1)
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 655 * 1024;
+
u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
static u32 __attribute__((pure))
@@ -55,6 +58,9 @@ static u32 __attribute__((pure))
unsigned int iremainder;
unsigned int prealign;
+ BUILD_BUG_ON(bytes_per_fpu < PCLMUL_MIN_LEN);
+ BUILD_BUG_ON(bytes_per_fpu & SCALE_F_MASK);
+
if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
return crc32_le(crc, p, len);
@@ -70,12 +76,19 @@ static u32 __attribute__((pure))
iquotient = len & (~SCALE_F_MASK);
iremainder = len & SCALE_F_MASK;
- kernel_fpu_begin();
- crc = crc32_pclmul_le_16(p, iquotient, crc);
- kernel_fpu_end();
+ while (iquotient >= PCLMUL_MIN_LEN) {
+ unsigned int chunk = min(iquotient, bytes_per_fpu);
+
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(p, chunk, crc);
+ kernel_fpu_end();
+
+ iquotient -= chunk;
+ p += chunk;
+ }
- if (iremainder)
- crc = crc32_le(crc, p + iquotient, iremainder);
+ if (iquotient || iremainder)
+ crc = crc32_le(crc, p, iquotient + iremainder);
return crc;
}
@@ -120,8 +133,8 @@ static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
}
/* No final XOR 0xFFFFFFFF, like crc32_le */
-static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __crc32_pclmul_finup(const u32 *crcp, const u8 *data,
+ unsigned int len, u8 *out)
{
*(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
return 0;
@@ -45,7 +45,10 @@ asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
-static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 868 * 1024;
+
+static u32 crc32c_intel_le_hw_byte(u32 crc, const unsigned char *data, size_t length)
{
while (length--) {
asm("crc32b %1, %0"
@@ -56,7 +59,7 @@ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t le
return crc;
}
-static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+static u32 __pure crc32c_intel_le_hw(u32 crc, const unsigned char *p, size_t len)
{
unsigned int iquotient = len / SCALE_F;
unsigned int iremainder = len % SCALE_F;
@@ -110,8 +113,8 @@ static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
return 0;
}
-static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __crc32c_intel_finup(const u32 *crcp, const u8 *data,
+ unsigned int len, u8 *out)
{
*(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
@@ -153,29 +156,52 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
{
u32 *crcp = shash_desc_ctx(desc);
+ BUILD_BUG_ON(bytes_per_fpu < CRC32C_PCL_BREAKEVEN);
+ BUILD_BUG_ON(bytes_per_fpu % SCALE_F);
+
/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
- kernel_fpu_end();
+ while (len) {
+ unsigned int chunk = min(len, bytes_per_fpu);
+
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, chunk, *crcp);
+ kernel_fpu_end();
+
+ len -= chunk;
+ data += chunk;
+ }
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
-static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __crc32c_pcl_intel_finup(const u32 *crcp, const u8 *data,
+ unsigned int len, u8 *out)
{
+ u32 crc = *crcp;
+
+ BUILD_BUG_ON(bytes_per_fpu < CRC32C_PCL_BREAKEVEN);
+ BUILD_BUG_ON(bytes_per_fpu % SCALE_F);
+
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
- kernel_fpu_end();
+ while (len) {
+ unsigned int chunk = min(len, bytes_per_fpu);
+
+ kernel_fpu_begin();
+ crc = crc_pcl(data, chunk, crc);
+ kernel_fpu_end();
+
+ len -= chunk;
+ data += chunk;
+ }
+ *(__le32 *)out = ~cpu_to_le32(crc);
} else
*(__le32 *)out =
- ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ ~cpu_to_le32(crc32c_intel_le_hw(crc, data, len));
return 0;
}
@@ -34,6 +34,11 @@
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
+#define PCLMUL_MIN_LEN 16U /* minimum size of buffer for crc_t10dif_pcl */
+
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 614 * 1024;
+
asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
struct chksum_desc_ctx {
@@ -54,11 +59,21 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
- if (length >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
- kernel_fpu_end();
- } else
+ BUILD_BUG_ON(bytes_per_fpu < PCLMUL_MIN_LEN);
+
+ if (length >= PCLMUL_MIN_LEN && crypto_simd_usable()) {
+ while (length >= PCLMUL_MIN_LEN) {
+ unsigned int chunk = min(length, bytes_per_fpu);
+
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, chunk);
+ kernel_fpu_end();
+
+ length -= chunk;
+ data += chunk;
+ }
+ }
+ if (length)
ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
return 0;
}
@@ -73,12 +88,23 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
- if (len >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
- kernel_fpu_end();
- } else
- *(__u16 *)out = crc_t10dif_generic(crc, data, len);
+ BUILD_BUG_ON(bytes_per_fpu < PCLMUL_MIN_LEN);
+
+ if (len >= PCLMUL_MIN_LEN && crypto_simd_usable()) {
+ while (len >= PCLMUL_MIN_LEN) {
+ unsigned int chunk = min(len, bytes_per_fpu);
+
+ kernel_fpu_begin();
+ crc = crc_t10dif_pcl(crc, data, chunk);
+ kernel_fpu_end();
+
+ len -= chunk;
+ data += chunk;
+ }
+ }
+ if (len)
+ crc = crc_t10dif_generic(crc, data, len);
+ *(__u16 *)out = crc;
return 0;
}
Limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls.

Those functions call preempt_disable() and preempt_enable(), so the
CPU core is unavailable for scheduling while running, leading to:
    rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: ...

Fixes: 78c37d191dd6 ("crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation")
Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction")
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform")
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Robert Elliott <elliott@hpe.com>

---
v3 use while loops and static int, simplify one of the loop
   structures, add algorithm-specific limits, use local stack
   variable in crc32c finup rather than the context pointer like
   update uses
---
 arch/x86/crypto/crc32-pclmul_asm.S      |  6 +--
 arch/x86/crypto/crc32-pclmul_glue.c     | 27 +++++++++----
 arch/x86/crypto/crc32c-intel_glue.c     | 52 ++++++++++++++++++-------
 arch/x86/crypto/crct10dif-pclmul_glue.c | 48 +++++++++++++++++------
 4 files changed, 99 insertions(+), 34 deletions(-)