diff mbox series

[v3] riscv: Optimize crct10dif with zbc extension

Message ID 20250205065815.91132-1-zhihang.shao.iscas@gmail.com
State New
Headers show
Series [v3] riscv: Optimize crct10dif with zbc extension | expand

Commit Message

Zhihang Shao Feb. 5, 2025, 6:58 a.m. UTC
The current CRC-T10DIF algorithm on RISC-V platform is based on
table-lookup optimization.
Given the previous work on optimizing crc32 calculations with zbc
extension, it is believed that this will be equally effective for
accelerating crc-t10dif.

Therefore this patch offers an implementation of crc-t10dif using zbc
extension. This can detect whether the current runtime environment
supports zbc feature and, if so, uses it to accelerate crc-t10dif
calculations.

This patch is updated due to the patchset of updating kernel's
CRC-T10DIF library in 6.14, which is finished by Eric Biggers.
Also, I used crc_kunit.c to test the performance of crc-t10dif optimized
by crc extension.

Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
---
 arch/riscv/Kconfig                |   1 +
 arch/riscv/lib/Makefile           |   1 +
 arch/riscv/lib/crc-t10dif-riscv.c | 132 ++++++++++++++++++++++++++++++
 3 files changed, 134 insertions(+)
 create mode 100644 arch/riscv/lib/crc-t10dif-riscv.c

Comments

Eric Biggers Feb. 5, 2025, 4:30 p.m. UTC | #1
On Wed, Feb 05, 2025 at 02:58:15PM +0800, Zhihang Shao wrote:
> The current CRC-T10DIF algorithm on RISC-V platform is based on
> table-lookup optimization.
> Given the previous work on optimizing crc32 calculations with zbc
> extension, it is believed that this will be equally effective for
> accelerating crc-t10dif.
> 
> Therefore this patch offers an implementation of crc-t10dif using zbc
> extension. This can detect whether the current runtime environment
> supports zbc feature and, if so, uses it to accelerate crc-t10dif
> calculations.
> 
> This patch is updated due to the patchset of updating kernel's
> CRC-T10DIF library in 6.14, which is finished by Eric Biggers.
> Also, I used crc_kunit.c to test the performance of crc-t10dif optimized
> by crc extension.
> 
> Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
> ---
>  arch/riscv/Kconfig                |   1 +
>  arch/riscv/lib/Makefile           |   1 +
>  arch/riscv/lib/crc-t10dif-riscv.c | 132 ++++++++++++++++++++++++++++++
>  3 files changed, 134 insertions(+)
>  create mode 100644 arch/riscv/lib/crc-t10dif-riscv.c

Acked-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Eric Biggers <ebiggers@kernel.org>

This can go through the riscv tree.

Some minor comments below.

> +static inline u64 crct10dif_prep(u16 crc, unsigned long const *ptr)
> +{
> +	return ((u64)crc << 48) ^ (__force u64)__cpu_to_be64(*ptr);
> +}
> +
> +#elif __riscv_xlen == 32
> +#define STEP_ORDER 2
> +#define CRCT10DIF_POLY_QT_BE 0xf65a57f8
> +
> +static inline u32 crct10dif_prep(u16 crc, unsigned long const *ptr)
> +{
> +	return ((u32)crc << 16) ^ (__force u32)__cpu_to_be32(*ptr);
> +}

Maybe use 'const __be64 *' and 'const __be32 *' for the pointer, and use
be64_to_cpu() and be32_to_cpu().  Then the __force cast won't be needed.

> +static inline u16 crct10dif_zbc(unsigned long s)
> +{
> +	u16 crc;
> +
> +	asm volatile   (".option push\n"
> +			".option arch,+zbc\n"
> +			"clmulh %0, %1, %2\n"
> +			"xor    %0, %0, %1\n"
> +			"clmul  %0, %0, %3\n"
> +			".option pop\n"
> +			: "=&r" (crc)
> +			: "r"(s),
> +			  "r"(CRCT10DIF_POLY_QT_BE),
> +			  "r"(CRCT10DIF_POLY)
> +			:);
> +
> +	return crc;
> +}

A comment mentioning that this is using Barrett reduction would be helpful.

BTW, this is fine for an initial implementation, but eventually we'll probably
want to make it fold multiple words at a time to take advantage of
instruction-level parallelism, like the x86 PCLMULQDQ code does.  We also might
be able to share code among all the CRC variants like what I'm doing for x86.

> +#define STEP (1 << STEP_ORDER)
> +#define OFFSET_MASK (STEP - 1)

You can just remove the above #defines and use 'sizeof(unsigned long)' directly.
You can even use the % and / operators since the compiler optimizes them.
arch/x86/lib/crc32-glue.c does this.

> +	p = (unsigned char const *)p_ul;

Please use 'const u8 *'.

Thanks!

- Eric
Eric Biggers Feb. 6, 2025, 8:19 p.m. UTC | #2
On Wed, Feb 05, 2025 at 08:30:12AM -0800, Eric Biggers wrote:
> On Wed, Feb 05, 2025 at 02:58:15PM +0800, Zhihang Shao wrote:
> > The current CRC-T10DIF algorithm on RISC-V platform is based on
> > table-lookup optimization.
> > Given the previous work on optimizing crc32 calculations with zbc
> > extension, it is believed that this will be equally effective for
> > accelerating crc-t10dif.
> > 
> > Therefore this patch offers an implementation of crc-t10dif using zbc
> > extension. This can detect whether the current runtime environment
> > supports zbc feature and, if so, uses it to accelerate crc-t10dif
> > calculations.
> > 
> > This patch is updated due to the patchset of updating kernel's
> > CRC-T10DIF library in 6.14, which is finished by Eric Biggers.
> > Also, I used crc_kunit.c to test the performance of crc-t10dif optimized
> > by crc extension.
> > 
> > Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
> > ---
> >  arch/riscv/Kconfig                |   1 +
> >  arch/riscv/lib/Makefile           |   1 +
> >  arch/riscv/lib/crc-t10dif-riscv.c | 132 ++++++++++++++++++++++++++++++
> >  3 files changed, 134 insertions(+)
> >  create mode 100644 arch/riscv/lib/crc-t10dif-riscv.c
> 
> Acked-by: Eric Biggers <ebiggers@kernel.org>
> Tested-by: Eric Biggers <ebiggers@kernel.org>
> 
> This can go through the riscv tree.
> 

Actually, if people don't mind I'd like to take this through the crc tree.  Due
to https://lore.kernel.org/r/20250206173857.39794-1-ebiggers@kernel.org the
function crc_t10dif_is_optimized() becomes unused and we should remove it, which
would conflict with this patch which adds another implementation of it.

- Eric
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7612c52e9b1e..db1cf9666dfd 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -25,6 +25,7 @@  config RISCV
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
+	select ARCH_HAS_CRC_T10DIF if RISCV_ISA_ZBC
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
 	select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 79368a895fee..689895b271bd 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -16,6 +16,7 @@  lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
 obj-$(CONFIG_CRC32_ARCH)	+= crc32-riscv.o
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-riscv.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_RISCV_ISA_V)	+= xor.o
 lib-$(CONFIG_RISCV_ISA_V)	+= riscv_v_helpers.o
diff --git a/arch/riscv/lib/crc-t10dif-riscv.c b/arch/riscv/lib/crc-t10dif-riscv.c
new file mode 100644
index 000000000000..b1ff7309f0f5
--- /dev/null
+++ b/arch/riscv/lib/crc-t10dif-riscv.c
@@ -0,0 +1,132 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF implementation with RISC-V Zbc extension.
+ *
+ * Copyright (C) 2024 Institute of Software, CAS.
+ */
+
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+#include <asm/hwcap.h>
+
+#include <linux/byteorder/generic.h>
+#include <linux/crc-t10dif.h>
+#include <linux/minmax.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CRCT10DIF_POLY 0x8bb7
+
+#if __riscv_xlen == 64
+#define STEP_ORDER 3
+
+#define CRCT10DIF_POLY_QT_BE 0xf65a57f81d33a48a
+
+static inline u64 crct10dif_prep(u16 crc, unsigned long const *ptr)
+{
+	return ((u64)crc << 48) ^ (__force u64)__cpu_to_be64(*ptr);
+}
+
+#elif __riscv_xlen == 32
+#define STEP_ORDER 2
+#define CRCT10DIF_POLY_QT_BE 0xf65a57f8
+
+static inline u32 crct10dif_prep(u16 crc, unsigned long const *ptr)
+{
+	return ((u32)crc << 16) ^ (__force u32)__cpu_to_be32(*ptr);
+}
+
+#else
+#error "Unexpected __riscv_xlen"
+#endif
+
+static inline u16 crct10dif_zbc(unsigned long s)
+{
+	u16 crc;
+
+	asm volatile   (".option push\n"
+			".option arch,+zbc\n"
+			"clmulh %0, %1, %2\n"
+			"xor    %0, %0, %1\n"
+			"clmul  %0, %0, %3\n"
+			".option pop\n"
+			: "=&r" (crc)
+			: "r"(s),
+			  "r"(CRCT10DIF_POLY_QT_BE),
+			  "r"(CRCT10DIF_POLY)
+			:);
+
+	return crc;
+}
+
+#define STEP (1 << STEP_ORDER)
+#define OFFSET_MASK (STEP - 1)
+
+static inline u16 crct10dif_unaligned(u16 crc, const u8 *p, size_t len)
+{
+	size_t bits = len * 8;
+	unsigned long s = 0;
+	u16 crc_low = 0;
+
+	for (int i = 0; i < len; i++)
+		s = *p++ | (s << 8);
+
+	if (len < sizeof(u16)) {
+		s ^= crc >> (16 - bits);
+		crc_low = crc << bits;
+	} else {
+		s ^= (unsigned long)crc << (bits - 16);
+	}
+
+	crc = crct10dif_zbc(s);
+	crc ^= crc_low;
+
+	return crc;
+}
+
+u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+	size_t offset, head_len, tail_len;
+	unsigned long const *p_ul;
+	unsigned long s;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1)
+		 : : : : legacy);
+
+	offset = (unsigned long)p & OFFSET_MASK;
+	if (offset && len) {
+		head_len = min(STEP - offset, len);
+		crc = crct10dif_unaligned(crc, p, head_len);
+		p += head_len;
+		len -= head_len;
+	}
+
+	tail_len = len & OFFSET_MASK;
+	len = len >> STEP_ORDER;
+	p_ul = (unsigned long const *)p;
+
+	for (int i = 0; i < len; i++) {
+		s = crct10dif_prep(crc, p_ul);
+		crc = crct10dif_zbc(s);
+		p_ul++;
+	}
+
+	p = (unsigned char const *)p_ul;
+	if (tail_len)
+		crc = crct10dif_unaligned(crc, p, tail_len);
+
+	return crc;
+legacy:
+	return crc_t10dif_generic(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+bool crc_t10dif_is_optimized(void)
+{
+	return riscv_has_extension_likely(RISCV_ISA_EXT_ZBC);
+}
+EXPORT_SYMBOL(crc_t10dif_is_optimized);
+
+MODULE_DESCRIPTION("CRC-T10DIF using RISC-V ZBC Extension");
+MODULE_LICENSE("GPL");