@@ -3,6 +3,7 @@ obj-y += arm/ iommufd/
obj-$(CONFIG_AMD_IOMMU) += amd/
obj-$(CONFIG_INTEL_IOMMU) += intel/
obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
@@ -32,4 +32,17 @@ config IOMMU_PT
IOMMU_PT provides an implementation of the page table operations
related struct iommu_domain using GENERIC_PT to abstract the page
table format.
+
+if IOMMU_PT
+config IOMMU_PT_AMDV1
+ tristate "IOMMU page table for 64 bit AMD IOMMU v1"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ default n
+ help
+	  iommu_domain implementation for the AMD v1 page table. AMDv1 is the
+	  "host" page table. It supports page sizes of almost every power of 2
+	  and decodes a full 64 bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+endif
endif
new file mode 100644
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+
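+# Emit "obj-y += iommu_<fmt>.o" or "obj-m += iommu_<fmt>.o" for each enabled
+# format, e.g. "amdv1" in iommu_pt_fmt-m becomes obj-m += iommu_amdv1.o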
+define create_format
+obj-$(2) += iommu_$(1).o
+
+endef
+
+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
new file mode 100644
@@ -0,0 +1,406 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * AMD IOMMU v1 page table
+ *
+ * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
+ * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
+ *
+ * Note that the level numbering here matches the core code, so level 0 is the
+ * same as mode 1.
+ */
+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H
+
+#include "defs_amdv1.h"
+#include "../pt_defs.h"
+
+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 64,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 5,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+	/* The DTE only has these bits for the top physical address */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* PTE bits */
+enum {
+ AMDV1PT_FMT_PR = BIT(0),
+ AMDV1PT_FMT_D = BIT(6),
+ AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
+ AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
+ AMDV1PT_FMT_FC = BIT_ULL(60),
+ AMDV1PT_FMT_IR = BIT_ULL(61),
+ AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+
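+/*
+ * A next level of 0 marks a terminal PTE at the level's default size, while 7
+ * marks a terminal PTE whose page size is encoded in the low bits of the OA.
+ */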
+/*
+ * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, so
+ * use defines instead of an enum to avoid it.
+ */
+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7
+
+#define common_to_amdv1pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_amdv1, common)
+#define to_amdv1pt(pts) common_to_amdv1pt((pts)->range->common)
+
+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_table_pa amdv1pt_table_pa
+
+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{
+ pt_oaddr_t oa = FIELD_GET(AMDV1PT_FMT_OA, pts->entry);
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+ AMDV1PT_FMT_NL_SIZE) {
+ unsigned int sz_bits = oalog2_ffz(oa);
+
+ oa = oalog2_set_mod(oa, 0, sz_bits);
+ } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+ AMDV1PT_FMT_NL_DEFAULT))
+ return 0;
+ return oalog2_mul(oa, PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa amdv1pt_entry_oa
+
+static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
+{
+ /*
+ * Table 15: Page Table Level Parameters
+	 * The topmost level cannot have translation entries
+ */
+ return pts->level < PT_MAX_TOP_LEVEL;
+}
+#define pt_can_have_leaf amdv1pt_can_have_leaf
+
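+/*
+ * Each level multiplies the item size by 2^9: 4K items at level 0, 2M at
+ * level 1, 1G at level 2, and so on.
+ */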
+static inline unsigned int amdv1pt_table_item_lg2sz(const struct pt_state *pts)
+{
+ return PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
+}
+#define pt_table_item_lg2sz amdv1pt_table_item_lg2sz
+
+static inline unsigned int
+amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ u32 code;
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+ AMDV1PT_FMT_NL_DEFAULT)
+ return ilog2(1);
+
+ PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+ AMDV1PT_FMT_NL_SIZE);
+
+ /*
+ * The contiguous size is encoded in the length of a string of 1's in
+ * the low bits of the OA. Reverse the equation:
+ * code = log2_to_int(num_contig_lg2 + item_lg2sz -
+ * PT_GRANULE_LG2SZ - 1) - 1
+ * Which can be expressed as:
+	 *   num_contig_lg2 = oalog2_ffz(code) + 1 -
+	 *                    (item_lg2sz - PT_GRANULE_LG2SZ)
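+	 *
+	 * For example, a 1M leaf at level 0 stores log2_to_int(20 - 12 - 1) -
+	 * 1 = 0x7f in the low bits of the OA, and oalog2_ffz(0x7f) = 7 gives
+	 * num_contig_lg2 = 7 + 1 - (12 - 12) = 8, ie 256 contiguous 4K items.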
+ *
+ * Assume the bit layout is correct and remove the masking. Reorganize
+ * the equation to move all the arithmetic before the ffz.
+ */
+ code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
+ pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
+ return log2_ffz_t(u32, code);
+}
+#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
+
+static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
+{
+ /*
+	 * The top entry covers bits [63:57] only; this is handled through
+ * max_vasz_lg2.
+ */
+ if (PT_WARN_ON(pts->level == 5))
+ return 7;
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 amdv1pt_num_items_lg2
+
+static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!amdv1pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * Table 14: Example Page Size Encodings
+ * Address bits 51:32 can be used to encode page sizes greater than 4
+ * Gbytes. Address bits 63:52 are zero-extended.
+ *
+	 * 512GB pages are not supported due to a hardware bug.
+ * Otherwise every power of two size is supported.
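+	 * Each level contributes a run of 9 sizes, e.g. 4K through 1M at
+	 * level 0 and 2M through 512M at level 1, so the levels together
+	 * cover every power of 2 up to the maximum.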
+ */
+ return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
+ isz_lg2) & ~SZ_512G;
+}
+#define pt_possible_sizes amdv1pt_possible_sizes
+
+static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ unsigned int next_level;
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(*tablep);
+ if (!(entry & AMDV1PT_FMT_PR))
+ return PT_ENTRY_EMPTY;
+
+ next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
+ if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
+ next_level == AMDV1PT_FMT_NL_SIZE)
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw amdv1pt_load_entry_raw
+
+static inline void
+amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 entry;
+
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+
+ if (oasz_lg2 == isz_lg2) {
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_DEFAULT);
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_SIZE) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
+ 1) -
+ 1);
+
+ /* See amdv1pt_clear_entry() */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ memset64(tablep, entry, log2_to_int(num_contig_lg2));
+ }
+ }
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry amdv1pt_install_leaf_entry
+
+static inline bool amdv1pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ /*
+ * IR and IW are ANDed from the table levels along with the PTE. We
+ * always control permissions from the PTE, so always set IR and IW for
+ * tables.
+ */
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ log2_div(table_pa, PT_GRANULE_LG2SZ)) |
+ AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table amdv1pt_install_table
+
+static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits =
+ pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
+}
+#define pt_attr_from_entry amdv1pt_attr_from_entry
+
+static inline void amdv1pt_clear_entry(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+	/*
+	 * gcc generates rep stos for the io-pgtable version of this code, and
+	 * the difference can show up in microbenchmarks with larger contiguous
+	 * page sizes. rep is slower for small counts, so open code those.
+	 */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+ } else {
+ memset64(tablep, 0, log2_to_int(num_contig_lg2));
+ }
+}
+#define pt_clear_entry amdv1pt_clear_entry
+
+static inline bool amdv1pt_entry_write_is_dirty(const struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
+ return true;
+ return false;
+}
+#define pt_entry_write_is_dirty amdv1pt_entry_write_is_dirty
+
+static inline void amdv1pt_entry_set_write_clean(struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
+}
+#define pt_entry_set_write_clean amdv1pt_entry_set_write_clean
+
+static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | AMDV1PT_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_amdv1
+
+/* struct pt_common is embedded in the per-format struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
+ ->amdpt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
+}
+
+static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
+ pte |= AMDV1PT_FMT_FC;
+ if (iommu_prot & IOMMU_READ)
+ pte |= AMDV1PT_FMT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= AMDV1PT_FMT_IW;
+
+	/*
+	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+	 * control this. For now, if the tables use __sme_set() then so do
+	 * the PTEs.
+	 */
+ if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot amdv1pt_iommu_set_prot
+
+static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
+ const struct pt_iommu_amdv1_cfg *cfg)
+{
+ struct pt_amdv1 *table = &iommu_table->amdpt;
+ unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+
+ if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
+ return -EINVAL;
+
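+	/*
+	 * Without PT_FEAT_DYNAMIC_TOP the table cannot grow, so the VA space
+	 * is limited by the starting level to 12 + 9 * (starting_level + 1)
+	 * bits, e.g. a 39 bit IOVA for starting_level == 2 (mode 3).
+	 */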
+ if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
+ cfg->starting_level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
+ (cfg->starting_level + 1);
+
+ table->common.max_vasz_lg2 =
+ min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ pt_top_set_level(&table->common, cfg->starting_level);
+ return 0;
+}
+#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
+
+static inline void
+amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_amdv1_hw_info *info)
+{
+ info->host_pt_root = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
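+	/* The DTE mode field is the core level plus 1 (level 0 == mode 1) */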
+ info->mode = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif /* __GENERIC_PT_FMT_AMDV1_H */
new file mode 100644
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
+#define __GENERIC_PT_FMT_DEFS_AMDV1_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
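+/* pt_vaddr_t holds IOVAs, pt_oaddr_t holds output (physical) addresses */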
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct amdv1pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs amdv1pt_write_attrs
+
+#endif /* __GENERIC_PT_FMT_DEFS_AMDV1_H */
new file mode 100644
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT amdv1
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \
+ BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+#define PT_FORCE_ENABLED_FEATURES \
+ (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+
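+/* Instantiate the pt_iommu_amdv1_* entry points from the shared template */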
+#include "iommu_template.h"
@@ -131,4 +131,23 @@ enum pt_features {
PT_FEAT_FMT_START,
};
+struct pt_amdv1 {
+ struct pt_common common;
+};
+
+enum {
+ /*
+ * The memory backing the tables is encrypted. Use __sme_set() to adjust
+ * the page table pointers in the tree. This only works with
+ * CONFIG_AMD_MEM_ENCRYPT.
+ */
+ PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+ /*
+ * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+ * snoop. This is set either at creation time or before the first map
+ * operation.
+ */
+ PT_FEAT_AMDV1_FORCE_COHERENCE,
+};
+
#endif
@@ -115,4 +115,37 @@ struct pt_iommu_cfg {
u8 hw_max_oasz_lg2;
};
+/* Generate the exported function signatures from iommu_pt.h */
+#define IOMMU_PROTOTYPES(fmt) \
+ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \
+ const struct pt_iommu_##fmt##_cfg *cfg, \
+ gfp_t gfp); \
+ void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \
+ struct pt_iommu_##fmt##_hw_info *info)
+#define IOMMU_FORMAT(fmt, member) \
+ struct pt_iommu_##fmt { \
+ struct pt_iommu iommu; \
+ struct pt_##fmt member; \
+ }; \
+ IOMMU_PROTOTYPES(fmt)
+
+struct pt_iommu_amdv1_cfg {
+ struct pt_iommu_cfg common;
+ unsigned int starting_level;
+};
+
+struct pt_iommu_amdv1_hw_info {
+ u64 host_pt_root;
+ u8 mode;
+};
+
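+/*
+ * Defines struct pt_iommu_amdv1 { struct pt_iommu iommu; struct pt_amdv1
+ * amdpt; } along with the pt_iommu_amdv1_init() and
+ * pt_iommu_amdv1_hw_info() prototypes.
+ */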
+IOMMU_FORMAT(amdv1, amdpt);
+
+#undef IOMMU_PROTOTYPES
+#undef IOMMU_FORMAT
#endif