[v12,10/18] KVM: x86/mmu: Handle guest page faults for guest_memfd with shared memory

Message ID 20250611133330.1514028-11-tabba@google.com
State New
Series KVM: Mapping guest_memfd backed memory at the host for software protected VMs

Commit Message

Fuad Tabba June 11, 2025, 1:33 p.m. UTC
From: Ackerley Tng <ackerleytng@google.com>

For memslots backed by guest_memfd with shared mem support, the KVM MMU
must always fault in pages from guest_memfd, and not from the host
userspace_addr. Update the fault handler to do so.

This patch also refactors related function names for accuracy:

kvm_mem_is_private() returns true only when the current private/shared
state (in the CoCo sense) of the memory is private, and returns false if
the current state is shared explicitly or implicitly, e.g., because it belongs to a
non-CoCo VM.

kvm_mmu_faultin_pfn_gmem() is updated to indicate that it can be used to
fault in not just private memory, but guest_memfd memory more generally.

Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
 arch/x86/kvm/mmu/mmu.c   | 38 +++++++++++++++++++++++---------------
 include/linux/kvm_host.h | 25 +++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 17 deletions(-)

Comments

Sean Christopherson June 13, 2025, 10:08 p.m. UTC | #1
On Wed, Jun 11, 2025, Fuad Tabba wrote:
> From: Ackerley Tng <ackerleytng@google.com>
> 
> For memslots backed by guest_memfd with shared mem support, the KVM MMU
> must always fault in pages from guest_memfd, and not from the host
> userspace_addr. Update the fault handler to do so.

And with a KVM_MEMSLOT_GUEST_MEMFD_ONLY flag, this becomes super obvious.
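
E.g. the fault-in dispatch then reads something like this (sketch only, reusing
the kvm_is_memslot_gmem_only() name from the suggested code further down):

static bool fault_from_gmem(struct kvm_page_fault *fault)
{
	/* Private faults and gmem-only slots both resolve their pfn via guest_memfd. */
	return fault->is_private || kvm_is_memslot_gmem_only(fault->slot);
}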

> This patch also refactors related function names for accuracy:

This patch.  And phrase changelogs as commands.

> kvm_mem_is_private() returns true only when the current private/shared
> state (in the CoCo sense) of the memory is private, and returns false if
> the current state is shared explicitly or implicitly, e.g., because it belongs to a
> non-CoCo VM.

Again, state changes as commands.  For the above, it's not obvious if you're
talking about the existing code versus the state of things after "this patch".


> kvm_mmu_faultin_pfn_gmem() is updated to indicate that it can be used to
> fault in not just private memory, but guest_memfd memory more generally.

> +static inline u8 kvm_max_level_for_order(int order)

Do not use "inline" for functions that are visible only to the local compilation
unit.  "inline" is just a hint, and modern compilers are smart enough to inline
functions when appropriate without a hint.

A longer explanation/rant here: https://lore.kernel.org/all/ZAdfX+S323JVWNZC@google.com
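
I.e. just:

-static inline u8 kvm_max_level_for_order(int order)
+static u8 kvm_max_level_for_order(int order)

and ditto for kvm_gmem_max_mapping_level() and fault_from_gmem().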

> +static inline int kvm_gmem_max_mapping_level(const struct kvm_memory_slot *slot,
> +					     gfn_t gfn, int max_level)
> +{
> +	int max_order;
>  
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;

This is dead code, the one and only caller has *just* checked for this condition.
>  
> -	host_level = host_pfn_mapping_level(kvm, gfn, slot);
> -	return min(host_level, max_level);
> +	max_order = kvm_gmem_mapping_order(slot, gfn);
> +	return min(max_level, kvm_max_level_for_order(max_order));
>  }

...

> -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
> -					u8 max_level, int gmem_order)
> +static u8 kvm_max_level_for_fault_and_order(struct kvm *kvm,

This is comically verbose.  C ain't Java.  And having two separate helpers makes
it *really* hard to (a) even see there are TWO helpers in the first place, and
(b) understand how they differ.

Gah, and not your bug, but completely ignoring the RMP in kvm_mmu_max_mapping_level()
is wrong.  It "works" because guest_memfd doesn't (yet) support dirty logging,
and no one enables the NX hugepage mitigation on AMD hosts.

We could plumb in the pfn and private info, but I don't really see the point,
at least not at this time.

> +					    struct kvm_page_fault *fault,
> +					    int order)
>  {
> -	u8 req_max_level;
> +	u8 max_level = fault->max_level;
>  
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> -	max_level = min(kvm_max_level_for_order(gmem_order), max_level);
> +	max_level = min(kvm_max_level_for_order(order), max_level);
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> -	req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
> -	if (req_max_level)
> -		max_level = min(max_level, req_max_level);
> +	if (fault->is_private) {
> +		u8 level = kvm_x86_call(private_max_mapping_level)(kvm, fault->pfn);

Hmm, so the interesting thing here is that (IIRC) the RMP restrictions aren't
just on the private pages, they also apply to the HYPERVISOR/SHARED pages.  (Don't
quote me on that).

Regardless, I'm leaning toward dropping the "private" part, and making SNP deal
with the intricacies of the RMP:

	/* Some VM types have additional restrictions, e.g. SNP's RMP. */
	req_max_level = kvm_x86_call(max_mapping_level)(kvm, fault);
	if (req_max_level)
		max_level = min(max_level, req_max_level);

Then we can get to something like:

static int kvm_gmem_max_mapping_level(struct kvm *kvm, int order,
				      struct kvm_page_fault *fault)
{
	int max_level, req_max_level;

	max_level = kvm_max_level_for_order(order);
	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	req_max_level = kvm_x86_call(max_mapping_level)(kvm, fault);
	if (req_max_level)
		max_level = min(max_level, req_max_level);

	return max_level;
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
			      const struct kvm_memory_slot *slot, gfn_t gfn)
{
	int max_level;

	max_level = kvm_lpage_info_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM);
	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	/* TODO: Comment goes here about KVM not supporting this path (yet). */
	if (kvm_mem_is_private(kvm, gfn))
		return PG_LEVEL_4K;

	if (kvm_is_memslot_gmem_only(slot)) {
		int order = kvm_gmem_mapping_order(slot, gfn);

		return min(max_level, kvm_gmem_max_mapping_level(kvm, order, NULL));
	}

	return min(max_level, host_pfn_mapping_level(kvm, gfn, slot));
}

static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
				    struct kvm_page_fault *fault)
{
	struct kvm *kvm = vcpu->kvm;
	int order, r;

	if (!kvm_slot_has_gmem(fault->slot)) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return -EFAULT;
	}

	r = kvm_gmem_get_pfn(kvm, fault->slot, fault->gfn, &fault->pfn,
			     &fault->refcounted_page, &order);
	if (r) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return r;
	}

	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
	fault->max_level = kvm_gmem_max_mapping_level(kvm, order, fault);

	return RET_PF_CONTINUE;
}

int sev_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault)
{
	int level, rc;
	bool assigned;

	if (!sev_snp_guest(kvm))
		return 0;

	if (WARN_ON_ONCE(!fault) || !fault->is_private)
		return 0;

	rc = snp_lookup_rmpentry(fault->pfn, &assigned, &level);
	if (rc || !assigned)
		return PG_LEVEL_4K;

	return level;
}
> +/*
> + * Returns true if the given gfn's private/shared status (in the CoCo sense) is
> + * private.
> + *
> + * A return value of false indicates that the gfn is explicitly or implicitly
> + * shared (i.e., non-CoCo VMs).
> + */
>  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  {
> -	return IS_ENABLED(CONFIG_KVM_GMEM) &&
> -	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +	struct kvm_memory_slot *slot;
> +
> +	if (!IS_ENABLED(CONFIG_KVM_GMEM))
> +		return false;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (kvm_slot_has_gmem(slot) && kvm_gmem_memslot_supports_shared(slot)) {
> +		/*
> +		 * Without in-place conversion support, if a guest_memfd memslot
> +		 * supports shared memory, then all the slot's memory is
> +		 * considered not private, i.e., implicitly shared.
> +		 */
> +		return false;

Why!?!?  Just make sure KVM_MEMORY_ATTRIBUTE_PRIVATE is mutually exclusive with
mappable guest_memfd.  You need to do that no matter what.  Then you don't need
to sprinkle special case code all over the place.
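
Something like this at KVM_SET_MEMORY_ATTRIBUTES time would do it (sketch only;
kvm_range_has_mappable_gmem() is a made-up name for a walk over the memslots
covering [start, end)):

	/* Sketch: kvm_range_has_mappable_gmem() doesn't exist (yet). */
	if ((attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) &&
	    kvm_range_has_mappable_gmem(kvm, start, end))
		return -EINVAL;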

> +	}
> +
> +	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>  }
>  #else
>  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 75b7b02cfcb7..2aab5a00caee 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3291,6 +3291,11 @@  int kvm_mmu_max_mapping_level(struct kvm *kvm,
 	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
 }
 
+static inline bool fault_from_gmem(struct kvm_page_fault *fault)
+{
+	return fault->is_private || kvm_gmem_memslot_supports_shared(fault->slot);
+}
+
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
@@ -4467,21 +4472,25 @@  static inline u8 kvm_max_level_for_order(int order)
 	return PG_LEVEL_4K;
 }
 
-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
-					u8 max_level, int gmem_order)
+static u8 kvm_max_level_for_fault_and_order(struct kvm *kvm,
+					    struct kvm_page_fault *fault,
+					    int order)
 {
-	u8 req_max_level;
+	u8 max_level = fault->max_level;
 
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
-	max_level = min(kvm_max_level_for_order(gmem_order), max_level);
+	max_level = min(kvm_max_level_for_order(order), max_level);
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
-	req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
-	if (req_max_level)
-		max_level = min(max_level, req_max_level);
+	if (fault->is_private) {
+		u8 level = kvm_x86_call(private_max_mapping_level)(kvm, fault->pfn);
+
+		if (level)
+			max_level = min(max_level, level);
+	}
 
 	return max_level;
 }
@@ -4493,10 +4502,10 @@  static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
 				 r == RET_PF_RETRY, fault->map_writable);
 }
 
-static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
-				       struct kvm_page_fault *fault)
+static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
+				    struct kvm_page_fault *fault)
 {
-	int max_order, r;
+	int gmem_order, r;
 
 	if (!kvm_slot_has_gmem(fault->slot)) {
 		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
@@ -4504,15 +4513,14 @@  static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
 	}
 
 	r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
-			     &fault->refcounted_page, &max_order);
+			     &fault->refcounted_page, &gmem_order);
 	if (r) {
 		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 		return r;
 	}
 
 	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
-	fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
-							 fault->max_level, max_order);
+	fault->max_level = kvm_max_level_for_fault_and_order(vcpu->kvm, fault, gmem_order);
 
 	return RET_PF_CONTINUE;
 }
@@ -4522,8 +4530,8 @@  static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
 {
 	unsigned int foll = fault->write ? FOLL_WRITE : 0;
 
-	if (fault->is_private)
-		return kvm_mmu_faultin_pfn_private(vcpu, fault);
+	if (fault_from_gmem(fault))
+		return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
 
 	foll |= FOLL_NOWAIT;
 	fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bba7d2c14177..8f7069385189 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2547,10 +2547,31 @@  bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 					 struct kvm_gfn_range *range);
 
+/*
+ * Returns true if the given gfn's private/shared status (in the CoCo sense) is
+ * private.
+ *
+ * A return value of false indicates that the gfn is explicitly or implicitly
+ * shared (i.e., non-CoCo VMs).
+ */
 static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 {
-	return IS_ENABLED(CONFIG_KVM_GMEM) &&
-	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+	struct kvm_memory_slot *slot;
+
+	if (!IS_ENABLED(CONFIG_KVM_GMEM))
+		return false;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (kvm_slot_has_gmem(slot) && kvm_gmem_memslot_supports_shared(slot)) {
+		/*
+		 * Without in-place conversion support, if a guest_memfd memslot
+		 * supports shared memory, then all the slot's memory is
+		 * considered not private, i.e., implicitly shared.
+		 */
+		return false;
+	}
+
+	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
 }
 #else
 static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)