
[RFC,v4,13/14] KVM: arm64: Handle guest_memfd()-backed guest page faults

Message ID 20241213164811.2006197-14-tabba@google.com
State New
Series KVM: Restricted mapping of guest_memfd at the host and arm64 support

Commit Message

Fuad Tabba Dec. 13, 2024, 4:48 p.m. UTC
Add arm64 support for resolving guest page faults on
guest_memfd() backed memslots. This support is not contingent on
pKVM, or other confidential computing support, and works in both
VHE and nVHE modes.

Without confidential computing, this support is useful for
testing and debugging. In the future, it might also be useful
should a user want to use guest_memfd() for all code, whether
it's for a protected guest or not.

For now, the fault granule is restricted to PAGE_SIZE.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 arch/arm64/kvm/mmu.c | 111 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 2 deletions(-)

Comments

Patrick Roy Jan. 16, 2025, 2:48 p.m. UTC | #1
On Fri, 2024-12-13 at 16:48 +0000, Fuad Tabba wrote:
> Add arm64 support for resolving guest page faults on
> guest_memfd() backed memslots. This support is not contingent on
> pKVM, or other confidential computing support, and works in both
> VHE and nVHE modes.
> 
> Without confidential computing, this support is useful for
> testing and debugging. In the future, it might also be useful
> should a user want to use guest_memfd() for all code, whether
> it's for a protected guest or not.
> 
> For now, the fault granule is restricted to PAGE_SIZE.
> 
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
>  arch/arm64/kvm/mmu.c | 111 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 109 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 342a9bd3848f..1c4b3871967c 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1434,6 +1434,107 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>         return vma->vm_flags & VM_MTE_ALLOWED;
>  }
> 
> +static int guest_memfd_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> +                            struct kvm_memory_slot *memslot, bool fault_is_perm)
> +{
> +       struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> +       bool exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> +       bool logging_active = memslot_is_logging(memslot);
> +       struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
> +       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> +       bool write_fault = kvm_is_write_fault(vcpu);
> +       struct mm_struct *mm = current->mm;
> +       gfn_t gfn = gpa_to_gfn(fault_ipa);
> +       struct kvm *kvm = vcpu->kvm;
> +       struct page *page;
> +       kvm_pfn_t pfn;
> +       int ret;
> +
> +       /* For now, guest_memfd() only supports PAGE_SIZE granules. */
> +       if (WARN_ON_ONCE(fault_is_perm &&
> +                        kvm_vcpu_trap_get_perm_fault_granule(vcpu) != PAGE_SIZE)) {
> +               return -EFAULT;
> +       }
> +
> +       VM_BUG_ON(write_fault && exec_fault);
> +
> +       if (fault_is_perm && !write_fault && !exec_fault) {
> +               kvm_err("Unexpected L2 read permission error\n");
> +               return -EFAULT;
> +       }
> +
> +       /*
> +        * Permission faults just need to update the existing leaf entry,
> +        * and so normally don't require allocations from the memcache. The
> +        * only exception to this is when dirty logging is enabled at runtime
> +        * and a write fault needs to collapse a block entry into a table.
> +        */
> +       if (!fault_is_perm || (logging_active && write_fault)) {
> +               ret = kvm_mmu_topup_memory_cache(memcache,
> +                                                kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       /*
> +        * Holds the folio lock until mapped in the guest and its refcount is
> +        * stable, to avoid races with paths that check if the folio is mapped
> +        * by the host.
> +        */
> +       ret = kvm_gmem_get_pfn_locked(kvm, memslot, gfn, &pfn, &page, NULL);
> +       if (ret)
> +               return ret;
> +
> +       if (!kvm_slot_gmem_is_guest_mappable(memslot, gfn)) {
> +               ret = -EAGAIN;
> +               goto unlock_page;
> +       }
> +
> +       /*
> +        * Once it's faulted in, a guest_memfd() page will stay in memory.
> +        * Therefore, count it as locked.
> +        */
> +       if (!fault_is_perm) {
> +               ret = account_locked_vm(mm, 1, true);
> +               if (ret)
> +                       goto unlock_page;
> +       }
> +
> +       read_lock(&kvm->mmu_lock);
> +       if (write_fault)
> +               prot |= KVM_PGTABLE_PROT_W;
> +
> +       if (exec_fault)
> +               prot |= KVM_PGTABLE_PROT_X;
> +
> +       if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
> +               prot |= KVM_PGTABLE_PROT_X;
> +
> +       /*
> +        * Under the premise of getting a FSC_PERM fault, we just need to relax
> +        * permissions.
> +        */
> +       if (fault_is_perm)
> +               ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
> +       else
> +               ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
> +                                       __pfn_to_phys(pfn), prot,
> +                                       memcache,
> +                                       KVM_PGTABLE_WALK_HANDLE_FAULT |
> +                                       KVM_PGTABLE_WALK_SHARED);
> +
> +       kvm_release_faultin_page(kvm, page, !!ret, write_fault);
> +       read_unlock(&kvm->mmu_lock);
> +
> +       if (ret && !fault_is_perm)
> +               account_locked_vm(mm, 1, false);
> +unlock_page:
> +       unlock_page(page);
> +       put_page(page);

There's a double-free of `page` here, as kvm_release_faultin_page
already calls put_page. I fixed it up locally with

+       unlock_page(page);
 	kvm_release_faultin_page(kvm, page, !!ret, write_fault);
 	read_unlock(&kvm->mmu_lock);
 
 	if (ret && !fault_is_perm)
 		account_locked_vm(mm, 1, false);
+       goto out;
+
 unlock_page:
 	unlock_page(page);
 	put_page(page);
-
+out:
 	return ret != -EAGAIN ? ret : 0;
 }

which I'm admittedly not sure is correct either because now the locks
don't get released in reverse order of acquisition, but with this I
was able to boot simple VMs.
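
For reference, this is how the tail of guest_memfd_abort() reads with that
local fix folded in (a sketch of the resulting code only, based on the
helpers used in this patch, not the actual respin). The key point is that
kvm_release_faultin_page() already drops the page reference, so only the
early-exit path keeps an explicit put_page():

	if (fault_is_perm)
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
	else
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
					__pfn_to_phys(pfn), prot,
					memcache,
					KVM_PGTABLE_WALK_HANDLE_FAULT |
					KVM_PGTABLE_WALK_SHARED);

	/* Unlock before kvm_release_faultin_page() drops the reference. */
	unlock_page(page);
	kvm_release_faultin_page(kvm, page, !!ret, write_fault);
	read_unlock(&kvm->mmu_lock);

	if (ret && !fault_is_perm)
		account_locked_vm(mm, 1, false);
	goto out;

unlock_page:
	/* Early exit: the page was never passed to kvm_release_faultin_page(). */
	unlock_page(page);
	put_page(page);
out:
	return ret != -EAGAIN ? ret : 0;
}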

> +
> +       return ret != -EAGAIN ? ret : 0;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                           struct kvm_s2_trans *nested,
>                           struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1900,8 +2001,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>                 goto out_unlock;
>         }
> 
> -       ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> -                            esr_fsc_is_permission_fault(esr));
> +       if (kvm_slot_can_be_private(memslot)) {

For my setup, I needed

if (kvm_mem_is_private(vcpu->kvm, gfn))

here instead, because I am making use of KVM_GENERIC_MEMORY_ATTRIBUTES,
and had a memslot with the `KVM_MEM_GUEST_MEMFD` flag set, but whose
gfn range wasn't actually set to KVM_MEMORY_ATTRIBUTE_PRIVATE.

If I'm reading patch 12 correctly, your memslots always set only one of
userspace_addr or guest_memfd, and the stage 2 table setup simply checks
which one is the case to decide what to fault in, so maybe to support
both cases, this check should be

if (kvm_mem_is_private(vcpu->kvm, gfn) || (kvm_slot_can_be_private(memslot) && !memslot->userspace_addr))

?

[1]: https://lore.kernel.org/all/20240801090117.3841080-1-tabba@google.com/
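
For illustration, the dispatch in kvm_handle_guest_abort() with that combined
check would look roughly as follows. This is only a sketch built from the
helpers in this series; kvm_mem_is_private() is only meaningful when
KVM_GENERIC_MEMORY_ATTRIBUTES is enabled, and gfn is assumed to already be
derived from fault_ipa earlier in the fault path:

	/* Route gmem-backed faults to the guest_memfd handler, everything else
	 * to the regular user memory abort path. */
	if (kvm_mem_is_private(vcpu->kvm, gfn) ||
	    (kvm_slot_can_be_private(memslot) && !memslot->userspace_addr)) {
		ret = guest_memfd_abort(vcpu, fault_ipa, memslot,
					esr_fsc_is_permission_fault(esr));
	} else {
		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
				     esr_fsc_is_permission_fault(esr));
	}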

> +               ret = guest_memfd_abort(vcpu, fault_ipa, memslot,
> +                                       esr_fsc_is_permission_fault(esr));
> +       } else {
> +               ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> +                                    esr_fsc_is_permission_fault(esr));
> +       }
> +
>         if (ret == 0)
>                 ret = 1;
>  out:
> --
> 2.47.1.613.gc27f4b7a9f-goog

Best,
Patrick
Fuad Tabba Jan. 16, 2025, 3:16 p.m. UTC | #2
Hi Patrick,

On Thu, 16 Jan 2025 at 14:48, Patrick Roy <roypat@amazon.co.uk> wrote:
>
> On Fri, 2024-12-13 at 16:48 +0000, Fuad Tabba wrote:
> > Add arm64 support for resolving guest page faults on
> > guest_memfd() backed memslots. This support is not contingent on
> > pKVM, or other confidential computing support, and works in both
> > VHE and nVHE modes.
> >
> > Without confidential computing, this support is useful for
> > testing and debugging. In the future, it might also be useful
> > should a user want to use guest_memfd() for all code, whether
> > it's for a protected guest or not.
> >
> > For now, the fault granule is restricted to PAGE_SIZE.
> >
> > Signed-off-by: Fuad Tabba <tabba@google.com>
> > ---
> >  arch/arm64/kvm/mmu.c | 111 ++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 109 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 342a9bd3848f..1c4b3871967c 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1434,6 +1434,107 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> >         return vma->vm_flags & VM_MTE_ALLOWED;
> >  }
> >
> > +static int guest_memfd_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > +                            struct kvm_memory_slot *memslot, bool fault_is_perm)
> > +{
> > +       struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> > +       bool exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> > +       bool logging_active = memslot_is_logging(memslot);
> > +       struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
> > +       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > +       bool write_fault = kvm_is_write_fault(vcpu);
> > +       struct mm_struct *mm = current->mm;
> > +       gfn_t gfn = gpa_to_gfn(fault_ipa);
> > +       struct kvm *kvm = vcpu->kvm;
> > +       struct page *page;
> > +       kvm_pfn_t pfn;
> > +       int ret;
> > +
> > +       /* For now, guest_memfd() only supports PAGE_SIZE granules. */
> > +       if (WARN_ON_ONCE(fault_is_perm &&
> > +                        kvm_vcpu_trap_get_perm_fault_granule(vcpu) != PAGE_SIZE)) {
> > +               return -EFAULT;
> > +       }
> > +
> > +       VM_BUG_ON(write_fault && exec_fault);
> > +
> > +       if (fault_is_perm && !write_fault && !exec_fault) {
> > +               kvm_err("Unexpected L2 read permission error\n");
> > +               return -EFAULT;
> > +       }
> > +
> > +       /*
> > +        * Permission faults just need to update the existing leaf entry,
> > +        * and so normally don't require allocations from the memcache. The
> > +        * only exception to this is when dirty logging is enabled at runtime
> > +        * and a write fault needs to collapse a block entry into a table.
> > +        */
> > +       if (!fault_is_perm || (logging_active && write_fault)) {
> > +               ret = kvm_mmu_topup_memory_cache(memcache,
> > +                                                kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
> > +               if (ret)
> > +                       return ret;
> > +       }
> > +
> > +       /*
> > +        * Holds the folio lock until mapped in the guest and its refcount is
> > +        * stable, to avoid races with paths that check if the folio is mapped
> > +        * by the host.
> > +        */
> > +       ret = kvm_gmem_get_pfn_locked(kvm, memslot, gfn, &pfn, &page, NULL);
> > +       if (ret)
> > +               return ret;
> > +
> > +       if (!kvm_slot_gmem_is_guest_mappable(memslot, gfn)) {
> > +               ret = -EAGAIN;
> > +               goto unlock_page;
> > +       }
> > +
> > +       /*
> > +        * Once it's faulted in, a guest_memfd() page will stay in memory.
> > +        * Therefore, count it as locked.
> > +        */
> > +       if (!fault_is_perm) {
> > +               ret = account_locked_vm(mm, 1, true);
> > +               if (ret)
> > +                       goto unlock_page;
> > +       }
> > +
> > +       read_lock(&kvm->mmu_lock);
> > +       if (write_fault)
> > +               prot |= KVM_PGTABLE_PROT_W;
> > +
> > +       if (exec_fault)
> > +               prot |= KVM_PGTABLE_PROT_X;
> > +
> > +       if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
> > +               prot |= KVM_PGTABLE_PROT_X;
> > +
> > +       /*
> > +        * Under the premise of getting a FSC_PERM fault, we just need to relax
> > +        * permissions.
> > +        */
> > +       if (fault_is_perm)
> > +               ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
> > +       else
> > +               ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
> > +                                       __pfn_to_phys(pfn), prot,
> > +                                       memcache,
> > +                                       KVM_PGTABLE_WALK_HANDLE_FAULT |
> > +                                       KVM_PGTABLE_WALK_SHARED);
> > +
> > +       kvm_release_faultin_page(kvm, page, !!ret, write_fault);
> > +       read_unlock(&kvm->mmu_lock);
> > +
> > +       if (ret && !fault_is_perm)
> > +               account_locked_vm(mm, 1, false);
> > +unlock_page:
> > +       unlock_page(page);
> > +       put_page(page);
>
> There's a double-free of `page` here, as kvm_release_faultin_page
> already calls put_page. I fixed it up locally with
>
> +       unlock_page(page);
>         kvm_release_faultin_page(kvm, page, !!ret, write_fault);
>         read_unlock(&kvm->mmu_lock);
>
>         if (ret && !fault_is_perm)
>                 account_locked_vm(mm, 1, false);
> +       goto out;
> +
>  unlock_page:
>         unlock_page(page);
>         put_page(page);
> -
> +out:
>         return ret != -EAGAIN ? ret : 0;
>  }
>
> which I'm admittedly not sure is correct either because now the locks
> don't get released in reverse order of acquisition, but with this I
> was able to boot simple VMs.

Thanks for that. You're right, I broke this code right before sending
out the series while fixing a merge conflict. I have prepared a new
patch series (rebased on Linux 6.13-rc7), with this redone to be part
of user_mem_abort(), as opposed to being in its own function. That makes
the code cleaner and more maintainable.


> > +
> > +       return ret != -EAGAIN ? ret : 0;
> > +}
> > +
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                           struct kvm_s2_trans *nested,
> >                           struct kvm_memory_slot *memslot, unsigned long hva,
> > @@ -1900,8 +2001,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> >                 goto out_unlock;
> >         }
> >
> > -       ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> > -                            esr_fsc_is_permission_fault(esr));
> > +       if (kvm_slot_can_be_private(memslot)) {
>
> For my setup, I needed
>
> if (kvm_mem_is_private(vcpu->kvm, gfn))
>
> here instead, because I am making use of KVM_GENERIC_MEMORY_ATTRIBUTES,
> and had a memslot with the `KVM_MEM_GUEST_MEMFD` flag set, but whose
> gfn range wasn't actually set to KVM_MEMORY_ATTRIBUTE_PRIVATE.
>
> If I'm reading patch 12 correctly, your memslots always set only one of
> userspace_addr or guest_memfd, and the stage 2 table setup simply checks
> which one is the case to decide what to fault in, so maybe to support
> both cases, this check should be
>
> if (kvm_mem_is_private(vcpu->kvm, gfn) || (kvm_slot_can_be_private(memslot) && !memslot->userspace_addr))
>
> ?

I've actually missed supporting both cases, and I think your
suggestion is the right way to do it. I'll fix it in the respin.

Cheers,
/fuad





>
> [1]: https://lore.kernel.org/all/20240801090117.3841080-1-tabba@google.com/
>
> > +               ret = guest_memfd_abort(vcpu, fault_ipa, memslot,
> > +                                       esr_fsc_is_permission_fault(esr));
> > +       } else {
> > +               ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> > +                                    esr_fsc_is_permission_fault(esr));
> > +       }
> > +
> >         if (ret == 0)
> >                 ret = 1;
> >  out:
> > --
> > 2.47.1.613.gc27f4b7a9f-goog
>
> Best,
> Patrick
>

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 342a9bd3848f..1c4b3871967c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1434,6 +1434,107 @@  static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
+static int guest_memfd_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			     struct kvm_memory_slot *memslot, bool fault_is_perm)
+{
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	bool exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+	bool logging_active = memslot_is_logging(memslot);
+	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+	bool write_fault = kvm_is_write_fault(vcpu);
+	struct mm_struct *mm = current->mm;
+	gfn_t gfn = gpa_to_gfn(fault_ipa);
+	struct kvm *kvm = vcpu->kvm;
+	struct page *page;
+	kvm_pfn_t pfn;
+	int ret;
+
+	/* For now, guest_memfd() only supports PAGE_SIZE granules. */
+	if (WARN_ON_ONCE(fault_is_perm &&
+			 kvm_vcpu_trap_get_perm_fault_granule(vcpu) != PAGE_SIZE)) {
+		return -EFAULT;
+	}
+
+	VM_BUG_ON(write_fault && exec_fault);
+
+	if (fault_is_perm && !write_fault && !exec_fault) {
+		kvm_err("Unexpected L2 read permission error\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * Permission faults just need to update the existing leaf entry,
+	 * and so normally don't require allocations from the memcache. The
+	 * only exception to this is when dirty logging is enabled at runtime
+	 * and a write fault needs to collapse a block entry into a table.
+	 */
+	if (!fault_is_perm || (logging_active && write_fault)) {
+		ret = kvm_mmu_topup_memory_cache(memcache,
+						 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Holds the folio lock until mapped in the guest and its refcount is
+	 * stable, to avoid races with paths that check if the folio is mapped
+	 * by the host.
+	 */
+	ret = kvm_gmem_get_pfn_locked(kvm, memslot, gfn, &pfn, &page, NULL);
+	if (ret)
+		return ret;
+
+	if (!kvm_slot_gmem_is_guest_mappable(memslot, gfn)) {
+		ret = -EAGAIN;
+		goto unlock_page;
+	}
+
+	/*
+	 * Once it's faulted in, a guest_memfd() page will stay in memory.
+	 * Therefore, count it as locked.
+	 */
+	if (!fault_is_perm) {
+		ret = account_locked_vm(mm, 1, true);
+		if (ret)
+			goto unlock_page;
+	}
+
+	read_lock(&kvm->mmu_lock);
+	if (write_fault)
+		prot |= KVM_PGTABLE_PROT_W;
+
+	if (exec_fault)
+		prot |= KVM_PGTABLE_PROT_X;
+
+	if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+		prot |= KVM_PGTABLE_PROT_X;
+
+	/*
+	 * Under the premise of getting a FSC_PERM fault, we just need to relax
+	 * permissions.
+	 */
+	if (fault_is_perm)
+		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
+	else
+		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
+					__pfn_to_phys(pfn), prot,
+					memcache,
+					KVM_PGTABLE_WALK_HANDLE_FAULT |
+					KVM_PGTABLE_WALK_SHARED);
+
+	kvm_release_faultin_page(kvm, page, !!ret, write_fault);
+	read_unlock(&kvm->mmu_lock);
+
+	if (ret && !fault_is_perm)
+		account_locked_vm(mm, 1, false);
+unlock_page:
+	unlock_page(page);
+	put_page(page);
+
+	return ret != -EAGAIN ? ret : 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1900,8 +2001,14 @@  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		goto out_unlock;
 	}
 
-	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
-			     esr_fsc_is_permission_fault(esr));
+	if (kvm_slot_can_be_private(memslot)) {
+		ret = guest_memfd_abort(vcpu, fault_ipa, memslot,
+					esr_fsc_is_permission_fault(esr));
+	} else {
+		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
+				     esr_fsc_is_permission_fault(esr));
+	}
+
 	if (ret == 0)
 		ret = 1;
 out: