Message ID | 768488c67540aa18c200d7ee16e75a3a087022d4.1726009989.git.ackerleytng@google.com |
---|---|
State | New |
Headers | show |
Series | 1G page support for guest_memfd | expand |
On Tue, Sep 10, 2024 at 11:43:46PM +0000, Ackerley Tng wrote: > If HugeTLB is requested at guest_memfd creation time, HugeTLB pages > will be used to back guest_memfd. > > Signed-off-by: Ackerley Tng <ackerleytng@google.com> > --- > virt/kvm/guest_memfd.c | 252 ++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 239 insertions(+), 13 deletions(-) > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 31e1115273e1..2e6f12e2bac8 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -8,6 +8,8 @@ > #include <linux/pseudo_fs.h> > #include <linux/pagemap.h> > #include <linux/anon_inodes.h> > +#include <linux/memcontrol.h> > +#include <linux/mempolicy.h> > > #include "kvm_mm.h" > > @@ -29,6 +31,13 @@ static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode) > return inode->i_mapping->i_private_data; > } > > +static bool is_kvm_gmem_hugetlb(struct inode *inode) > +{ > + u64 flags = (u64)inode->i_private; > + > + return flags & KVM_GUEST_MEMFD_HUGETLB; > +} > + > /** > * folio_file_pfn - like folio_file_page, but return a pfn. > * @folio: The folio which contains this index. > @@ -58,6 +67,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo > return 0; > } > > +/** > + * Use the uptodate flag to indicate that the folio is prepared for KVM's usage. > + */ > static inline void kvm_gmem_mark_prepared(struct folio *folio) > { > folio_mark_uptodate(folio); > @@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio) > static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, > gfn_t gfn, struct folio *folio) > { > - unsigned long nr_pages, i; > pgoff_t index; > int r; > > - nr_pages = folio_nr_pages(folio); > - for (i = 0; i < nr_pages; i++) > - clear_highpage(folio_page(folio, i)); > + if (folio_test_hugetlb(folio)) { > + folio_zero_user(folio, folio->index << PAGE_SHIFT); Is (folio->index << PAGE_SHIFT) the right address hint to provide? 
I don't think we can say the folio will be mapped at this address since this value is an offset into the file. In most cases, I believe it won't be mapped anywhere since we just allocated it. Thanks, Elliot
On Tue, Sep 10, 2024 at 11:43:46PM +0000, Ackerley Tng wrote: > +static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h, > + struct hugepage_subpool *spool) > +{ > + bool memcg_charge_was_prepared; > + struct mem_cgroup *memcg; > + struct mempolicy *mpol; > + nodemask_t *nodemask; > + struct folio *folio; > + gfp_t gfp_mask; > + int ret; > + int nid; > + > + gfp_mask = htlb_alloc_mask(h); > + > + memcg = get_mem_cgroup_from_current(); > + ret = mem_cgroup_hugetlb_try_charge(memcg, > + gfp_mask | __GFP_RETRY_MAYFAIL, > + pages_per_huge_page(h)); > + if (ret == -ENOMEM) > + goto err; > + > + memcg_charge_was_prepared = ret != -EOPNOTSUPP; > + > + /* Pages are only to be taken from guest_memfd subpool and nowhere else. */ > + if (hugepage_subpool_get_pages(spool, 1)) > + goto err_cancel_charge; > + > + nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol, > + &nodemask); > + /* > + * charge_cgroup_reservation is false because we didn't make any cgroup > + * reservations when creating the guest_memfd subpool. Hmm.. isn't this the exact reason to set charge_cgroup_reservation==true instead? IIUC gmem hugetlb pages should participate in the hugetlb cgroup resv charge as well. It is already involved in the rest cgroup charges, and I wonder whether it's intended that the patch treated the resv accounting specially. Thanks, > + * > + * use_hstate_resv is true because we reserved from global hstate when > + * creating the guest_memfd subpool. 
> + */ > + folio = hugetlb_alloc_folio(h, mpol, nid, nodemask, false, true); > + mpol_cond_put(mpol); > + > + if (!folio) > + goto err_put_pages; > + > + hugetlb_set_folio_subpool(folio, spool); > + > + if (memcg_charge_was_prepared) > + mem_cgroup_commit_charge(folio, memcg); > + > +out: > + mem_cgroup_put(memcg); > + > + return folio; > + > +err_put_pages: > + hugepage_subpool_put_pages(spool, 1); > + > +err_cancel_charge: > + if (memcg_charge_was_prepared) > + mem_cgroup_cancel_charge(memcg, pages_per_huge_page(h)); > + > +err: > + folio = ERR_PTR(-ENOMEM); > + goto out; > +}
On Thu, Feb 13, 2025 at 07:52:43AM +0000, Ackerley Tng wrote: > Peter Xu <peterx@redhat.com> writes: > > > On Tue, Sep 10, 2024 at 11:43:46PM +0000, Ackerley Tng wrote: > >> +static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h, > >> + struct hugepage_subpool *spool) > >> +{ > >> + bool memcg_charge_was_prepared; > >> + struct mem_cgroup *memcg; > >> + struct mempolicy *mpol; > >> + nodemask_t *nodemask; > >> + struct folio *folio; > >> + gfp_t gfp_mask; > >> + int ret; > >> + int nid; > >> + > >> + gfp_mask = htlb_alloc_mask(h); > >> + > >> + memcg = get_mem_cgroup_from_current(); > >> + ret = mem_cgroup_hugetlb_try_charge(memcg, > >> + gfp_mask | __GFP_RETRY_MAYFAIL, > >> + pages_per_huge_page(h)); > >> + if (ret == -ENOMEM) > >> + goto err; > >> + > >> + memcg_charge_was_prepared = ret != -EOPNOTSUPP; > >> + > >> + /* Pages are only to be taken from guest_memfd subpool and nowhere else. */ > >> + if (hugepage_subpool_get_pages(spool, 1)) > >> + goto err_cancel_charge; > >> + > >> + nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol, > >> + &nodemask); > >> + /* > >> + * charge_cgroup_reservation is false because we didn't make any cgroup > >> + * reservations when creating the guest_memfd subpool. > > > > Hmm.. isn't this the exact reason to set charge_cgroup_reservation==true > > instead? > > > > IIUC gmem hugetlb pages should participate in the hugetlb cgroup resv > > charge as well. It is already involved in the rest cgroup charges, and I > > wonder whether it's intended that the patch treated the resv accounting > > specially. > > > > Thanks, > > > > Thank you for your careful reviews! > > I misunderstood charging a cgroup for hugetlb reservations when I was > working on this patch. > > Before this, I thought hugetlb_cgroup_charge_cgroup_rsvd() was only for > resv_map reservations, so I set charge_cgroup_reservation to false since > guest_memfd didn't use resv_map, but I understand better now. 
Please > help me check my understanding: > > + All reservations are made at the hstate > + In addition, every reservation is associated with a subpool (through > spool->rsv_hpages) or recorded in a resv_map > + Reservations are either in a subpool or in a resv_map but not both > + hugetlb_cgroup_charge_cgroup_rsvd() is for any reservation > > Regarding the time that a cgroup is charged for reservations: > > + If a reservation is made during subpool creation, the cgroup is not > charged during the reservation by the subpool, probably by design > since the process doing the mount may not be the process using the > pages Exactly. > + Charging a cgroup for the reservation happens in > hugetlb_reserve_pages(), which is called at mmap() time. Yes, or if it's not charged in hugetlb_reserve_pages() it needs to be charged at folio allocation as of now. > > For guest_memfd, I see two options: > > Option 1: Charge cgroup for reservations at fault time > > Pros: > > + Similar in behavior to a fd on a hugetlbfs mount, where the cgroup of > the process calling fallocate() is charged for the reservation. > + Symmetric approach, since uncharging happens when the hugetlb folio is > freed. > > Cons: > > + Room for allocation failure after guest_memfd creation. Even though > this guest_memfd had been created with a subpool and pages have been > reserved, there is a chance of hitting the cgroup's hugetlb > reservation cap and failing to allocate a page. > > Option 2 (preferred): Charge cgroup for reservations at guest_memfd > creation time > > Pros: > > + Once guest_memfd file is created, a page is guaranteed at fault time. This would definitely be nice: whatever can block the guest from using the memory should, if at all possible, fail upfront when the VM boots (e.g. this is not an mmap() interface, so it does not yet let the user request NORESERVE).
It'll be slightly different from the spool use case of mount points, but I think it's a new use case anyway, so IIUC we can define its behavior to best suit the use case. > + Simplifies/doesn't carry over the complexities of the hugetlb(fs) > reservation system > > Cons: > > + The cgroup being charged is the cgroup of the process creating > guest_memfd, which might be an issue if users expect the process > faulting the page to be charged. Right, though I can't picture such a use case yet. I'm guessing multi-process use of guest-memfd is still very far away. When it happens, I would expect these tasks to be put into the same cgroup. Maybe kubevirt already has some such use; I can go and check. If they're not in the same cgroup, it's still more reasonable to always charge that to the VM instance, rather than to whatever other process may operate on the guest memory. So it could be that we don't see major cons in solution 2. In general, I agree with your preference. > > Implementation: > > + At guest_memfd creation time, when creating the subpool, charge the > cgroups for everything: > + for hugetlb usage I suppose here you meant the global reservation? If so, I agree. IIUC the new code shouldn't need to worry about this if the subpool is created by the API, as that API does the global charging, like we discussed elsewhere. If you meant hugetlb_cgroup_commit_charge(), IMHO it should still be deferred until allocation, which in the guest-memfd case means fallocate() time. AFAICT, that's the only reason why we need two of these anyway. > + hugetlb reservation usage and Agree on this one. > + hugetlb usage by page count (as in mem_cgroup_charge_hugetlb(), > which is new since [1]) This one should, IMHO, also be done only during allocation. Thanks, > + Refactoring in [1] would be focused on just dequeueing a folio or > failing which, allocating a surplus folio. 
> + After allocation, don't set cgroup on the folio so that the freeing > process doesn't uncharge anything > + Uncharge when the file is closed > > Please let me know if anyone has any thoughts/suggestions! > > >> + * > >> + * use_hstate_resv is true because we reserved from global hstate when > >> + * creating the guest_memfd subpool. > >> + */ > >> + folio = hugetlb_alloc_folio(h, mpol, nid, nodemask, false, true); > >> + mpol_cond_put(mpol); > >> + > >> + if (!folio) > >> + goto err_put_pages; > >> + > >> + hugetlb_set_folio_subpool(folio, spool); > >> + > >> + if (memcg_charge_was_prepared) > >> + mem_cgroup_commit_charge(folio, memcg); > >> + > >> +out: > >> + mem_cgroup_put(memcg); > >> + > >> + return folio; > >> + > >> +err_put_pages: > >> + hugepage_subpool_put_pages(spool, 1); > >> + > >> +err_cancel_charge: > >> + if (memcg_charge_was_prepared) > >> + mem_cgroup_cancel_charge(memcg, pages_per_huge_page(h)); > >> + > >> +err: > >> + folio = ERR_PTR(-ENOMEM); > >> + goto out; > >> +} > > [1] https://lore.kernel.org/all/7348091f4c539ed207d9bb0f3744d0f0efb7f2b3.1726009989.git.ackerleytng@google.com/ >
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 31e1115273e1..2e6f12e2bac8 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -8,6 +8,8 @@ #include <linux/pseudo_fs.h> #include <linux/pagemap.h> #include <linux/anon_inodes.h> +#include <linux/memcontrol.h> +#include <linux/mempolicy.h> #include "kvm_mm.h" @@ -29,6 +31,13 @@ static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode) return inode->i_mapping->i_private_data; } +static bool is_kvm_gmem_hugetlb(struct inode *inode) +{ + u64 flags = (u64)inode->i_private; + + return flags & KVM_GUEST_MEMFD_HUGETLB; +} + /** * folio_file_pfn - like folio_file_page, but return a pfn. * @folio: The folio which contains this index. @@ -58,6 +67,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo return 0; } +/** + * Use the uptodate flag to indicate that the folio is prepared for KVM's usage. + */ static inline void kvm_gmem_mark_prepared(struct folio *folio) { folio_mark_uptodate(folio); @@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio) static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, struct folio *folio) { - unsigned long nr_pages, i; pgoff_t index; int r; - nr_pages = folio_nr_pages(folio); - for (i = 0; i < nr_pages; i++) - clear_highpage(folio_page(folio, i)); + if (folio_test_hugetlb(folio)) { + folio_zero_user(folio, folio->index << PAGE_SHIFT); + } else { + unsigned long nr_pages, i; + + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); + } /* * Preparing huge folios should always be safe, since it should @@ -103,6 +120,174 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } +static int kvm_gmem_get_mpol_node_nodemask(gfp_t gfp_mask, + struct mempolicy **mpol, + nodemask_t **nodemask) +{ + /* + * TODO: mempolicy would probably have to be stored on the inode, use + * 
task policy for now. + */ + *mpol = get_task_policy(current); + + /* TODO: ignore interleaving (set ilx to 0) for now. */ + return policy_node_nodemask(*mpol, gfp_mask, 0, nodemask); +} + +static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h, + struct hugepage_subpool *spool) +{ + bool memcg_charge_was_prepared; + struct mem_cgroup *memcg; + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int ret; + int nid; + + gfp_mask = htlb_alloc_mask(h); + + memcg = get_mem_cgroup_from_current(); + ret = mem_cgroup_hugetlb_try_charge(memcg, + gfp_mask | __GFP_RETRY_MAYFAIL, + pages_per_huge_page(h)); + if (ret == -ENOMEM) + goto err; + + memcg_charge_was_prepared = ret != -EOPNOTSUPP; + + /* Pages are only to be taken from guest_memfd subpool and nowhere else. */ + if (hugepage_subpool_get_pages(spool, 1)) + goto err_cancel_charge; + + nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol, + &nodemask); + /* + * charge_cgroup_reservation is false because we didn't make any cgroup + * reservations when creating the guest_memfd subpool. + * + * use_hstate_resv is true because we reserved from global hstate when + * creating the guest_memfd subpool. 
+ */ + folio = hugetlb_alloc_folio(h, mpol, nid, nodemask, false, true); + mpol_cond_put(mpol); + + if (!folio) + goto err_put_pages; + + hugetlb_set_folio_subpool(folio, spool); + + if (memcg_charge_was_prepared) + mem_cgroup_commit_charge(folio, memcg); + +out: + mem_cgroup_put(memcg); + + return folio; + +err_put_pages: + hugepage_subpool_put_pages(spool, 1); + +err_cancel_charge: + if (memcg_charge_was_prepared) + mem_cgroup_cancel_charge(memcg, pages_per_huge_page(h)); + +err: + folio = ERR_PTR(-ENOMEM); + goto out; +} + +static int kvm_gmem_hugetlb_filemap_add_folio(struct address_space *mapping, + struct folio *folio, pgoff_t index, + gfp_t gfp) +{ + int ret; + + __folio_set_locked(folio); + ret = __filemap_add_folio(mapping, folio, index, gfp, NULL); + if (unlikely(ret)) { + __folio_clear_locked(folio); + return ret; + } + + /* + * In hugetlb_add_to_page_cache(), there is a call to + * folio_clear_hugetlb_restore_reserve(). This is handled when the pages + * are removed from the page cache in unmap_hugepage_range() -> + * __unmap_hugepage_range() by conditionally calling + * folio_set_hugetlb_restore_reserve(). In kvm_gmem_hugetlb's usage of + * hugetlb, there are no VMAs involved, and pages are never taken from + * the surplus, so when pages are freed, the hstate reserve must be + * restored. Hence, this function makes no call to + * folio_clear_hugetlb_restore_reserve(). + */ + + /* mark folio dirty so that it will not be removed from cache/inode */ + folio_mark_dirty(folio); + + return 0; +} + +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode, + pgoff_t index) +{ + struct kvm_gmem_hugetlb *hgmem; + struct folio *folio; + int ret; + + hgmem = kvm_gmem_hgmem(inode); + folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool); + if (IS_ERR(folio)) + return folio; + + /* TODO: Fix index here to be aligned to huge page size. 
*/ + ret = kvm_gmem_hugetlb_filemap_add_folio( + inode->i_mapping, folio, index, htlb_alloc_mask(hgmem->h)); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); + } + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(hgmem->h); + spin_unlock(&inode->i_lock); + + return folio; +} + +static struct folio *kvm_gmem_get_hugetlb_folio(struct inode *inode, + pgoff_t index) +{ + struct address_space *mapping; + struct folio *folio; + struct hstate *h; + pgoff_t hindex; + u32 hash; + + h = kvm_gmem_hgmem(inode)->h; + hindex = index >> huge_page_order(h); + mapping = inode->i_mapping; + + /* To lock, we calculate the hash using the hindex and not index. */ + hash = hugetlb_fault_mutex_hash(mapping, hindex); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* + * The filemap is indexed with index and not hindex. Taking lock on + * folio to align with kvm_gmem_get_regular_folio() + */ + folio = filemap_lock_folio(mapping, index); + if (!IS_ERR(folio)) + goto out; + + folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index); +out: + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + return folio; +} + /* * Returns a locked folio on success. The caller is responsible for * setting the up-to-date flag before the memory is mapped into the guest. @@ -114,8 +299,10 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, */ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { - /* TODO: Support huge pages. 
*/ - return filemap_grab_folio(inode->i_mapping, index); + if (is_kvm_gmem_hugetlb(inode)) + return kvm_gmem_get_hugetlb_folio(inode, index); + else + return filemap_grab_folio(inode->i_mapping, index); } static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, @@ -240,6 +427,35 @@ static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode, spin_unlock(&inode->i_lock); } +static void kvm_gmem_hugetlb_truncate_range(struct inode *inode, loff_t lstart, + loff_t lend) +{ + loff_t full_hpage_start; + loff_t full_hpage_end; + unsigned long hsize; + struct hstate *h; + + h = kvm_gmem_hgmem(inode)->h; + hsize = huge_page_size(h); + + full_hpage_start = round_up(lstart, hsize); + full_hpage_end = round_down(lend, hsize); + + if (lstart < full_hpage_start) { + hugetlb_zero_partial_page(h, inode->i_mapping, lstart, + full_hpage_start); + } + + if (full_hpage_end > full_hpage_start) { + kvm_gmem_hugetlb_truncate_folios_range(inode, full_hpage_start, + full_hpage_end); + } + + if (lend > full_hpage_end) { + hugetlb_zero_partial_page(h, inode->i_mapping, full_hpage_end, + lend); + } +} static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) { @@ -257,7 +473,12 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) list_for_each_entry(gmem, gmem_list, entry) kvm_gmem_invalidate_begin(gmem, start, end); - truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); + if (is_kvm_gmem_hugetlb(inode)) { + kvm_gmem_hugetlb_truncate_range(inode, offset, offset + len); + } else { + truncate_inode_pages_range(inode->i_mapping, offset, + offset + len - 1); + } list_for_each_entry(gmem, gmem_list, entry) kvm_gmem_invalidate_end(gmem, start, end); @@ -279,8 +500,15 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) filemap_invalidate_lock_shared(mapping); - start = offset >> PAGE_SHIFT; - end = (offset + len) >> PAGE_SHIFT; + if (is_kvm_gmem_hugetlb(inode)) { + 
unsigned long hsize = huge_page_size(kvm_gmem_hgmem(inode)->h); + + start = round_down(offset, hsize) >> PAGE_SHIFT; + end = round_down(offset + len, hsize) >> PAGE_SHIFT; + } else { + start = offset >> PAGE_SHIFT; + end = (offset + len) >> PAGE_SHIFT; + } r = 0; for (index = start; index < end; ) { @@ -408,9 +636,7 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode) static void kvm_gmem_evict_inode(struct inode *inode) { - u64 flags = (u64)inode->i_private; - - if (flags & KVM_GUEST_MEMFD_HUGETLB) + if (is_kvm_gmem_hugetlb(inode)) kvm_gmem_hugetlb_teardown(inode); else truncate_inode_pages_final(inode->i_mapping); @@ -827,7 +1053,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, *pfn = folio_file_pfn(folio, index); if (max_order) - *max_order = 0; + *max_order = folio_order(folio); *is_prepared = folio_test_uptodate(folio); return folio;
If HugeTLB is requested at guest_memfd creation time, HugeTLB pages will be used to back guest_memfd. Signed-off-by: Ackerley Tng <ackerleytng@google.com> --- virt/kvm/guest_memfd.c | 252 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 239 insertions(+), 13 deletions(-)