@@ -322,6 +322,7 @@
313 common finit_module sys_finit_module
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
+316 common vrange sys_vrange
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -117,6 +117,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
/* Used by sys_madvise() */
+#define VM_VOLATILE 0x00001000 /* VMA is volatile */
#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
new file mode 100644
@@ -0,0 +1,12 @@
+#ifndef _LINUX_VRANGE_H
+#define _LINUX_VRANGE_H
+
+#include <linux/mm.h>
+
+
+#define VRANGE_NONVOLATILE 0
+#define VRANGE_VOLATILE 1
+
+extern int discard_vpage(struct page *page);
+
+#endif /* _LINUX_VRANGE_H */
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o \
+ compaction.o balloon_compaction.o vrange.o \
interval_tree.o list_lru.o $(mmu-y)
obj-y += init-mm.o
@@ -225,10 +225,8 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
-#endif
#else /* !CONFIG_MMU */
static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
{
@@ -728,6 +728,11 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
referenced++;
}
pte_unmap_unlock(pte, ptl);
+ if (vma->vm_flags & VM_VOLATILE) {
+ pra->mapcount = 0;
+ pra->vm_flags |= VM_VOLATILE;
+ return SWAP_FAIL;
+ }
}
if (referenced) {
@@ -43,6 +43,7 @@
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
+#include <linux/vrange.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -683,6 +684,7 @@ enum page_references {
PAGEREF_RECLAIM,
PAGEREF_RECLAIM_CLEAN,
PAGEREF_KEEP,
+ PAGEREF_DISCARD,
PAGEREF_ACTIVATE,
};
@@ -703,6 +705,13 @@ static enum page_references page_check_references(struct page *page,
if (vm_flags & VM_LOCKED)
return PAGEREF_RECLAIM;
+ /*
+ * If a volatile page reaches the tail of the LRU, discard the
+ * page without trying to recycle it.
+ */
+ if (vm_flags & VM_VOLATILE)
+ return PAGEREF_DISCARD;
+
if (referenced_ptes) {
if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
@@ -930,6 +939,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
+ case PAGEREF_DISCARD:
+ if (may_enter_fs && discard_vpage(page) == 0)
+ goto free_it;
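+ /* could not discard; fall through and keep the page */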
case PAGEREF_KEEP:
goto keep_locked;
case PAGEREF_RECLAIM:
new file mode 100644
@@ -0,0 +1,348 @@
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/vrange.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
+#include <linux/slab.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/pagevec.h>
+#include <linux/shmem_fs.h>
+#include "internal.h"
+
+struct vrange_walker {
+ struct vm_area_struct *vma;
+ int pages_purged;
+};
+
+static unsigned long vrange_check_purged(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end);
+
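+/*
+ * Mark the pages in [start, end) volatile or non-volatile. Returns the
+ * number of bytes successfully marked (measured from start), or a negative
+ * error if nothing was marked. When clearing volatility, *purged reports
+ * whether any page in the affected vmas had been purged.
+ */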
+static ssize_t do_vrange(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int mode, int *purged)
+{
+ struct vm_area_struct *vma, *prev;
+ unsigned long orig_start = start;
+ ssize_t count = 0, ret = 0;
+ int lpurged = 0;
+
+ down_read(&mm->mmap_sem);
+
+ vma = find_vma_prev(mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ unsigned long new_flags;
+ pgoff_t pgoff;
+ unsigned long tmp;
+
+ if (!vma)
+ goto out;
+
+ if (vma->vm_flags & (VM_SPECIAL|VM_LOCKED|VM_MIXEDMAP|
+ VM_HUGETLB))
+ goto out;
+
+ /* We don't support volatility on files for now */
+ if (vma->vm_file) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ new_flags = vma->vm_flags;
+
+ if (start < vma->vm_start) {
+ start = vma->vm_start;
+ if (start >= end)
+ goto out;
+ }
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ switch (mode) {
+ case VRANGE_VOLATILE:
+ new_flags |= VM_VOLATILE;
+ break;
+ case VRANGE_NONVOLATILE:
+ new_flags &= ~VM_VOLATILE;
+ lpurged |= vrange_check_purged(mm, vma,
+ vma->vm_start,
+ vma->vm_end);
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ prev = vma_merge(mm, prev, start, tmp, new_flags,
+ vma->anon_vma, vma->vm_file, pgoff,
+ vma_policy(vma));
+ if (prev) {
+ goto success;
+ }
+
+ if (start != vma->vm_start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ goto out;
+ }
+
+ if (tmp != vma->vm_end) {
+ ret = split_vma(mm, vma, tmp, 0);
+ if (ret)
+ goto out;
+ }
+
+ prev = vma;
+success:
+ vma->vm_flags = new_flags;
+ *purged = lpurged;
+
+ /* update count to distance covered so far */
+ count = tmp - orig_start;
+
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ if (start >= end)
+ goto out;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(mm, start);
+ }
+out:
+ up_read(&mm->mmap_sem);
+
+ /* report bytes successfully marked, even if we're exiting on error */
+ if (count)
+ return count;
+
+ return ret;
+}
+
+SYSCALL_DEFINE4(vrange, unsigned long, start,
+ size_t, len, int, mode, int __user *, purged)
+{
+ unsigned long end;
+ struct mm_struct *mm = current->mm;
+ ssize_t ret = -EINVAL;
+ int p = 0;
+
+ if (start & ~PAGE_MASK)
+ goto out;
+
+ len &= PAGE_MASK;
+ if (!len)
+ goto out;
+
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ if (start >= TASK_SIZE)
+ goto out;
+
+ if (purged) {
+ /* Test pointer is valid before making any changes */
+ if (put_user(p, purged))
+ return -EFAULT;
+ }
+
+ ret = do_vrange(mm, start, end, mode, &p);
+
+ if (purged) {
+ if (put_user(p, purged)) {
+ /*
+ * This would be bad, since we've modified volatility
+ * and the change in purged state would be lost.
+ */
+ BUG();
+ }
+ }
+
+out:
+ return ret;
+}
+
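+/*
+ * Unmap a volatile page from one vma: clear and flush the pte, drop the
+ * rmap and release the reference the mapping held on the page.
+ */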
+static void try_to_discard_one(struct page *page, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *pte;
+ pte_t pteval;
+ spinlock_t *ptl;
+ unsigned long addr;
+
+ VM_BUG_ON(!PageLocked(page));
+
+ addr = vma_address(page, vma);
+ pte = page_check_address(page, mm, addr, &ptl, 0);
+ if (!pte)
+ return;
+
+ BUG_ON(vma->vm_flags & (VM_SPECIAL|VM_LOCKED|VM_MIXEDMAP|VM_HUGETLB));
+
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ pteval = ptep_clear_flush(vma, addr, pte);
+
+ update_hiwater_rss(mm);
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
+
+ page_remove_rmap(page);
+ page_cache_release(page);
+
+ /* set_pte_at(mm, addr, pte, swp_entry_to_pte(make_vrange_entry())); */
+ pte_unmap_unlock(pte, ptl);
+ mmu_notifier_invalidate_page(mm, addr);
+}
+
+static int try_to_discard_anon_vpage(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff;
+
+ anon_vma = page_lock_anon_vma_read(page);
+ if (!anon_vma)
+ return -1;
+
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ /*
+ * While iterating this loop, some processes could see a page as
+ * purged while others see it as not purged, because there is no
+ * global lock between parent and child protecting the vrange system
+ * call during the loop. That is not a problem: the page is not a
+ * *SHARED* page but a *COW* page, so parent and child may see
+ * different data at any time anyway. The worst case of this race is
+ * that a page was purged but could not be discarded, causing an
+ * unnecessary page fault, which is not severe.
+ */
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+
+ if (!(vma->vm_flags & VM_VOLATILE))
+ continue;
+ try_to_discard_one(page, vma);
+ }
+ page_unlock_anon_vma_read(anon_vma);
+ return 0;
+}
+
+static int try_to_discard_vpage(struct page *page)
+{
+ if (PageAnon(page))
+ return try_to_discard_anon_vpage(page);
+ return -1;
+}
+
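+/*
+ * Try to discard a volatile page: returns 0 if the page was unmapped, its
+ * reference count frozen and the page unlocked (so the caller may free it),
+ * or 1 if the page must be kept.
+ */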
+int discard_vpage(struct page *page)
+{
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+
+ if (!try_to_discard_vpage(page)) {
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+
+ if (page_freeze_refs(page, 1)) {
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static void vrange_pte_entry(pte_t pteval, unsigned long address,
+ unsigned ptent_size, struct mm_walk *walk)
+{
+ struct page *page;
+ struct vrange_walker *vw = walk->private;
+ struct vm_area_struct *vma = vw->vma;
+
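+ /* An empty pte under a volatile vma is treated as a purged page. */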
+ if (pte_none(pteval)) {
+ vw->pages_purged = 1;
+ return;
+ }
+
+ if (!pte_present(pteval)) {
+ return;
+ }
+
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page))
+ return;
+
+ if (!PageLRU(page) || PageLocked(page))
+ return;
+
+ BUG_ON(PageCompound(page));
+
+ VM_BUG_ON(page_is_file_cache(page));
+}
+
+static int vrange_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vrange_walker *vw = walk->private;
+ struct vm_area_struct *uninitialized_var(vma);
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ vma = vw->vma;
+ split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ vrange_pte_entry(*pte, addr, PAGE_SIZE, walk);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ return 0;
+}
+
+static unsigned long vrange_check_purged(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
+{
+ struct vrange_walker vw;
+ struct mm_walk vrange_walk = {
+ .pmd_entry = vrange_pte_range,
+ .mm = vma->vm_mm,
+ .private = &vw,
+ };
+ vw.pages_purged = 0;
+ vw.vma = vma;
+
+ walk_page_range(start, end, &vrange_walk);
+
+ return vw.pages_purged;
+}
+
This patch adds a vrange() syscall, which allows vmas to be set as
volatile or nonvolatile. If volatile, pages in the vma can be discarded
under memory pressure. When setting memory non-volatile, we check the
pages to see if they have been purged and return a purged flag
accordingly.

This does not have the SIGBUS semantics in place.

This patch heavily takes from Minchan's vrange patchset, so credits to
him for his original work.

Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/syscalls/syscall_64.tbl |   1 +
 include/linux/mm.h               |   1 +
 include/linux/vrange.h           |  12 ++
 mm/Makefile                      |   2 +-
 mm/internal.h                    |   2 -
 mm/rmap.c                        |   5 +
 mm/vmscan.c                      |  12 ++
 mm/vrange.c                      | 348 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 380 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/vrange.h
 create mode 100644 mm/vrange.c
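For illustration, here is a rough userspace sketch of how a program might
exercise the new call. The x86_64 syscall number 316 and the VRANGE_* modes
are taken from the patch above; the vrange() wrapper, buffer sizes and test
flow are hypothetical, since no libc wrapper exists.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef __NR_vrange
#define __NR_vrange 316			/* x86_64 number added by this patch */
#endif

#define VRANGE_NONVOLATILE	0
#define VRANGE_VOLATILE		1

static long vrange(void *start, size_t len, int mode, int *purged)
{
	return syscall(__NR_vrange, (unsigned long)start, len, mode, purged);
}

int main(void)
{
	size_t len = 16 * 4096;
	int purged = 0;
	char *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);		/* populate the "cache" pages */

	/* Mark the range volatile: the kernel may now discard these pages. */
	if (vrange(buf, len, VRANGE_VOLATILE, &purged) < 0)
		perror("vrange(VOLATILE)");

	/* ... the cache sits unused while the system is under pressure ... */

	/* Make it non-volatile again and learn whether anything was purged. */
	if (vrange(buf, len, VRANGE_NONVOLATILE, &purged) < 0)
		perror("vrange(NONVOLATILE)");

	if (purged)
		printf("purged: regenerate the cached contents\n");
	else
		printf("intact: reuse the cached contents\n");

	munmap(buf, len);
	return 0;
}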