
madvise: Add _VOLATILE, _ISVOLATILE, and _NONVOLATILE flags

Message ID 1320988016-21614-1-git-send-email-john.stultz@linaro.org
State Superseded

Commit Message

John Stultz Nov. 11, 2011, 5:06 a.m. UTC
Ok. Mind giving this a quick review for any style or other
issues that stand out (except the printks)?

thanks
-john



This patch provides new madvise flags that can be used to mark
memory as volatile, which will allow it to be discarded if the
kernel wants to reclaim memory.
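
For illustration, a minimal userspace sketch of the intended calling
sequence (not from this patch: the MADV_* values are copied from the
mman-common.h hunk below since libc headers do not define them yet,
shm_open() is just one way to get the file-backed mapping the new code
requires, and error handling is omitted):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define MADV_VOLATILE    16	/* values from the mman-common.h hunk below */
#define MADV_ISVOLATILE  17
#define MADV_NONVOLATILE 18

int main(void)
{
	size_t len = 16 * 4096;
	int fd = shm_open("/volatile-demo", O_CREAT | O_RDWR, 0600);
	char *buf;

	ftruncate(fd, len);
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	memset(buf, 0xaa, len);			/* fill the cache-like buffer */

	/* Done with it for now; the kernel may purge it under pressure. */
	madvise(buf, len, MADV_VOLATILE);

	/*
	 * Before reusing the data, pin it again; per the description above,
	 * a nonzero return means pages were purged and must be regenerated.
	 */
	if (madvise(buf, len, MADV_NONVOLATILE) > 0)
		memset(buf, 0xaa, len);

	/* MADV_ISVOLATILE reports whether any of the range is still marked. */
	printf("volatile? %d\n", madvise(buf, len, MADV_ISVOLATILE));

	munmap(buf, len);
	close(fd);
	shm_unlink("/volatile-demo");
	return 0;
}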

This is very much influenced by the Android Ashmem interface by
Robert Love, so credit goes to him and the Android developers.
In many cases the code & logic come directly from the ashmem patch.
The intent of this patch is to allow for ashmem-like behavior, but
it embeds the idea a little deeper in the VM code instead of
isolating it in a specific driver.

Note: this only provides half of the ashmem functionality, as ashmem
works on files as well, so similar fadvise calls will be needed to
provide full ashmem coverage.

Also, many thanks to Dave Hansen, who helped design and develop the
initial version of this patch and has provided continued review and
mentoring in the VM code.

Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 fs/inode.c                        |    1 +
 include/asm-generic/mman-common.h |    3 +
 include/linux/fs.h                |   45 +++++++++
 mm/madvise.c                      |  190 +++++++++++++++++++++++++++++++++++++
 mm/shmem.c                        |   19 ++++
 5 files changed, 258 insertions(+), 0 deletions(-)

Patch

diff --git a/fs/inode.c b/fs/inode.c
index ee4e66b..c1f55f4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -278,6 +278,7 @@  void address_space_init_once(struct address_space *mapping)
 	spin_lock_init(&mapping->private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
 	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	INIT_LIST_HEAD(&mapping->unpinned_list);
 }
 EXPORT_SYMBOL(address_space_init_once);
 
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 787abbb..adf1565 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -47,6 +47,9 @@ 
 
 #define MADV_HUGEPAGE	14		/* Worth backing with hugepages */
 #define MADV_NOHUGEPAGE	15		/* Not worth backing with hugepages */
+#define MADV_VOLATILE   16		/* _can_ toss, but don't toss now */
+#define MADV_ISVOLATILE 17		/* Returns volatile flag for region */
+#define MADV_NONVOLATILE 18		/* Remove VOLATILE flag */
 
 /* compatibility flags */
 #define MAP_FILE	0
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0c4df26..c605336 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -632,6 +632,50 @@  int pagecache_write_end(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 
+
+
+/* unpinned_mem_range & range helpers from Robert Love's Ashmem patch */
+struct unpinned_mem_range {
+	struct list_head unpinned;	/* Next unpinned range */
+	size_t start;
+	size_t end;
+	unsigned int purged;
+};
+
+static inline bool page_range_subsumes_range(struct unpinned_mem_range *range,
+					size_t start_addr, size_t end_addr)
+{
+
+	return (range->start >= start_addr) && (range->end <= end_addr);
+}
+
+static inline bool page_range_subsumed_by_range(
+					struct unpinned_mem_range *range,
+					size_t start_addr, size_t end_addr)
+{
+	return (range->start <= start_addr) && (range->end >= end_addr);
+}
+
+static inline bool page_in_range(struct unpinned_mem_range *range, size_t page)
+{
+	return (range->start <= page) && (range->end > page);
+}
+
+static inline bool page_range_in_range(struct unpinned_mem_range *range,
+					size_t start_addr, size_t end_addr)
+{
+	return page_in_range(range, start_addr) ||
+		page_in_range(range, end_addr) ||
+		page_range_subsumes_range(range, start_addr, end_addr);
+}
+
+static inline bool range_before_page(struct unpinned_mem_range *range,
+					size_t page)
+{
+	return range->end < page;
+}
+
+
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
@@ -650,6 +694,7 @@  struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct list_head	unpinned_list;	/* unpinned area list */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/madvise.c b/mm/madvise.c
index 74bf193..25baaf8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -225,6 +225,187 @@  static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+
+/*
+ * Allocates an unpinned_mem_range and adds it to the address_space's
+ * unpinned list
+ */
+static int unpinned_range_alloc(struct address_space *addrsp,
+				struct unpinned_mem_range *prev_range,
+				unsigned int purged, size_t start, size_t end)
+{
+	struct unpinned_mem_range *range;
+
+	range = kzalloc(sizeof(struct unpinned_mem_range), GFP_KERNEL);
+	if (unlikely(!range))
+		return -ENOMEM;
+
+	range->start = start;
+	range->end = end;
+	range->purged = purged;
+
+	list_add_tail(&range->unpinned, &prev_range->unpinned);
+
+	return 0;
+}
+
+/*
+ * Deletes an unpinned_mem_range, removing it from the address_space's
+ * unpinned list
+ */
+static void unpinned_range_del(struct unpinned_mem_range *range)
+{
+	list_del(&range->unpinned);
+	kfree(range);
+}
+
+/*
+ * Resizes an unpinned_mem_range
+ */
+static inline void unpinned_range_shrink(struct unpinned_mem_range *range,
+				size_t start, size_t end)
+{
+	range->start = start;
+	range->end = end;
+}
+
+/*
+ * Mark a region as volatile, allowing pages to be purged
+ * under memory pressure
+ */
+static long madvise_volatile(struct vm_area_struct *vma,
+			     unsigned long start, unsigned long end)
+{
+	struct unpinned_mem_range *range, *next;
+	unsigned int purged = 0;
+	int ret;
+	struct address_space *addrsp;
+
+	if (!vma->vm_file)
+		return -1;
+	addrsp = vma->vm_file->f_mapping;
+
+	/* remove the vma offset */
+	start -= vma->vm_start;
+	end -= vma->vm_start;
+
+	printk("Madvise_volatile: start: 0x%lx  end: 0x%lx vma start: 0x%lx\n",
+		start, end, vma->vm_start);
+
+restart:
+	list_for_each_entry_safe(range, next, &addrsp->unpinned_list,
+				unpinned) {
+		if (range_before_page(range, start))
+			break;
+		if (page_range_subsumed_by_range(range, start, end))
+			return 0;
+		if (page_range_in_range(range, start, end)) {
+			start = min_t(size_t, range->start, start);
+			end = max_t(size_t, range->end, end);
+			purged |= range->purged;
+			unpinned_range_del(range);
+			goto restart;
+		}
+
+	}
+	ret = unpinned_range_alloc(addrsp, range, purged, start, end);
+	return ret;
+}
+
+/*
+ * Mark a region as nonvolatile; returns 1 if any pages in the region
+ * were purged.
+ */
+static long madvise_nonvolatile(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	struct unpinned_mem_range *range, *next;
+	struct address_space *addrsp;
+	int ret  = 0;
+
+	if (!vma->vm_file)
+		return -1;
+	addrsp = vma->vm_file->f_mapping;
+
+	/* remove the vma offset */
+	start -= vma->vm_start;
+	end -= vma->vm_start;
+
+	list_for_each_entry_safe(range, next, &addrsp->unpinned_list,
+				unpinned) {
+		if (range_before_page(range, start))
+			break;
+
+		if (page_range_in_range(range, start, end)) {
+			ret |= range->purged;
+			/* Case #1: Easy. Just nuke the whole thing. */
+			if (page_range_subsumes_range(range, start, end)) {
+				unpinned_range_del(range);
+				continue;
+			}
+
+			/* Case #2: We overlap from the start, so adjust it */
+			if (range->start >= start) {
+				unpinned_range_shrink(range, end + 1,
+							range->end);
+				continue;
+			}
+
+			/* Case #3: We overlap from the rear, so adjust it */
+			if (range->end <= end) {
+				unpinned_range_shrink(range, range->start,
+							start-1);
+				continue;
+			}
+
+			/*
+			 * Case #4: We eat a chunk out of the middle. A bit
+			 * more complicated, we allocate a new range for the
+			 * second half and adjust the first chunk's endpoint.
+			 */
+			unpinned_range_alloc(addrsp, range,
+						range->purged, end + 1,
+						range->end);
+			unpinned_range_shrink(range, range->start, start - 1);
+		}
+	}
+	printk("Madvise_nonvolatile: start: 0x%lx  end: 0x%lx vma start: 0x%lx ret: %i\n",
+			start, end, vma->vm_start, ret);
+	return ret;
+
+
+}
+
+/*
+ * Returns whether any part of the region has been marked volatile.
+ * Does not report whether the region has been purged.
+ */
+static long madvise_isvolatile(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	struct unpinned_mem_range *range;
+	struct address_space *addrsp;
+	long ret = 0;
+
+	if (!vma->vm_file)
+		return -1;
+	addrsp = vma->vm_file->f_mapping;
+
+	/* remove the vma offset */
+	start -= vma->vm_start;
+	end -= vma->vm_start;
+
+	list_for_each_entry(range, &addrsp->unpinned_list, unpinned) {
+		if (range_before_page(range, start))
+			break;
+		if (page_range_in_range(range, start, end)) {
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Error injection support for memory error handling.
@@ -268,6 +449,12 @@  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_DONTNEED:
 		return madvise_dontneed(vma, prev, start, end);
+	case MADV_VOLATILE:
+		return madvise_volatile(vma, start, end);
+	case MADV_ISVOLATILE:
+		return madvise_isvolatile(vma, start, end);
+	case MADV_NONVOLATILE:
+		return madvise_nonvolatile(vma, start, end);
 	default:
 		return madvise_behavior(vma, prev, start, end, behavior);
 	}
@@ -293,6 +480,9 @@  madvise_behavior_valid(int behavior)
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
 #endif
+	case MADV_VOLATILE:
+	case MADV_ISVOLATILE:
+	case MADV_NONVOLATILE:
 		return 1;
 
 	default:
diff --git a/mm/shmem.c b/mm/shmem.c
index d672250..90aa946 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -679,6 +679,25 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	index = page->index;
 	inode = mapping->host;
 	info = SHMEM_I(inode);
+
+	if (!list_empty(&mapping->unpinned_list)) {
+		struct unpinned_mem_range *range, *next;
+		printk("shmem_writepage:\n");
+		list_for_each_entry_safe(range, next, &mapping->unpinned_list,
+					unpinned) {
+			printk("	range: 0x%lx - 0x%lx vs 0x%lx\n",
+					range->start, range->end,
+					(long)index << PAGE_SHIFT);
+
+			if (page_in_range(range, index << PAGE_SHIFT)) {
+				range->purged = 1;
+				unlock_page(page);
+				printk("	Purged page!\n");
+				return 0;
+			}
+		}
+	}
+
 	if (info->flags & VM_LOCKED)
 		goto redirty;
 	if (!total_swap_pages)