diff mbox

[Xen-devel,4/4] xen: arm: support up to (almost) 1TB of guest RAM

Message ID 1396966760-7752-4-git-send-email-ian.campbell@citrix.com
State New
Headers show

Commit Message

Ian Campbell April 8, 2014, 2:19 p.m. UTC
This creates a second bank of RAM starting at 8GB and potentially extending to
the 1TB boundary, which is the limit imposed by our current use of a 3 level
p2m with 2 pages at level 0 (2^40 bits).

I've deliberately left a gap between the two banks just to exercise those code paths.

The second bank is 1016GB in size which plus the 3GB below 4GB is 1019GB
maximum guest RAM. At the point where the fact that this is slightly less than
a full TB starts to become an issue for people then we can switch to a 4 level
p2m, which would be needed to support guests larger than 1TB anyhow.

Tested on 32-bit with 1, 4 and 6GB guests. Anything more than ~3GB requires an
LPAE enabled kernel, or a 64-bit guest.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 tools/libxc/xc_dom_arm.c      |   95 +++++++++++++++++++++++++++++------------
 tools/libxl/libxl_arm.c       |   27 +++++++++---
 xen/include/public/arch-arm.h |   13 +++++-
 3 files changed, 100 insertions(+), 35 deletions(-)

Comments

Julien Grall April 8, 2014, 3:35 p.m. UTC | #1
Hi Ian,

On 04/08/2014 03:19 PM, Ian Campbell wrote:
> This creates a second bank of RAM starting at 8GB and potentially extending to
> the 1TB boundary, which is the limit imposed by our current use of a 3 level
> p2m with 2 pages at level 0 (2^40 bits).
> 
> I've deliberately left a gap between the two banks just to exercise those code paths.
> 
> The second bank is 1016GB in size which plus the 3GB below 4GB is 1019GB
> maximum guest RAM. At the point where the fact that this is slightly less than
> a full TB starts to become an issue for people then we can switch to a 4 level
> p2m, which would be needed to support guests larger than 1TB anyhow.
> 
> Tested on 32-bit with 1, 4 and 6GB guests. Anything more than ~3GB requires an
> LPAE enabled kernel, or a 64-bit guest.

What happen if you try to boot a non-LPAE guest with more than ~3GB?
Does it boot or crash?

> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> ---
>  tools/libxc/xc_dom_arm.c      |   95 +++++++++++++++++++++++++++++------------
>  tools/libxl/libxl_arm.c       |   27 +++++++++---
>  xen/include/public/arch-arm.h |   13 +++++-
>  3 files changed, 100 insertions(+), 35 deletions(-)
> 
> diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
> index 36b1487..65464c7 100644
> --- a/tools/libxc/xc_dom_arm.c
> +++ b/tools/libxc/xc_dom_arm.c
> @@ -18,6 +18,7 @@
>   * Copyright (c) 2011, Citrix Systems
>   */
>  #include <inttypes.h>
> +#include <assert.h>
>  
>  #include <xen/xen.h>
>  #include <xen/io/protocols.h>
> @@ -245,28 +246,73 @@ static int set_mode(xc_interface *xch, domid_t domid, char *guest_type)
>      return rc;
>  }
>  
> +static int populate_guest_memory(struct xc_dom_image *dom,
> +                                 xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
> +{
> +    int rc;
> +    xen_pfn_t allocsz, pfn;
> +
> +    if (!nr_pfns)

if ( !nr_pfns )

> +        return 0;
> +
> +    DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
> +              __FUNCTION__,
> +              (uint64_t)base_pfn << XC_PAGE_SHIFT,
> +              (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
> +              (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
> +
> +    for ( pfn = 0; pfn < nr_pfns; pfn++ )
> +        dom->p2m_host[pfn] = base_pfn + pfn;
> +
> +    for ( pfn = rc = allocsz = 0; (pfn < nr_pfns) && !rc; pfn += allocsz )

It's very confusing to see pfn = rc = allocsz = 0 and each are not
related. Any reason to not initialized rc/allocsz earlier?

> +    {
> +        allocsz = nr_pfns - pfn;

In fact you are using allocsz here... so you don't need to initialize it.

> +        if ( allocsz > 1024*1024 )
> +            allocsz = 1024*1024;
> +
> +        rc = xc_domain_populate_physmap_exact(
> +            dom->xch, dom->guest_domid, allocsz,
> +            0, 0, &dom->p2m_host[pfn]);
> +    }
> +
> +    return rc;
> +}
> +
>  int arch_setup_meminit(struct xc_dom_image *dom)
>  {
>      int rc;
> -    xen_pfn_t pfn, allocsz, i;
> +    xen_pfn_t pfn;
>      uint64_t modbase;
>  
>      /* Convenient */
> -    const uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT;
> -    const uint64_t ramend = rambase + ( dom->total_pages << XC_PAGE_SHIFT );
> +    const uint64_t ramsize = (uint64_t)dom->total_pages << XC_PAGE_SHIFT;
> +
> +    const uint64_t ram0size =
> +        ramsize > GUEST_RAM0_SIZE ? GUEST_RAM0_SIZE : ramsize;
> +    const uint64_t ram0end = GUEST_RAM0_BASE + ram0size;
> +    const uint64_t ram1size =
> +        ramsize > ram0size ? ramsize - ram0size : 0;
> +    const uint64_t ram1end = GUEST_RAM1_BASE + ram1size;
> +
> +    const xen_pfn_t p2m_size = ram1size ?
> +        (ram1end - GUEST_RAM0_BASE) >> XC_PAGE_SHIFT :
> +        (ram0end - GUEST_RAM0_BASE) >> XC_PAGE_SHIFT;
> +
>      const uint64_t kernend = ROUNDUP(dom->kernel_seg.vend, 21/*2MB*/);
>      const uint64_t dtb_size = dom->devicetree_blob ?
>          ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0;
>      const uint64_t ramdisk_size = dom->ramdisk_blob ?
>          ROUNDUP(dom->ramdisk_size, XC_PAGE_SHIFT) : 0;
>      const uint64_t modsize = dtb_size + ramdisk_size;
> -    const uint64_t ram128mb = rambase + (128<<20);
> +    const uint64_t ram128mb = GUEST_RAM0_BASE + (128<<20);
>  
> -    if ( ramend - 1 > GUEST_RAM_END )
> +    assert(dom->rambase_pfn << XC_PAGE_SHIFT == GUEST_RAM0_BASE);

Can you add parenthesis here for readability?

[..]

> diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
> index 4f0f0e2..6170454 100644
> --- a/tools/libxl/libxl_arm.c
> +++ b/tools/libxl/libxl_arm.c
> @@ -255,9 +255,9 @@ static int make_psci_node(libxl__gc *gc, void *fdt)
>      return 0;
>  }
>  
> -static int make_memory_node(libxl__gc *gc, void *fdt,
> -                            unsigned long long base,
> -                            unsigned long long size)> 
> +static int make_one_memory_node(libxl__gc *gc, void *fdt,
> +                                unsigned long long base,
> +                                unsigned long long size)

I would use uint64_t rather unsigned long long. It's more clear the
base/size are 64 bits.

>  {
>      int res;
>      const char *name = GCSPRINTF("memory@%08llx", base);
> @@ -269,7 +269,7 @@ static int make_memory_node(libxl__gc *gc, void *fdt,
>      if (res) return res;
>  
>      res = fdt_property_regs(gc, fdt, ROOT_ADDRESS_CELLS, ROOT_SIZE_CELLS,
> -                            1, (uint64_t)base, (uint64_t)size);
> +                            1, base, size);
>      if (res) return res;
>  
>      res = fdt_end_node(fdt);
> @@ -278,6 +278,24 @@ static int make_memory_node(libxl__gc *gc, void *fdt,
>      return 0;
>  }
>  
> +static int make_memory_node(libxl__gc *gc, void *fdt,
> +                            unsigned long long size)

Same here.

> +{
> +    int res;
> +    /* This had better match libxc's arch_setup_meminit... */
> +    const uint64_t size0 = size > GUEST_RAM0_SIZE ? GUEST_RAM0_SIZE : size;
> +    const uint64_t size1 = size > GUEST_RAM0_SIZE ? size - size0 : 0;
> +
> +    res = make_one_memory_node(gc, fdt, GUEST_RAM0_BASE, size0);
> +    if (res) return res;
> +    if (size1) {
> +        res = make_one_memory_node(gc, fdt, GUEST_RAM1_BASE, size1);
> +        if (res) return res;
> +    }

Any reason to not gather the both bank in one memory node?

Regards,
Ian Campbell April 8, 2014, 3:43 p.m. UTC | #2
On Tue, 2014-04-08 at 16:35 +0100, Julien Grall wrote:
> Hi Ian,
> 
> On 04/08/2014 03:19 PM, Ian Campbell wrote:
> > This creates a second bank of RAM starting at 8GB and potentially extending to
> > the 1TB boundary, which is the limit imposed by our current use of a 3 level
> > p2m with 2 pages at level 0 (2^40 bits).
> > 
> > I've deliberately left a gap between the two banks just to exercise those code paths.
> > 
> > The second bank is 1016GB in size which plus the 3GB below 4GB is 1019GB
> > maximum guest RAM. At the point where the fact that this is slightly less than
> > a full TB starts to become an issue for people then we can switch to a 4 level
> > p2m, which would be needed to support guests larger than 1TB anyhow.
> > 
> > Tested on 32-bit with 1, 4 and 6GB guests. Anything more than ~3GB requires an
> > LPAE enabled kernel, or a 64-bit guest.
> 
> What happen if you try to boot a non-LPAE guest with more than ~3GB?
> Does it boot or crash?

It logs a message in dmesg about either truncating or ignoring banks and
then boots with the lesser amount of ram. Logically it has too so you
can install on native using a distro kernel even on a big box and have
the installed choose the LPAE kernel for you.

> > +        return 0;
> > +
> > +    DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
> > +              __FUNCTION__,
> > +              (uint64_t)base_pfn << XC_PAGE_SHIFT,
> > +              (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
> > +              (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
> > +
> > +    for ( pfn = 0; pfn < nr_pfns; pfn++ )
> > +        dom->p2m_host[pfn] = base_pfn + pfn;
> > +
> > +    for ( pfn = rc = allocsz = 0; (pfn < nr_pfns) && !rc; pfn += allocsz )
> 
> It's very confusing to see pfn = rc = allocsz = 0 and each are not
> related. Any reason to not initialized rc/allocsz earlier?

This was mainly code motion, the original loop was the same.  I can try
and clean it up a bit though.

> > +    {
> > +        allocsz = nr_pfns - pfn;
> 
> In fact you are using allocsz here... so you don't need to initialize it.

I'll see if the compiler complains about the use in the for loop
increment.

> > +    assert(dom->rambase_pfn << XC_PAGE_SHIFT == GUEST_RAM0_BASE);
> 
> Can you add parenthesis here for readability?

OK.

> 
> [..]
> 
> > diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
> > index 4f0f0e2..6170454 100644
> > --- a/tools/libxl/libxl_arm.c
> > +++ b/tools/libxl/libxl_arm.c
> > @@ -255,9 +255,9 @@ static int make_psci_node(libxl__gc *gc, void *fdt)
> >      return 0;
> >  }
> >  
> > -static int make_memory_node(libxl__gc *gc, void *fdt,
> > -                            unsigned long long base,
> > -                            unsigned long long size)> 
> > +static int make_one_memory_node(libxl__gc *gc, void *fdt,
> > +                                unsigned long long base,
> > +                                unsigned long long size)
> 
> I would use uint64_t rather unsigned long long. It's more clear the
> base/size are 64 bits.

I will add a patch to change all the unsigned long long's in this file,
of which there are several others.

> > +{
> > +    int res;
> > +    /* This had better match libxc's arch_setup_meminit... */
> > +    const uint64_t size0 = size > GUEST_RAM0_SIZE ? GUEST_RAM0_SIZE : size;
> > +    const uint64_t size1 = size > GUEST_RAM0_SIZE ? size - size0 : 0;
> > +
> > +    res = make_one_memory_node(gc, fdt, GUEST_RAM0_BASE, size0);
> > +    if (res) return res;
> > +    if (size1) {
> > +        res = make_one_memory_node(gc, fdt, GUEST_RAM1_BASE, size1);
> > +        if (res) return res;
> > +    }
> 
> Any reason to not gather the both bank in one memory node?

This is simpler to code and is equally valid DT.

Ian.
diff mbox

Patch

diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
index 36b1487..65464c7 100644
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -18,6 +18,7 @@ 
  * Copyright (c) 2011, Citrix Systems
  */
 #include <inttypes.h>
+#include <assert.h>
 
 #include <xen/xen.h>
 #include <xen/io/protocols.h>
@@ -245,28 +246,73 @@  static int set_mode(xc_interface *xch, domid_t domid, char *guest_type)
     return rc;
 }
 
+static int populate_guest_memory(struct xc_dom_image *dom,
+                                 xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
+{
+    int rc;
+    xen_pfn_t allocsz, pfn;
+
+    if (!nr_pfns)
+        return 0;
+
+    DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
+              __FUNCTION__,
+              (uint64_t)base_pfn << XC_PAGE_SHIFT,
+              (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
+              (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
+
+    for ( pfn = 0; pfn < nr_pfns; pfn++ )
+        dom->p2m_host[pfn] = base_pfn + pfn;
+
+    for ( pfn = rc = allocsz = 0; (pfn < nr_pfns) && !rc; pfn += allocsz )
+    {
+        allocsz = nr_pfns - pfn;
+        if ( allocsz > 1024*1024 )
+            allocsz = 1024*1024;
+
+        rc = xc_domain_populate_physmap_exact(
+            dom->xch, dom->guest_domid, allocsz,
+            0, 0, &dom->p2m_host[pfn]);
+    }
+
+    return rc;
+}
+
 int arch_setup_meminit(struct xc_dom_image *dom)
 {
     int rc;
-    xen_pfn_t pfn, allocsz, i;
+    xen_pfn_t pfn;
     uint64_t modbase;
 
     /* Convenient */
-    const uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT;
-    const uint64_t ramend = rambase + ( dom->total_pages << XC_PAGE_SHIFT );
+    const uint64_t ramsize = (uint64_t)dom->total_pages << XC_PAGE_SHIFT;
+
+    const uint64_t ram0size =
+        ramsize > GUEST_RAM0_SIZE ? GUEST_RAM0_SIZE : ramsize;
+    const uint64_t ram0end = GUEST_RAM0_BASE + ram0size;
+    const uint64_t ram1size =
+        ramsize > ram0size ? ramsize - ram0size : 0;
+    const uint64_t ram1end = GUEST_RAM1_BASE + ram1size;
+
+    const xen_pfn_t p2m_size = ram1size ?
+        (ram1end - GUEST_RAM0_BASE) >> XC_PAGE_SHIFT :
+        (ram0end - GUEST_RAM0_BASE) >> XC_PAGE_SHIFT;
+
     const uint64_t kernend = ROUNDUP(dom->kernel_seg.vend, 21/*2MB*/);
     const uint64_t dtb_size = dom->devicetree_blob ?
         ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0;
     const uint64_t ramdisk_size = dom->ramdisk_blob ?
         ROUNDUP(dom->ramdisk_size, XC_PAGE_SHIFT) : 0;
     const uint64_t modsize = dtb_size + ramdisk_size;
-    const uint64_t ram128mb = rambase + (128<<20);
+    const uint64_t ram128mb = GUEST_RAM0_BASE + (128<<20);
 
-    if ( ramend - 1 > GUEST_RAM_END )
+    assert(dom->rambase_pfn << XC_PAGE_SHIFT == GUEST_RAM0_BASE);
+
+    if ( ramsize > GUEST_RAM_MAX )
     {
         DOMPRINTF("%s: ram size is too large for guest address space: "
                   "%"PRIx64" > %"PRIx64,
-                  __FUNCTION__, ramend, GUEST_RAM_END);
+                  __FUNCTION__, ramsize, GUEST_RAM_MAX);
         return -1;
     }
 
@@ -276,38 +322,31 @@  int arch_setup_meminit(struct xc_dom_image *dom)
 
     dom->shadow_enabled = 1;
 
-    dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * dom->total_pages);
+    dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * p2m_size);
     if ( dom->p2m_host == NULL )
         return -EINVAL;
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+        dom->p2m_host[pfn] = INVALID_MFN;
 
-    /* setup initial p2m */
-    for ( pfn = 0; pfn < dom->total_pages; pfn++ )
-        dom->p2m_host[pfn] = pfn + dom->rambase_pfn;
-
-    /* allocate guest memory */
-    for ( i = rc = allocsz = 0;
-          (i < dom->total_pages) && !rc;
-          i += allocsz )
-    {
-        allocsz = dom->total_pages - i;
-        if ( allocsz > 1024*1024 )
-            allocsz = 1024*1024;
-
-        rc = xc_domain_populate_physmap_exact(
-            dom->xch, dom->guest_domid, allocsz,
-            0, 0, &dom->p2m_host[i]);
-    }
-
+    /* setup initial p2m and allocate guest memory */
+    if ((rc = populate_guest_memory(dom,
+                                    GUEST_RAM0_BASE >> XC_PAGE_SHIFT,
+                                    ram0size >> XC_PAGE_SHIFT)))
+        return rc;
+    if ((rc = populate_guest_memory(dom,
+                                    GUEST_RAM1_BASE >> XC_PAGE_SHIFT,
+                                    ram1size >> XC_PAGE_SHIFT)))
+        return rc;
 
     /*
-     * Place boot modules at 128MB into RAM if there is enough RAM and
+     * Place boot modules at 128MB into low RAM if there is enough RAM and
      * the kernel does not overlap. Otherwise place them immediately
      * after the kernel. If there is no space after the kernel then
      * there is insufficient RAM and we fail.
      */
-    if ( ramend >= ram128mb + modsize && kernend < ram128mb )
+    if ( ram0end >= ram128mb + modsize && kernend < ram128mb )
         modbase = ram128mb;
-    else if ( ramend >= kernend + modsize )
+    else if ( ram0end >= kernend + modsize )
         modbase = kernend;
     else
         return -1;
diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
index 4f0f0e2..6170454 100644
--- a/tools/libxl/libxl_arm.c
+++ b/tools/libxl/libxl_arm.c
@@ -255,9 +255,9 @@  static int make_psci_node(libxl__gc *gc, void *fdt)
     return 0;
 }
 
-static int make_memory_node(libxl__gc *gc, void *fdt,
-                            unsigned long long base,
-                            unsigned long long size)
+static int make_one_memory_node(libxl__gc *gc, void *fdt,
+                                unsigned long long base,
+                                unsigned long long size)
 {
     int res;
     const char *name = GCSPRINTF("memory@%08llx", base);
@@ -269,7 +269,7 @@  static int make_memory_node(libxl__gc *gc, void *fdt,
     if (res) return res;
 
     res = fdt_property_regs(gc, fdt, ROOT_ADDRESS_CELLS, ROOT_SIZE_CELLS,
-                            1, (uint64_t)base, (uint64_t)size);
+                            1, base, size);
     if (res) return res;
 
     res = fdt_end_node(fdt);
@@ -278,6 +278,24 @@  static int make_memory_node(libxl__gc *gc, void *fdt,
     return 0;
 }
 
+static int make_memory_node(libxl__gc *gc, void *fdt,
+                            unsigned long long size)
+{
+    int res;
+    /* This had better match libxc's arch_setup_meminit... */
+    const uint64_t size0 = size > GUEST_RAM0_SIZE ? GUEST_RAM0_SIZE : size;
+    const uint64_t size1 = size > GUEST_RAM0_SIZE ? size - size0 : 0;
+
+    res = make_one_memory_node(gc, fdt, GUEST_RAM0_BASE, size0);
+    if (res) return res;
+    if (size1) {
+        res = make_one_memory_node(gc, fdt, GUEST_RAM1_BASE, size1);
+        if (res) return res;
+    }
+
+    return 0;
+}
+
 static int make_intc_node(libxl__gc *gc, void *fdt,
                           unsigned long long gicd_base,
                           unsigned long long gicd_size,
@@ -493,7 +511,6 @@  next_resize:
         FDT( make_psci_node(gc, fdt) );
 
         FDT( make_memory_node(gc, fdt,
-                              dom->rambase_pfn << XC_PAGE_SHIFT,
                               info->target_memkb * 1024) );
         FDT( make_intc_node(gc, fdt,
                             GUEST_GICD_BASE, GUEST_GICD_SIZE,
diff --git a/xen/include/public/arch-arm.h b/xen/include/public/arch-arm.h
index 5840453..d15a977 100644
--- a/xen/include/public/arch-arm.h
+++ b/xen/include/public/arch-arm.h
@@ -374,8 +374,17 @@  typedef uint64_t xen_callback_t;
 
 #define GUEST_MAGIC_BASE  0x39000000ULL
 
-#define GUEST_RAM_BASE    0x40000000ULL /* 3GB of RAM @ 1GB */
-#define GUEST_RAM_END     0xffffffffULL
+#define GUEST_RAM0_BASE   0x40000000ULL /* 3GB of low RAM @ 1GB */
+#define GUEST_RAM0_SIZE   0xc0000000ULL
+#define GUEST_RAM0_END    0xffffffffULL
+
+#define GUEST_RAM1_BASE   0x0200000000ULL /* 1016GB of RAM @ 8GB */
+#define GUEST_RAM1_SIZE   0xfe00000000ULL
+#define GUEST_RAM1_END    0xffffffffffULL
+
+#define GUEST_RAM_BASE    GUEST_RAM0_BASE /* Lowest RAM address */
+ /* Largest amount of actual RAM, not including holes */
+#define GUEST_RAM_MAX     (GUEST_RAM0_SIZE + GUEST_RAM1_SIZE)
 
 /* Interrupts */
 #define GUEST_TIMER_VIRT_PPI    27