diff mbox

[v3,1/4] linux-gen: pool: optimize thread local buffer cache

Message ID 1469181897-19168-1-git-send-email-matias.elo@nokia.com
State New
Headers show

Commit Message

Elo, Matias (Nokia - FI/Espoo) July 22, 2016, 10:04 a.m. UTC
Optimize local buffer cache performance which is critical to
many use cases - including packet IO.

Main parts of the optimization are:
 * Local cache implemented as an array of buf_hdr pointers,
   instead of a linked list (which causes a lot of cache misses)
 * Alloc and free N buffers per operation

All above steps are needed to demonstrate the performance upgrade.
Some related pool functions (get_buf(), ret_buf(), etc.) were moved
from the pool header to the C source file, since they were actually
local to the C source file. Some unused pool variables were also
removed.

Signed-off-by: Petri Savolainen <petri.savolainen@nokia.com>

Signed-off-by: Matias Elo <matias.elo@nokia.com>

---

V2:
- Split pktio modifications into a separate patch (Bill)
- Improve performance by adding separate functions for single buffer
  alloc/free operations

 .../linux-generic/include/odp_buffer_inlines.h     |  26 +-
 .../linux-generic/include/odp_buffer_internal.h    |   5 +-
 platform/linux-generic/include/odp_internal.h      |   2 -
 platform/linux-generic/include/odp_pool_internal.h | 143 +------
 platform/linux-generic/odp_buffer.c                |   3 -
 platform/linux-generic/odp_packet.c                |   5 +-
 platform/linux-generic/odp_pool.c                  | 473 +++++++++++++++++----
 7 files changed, 426 insertions(+), 231 deletions(-)

-- 
2.7.4

Comments

Bill Fischofer July 26, 2016, 12:15 a.m. UTC | #1
For this series:

Reviewed-and-tested-by: Bill Fischofer <bill.fischofer@linaro.org>

On Fri, Jul 22, 2016 at 5:04 AM, Matias Elo <matias.elo@nokia.com> wrote:

> Optimize local buffer cache performance which is critical to

> many use cases - including packet IO.

>

> Main parts of the optimization are:

>  * Local cache implemented as an array of buf_hdr pointers,

>    instead of a linked list (which causes a lot of cache misses)

>  * Alloc and free N buffers per operation

>

> All above steps are needed to demonstrate the performance upgrade.

> Some related pool functions (get_buf(), ret_buf(), etc) were moved

> from pool header to c source file, since those were actual local

> to the c source file. Also some unused pool variables are removed

> also.

>

> Signed-off-by: Petri Savolainen <petri.savolainen@nokia.com>

> Signed-off-by: Matias Elo <matias.elo@nokia.com>

> ---

>

> V2:

> - Split pktio modifications into a separate patch (Bill)

> - Improve performance by adding separate functions for single buffer

>   alloc/free operations

>

>  .../linux-generic/include/odp_buffer_inlines.h     |  26 +-

>  .../linux-generic/include/odp_buffer_internal.h    |   5 +-

>  platform/linux-generic/include/odp_internal.h      |   2 -

>  platform/linux-generic/include/odp_pool_internal.h | 143 +------

>  platform/linux-generic/odp_buffer.c                |   3 -

>  platform/linux-generic/odp_packet.c                |   5 +-

>  platform/linux-generic/odp_pool.c                  | 473

> +++++++++++++++++----

>  7 files changed, 426 insertions(+), 231 deletions(-)

>

> diff --git a/platform/linux-generic/include/odp_buffer_inlines.h

> b/platform/linux-generic/include/odp_buffer_inlines.h

> index 3f4d9fd..2b1ab42 100644

> --- a/platform/linux-generic/include/odp_buffer_inlines.h

> +++ b/platform/linux-generic/include/odp_buffer_inlines.h

> @@ -56,30 +56,12 @@ static inline odp_buffer_hdr_t

> *odp_buf_to_hdr(odp_buffer_t buf)

>                 (pool->pool_mdata_addr + (index * ODP_CACHE_LINE_SIZE));

>  }

>

> -static inline uint32_t odp_buffer_refcount(odp_buffer_hdr_t *buf)

> +static inline uint32_t pool_id_from_buf(odp_buffer_t buf)

>  {

> -       return odp_atomic_load_u32(&buf->ref_count);

> -}

> +       odp_buffer_bits_t handle;

>

> -static inline uint32_t odp_buffer_incr_refcount(odp_buffer_hdr_t *buf,

> -                                               uint32_t val)

> -{

> -       return odp_atomic_fetch_add_u32(&buf->ref_count, val) + val;

> -}

> -

> -static inline uint32_t odp_buffer_decr_refcount(odp_buffer_hdr_t *buf,

> -                                               uint32_t val)

> -{

> -       uint32_t tmp;

> -

> -       tmp = odp_atomic_fetch_sub_u32(&buf->ref_count, val);

> -

> -       if (tmp < val) {

> -               odp_atomic_fetch_add_u32(&buf->ref_count, val - tmp);

> -               return 0;

> -       } else {

> -               return tmp - val;

> -       }

> +       handle.handle = buf;

> +       return handle.pool_id;

>  }

>

>  static inline odp_buffer_hdr_t *validate_buf(odp_buffer_t buf)

> diff --git a/platform/linux-generic/include/odp_buffer_internal.h

> b/platform/linux-generic/include/odp_buffer_internal.h

> index f21364c..7b0ef8b 100644

> --- a/platform/linux-generic/include/odp_buffer_internal.h

> +++ b/platform/linux-generic/include/odp_buffer_internal.h

> @@ -114,7 +114,6 @@ struct odp_buffer_hdr_t {

>         union {

>                 uint32_t all;

>                 struct {

> -                       uint32_t zeroized:1; /* Zeroize buf data on free */

>                         uint32_t hdrdata:1;  /* Data is in buffer hdr */

>                         uint32_t sustain:1;  /* Sustain order */

>                 };

> @@ -123,7 +122,6 @@ struct odp_buffer_hdr_t {

>         int8_t                   type;       /* buffer type */

>         odp_event_type_t         event_type; /* for reuse as event */

>         uint32_t                 size;       /* max data size */

> -       odp_atomic_u32_t         ref_count;  /* reference count */

>         odp_pool_t               pool_hdl;   /* buffer pool handle */

>         union {

>                 uint64_t         buf_u64;    /* user u64 */

> @@ -174,6 +172,9 @@ typedef struct {

>  odp_buffer_t buffer_alloc(odp_pool_t pool, size_t size);

>  int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,

>                        odp_buffer_t buf[], int num);

> +void buffer_free(uint32_t pool_id, const odp_buffer_t buf);

> +void buffer_free_multi(uint32_t pool_id,

> +                      const odp_buffer_t buf[], int num_free);

>  int seg_alloc_head(odp_buffer_hdr_t *buf_hdr, int segcount);

>  void seg_free_head(odp_buffer_hdr_t *buf_hdr, int segcount);

>  int seg_alloc_tail(odp_buffer_hdr_t *buf_hdr, int segcount);

> diff --git a/platform/linux-generic/include/odp_internal.h

> b/platform/linux-generic/include/odp_internal.h

> index d12f850..8bad450 100644

> --- a/platform/linux-generic/include/odp_internal.h

> +++ b/platform/linux-generic/include/odp_internal.h

> @@ -119,8 +119,6 @@ int odp_tm_term_global(void);

>  int _odp_int_name_tbl_init_global(void);

>  int _odp_int_name_tbl_term_global(void);

>

> -void _odp_flush_caches(void);

> -

>  int cpuinfo_parser(FILE *file, system_info_t *sysinfo);

>  uint64_t odp_cpu_hz_current(int id);

>

> diff --git a/platform/linux-generic/include/odp_pool_internal.h

> b/platform/linux-generic/include/odp_pool_internal.h

> index 3317bd0..d6717ff 100644

> --- a/platform/linux-generic/include/odp_pool_internal.h

> +++ b/platform/linux-generic/include/odp_pool_internal.h

> @@ -51,15 +51,25 @@ typedef struct _odp_buffer_pool_init_t {

>         void *buf_init_arg;        /**< Argument to be passed to

> buf_init() */

>  } _odp_buffer_pool_init_t;         /**< Type of buffer initialization

> struct */

>

> +#define POOL_MAX_LOCAL_CHUNKS 4

> +#define POOL_CHUNK_SIZE       32

> +#define POOL_MAX_LOCAL_BUFS   (POOL_MAX_LOCAL_CHUNKS * POOL_CHUNK_SIZE)

> +

> +struct local_cache_s {

> +       uint64_t bufallocs;  /* Local buffer alloc count */

> +       uint64_t buffrees;   /* Local buffer free count */

> +

> +       uint32_t num_buf;

> +       odp_buffer_hdr_t *buf[POOL_MAX_LOCAL_BUFS];

> +};

> +

>  /* Local cache for buffer alloc/free acceleration */

>  typedef struct local_cache_t {

>         union {

> -               struct {

> -                       odp_buffer_hdr_t *buf_freelist;  /* The local

> cache */

> -                       uint64_t bufallocs;  /* Local buffer alloc count */

> -                       uint64_t buffrees;   /* Local buffer free count */

> -               };

> -               uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(sizeof(uint64_t))];

> +               struct local_cache_s s;

> +

> +               uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(

> +                           sizeof(struct local_cache_s))];

>         };

>  } local_cache_t;

>

> @@ -214,127 +224,6 @@ static inline void ret_blk(struct pool_entry_s

> *pool, void *block)

>         odp_atomic_inc_u64(&pool->poolstats.blkfrees);

>  }

>

> -static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)

> -{

> -       odp_buffer_hdr_t *myhead;

> -       POOL_LOCK(&pool->buf_lock);

> -

> -       myhead = pool->buf_freelist;

> -

> -       if (odp_unlikely(myhead == NULL)) {

> -               POOL_UNLOCK(&pool->buf_lock);

> -               odp_atomic_inc_u64(&pool->poolstats.bufempty);

> -       } else {

> -               pool->buf_freelist = myhead->next;

> -               POOL_UNLOCK(&pool->buf_lock);

> -               uint64_t bufcount =

> -                       odp_atomic_fetch_sub_u32(&pool->bufcount, 1) - 1;

> -

> -               /* Check for low watermark condition */

> -               if (bufcount == pool->buf_low_wm &&

> !pool->buf_low_wm_assert) {

> -                       pool->buf_low_wm_assert = 1;

> -

>  odp_atomic_inc_u64(&pool->poolstats.buf_low_wm_count);

> -               }

> -

> -               odp_atomic_inc_u64(&pool->poolstats.bufallocs);

> -       }

> -

> -       return (void *)myhead;

> -}

> -

> -static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t

> *buf)

> -{

> -       if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {

> -               while (buf->segcount > 0) {

> -                       if (buffer_is_secure(buf) || pool_is_secure(pool))

> -                               memset(buf->addr[buf->segcount - 1],

> -                                      0, buf->segsize);

> -                       ret_blk(pool, buf->addr[--buf->segcount]);

> -               }

> -               buf->size = 0;

> -       }

> -

> -       buf->allocator = ODP_FREEBUF;  /* Mark buffer free */

> -       POOL_LOCK(&pool->buf_lock);

> -       buf->next = pool->buf_freelist;

> -       pool->buf_freelist = buf;

> -       POOL_UNLOCK(&pool->buf_lock);

> -

> -       uint64_t bufcount = odp_atomic_fetch_add_u32(&pool->bufcount, 1) +

> 1;

> -

> -       /* Check if low watermark condition should be deasserted */

> -       if (bufcount == pool->buf_high_wm && pool->buf_low_wm_assert) {

> -               pool->buf_low_wm_assert = 0;

> -               odp_atomic_inc_u64(&pool->poolstats.buf_high_wm_count);

> -       }

> -

> -       odp_atomic_inc_u64(&pool->poolstats.buffrees);

> -}

> -

> -static inline void *get_local_buf(local_cache_t *buf_cache,

> -                                 struct pool_entry_s *pool,

> -                                 size_t totsize)

> -{

> -       odp_buffer_hdr_t *buf = buf_cache->buf_freelist;

> -

> -       if (odp_likely(buf != NULL)) {

> -               buf_cache->buf_freelist = buf->next;

> -

> -               if (odp_unlikely(buf->size < totsize)) {

> -                       intmax_t needed = totsize - buf->size;

> -

> -                       do {

> -                               void *blk = get_blk(pool);

> -                               if (odp_unlikely(blk == NULL)) {

> -                                       ret_buf(pool, buf);

> -                                       buf_cache->buffrees--;

> -                                       return NULL;

> -                               }

> -                               buf->addr[buf->segcount++] = blk;

> -                               needed -= pool->seg_size;

> -                       } while (needed > 0);

> -

> -                       buf->size = buf->segcount * pool->seg_size;

> -               }

> -

> -               buf_cache->bufallocs++;

> -       }

> -

> -       return buf;

> -}

> -

> -static inline void ret_local_buf(local_cache_t *buf_cache,

> -                               odp_buffer_hdr_t *buf)

> -{

> -       buf->allocator = ODP_FREEBUF;

> -       buf->next = buf_cache->buf_freelist;

> -       buf_cache->buf_freelist = buf;

> -

> -       buf_cache->buffrees++;

> -}

> -

> -static inline void flush_cache(local_cache_t *buf_cache,

> -                              struct pool_entry_s *pool)

> -{

> -       odp_buffer_hdr_t *buf = buf_cache->buf_freelist;

> -       uint32_t flush_count = 0;

> -

> -       while (buf != NULL) {

> -               odp_buffer_hdr_t *next = buf->next;

> -               ret_buf(pool, buf);

> -               buf = next;

> -               flush_count++;

> -       }

> -

> -       odp_atomic_add_u64(&pool->poolstats.bufallocs,

> buf_cache->bufallocs);

> -       odp_atomic_add_u64(&pool->poolstats.buffrees,

> -                          buf_cache->buffrees - flush_count);

> -

> -       buf_cache->buf_freelist = NULL;

> -       buf_cache->bufallocs = 0;

> -       buf_cache->buffrees = 0;

> -}

> -

>  static inline odp_pool_t pool_index_to_handle(uint32_t pool_id)

>  {

>         return _odp_cast_scalar(odp_pool_t, pool_id);

> diff --git a/platform/linux-generic/odp_buffer.c

> b/platform/linux-generic/odp_buffer.c

> index e7e4d58..ce2fdba 100644

> --- a/platform/linux-generic/odp_buffer.c

> +++ b/platform/linux-generic/odp_buffer.c

> @@ -67,9 +67,6 @@ int odp_buffer_snprint(char *str, uint32_t n,

> odp_buffer_t buf)

>         len += snprintf(&str[len], n-len,

>                         "  size         %" PRIu32 "\n",        hdr->size);

>         len += snprintf(&str[len], n-len,

> -                       "  ref_count    %" PRIu32 "\n",

> -                       odp_atomic_load_u32(&hdr->ref_count));

> -       len += snprintf(&str[len], n-len,

>                         "  type         %i\n",        hdr->type);

>

>         return len;

> diff --git a/platform/linux-generic/odp_packet.c

> b/platform/linux-generic/odp_packet.c

> index 0e319d2..474fa81 100644

> --- a/platform/linux-generic/odp_packet.c

> +++ b/platform/linux-generic/odp_packet.c

> @@ -972,10 +972,7 @@ int _odp_packet_copy_md_to_packet(odp_packet_t

> srcpkt, odp_packet_t dstpkt)

>                        srchdr->buf_hdr.uarea_size ?

>                        dsthdr->buf_hdr.uarea_size :

>                        srchdr->buf_hdr.uarea_size);

> -       odp_atomic_store_u32(

> -               &dsthdr->buf_hdr.ref_count,

> -               odp_atomic_load_u32(

> -                       &srchdr->buf_hdr.ref_count));

> +

>         copy_packet_parser_metadata(srchdr, dsthdr);

>

>         /* Metadata copied, but return indication of whether the packet

> diff --git a/platform/linux-generic/odp_pool.c

> b/platform/linux-generic/odp_pool.c

> index 419f03f..0a427ed 100644

> --- a/platform/linux-generic/odp_pool.c

> +++ b/platform/linux-generic/odp_pool.c

> @@ -57,8 +57,15 @@ static const char SHM_DEFAULT_NAME[] =

> "odp_buffer_pools";

>  /* Pool entry pointers (for inlining) */

>  void *pool_entry_ptr[ODP_CONFIG_POOLS];

>

> -/* Cache thread id locally for local cache performance */

> -static __thread int local_id;

> +/* Thread local variables */

> +typedef struct pool_local_t {

> +       local_cache_t *cache[ODP_CONFIG_POOLS];

> +       int thr_id;

> +} pool_local_t;

> +

> +static __thread pool_local_t local;

> +

> +static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s

> *pool);

>

>  int odp_pool_init_global(void)

>  {

> @@ -111,7 +118,19 @@ int odp_pool_init_global(void)

>

>  int odp_pool_init_local(void)

>  {

> -       local_id = odp_thread_id();

> +       pool_entry_t *pool;

> +       int i;

> +       int thr_id = odp_thread_id();

> +

> +       memset(&local, 0, sizeof(pool_local_t));

> +

> +       for (i = 0; i < ODP_CONFIG_POOLS; i++) {

> +               pool           = get_pool_entry(i);

> +               local.cache[i] = &pool->s.local_cache[thr_id];

> +               local.cache[i]->s.num_buf = 0;

> +       }

> +

> +       local.thr_id = thr_id;

>         return 0;

>  }

>

> @@ -144,7 +163,14 @@ int odp_pool_term_global(void)

>

>  int odp_pool_term_local(void)

>  {

> -       _odp_flush_caches();

> +       int i;

> +

> +       for (i = 0; i < ODP_CONFIG_POOLS; i++) {

> +               pool_entry_t *pool = get_pool_entry(i);

> +

> +               flush_cache(local.cache[i], &pool->s);

> +       }

> +

>         return 0;

>  }

>

> @@ -179,10 +205,53 @@ int odp_pool_capability(odp_pool_capability_t *capa)

>         return 0;

>  }

>

> -/**

> +static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)

> +{

> +       odp_buffer_hdr_t *myhead;

> +

> +       POOL_LOCK(&pool->buf_lock);

> +

> +       myhead = pool->buf_freelist;

> +

> +       if (odp_unlikely(myhead == NULL)) {

> +               POOL_UNLOCK(&pool->buf_lock);

> +               odp_atomic_inc_u64(&pool->poolstats.bufempty);

> +       } else {

> +               pool->buf_freelist = myhead->next;

> +               POOL_UNLOCK(&pool->buf_lock);

> +

> +               odp_atomic_fetch_sub_u32(&pool->bufcount, 1);

> +               odp_atomic_inc_u64(&pool->poolstats.bufallocs);

> +       }

> +

> +       return (void *)myhead;

> +}

> +

> +static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t

> *buf)

> +{

> +       if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {

> +               while (buf->segcount > 0) {

> +                       if (buffer_is_secure(buf) || pool_is_secure(pool))

> +                               memset(buf->addr[buf->segcount - 1],

> +                                      0, buf->segsize);

> +                       ret_blk(pool, buf->addr[--buf->segcount]);

> +               }

> +               buf->size = 0;

> +       }

> +

> +       buf->allocator = ODP_FREEBUF;  /* Mark buffer free */

> +       POOL_LOCK(&pool->buf_lock);

> +       buf->next = pool->buf_freelist;

> +       pool->buf_freelist = buf;

> +       POOL_UNLOCK(&pool->buf_lock);

> +

> +       odp_atomic_fetch_add_u32(&pool->bufcount, 1);

> +       odp_atomic_inc_u64(&pool->poolstats.buffrees);

> +}

> +

> +/*

>   * Pool creation

>   */

> -

>  odp_pool_t _pool_create(const char *name,

>                         odp_pool_param_t *params,

>                         uint32_t shmflags)

> @@ -208,9 +277,6 @@ odp_pool_t _pool_create(const char *name,

>         /* Restriction for v1.0: All non-packet buffers are unsegmented */

>         int unseg = 1;

>

> -       /* Restriction for v1.0: No zeroization support */

> -       const int zeroized = 0;

> -

>         uint32_t blk_size, buf_stride, buf_num, blk_num, seg_len = 0;

>         uint32_t buf_align =

>                 params->type == ODP_POOL_BUFFER ? params->buf.align : 0;

> @@ -350,7 +416,6 @@ odp_pool_t _pool_create(const char *name,

>                 POOL_UNLOCK(&pool->s.lock);

>

>                 pool->s.flags.unsegmented = unseg;

> -               pool->s.flags.zeroized = zeroized;

>                 pool->s.seg_size = unseg ? blk_size : seg_len;

>                 pool->s.blk_size = blk_size;

>

> @@ -383,9 +448,7 @@ odp_pool_t _pool_create(const char *name,

>                         /* Iniitalize buffer metadata */

>                         tmp->allocator = ODP_FREEBUF;

>                         tmp->flags.all = 0;

> -                       tmp->flags.zeroized = zeroized;

>                         tmp->size = 0;

> -                       odp_atomic_init_u32(&tmp->ref_count, 0);

>                         tmp->type = params->type;

>                         tmp->event_type = params->type;

>                         tmp->pool_hdl = pool->s.pool_hdl;

> @@ -503,6 +566,41 @@ int odp_pool_info(odp_pool_t pool_hdl,

> odp_pool_info_t *info)

>         return 0;

>  }

>

> +static inline void get_local_cache_bufs(local_cache_t *buf_cache,

> uint32_t idx,

> +                                       odp_buffer_hdr_t *buf_hdr[],

> +                                       uint32_t num)

> +{

> +       uint32_t i;

> +

> +       for (i = 0; i < num; i++) {

> +               buf_hdr[i] = buf_cache->s.buf[idx + i];

> +               odp_prefetch(buf_hdr[i]);

> +               odp_prefetch_store(buf_hdr[i]);

> +       }

> +}

> +

> +static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s

> *pool)

> +{

> +       uint32_t flush_count = 0;

> +       uint32_t num;

> +

> +       while ((num = buf_cache->s.num_buf)) {

> +               odp_buffer_hdr_t *buf;

> +

> +               buf = buf_cache->s.buf[num - 1];

> +               ret_buf(pool, buf);

> +               flush_count++;

> +               buf_cache->s.num_buf--;

> +       }

> +

> +       odp_atomic_add_u64(&pool->poolstats.bufallocs,

> buf_cache->s.bufallocs);

> +       odp_atomic_add_u64(&pool->poolstats.buffrees,

> +                          buf_cache->s.buffrees - flush_count);

> +

> +       buf_cache->s.bufallocs = 0;

> +       buf_cache->s.buffrees = 0;

> +}

> +

>  int odp_pool_destroy(odp_pool_t pool_hdl)

>  {

>         uint32_t pool_id = pool_handle_to_index(pool_hdl);

> @@ -621,71 +719,207 @@ void seg_free_tail(odp_buffer_hdr_t *buf_hdr, int

> segcount)

>         buf_hdr->size      = buf_hdr->segcount * pool->s.seg_size;

>  }

>

> -odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)

> +static inline int get_local_bufs(local_cache_t *buf_cache,

> +                                odp_buffer_hdr_t *buf_hdr[], uint32_t

> max_num)

> +{

> +       uint32_t num_buf = buf_cache->s.num_buf;

> +       uint32_t num = num_buf;

> +

> +       if (odp_unlikely(num_buf == 0))

> +               return 0;

> +

> +       if (odp_likely(max_num < num))

> +               num = max_num;

> +

> +       get_local_cache_bufs(buf_cache, num_buf - num, buf_hdr, num);

> +       buf_cache->s.num_buf   -= num;

> +       buf_cache->s.bufallocs += num;

> +

> +       return num;

> +}

> +

> +static inline void ret_local_buf(local_cache_t *buf_cache, uint32_t idx,

> +                                odp_buffer_hdr_t *buf)

> +{

> +       buf_cache->s.buf[idx] = buf;

> +       buf_cache->s.num_buf++;

> +       buf_cache->s.buffrees++;

> +}

> +

> +static inline void ret_local_bufs(local_cache_t *buf_cache, uint32_t idx,

> +                                 odp_buffer_hdr_t *buf[], int num_buf)

> +{

> +       int i;

> +

> +       for (i = 0; i < num_buf; i++)

> +               buf_cache->s.buf[idx + i] = buf[i];

> +

> +       buf_cache->s.num_buf  += num_buf;

> +       buf_cache->s.buffrees += num_buf;

> +}

> +

> +int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,

> +                      odp_buffer_t buf[], int max_num)

>  {

>         uint32_t pool_id = pool_handle_to_index(pool_hdl);

>         pool_entry_t *pool = get_pool_entry(pool_id);

>         uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;

> -       odp_anybuf_t *buf;

> +       odp_buffer_hdr_t *buf_tbl[max_num];

> +       odp_buffer_hdr_t *buf_hdr;

> +       int num, i;

> +       intmax_t needed;

> +       void *blk;

>

>         /* Reject oversized allocation requests */

>         if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||

>             (!pool->s.flags.unsegmented &&

>              totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))

> -               return ODP_BUFFER_INVALID;

> +               return 0;

>

>         /* Try to satisfy request from the local cache */

> -       buf = (odp_anybuf_t *)

> -               (void *)get_local_buf(&pool->s.local_cache[local_id],

> -                                     &pool->s, totsize);

> +       num = get_local_bufs(local.cache[pool_id], buf_tbl, max_num);

>

>         /* If cache is empty, satisfy request from the pool */

> -       if (odp_unlikely(buf == NULL)) {

> -               buf = (odp_anybuf_t *)(void *)get_buf(&pool->s);

> +       if (odp_unlikely(num < max_num)) {

> +               for (; num < max_num; num++) {

> +                       buf_hdr = get_buf(&pool->s);

>

> -               if (odp_unlikely(buf == NULL))

> +                       if (odp_unlikely(buf_hdr == NULL))

> +                               goto pool_empty;

> +

> +                       /* Get blocks for this buffer, if pool uses

> +                        * application data */

> +                       if (buf_hdr->size < totsize) {

> +                               uint32_t segcount;

> +

> +                               needed = totsize - buf_hdr->size;

> +                               do {

> +                                       blk = get_blk(&pool->s);

> +                                       if (odp_unlikely(blk == NULL)) {

> +                                               ret_buf(&pool->s, buf_hdr);

> +                                               goto pool_empty;

> +                                       }

> +

> +                                       segcount = buf_hdr->segcount++;

> +                                       buf_hdr->addr[segcount] = blk;

> +                                       needed -= pool->s.seg_size;

> +                               } while (needed > 0);

> +                               buf_hdr->size = buf_hdr->segcount *

> +                                               pool->s.seg_size;

> +                       }

> +

> +                       buf_tbl[num] = buf_hdr;

> +               }

> +       }

> +

> +pool_empty:

> +       for (i = 0; i < num; i++) {

> +               buf_hdr = buf_tbl[i];

> +

> +               /* Mark buffer as allocated */

> +               buf_hdr->allocator = local.thr_id;

> +

> +               /* By default, buffers are not associated with

> +                * an ordered queue */

> +               buf_hdr->origin_qe = NULL;

> +

> +               buf[i] = odp_hdr_to_buf(buf_hdr);

> +

> +               /* Add more segments if buffer from local cache is too

> small */

> +               if (odp_unlikely(buf_hdr->size < totsize)) {

> +                       needed = totsize - buf_hdr->size;

> +                       do {

> +                               blk = get_blk(&pool->s);

> +                               if (odp_unlikely(blk == NULL)) {

> +                                       int j;

> +

> +                                       ret_buf(&pool->s, buf_hdr);

> +                                       buf_hdr = NULL;

> +                                       local.cache[pool_id]->s.buffrees--;

> +

> +                                       /* move remaining bufs up one step

> +                                        * and update loop counters */

> +                                       num--;

> +                                       for (j = i; j < num; j++)

> +                                               buf_tbl[j] = buf_tbl[j +

> 1];

> +

> +                                       i--;

> +                                       break;

> +                               }

> +                               needed -= pool->s.seg_size;

> +                               buf_hdr->addr[buf_hdr->segcount++] = blk;

> +                               buf_hdr->size = buf_hdr->segcount *

> +                                               pool->s.seg_size;

> +                       } while (needed > 0);

> +               }

> +       }

> +

> +       return num;

> +}

> +

> +odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)

> +{

> +       uint32_t pool_id = pool_handle_to_index(pool_hdl);

> +       pool_entry_t *pool = get_pool_entry(pool_id);

> +       uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;

> +       odp_buffer_hdr_t *buf_hdr;

> +       intmax_t needed;

> +       void *blk;

> +

> +       /* Reject oversized allocation requests */

> +       if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||

> +           (!pool->s.flags.unsegmented &&

> +            totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))

> +               return 0;

> +

> +       /* Try to satisfy request from the local cache. If cache is empty,

> +        * satisfy request from the pool */

> +       if (odp_unlikely(!get_local_bufs(local.cache[pool_id], &buf_hdr,

> 1))) {

> +               buf_hdr = get_buf(&pool->s);

> +

> +               if (odp_unlikely(buf_hdr == NULL))

>                         return ODP_BUFFER_INVALID;

>

>                 /* Get blocks for this buffer, if pool uses application

> data */

> -               if (buf->buf.size < totsize) {

> -                       intmax_t needed = totsize - buf->buf.size;

> +               if (buf_hdr->size < totsize) {

> +                       needed = totsize - buf_hdr->size;

>                         do {

> -                               uint8_t *blk = get_blk(&pool->s);

> -                               if (blk == NULL) {

> -                                       ret_buf(&pool->s, &buf->buf);

> +                               blk = get_blk(&pool->s);

> +                               if (odp_unlikely(blk == NULL)) {

> +                                       ret_buf(&pool->s, buf_hdr);

>                                         return ODP_BUFFER_INVALID;

>                                 }

> -                               buf->buf.addr[buf->buf.segcount++] = blk;

> +                               buf_hdr->addr[buf_hdr->segcount++] = blk;

>                                 needed -= pool->s.seg_size;

>                         } while (needed > 0);

> -                       buf->buf.size = buf->buf.segcount *

> pool->s.seg_size;

> +                       buf_hdr->size = buf_hdr->segcount *

> pool->s.seg_size;

>                 }

>         }

> -

>         /* Mark buffer as allocated */

> -       buf->buf.allocator = local_id;

> +       buf_hdr->allocator = local.thr_id;

>

> -       /* By default, buffers inherit their pool's zeroization setting */

> -       buf->buf.flags.zeroized = pool->s.flags.zeroized;

> +       /* By default, buffers are not associated with

> +        * an ordered queue */

> +       buf_hdr->origin_qe = NULL;

>

> -       /* By default, buffers are not associated with an ordered queue */

> -       buf->buf.origin_qe = NULL;

> -

> -       return odp_hdr_to_buf(&buf->buf);

> -}

> -

> -int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,

> -                      odp_buffer_t buf[], int num)

> -{

> -       int count;

> -

> -       for (count = 0; count < num; ++count) {

> -               buf[count] = buffer_alloc(pool_hdl, size);

> -               if (buf[count] == ODP_BUFFER_INVALID)

> -                       break;

> +       /* Add more segments if buffer from local cache is too small */

> +       if (odp_unlikely(buf_hdr->size < totsize)) {

> +               needed = totsize - buf_hdr->size;

> +               do {

> +                       blk = get_blk(&pool->s);

> +                       if (odp_unlikely(blk == NULL)) {

> +                               ret_buf(&pool->s, buf_hdr);

> +                               buf_hdr = NULL;

> +                               local.cache[pool_id]->s.buffrees--;

> +                               return ODP_BUFFER_INVALID;

> +                       }

> +                       buf_hdr->addr[buf_hdr->segcount++] = blk;

> +                       needed -= pool->s.seg_size;

> +               } while (needed > 0);

> +               buf_hdr->size = buf_hdr->segcount * pool->s.seg_size;

>         }

>

> -       return count;

> +       return odp_hdr_to_buf(buf_hdr);

>  }

>

>  odp_buffer_t odp_buffer_alloc(odp_pool_t pool_hdl)

> @@ -701,35 +935,132 @@ int odp_buffer_alloc_multi(odp_pool_t pool_hdl,

> odp_buffer_t buf[], int num)

>         return buffer_alloc_multi(pool_hdl, buf_size, buf, num);

>  }

>

> -void odp_buffer_free(odp_buffer_t buf)

> +static void multi_pool_free(odp_buffer_hdr_t *buf_hdr[], int num_buf)

>  {

> -       odp_buffer_hdr_t *buf_hdr = odp_buf_to_hdr(buf);

> -       pool_entry_t *pool = odp_buf_to_pool(buf_hdr);

> +       uint32_t pool_id, num;

> +       local_cache_t *buf_cache;

> +       pool_entry_t *pool;

> +       int i, j, idx;

>

> +       for (i = 0; i < num_buf; i++) {

> +               pool_id   =  pool_handle_to_index(buf_hdr[i]->pool_hdl);

> +               buf_cache = local.cache[pool_id];

> +               num       = buf_cache->s.num_buf;

> +

> +               if (num < POOL_MAX_LOCAL_BUFS) {

> +                       ret_local_buf(buf_cache, num, buf_hdr[i]);

> +                       continue;

> +               }

> +

> +               idx  = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;

> +               pool = get_pool_entry(pool_id);

> +

> +               /* local cache full, return a chunk */

> +               for (j = 0; j < POOL_CHUNK_SIZE; j++) {

> +                       odp_buffer_hdr_t *tmp;

> +

> +                       tmp = buf_cache->s.buf[idx + i];

> +                       ret_buf(&pool->s, tmp);

> +               }

> +

> +               num = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;

> +               buf_cache->s.num_buf = num;

> +               ret_local_buf(buf_cache, num, buf_hdr[i]);

> +       }

> +}

> +

> +void buffer_free_multi(uint32_t pool_id,

> +                      const odp_buffer_t buf[], int num_free)

> +{

> +       local_cache_t *buf_cache = local.cache[pool_id];

> +       uint32_t num;

> +       int i, idx;

> +       pool_entry_t *pool;

> +       odp_buffer_hdr_t *buf_hdr[num_free];

> +       int multi_pool = 0;

> +

> +       for (i = 0; i < num_free; i++) {

> +               uint32_t id;

> +

> +               buf_hdr[i] = odp_buf_to_hdr(buf[i]);

> +               ODP_ASSERT(buf_hdr[i]->allocator != ODP_FREEBUF);

> +               buf_hdr[i]->allocator = ODP_FREEBUF;

> +               id = pool_handle_to_index(buf_hdr[i]->pool_hdl);

> +               multi_pool |= (pool_id != id);

> +       }

> +

> +       if (odp_unlikely(multi_pool)) {

> +               multi_pool_free(buf_hdr, num_free);

> +               return;

> +       }

> +

> +       num = buf_cache->s.num_buf;

> +

> +       if (odp_likely((num + num_free) < POOL_MAX_LOCAL_BUFS)) {

> +               ret_local_bufs(buf_cache, num, buf_hdr, num_free);

> +               return;

> +       }

> +

> +       pool = get_pool_entry(pool_id);

> +

> +       /* Return at least one chunk into the global pool */

> +       if (odp_unlikely(num_free > POOL_CHUNK_SIZE)) {

> +               for (i = 0; i < num_free; i++)

> +                       ret_buf(&pool->s, buf_hdr[i]);

> +

> +               return;

> +       }

> +

> +       idx = num - POOL_CHUNK_SIZE;

> +       for (i = 0; i < POOL_CHUNK_SIZE; i++)

> +               ret_buf(&pool->s, buf_cache->s.buf[idx + i]);

> +

> +       num -= POOL_CHUNK_SIZE;

> +       buf_cache->s.num_buf = num;

> +       ret_local_bufs(buf_cache, num, buf_hdr, num_free);

> +}

> +

> +void buffer_free(uint32_t pool_id, const odp_buffer_t buf)

> +{

> +       local_cache_t *buf_cache = local.cache[pool_id];

> +       uint32_t num;

> +       int i;

> +       pool_entry_t *pool;

> +       odp_buffer_hdr_t *buf_hdr;

> +

> +       buf_hdr = odp_buf_to_hdr(buf);

>         ODP_ASSERT(buf_hdr->allocator != ODP_FREEBUF);

> +       buf_hdr->allocator = ODP_FREEBUF;

>

> -       if (odp_unlikely(pool->s.buf_low_wm_assert ||

> pool->s.blk_low_wm_assert))

> -               ret_buf(&pool->s, buf_hdr);

> -       else

> -               ret_local_buf(&pool->s.local_cache[local_id], buf_hdr);

> +       num = buf_cache->s.num_buf;

> +

> +       if (odp_likely((num + 1) < POOL_MAX_LOCAL_BUFS)) {

> +               ret_local_bufs(buf_cache, num, &buf_hdr, 1);

> +               return;

> +       }

> +

> +       pool = get_pool_entry(pool_id);

> +

> +       num -= POOL_CHUNK_SIZE;

> +       for (i = 0; i < POOL_CHUNK_SIZE; i++)

> +               ret_buf(&pool->s, buf_cache->s.buf[num + i]);

> +

> +       buf_cache->s.num_buf = num;

> +       ret_local_bufs(buf_cache, num, &buf_hdr, 1);

> +}

> +

> +void odp_buffer_free(odp_buffer_t buf)

> +{

> +       uint32_t pool_id = pool_id_from_buf(buf);

> +

> +       buffer_free(pool_id, buf);

>  }

>

>  void odp_buffer_free_multi(const odp_buffer_t buf[], int num)

>  {

> -       int i;

> +       uint32_t pool_id = pool_id_from_buf(buf[0]);

>

> -       for (i = 0; i < num; ++i)

> -               odp_buffer_free(buf[i]);

> -}

> -

> -void _odp_flush_caches(void)

> -{

> -       int i;

> -

> -       for (i = 0; i < ODP_CONFIG_POOLS; i++) {

> -               pool_entry_t *pool = get_pool_entry(i);

> -               flush_cache(&pool->s.local_cache[local_id], &pool->s);

> -       }

> +       buffer_free_multi(pool_id, buf, num);

>  }

>

>  void odp_pool_print(odp_pool_t pool_hdl)

> @@ -774,7 +1105,6 @@ void odp_pool_print(odp_pool_t pool_hdl)

>                 pool->s.quiesced ? "quiesced" : "active");

>         ODP_DBG(" pool opts       %s, %s, %s\n",

>                 pool->s.flags.unsegmented ? "unsegmented" : "segmented",

> -               pool->s.flags.zeroized ? "zeroized" : "non-zeroized",

>                 pool->s.flags.predefined  ? "predefined" : "created");

>         ODP_DBG(" pool base       %p\n",  pool->s.pool_base_addr);

>         ODP_DBG(" pool size       %zu (%zu pages)\n",

> @@ -817,10 +1147,11 @@ void odp_pool_print(odp_pool_t pool_hdl)

>         ODP_DBG(" blk low wm count    %lu\n", blklowmct);

>  }

>

> -

>  odp_pool_t odp_buffer_pool(odp_buffer_t buf)

>  {

> -       return odp_buf_to_hdr(buf)->pool_hdl;

> +       uint32_t pool_id = pool_id_from_buf(buf);

> +

> +       return pool_index_to_handle(pool_id);

>  }

>

>  void odp_pool_param_init(odp_pool_param_t *params)

> --

> 2.7.4

>

>
Maxim Uvarov July 26, 2016, 1:30 p.m. UTC | #2
Merged,
Maxim.

On 07/26/16 03:15, Bill Fischofer wrote:
> For this series:

>

> Reviewed-and-tested-by: Bill Fischofer <bill.fischofer@linaro.org>

>

> On Fri, Jul 22, 2016 at 5:04 AM, Matias Elo <matias.elo@nokia.com> wrote:

>

>> Optimize local buffer cache performance which is critical to

>> many use cases - including packet IO.

>>

>> Main parts of the optimization are:

>>   * Local cache implemented as an array of buf_hdr pointers,

>>     instead of a linked list (which causes a lot of cache misses)

>>   * Alloc and free N buffers per operation

>>

>> All above steps are needed to demonstrate the performance upgrade.

>> Some related pool functions (get_buf(), ret_buf(), etc) were moved

>> from pool header to c source file, since those were actual local

>> to the c source file. Also some unused pool variables are removed

>> also.

>>

>> Signed-off-by: Petri Savolainen <petri.savolainen@nokia.com>

>> Signed-off-by: Matias Elo <matias.elo@nokia.com>

>> ---

>>

>> V2:

>> - Split pktio modifications into a separate patch (Bill)

>> - Improve performance by adding separate functions for single buffer

>>    alloc/free operations

>>

>>   .../linux-generic/include/odp_buffer_inlines.h     |  26 +-

>>   .../linux-generic/include/odp_buffer_internal.h    |   5 +-

>>   platform/linux-generic/include/odp_internal.h      |   2 -

>>   platform/linux-generic/include/odp_pool_internal.h | 143 +------

>>   platform/linux-generic/odp_buffer.c                |   3 -

>>   platform/linux-generic/odp_packet.c                |   5 +-

>>   platform/linux-generic/odp_pool.c                  | 473

>> +++++++++++++++++----

>>   7 files changed, 426 insertions(+), 231 deletions(-)

>>

>> diff --git a/platform/linux-generic/include/odp_buffer_inlines.h

>> b/platform/linux-generic/include/odp_buffer_inlines.h

>> index 3f4d9fd..2b1ab42 100644

>> --- a/platform/linux-generic/include/odp_buffer_inlines.h

>> +++ b/platform/linux-generic/include/odp_buffer_inlines.h

>> @@ -56,30 +56,12 @@ static inline odp_buffer_hdr_t

>> *odp_buf_to_hdr(odp_buffer_t buf)

>>                  (pool->pool_mdata_addr + (index * ODP_CACHE_LINE_SIZE));

>>   }

>>

>> -static inline uint32_t odp_buffer_refcount(odp_buffer_hdr_t *buf)

>> +static inline uint32_t pool_id_from_buf(odp_buffer_t buf)

>>   {

>> -       return odp_atomic_load_u32(&buf->ref_count);

>> -}

>> +       odp_buffer_bits_t handle;

>>

>> -static inline uint32_t odp_buffer_incr_refcount(odp_buffer_hdr_t *buf,

>> -                                               uint32_t val)

>> -{

>> -       return odp_atomic_fetch_add_u32(&buf->ref_count, val) + val;

>> -}

>> -

>> -static inline uint32_t odp_buffer_decr_refcount(odp_buffer_hdr_t *buf,

>> -                                               uint32_t val)

>> -{

>> -       uint32_t tmp;

>> -

>> -       tmp = odp_atomic_fetch_sub_u32(&buf->ref_count, val);

>> -

>> -       if (tmp < val) {

>> -               odp_atomic_fetch_add_u32(&buf->ref_count, val - tmp);

>> -               return 0;

>> -       } else {

>> -               return tmp - val;

>> -       }

>> +       handle.handle = buf;

>> +       return handle.pool_id;

>>   }

>>

>>   static inline odp_buffer_hdr_t *validate_buf(odp_buffer_t buf)

>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h

>> b/platform/linux-generic/include/odp_buffer_internal.h

>> index f21364c..7b0ef8b 100644

>> --- a/platform/linux-generic/include/odp_buffer_internal.h

>> +++ b/platform/linux-generic/include/odp_buffer_internal.h

>> @@ -114,7 +114,6 @@ struct odp_buffer_hdr_t {

>>          union {

>>                  uint32_t all;

>>                  struct {

>> -                       uint32_t zeroized:1; /* Zeroize buf data on free */

>>                          uint32_t hdrdata:1;  /* Data is in buffer hdr */

>>                          uint32_t sustain:1;  /* Sustain order */

>>                  };

>> @@ -123,7 +122,6 @@ struct odp_buffer_hdr_t {

>>          int8_t                   type;       /* buffer type */

>>          odp_event_type_t         event_type; /* for reuse as event */

>>          uint32_t                 size;       /* max data size */

>> -       odp_atomic_u32_t         ref_count;  /* reference count */

>>          odp_pool_t               pool_hdl;   /* buffer pool handle */

>>          union {

>>                  uint64_t         buf_u64;    /* user u64 */

>> @@ -174,6 +172,9 @@ typedef struct {

>>   odp_buffer_t buffer_alloc(odp_pool_t pool, size_t size);

>>   int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,

>>                         odp_buffer_t buf[], int num);

>> +void buffer_free(uint32_t pool_id, const odp_buffer_t buf);

>> +void buffer_free_multi(uint32_t pool_id,

>> +                      const odp_buffer_t buf[], int num_free);

>>   int seg_alloc_head(odp_buffer_hdr_t *buf_hdr, int segcount);

>>   void seg_free_head(odp_buffer_hdr_t *buf_hdr, int segcount);

>>   int seg_alloc_tail(odp_buffer_hdr_t *buf_hdr, int segcount);

>> diff --git a/platform/linux-generic/include/odp_internal.h

>> b/platform/linux-generic/include/odp_internal.h

>> index d12f850..8bad450 100644

>> --- a/platform/linux-generic/include/odp_internal.h

>> +++ b/platform/linux-generic/include/odp_internal.h

>> @@ -119,8 +119,6 @@ int odp_tm_term_global(void);

>>   int _odp_int_name_tbl_init_global(void);

>>   int _odp_int_name_tbl_term_global(void);

>>

>> -void _odp_flush_caches(void);

>> -

>>   int cpuinfo_parser(FILE *file, system_info_t *sysinfo);

>>   uint64_t odp_cpu_hz_current(int id);

>>

>> diff --git a/platform/linux-generic/include/odp_pool_internal.h

>> b/platform/linux-generic/include/odp_pool_internal.h

>> index 3317bd0..d6717ff 100644

>> --- a/platform/linux-generic/include/odp_pool_internal.h

>> +++ b/platform/linux-generic/include/odp_pool_internal.h

>> @@ -51,15 +51,25 @@ typedef struct _odp_buffer_pool_init_t {

>>          void *buf_init_arg;        /**< Argument to be passed to

>> buf_init() */

>>   } _odp_buffer_pool_init_t;         /**< Type of buffer initialization

>> struct */

>>

>> +#define POOL_MAX_LOCAL_CHUNKS 4

>> +#define POOL_CHUNK_SIZE       32

>> +#define POOL_MAX_LOCAL_BUFS   (POOL_MAX_LOCAL_CHUNKS * POOL_CHUNK_SIZE)

>> +

>> +struct local_cache_s {

>> +       uint64_t bufallocs;  /* Local buffer alloc count */

>> +       uint64_t buffrees;   /* Local buffer free count */

>> +

>> +       uint32_t num_buf;

>> +       odp_buffer_hdr_t *buf[POOL_MAX_LOCAL_BUFS];

>> +};

>> +

>>   /* Local cache for buffer alloc/free acceleration */

>>   typedef struct local_cache_t {

>>          union {

>> -               struct {

>> -                       odp_buffer_hdr_t *buf_freelist;  /* The local

>> cache */

>> -                       uint64_t bufallocs;  /* Local buffer alloc count */

>> -                       uint64_t buffrees;   /* Local buffer free count */

>> -               };

>> -               uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(sizeof(uint64_t))];

>> +               struct local_cache_s s;

>> +

>> +               uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(

>> +                           sizeof(struct local_cache_s))];

>>          };

>>   } local_cache_t;

>>

>> @@ -214,127 +224,6 @@ static inline void ret_blk(struct pool_entry_s

>> *pool, void *block)

>>          odp_atomic_inc_u64(&pool->poolstats.blkfrees);

>>   }

>>

>> -static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)

>> -{

>> -       odp_buffer_hdr_t *myhead;

>> -       POOL_LOCK(&pool->buf_lock);

>> -

>> -       myhead = pool->buf_freelist;

>> -

>> -       if (odp_unlikely(myhead == NULL)) {

>> -               POOL_UNLOCK(&pool->buf_lock);

>> -               odp_atomic_inc_u64(&pool->poolstats.bufempty);

>> -       } else {

>> -               pool->buf_freelist = myhead->next;

>> -               POOL_UNLOCK(&pool->buf_lock);

>> -               uint64_t bufcount =

>> -                       odp_atomic_fetch_sub_u32(&pool->bufcount, 1) - 1;

>> -

>> -               /* Check for low watermark condition */

>> -               if (bufcount == pool->buf_low_wm &&

>> !pool->buf_low_wm_assert) {

>> -                       pool->buf_low_wm_assert = 1;

>> -

>>   odp_atomic_inc_u64(&pool->poolstats.buf_low_wm_count);

>> -               }

>> -

>> -               odp_atomic_inc_u64(&pool->poolstats.bufallocs);

>> -       }

>> -

>> -       return (void *)myhead;

>> -}

>> -

>> -static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t

>> *buf)

>> -{

>> -       if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {

>> -               while (buf->segcount > 0) {

>> -                       if (buffer_is_secure(buf) || pool_is_secure(pool))

>> -                               memset(buf->addr[buf->segcount - 1],

>> -                                      0, buf->segsize);

>> -                       ret_blk(pool, buf->addr[--buf->segcount]);

>> -               }

>> -               buf->size = 0;

>> -       }

>> -

>> -       buf->allocator = ODP_FREEBUF;  /* Mark buffer free */

>> -       POOL_LOCK(&pool->buf_lock);

>> -       buf->next = pool->buf_freelist;

>> -       pool->buf_freelist = buf;

>> -       POOL_UNLOCK(&pool->buf_lock);

>> -

>> -       uint64_t bufcount = odp_atomic_fetch_add_u32(&pool->bufcount, 1) +

>> 1;

>> -

>> -       /* Check if low watermark condition should be deasserted */

>> -       if (bufcount == pool->buf_high_wm && pool->buf_low_wm_assert) {

>> -               pool->buf_low_wm_assert = 0;

>> -               odp_atomic_inc_u64(&pool->poolstats.buf_high_wm_count);

>> -       }

>> -

>> -       odp_atomic_inc_u64(&pool->poolstats.buffrees);

>> -}

>> -

>> -static inline void *get_local_buf(local_cache_t *buf_cache,

>> -                                 struct pool_entry_s *pool,

>> -                                 size_t totsize)

>> -{

>> -       odp_buffer_hdr_t *buf = buf_cache->buf_freelist;

>> -

>> -       if (odp_likely(buf != NULL)) {

>> -               buf_cache->buf_freelist = buf->next;

>> -

>> -               if (odp_unlikely(buf->size < totsize)) {

>> -                       intmax_t needed = totsize - buf->size;

>> -

>> -                       do {

>> -                               void *blk = get_blk(pool);

>> -                               if (odp_unlikely(blk == NULL)) {

>> -                                       ret_buf(pool, buf);

>> -                                       buf_cache->buffrees--;

>> -                                       return NULL;

>> -                               }

>> -                               buf->addr[buf->segcount++] = blk;

>> -                               needed -= pool->seg_size;

>> -                       } while (needed > 0);

>> -

>> -                       buf->size = buf->segcount * pool->seg_size;

>> -               }

>> -

>> -               buf_cache->bufallocs++;

>> -       }

>> -

>> -       return buf;

>> -}

>> -

>> -static inline void ret_local_buf(local_cache_t *buf_cache,

>> -                               odp_buffer_hdr_t *buf)

>> -{

>> -       buf->allocator = ODP_FREEBUF;

>> -       buf->next = buf_cache->buf_freelist;

>> -       buf_cache->buf_freelist = buf;

>> -

>> -       buf_cache->buffrees++;

>> -}

>> -

>> -static inline void flush_cache(local_cache_t *buf_cache,

>> -                              struct pool_entry_s *pool)

>> -{

>> -       odp_buffer_hdr_t *buf = buf_cache->buf_freelist;

>> -       uint32_t flush_count = 0;

>> -

>> -       while (buf != NULL) {

>> -               odp_buffer_hdr_t *next = buf->next;

>> -               ret_buf(pool, buf);

>> -               buf = next;

>> -               flush_count++;

>> -       }

>> -

>> -       odp_atomic_add_u64(&pool->poolstats.bufallocs,

>> buf_cache->bufallocs);

>> -       odp_atomic_add_u64(&pool->poolstats.buffrees,

>> -                          buf_cache->buffrees - flush_count);

>> -

>> -       buf_cache->buf_freelist = NULL;

>> -       buf_cache->bufallocs = 0;

>> -       buf_cache->buffrees = 0;

>> -}

>> -

>>   static inline odp_pool_t pool_index_to_handle(uint32_t pool_id)

>>   {

>>          return _odp_cast_scalar(odp_pool_t, pool_id);

>> diff --git a/platform/linux-generic/odp_buffer.c

>> b/platform/linux-generic/odp_buffer.c

>> index e7e4d58..ce2fdba 100644

>> --- a/platform/linux-generic/odp_buffer.c

>> +++ b/platform/linux-generic/odp_buffer.c

>> @@ -67,9 +67,6 @@ int odp_buffer_snprint(char *str, uint32_t n,

>> odp_buffer_t buf)

>>          len += snprintf(&str[len], n-len,

>>                          "  size         %" PRIu32 "\n",        hdr->size);

>>          len += snprintf(&str[len], n-len,

>> -                       "  ref_count    %" PRIu32 "\n",

>> -                       odp_atomic_load_u32(&hdr->ref_count));

>> -       len += snprintf(&str[len], n-len,

>>                          "  type         %i\n",        hdr->type);

>>

>>          return len;

>> diff --git a/platform/linux-generic/odp_packet.c

>> b/platform/linux-generic/odp_packet.c

>> index 0e319d2..474fa81 100644

>> --- a/platform/linux-generic/odp_packet.c

>> +++ b/platform/linux-generic/odp_packet.c

>> @@ -972,10 +972,7 @@ int _odp_packet_copy_md_to_packet(odp_packet_t

>> srcpkt, odp_packet_t dstpkt)

>>                         srchdr->buf_hdr.uarea_size ?

>>                         dsthdr->buf_hdr.uarea_size :

>>                         srchdr->buf_hdr.uarea_size);

>> -       odp_atomic_store_u32(

>> -               &dsthdr->buf_hdr.ref_count,

>> -               odp_atomic_load_u32(

>> -                       &srchdr->buf_hdr.ref_count));

>> +

>>          copy_packet_parser_metadata(srchdr, dsthdr);

>>

>>          /* Metadata copied, but return indication of whether the packet

>> diff --git a/platform/linux-generic/odp_pool.c

>> b/platform/linux-generic/odp_pool.c

>> index 419f03f..0a427ed 100644

>> --- a/platform/linux-generic/odp_pool.c

>> +++ b/platform/linux-generic/odp_pool.c

>> @@ -57,8 +57,15 @@ static const char SHM_DEFAULT_NAME[] =

>> "odp_buffer_pools";

>>   /* Pool entry pointers (for inlining) */

>>   void *pool_entry_ptr[ODP_CONFIG_POOLS];

>>

>> -/* Cache thread id locally for local cache performance */

>> -static __thread int local_id;

>> +/* Thread local variables */

>> +typedef struct pool_local_t {

>> +       local_cache_t *cache[ODP_CONFIG_POOLS];

>> +       int thr_id;

>> +} pool_local_t;

>> +

>> +static __thread pool_local_t local;

>> +

>> +static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s

>> *pool);

>>

>>   int odp_pool_init_global(void)

>>   {

>> @@ -111,7 +118,19 @@ int odp_pool_init_global(void)

>>

>>   int odp_pool_init_local(void)

>>   {

>> -       local_id = odp_thread_id();

>> +       pool_entry_t *pool;

>> +       int i;

>> +       int thr_id = odp_thread_id();

>> +

>> +       memset(&local, 0, sizeof(pool_local_t));

>> +

>> +       for (i = 0; i < ODP_CONFIG_POOLS; i++) {

>> +               pool           = get_pool_entry(i);

>> +               local.cache[i] = &pool->s.local_cache[thr_id];

>> +               local.cache[i]->s.num_buf = 0;

>> +       }

>> +

>> +       local.thr_id = thr_id;

>>          return 0;

>>   }

>>

>> @@ -144,7 +163,14 @@ int odp_pool_term_global(void)

>>

>>   int odp_pool_term_local(void)

>>   {

>> -       _odp_flush_caches();

>> +       int i;

>> +

>> +       for (i = 0; i < ODP_CONFIG_POOLS; i++) {

>> +               pool_entry_t *pool = get_pool_entry(i);

>> +

>> +               flush_cache(local.cache[i], &pool->s);

>> +       }

>> +

>>          return 0;

>>   }

>>

>> @@ -179,10 +205,53 @@ int odp_pool_capability(odp_pool_capability_t *capa)

>>          return 0;

>>   }

>>

>> -/**

>> +static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)

>> +{

>> +       odp_buffer_hdr_t *myhead;

>> +

>> +       POOL_LOCK(&pool->buf_lock);

>> +

>> +       myhead = pool->buf_freelist;

>> +

>> +       if (odp_unlikely(myhead == NULL)) {

>> +               POOL_UNLOCK(&pool->buf_lock);

>> +               odp_atomic_inc_u64(&pool->poolstats.bufempty);

>> +       } else {

>> +               pool->buf_freelist = myhead->next;

>> +               POOL_UNLOCK(&pool->buf_lock);

>> +

>> +               odp_atomic_fetch_sub_u32(&pool->bufcount, 1);

>> +               odp_atomic_inc_u64(&pool->poolstats.bufallocs);

>> +       }

>> +

>> +       return (void *)myhead;

>> +}

>> +

>> +static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t

>> *buf)

>> +{

>> +       if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {

>> +               while (buf->segcount > 0) {

>> +                       if (buffer_is_secure(buf) || pool_is_secure(pool))

>> +                               memset(buf->addr[buf->segcount - 1],

>> +                                      0, buf->segsize);

>> +                       ret_blk(pool, buf->addr[--buf->segcount]);

>> +               }

>> +               buf->size = 0;

>> +       }

>> +

>> +       buf->allocator = ODP_FREEBUF;  /* Mark buffer free */

>> +       POOL_LOCK(&pool->buf_lock);

>> +       buf->next = pool->buf_freelist;

>> +       pool->buf_freelist = buf;

>> +       POOL_UNLOCK(&pool->buf_lock);

>> +

>> +       odp_atomic_fetch_add_u32(&pool->bufcount, 1);

>> +       odp_atomic_inc_u64(&pool->poolstats.buffrees);

>> +}

>> +

>> +/*

>>    * Pool creation

>>    */

>> -

>>   odp_pool_t _pool_create(const char *name,

>>                          odp_pool_param_t *params,

>>                          uint32_t shmflags)

>> @@ -208,9 +277,6 @@ odp_pool_t _pool_create(const char *name,

>>          /* Restriction for v1.0: All non-packet buffers are unsegmented */

>>          int unseg = 1;

>>

>> -       /* Restriction for v1.0: No zeroization support */

>> -       const int zeroized = 0;

>> -

>>          uint32_t blk_size, buf_stride, buf_num, blk_num, seg_len = 0;

>>          uint32_t buf_align =

>>                  params->type == ODP_POOL_BUFFER ? params->buf.align : 0;

>> @@ -350,7 +416,6 @@ odp_pool_t _pool_create(const char *name,

>>                  POOL_UNLOCK(&pool->s.lock);

>>

>>                  pool->s.flags.unsegmented = unseg;

>> -               pool->s.flags.zeroized = zeroized;

>>                  pool->s.seg_size = unseg ? blk_size : seg_len;

>>                  pool->s.blk_size = blk_size;

>>

>> @@ -383,9 +448,7 @@ odp_pool_t _pool_create(const char *name,

>>                          /* Iniitalize buffer metadata */

>>                          tmp->allocator = ODP_FREEBUF;

>>                          tmp->flags.all = 0;

>> -                       tmp->flags.zeroized = zeroized;

>>                          tmp->size = 0;

>> -                       odp_atomic_init_u32(&tmp->ref_count, 0);

>>                          tmp->type = params->type;

>>                          tmp->event_type = params->type;

>>                          tmp->pool_hdl = pool->s.pool_hdl;

>> @@ -503,6 +566,41 @@ int odp_pool_info(odp_pool_t pool_hdl,

>> odp_pool_info_t *info)

>>          return 0;

>>   }

>>

>> +static inline void get_local_cache_bufs(local_cache_t *buf_cache,

>> uint32_t idx,

>> +                                       odp_buffer_hdr_t *buf_hdr[],

>> +                                       uint32_t num)

>> +{

>> +       uint32_t i;

>> +

>> +       for (i = 0; i < num; i++) {

>> +               buf_hdr[i] = buf_cache->s.buf[idx + i];

>> +               odp_prefetch(buf_hdr[i]);

>> +               odp_prefetch_store(buf_hdr[i]);

>> +       }

>> +}

>> +

>> +static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s

>> *pool)

>> +{

>> +       uint32_t flush_count = 0;

>> +       uint32_t num;

>> +

>> +       while ((num = buf_cache->s.num_buf)) {

>> +               odp_buffer_hdr_t *buf;

>> +

>> +               buf = buf_cache->s.buf[num - 1];

>> +               ret_buf(pool, buf);

>> +               flush_count++;

>> +               buf_cache->s.num_buf--;

>> +       }

>> +

>> +       odp_atomic_add_u64(&pool->poolstats.bufallocs,

>> buf_cache->s.bufallocs);

>> +       odp_atomic_add_u64(&pool->poolstats.buffrees,

>> +                          buf_cache->s.buffrees - flush_count);

>> +

>> +       buf_cache->s.bufallocs = 0;

>> +       buf_cache->s.buffrees = 0;

>> +}

>> +

>>   int odp_pool_destroy(odp_pool_t pool_hdl)

>>   {

>>          uint32_t pool_id = pool_handle_to_index(pool_hdl);

>> @@ -621,71 +719,207 @@ void seg_free_tail(odp_buffer_hdr_t *buf_hdr, int

>> segcount)

>>          buf_hdr->size      = buf_hdr->segcount * pool->s.seg_size;

>>   }

>>

>> -odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)

>> +static inline int get_local_bufs(local_cache_t *buf_cache,

>> +                                odp_buffer_hdr_t *buf_hdr[], uint32_t

>> max_num)

>> +{

>> +       uint32_t num_buf = buf_cache->s.num_buf;

>> +       uint32_t num = num_buf;

>> +

>> +       if (odp_unlikely(num_buf == 0))

>> +               return 0;

>> +

>> +       if (odp_likely(max_num < num))

>> +               num = max_num;

>> +

>> +       get_local_cache_bufs(buf_cache, num_buf - num, buf_hdr, num);

>> +       buf_cache->s.num_buf   -= num;

>> +       buf_cache->s.bufallocs += num;

>> +

>> +       return num;

>> +}

>> +

>> +static inline void ret_local_buf(local_cache_t *buf_cache, uint32_t idx,

>> +                                odp_buffer_hdr_t *buf)

>> +{

>> +       buf_cache->s.buf[idx] = buf;

>> +       buf_cache->s.num_buf++;

>> +       buf_cache->s.buffrees++;

>> +}

>> +

>> +static inline void ret_local_bufs(local_cache_t *buf_cache, uint32_t idx,

>> +                                 odp_buffer_hdr_t *buf[], int num_buf)

>> +{

>> +       int i;

>> +

>> +       for (i = 0; i < num_buf; i++)

>> +               buf_cache->s.buf[idx + i] = buf[i];

>> +

>> +       buf_cache->s.num_buf  += num_buf;

>> +       buf_cache->s.buffrees += num_buf;

>> +}

>> +

>> +int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,

>> +                      odp_buffer_t buf[], int max_num)

>>   {

>>          uint32_t pool_id = pool_handle_to_index(pool_hdl);

>>          pool_entry_t *pool = get_pool_entry(pool_id);

>>          uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;

>> -       odp_anybuf_t *buf;

>> +       odp_buffer_hdr_t *buf_tbl[max_num];

>> +       odp_buffer_hdr_t *buf_hdr;

>> +       int num, i;

>> +       intmax_t needed;

>> +       void *blk;

>>

>>          /* Reject oversized allocation requests */

>>          if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||

>>              (!pool->s.flags.unsegmented &&

>>               totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))

>> -               return ODP_BUFFER_INVALID;

>> +               return 0;

>>

>>          /* Try to satisfy request from the local cache */

>> -       buf = (odp_anybuf_t *)

>> -               (void *)get_local_buf(&pool->s.local_cache[local_id],

>> -                                     &pool->s, totsize);

>> +       num = get_local_bufs(local.cache[pool_id], buf_tbl, max_num);

>>

>>          /* If cache is empty, satisfy request from the pool */

>> -       if (odp_unlikely(buf == NULL)) {

>> -               buf = (odp_anybuf_t *)(void *)get_buf(&pool->s);

>> +       if (odp_unlikely(num < max_num)) {

>> +               for (; num < max_num; num++) {

>> +                       buf_hdr = get_buf(&pool->s);

>>

>> -               if (odp_unlikely(buf == NULL))

>> +                       if (odp_unlikely(buf_hdr == NULL))

>> +                               goto pool_empty;

>> +

>> +                       /* Get blocks for this buffer, if pool uses

>> +                        * application data */

>> +                       if (buf_hdr->size < totsize) {

>> +                               uint32_t segcount;

>> +

>> +                               needed = totsize - buf_hdr->size;

>> +                               do {

>> +                                       blk = get_blk(&pool->s);

>> +                                       if (odp_unlikely(blk == NULL)) {

>> +                                               ret_buf(&pool->s, buf_hdr);

>> +                                               goto pool_empty;

>> +                                       }

>> +

>> +                                       segcount = buf_hdr->segcount++;

>> +                                       buf_hdr->addr[segcount] = blk;

>> +                                       needed -= pool->s.seg_size;

>> +                               } while (needed > 0);

>> +                               buf_hdr->size = buf_hdr->segcount *

>> +                                               pool->s.seg_size;

>> +                       }

>> +

>> +                       buf_tbl[num] = buf_hdr;

>> +               }

>> +       }

>> +

>> +pool_empty:

>> +       for (i = 0; i < num; i++) {

>> +               buf_hdr = buf_tbl[i];

>> +

>> +               /* Mark buffer as allocated */

>> +               buf_hdr->allocator = local.thr_id;

>> +

>> +               /* By default, buffers are not associated with

>> +                * an ordered queue */

>> +               buf_hdr->origin_qe = NULL;

>> +

>> +               buf[i] = odp_hdr_to_buf(buf_hdr);

>> +

>> +               /* Add more segments if buffer from local cache is too

>> small */

>> +               if (odp_unlikely(buf_hdr->size < totsize)) {

>> +                       needed = totsize - buf_hdr->size;

>> +                       do {

>> +                               blk = get_blk(&pool->s);

>> +                               if (odp_unlikely(blk == NULL)) {

>> +                                       int j;

>> +

>> +                                       ret_buf(&pool->s, buf_hdr);

>> +                                       buf_hdr = NULL;

>> +                                       local.cache[pool_id]->s.buffrees--;

>> +

>> +                                       /* move remaining bufs up one step

>> +                                        * and update loop counters */

>> +                                       num--;

>> +                                       for (j = i; j < num; j++)

>> +                                               buf_tbl[j] = buf_tbl[j +

>> 1];

>> +

>> +                                       i--;

>> +                                       break;

>> +                               }

>> +                               needed -= pool->s.seg_size;

>> +                               buf_hdr->addr[buf_hdr->segcount++] = blk;

>> +                               buf_hdr->size = buf_hdr->segcount *

>> +                                               pool->s.seg_size;

>> +                       } while (needed > 0);

>> +               }

>> +       }

>> +

>> +       return num;

>> +}

>> +

>> +odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)

>> +{

>> +       uint32_t pool_id = pool_handle_to_index(pool_hdl);

>> +       pool_entry_t *pool = get_pool_entry(pool_id);

>> +       uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;

>> +       odp_buffer_hdr_t *buf_hdr;

>> +       intmax_t needed;

>> +       void *blk;

>> +

>> +       /* Reject oversized allocation requests */

>> +       if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||

>> +           (!pool->s.flags.unsegmented &&

>> +            totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))

>> +               return 0;

>> +

>> +       /* Try to satisfy request from the local cache. If cache is empty,

>> +        * satisfy request from the pool */

>> +       if (odp_unlikely(!get_local_bufs(local.cache[pool_id], &buf_hdr,

>> 1))) {

>> +               buf_hdr = get_buf(&pool->s);

>> +

>> +               if (odp_unlikely(buf_hdr == NULL))

>>                          return ODP_BUFFER_INVALID;

>>

>>                  /* Get blocks for this buffer, if pool uses application

>> data */

>> -               if (buf->buf.size < totsize) {

>> -                       intmax_t needed = totsize - buf->buf.size;

>> +               if (buf_hdr->size < totsize) {

>> +                       needed = totsize - buf_hdr->size;

>>                          do {

>> -                               uint8_t *blk = get_blk(&pool->s);

>> -                               if (blk == NULL) {

>> -                                       ret_buf(&pool->s, &buf->buf);

>> +                               blk = get_blk(&pool->s);

>> +                               if (odp_unlikely(blk == NULL)) {

>> +                                       ret_buf(&pool->s, buf_hdr);

>>                                          return ODP_BUFFER_INVALID;

>>                                  }

>> -                               buf->buf.addr[buf->buf.segcount++] = blk;

>> +                               buf_hdr->addr[buf_hdr->segcount++] = blk;

>>                                  needed -= pool->s.seg_size;

>>                          } while (needed > 0);

>> -                       buf->buf.size = buf->buf.segcount *

>> pool->s.seg_size;

>> +                       buf_hdr->size = buf_hdr->segcount *

>> pool->s.seg_size;

>>                  }

>>          }

>> -

>>          /* Mark buffer as allocated */

>> -       buf->buf.allocator = local_id;

>> +       buf_hdr->allocator = local.thr_id;

>>

>> -       /* By default, buffers inherit their pool's zeroization setting */

>> -       buf->buf.flags.zeroized = pool->s.flags.zeroized;

>> +       /* By default, buffers are not associated with

>> +        * an ordered queue */

>> +       buf_hdr->origin_qe = NULL;

>>

>> -       /* By default, buffers are not associated with an ordered queue */

>> -       buf->buf.origin_qe = NULL;

>> -

>> -       return odp_hdr_to_buf(&buf->buf);

>> -}

>> -

>> -int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,

>> -                      odp_buffer_t buf[], int num)

>> -{

>> -       int count;

>> -

>> -       for (count = 0; count < num; ++count) {

>> -               buf[count] = buffer_alloc(pool_hdl, size);

>> -               if (buf[count] == ODP_BUFFER_INVALID)

>> -                       break;

>> +       /* Add more segments if buffer from local cache is too small */

>> +       if (odp_unlikely(buf_hdr->size < totsize)) {

>> +               needed = totsize - buf_hdr->size;

>> +               do {

>> +                       blk = get_blk(&pool->s);

>> +                       if (odp_unlikely(blk == NULL)) {

>> +                               ret_buf(&pool->s, buf_hdr);

>> +                               buf_hdr = NULL;

>> +                               local.cache[pool_id]->s.buffrees--;

>> +                               return ODP_BUFFER_INVALID;

>> +                       }

>> +                       buf_hdr->addr[buf_hdr->segcount++] = blk;

>> +                       needed -= pool->s.seg_size;

>> +               } while (needed > 0);

>> +               buf_hdr->size = buf_hdr->segcount * pool->s.seg_size;

>>          }

>>

>> -       return count;

>> +       return odp_hdr_to_buf(buf_hdr);

>>   }

>>

>>   odp_buffer_t odp_buffer_alloc(odp_pool_t pool_hdl)

>> @@ -701,35 +935,132 @@ int odp_buffer_alloc_multi(odp_pool_t pool_hdl,

>> odp_buffer_t buf[], int num)

>>          return buffer_alloc_multi(pool_hdl, buf_size, buf, num);

>>   }

>>

>> -void odp_buffer_free(odp_buffer_t buf)

>> +static void multi_pool_free(odp_buffer_hdr_t *buf_hdr[], int num_buf)

>>   {

>> -       odp_buffer_hdr_t *buf_hdr = odp_buf_to_hdr(buf);

>> -       pool_entry_t *pool = odp_buf_to_pool(buf_hdr);

>> +       uint32_t pool_id, num;

>> +       local_cache_t *buf_cache;

>> +       pool_entry_t *pool;

>> +       int i, j, idx;

>>

>> +       for (i = 0; i < num_buf; i++) {

>> +               pool_id   =  pool_handle_to_index(buf_hdr[i]->pool_hdl);

>> +               buf_cache = local.cache[pool_id];

>> +               num       = buf_cache->s.num_buf;

>> +

>> +               if (num < POOL_MAX_LOCAL_BUFS) {

>> +                       ret_local_buf(buf_cache, num, buf_hdr[i]);

>> +                       continue;

>> +               }

>> +

>> +               idx  = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;

>> +               pool = get_pool_entry(pool_id);

>> +

>> +               /* local cache full, return a chunk */

>> +               for (j = 0; j < POOL_CHUNK_SIZE; j++) {

>> +                       odp_buffer_hdr_t *tmp;

>> +

>> +                       tmp = buf_cache->s.buf[idx + i];

>> +                       ret_buf(&pool->s, tmp);

>> +               }

>> +

>> +               num = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;

>> +               buf_cache->s.num_buf = num;

>> +               ret_local_buf(buf_cache, num, buf_hdr[i]);

>> +       }

>> +}

>> +

>> +void buffer_free_multi(uint32_t pool_id,

>> +                      const odp_buffer_t buf[], int num_free)

>> +{

>> +       local_cache_t *buf_cache = local.cache[pool_id];

>> +       uint32_t num;

>> +       int i, idx;

>> +       pool_entry_t *pool;

>> +       odp_buffer_hdr_t *buf_hdr[num_free];

>> +       int multi_pool = 0;

>> +

>> +       for (i = 0; i < num_free; i++) {

>> +               uint32_t id;

>> +

>> +               buf_hdr[i] = odp_buf_to_hdr(buf[i]);

>> +               ODP_ASSERT(buf_hdr[i]->allocator != ODP_FREEBUF);

>> +               buf_hdr[i]->allocator = ODP_FREEBUF;

>> +               id = pool_handle_to_index(buf_hdr[i]->pool_hdl);

>> +               multi_pool |= (pool_id != id);

>> +       }

>> +

>> +       if (odp_unlikely(multi_pool)) {

>> +               multi_pool_free(buf_hdr, num_free);

>> +               return;

>> +       }

>> +

>> +       num = buf_cache->s.num_buf;

>> +

>> +       if (odp_likely((num + num_free) < POOL_MAX_LOCAL_BUFS)) {

>> +               ret_local_bufs(buf_cache, num, buf_hdr, num_free);

>> +               return;

>> +       }

>> +

>> +       pool = get_pool_entry(pool_id);

>> +

>> +       /* Return at least one chunk into the global pool */

>> +       if (odp_unlikely(num_free > POOL_CHUNK_SIZE)) {

>> +               for (i = 0; i < num_free; i++)

>> +                       ret_buf(&pool->s, buf_hdr[i]);

>> +

>> +               return;

>> +       }

>> +

>> +       idx = num - POOL_CHUNK_SIZE;

>> +       for (i = 0; i < POOL_CHUNK_SIZE; i++)

>> +               ret_buf(&pool->s, buf_cache->s.buf[idx + i]);

>> +

>> +       num -= POOL_CHUNK_SIZE;

>> +       buf_cache->s.num_buf = num;

>> +       ret_local_bufs(buf_cache, num, buf_hdr, num_free);

>> +}

>> +

>> +void buffer_free(uint32_t pool_id, const odp_buffer_t buf)

>> +{

>> +       local_cache_t *buf_cache = local.cache[pool_id];

>> +       uint32_t num;

>> +       int i;

>> +       pool_entry_t *pool;

>> +       odp_buffer_hdr_t *buf_hdr;

>> +

>> +       buf_hdr = odp_buf_to_hdr(buf);

>>          ODP_ASSERT(buf_hdr->allocator != ODP_FREEBUF);

>> +       buf_hdr->allocator = ODP_FREEBUF;

>>

>> -       if (odp_unlikely(pool->s.buf_low_wm_assert ||

>> pool->s.blk_low_wm_assert))

>> -               ret_buf(&pool->s, buf_hdr);

>> -       else

>> -               ret_local_buf(&pool->s.local_cache[local_id], buf_hdr);

>> +       num = buf_cache->s.num_buf;

>> +

>> +       if (odp_likely((num + 1) < POOL_MAX_LOCAL_BUFS)) {

>> +               ret_local_bufs(buf_cache, num, &buf_hdr, 1);

>> +               return;

>> +       }

>> +

>> +       pool = get_pool_entry(pool_id);

>> +

>> +       num -= POOL_CHUNK_SIZE;

>> +       for (i = 0; i < POOL_CHUNK_SIZE; i++)

>> +               ret_buf(&pool->s, buf_cache->s.buf[num + i]);

>> +

>> +       buf_cache->s.num_buf = num;

>> +       ret_local_bufs(buf_cache, num, &buf_hdr, 1);

>> +}

>> +

>> +void odp_buffer_free(odp_buffer_t buf)

>> +{

>> +       uint32_t pool_id = pool_id_from_buf(buf);

>> +

>> +       buffer_free(pool_id, buf);

>>   }

>>

>>   void odp_buffer_free_multi(const odp_buffer_t buf[], int num)

>>   {

>> -       int i;

>> +       uint32_t pool_id = pool_id_from_buf(buf[0]);

>>

>> -       for (i = 0; i < num; ++i)

>> -               odp_buffer_free(buf[i]);

>> -}

>> -

>> -void _odp_flush_caches(void)

>> -{

>> -       int i;

>> -

>> -       for (i = 0; i < ODP_CONFIG_POOLS; i++) {

>> -               pool_entry_t *pool = get_pool_entry(i);

>> -               flush_cache(&pool->s.local_cache[local_id], &pool->s);

>> -       }

>> +       buffer_free_multi(pool_id, buf, num);

>>   }

>>

>>   void odp_pool_print(odp_pool_t pool_hdl)

>> @@ -774,7 +1105,6 @@ void odp_pool_print(odp_pool_t pool_hdl)

>>                  pool->s.quiesced ? "quiesced" : "active");

>>          ODP_DBG(" pool opts       %s, %s, %s\n",

>>                  pool->s.flags.unsegmented ? "unsegmented" : "segmented",

>> -               pool->s.flags.zeroized ? "zeroized" : "non-zeroized",

>>                  pool->s.flags.predefined  ? "predefined" : "created");

>>          ODP_DBG(" pool base       %p\n",  pool->s.pool_base_addr);

>>          ODP_DBG(" pool size       %zu (%zu pages)\n",

>> @@ -817,10 +1147,11 @@ void odp_pool_print(odp_pool_t pool_hdl)

>>          ODP_DBG(" blk low wm count    %lu\n", blklowmct);

>>   }

>>

>> -

>>   odp_pool_t odp_buffer_pool(odp_buffer_t buf)

>>   {

>> -       return odp_buf_to_hdr(buf)->pool_hdl;

>> +       uint32_t pool_id = pool_id_from_buf(buf);

>> +

>> +       return pool_index_to_handle(pool_id);

>>   }

>>

>>   void odp_pool_param_init(odp_pool_param_t *params)

>> --

>> 2.7.4

>>

>>
diff mbox

Patch

diff --git a/platform/linux-generic/include/odp_buffer_inlines.h b/platform/linux-generic/include/odp_buffer_inlines.h
index 3f4d9fd..2b1ab42 100644
--- a/platform/linux-generic/include/odp_buffer_inlines.h
+++ b/platform/linux-generic/include/odp_buffer_inlines.h
@@ -56,30 +56,12 @@  static inline odp_buffer_hdr_t *odp_buf_to_hdr(odp_buffer_t buf)
 		(pool->pool_mdata_addr + (index * ODP_CACHE_LINE_SIZE));
 }
 
-static inline uint32_t odp_buffer_refcount(odp_buffer_hdr_t *buf)
+static inline uint32_t pool_id_from_buf(odp_buffer_t buf)
 {
-	return odp_atomic_load_u32(&buf->ref_count);
-}
+	odp_buffer_bits_t handle;
 
-static inline uint32_t odp_buffer_incr_refcount(odp_buffer_hdr_t *buf,
-						uint32_t val)
-{
-	return odp_atomic_fetch_add_u32(&buf->ref_count, val) + val;
-}
-
-static inline uint32_t odp_buffer_decr_refcount(odp_buffer_hdr_t *buf,
-						uint32_t val)
-{
-	uint32_t tmp;
-
-	tmp = odp_atomic_fetch_sub_u32(&buf->ref_count, val);
-
-	if (tmp < val) {
-		odp_atomic_fetch_add_u32(&buf->ref_count, val - tmp);
-		return 0;
-	} else {
-		return tmp - val;
-	}
+	handle.handle = buf;
+	return handle.pool_id;
 }
 
 static inline odp_buffer_hdr_t *validate_buf(odp_buffer_t buf)
diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
index f21364c..7b0ef8b 100644
--- a/platform/linux-generic/include/odp_buffer_internal.h
+++ b/platform/linux-generic/include/odp_buffer_internal.h
@@ -114,7 +114,6 @@  struct odp_buffer_hdr_t {
 	union {
 		uint32_t all;
 		struct {
-			uint32_t zeroized:1; /* Zeroize buf data on free */
 			uint32_t hdrdata:1;  /* Data is in buffer hdr */
 			uint32_t sustain:1;  /* Sustain order */
 		};
@@ -123,7 +122,6 @@  struct odp_buffer_hdr_t {
 	int8_t                   type;       /* buffer type */
 	odp_event_type_t         event_type; /* for reuse as event */
 	uint32_t                 size;       /* max data size */
-	odp_atomic_u32_t         ref_count;  /* reference count */
 	odp_pool_t               pool_hdl;   /* buffer pool handle */
 	union {
 		uint64_t         buf_u64;    /* user u64 */
@@ -174,6 +172,9 @@  typedef struct {
 odp_buffer_t buffer_alloc(odp_pool_t pool, size_t size);
 int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,
 		       odp_buffer_t buf[], int num);
+void buffer_free(uint32_t pool_id, const odp_buffer_t buf);
+void buffer_free_multi(uint32_t pool_id,
+		       const odp_buffer_t buf[], int num_free);
 int seg_alloc_head(odp_buffer_hdr_t *buf_hdr, int segcount);
 void seg_free_head(odp_buffer_hdr_t *buf_hdr, int segcount);
 int seg_alloc_tail(odp_buffer_hdr_t *buf_hdr, int segcount);
diff --git a/platform/linux-generic/include/odp_internal.h b/platform/linux-generic/include/odp_internal.h
index d12f850..8bad450 100644
--- a/platform/linux-generic/include/odp_internal.h
+++ b/platform/linux-generic/include/odp_internal.h
@@ -119,8 +119,6 @@  int odp_tm_term_global(void);
 int _odp_int_name_tbl_init_global(void);
 int _odp_int_name_tbl_term_global(void);
 
-void _odp_flush_caches(void);
-
 int cpuinfo_parser(FILE *file, system_info_t *sysinfo);
 uint64_t odp_cpu_hz_current(int id);
 
diff --git a/platform/linux-generic/include/odp_pool_internal.h b/platform/linux-generic/include/odp_pool_internal.h
index 3317bd0..d6717ff 100644
--- a/platform/linux-generic/include/odp_pool_internal.h
+++ b/platform/linux-generic/include/odp_pool_internal.h
@@ -51,15 +51,25 @@  typedef struct _odp_buffer_pool_init_t {
 	void *buf_init_arg;        /**< Argument to be passed to buf_init() */
 } _odp_buffer_pool_init_t;         /**< Type of buffer initialization struct */
 
+#define POOL_MAX_LOCAL_CHUNKS 4
+#define POOL_CHUNK_SIZE       32
+#define POOL_MAX_LOCAL_BUFS   (POOL_MAX_LOCAL_CHUNKS * POOL_CHUNK_SIZE)
+
+struct local_cache_s {
+	uint64_t bufallocs;  /* Local buffer alloc count */
+	uint64_t buffrees;   /* Local buffer free count */
+
+	uint32_t num_buf;
+	odp_buffer_hdr_t *buf[POOL_MAX_LOCAL_BUFS];
+};
+
 /* Local cache for buffer alloc/free acceleration */
 typedef struct local_cache_t {
 	union {
-		struct {
-			odp_buffer_hdr_t *buf_freelist;  /* The local cache */
-			uint64_t bufallocs;  /* Local buffer alloc count */
-			uint64_t buffrees;   /* Local buffer free count */
-		};
-		uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(sizeof(uint64_t))];
+		struct local_cache_s s;
+
+		uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(
+			    sizeof(struct local_cache_s))];
 	};
 } local_cache_t;
 
@@ -214,127 +224,6 @@  static inline void ret_blk(struct pool_entry_s *pool, void *block)
 	odp_atomic_inc_u64(&pool->poolstats.blkfrees);
 }
 
-static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)
-{
-	odp_buffer_hdr_t *myhead;
-	POOL_LOCK(&pool->buf_lock);
-
-	myhead = pool->buf_freelist;
-
-	if (odp_unlikely(myhead == NULL)) {
-		POOL_UNLOCK(&pool->buf_lock);
-		odp_atomic_inc_u64(&pool->poolstats.bufempty);
-	} else {
-		pool->buf_freelist = myhead->next;
-		POOL_UNLOCK(&pool->buf_lock);
-		uint64_t bufcount =
-			odp_atomic_fetch_sub_u32(&pool->bufcount, 1) - 1;
-
-		/* Check for low watermark condition */
-		if (bufcount == pool->buf_low_wm && !pool->buf_low_wm_assert) {
-			pool->buf_low_wm_assert = 1;
-			odp_atomic_inc_u64(&pool->poolstats.buf_low_wm_count);
-		}
-
-		odp_atomic_inc_u64(&pool->poolstats.bufallocs);
-	}
-
-	return (void *)myhead;
-}
-
-static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t *buf)
-{
-	if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {
-		while (buf->segcount > 0) {
-			if (buffer_is_secure(buf) || pool_is_secure(pool))
-				memset(buf->addr[buf->segcount - 1],
-				       0, buf->segsize);
-			ret_blk(pool, buf->addr[--buf->segcount]);
-		}
-		buf->size = 0;
-	}
-
-	buf->allocator = ODP_FREEBUF;  /* Mark buffer free */
-	POOL_LOCK(&pool->buf_lock);
-	buf->next = pool->buf_freelist;
-	pool->buf_freelist = buf;
-	POOL_UNLOCK(&pool->buf_lock);
-
-	uint64_t bufcount = odp_atomic_fetch_add_u32(&pool->bufcount, 1) + 1;
-
-	/* Check if low watermark condition should be deasserted */
-	if (bufcount == pool->buf_high_wm && pool->buf_low_wm_assert) {
-		pool->buf_low_wm_assert = 0;
-		odp_atomic_inc_u64(&pool->poolstats.buf_high_wm_count);
-	}
-
-	odp_atomic_inc_u64(&pool->poolstats.buffrees);
-}
-
-static inline void *get_local_buf(local_cache_t *buf_cache,
-				  struct pool_entry_s *pool,
-				  size_t totsize)
-{
-	odp_buffer_hdr_t *buf = buf_cache->buf_freelist;
-
-	if (odp_likely(buf != NULL)) {
-		buf_cache->buf_freelist = buf->next;
-
-		if (odp_unlikely(buf->size < totsize)) {
-			intmax_t needed = totsize - buf->size;
-
-			do {
-				void *blk = get_blk(pool);
-				if (odp_unlikely(blk == NULL)) {
-					ret_buf(pool, buf);
-					buf_cache->buffrees--;
-					return NULL;
-				}
-				buf->addr[buf->segcount++] = blk;
-				needed -= pool->seg_size;
-			} while (needed > 0);
-
-			buf->size = buf->segcount * pool->seg_size;
-		}
-
-		buf_cache->bufallocs++;
-	}
-
-	return buf;
-}
-
-static inline void ret_local_buf(local_cache_t *buf_cache,
-				odp_buffer_hdr_t *buf)
-{
-	buf->allocator = ODP_FREEBUF;
-	buf->next = buf_cache->buf_freelist;
-	buf_cache->buf_freelist = buf;
-
-	buf_cache->buffrees++;
-}
-
-static inline void flush_cache(local_cache_t *buf_cache,
-			       struct pool_entry_s *pool)
-{
-	odp_buffer_hdr_t *buf = buf_cache->buf_freelist;
-	uint32_t flush_count = 0;
-
-	while (buf != NULL) {
-		odp_buffer_hdr_t *next = buf->next;
-		ret_buf(pool, buf);
-		buf = next;
-		flush_count++;
-	}
-
-	odp_atomic_add_u64(&pool->poolstats.bufallocs, buf_cache->bufallocs);
-	odp_atomic_add_u64(&pool->poolstats.buffrees,
-			   buf_cache->buffrees - flush_count);
-
-	buf_cache->buf_freelist = NULL;
-	buf_cache->bufallocs = 0;
-	buf_cache->buffrees = 0;
-}
-
 static inline odp_pool_t pool_index_to_handle(uint32_t pool_id)
 {
 	return _odp_cast_scalar(odp_pool_t, pool_id);
diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
index e7e4d58..ce2fdba 100644
--- a/platform/linux-generic/odp_buffer.c
+++ b/platform/linux-generic/odp_buffer.c
@@ -67,9 +67,6 @@  int odp_buffer_snprint(char *str, uint32_t n, odp_buffer_t buf)
 	len += snprintf(&str[len], n-len,
 			"  size         %" PRIu32 "\n",        hdr->size);
 	len += snprintf(&str[len], n-len,
-			"  ref_count    %" PRIu32 "\n",
-			odp_atomic_load_u32(&hdr->ref_count));
-	len += snprintf(&str[len], n-len,
 			"  type         %i\n",        hdr->type);
 
 	return len;
diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c
index 0e319d2..474fa81 100644
--- a/platform/linux-generic/odp_packet.c
+++ b/platform/linux-generic/odp_packet.c
@@ -972,10 +972,7 @@  int _odp_packet_copy_md_to_packet(odp_packet_t srcpkt, odp_packet_t dstpkt)
 		       srchdr->buf_hdr.uarea_size ?
 		       dsthdr->buf_hdr.uarea_size :
 		       srchdr->buf_hdr.uarea_size);
-	odp_atomic_store_u32(
-		&dsthdr->buf_hdr.ref_count,
-		odp_atomic_load_u32(
-			&srchdr->buf_hdr.ref_count));
+
 	copy_packet_parser_metadata(srchdr, dsthdr);
 
 	/* Metadata copied, but return indication of whether the packet
diff --git a/platform/linux-generic/odp_pool.c b/platform/linux-generic/odp_pool.c
index 419f03f..0a427ed 100644
--- a/platform/linux-generic/odp_pool.c
+++ b/platform/linux-generic/odp_pool.c
@@ -57,8 +57,15 @@  static const char SHM_DEFAULT_NAME[] = "odp_buffer_pools";
 /* Pool entry pointers (for inlining) */
 void *pool_entry_ptr[ODP_CONFIG_POOLS];
 
-/* Cache thread id locally for local cache performance */
-static __thread int local_id;
+/* Thread local variables */
+typedef struct pool_local_t {
+	local_cache_t *cache[ODP_CONFIG_POOLS];
+	int thr_id;
+} pool_local_t;
+
+static __thread pool_local_t local;
+
+static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s *pool);
 
 int odp_pool_init_global(void)
 {
@@ -111,7 +118,19 @@  int odp_pool_init_global(void)
 
 int odp_pool_init_local(void)
 {
-	local_id = odp_thread_id();
+	pool_entry_t *pool;
+	int i;
+	int thr_id = odp_thread_id();
+
+	memset(&local, 0, sizeof(pool_local_t));
+
+	for (i = 0; i < ODP_CONFIG_POOLS; i++) {
+		pool           = get_pool_entry(i);
+		local.cache[i] = &pool->s.local_cache[thr_id];
+		local.cache[i]->s.num_buf = 0;
+	}
+
+	local.thr_id = thr_id;
 	return 0;
 }
 
@@ -144,7 +163,14 @@  int odp_pool_term_global(void)
 
 int odp_pool_term_local(void)
 {
-	_odp_flush_caches();
+	int i;
+
+	for (i = 0; i < ODP_CONFIG_POOLS; i++) {
+		pool_entry_t *pool = get_pool_entry(i);
+
+		flush_cache(local.cache[i], &pool->s);
+	}
+
 	return 0;
 }
 
@@ -179,10 +205,53 @@  int odp_pool_capability(odp_pool_capability_t *capa)
 	return 0;
 }
 
-/**
+static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)
+{
+	odp_buffer_hdr_t *myhead;
+
+	POOL_LOCK(&pool->buf_lock);
+
+	myhead = pool->buf_freelist;
+
+	if (odp_unlikely(myhead == NULL)) {
+		POOL_UNLOCK(&pool->buf_lock);
+		odp_atomic_inc_u64(&pool->poolstats.bufempty);
+	} else {
+		pool->buf_freelist = myhead->next;
+		POOL_UNLOCK(&pool->buf_lock);
+
+		odp_atomic_fetch_sub_u32(&pool->bufcount, 1);
+		odp_atomic_inc_u64(&pool->poolstats.bufallocs);
+	}
+
+	return (void *)myhead;
+}
+
+static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t *buf)
+{
+	if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {
+		while (buf->segcount > 0) {
+			if (buffer_is_secure(buf) || pool_is_secure(pool))
+				memset(buf->addr[buf->segcount - 1],
+				       0, buf->segsize);
+			ret_blk(pool, buf->addr[--buf->segcount]);
+		}
+		buf->size = 0;
+	}
+
+	buf->allocator = ODP_FREEBUF;  /* Mark buffer free */
+	POOL_LOCK(&pool->buf_lock);
+	buf->next = pool->buf_freelist;
+	pool->buf_freelist = buf;
+	POOL_UNLOCK(&pool->buf_lock);
+
+	odp_atomic_fetch_add_u32(&pool->bufcount, 1);
+	odp_atomic_inc_u64(&pool->poolstats.buffrees);
+}
+
+/*
  * Pool creation
  */
-
 odp_pool_t _pool_create(const char *name,
 			odp_pool_param_t *params,
 			uint32_t shmflags)
@@ -208,9 +277,6 @@  odp_pool_t _pool_create(const char *name,
 	/* Restriction for v1.0: All non-packet buffers are unsegmented */
 	int unseg = 1;
 
-	/* Restriction for v1.0: No zeroization support */
-	const int zeroized = 0;
-
 	uint32_t blk_size, buf_stride, buf_num, blk_num, seg_len = 0;
 	uint32_t buf_align =
 		params->type == ODP_POOL_BUFFER ? params->buf.align : 0;
@@ -350,7 +416,6 @@  odp_pool_t _pool_create(const char *name,
 		POOL_UNLOCK(&pool->s.lock);
 
 		pool->s.flags.unsegmented = unseg;
-		pool->s.flags.zeroized = zeroized;
 		pool->s.seg_size = unseg ? blk_size : seg_len;
 		pool->s.blk_size = blk_size;
 
@@ -383,9 +448,7 @@  odp_pool_t _pool_create(const char *name,
 			/* Iniitalize buffer metadata */
 			tmp->allocator = ODP_FREEBUF;
 			tmp->flags.all = 0;
-			tmp->flags.zeroized = zeroized;
 			tmp->size = 0;
-			odp_atomic_init_u32(&tmp->ref_count, 0);
 			tmp->type = params->type;
 			tmp->event_type = params->type;
 			tmp->pool_hdl = pool->s.pool_hdl;
@@ -503,6 +566,41 @@  int odp_pool_info(odp_pool_t pool_hdl, odp_pool_info_t *info)
 	return 0;
 }
 
+static inline void get_local_cache_bufs(local_cache_t *buf_cache, uint32_t idx,
+					odp_buffer_hdr_t *buf_hdr[],
+					uint32_t num)
+{
+	uint32_t i;
+
+	for (i = 0; i < num; i++) {
+		buf_hdr[i] = buf_cache->s.buf[idx + i];
+		odp_prefetch(buf_hdr[i]);
+		odp_prefetch_store(buf_hdr[i]);
+	}
+}
+
+static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s *pool)
+{
+	uint32_t flush_count = 0;
+	uint32_t num;
+
+	while ((num = buf_cache->s.num_buf)) {
+		odp_buffer_hdr_t *buf;
+
+		buf = buf_cache->s.buf[num - 1];
+		ret_buf(pool, buf);
+		flush_count++;
+		buf_cache->s.num_buf--;
+	}
+
+	odp_atomic_add_u64(&pool->poolstats.bufallocs, buf_cache->s.bufallocs);
+	odp_atomic_add_u64(&pool->poolstats.buffrees,
+			   buf_cache->s.buffrees - flush_count);
+
+	buf_cache->s.bufallocs = 0;
+	buf_cache->s.buffrees = 0;
+}
+
 int odp_pool_destroy(odp_pool_t pool_hdl)
 {
 	uint32_t pool_id = pool_handle_to_index(pool_hdl);
@@ -621,71 +719,207 @@  void seg_free_tail(odp_buffer_hdr_t *buf_hdr, int segcount)
 	buf_hdr->size      = buf_hdr->segcount * pool->s.seg_size;
 }
 
-odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)
+static inline int get_local_bufs(local_cache_t *buf_cache,
+				 odp_buffer_hdr_t *buf_hdr[], uint32_t max_num)
+{
+	uint32_t num_buf = buf_cache->s.num_buf;
+	uint32_t num = num_buf;
+
+	if (odp_unlikely(num_buf == 0))
+		return 0;
+
+	if (odp_likely(max_num < num))
+		num = max_num;
+
+	get_local_cache_bufs(buf_cache, num_buf - num, buf_hdr, num);
+	buf_cache->s.num_buf   -= num;
+	buf_cache->s.bufallocs += num;
+
+	return num;
+}
+
+static inline void ret_local_buf(local_cache_t *buf_cache, uint32_t idx,
+				 odp_buffer_hdr_t *buf)
+{
+	buf_cache->s.buf[idx] = buf;
+	buf_cache->s.num_buf++;
+	buf_cache->s.buffrees++;
+}
+
+static inline void ret_local_bufs(local_cache_t *buf_cache, uint32_t idx,
+				  odp_buffer_hdr_t *buf[], int num_buf)
+{
+	int i;
+
+	for (i = 0; i < num_buf; i++)
+		buf_cache->s.buf[idx + i] = buf[i];
+
+	buf_cache->s.num_buf  += num_buf;
+	buf_cache->s.buffrees += num_buf;
+}
+
+int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,
+		       odp_buffer_t buf[], int max_num)
 {
 	uint32_t pool_id = pool_handle_to_index(pool_hdl);
 	pool_entry_t *pool = get_pool_entry(pool_id);
 	uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;
-	odp_anybuf_t *buf;
+	odp_buffer_hdr_t *buf_tbl[max_num];
+	odp_buffer_hdr_t *buf_hdr;
+	int num, i;
+	intmax_t needed;
+	void *blk;
 
 	/* Reject oversized allocation requests */
 	if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||
 	    (!pool->s.flags.unsegmented &&
 	     totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))
-		return ODP_BUFFER_INVALID;
+		return 0;
 
 	/* Try to satisfy request from the local cache */
-	buf = (odp_anybuf_t *)
-		(void *)get_local_buf(&pool->s.local_cache[local_id],
-				      &pool->s, totsize);
+	num = get_local_bufs(local.cache[pool_id], buf_tbl, max_num);
 
 	/* If cache is empty, satisfy request from the pool */
-	if (odp_unlikely(buf == NULL)) {
-		buf = (odp_anybuf_t *)(void *)get_buf(&pool->s);
+	if (odp_unlikely(num < max_num)) {
+		for (; num < max_num; num++) {
+			buf_hdr = get_buf(&pool->s);
 
-		if (odp_unlikely(buf == NULL))
+			if (odp_unlikely(buf_hdr == NULL))
+				goto pool_empty;
+
+			/* Get blocks for this buffer, if pool uses
+			 * application data */
+			if (buf_hdr->size < totsize) {
+				uint32_t segcount;
+
+				needed = totsize - buf_hdr->size;
+				do {
+					blk = get_blk(&pool->s);
+					if (odp_unlikely(blk == NULL)) {
+						ret_buf(&pool->s, buf_hdr);
+						goto pool_empty;
+					}
+
+					segcount = buf_hdr->segcount++;
+					buf_hdr->addr[segcount] = blk;
+					needed -= pool->s.seg_size;
+				} while (needed > 0);
+				buf_hdr->size = buf_hdr->segcount *
+						pool->s.seg_size;
+			}
+
+			buf_tbl[num] = buf_hdr;
+		}
+	}
+
+pool_empty:
+	for (i = 0; i < num; i++) {
+		buf_hdr = buf_tbl[i];
+
+		/* Mark buffer as allocated */
+		buf_hdr->allocator = local.thr_id;
+
+		/* By default, buffers are not associated with
+		 * an ordered queue */
+		buf_hdr->origin_qe = NULL;
+
+		buf[i] = odp_hdr_to_buf(buf_hdr);
+
+		/* Add more segments if buffer from local cache is too small */
+		if (odp_unlikely(buf_hdr->size < totsize)) {
+			needed = totsize - buf_hdr->size;
+			do {
+				blk = get_blk(&pool->s);
+				if (odp_unlikely(blk == NULL)) {
+					int j;
+
+					ret_buf(&pool->s, buf_hdr);
+					buf_hdr = NULL;
+					local.cache[pool_id]->s.buffrees--;
+
+					/* move remaining bufs up one step
+					 * and update loop counters */
+					num--;
+					for (j = i; j < num; j++)
+						buf_tbl[j] = buf_tbl[j + 1];
+
+					i--;
+					break;
+				}
+				needed -= pool->s.seg_size;
+				buf_hdr->addr[buf_hdr->segcount++] = blk;
+				buf_hdr->size = buf_hdr->segcount *
+						pool->s.seg_size;
+			} while (needed > 0);
+		}
+	}
+
+	return num;
+}
+
+odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)
+{
+	uint32_t pool_id = pool_handle_to_index(pool_hdl);
+	pool_entry_t *pool = get_pool_entry(pool_id);
+	uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;
+	odp_buffer_hdr_t *buf_hdr;
+	intmax_t needed;
+	void *blk;
+
+	/* Reject oversized allocation requests */
+	if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||
+	    (!pool->s.flags.unsegmented &&
+	     totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))
+		return 0;
+
+	/* Try to satisfy request from the local cache. If cache is empty,
+	 * satisfy request from the pool */
+	if (odp_unlikely(!get_local_bufs(local.cache[pool_id], &buf_hdr, 1))) {
+		buf_hdr = get_buf(&pool->s);
+
+		if (odp_unlikely(buf_hdr == NULL))
 			return ODP_BUFFER_INVALID;
 
 		/* Get blocks for this buffer, if pool uses application data */
-		if (buf->buf.size < totsize) {
-			intmax_t needed = totsize - buf->buf.size;
+		if (buf_hdr->size < totsize) {
+			needed = totsize - buf_hdr->size;
 			do {
-				uint8_t *blk = get_blk(&pool->s);
-				if (blk == NULL) {
-					ret_buf(&pool->s, &buf->buf);
+				blk = get_blk(&pool->s);
+				if (odp_unlikely(blk == NULL)) {
+					ret_buf(&pool->s, buf_hdr);
 					return ODP_BUFFER_INVALID;
 				}
-				buf->buf.addr[buf->buf.segcount++] = blk;
+				buf_hdr->addr[buf_hdr->segcount++] = blk;
 				needed -= pool->s.seg_size;
 			} while (needed > 0);
-			buf->buf.size = buf->buf.segcount * pool->s.seg_size;
+			buf_hdr->size = buf_hdr->segcount * pool->s.seg_size;
 		}
 	}
-
 	/* Mark buffer as allocated */
-	buf->buf.allocator = local_id;
+	buf_hdr->allocator = local.thr_id;
 
-	/* By default, buffers inherit their pool's zeroization setting */
-	buf->buf.flags.zeroized = pool->s.flags.zeroized;
+	/* By default, buffers are not associated with
+	 * an ordered queue */
+	buf_hdr->origin_qe = NULL;
 
-	/* By default, buffers are not associated with an ordered queue */
-	buf->buf.origin_qe = NULL;
-
-	return odp_hdr_to_buf(&buf->buf);
-}
-
-int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,
-		       odp_buffer_t buf[], int num)
-{
-	int count;
-
-	for (count = 0; count < num; ++count) {
-		buf[count] = buffer_alloc(pool_hdl, size);
-		if (buf[count] == ODP_BUFFER_INVALID)
-			break;
+	/* Add more segments if buffer from local cache is too small */
+	if (odp_unlikely(buf_hdr->size < totsize)) {
+		needed = totsize - buf_hdr->size;
+		do {
+			blk = get_blk(&pool->s);
+			if (odp_unlikely(blk == NULL)) {
+				ret_buf(&pool->s, buf_hdr);
+				buf_hdr = NULL;
+				local.cache[pool_id]->s.buffrees--;
+				return ODP_BUFFER_INVALID;
+			}
+			buf_hdr->addr[buf_hdr->segcount++] = blk;
+			needed -= pool->s.seg_size;
+		} while (needed > 0);
+		buf_hdr->size = buf_hdr->segcount * pool->s.seg_size;
 	}
 
-	return count;
+	return odp_hdr_to_buf(buf_hdr);
 }
 
 odp_buffer_t odp_buffer_alloc(odp_pool_t pool_hdl)
@@ -701,35 +935,132 @@  int odp_buffer_alloc_multi(odp_pool_t pool_hdl, odp_buffer_t buf[], int num)
 	return buffer_alloc_multi(pool_hdl, buf_size, buf, num);
 }
 
-void odp_buffer_free(odp_buffer_t buf)
+static void multi_pool_free(odp_buffer_hdr_t *buf_hdr[], int num_buf)
 {
-	odp_buffer_hdr_t *buf_hdr = odp_buf_to_hdr(buf);
-	pool_entry_t *pool = odp_buf_to_pool(buf_hdr);
+	uint32_t pool_id, num;
+	local_cache_t *buf_cache;
+	pool_entry_t *pool;
+	int i, j, idx;
 
+	for (i = 0; i < num_buf; i++) {
+		pool_id   =  pool_handle_to_index(buf_hdr[i]->pool_hdl);
+		buf_cache = local.cache[pool_id];
+		num       = buf_cache->s.num_buf;
+
+		if (num < POOL_MAX_LOCAL_BUFS) {
+			ret_local_buf(buf_cache, num, buf_hdr[i]);
+			continue;
+		}
+
+		idx  = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;
+		pool = get_pool_entry(pool_id);
+
+		/* Local cache full: evict one chunk of cached buffers back
+		 * to the global pool to make room, then cache this buffer. */
+		for (j = 0; j < POOL_CHUNK_SIZE; j++) {
+			odp_buffer_hdr_t *tmp;
+
+			tmp = buf_cache->s.buf[idx + j]; /* fix: was idx + i */
+			ret_buf(&pool->s, tmp);
+		}
+
+		num = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;
+		buf_cache->s.num_buf = num;
+		ret_local_buf(buf_cache, num, buf_hdr[i]);
+	}
+}
+
+void buffer_free_multi(uint32_t pool_id,
+		       const odp_buffer_t buf[], int num_free)
+{
+	local_cache_t *buf_cache = local.cache[pool_id];
+	uint32_t num;
+	int i, idx;
+	pool_entry_t *pool;
+	odp_buffer_hdr_t *buf_hdr[num_free];
+	int multi_pool = 0;
+
+	for (i = 0; i < num_free; i++) {
+		uint32_t id;
+
+		buf_hdr[i] = odp_buf_to_hdr(buf[i]);
+		ODP_ASSERT(buf_hdr[i]->allocator != ODP_FREEBUF);
+		buf_hdr[i]->allocator = ODP_FREEBUF; /* mark freed; assert above catches double free */
+		id = pool_handle_to_index(buf_hdr[i]->pool_hdl);
+		multi_pool |= (pool_id != id); /* set if any buffer belongs to another pool */
+	}
+
+	if (odp_unlikely(multi_pool)) {
+		multi_pool_free(buf_hdr, num_free);
+		return;
+	}
+
+	num = buf_cache->s.num_buf;
+
+	if (odp_likely((num + num_free) < POOL_MAX_LOCAL_BUFS)) { /* all fit in local cache */
+		ret_local_bufs(buf_cache, num, buf_hdr, num_free);
+		return;
+	}
+
+	pool = get_pool_entry(pool_id);
+
+	/* More than a chunk being freed: bypass the cache and return all to the global pool */
+	if (odp_unlikely(num_free > POOL_CHUNK_SIZE)) {
+		for (i = 0; i < num_free; i++)
+			ret_buf(&pool->s, buf_hdr[i]);
+
+		return;
+	}
+
+	idx = num - POOL_CHUNK_SIZE; /* evict one chunk of cached buffers to make room */
+	for (i = 0; i < POOL_CHUNK_SIZE; i++)
+		ret_buf(&pool->s, buf_cache->s.buf[idx + i]);
+
+	num -= POOL_CHUNK_SIZE;
+	buf_cache->s.num_buf = num;
+	ret_local_bufs(buf_cache, num, buf_hdr, num_free);
+}
+
+void buffer_free(uint32_t pool_id, const odp_buffer_t buf)
+{
+	local_cache_t *buf_cache = local.cache[pool_id];
+	uint32_t num;
+	int i;
+	pool_entry_t *pool;
+	odp_buffer_hdr_t *buf_hdr;
+
+	buf_hdr = odp_buf_to_hdr(buf);
 	ODP_ASSERT(buf_hdr->allocator != ODP_FREEBUF);
+	buf_hdr->allocator = ODP_FREEBUF; /* mark freed; assert above catches double free */
 
+	num = buf_cache->s.num_buf;
+
+	if (odp_likely((num + 1) < POOL_MAX_LOCAL_BUFS)) { /* room in local cache: fast path */
+		ret_local_bufs(buf_cache, num, &buf_hdr, 1);
+		return;
+	}
+
+	pool = get_pool_entry(pool_id);
+
+	num -= POOL_CHUNK_SIZE; /* cache full: evict one chunk back to the global pool */
+	for (i = 0; i < POOL_CHUNK_SIZE; i++)
+		ret_buf(&pool->s, buf_cache->s.buf[num + i]);
+
+	buf_cache->s.num_buf = num;
+	ret_local_bufs(buf_cache, num, &buf_hdr, 1);
+}
+
+void odp_buffer_free(odp_buffer_t buf)
+{
+	uint32_t pool_id = pool_id_from_buf(buf); /* pool index decoded from the handle */
+
+	buffer_free(pool_id, buf); /* single-buffer local cache path */
+}
 
 void odp_buffer_free_multi(const odp_buffer_t buf[], int num)
 {
-	int i;
+	uint32_t pool_id = pool_id_from_buf(buf[0]); /* NOTE(review): reads buf[0] unconditionally — assumes num >= 1; confirm API contract */
 
-	for (i = 0; i < num; ++i)
-		odp_buffer_free(buf[i]);
-}
-
-void _odp_flush_caches(void)
-{
-	int i;
-
-	for (i = 0; i < ODP_CONFIG_POOLS; i++) {
-		pool_entry_t *pool = get_pool_entry(i);
-		flush_cache(&pool->s.local_cache[local_id], &pool->s);
-	}
+	buffer_free_multi(pool_id, buf, num); /* detects and handles buffers from other pools internally */
 }
 
 void odp_pool_print(odp_pool_t pool_hdl)
@@ -774,7 +1105,6 @@  void odp_pool_print(odp_pool_t pool_hdl)
 		pool->s.quiesced ? "quiesced" : "active");
 	ODP_DBG(" pool opts       %s, %s, %s\n",
 		pool->s.flags.unsegmented ? "unsegmented" : "segmented",
-		pool->s.flags.zeroized ? "zeroized" : "non-zeroized",
 		pool->s.flags.predefined  ? "predefined" : "created");
 	ODP_DBG(" pool base       %p\n",  pool->s.pool_base_addr);
 	ODP_DBG(" pool size       %zu (%zu pages)\n",
@@ -817,10 +1147,11 @@  void odp_pool_print(odp_pool_t pool_hdl)
 	ODP_DBG(" blk low wm count    %lu\n", blklowmct);
 }
 
-
 odp_pool_t odp_buffer_pool(odp_buffer_t buf)
 {
-	return odp_buf_to_hdr(buf)->pool_hdl;
+	uint32_t pool_id = pool_id_from_buf(buf); /* pool index decoded from the handle, no header dereference */
+
+	return pool_index_to_handle(pool_id);
 }
 
 void odp_pool_param_init(odp_pool_param_t *params)