diff mbox series

[v2,4/5] resolv: Optimize inet_ntop

Message ID 20250604204332.2090912-5-adhemerval.zanella@linaro.org
State New
Headers show
Series Optimize inet_ntop | expand

Commit Message

Adhemerval Zanella Netto June 4, 2025, 8:42 p.m. UTC
The benchtests/inet_ntop_ipv4 and benchtests/inet_ntop_ipv6 profiles
show that most of the time is spent in costly sprintf operations:

$ perf record ./benchtests/bench-inet_ntop_ipv4 && perf report --stdio
[...]
    38.53%  bench-inet_ntop  libc.so               [.] __printf_buffer
    18.69%  bench-inet_ntop  libc.so               [.] __printf_buffer_write
    11.01%  bench-inet_ntop  libc.so               [.] _itoa_word
     8.02%  bench-inet_ntop  bench-inet_ntop_ipv4  [.] bench_start
     6.99%  bench-inet_ntop  libc.so               [.] __memmove_avx_unaligned_erms
     3.86%  bench-inet_ntop  libc.so               [.] __strchrnul_avx2
     2.82%  bench-inet_ntop  libc.so               [.] __strcpy_avx2
     1.90%  bench-inet_ntop  libc.so               [.] inet_ntop4
     1.78%  bench-inet_ntop  libc.so               [.] __vsprintf_internal
     1.55%  bench-inet_ntop  libc.so               [.] __sprintf_chk
     1.18%  bench-inet_ntop  libc.so               [.] __GI___inet_ntop

$ perf record ./benchtests/bench-inet_ntop_ipv6 && perf report --stdio
    35.44%  bench-inet_ntop  libc.so               [.] __printf_buffer
    14.35%  bench-inet_ntop  libc.so               [.] __printf_buffer_write
    10.27%  bench-inet_ntop  libc.so               [.] __GI___inet_ntop
     7.93%  bench-inet_ntop  libc.so               [.] _itoa_word
     7.00%  bench-inet_ntop  libc.so               [.] __sprintf_chk
     6.20%  bench-inet_ntop  libc.so               [.] __vsprintf_internal
     5.26%  bench-inet_ntop  libc.so               [.] __strchrnul_avx2
     5.05%  bench-inet_ntop  bench-inet_ntop_ipv6  [.] bench_start
     3.70%  bench-inet_ntop  libc.so               [.] __memmove_avx_unaligned_erms
     2.11%  bench-inet_ntop  libc.so               [.] __printf_buffer_done

A new implementation is used instead:

  * The printf usage is replaced with an expanded function that prints
    either an IPv4 octet or an IPv6 quartet;

  * The strcpy is replaced with a memcpy (since ABIs usually tend to
    optimize the latter);

  * For IPv6, the '::' shorthanding is done in-place instead of using
    a temporary buffer.

  * A temporary buffer is used iff the size is smaller than
    INET_ADDRSTRLEN/INET6_ADDRSTRLEN.

  * Inlining is used for both inet_ntop4 and inet_ntop6.

The code is significantly rewritten, so I take it that this requires a new license.

The performance results on aarch64 Neoverse1 with gcc 14.2.1:

* master

aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.43067e+09,
    "iterations": 8e+06,
    "reciprocal-throughput": 178.572,
    "latency": 179.096,
    "max-throughput": 5.59997e+06,
    "min-throughput": 5.58359e+06
   }
aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6
  "inet_ntop_ipv6": {
   "workload-ipv6-random": {
    "duration": 1.68539e+09,
    "iterations": 4e+06,
    "reciprocal-throughput": 421.307,
    "latency": 421.388,
    "max-throughput": 2.37357e+06,
    "min-throughput": 2.37311e+06
   }
  }

* patched

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.06133e+09,
    "iterations": 5.6e+07,
    "reciprocal-throughput": 18.8482,
    "latency": 19.0565,
    "max-throughput": 5.30555e+07,
    "min-throughput": 5.24755e+07
   }
  }
aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6
  "inet_ntop_ipv6": {
   "workload-ipv6-random": {
    "duration": 1.01246e+09,
    "iterations": 2.4e+07,
    "reciprocal-throughput": 42.5576,
    "latency": 41.8139,
    "max-throughput": 2.34976e+07,
    "min-throughput": 2.39155e+07
   }
  }

Checked on aarch64-linux-gnu and x86_64-linux-gnu.
---
 resolv/inet_ntop.c | 301 +++++++++++++++++++++++----------------------
 1 file changed, 156 insertions(+), 145 deletions(-)
diff mbox series

Patch

diff --git a/resolv/inet_ntop.c b/resolv/inet_ntop.c
index 5c414373b7..299756f8ee 100644
--- a/resolv/inet_ntop.c
+++ b/resolv/inet_ntop.c
@@ -1,136 +1,155 @@ 
-/*
- * Copyright (c) 1996-1999 by Internet Software Consortium.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
- * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
- * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
- * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
- * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
+/* Convert IPv4/IPv6 addresses from binary to text form.
+   Copyright (C) 1996-2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/socket.h>
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
 
-#include <netinet/in.h>
 #include <arpa/inet.h>
 #include <arpa/nameser.h>
-
 #include <errno.h>
-#include <stdio.h>
 #include <string.h>
+#include <_itoa.h>
 
-#ifdef SPRINTF_CHAR
-# define SPRINTF(x) strlen (sprintf /**/ x)
-#else
-# define SPRINTF(x) ((size_t) sprintf x)
-#endif
-
-/*
- * WARNING: Don't even consider trying to compile this on a system where
- * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
- */
-
-static const char *inet_ntop4 (const u_char *src, char *dst, socklen_t size);
-static const char *inet_ntop6 (const u_char *src, char *dst, socklen_t size);
-
-/* char *
- * __inet_ntop(af, src, dst, size)
- *	convert a network format address to presentation format.
- * return:
- *	pointer to presentation format address (`dst'), or NULL (see errno).
- * author:
- *	Paul Vixie, 1996.
- */
-const char *
-__inet_ntop (int af, const void *src, char *dst, socklen_t size)
+static inline char *
+put_uint8 (uint8_t word, char *tp)
 {
-  switch (af)
+  int s = 1;
+  if (word >= 10)
     {
-    case AF_INET:
-      return (inet_ntop4 (src, dst, size));
-    case AF_INET6:
-      return (inet_ntop6 (src, dst, size));
-    default:
-      __set_errno (EAFNOSUPPORT);
-      return (NULL);
+      if (word >= 100)
+	{
+	  tp[2] = '0' + word % 10;
+	  word /= 10;
+	  s += 1;
+	}
+
+      tp[1] = '0' + word % 10;
+      word /= 10;
+      s += 1;
     }
-  /* NOTREACHED */
+  *tp = '0' + word % 10;
+  return tp + s;
 }
-libc_hidden_def (__inet_ntop)
-weak_alias (__inet_ntop, inet_ntop)
 
-/* const char *
- * inet_ntop4(src, dst, size)
- *	format an IPv4 address
- * return:
- *	`dst' (as a const)
- * notes:
- *	(1) uses no statics
- *	(2) takes a u_char* not an in_addr as input
- * author:
- *	Paul Vixie, 1996.
- */
-static const char *
-inet_ntop4 (const u_char *src, char *dst, socklen_t size)
+static inline char *
+put_uint16 (uint16_t word, char *tp)
 {
-  static const char fmt[] = "%u.%u.%u.%u";
-  char tmp[sizeof "255.255.255.255"];
+  if (word >= 0x1000)
+    *tp++ = _itoa_lower_digits[(word >> 12) & 0xf];
+  if (word >= 0x100)
+    *tp++ = _itoa_lower_digits[(word >> 8) & 0xf];
+  if (word >= 0x10)
+    *tp++ = _itoa_lower_digits[(word >> 4) & 0xf];
+  *tp++ = _itoa_lower_digits[word & 0xf];
+  return tp;
+}
 
-  if (SPRINTF ((tmp, fmt, src[0], src[1], src[2], src[3])) >= size)
+static __always_inline char *
+inet_ntop4_format (const uint8_t *src, char *dst)
+{
+  dst = put_uint8 (src[0], dst);
+  *(dst++) = '.';
+  dst = put_uint8 (src[1], dst);
+  *(dst++) = '.';
+  dst = put_uint8 (src[2], dst);
+  *(dst++) = '.';
+  dst = put_uint8 (src[3], dst);
+  *dst++ = '\0';
+  return dst;
+}
+
+static __always_inline const char *
+inet_ntop4 (const uint8_t *src, char *dst, socklen_t size)
+{
+  if (size >= INET_ADDRSTRLEN)
+    {
+      inet_ntop4_format (src, dst);
+      return dst;
+    }
+
+  char tmp[INET_ADDRSTRLEN];
+  char *tp = inet_ntop4_format (src, tmp);
+  socklen_t tmp_s = tp - tmp;
+  if (tmp_s > size)
     {
       __set_errno (ENOSPC);
-      return (NULL);
+      return NULL;
     }
-  return strcpy (dst, tmp);
+  return memcpy (dst, tmp, tmp_s);
 }
 
-/* const char *
- * inet_ntop6(src, dst, size)
- *	convert IPv6 binary address into presentation (printable) format
- * author:
- *	Paul Vixie, 1996.
- */
-static const char *
-inet_ntop6 (const u_char *src, char *dst, socklen_t size)
+struct best_t
 {
-  /*
-   * Note that int32_t and int16_t need only be "at least" large enough
-   * to contain a value of the specified size.  On some systems, like
-   * Crays, there is no such thing as an integer variable with 16 bits.
-   * Keep this in mind if you think this function should have been coded
-   * to use pointer overlays.  All the world's not a VAX.
-   */
-  char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
-  struct
-  {
-    int base, len;
-  } best, cur;
-  u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
-  int i;
+  int base;
+  int len;
+};
 
-  /*
-   * Preprocess:
-   *	Copy the input (bytewise) array into a wordwise array.
-   *	Find the longest run of 0x00's in src[] for :: shorthanding.
-   */
-  memset (words, '\0', sizeof words);
-  for (i = 0; i < NS_IN6ADDRSZ; i += 2)
-    words[i / 2] = (src[i] << 8) | src[i + 1];
-  best.base = -1;
-  cur.base = -1;
-  best.len = 0;
-  cur.len = 0;
-  for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++)
+static inline uint16_t
+in6_addr_addr16 (const struct in6_addr *src, int idx)
+{
+  const struct { uint16_t x; } __attribute__((__packed__)) *pptr =
+    (typeof(pptr))(&src->s6_addr16[idx]);
+  return ntohs (pptr->x);
+}
+
+static __always_inline char *
+inet_ntop6_format (const struct in6_addr *src, struct best_t best, char *dst)
+{
+  char *tp = dst;
+  for (int i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++)
     {
-      if (words[i] == 0)
+      /* Are we inside the best run of 0x00's? */
+      if (best.base != -1 && i >= best.base && i < (best.base + best.len))
+	{
+	  if (i == best.base)
+	    *tp++ = ':';
+	  continue;
+	}
+      /* Are we following an initial run of 0x00s or any real hex? */
+      if (i != 0)
+	*tp++ = ':';
+      /* Is this address an encapsulated IPv4? */
+      if (i == 6 && best.base == 0
+	  && (best.len == 6 || (best.len == 5
+				&& in6_addr_addr16 (src, 5) == 0xffff)))
+	{
+	  if (!inet_ntop4 (src->s6_addr + 12, tp,
+			   INET6_ADDRSTRLEN - (tp - dst)))
+	    return NULL;
+	  tp += strlen (tp);
+	  break;
+	}
+      tp = put_uint16 (in6_addr_addr16 (src, i), tp);
+    }
+  /* Was it a trailing run of 0x00's? */
+  if (best.base != -1 && (best.base + best.len) == (NS_IN6ADDRSZ / NS_INT16SZ))
+    *tp++ = ':';
+  *tp++ = '\0';
+
+  return tp;
+}
+
+static inline const char *
+inet_ntop6 (const struct in6_addr *src, char *dst, socklen_t size)
+{
+  struct best_t best = { -1, 0 }, cur = { -1, 0 };
+
+  /* Find the longest run of 0x00's in src[] for :: shorthanding.  */
+  for (int i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++)
+    {
+      if (in6_addr_addr16 (src, i) == 0)
 	{
 	  if (cur.base == -1)
 	    cur.base = i, cur.len = 1;
@@ -155,45 +174,37 @@  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
   if (best.base != -1 && best.len < 2)
     best.base = -1;
 
-  /*
-   * Format the result.
-   */
-  tp = tmp;
-  for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++)
+  if (size >= INET6_ADDRSTRLEN)
     {
-      /* Are we inside the best run of 0x00's? */
-      if (best.base != -1 && i >= best.base && i < (best.base + best.len))
-	{
-	  if (i == best.base)
-	    *tp++ = ':';
-	  continue;
-	}
-      /* Are we following an initial run of 0x00s or any real hex? */
-      if (i != 0)
-	*tp++ = ':';
-      /* Is this address an encapsulated IPv4? */
-      if (i == 6 && best.base == 0
-	  && (best.len == 6 || (best.len == 5 && words[5] == 0xffff)))
-	{
-	  if (!inet_ntop4 (src + 12, tp, sizeof tmp - (tp - tmp)))
-	    return (NULL);
-	  tp += strlen (tp);
-	  break;
-	}
-      tp += SPRINTF ((tp, "%x", words[i]));
+      inet_ntop6_format (src, best, dst);
+      return dst;
     }
-  /* Was it a trailing run of 0x00's? */
-  if (best.base != -1 && (best.base + best.len) == (NS_IN6ADDRSZ / NS_INT16SZ))
-    *tp++ = ':';
-  *tp++ = '\0';
 
-  /*
-   * Check for overflow, copy, and we're done.
-   */
-  if ((socklen_t) (tp - tmp) > size)
+  char tmp[INET6_ADDRSTRLEN];
+  char *tp = inet_ntop6_format (src, best, tmp);
+
+  socklen_t tmp_s = tp - tmp;
+  if (tmp_s > size)
     {
       __set_errno (ENOSPC);
       return (NULL);
     }
-  return strcpy (dst, tmp);
+  return memcpy (dst, tmp, tmp_s);
 }
+
+const char *
+__inet_ntop (int af, const void *src, char *dst, socklen_t size)
+{
+  switch (af)
+    {
+    case AF_INET:
+      return (inet_ntop4 (src, dst, size));
+    case AF_INET6:
+      return (inet_ntop6 (src, dst, size));
+    default:
+      __set_errno (EAFNOSUPPORT);
+      return (NULL);
+    }
+}
+libc_hidden_def (__inet_ntop)
+weak_alias (__inet_ntop, inet_ntop)