diff mbox series

[3/3] resolv: Optimize inet_ntop

Message ID 20250603195332.2822499-4-adhemerval.zanella@linaro.org
State New
Headers show
Series Optimize inet_ntop | expand

Commit Message

Adhemerval Zanella Netto June 3, 2025, 7:51 p.m. UTC
The benchtests/inet_ntop_ipv4 and benchtests/inet_ntop_ipv6 profiles
show that most of the time is spent in costly sprintf operations:

$ perf record ./benchtests/bench-inet_ntop_ipv4 && perf report --stdio
[...]
    38.53%  bench-inet_ntop  libc.so               [.] __printf_buffer
    18.69%  bench-inet_ntop  libc.so               [.] __printf_buffer_write
    11.01%  bench-inet_ntop  libc.so               [.] _itoa_word
     8.02%  bench-inet_ntop  bench-inet_ntop_ipv4  [.] bench_start
     6.99%  bench-inet_ntop  libc.so               [.] __memmove_avx_unaligned_erms
     3.86%  bench-inet_ntop  libc.so               [.] __strchrnul_avx2
     2.82%  bench-inet_ntop  libc.so               [.] __strcpy_avx2
     1.90%  bench-inet_ntop  libc.so               [.] inet_ntop4
     1.78%  bench-inet_ntop  libc.so               [.] __vsprintf_internal
     1.55%  bench-inet_ntop  libc.so               [.] __sprintf_chk
     1.18%  bench-inet_ntop  libc.so               [.] __GI___inet_ntop

$ perf record ./benchtests/bench-inet_ntop_ipv6 && perf report --stdio
    35.44%  bench-inet_ntop  libc.so               [.] __printf_buffer
    14.35%  bench-inet_ntop  libc.so               [.] __printf_buffer_write
    10.27%  bench-inet_ntop  libc.so               [.] __GI___inet_ntop
     7.93%  bench-inet_ntop  libc.so               [.] _itoa_word
     7.00%  bench-inet_ntop  libc.so               [.] __sprintf_chk
     6.20%  bench-inet_ntop  libc.so               [.] __vsprintf_internal
     5.26%  bench-inet_ntop  libc.so               [.] __strchrnul_avx2
     5.05%  bench-inet_ntop  bench-inet_ntop_ipv6  [.] bench_start
     3.70%  bench-inet_ntop  libc.so               [.] __memmove_avx_unaligned_erms
     2.11%  bench-inet_ntop  libc.so               [.] __printf_buffer_done

The printf usage is replaced with an expanded function that prints
either an IPv4 octet or an IPv6 quartet, the strcpy is replaced
with a memcpy (since ABIs usually optimize the symbol), and
inline is used for both inet_ntop4 and inet_ntop6.

The performance results on aarch64 Neoverse1 with gcc 14.2.1:

* master

aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.43067e+09,
    "iterations": 8e+06,
    "reciprocal-throughput": 178.572,
    "latency": 179.096,
    "max-throughput": 5.59997e+06,
    "min-throughput": 5.58359e+06
   }
aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6
  "inet_ntop_ipv6": {
   "workload-ipv6-random": {
    "duration": 1.68539e+09,
    "iterations": 4e+06,
    "reciprocal-throughput": 421.307,
    "latency": 421.388,
    "max-throughput": 2.37357e+06,
    "min-throughput": 2.37311e+06
   }
  }

* patched

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.06324e+09,
    "iterations": 4.4e+07,
    "reciprocal-throughput": 24.1509,
    "latency": 24.178,
    "max-throughput": 4.14063e+07,
    "min-throughput": 4.13599e+07
   }
aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6
  "inet_ntop_ipv6": {
   "workload-ipv6-random": {
    "duration": 1.08476e+09,
    "iterations": 2.4e+07,
    "reciprocal-throughput": 45.2794,
    "latency": 45.1174,
    "max-throughput": 2.20851e+07,
    "min-throughput": 2.21644e+07
   }
  }

Checked on aarch64-linux-gnu and x86_64-linux-gnu.
---
 resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++----------------
 1 file changed, 87 insertions(+), 47 deletions(-)

Comments

Paul Eggert June 3, 2025, 10:27 p.m. UTC | #1
On 6/3/25 12:51, Adhemerval Zanella wrote:


> static inline char *put_uint8 (uint8_t word, char *tp)

Put the function name at the start of a line.

> +  intptr_t s = 1;

There should be no need for intptr_t; 'int' is good enough and clearer 
and should generate the same machine code.

> +      tp[2] = _itoa_lower_digits[word % 10];

Don't consult an array; just add '0'.

> +  if (word >= 100)
> ...
> +  if (word >= 10)

Since this is for performance, shouldn't the code avoid conditional 
branches? Something like the following, say:

   /* Branchless decimal formatting of an 8-bit value into TP.  Every
      digit slot is stored unconditionally and the pointer advances
      only past the significant slots, so an insignificant leading
      digit is simply overwritten by the next store.  Returns one past
      the last digit written.  */
   static inline char *
   put_uint8 (uint8_t word, char *tp)
   {
     *tp = '0' + word / 100;      /* hundreds digit; kept only if word >= 100 */
     tp += 100 <= word;
     *tp = '0' + word / 10 % 10;  /* tens digit; kept only if word >= 10 */
     tp += 10 <= word;
     *tp++ = '0' + word % 10;     /* ones digit; always emitted */
     return tp;
   }

Alternatively, if we're going to have conditional branches, there is no 
need to test (word >= 100) if (word >= 10) is false.
Collin Funk June 3, 2025, 10:32 p.m. UTC | #2
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:

> The performance results on aarch64 Neoverse1 with gcc 14.2.1:
>
> * master
>
> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4
>   "inet_ntop_ipv4": {
>    "workload-ipv4-random": {
>     "duration": 1.43067e+09,
>     "iterations": 8e+06,
>     "reciprocal-throughput": 178.572,
>     "latency": 179.096,
>     "max-throughput": 5.59997e+06,
>     "min-throughput": 5.58359e+06
>    }
> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6
>   "inet_ntop_ipv6": {
>    "workload-ipv6-random": {
>     "duration": 1.68539e+09,
>     "iterations": 4e+06,
>     "reciprocal-throughput": 421.307,
>     "latency": 421.388,
>     "max-throughput": 2.37357e+06,
>     "min-throughput": 2.37311e+06
>    }
>   }
>
> * patched
>
> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
>   "inet_ntop_ipv4": {
>    "workload-ipv4-random": {
>     "duration": 1.06324e+09,
>     "iterations": 4.4e+07,
>     "reciprocal-throughput": 24.1509,
>     "latency": 24.178,
>     "max-throughput": 4.14063e+07,
>     "min-throughput": 4.13599e+07
>    }
> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6
>   "inet_ntop_ipv6": {
>    "workload-ipv6-random": {
>     "duration": 1.08476e+09,
>     "iterations": 2.4e+07,
>     "reciprocal-throughput": 45.2794,
>     "latency": 45.1174,
>     "max-throughput": 2.20851e+07,
>     "min-throughput": 2.21644e+07
>    }
>   }
>
> Checked on aarch64-linux-gnu and x86_64-linux-gnu.
> ---
>  resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++----------------
>  1 file changed, 87 insertions(+), 47 deletions(-)

This part of the patch series looks good to me. So:

Reviewed-by: Collin Funk <collin.funk1@gmail.com>

I see similar results as well:

    $ grep '^model name' /proc/cpuinfo | sed 1q
    model name	: AMD Ryzen 7 3700X 8-Core Processor
    $ gcc --version | sed 1q
    gcc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2)
    # Pre patch
    $ cat benchtests/bench.out 
    {"timing_type": "hp_timing",
     "functions": {
      "inet_ntop_ipv4": {
       "workload-ipv4-random": {
        "duration": 3.94723e+09,
        "iterations": 8e+06,
        "reciprocal-throughput": 492.274,
        "latency": 494.534,
        "max-throughput": 2.03139e+06,
        "min-throughput": 2.0221e+06
       }
      }
    ,
      "inet_ntop_ipv6": {
       "workload-ipv6-random": {
        "duration": 4.67659e+09,
        "iterations": 4e+06,
        "reciprocal-throughput": 1164.13,
        "latency": 1174.17,
        "max-throughput": 859010,
        "min-throughput": 851668
       }
      }
    
     }
    }
    # Post patch
    $ cat benchtests/bench.out 
    {"timing_type": "hp_timing",
     "functions": {
      "inet_ntop_ipv4": {
       "workload-ipv4-random": {
        "duration": 3.67214e+09,
        "iterations": 8.4e+07,
        "reciprocal-throughput": 43.7862,
        "latency": 43.6457,
        "max-throughput": 2.28382e+07,
        "min-throughput": 2.29118e+07
       }
      }
    ,
      "inet_ntop_ipv6": {
       "workload-ipv6-random": {
        "duration": 3.9483e+09,
        "iterations": 3.6e+07,
        "reciprocal-throughput": 109.916,
        "latency": 109.434,
        "max-throughput": 9.09786e+06,
        "min-throughput": 9.13793e+06
       }
      }
    
     }
    }
    
Thanks,
Collin
Florian Weimer June 4, 2025, 8:09 a.m. UTC | #3
* Adhemerval Zanella:

> -static const char *
> +static inline const char *
>  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>  {
>  	/*
> @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>  	 */
>  	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
>  	struct { int base, len; } best, cur;
> -	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
> +	uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 };
>  	int i;
>  
>  	/*
> @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>  	 *	Copy the input (bytewise) array into a wordwise array.
>  	 *	Find the longest run of 0x00's in src[] for :: shorthanding.
>  	 */
> -	memset(words, '\0', sizeof words);
>  	for (i = 0; i < NS_IN6ADDRSZ; i += 2)
>  		words[i / 2] = (src[i] << 8) | src[i + 1];

The words copy shouldn't really be needed, GCC can usually combine two
byte accesses into single 16-bit loads.

It should be possible to optimize put_uint16 using SWAR techniques.  We
don't need to do byte-wise output, either.  We can use full 32-bit
writes, potentially with overlaps.

If the user-supplied output buffer has the maximum possible size, we
don't need the temporary copy.  (This applies to the IPv4 variant, too.)

Thanks,
Florian
Adhemerval Zanella Netto June 4, 2025, 2:12 p.m. UTC | #4
On 03/06/25 19:27, Paul Eggert wrote:
> On 6/3/25 12:51, Adhemerval Zanella wrote:
> 
> 
>> static inline char *put_uint8 (uint8_t word, char *tp)
> 
> Put the function name at the start of a line.

Ack.

> 
>> +  intptr_t s = 1;
> 
> There should be no need for intptr_t; 'int' is good enough and clearer and should generate the same machine code.

Ack.

> 
>> +      tp[2] = _itoa_lower_digits[word % 10];
> 
> Don't consult an array; just add '0'.
> 
>> +  if (word >= 100)
>> ...
>> +  if (word >= 10)
> 
> Since this is for performance, shouldn't the code avoid conditional branches? Something like the following, say:
> 
>   static inline char *
>   put_uint8 (uint8_t word, char *tp)
>   {
>     *tp = '0' + word / 100;
>     tp += 100 <= word;
>     *tp = '0' + word / 10 % 10;
>     tp += 10 <= word;
>     *tp++ = '0' + word % 10;
>     return tp;
>   }
> 

I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64 (Neoverse N1),
but slower on x86_64 (Zen3):

* patch

x86_64-linux-gnu $ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 3.74298e+09,
    "iterations": 1.12e+08,
    "reciprocal-throughput": 33.4216,
    "latency": 33.4173,
    "max-throughput": 2.99208e+07,
    "min-throughput": 2.99246e+07
   }
  }

* suggestion

x86_64-linux-gnu $ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 3.86481e+09,
    "iterations": 8.8e+07,
    "reciprocal-throughput": 43.8419,
    "latency": 43.9948,
    "max-throughput": 2.28092e+07,
    "min-throughput": 2.27299e+07
   }
  }

* patch

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.07013e+09,
    "iterations": 4.4e+07,
    "reciprocal-throughput": 24.5301,
    "latency": 24.1121,
    "max-throughput": 4.07662e+07,
    "min-throughput": 4.14729e+07
   }
  }

* suggestion

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.05304e+09,
    "iterations": 5.6e+07,
    "reciprocal-throughput": 18.8547,
    "latency": 18.7539,
    "max-throughput": 5.30373e+07,
    "min-throughput": 5.33221e+07
   }
  }

> Alternatively, if we're going to have conditional branches, there is no need to test (word >= 100) if (word >= 10) is false.

It indeed seems to help some chips (aarch64 Neoverse N1):

/* Format WORD (0-255) as decimal digits at TP, without a leading
   zero, and return a pointer one past the last digit written.  */
static inline char *
put_uint8 (uint8_t word, char *tp)
{
  if (word >= 100)
    {
      /* Three digits: hundreds, tens, ones.  */
      tp[0] = '0' + word / 100;
      tp[1] = '0' + (word / 10) % 10;
      tp[2] = '0' + word % 10;
      return tp + 3;
    }
  if (word >= 10)
    {
      /* Two digits: tens, ones.  */
      tp[0] = '0' + word / 10;
      tp[1] = '0' + word % 10;
      return tp + 2;
    }
  /* Single digit.  */
  tp[0] = '0' + word;
  return tp + 1;
}

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 1.07388e+09,
    "iterations": 5.2e+07,
    "reciprocal-throughput": 20.3728,
    "latency": 20.9305,
    "max-throughput": 4.90851e+07,
    "min-throughput": 4.77772e+07
   }
  }

I think I will go this version since it seems to be better compromise. I might check
on some other chips I have access.
Adhemerval Zanella Netto June 4, 2025, 2:13 p.m. UTC | #5
On 03/06/25 19:32, Collin Funk wrote:
> Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> 
>> The performance results on aarch64 Neoverse1 with gcc 14.2.1:
>>
>> * master
>>
>> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4
>>   "inet_ntop_ipv4": {
>>    "workload-ipv4-random": {
>>     "duration": 1.43067e+09,
>>     "iterations": 8e+06,
>>     "reciprocal-throughput": 178.572,
>>     "latency": 179.096,
>>     "max-throughput": 5.59997e+06,
>>     "min-throughput": 5.58359e+06
>>    }
>> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6
>>   "inet_ntop_ipv6": {
>>    "workload-ipv6-random": {
>>     "duration": 1.68539e+09,
>>     "iterations": 4e+06,
>>     "reciprocal-throughput": 421.307,
>>     "latency": 421.388,
>>     "max-throughput": 2.37357e+06,
>>     "min-throughput": 2.37311e+06
>>    }
>>   }
>>
>> * patched
>>
>> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
>>   "inet_ntop_ipv4": {
>>    "workload-ipv4-random": {
>>     "duration": 1.06324e+09,
>>     "iterations": 4.4e+07,
>>     "reciprocal-throughput": 24.1509,
>>     "latency": 24.178,
>>     "max-throughput": 4.14063e+07,
>>     "min-throughput": 4.13599e+07
>>    }
>> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6
>>   "inet_ntop_ipv6": {
>>    "workload-ipv6-random": {
>>     "duration": 1.08476e+09,
>>     "iterations": 2.4e+07,
>>     "reciprocal-throughput": 45.2794,
>>     "latency": 45.1174,
>>     "max-throughput": 2.20851e+07,
>>     "min-throughput": 2.21644e+07
>>    }
>>   }
>>
>> Checked on aarch64-linux-gnu and x86_64-linux-gnu.
>> ---
>>  resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++----------------
>>  1 file changed, 87 insertions(+), 47 deletions(-)
> 
> This part of the patch series looks good to me. So:
> 
> Reviewed-by: Collin Funk <collin.funk1@gmail.com>

Thanks.

> 
> I see similar results as well:
> 
>     $ grep '^model name' /proc/cpuinfo | sed 1q
>     model name	: AMD Ryzen 7 3700X 8-Core Processor
>     $ gcc --version | sed 1q
>     gcc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2)
>     # Pre patch
>     $ cat benchtests/bench.out 
>     {"timing_type": "hp_timing",
>      "functions": {
>       "inet_ntop_ipv4": {
>        "workload-ipv4-random": {
>         "duration": 3.94723e+09,
>         "iterations": 8e+06,
>         "reciprocal-throughput": 492.274,
>         "latency": 494.534,
>         "max-throughput": 2.03139e+06,
>         "min-throughput": 2.0221e+06
>        }
>       }
>     ,
>       "inet_ntop_ipv6": {
>        "workload-ipv6-random": {
>         "duration": 4.67659e+09,
>         "iterations": 4e+06,
>         "reciprocal-throughput": 1164.13,
>         "latency": 1174.17,
>         "max-throughput": 859010,
>         "min-throughput": 851668
>        }
>       }
>     
>      }
>     }
>     # Post patch
>     $ cat benchtests/bench.out 
>     {"timing_type": "hp_timing",
>      "functions": {
>       "inet_ntop_ipv4": {
>        "workload-ipv4-random": {
>         "duration": 3.67214e+09,
>         "iterations": 8.4e+07,
>         "reciprocal-throughput": 43.7862,
>         "latency": 43.6457,
>         "max-throughput": 2.28382e+07,
>         "min-throughput": 2.29118e+07
>        }
>       }
>     ,
>       "inet_ntop_ipv6": {
>        "workload-ipv6-random": {
>         "duration": 3.9483e+09,
>         "iterations": 3.6e+07,
>         "reciprocal-throughput": 109.916,
>         "latency": 109.434,
>         "max-throughput": 9.09786e+06,
>         "min-throughput": 9.13793e+06
>        }
>       }
>     
>      }
>     }
>     
> Thanks,
> Collin
Adhemerval Zanella Netto June 4, 2025, 3:21 p.m. UTC | #6
On 04/06/25 05:09, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> -static const char *
>> +static inline const char *
>>  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>>  {
>>  	/*
>> @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>>  	 */
>>  	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
>>  	struct { int base, len; } best, cur;
>> -	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
>> +	uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 };
>>  	int i;
>>  
>>  	/*
>> @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>>  	 *	Copy the input (bytewise) array into a wordwise array.
>>  	 *	Find the longest run of 0x00's in src[] for :: shorthanding.
>>  	 */
>> -	memset(words, '\0', sizeof words);
>>  	for (i = 0; i < NS_IN6ADDRSZ; i += 2)
>>  		words[i / 2] = (src[i] << 8) | src[i + 1];
> 
> The words copy shouldn't really be needed, GCC can usually combine two
> byte accesses into single 16-bit loads.

There is no need for the words array in fact, we can ntohs when
accessing the s6_addr16 for source (and on most chips it should be
lowered efficiently). 

> 
> It should be possible to optimize put_uint16 using SWAR techniques.  We
> don't need to do byte-wise output, either.  We can use full 32-bit
> writes, potentially with overlaps.

Not sure about that because the usual way is to present ipv6 in
quartet and using 32-bit would require to decompose it anyway.  Also
Paul's suggestion to do overlap writes showed mixed results on different
chips.

> 
> If the user-supplied output buffer has the maximum possible size, we
> don't need the temporary copy.  (This applies to the IPv4 variant, too.)
It shows a small performance increase, I have added this change.
Florian Weimer June 4, 2025, 5:08 p.m. UTC | #7
* Adhemerval Zanella Netto:

>> The words copy shouldn't really be needed, GCC can usually combine two
>> byte accesses into single 16-bit loads.
>
> There is no need for the words array in fact, we can ntohs when
> accessing the s6_addr16 for source (and on most chips it should be
> lowered efficiently).

I believe this introduces an alignment requirement that the inet_ntop
specification does not actually provide.

Thanks,
Florian
Adhemerval Zanella Netto June 4, 2025, 5:30 p.m. UTC | #8
On 04/06/25 14:08, Florian Weimer wrote:
> * Adhemerval Zanella Netto:
> 
>>> The words copy shouldn't really be needed, GCC can usually combine two
>>> byte accesses into single 16-bit loads.
>>
>> There is no need for the words array in fact, we can ntohs when
>> accessing the s6_addr16 for source (and on most chips it should be
>> lowered efficiently).
> 
> I believe this introduces an alignment requirement that the inet_ntop
> specification does not actually provide.

Sure, and we can just use a wrapper to handle this correctly:

/* Load the IDX-th 16-bit word of the IPv6 address SRC and convert it
   from network to host byte order.  The access goes through a packed
   single-member struct so the compiler emits a load that is valid even
   when SRC is not 2-byte aligned (inet_ntop does not require any
   alignment of its source buffer); on strict-alignment targets such as
   sparc this lowers to byte loads.  */
static inline uint16_t
in6_addr_addr16 (const struct in6_addr *src, int idx)
{
  const struct { uint16_t x; } __attribute__((__packed__)) *pptr =
    (typeof(pptr))(&src->s6_addr16[idx]);
  return ntohs (pptr->x);
}

It uses byte loads on architectures that do not allow unaligned
access (like sparc).  It also generates slightly better code than
the usual temporary-plus-memcpy we do for other uses on sparc
(it does not spill store-byte for the temporary).
Collin Funk June 4, 2025, 5:36 p.m. UTC | #9
Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:

>> This part of the patch series looks good to me. So:
>> 
>> Reviewed-by: Collin Funk <collin.funk1@gmail.com>
>
> Thanks.

NP. How about I write a patch to have 'inet_ntoa' use
'inet_ntop (..., AF_INET)' once all the issues others have noticed are
addressed?

That function could benefit from the same optimization.

Collin
Adhemerval Zanella Netto June 4, 2025, 5:37 p.m. UTC | #10
On 04/06/25 14:36, Collin Funk wrote:
> Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
> 
>>> This part of the patch series looks good to me. So:
>>>
>>> Reviewed-by: Collin Funk <collin.funk1@gmail.com>
>>
>> Thanks.
> 
> NP. How about I write a patch to have 'inet_ntoa' use
> 'inet_ntop (..., AF_INET)' once all the issues others have noticed are
> addressed?
> 
> That function could benefit from the same optimization.

Sure, if you don't mind I can also add on the v2.
Maciej W. Rozycki June 6, 2025, 4:51 p.m. UTC | #11
On Wed, 4 Jun 2025, Adhemerval Zanella Netto wrote:

> >> +  if (word >= 100)
> >> ...
> >> +  if (word >= 10)
> > 
> > Since this is for performance, shouldn't the code avoid conditional branches? Something like the following, say:
> > 
> >   static inline char *
> >   put_uint8 (uint8_t word, char *tp)
> >   {
> >     *tp = '0' + word / 100;
> >     tp += 100 <= word;
> >     *tp = '0' + word / 10 % 10;
> >     tp += 10 <= word;
> >     *tp++ = '0' + word % 10;
> >     return tp;
> >   }
> > 
> 
> I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64 (Neoverse N1),
> but slower on x86_64 (Zen3):

 How many samples did you collect?

 Hmm, I suppose on a modern write-back D$ system consecutive plain writes 
to the same memory location will be coalesced.  But do all the systems do 
write allocation nowadays?  I think this approach asks for a prefetch hint 
at the start of the function.

 I have benchmarked this alternative:

/* Branchless decimal formatting of an 8-bit value into TP: each digit
   slot is stored unconditionally and the pointer advances only past
   the significant slots, so an insignificant leading digit is simply
   overwritten by the following store.  Returns one past the last
   digit written.  */
static inline char *
put_uint8 (uint8_t word, char *tp)
{
  unsigned int o, t, h;

  h = word;
  o = h % 10;   /* ones digit */
  h /= 10;
  t = h % 10;   /* tens digit */
  h /= 10;      /* hundreds digit */

  *tp = '0' + h;
  tp += !!h;
  *tp = '0' + t;
  /* The tens slot is significant if either the tens or the hundreds
     digit is nonzero (e.g. 105 has t == 0 but still needs three
     digits); "tp += !!t" alone would drop the middle digit.  */
  tp += !!t | !!h;
  *tp++ = '0' + o;

  return tp;
}

on my POWER9 system and it produces results similar to Paul's proposal 
(taking single samples only; there's some variation between runs and I 
didn't bother taking the average as that'd require some manual effort):

  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 5.36267e+08,
    "iterations": 3.6e+07,
    "reciprocal-throughput": 14.8943,
    "latency": 14.8984,
    "max-throughput": 6.714e+07,
    "min-throughput": 6.71215e+07
   }
  }

(Paul's) vs:

  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 5.38168e+08,
    "iterations": 3.6e+07,
    "reciprocal-throughput": 14.9995,
    "latency": 14.8988,
    "max-throughput": 6.66691e+07,
    "min-throughput": 6.71196e+07
   }
  }

(mine).  If your benchmarking turns out good with this proposal, then I 
can obtain a representative set of samples for POWER9.

 I can't benchmark Aarch64 or x86-64 easily, but I note that my proposal 
produces fewer multiplications for both plus lets x86-64 take advantage of 
the carry flag via SBB instead of using a discrete conditional-set+add 
sequence, so chances are it'll perform better.  It produces more compact 
code too, and I think it might be more readable for some though YMMV.

 Though the dependency on the quality of the optimiser seems very fragile 
here, e.g. if the locals are changed to a signed data type, then POWER9 
produces an extra instruction that causes performance to drop by ~2.5%, 
but x86-64 is able to convert another conditional-set+add to SBB, which 
for a change likely causes a performance gain.

 Also I think you need to fold the update quoted below into 1/3, so as to 
let the newly-added group of benchmarks be run on its own, or one gets an 
error otherwise:

$ make BENCHSET=bench-resolv bench
The following values in BENCHSET are invalid: bench-resolv
The valid ones are: bench-math bench-pthread bench-string calloc-simple calloc-tcache calloc-thread hash-benchset malloc-simple malloc-tcache malloc-thread math-benchset stdio-benchset stdio-common-benchset stdlib-benchset string-benchset wcsmbs-benchset
Makefile:485: *** Invalid BENCHSET value.  Stop.

  Maciej

---
 benchtests/Makefile |    1 +
 1 file changed, 1 insertion(+)

glibc-bench-resolv.diff
Index: glibc/benchtests/Makefile
===================================================================
--- glibc.orig/benchtests/Makefile
+++ glibc/benchtests/Makefile
@@ -462,6 +462,7 @@ ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := \
   bench-math \
   bench-pthread \
+  bench-resolv \
   bench-string \
   calloc-simple \
   calloc-tcache \
Maciej W. Rozycki June 6, 2025, 5:13 p.m. UTC | #12
On Fri, 6 Jun 2025, Maciej W. Rozycki wrote:

>   *tp = '0' + t;
>   tp += !!t;

 It has to be:

  tp += !!t | !!h;

obviously here.

>   "inet_ntop_ipv4": {
>    "workload-ipv4-random": {
>     "duration": 5.38168e+08,
>     "iterations": 3.6e+07,
>     "reciprocal-throughput": 14.9995,
>     "latency": 14.8988,
>     "max-throughput": 6.66691e+07,
>     "min-throughput": 6.71196e+07
>    }
>   }

 And it adds an instruction back, scoring:

  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 5.66505e+08,
    "iterations": 3.6e+07,
    "reciprocal-throughput": 15.7215,
    "latency": 15.751,
    "max-throughput": 6.3607e+07,
    "min-throughput": 6.34881e+07
   }
  }

so it may not be worth it on POWER9, but still seems to produce better 
code on Aarch64 and x86-64.

 Sorry about the confusion.

  Maciej
diff mbox series

Patch

diff --git a/resolv/inet_ntop.c b/resolv/inet_ntop.c
index acf5f3cb88..3f2a9ddd87 100644
--- a/resolv/inet_ntop.c
+++ b/resolv/inet_ntop.c
@@ -26,45 +26,44 @@ 
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
-
-#ifdef SPRINTF_CHAR
-# define SPRINTF(x) strlen(sprintf/**/x)
-#else
-# define SPRINTF(x) ((size_t)sprintf x)
-#endif
+#include <intprops.h>
+#include <_itoa.h>
 
 /*
  * WARNING: Don't even consider trying to compile this on a system where
  * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
  */
 
-static const char *inet_ntop4 (const u_char *src, char *dst, socklen_t size);
-static const char *inet_ntop6 (const u_char *src, char *dst, socklen_t size);
-
-/* char *
- * __inet_ntop(af, src, dst, size)
- *	convert a network format address to presentation format.
- * return:
- *	pointer to presentation format address (`dst'), or NULL (see errno).
- * author:
- *	Paul Vixie, 1996.
- */
-const char *
-__inet_ntop (int af, const void *src, char *dst, socklen_t size)
+static inline char *put_uint8 (uint8_t word, char *tp)
 {
-	switch (af) {
-	case AF_INET:
-		return (inet_ntop4(src, dst, size));
-	case AF_INET6:
-		return (inet_ntop6(src, dst, size));
-	default:
-		__set_errno (EAFNOSUPPORT);
-		return (NULL);
-	}
-	/* NOTREACHED */
+  intptr_t s = 1;
+  if (word >= 100)
+    {
+      tp[2] = _itoa_lower_digits[word % 10];
+      word /= 10;
+      s += 1;
+    }
+  if (word >= 10)
+    {
+      tp[1] = _itoa_lower_digits[word % 10];
+      word /= 10;
+      s += 1;
+    }
+  *tp = _itoa_lower_digits[word % 10];
+  return tp + s;
+}
+
+static inline char *put_uint16 (uint16_t word, char *tp)
+{
+  if (word >= 0x1000)
+    *tp++ = _itoa_lower_digits[(word >> 12) & 0xf];
+  if (word >= 0x100)
+    *tp++ = _itoa_lower_digits[(word >> 8) & 0xf];
+  if (word >= 0x10)
+    *tp++ = _itoa_lower_digits[(word >> 4) & 0xf];
+  *tp++ = _itoa_lower_digits[word & 0xf];
+  return tp;
 }
-libc_hidden_def (__inet_ntop)
-weak_alias (__inet_ntop, inet_ntop)
 
 /* const char *
  * inet_ntop4(src, dst, size)
@@ -74,20 +73,36 @@  weak_alias (__inet_ntop, inet_ntop)
  * notes:
  *	(1) uses no statics
  *	(2) takes a u_char* not an in_addr as input
- * author:
- *	Paul Vixie, 1996.
  */
-static const char *
+static __always_inline const char *
 inet_ntop4 (const u_char *src, char *dst, socklen_t size)
 {
-	static const char fmt[] = "%u.%u.%u.%u";
-	char tmp[sizeof "255.255.255.255"];
+  enum
+    {
+      oct_size = INT_BUFSIZE_BOUND (u_char),
+      tmp_size = 4 * oct_size + 3 /* '.' */ + 1 /* '\0' */
+    };
+  char tmp[tmp_size];
+  char *tmp_r = tmp;
 
-	if (SPRINTF((tmp, fmt, src[0], src[1], src[2], src[3])) >= size) {
-		__set_errno (ENOSPC);
-		return (NULL);
-	}
-	return strcpy(dst, tmp);
+  tmp_r = put_uint8 (src[0], tmp_r);
+  *(tmp_r++) = '.';
+  tmp_r = put_uint8 (src[1], tmp_r);
+  *(tmp_r++) = '.';
+  tmp_r = put_uint8 (src[2], tmp_r);
+  *(tmp_r++) = '.';
+  tmp_r = put_uint8 (src[3], tmp_r);
+  *tmp_r++ = '\0';
+
+  socklen_t tmp_s = tmp_r - tmp;
+  if (tmp_s > size)
+    {
+      __set_errno (ENOSPC);
+      return NULL;
+    }
+  memcpy (dst, tmp, tmp_s);
+
+  return dst;
 }
 
 /* const char *
@@ -96,7 +111,7 @@  inet_ntop4 (const u_char *src, char *dst, socklen_t size)
  * author:
  *	Paul Vixie, 1996.
  */
-static const char *
+static inline const char *
 inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 {
 	/*
@@ -108,7 +123,7 @@  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 	 */
 	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
 	struct { int base, len; } best, cur;
-	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
+	uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 };
 	int i;
 
 	/*
@@ -116,7 +131,6 @@  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 	 *	Copy the input (bytewise) array into a wordwise array.
 	 *	Find the longest run of 0x00's in src[] for :: shorthanding.
 	 */
-	memset(words, '\0', sizeof words);
 	for (i = 0; i < NS_IN6ADDRSZ; i += 2)
 		words[i / 2] = (src[i] << 8) | src[i + 1];
 	best.base = -1;
@@ -167,7 +181,7 @@  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 			tp += strlen(tp);
 			break;
 		}
-		tp += SPRINTF((tp, "%x", words[i]));
+		tp = put_uint16 (words[i], tp);
 	}
 	/* Was it a trailing run of 0x00's? */
 	if (best.base != -1 && (best.base + best.len) ==
@@ -178,9 +192,35 @@  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 	/*
 	 * Check for overflow, copy, and we're done.
 	 */
-	if ((socklen_t)(tp - tmp) > size) {
+	socklen_t tmp_s = tp - tmp;
+	if (tmp_s > size) {
 		__set_errno (ENOSPC);
 		return (NULL);
 	}
-	return strcpy(dst, tmp);
+	return memcpy(dst, tmp, tmp_s);
 }
+
+/* char *
+ * __inet_ntop(af, src, dst, size)
+ *	convert a network format address to presentation format.
+ * return:
+ *	pointer to presentation format address (`dst'), or NULL (see errno).
+ * author:
+ *	Paul Vixie, 1996.
+ */
+const char *
+__inet_ntop (int af, const void *src, char *dst, socklen_t size)
+{
+	switch (af) {
+	case AF_INET:
+		return (inet_ntop4(src, dst, size));
+	case AF_INET6:
+		return (inet_ntop6(src, dst, size));
+	default:
+		__set_errno (EAFNOSUPPORT);
+		return (NULL);
+	}
+	/* NOTREACHED */
+}
+libc_hidden_def (__inet_ntop)
+weak_alias (__inet_ntop, inet_ntop)