Message ID | 20250603195332.2822499-4-adhemerval.zanella@linaro.org |
---|---|
State | New |
Headers | show |
Series | Optimize inet_ntop | expand |
On 6/3/25 12:51, Adhemerval Zanella wrote: > static inline char *put_uint8 (uint8_t word, char *tp) Put the function name at the start of a line. > + intptr_t s = 1; There should be no need for intptr_t; 'int' is good enough and clearer and should generate the same machine code. > + tp[2] = _itoa_lower_digits[word % 10]; Don't consult an array; just add '0'. > + if (word >= 100) > ... > + if (word >= 10) Since this is for performance, shouldn't the code avoid conditional branches? Something like the following, say: static inline char * put_uint8 (uint8_t word, char *tp) { *tp = '0' + word / 100; tp += 100 <= word; *tp = '0' + word / 10 % 10; tp += 10 <= word; *tp++ = '0' + word % 10; return tp; } Alternatively, if we're going to have conditional branches, there is no need to test (word >= 100) if (word >= 10) is false.
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes: > The performance results on aarch64 Neoverse1 with gcc 14.2.1: > > * master > > aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4 > "inet_ntop_ipv4": { > "workload-ipv4-random": { > "duration": 1.43067e+09, > "iterations": 8e+06, > "reciprocal-throughput": 178.572, > "latency": 179.096, > "max-throughput": 5.59997e+06, > "min-throughput": 5.58359e+06 > } > aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6 > "inet_ntop_ipv6": { > "workload-ipv6-random": { > "duration": 1.68539e+09, > "iterations": 4e+06, > "reciprocal-throughput": 421.307, > "latency": 421.388, > "max-throughput": 2.37357e+06, > "min-throughput": 2.37311e+06 > } > } > > * patched > > aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4 > "inet_ntop_ipv4": { > "workload-ipv4-random": { > "duration": 1.06324e+09, > "iterations": 4.4e+07, > "reciprocal-throughput": 24.1509, > "latency": 24.178, > "max-throughput": 4.14063e+07, > "min-throughput": 4.13599e+07 > } > aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6 > "inet_ntop_ipv6": { > "workload-ipv6-random": { > "duration": 1.08476e+09, > "iterations": 2.4e+07, > "reciprocal-throughput": 45.2794, > "latency": 45.1174, > "max-throughput": 2.20851e+07, > "min-throughput": 2.21644e+07 > } > } > > Checked on aarch64-linux-gnu and x86_64-linux-gnu. > --- > resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++---------------- > 1 file changed, 87 insertions(+), 47 deletions(-) This part of the patch series looks good to me. 
So: Reviewed-by: Collin Funk <collin.funk1@gmail.com> I see similar results as well: $ grep '^model name' /proc/cpuinfo | sed 1q model name : AMD Ryzen 7 3700X 8-Core Processor $ gcc --version | sed 1q gcc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) # Pre patch $ cat benchtests/bench.out {"timing_type": "hp_timing", "functions": { "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 3.94723e+09, "iterations": 8e+06, "reciprocal-throughput": 492.274, "latency": 494.534, "max-throughput": 2.03139e+06, "min-throughput": 2.0221e+06 } } , "inet_ntop_ipv6": { "workload-ipv6-random": { "duration": 4.67659e+09, "iterations": 4e+06, "reciprocal-throughput": 1164.13, "latency": 1174.17, "max-throughput": 859010, "min-throughput": 851668 } } } } # Post patch $ cat benchtests/bench.out {"timing_type": "hp_timing", "functions": { "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 3.67214e+09, "iterations": 8.4e+07, "reciprocal-throughput": 43.7862, "latency": 43.6457, "max-throughput": 2.28382e+07, "min-throughput": 2.29118e+07 } } , "inet_ntop_ipv6": { "workload-ipv6-random": { "duration": 3.9483e+09, "iterations": 3.6e+07, "reciprocal-throughput": 109.916, "latency": 109.434, "max-throughput": 9.09786e+06, "min-throughput": 9.13793e+06 } } } } Thanks, Collin
* Adhemerval Zanella: > -static const char * > +static inline const char * > inet_ntop6 (const u_char *src, char *dst, socklen_t size) > { > /* > @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) > */ > char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp; > struct { int base, len; } best, cur; > - u_int words[NS_IN6ADDRSZ / NS_INT16SZ]; > + uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 }; > int i; > > /* > @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) > * Copy the input (bytewise) array into a wordwise array. > * Find the longest run of 0x00's in src[] for :: shorthanding. > */ > - memset(words, '\0', sizeof words); > for (i = 0; i < NS_IN6ADDRSZ; i += 2) > words[i / 2] = (src[i] << 8) | src[i + 1]; The words copy shouldn't really be needed, GCC can usually combine two byte accesses into single 16-bit loads. It should be possible to optimize put_uint16 using SWAR techniques. We don't need to do byte-wise output, either. We can use full 32-bit writes, potentially with overlaps. If the user-supplied output buffer has the maximum possible size, we don't need the temporary copy. (This applies to the IPv4 variant, too.) Thanks, Florian
On 03/06/25 19:27, Paul Eggert wrote: > On 6/3/25 12:51, Adhemerval Zanella wrote: > > >> static inline char *put_uint8 (uint8_t word, char *tp) > > Put the function name at the start of a line. Ack. > >> + intptr_t s = 1; > > There should be no need for intptr_t; 'int' is good enough and clearer and should generate the same machine code. Ack. > >> + tp[2] = _itoa_lower_digits[word % 10]; > > Don't consult an array; just add '0'. > >> + if (word >= 100) >> ... >> + if (word >= 10) > > Since this is for performance, shouldn't the code avoid conditional branches? Something like the following, say: > > static inline char * > put_uint8 (uint8_t word, char *tp) > { > *tp = '0' + word / 100; > tp += 100 <= word; > *tp = '0' + word / 10 % 10; > tp += 10 <= word; > *tp++ = '0' + word % 10; > return tp; > } > I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64 (Neoverse N1), but slower on x86_64 (Zen3): * patch x86_64-linux-gnu $ ./benchtests/bench-inet_ntop_ipv4 "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 3.74298e+09, "iterations": 1.12e+08, "reciprocal-throughput": 33.4216, "latency": 33.4173, "max-throughput": 2.99208e+07, "min-throughput": 2.99246e+07 } } * suggestion x86_64-linux-gnu $ ./benchtests/bench-inet_ntop_ipv4 "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 3.86481e+09, "iterations": 8.8e+07, "reciprocal-throughput": 43.8419, "latency": 43.9948, "max-throughput": 2.28092e+07, "min-throughput": 2.27299e+07 } } * patch aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4 "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 1.07013e+09, "iterations": 4.4e+07, "reciprocal-throughput": 24.5301, "latency": 24.1121, "max-throughput": 4.07662e+07, "min-throughput": 4.14729e+07 } } * suggestion aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4 "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 1.05304e+09, "iterations": 5.6e+07, "reciprocal-throughput": 18.8547, "latency": 18.7539, "max-throughput": 
5.30373e+07, "min-throughput": 5.33221e+07 } } > Alternatively, if we're going to have conditional branches, there is no need to test (word >= 100) if (word >= 10) is false. It indeed seems to help some chips (aarch64 Neoverse N1): static inline char *put_uint8 (uint8_t word, char *tp) { int s = 1; if (word >= 10) { if (word >= 100) { tp[2] = '0' + word % 10; word /= 10; s += 1; } tp[1] = '0' + word % 10; word /= 10; s += 1; } *tp = '0' + word % 10; return tp + s; } aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4 "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 1.07388e+09, "iterations": 5.2e+07, "reciprocal-throughput": 20.3728, "latency": 20.9305, "max-throughput": 4.90851e+07, "min-throughput": 4.77772e+07 } } I think I will go with this version since it seems to be a better compromise. I might check on some other chips I have access to.
On 03/06/25 19:32, Collin Funk wrote: > Adhemerval Zanella <adhemerval.zanella@linaro.org> writes: > >> The performance results on aarch64 Neoverse1 with gcc 14.2.1: >> >> * master >> >> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4 >> "inet_ntop_ipv4": { >> "workload-ipv4-random": { >> "duration": 1.43067e+09, >> "iterations": 8e+06, >> "reciprocal-throughput": 178.572, >> "latency": 179.096, >> "max-throughput": 5.59997e+06, >> "min-throughput": 5.58359e+06 >> } >> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6 >> "inet_ntop_ipv6": { >> "workload-ipv6-random": { >> "duration": 1.68539e+09, >> "iterations": 4e+06, >> "reciprocal-throughput": 421.307, >> "latency": 421.388, >> "max-throughput": 2.37357e+06, >> "min-throughput": 2.37311e+06 >> } >> } >> >> * patched >> >> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4 >> "inet_ntop_ipv4": { >> "workload-ipv4-random": { >> "duration": 1.06324e+09, >> "iterations": 4.4e+07, >> "reciprocal-throughput": 24.1509, >> "latency": 24.178, >> "max-throughput": 4.14063e+07, >> "min-throughput": 4.13599e+07 >> } >> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6 >> "inet_ntop_ipv6": { >> "workload-ipv6-random": { >> "duration": 1.08476e+09, >> "iterations": 2.4e+07, >> "reciprocal-throughput": 45.2794, >> "latency": 45.1174, >> "max-throughput": 2.20851e+07, >> "min-throughput": 2.21644e+07 >> } >> } >> >> Checked on aarch64-linux-gnu and x86_64-linux-gnu. >> --- >> resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++---------------- >> 1 file changed, 87 insertions(+), 47 deletions(-) > > This part of the patch series looks good to me. So: > > Reviewed-by: Collin Funk <collin.funk1@gmail.com> Thanks. 
> > I see similar results as well: > > $ grep '^model name' /proc/cpuinfo | sed 1q > model name : AMD Ryzen 7 3700X 8-Core Processor > $ gcc --version | sed 1q > gcc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) > # Pre patch > $ cat benchtests/bench.out > {"timing_type": "hp_timing", > "functions": { > "inet_ntop_ipv4": { > "workload-ipv4-random": { > "duration": 3.94723e+09, > "iterations": 8e+06, > "reciprocal-throughput": 492.274, > "latency": 494.534, > "max-throughput": 2.03139e+06, > "min-throughput": 2.0221e+06 > } > } > , > "inet_ntop_ipv6": { > "workload-ipv6-random": { > "duration": 4.67659e+09, > "iterations": 4e+06, > "reciprocal-throughput": 1164.13, > "latency": 1174.17, > "max-throughput": 859010, > "min-throughput": 851668 > } > } > > } > } > # Post patch > $ cat benchtests/bench.out > {"timing_type": "hp_timing", > "functions": { > "inet_ntop_ipv4": { > "workload-ipv4-random": { > "duration": 3.67214e+09, > "iterations": 8.4e+07, > "reciprocal-throughput": 43.7862, > "latency": 43.6457, > "max-throughput": 2.28382e+07, > "min-throughput": 2.29118e+07 > } > } > , > "inet_ntop_ipv6": { > "workload-ipv6-random": { > "duration": 3.9483e+09, > "iterations": 3.6e+07, > "reciprocal-throughput": 109.916, > "latency": 109.434, > "max-throughput": 9.09786e+06, > "min-throughput": 9.13793e+06 > } > } > > } > } > > Thanks, > Collin
On 04/06/25 05:09, Florian Weimer wrote: > * Adhemerval Zanella: > >> -static const char * >> +static inline const char * >> inet_ntop6 (const u_char *src, char *dst, socklen_t size) >> { >> /* >> @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) >> */ >> char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp; >> struct { int base, len; } best, cur; >> - u_int words[NS_IN6ADDRSZ / NS_INT16SZ]; >> + uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 }; >> int i; >> >> /* >> @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) >> * Copy the input (bytewise) array into a wordwise array. >> * Find the longest run of 0x00's in src[] for :: shorthanding. >> */ >> - memset(words, '\0', sizeof words); >> for (i = 0; i < NS_IN6ADDRSZ; i += 2) >> words[i / 2] = (src[i] << 8) | src[i + 1]; > > The words copy shouldn't really be needed, GCC can usually combine two > byte accesses into single 16-bit loads. There is no need for the words array in fact, we can ntohs when accessing the s6_addr16 for source (and on most chips it should be lowered efficiently). > > It should be possible to optimize put_uint16 using SWAR techniques. We > don't need to do byte-wise output, either. We can use full 32-bit > writes, potentially with overlaps. Not sure about that because the usual way is to present ipv6 in quartet and using 32-bit would require to decompose it anyway. Also Paul's suggestion to do overlap writes showed mixed results on different chips. > > If the user-supplied output buffer has the maximum possible size, we > don't need the temporary copy. (This applies to the IPv4 variant, too.) It shows a small performance increase, I have added this change.
* Adhemerval Zanella Netto: >> The words copy shouldn't really be needed, GCC can usually combine two >> byte accesses into single 16-bit loads. > > There is no need for the words array in fact, we can ntohs when > accessing the s6_addr16 for source (and on most chips it should be > lowered efficiently). I believe this introduces an alignment requirement that the inet_ntop specification does not actually provide. Thanks, Florian
On 04/06/25 14:08, Florian Weimer wrote: > * Adhemerval Zanella Netto: > >>> The words copy shouldn't really be needed, GCC can usually combine two >>> byte accesses into single 16-bit loads. >> >> There is no need for the words array in fact, we can ntohs when >> accessing the s6_addr16 for source (and on most chips it should be >> lowered efficiently). > > I believe this introduces an alignment requirement that the inet_ntop > specification does not actually provide. Sure, and we can just use a wrapper to handle this correctly: static inline uint16_t in6_addr_addr16 (const struct in6_addr *src, int idx) { const struct { uint16_t x; } __attribute__((__packed__)) *pptr = (typeof(pptr))(&src->s6_addr16[idx]); return ntohs (pptr->x); } It uses byte-load on architectures that do not allow unaligned access (like sparc). It also generates slightly better code than the usual temporary plus memcpy we do for other uses on sparc (it does not spill store-byte for the temporary).
Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes: >> This part of the patch series looks good to me. So: >> >> Reviewed-by: Collin Funk <collin.funk1@gmail.com> > > Thanks. NP. How about I write a patch to have 'inet_ntoa' use 'inet_ntop (..., AF_INET)' once all the issues others have noticed are addressed? That function could benefit from the same optimization. Collin
On 04/06/25 14:36, Collin Funk wrote: > Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes: > >>> This part of the patch series looks good to me. So: >>> >>> Reviewed-by: Collin Funk <collin.funk1@gmail.com> >> >> Thanks. > > NP. How about I write a patch to have 'inet_ntoa' use > 'inet_ntop (..., AF_INET)' once all the issues others have noticed are > addressed? > > That function could benefit from the same optimization. Sure, if you don't mind I can also add on the v2.
On Wed, 4 Jun 2025, Adhemerval Zanella Netto wrote: > >> + if (word >= 100) > >> ... > >> + if (word >= 10) > > > > Since this is for performance, shouldn't the code avoid conditional branches? Something like the following, say: > > > > static inline char * > > put_uint8 (uint8_t word, char *tp) > > { > > *tp = '0' + word / 100; > > tp += 100 <= word; > > *tp = '0' + word / 10 % 10; > > tp += 10 <= word; > > *tp++ = '0' + word % 10; > > return tp; > > } > > > > I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64 (Neoverse N1), > but slower on x86_64 (Zen3): How many samples did you collect? Hmm, I suppose on a modern write-back D$ system consecutive plain writes to the same memory location will be coalesced. But do all the systems do write allocation nowadays? I think this approach asks for a prefetch hint at the start of the function. I have benchmarked this alternative: static inline char * put_uint8 (uint8_t word, char *tp) { unsigned int o, t, h; h = word; o = h % 10; h /= 10; t = h % 10; h /= 10; *tp = '0' + h; tp += !!h; *tp = '0' + t; tp += !!t; *tp++ = '0' + o; return tp; } on my POWER9 system and it produces results similar to Paul's proposal (taking single samples only; there's some variation between runs and I didn't bother taking the average as that'd require some manual effort): "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 5.36267e+08, "iterations": 3.6e+07, "reciprocal-throughput": 14.8943, "latency": 14.8984, "max-throughput": 6.714e+07, "min-throughput": 6.71215e+07 } } (Paul's) vs: "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 5.38168e+08, "iterations": 3.6e+07, "reciprocal-throughput": 14.9995, "latency": 14.8988, "max-throughput": 6.66691e+07, "min-throughput": 6.71196e+07 } } (mine). If your benchmarking turns out good with this proposal, then I can obtain a representative set of samples for POWER9. 
I can't benchmark Aarch64 or x86-64 easily, but I note that my proposal produces fewer multiplications for both plus lets x86-64 take advantage of the carry flag via SBB instead of using a discrete conditional-set+add sequence, so chances are it'll perform better. It produces more compact code too, and I think it might be more readable for some though YMMV. Though the dependency on the quality of the optimiser seems very fragile here, e.g. if the locals are changed to a signed data type, then POWER9 produces an extra instruction that causes performance to drop by ~2.5%, but x86-64 is able to convert another conditional-set+add to SBB, which for a change likely causes a performance gain. Also I think you need to fold the update quoted below into 1/3, so as to let the newly-added group of benchmarks be run on its own, or one gets an error otherwise: $ make BENCHSET=bench-resolv bench The following values in BENCHSET are invalid: bench-resolv The valid ones are: bench-math bench-pthread bench-string calloc-simple calloc-tcache calloc-thread hash-benchset malloc-simple malloc-tcache malloc-thread math-benchset stdio-benchset stdio-common-benchset stdlib-benchset string-benchset wcsmbs-benchset Makefile:485: *** Invalid BENCHSET value. Stop. Maciej --- benchtests/Makefile | 1 + 1 file changed, 1 insertion(+) glibc-bench-resolv.diff Index: glibc/benchtests/Makefile =================================================================== --- glibc.orig/benchtests/Makefile +++ glibc/benchtests/Makefile @@ -462,6 +462,7 @@ ifneq ($(strip ${BENCHSET}),) VALIDBENCHSETNAMES := \ bench-math \ bench-pthread \ + bench-resolv \ bench-string \ calloc-simple \ calloc-tcache \
On Fri, 6 Jun 2025, Maciej W. Rozycki wrote: > *tp = '0' + t; > tp += !!t; It has to be: tp += !!t | !!h; obviously here. > "inet_ntop_ipv4": { > "workload-ipv4-random": { > "duration": 5.38168e+08, > "iterations": 3.6e+07, > "reciprocal-throughput": 14.9995, > "latency": 14.8988, > "max-throughput": 6.66691e+07, > "min-throughput": 6.71196e+07 > } > } And it adds an instruction back, scoring: "inet_ntop_ipv4": { "workload-ipv4-random": { "duration": 5.66505e+08, "iterations": 3.6e+07, "reciprocal-throughput": 15.7215, "latency": 15.751, "max-throughput": 6.3607e+07, "min-throughput": 6.34881e+07 } } so it may not be worth it on POWER9, but still seems to produce better code on Aarch64 and x86-64. Sorry about the confusion. Maciej
diff --git a/resolv/inet_ntop.c b/resolv/inet_ntop.c index acf5f3cb88..3f2a9ddd87 100644 --- a/resolv/inet_ntop.c +++ b/resolv/inet_ntop.c @@ -26,45 +26,44 @@ #include <errno.h> #include <stdio.h> #include <string.h> - -#ifdef SPRINTF_CHAR -# define SPRINTF(x) strlen(sprintf/**/x) -#else -# define SPRINTF(x) ((size_t)sprintf x) -#endif +#include <intprops.h> +#include <_itoa.h> /* * WARNING: Don't even consider trying to compile this on a system where * sizeof(int) < 4. sizeof(int) > 4 is fine; all the world's not a VAX. */ -static const char *inet_ntop4 (const u_char *src, char *dst, socklen_t size); -static const char *inet_ntop6 (const u_char *src, char *dst, socklen_t size); - -/* char * - * __inet_ntop(af, src, dst, size) - * convert a network format address to presentation format. - * return: - * pointer to presentation format address (`dst'), or NULL (see errno). - * author: - * Paul Vixie, 1996. - */ -const char * -__inet_ntop (int af, const void *src, char *dst, socklen_t size) +static inline char *put_uint8 (uint8_t word, char *tp) { - switch (af) { - case AF_INET: - return (inet_ntop4(src, dst, size)); - case AF_INET6: - return (inet_ntop6(src, dst, size)); - default: - __set_errno (EAFNOSUPPORT); - return (NULL); - } - /* NOTREACHED */ + intptr_t s = 1; + if (word >= 100) + { + tp[2] = _itoa_lower_digits[word % 10]; + word /= 10; + s += 1; + } + if (word >= 10) + { + tp[1] = _itoa_lower_digits[word % 10]; + word /= 10; + s += 1; + } + *tp = _itoa_lower_digits[word % 10]; + return tp + s; +} + +static inline char *put_uint16 (uint16_t word, char *tp) +{ + if (word >= 0x1000) + *tp++ = _itoa_lower_digits[(word >> 12) & 0xf]; + if (word >= 0x100) + *tp++ = _itoa_lower_digits[(word >> 8) & 0xf]; + if (word >= 0x10) + *tp++ = _itoa_lower_digits[(word >> 4) & 0xf]; + *tp++ = _itoa_lower_digits[word & 0xf]; + return tp; } -libc_hidden_def (__inet_ntop) -weak_alias (__inet_ntop, inet_ntop) /* const char * * inet_ntop4(src, dst, size) @@ -74,20 +73,36 @@ 
weak_alias (__inet_ntop, inet_ntop) * notes: * (1) uses no statics * (2) takes a u_char* not an in_addr as input - * author: - * Paul Vixie, 1996. */ -static const char * +static __always_inline const char * inet_ntop4 (const u_char *src, char *dst, socklen_t size) { - static const char fmt[] = "%u.%u.%u.%u"; - char tmp[sizeof "255.255.255.255"]; + enum + { + oct_size = INT_BUFSIZE_BOUND (u_char), + tmp_size = 4 * oct_size + 3 /* '.' */ + 1 /* '\0' */ + }; + char tmp[tmp_size]; + char *tmp_r = tmp; - if (SPRINTF((tmp, fmt, src[0], src[1], src[2], src[3])) >= size) { - __set_errno (ENOSPC); - return (NULL); - } - return strcpy(dst, tmp); + tmp_r = put_uint8 (src[0], tmp_r); + *(tmp_r++) = '.'; + tmp_r = put_uint8 (src[1], tmp_r); + *(tmp_r++) = '.'; + tmp_r = put_uint8 (src[2], tmp_r); + *(tmp_r++) = '.'; + tmp_r = put_uint8 (src[3], tmp_r); + *tmp_r++ = '\0'; + + socklen_t tmp_s = tmp_r - tmp; + if (tmp_s > size) + { + __set_errno (ENOSPC); + return NULL; + } + memcpy (dst, tmp, tmp_s); + + return dst; } /* const char * @@ -96,7 +111,7 @@ inet_ntop4 (const u_char *src, char *dst, socklen_t size) * author: * Paul Vixie, 1996. */ -static const char * +static inline const char * inet_ntop6 (const u_char *src, char *dst, socklen_t size) { /* @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) */ char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp; struct { int base, len; } best, cur; - u_int words[NS_IN6ADDRSZ / NS_INT16SZ]; + uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 }; int i; /* @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) * Copy the input (bytewise) array into a wordwise array. * Find the longest run of 0x00's in src[] for :: shorthanding. 
*/ - memset(words, '\0', sizeof words); for (i = 0; i < NS_IN6ADDRSZ; i += 2) words[i / 2] = (src[i] << 8) | src[i + 1]; best.base = -1; @@ -167,7 +181,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) tp += strlen(tp); break; } - tp += SPRINTF((tp, "%x", words[i])); + tp = put_uint16 (words[i], tp); } /* Was it a trailing run of 0x00's? */ if (best.base != -1 && (best.base + best.len) == @@ -178,9 +192,35 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size) /* * Check for overflow, copy, and we're done. */ - if ((socklen_t)(tp - tmp) > size) { + socklen_t tmp_s = tp - tmp; + if (tmp_s > size) { __set_errno (ENOSPC); return (NULL); } - return strcpy(dst, tmp); + return memcpy(dst, tmp, tmp_s); } + +/* char * + * __inet_ntop(af, src, dst, size) + * convert a network format address to presentation format. + * return: + * pointer to presentation format address (`dst'), or NULL (see errno). + * author: + * Paul Vixie, 1996. + */ +const char * +__inet_ntop (int af, const void *src, char *dst, socklen_t size) +{ + switch (af) { + case AF_INET: + return (inet_ntop4(src, dst, size)); + case AF_INET6: + return (inet_ntop6(src, dst, size)); + default: + __set_errno (EAFNOSUPPORT); + return (NULL); + } + /* NOTREACHED */ +} +libc_hidden_def (__inet_ntop) +weak_alias (__inet_ntop, inet_ntop)