Message ID | 20250422153602.54787-10-chia-yu.chang@nokia-bell-labs.com |
---|---|
State | New |
Headers | show |
Series | AccECN protocol patch series | expand |
On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote: > @@ -302,10 +303,13 @@ struct tcp_sock { > u32 snd_up; /* Urgent pointer */ > u32 delivered; /* Total data packets delivered incl. rexmits */ > u32 delivered_ce; /* Like the above but only ECE marked packets */ > + u32 delivered_ecn_bytes[3]; This new fields do not belong to this cacheline group. I'm unsure they belong to fast-path at all. Also u32 will wrap-around very soon. [...] > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > index dc8fdc80e16b..74ac8a5d2e00 100644 > --- a/include/uapi/linux/tcp.h > +++ b/include/uapi/linux/tcp.h > @@ -298,6 +298,13 @@ struct tcp_info { > __u32 tcpi_snd_wnd; /* peer's advertised receive window after > * scaling (bytes) > */ > + __u32 tcpi_received_ce; /* # of CE marks received */ > + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ > + __u32 tcpi_delivered_e0_bytes; > + __u32 tcpi_delivered_ce_bytes; > + __u32 tcpi_received_e1_bytes; > + __u32 tcpi_received_e0_bytes; > + __u32 tcpi_received_ce_bytes; This will break uAPI: new fields must be addded at the end, or must fill existing holes. Also u32 set in stone in uAPI for a byte counter looks way too small. > @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) > /* 32bit arches with 8byte alignment on u64 fields might need padding > * before tcp_clock_cache. > */ > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7); > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 122 + 6); The above means an additional cacheline in fast-path WRT the current status. IMHO should be avoided. 
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index 5bd7fc9bcf66..41e45b9aff3f 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -70,6 +70,7 @@ > #include <linux/sysctl.h> > #include <linux/kernel.h> > #include <linux/prefetch.h> > +#include <linux/bitops.h> > #include <net/dst.h> > #include <net/tcp.h> > #include <net/proto_memory.h> > @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr > return false; > } > > +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given > + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). > + */ > +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) > +{ > + switch (ecnfield) { > + case INET_ECN_NOT_ECT: > + return 0; /* AccECN does not send counts of NOT_ECT */ > + case INET_ECN_ECT_1: > + return 1; > + case INET_ECN_CE: > + return 2; > + case INET_ECN_ECT_0: > + return 3; > + default: > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); No WARN_ONCE() above please: either the 'ecnfield' data is masked vs INET_ECN_MASK and the WARN_ONCE should not be possible or a remote sender can deterministically trigger a WARN() which nowadays will in turn raise a CVE... [...] > +static u32 tcp_accecn_field_init_offset(u8 ecnfield) > +{ > + switch (ecnfield) { > + case INET_ECN_NOT_ECT: > + return 0; /* AccECN does not send counts of NOT_ECT */ > + case INET_ECN_ECT_1: > + return TCP_ACCECN_E1B_INIT_OFFSET; > + case INET_ECN_CE: > + return TCP_ACCECN_CEB_INIT_OFFSET; > + case INET_ECN_ECT_0: > + return TCP_ACCECN_E0B_INIT_OFFSET; > + default: > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); Same as above. > + } > + return 0; > +} > + > +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ > +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield, > + bool order) > +{ > + u8 tmp; > + > + optfield = order ? 
2 - optfield : optfield; > + tmp = optfield + 2; > + > + return (tmp + (tmp >> 2)) & INET_ECN_MASK; > +} > + > +/* Handles AccECN option ECT and CE 24-bit byte counters update into > + * the u32 value in tcp_sock. As we're processing TCP options, it is > + * safe to access from - 1. > + */ > +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset) > +{ > + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & > + 0xFFFFFFU; > + u32 delta = (truncated - *cnt) & 0xFFFFFFU; > + > + /* If delta has the highest bit set (24th bit) indicating > + * negative, sign extend to correct an estimation using > + * sign_extend32(delta, 24 - 1) > + */ > + delta = sign_extend32(delta, 23); > + *cnt += delta; > + return (s32)delta; > +} > + > +/* Returns true if the byte counters can be used */ > +static bool tcp_accecn_process_option(struct tcp_sock *tp, > + const struct sk_buff *skb, > + u32 delivered_bytes, int flag) > +{ > + u8 estimate_ecnfield = tp->est_ecnfield; > + bool ambiguous_ecn_bytes_incr = false; > + bool first_changed = false; > + unsigned int optlen; > + unsigned char *ptr; > + bool order1, res; > + unsigned int i; > + > + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { > + if (estimate_ecnfield) { > + u8 ecnfield = estimate_ecnfield - 1; > + > + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; > + return true; > + } > + return false; > + } > + > + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; > + optlen = ptr[1] - 2; This assumes optlen is greater then 2, but I don't see the relevant check. Are tcp options present at all? > + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1); Please, don't warn for arbitrary wrong data sent from the peer. 
> + order1 = (ptr[0] == TCPOPT_ACCECN1); > + ptr += 2; > + > + res = !!estimate_ecnfield; > + for (i = 0; i < 3; i++) { > + if (optlen >= TCPOLEN_ACCECN_PERFIELD) { > + u32 init_offset; > + u8 ecnfield; > + s32 delta; > + u32 *cnt; > + > + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); > + init_offset = tcp_accecn_field_init_offset(ecnfield); > + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; > + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); > + if (delta) { > + if (delta < 0) { > + res = false; > + ambiguous_ecn_bytes_incr = true; > + } > + if (ecnfield != estimate_ecnfield) { > + if (!first_changed) { > + tp->est_ecnfield = ecnfield; > + first_changed = true; > + } else { > + res = false; > + ambiguous_ecn_bytes_incr = true; > + } At least 2 indentation levels above the maximum readable. [...] > @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, > > ptr = (const unsigned char *)(th + 1); > opt_rx->saw_tstamp = 0; > + opt_rx->accecn = 0; > opt_rx->saw_unknown = 0; It would be good to be able to zero both 'accecn' and 'saw_unknown' with a single statement. [...] 
> @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, > *ptr++ = htonl(opts->tsecr); > } > > + if (OPTION_ACCECN & options) { > + const u8 ect0_idx = INET_ECN_ECT_0 - 1; > + const u8 ect1_idx = INET_ECN_ECT_1 - 1; > + const u8 ce_idx = INET_ECN_CE - 1; > + u32 e0b; > + u32 e1b; > + u32 ceb; > + u8 len; > + > + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; > + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; > + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; > + len = TCPOLEN_ACCECN_BASE + > + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; > + > + if (opts->num_accecn_fields == 2) { > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > + ((e1b >> 8) & 0xffff)); > + *ptr++ = htonl(((e1b & 0xff) << 24) | > + (ceb & 0xffffff)); > + } else if (opts->num_accecn_fields == 1) { > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > + ((e1b >> 8) & 0xffff)); > + leftover_bytes = ((e1b & 0xff) << 8) | > + TCPOPT_NOP; > + leftover_size = 1; > + } else if (opts->num_accecn_fields == 0) { > + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len; > + leftover_size = 2; > + } else if (opts->num_accecn_fields == 3) { > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > + ((e1b >> 8) & 0xffff)); > + *ptr++ = htonl(((e1b & 0xff) << 24) | > + (ceb & 0xffffff)); > + *ptr++ = htonl(((e0b & 0xffffff) << 8) | > + TCPOPT_NOP); The above chunck and the contents of patch 7 must be in the same patch. This split makes the review even harder. [...] > @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb > opts->num_sack_blocks = 0; > } > > + if (tcp_ecn_mode_accecn(tp) && > + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { > + int saving = opts->num_sack_blocks > 0 ? 2 : 0; > + int remaining = MAX_TCP_OPTION_SPACE - size; AFACS the above means tcp_options_fit_accecn() must clear any already set options, but apparently it does not do so. 
Have you tested with something adding largish options like mptcp? /P
> -----Original Message----- > From: Paolo Abeni <pabeni@redhat.com> > Sent: Tuesday, April 29, 2025 1:56 PM > To: Chia-Yu Chang (Nokia) <chia-yu.chang@nokia-bell-labs.com>; horms@kernel.org; dsahern@kernel.org; kuniyu@amazon.com; bpf@vger.kernel.org; netdev@vger.kernel.org; dave.taht@gmail.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; xiyou.wangcong@gmail.com; jiri@resnulli.us; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; ast@fiberby.net; liuhangbin@gmail.com; shuah@kernel.org; linux-kselftest@vger.kernel.org; ij@kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) <koen.de_schepper@nokia-bell-labs.com>; g.white <g.white@cablelabs.com>; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel <vidhi_goel@apple.com> > Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option > > > CAUTION: This is an external email. Please be very careful when clicking links or opening attachments. See the URL nok.it/ext for additional information. > > > > On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote: > > @@ -302,10 +303,13 @@ struct tcp_sock { > > u32 snd_up; /* Urgent pointer */ > > u32 delivered; /* Total data packets delivered incl. rexmits */ > > u32 delivered_ce; /* Like the above but only ECE marked packets */ > > + u32 delivered_ecn_bytes[3]; > > This new fields do not belong to this cacheline group. I'm unsure they belong to fast-path at all. Also u32 will wrap-around very soon. Hi Paolo, Thanks for the feedback. Could you help to advise then which cacheline group it belongs to? If there are some tools that can be shared, it will be appreciated. > > [...] 
> > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index > > dc8fdc80e16b..74ac8a5d2e00 100644 > > --- a/include/uapi/linux/tcp.h > > +++ b/include/uapi/linux/tcp.h > > @@ -298,6 +298,13 @@ struct tcp_info { > > __u32 tcpi_snd_wnd; /* peer's advertised receive window after > > * scaling (bytes) > > */ > > + __u32 tcpi_received_ce; /* # of CE marks received */ > > + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ > > + __u32 tcpi_delivered_e0_bytes; > > + __u32 tcpi_delivered_ce_bytes; > > + __u32 tcpi_received_e1_bytes; > > + __u32 tcpi_received_e0_bytes; > > + __u32 tcpi_received_ce_bytes; > > This will break uAPI: new fields must be addded at the end, or must fill existing holes. Also u32 set in stone in uAPI for a byte counter looks way too small. I will move at the end or fill existing holes using pahole. Indeed u32 is not big, but based on the algorithms in A.2.1 and A.1. of AccECN draft, the byte counter greater than 24b shall be fine. And this is also verified using TCP Prague. > > > @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) > > /* 32bit arches with 8byte alignment on u64 fields might need padding > > * before tcp_clock_cache. > > */ > > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7); > > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, > > + tcp_sock_write_txrx, 122 + 6); > > The above means an additional cacheline in fast-path WRT the current status. IMHO should be avoided. OK, I did this to avoid the line width warning of patchcheck, but will change it back. 
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index > > 5bd7fc9bcf66..41e45b9aff3f 100644 > > --- a/net/ipv4/tcp_input.c > > +++ b/net/ipv4/tcp_input.c > > @@ -70,6 +70,7 @@ > > #include <linux/sysctl.h> > > #include <linux/kernel.h> > > #include <linux/prefetch.h> > > +#include <linux/bitops.h> > > #include <net/dst.h> > > #include <net/tcp.h> > > #include <net/proto_memory.h> > > @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr > > return false; > > } > > > > +/* Maps IP ECN field ECT/CE code point to AccECN option field number, > > +given > > + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). > > + */ > > +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) { > > + switch (ecnfield) { > > + case INET_ECN_NOT_ECT: > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > + case INET_ECN_ECT_1: > > + return 1; > > + case INET_ECN_CE: > > + return 2; > > + case INET_ECN_ECT_0: > > + return 3; > > + default: > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > No WARN_ONCE() above please: either the 'ecnfield' data is masked vs INET_ECN_MASK and the WARN_ONCE should not be possible or a remote sender can deterministically trigger a WARN() which nowadays will in turn raise a CVE... Sure, I will add the mask here. > > [...] > > +static u32 tcp_accecn_field_init_offset(u8 ecnfield) { > > + switch (ecnfield) { > > + case INET_ECN_NOT_ECT: > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > + case INET_ECN_ECT_1: > > + return TCP_ACCECN_E1B_INIT_OFFSET; > > + case INET_ECN_CE: > > + return TCP_ACCECN_CEB_INIT_OFFSET; > > + case INET_ECN_ECT_0: > > + return TCP_ACCECN_E0B_INIT_OFFSET; > > + default: > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > Same as above. 
> > > + } > > + return 0; > > +} > > + > > +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ static > > +unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield, > > + bool order) { > > + u8 tmp; > > + > > + optfield = order ? 2 - optfield : optfield; > > + tmp = optfield + 2; > > + > > + return (tmp + (tmp >> 2)) & INET_ECN_MASK; } > > + > > +/* Handles AccECN option ECT and CE 24-bit byte counters update into > > + * the u32 value in tcp_sock. As we're processing TCP options, it is > > + * safe to access from - 1. > > + */ > > +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 > > +init_offset) { > > + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & > > + 0xFFFFFFU; > > + u32 delta = (truncated - *cnt) & 0xFFFFFFU; > > + > > + /* If delta has the highest bit set (24th bit) indicating > > + * negative, sign extend to correct an estimation using > > + * sign_extend32(delta, 24 - 1) > > + */ > > + delta = sign_extend32(delta, 23); > > + *cnt += delta; > > + return (s32)delta; > > +} > > + > > +/* Returns true if the byte counters can be used */ static bool > > +tcp_accecn_process_option(struct tcp_sock *tp, > > + const struct sk_buff *skb, > > + u32 delivered_bytes, int flag) { > > + u8 estimate_ecnfield = tp->est_ecnfield; > > + bool ambiguous_ecn_bytes_incr = false; > > + bool first_changed = false; > > + unsigned int optlen; > > + unsigned char *ptr; > > + bool order1, res; > > + unsigned int i; > > + > > + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { > > + if (estimate_ecnfield) { > > + u8 ecnfield = estimate_ecnfield - 1; > > + > > + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; > > + return true; > > + } > > + return false; > > + } > > + > > + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; > > + optlen = ptr[1] - 2; > > This assumes optlen is greater then 2, but I don't see the relevant check. Are tcp options present at all? 
This function is executed only when AccECN mode is negotiated. And the above condition "if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn)" covers the case in which AccECN option is not present. So, I would think this is safe; please let me know if you think otherwise. > > > + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != > > + TCPOPT_ACCECN1); > > Please, don't warn for arbitrary wrong data sent from the peer. Sure, will remove. > > > + order1 = (ptr[0] == TCPOPT_ACCECN1); > > + ptr += 2; > > + > > + res = !!estimate_ecnfield; > > + for (i = 0; i < 3; i++) { > > + if (optlen >= TCPOLEN_ACCECN_PERFIELD) { > > + u32 init_offset; > > + u8 ecnfield; > > + s32 delta; > > + u32 *cnt; > > + > > + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); > > + init_offset = tcp_accecn_field_init_offset(ecnfield); > > + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; > > + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); > > + if (delta) { > > + if (delta < 0) { > > + res = false; > > + ambiguous_ecn_bytes_incr = true; > > + } > > + if (ecnfield != estimate_ecnfield) { > > + if (!first_changed) { > > + tp->est_ecnfield = ecnfield; > > + first_changed = true; > > + } else { > > + res = false; > > + ambiguous_ecn_bytes_incr = true; > > + } > > At least 2 indentation levels above the maximum readable. OK, let me think how to simplify it in next version. > > [...] > > @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, > > > > ptr = (const unsigned char *)(th + 1); > > opt_rx->saw_tstamp = 0; > > + opt_rx->accecn = 0; > > opt_rx->saw_unknown = 0; > > It would be good to be able to zero both 'accecn' and 'saw_unknown' with a single statement. ok, will do. > > [...] 
> > @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, > > *ptr++ = htonl(opts->tsecr); > > } > > > > + if (OPTION_ACCECN & options) { > > + const u8 ect0_idx = INET_ECN_ECT_0 - 1; > > + const u8 ect1_idx = INET_ECN_ECT_1 - 1; > > + const u8 ce_idx = INET_ECN_CE - 1; > > + u32 e0b; > > + u32 e1b; > > + u32 ceb; > > + u8 len; > > + > > + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; > > + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; > > + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; > > + len = TCPOLEN_ACCECN_BASE + > > + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; > > + > > + if (opts->num_accecn_fields == 2) { > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > + ((e1b >> 8) & 0xffff)); > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > + (ceb & 0xffffff)); > > + } else if (opts->num_accecn_fields == 1) { > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > + ((e1b >> 8) & 0xffff)); > > + leftover_bytes = ((e1b & 0xff) << 8) | > > + TCPOPT_NOP; > > + leftover_size = 1; > > + } else if (opts->num_accecn_fields == 0) { > > + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len; > > + leftover_size = 2; > > + } else if (opts->num_accecn_fields == 3) { > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > + ((e1b >> 8) & 0xffff)); > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > + (ceb & 0xffffff)); > > + *ptr++ = htonl(((e0b & 0xffffff) << 8) | > > + TCPOPT_NOP); > > The above chunck and the contents of patch 7 must be in the same patch. > This split makes the review even harder. Thanks for feedback, I will merge these 2 patches. > > [...] > > @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb > > opts->num_sack_blocks = 0; > > } > > > > + if (tcp_ecn_mode_accecn(tp) && > > + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { > > + int saving = opts->num_sack_blocks > 0 ? 
2 : 0; > > + int remaining = MAX_TCP_OPTION_SPACE - size; > > AFACS the above means tcp_options_fit_accecn() must clear any already set options, but apparently it does not do so. Have you tested with something adding largish options like mptcp? I see this part is NOT to clear already set option, but to calculate how long AccECN option will fit to remaining option space. > > /P
On Tue, 29 Apr 2025, Paolo Abeni wrote: > On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote: > > @@ -302,10 +303,13 @@ struct tcp_sock { > > u32 snd_up; /* Urgent pointer */ > > u32 delivered; /* Total data packets delivered incl. rexmits */ > > u32 delivered_ce; /* Like the above but only ECE marked packets */ > > + u32 delivered_ecn_bytes[3]; > > This new fields do not belong to this cacheline group. I'm unsure they > belong to fast-path at all. Also u32 will wrap-around very soon. > > [...] > > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > > index dc8fdc80e16b..74ac8a5d2e00 100644 > > --- a/include/uapi/linux/tcp.h > > +++ b/include/uapi/linux/tcp.h > > @@ -298,6 +298,13 @@ struct tcp_info { > > __u32 tcpi_snd_wnd; /* peer's advertised receive window after > > * scaling (bytes) > > */ > > + __u32 tcpi_received_ce; /* # of CE marks received */ > > + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ > > + __u32 tcpi_delivered_e0_bytes; > > + __u32 tcpi_delivered_ce_bytes; > > + __u32 tcpi_received_e1_bytes; > > + __u32 tcpi_received_e0_bytes; > > + __u32 tcpi_received_ce_bytes; > > This will break uAPI: new fields must be addded at the end, or must fill > existing holes. Also u32 set in stone in uAPI for a byte counter looks > way too small. > > > @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) > > /* 32bit arches with 8byte alignment on u64 fields might need padding > > * before tcp_clock_cache. > > */ > > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7); > > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 122 + 6); > > The above means an additional cacheline in fast-path WRT the current > status. IMHO should be avoided. 
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > > index 5bd7fc9bcf66..41e45b9aff3f 100644 > > --- a/net/ipv4/tcp_input.c > > +++ b/net/ipv4/tcp_input.c > > @@ -70,6 +70,7 @@ > > #include <linux/sysctl.h> > > #include <linux/kernel.h> > > #include <linux/prefetch.h> > > +#include <linux/bitops.h> > > #include <net/dst.h> > > #include <net/tcp.h> > > #include <net/proto_memory.h> > > @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr > > return false; > > } > > > > +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given > > + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). > > + */ > > +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) > > +{ > > + switch (ecnfield) { > > + case INET_ECN_NOT_ECT: > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > + case INET_ECN_ECT_1: > > + return 1; > > + case INET_ECN_CE: > > + return 2; > > + case INET_ECN_ECT_0: > > + return 3; > > + default: > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > No WARN_ONCE() above please: either the 'ecnfield' data is masked vs > INET_ECN_MASK and the WARN_ONCE should not be possible or a remote > sender can deterministically trigger a WARN() which nowadays will in > turn raise a CVE... > > [...] > > +static u32 tcp_accecn_field_init_offset(u8 ecnfield) > > +{ > > + switch (ecnfield) { > > + case INET_ECN_NOT_ECT: > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > + case INET_ECN_ECT_1: > > + return TCP_ACCECN_E1B_INIT_OFFSET; > > + case INET_ECN_CE: > > + return TCP_ACCECN_CEB_INIT_OFFSET; > > + case INET_ECN_ECT_0: > > + return TCP_ACCECN_E0B_INIT_OFFSET; > > + default: > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > Same as above. 
> > > + } > > + return 0; > > +} > > + > > +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ > > +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield, > > + bool order) > > +{ > > + u8 tmp; > > + > > + optfield = order ? 2 - optfield : optfield; > > + tmp = optfield + 2; > > + > > + return (tmp + (tmp >> 2)) & INET_ECN_MASK; > > +} > > + > > +/* Handles AccECN option ECT and CE 24-bit byte counters update into > > + * the u32 value in tcp_sock. As we're processing TCP options, it is > > + * safe to access from - 1. > > + */ > > +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset) > > +{ > > + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & > > + 0xFFFFFFU; > > + u32 delta = (truncated - *cnt) & 0xFFFFFFU; > > + > > + /* If delta has the highest bit set (24th bit) indicating > > + * negative, sign extend to correct an estimation using > > + * sign_extend32(delta, 24 - 1) > > + */ > > + delta = sign_extend32(delta, 23); > > + *cnt += delta; > > + return (s32)delta; > > +} > > + > > +/* Returns true if the byte counters can be used */ > > +static bool tcp_accecn_process_option(struct tcp_sock *tp, > > + const struct sk_buff *skb, > > + u32 delivered_bytes, int flag) > > +{ > > + u8 estimate_ecnfield = tp->est_ecnfield; > > + bool ambiguous_ecn_bytes_incr = false; > > + bool first_changed = false; > > + unsigned int optlen; > > + unsigned char *ptr; u8 would we more appropriate type for binary data. > > + bool order1, res; > > + unsigned int i; > > + > > + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { > > + if (estimate_ecnfield) { > > + u8 ecnfield = estimate_ecnfield - 1; > > + > > + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; > > + return true; > > + } > > + return false; > > + } > > + > > + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; > > + optlen = ptr[1] - 2; > > This assumes optlen is greater then 2, but I don't see the relevant > check. 
The options parser should check that, please see the "silly options" check. > Are tcp options present at all? There is !tp->rx_opt.accecn check above which should ensure we're processing only AccECN Option that is present. > > + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1); > > Please, don't warn for arbitrary wrong data sent from the peer. If there isn't AccECN option at ptr, there's a bug elsewhere in the code (in the option parse code). So this is an internal sanity check that tp->rx_opt.accecn points to AccECN option for real like it should. If you still want that removed, no problem but it should not be arbitrary data at this point because the options parsing code should have validated this condition already, thus WARN_ON_ONCE() seemed appropriate to me. > > + order1 = (ptr[0] == TCPOPT_ACCECN1); > > + ptr += 2; > > + > > + res = !!estimate_ecnfield; > > + for (i = 0; i < 3; i++) { > > + if (optlen >= TCPOLEN_ACCECN_PERFIELD) { It's easy to reverse logic here and use continue, which buys one level of indentation. > > + u32 init_offset; > > + u8 ecnfield; > > + s32 delta; > > + u32 *cnt; > > + > > + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); > > + init_offset = tcp_accecn_field_init_offset(ecnfield); > > + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; > > + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); > > + if (delta) { > > + if (delta < 0) { > > + res = false; > > + ambiguous_ecn_bytes_incr = true; > > + } > > + if (ecnfield != estimate_ecnfield) { > > + if (!first_changed) { > > + tp->est_ecnfield = ecnfield; > > + first_changed = true; > > + } else { > > + res = false; > > + ambiguous_ecn_bytes_incr = true; > > + } > > At least 2 indentation levels above the maximum readable. > > [...] 
> > @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, > > > > ptr = (const unsigned char *)(th + 1); > > opt_rx->saw_tstamp = 0; > > + opt_rx->accecn = 0; > > opt_rx->saw_unknown = 0; > > It would be good to be able to zero both 'accecn' and 'saw_unknown' with > a single statement. > > [...] > > @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, > > *ptr++ = htonl(opts->tsecr); > > } > > > > + if (OPTION_ACCECN & options) { > > + const u8 ect0_idx = INET_ECN_ECT_0 - 1; > > + const u8 ect1_idx = INET_ECN_ECT_1 - 1; > > + const u8 ce_idx = INET_ECN_CE - 1; > > + u32 e0b; > > + u32 e1b; > > + u32 ceb; > > + u8 len; > > + > > + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; > > + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; > > + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; > > + len = TCPOLEN_ACCECN_BASE + > > + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; > > + > > + if (opts->num_accecn_fields == 2) { > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > + ((e1b >> 8) & 0xffff)); > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > + (ceb & 0xffffff)); > > + } else if (opts->num_accecn_fields == 1) { > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > + ((e1b >> 8) & 0xffff)); > > + leftover_bytes = ((e1b & 0xff) << 8) | > > + TCPOPT_NOP; > > + leftover_size = 1; > > + } else if (opts->num_accecn_fields == 0) { > > + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len; > > + leftover_size = 2; > > + } else if (opts->num_accecn_fields == 3) { > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > + ((e1b >> 8) & 0xffff)); > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > + (ceb & 0xffffff)); > > + *ptr++ = htonl(((e0b & 0xffffff) << 8) | > > + TCPOPT_NOP); > > The above chunck and the contents of patch 7 must be in the same patch. > This split makes the review even harder. > > [...] 
> > @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb > > opts->num_sack_blocks = 0; > > } > > > > + if (tcp_ecn_mode_accecn(tp) && > > + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { > > + int saving = opts->num_sack_blocks > 0 ? 2 : 0; > > + int remaining = MAX_TCP_OPTION_SPACE - size; > > AFACS the above means tcp_options_fit_accecn() must clear any already > set options, but apparently it does not do so. Have you tested with > something adding largish options like mptcp? This "fitting" for AccEcn option is not to make room for the option but to check if AccECN option fits and in what length, and how it can take advantage of some nop bytes when available to save option space.
> -----Original Message----- > From: Ilpo Järvinen <ij@kernel.org> > Sent: Tuesday, May 6, 2025 12:54 AM > To: Paolo Abeni <pabeni@redhat.com> > Cc: Chia-Yu Chang (Nokia) <chia-yu.chang@nokia-bell-labs.com>; horms@kernel.org; dsahern@kernel.org; kuniyu@amazon.com; bpf@vger.kernel.org; netdev@vger.kernel.org; dave.taht@gmail.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; xiyou.wangcong@gmail.com; jiri@resnulli.us; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; ast@fiberby.net; liuhangbin@gmail.com; shuah@kernel.org; linux-kselftest@vger.kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) <koen.de_schepper@nokia-bell-labs.com>; g.white <g.white@cablelabs.com>; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel <vidhi_goel@apple.com> > Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option > > > CAUTION: This is an external email. Please be very careful when clicking links or opening attachments. See the URL nok.it/ext for additional information. > > > > On Tue, 29 Apr 2025, Paolo Abeni wrote: > > > On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote: > > > @@ -302,10 +303,13 @@ struct tcp_sock { > > > u32 snd_up; /* Urgent pointer */ > > > u32 delivered; /* Total data packets delivered incl. rexmits */ > > > u32 delivered_ce; /* Like the above but only ECE marked packets */ > > > + u32 delivered_ecn_bytes[3]; > > > > This new fields do not belong to this cacheline group. I'm unsure they > > belong to fast-path at all. Also u32 will wrap-around very soon. > > > > [...] 
> > > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > > > index dc8fdc80e16b..74ac8a5d2e00 100644 > > > --- a/include/uapi/linux/tcp.h > > > +++ b/include/uapi/linux/tcp.h > > > @@ -298,6 +298,13 @@ struct tcp_info { > > > __u32 tcpi_snd_wnd; /* peer's advertised receive window after > > > * scaling (bytes) > > > */ > > > + __u32 tcpi_received_ce; /* # of CE marks received */ > > > + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ > > > + __u32 tcpi_delivered_e0_bytes; > > > + __u32 tcpi_delivered_ce_bytes; > > > + __u32 tcpi_received_e1_bytes; > > > + __u32 tcpi_received_e0_bytes; > > > + __u32 tcpi_received_ce_bytes; > > > > This will break uAPI: new fields must be addded at the end, or must > > fill existing holes. Also u32 set in stone in uAPI for a byte counter > > looks way too small. > > > > > @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) > > > /* 32bit arches with 8byte alignment on u64 fields might need padding > > > * before tcp_clock_cache. > > > */ > > > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7); > > > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, > > > + tcp_sock_write_txrx, 122 + 6); > > > > The above means an additional cacheline in fast-path WRT the current > > status. IMHO should be avoided. 
> > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index > > > 5bd7fc9bcf66..41e45b9aff3f 100644 > > > --- a/net/ipv4/tcp_input.c > > > +++ b/net/ipv4/tcp_input.c > > > @@ -70,6 +70,7 @@ > > > #include <linux/sysctl.h> > > > #include <linux/kernel.h> > > > #include <linux/prefetch.h> > > > +#include <linux/bitops.h> > > > #include <net/dst.h> > > > #include <net/tcp.h> > > > #include <net/proto_memory.h> > > > @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr > > > return false; > > > } > > > > > > +/* Maps IP ECN field ECT/CE code point to AccECN option field > > > +number, given > > > + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). > > > + */ > > > +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) { > > > + switch (ecnfield) { > > > + case INET_ECN_NOT_ECT: > > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > > + case INET_ECN_ECT_1: > > > + return 1; > > > + case INET_ECN_CE: > > > + return 2; > > > + case INET_ECN_ECT_0: > > > + return 3; > > > + default: > > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > > > No WARN_ONCE() above please: either the 'ecnfield' data is masked vs > > INET_ECN_MASK and the WARN_ONCE should not be possible or a remote > > sender can deterministically trigger a WARN() which nowadays will in > > turn raise a CVE... > > > > [...] > > > +static u32 tcp_accecn_field_init_offset(u8 ecnfield) { > > > + switch (ecnfield) { > > > + case INET_ECN_NOT_ECT: > > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > > + case INET_ECN_ECT_1: > > > + return TCP_ACCECN_E1B_INIT_OFFSET; > > > + case INET_ECN_CE: > > > + return TCP_ACCECN_CEB_INIT_OFFSET; > > > + case INET_ECN_ECT_0: > > > + return TCP_ACCECN_E0B_INIT_OFFSET; > > > + default: > > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > > > Same as above. 
> > > > > + } > > > + return 0; > > > +} > > > + > > > +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ > > > +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield, > > > + bool order) { > > > + u8 tmp; > > > + > > > + optfield = order ? 2 - optfield : optfield; > > > + tmp = optfield + 2; > > > + > > > + return (tmp + (tmp >> 2)) & INET_ECN_MASK; } > > > + > > > +/* Handles AccECN option ECT and CE 24-bit byte counters update > > > +into > > > + * the u32 value in tcp_sock. As we're processing TCP options, it > > > +is > > > + * safe to access from - 1. > > > + */ > > > +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 > > > +init_offset) { > > > + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & > > > + 0xFFFFFFU; > > > + u32 delta = (truncated - *cnt) & 0xFFFFFFU; > > > + > > > + /* If delta has the highest bit set (24th bit) indicating > > > + * negative, sign extend to correct an estimation using > > > + * sign_extend32(delta, 24 - 1) > > > + */ > > > + delta = sign_extend32(delta, 23); > > > + *cnt += delta; > > > + return (s32)delta; > > > +} > > > + > > > +/* Returns true if the byte counters can be used */ static bool > > > +tcp_accecn_process_option(struct tcp_sock *tp, > > > + const struct sk_buff *skb, > > > + u32 delivered_bytes, int flag) { > > > + u8 estimate_ecnfield = tp->est_ecnfield; > > > + bool ambiguous_ecn_bytes_incr = false; > > > + bool first_changed = false; > > > + unsigned int optlen; > > > + unsigned char *ptr; > > u8 would we more appropriate type for binary data. Hi Ilpo, Not sure I understand your point, could you elaborate which binary data you think shall use u8? 
> > > > + bool order1, res; > > > + unsigned int i; > > > + > > > + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { > > > + if (estimate_ecnfield) { > > > + u8 ecnfield = estimate_ecnfield - 1; > > > + > > > + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; > > > + return true; > > > + } > > > + return false; > > > + } > > > + > > > + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; > > > + optlen = ptr[1] - 2; > > > > This assumes optlen is greater then 2, but I don't see the relevant > > check. > > The options parser should check that, please see the "silly options" > check. > > > Are tcp options present at all? > > There is !tp->rx_opt.accecn check above which should ensure we're processing only AccECN Option that is present. > > > > + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != > > > + TCPOPT_ACCECN1); > > > > Please, don't warn for arbitrary wrong data sent from the peer. > > If there isn't AccECN option at ptr, there's bug elsewhere in the code (in the option parse code). So this is an internal sanity check that > tp->rx_opt.accecn points to AccECN option for real like it should. > > If you still want that removed, no problem but it's should not be arbitrary data at this point because the options parsing code should have validated this condition already, thus WARN_ON_ONCE() seemed appropriate to me. Indeed, then I will keep this for next version, but can be adjust once receiving further feedback. > > > > + order1 = (ptr[0] == TCPOPT_ACCECN1); > > > + ptr += 2; > > > + > > > + res = !!estimate_ecnfield; > > > + for (i = 0; i < 3; i++) { > > > + if (optlen >= TCPOLEN_ACCECN_PERFIELD) { > > It's easy to reverse logic here and use continue, which buys one level of indentation. Sure, thanks for explicit suggestion, will do. 
Chia-Yu > > > > + u32 init_offset; > > > + u8 ecnfield; > > > + s32 delta; > > > + u32 *cnt; > > > + > > > + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); > > > + init_offset = tcp_accecn_field_init_offset(ecnfield); > > > + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; > > > + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); > > > + if (delta) { > > > + if (delta < 0) { > > > + res = false; > > > + ambiguous_ecn_bytes_incr = true; > > > + } > > > + if (ecnfield != estimate_ecnfield) { > > > + if (!first_changed) { > > > + tp->est_ecnfield = ecnfield; > > > + first_changed = true; > > > + } else { > > > + res = false; > > > + ambiguous_ecn_bytes_incr = true; > > > + } > > > > At least 2 indentation levels above the maximum readable. > > > > [...] > > > @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, > > > > > > ptr = (const unsigned char *)(th + 1); > > > opt_rx->saw_tstamp = 0; > > > + opt_rx->accecn = 0; > > > opt_rx->saw_unknown = 0; > > > > It would be good to be able to zero both 'accecn' and 'saw_unknown' > > with a single statement. > > > > [...] 
> > > @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, > > > *ptr++ = htonl(opts->tsecr); > > > } > > > > > > + if (OPTION_ACCECN & options) { > > > + const u8 ect0_idx = INET_ECN_ECT_0 - 1; > > > + const u8 ect1_idx = INET_ECN_ECT_1 - 1; > > > + const u8 ce_idx = INET_ECN_CE - 1; > > > + u32 e0b; > > > + u32 e1b; > > > + u32 ceb; > > > + u8 len; > > > + > > > + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; > > > + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; > > > + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; > > > + len = TCPOLEN_ACCECN_BASE + > > > + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; > > > + > > > + if (opts->num_accecn_fields == 2) { > > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > > + ((e1b >> 8) & 0xffff)); > > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > > + (ceb & 0xffffff)); > > > + } else if (opts->num_accecn_fields == 1) { > > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > > + ((e1b >> 8) & 0xffff)); > > > + leftover_bytes = ((e1b & 0xff) << 8) | > > > + TCPOPT_NOP; > > > + leftover_size = 1; > > > + } else if (opts->num_accecn_fields == 0) { > > > + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len; > > > + leftover_size = 2; > > > + } else if (opts->num_accecn_fields == 3) { > > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > > + ((e1b >> 8) & 0xffff)); > > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > > + (ceb & 0xffffff)); > > > + *ptr++ = htonl(((e0b & 0xffffff) << 8) | > > > + TCPOPT_NOP); > > > > The above chunck and the contents of patch 7 must be in the same patch. > > This split makes the review even harder. > > > > [...] 
> > > @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb > > > opts->num_sack_blocks = 0; > > > } > > > > > > + if (tcp_ecn_mode_accecn(tp) && > > > + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { > > > + int saving = opts->num_sack_blocks > 0 ? 2 : 0; > > > + int remaining = MAX_TCP_OPTION_SPACE - size; > > > > AFACS the above means tcp_options_fit_accecn() must clear any already > > set options, but apparently it does not do so. Have you tested with > > something adding largish options like mptcp? > > This "fitting" for AccEcn option is not to make room for the option but to check if AccECN option fits and in what length, and how it can take advantage of some nop bytes when available to save option space. > > -- > i.
On Tue, 6 May 2025, Chia-Yu Chang (Nokia) wrote: > > -----Original Message----- > > From: Ilpo Järvinen <ij@kernel.org> > > Sent: Tuesday, May 6, 2025 12:54 AM > > To: Paolo Abeni <pabeni@redhat.com> > > Cc: Chia-Yu Chang (Nokia) <chia-yu.chang@nokia-bell-labs.com>; horms@kernel.org; dsahern@kernel.org; kuniyu@amazon.com; bpf@vger.kernel.org; netdev@vger.kernel.org; dave.taht@gmail.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; xiyou.wangcong@gmail.com; jiri@resnulli.us; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; ast@fiberby.net; liuhangbin@gmail.com; shuah@kernel.org; linux-kselftest@vger.kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) <koen.de_schepper@nokia-bell-labs.com>; g.white <g.white@cablelabs.com>; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel <vidhi_goel@apple.com> > > Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option > > > > > > CAUTION: This is an external email. Please be very careful when clicking links or opening attachments. See the URL nok.it/ext for additional information. > > > > > > > > On Tue, 29 Apr 2025, Paolo Abeni wrote: > > > > > On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote: > > > > @@ -302,10 +303,13 @@ struct tcp_sock { > > > > u32 snd_up; /* Urgent pointer */ > > > > u32 delivered; /* Total data packets delivered incl. rexmits */ > > > > u32 delivered_ce; /* Like the above but only ECE marked packets */ > > > > + u32 delivered_ecn_bytes[3]; > > > > > > This new fields do not belong to this cacheline group. I'm unsure they > > > belong to fast-path at all. Also u32 will wrap-around very soon. > > > > > > [...] 
> > > > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > > > > index dc8fdc80e16b..74ac8a5d2e00 100644 > > > > --- a/include/uapi/linux/tcp.h > > > > +++ b/include/uapi/linux/tcp.h > > > > @@ -298,6 +298,13 @@ struct tcp_info { > > > > __u32 tcpi_snd_wnd; /* peer's advertised receive window after > > > > * scaling (bytes) > > > > */ > > > > + __u32 tcpi_received_ce; /* # of CE marks received */ > > > > + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ > > > > + __u32 tcpi_delivered_e0_bytes; > > > > + __u32 tcpi_delivered_ce_bytes; > > > > + __u32 tcpi_received_e1_bytes; > > > > + __u32 tcpi_received_e0_bytes; > > > > + __u32 tcpi_received_ce_bytes; > > > > > > This will break uAPI: new fields must be addded at the end, or must > > > fill existing holes. Also u32 set in stone in uAPI for a byte counter > > > looks way too small. > > > > > > > @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) > > > > /* 32bit arches with 8byte alignment on u64 fields might need padding > > > > * before tcp_clock_cache. > > > > */ > > > > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7); > > > > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, > > > > + tcp_sock_write_txrx, 122 + 6); > > > > > > The above means an additional cacheline in fast-path WRT the current > > > status. IMHO should be avoided. 
> > > > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index > > > > 5bd7fc9bcf66..41e45b9aff3f 100644 > > > > --- a/net/ipv4/tcp_input.c > > > > +++ b/net/ipv4/tcp_input.c > > > > @@ -70,6 +70,7 @@ > > > > #include <linux/sysctl.h> > > > > #include <linux/kernel.h> > > > > #include <linux/prefetch.h> > > > > +#include <linux/bitops.h> > > > > #include <net/dst.h> > > > > #include <net/tcp.h> > > > > #include <net/proto_memory.h> > > > > @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr > > > > return false; > > > > } > > > > > > > > +/* Maps IP ECN field ECT/CE code point to AccECN option field > > > > +number, given > > > > + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). > > > > + */ > > > > +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) { > > > > + switch (ecnfield) { > > > > + case INET_ECN_NOT_ECT: > > > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > > > + case INET_ECN_ECT_1: > > > > + return 1; > > > > + case INET_ECN_CE: > > > > + return 2; > > > > + case INET_ECN_ECT_0: > > > > + return 3; > > > > + default: > > > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > > > > > No WARN_ONCE() above please: either the 'ecnfield' data is masked vs > > > INET_ECN_MASK and the WARN_ONCE should not be possible or a remote > > > sender can deterministically trigger a WARN() which nowadays will in > > > turn raise a CVE... > > > > > > [...] 
> > > > +static u32 tcp_accecn_field_init_offset(u8 ecnfield) { > > > > + switch (ecnfield) { > > > > + case INET_ECN_NOT_ECT: > > > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > > > + case INET_ECN_ECT_1: > > > > + return TCP_ACCECN_E1B_INIT_OFFSET; > > > > + case INET_ECN_CE: > > > > + return TCP_ACCECN_CEB_INIT_OFFSET; > > > > + case INET_ECN_ECT_0: > > > > + return TCP_ACCECN_E0B_INIT_OFFSET; > > > > + default: > > > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > > > > > Same as above. > > > > > > > + } > > > > + return 0; > > > > +} > > > > + > > > > +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ > > > > +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield, > > > > + bool order) { > > > > + u8 tmp; > > > > + > > > > + optfield = order ? 2 - optfield : optfield; > > > > + tmp = optfield + 2; > > > > + > > > > + return (tmp + (tmp >> 2)) & INET_ECN_MASK; } > > > > + > > > > +/* Handles AccECN option ECT and CE 24-bit byte counters update > > > > +into > > > > + * the u32 value in tcp_sock. As we're processing TCP options, it > > > > +is > > > > + * safe to access from - 1. 
> > > > + */ > > > > +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 > > > > +init_offset) { > > > > + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & > > > > + 0xFFFFFFU; > > > > + u32 delta = (truncated - *cnt) & 0xFFFFFFU; > > > > + > > > > + /* If delta has the highest bit set (24th bit) indicating > > > > + * negative, sign extend to correct an estimation using > > > > + * sign_extend32(delta, 24 - 1) > > > > + */ > > > > + delta = sign_extend32(delta, 23); > > > > + *cnt += delta; > > > > + return (s32)delta; > > > > +} > > > > + > > > > +/* Returns true if the byte counters can be used */ static bool > > > > +tcp_accecn_process_option(struct tcp_sock *tp, > > > > + const struct sk_buff *skb, > > > > + u32 delivered_bytes, int flag) { > > > > + u8 estimate_ecnfield = tp->est_ecnfield; > > > > + bool ambiguous_ecn_bytes_incr = false; > > > > + bool first_changed = false; > > > > + unsigned int optlen; > > > > + unsigned char *ptr; > > > > u8 would we more appropriate type for binary data. > > Hi Ilpo, > > Not sure I understand your point, could you elaborate which binary data > you think shall use u8? The header/option is binary data so u8 seems the right type for it. So: u8 *ptr; -- i.
On Tue, 6 May 2025, Ilpo Järvinen wrote: > On Tue, 29 Apr 2025, Paolo Abeni wrote: > > On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote: > > > @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb > > > opts->num_sack_blocks = 0; > > > } > > > > > > + if (tcp_ecn_mode_accecn(tp) && > > > + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { > > > + int saving = opts->num_sack_blocks > 0 ? 2 : 0; > > > + int remaining = MAX_TCP_OPTION_SPACE - size; > > > > AFACS the above means tcp_options_fit_accecn() must clear any already > > set options, but apparently it does not do so. Have you tested with > > something adding largish options like mptcp? > > This "fitting" for AccEcn option is not to make room for the option but to > check if AccECN option fits and in what length, and how it can take > advantage of some nop bytes when available to save option space. A minor correction. SACK blocks will naturally fill the entire option space if there are enough holes which would "starve" AccECN from using option space during loss recovery. Thus, AccECN option is allowed to grab some of that space from SACK. There's redundancy in SACK blocks anyway so it shouldn't usually impact SACK signal much.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9cbfefd693e3..0e032d9631ac 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -302,10 +303,13 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 delivered_ecn_bytes[3]; u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 received_ecn_bytes[3]; u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2;/* ECN field for AccECN delivered estimates */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6373e3f17da8..4569a9ef4fb8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ffa4ae085db..bfff2a9f95bf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -204,6 +204,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ 
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -221,6 +223,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -234,6 +237,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) /* tp->accecn_fail_mode */ #define TCP_ACCECN_ACE_FAIL_SEND BIT(0) @@ -1056,6 +1066,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* State flags for sacked in struct tcp_skb_cb */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dc8fdc80e16b..74ac8a5d2e00 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -298,6 +298,13 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 
tcpi_received_ce_bytes; __u32 tcpi_rcv_wnd; /* local advertised receive window after * scaling (bytes) */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 75ec1a599b52..1d7fd86ca7b9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e21bdf43f23..89799f73c451 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include <net/icmp.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/proto_memory.h> @@ -4109,6 +4110,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4227,6 +4231,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = 
tp->received_ecn_bytes[ce_idx]; + info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; @@ -5091,6 +5103,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 122 + 6); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bd7fc9bcf66..41e45b9aff3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> +#include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> #include <net/proto_memory.h> @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr return false; } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). 
+ */ +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + default: + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + default: + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield, + bool order) +{ + u8 tmp; + + optfield = order ? 2 - optfield : optfield; + tmp = optfield + 2; + + return (tmp + (tmp >> 2)) & INET_ECN_MASK; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. As we're processing TCP options, it is + * safe to access from - 1. 
+ */ +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + unsigned char *ptr; + bool order1, res; + unsigned int i; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1); + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + if (optlen >= TCPOLEN_ACCECN_PERFIELD) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta) { + if (delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + } + if 
(ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -515,7 +654,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -526,6 +666,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -551,12 +693,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -4212,6 +4356,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4251,6 +4396,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. 
*/ @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4469,6 +4616,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4529,11 +4682,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6133,8 +6289,12 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (payload_len > 0) + if (payload_len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += payload_len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -6358,6 +6518,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5c5d4b94b59c..3f3e285fc973 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3450,6 +3450,7 @@ static void __net_init tcp_set_hashinfo(struct net *net) static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn_option = 2; 
net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad97bb9951fd..a36de6c539da 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -491,6 +491,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -512,12 +513,14 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields; /* number of AccECN fields needed */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; + u32 *ecn_bytes; /* AccECN ECT/CE byte counters */ }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 
<< 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_bytes = ((e1b & 0xff) << 8) | + TCPOPT_NOP; + leftover_size = 1; + } else if (opts->num_accecn_fields == 0) { + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len; + leftover_size = 2; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((leftover_bytes << 16) | (TCPOPT_SACK_PERM << 8) | @@ -886,6 +930,60 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +/* Initial values for AccECN option; the order is based on ECN field bits, + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how large an AccECN option will fit into @remaining option space. + * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options.
+ */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining, int max_combine_saving) +{ + int size = TCP_ACCECN_MAXSIZE; + + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + + while (opts->num_accecn_fields >= required) { + int leftover_size = size & 0x3; + /* Pad to dword if cannot combine */ + if (leftover_size > max_combine_saving) + leftover_size = -((4 - leftover_size) & 0x3); + + if (remaining >= size - leftover_size) { + size -= leftover_size; + break; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -968,6 +1066,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + sock_net(sk)->ipv4.sysctl_tcp_ecn_option && + remaining >= TCPOLEN_ACCECN_BASE)) { + u32 saving = tcp_synack_options_combine_saving(opts); + + opts->ecn_bytes = synack_ecn_bytes; + remaining -= tcp_options_fit_accecn(opts, 0, remaining, saving); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -985,6 +1094,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -1047,6 +1157,14 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && + remaining >= TCPOLEN_ACCECN_BASE) { + u32 saving = tcp_synack_options_combine_saving(opts); + + 
opts->ecn_bytes = synack_ecn_bytes; + remaining -= tcp_options_fit_accecn(opts, 0, remaining, saving); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } + if (tcp_ecn_mode_accecn(tp) && + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { + int saving = opts->num_sack_blocks > 0 ? 2 : 0; + int remaining = MAX_TCP_OPTION_SPACE - size; + + opts->ecn_bytes = tp->received_ecn_bytes; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + remaining, + saving); + } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size;