diff mbox series

[net-next,v12,01/10] virtio_net: Add functions for hashing

Message ID 20250530-rss-v12-1-95d8b348de91@daynix.com
State New
Headers show
Series tun: Introduce virtio-net hashing feature | expand

Commit Message

Akihiko Odaki May 30, 2025, 4:50 a.m. UTC
They are useful to implement VIRTIO_NET_F_RSS and
VIRTIO_NET_F_HASH_REPORT.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)

Comments

Jason Wang June 4, 2025, 1:18 a.m. UTC | #1
On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2025/06/03 12:19, Jason Wang wrote:
> > On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>
> >> They are useful to implement VIRTIO_NET_F_RSS and
> >> VIRTIO_NET_F_HASH_REPORT.
> >>
> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> >> Tested-by: Lei Yang <leiyang@redhat.com>
> >> ---
> >>   include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 188 insertions(+)
> >>
> >> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
> >> index 02a9f4dc594d..426f33b4b824 100644
> >> --- a/include/linux/virtio_net.h
> >> +++ b/include/linux/virtio_net.h
> >> @@ -9,6 +9,194 @@
> >>   #include <uapi/linux/tcp.h>
> >>   #include <uapi/linux/virtio_net.h>
> >>
> >> +struct virtio_net_hash {
> >> +       u32 value;
> >> +       u16 report;
> >> +};
> >> +
> >> +struct virtio_net_toeplitz_state {
> >> +       u32 hash;
> >> +       const u32 *key;
> >> +};
> >> +
> >> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
> >> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
> >> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
> >> +                                        VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
> >> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
> >> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
> >> +
> >> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
> >> +
> >> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
> >> +{
> >> +       while (len >= sizeof(*input)) {
> >> +               *input = be32_to_cpu((__force __be32)*input);
> >> +               input++;
> >> +               len -= sizeof(*input);
> >> +       }
> >> +}
> >> +
> >> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
> >> +                                           const __be32 *input, size_t len)
> >> +{
> >> +       while (len >= sizeof(*input)) {
> >> +               for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
> >> +                       u32 i = ffs(map);
> >> +
> >> +                       state->hash ^= state->key[0] << (32 - i) |
> >> +                                      (u32)((u64)state->key[1] >> i);
> >> +               }
> >> +
> >> +               state->key++;
> >> +               input++;
> >> +               len -= sizeof(*input);
> >> +       }
> >> +}
> >> +
> >> +static inline u8 virtio_net_hash_key_length(u32 types)
> >> +{
> >> +       size_t len = 0;
> >> +
> >> +       if (types & VIRTIO_NET_HASH_REPORT_IPv4)
> >> +               len = max(len,
> >> +                         sizeof(struct flow_dissector_key_ipv4_addrs));
> >> +
> >> +       if (types &
> >> +           (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
> >> +               len = max(len,
> >> +                         sizeof(struct flow_dissector_key_ipv4_addrs) +
> >> +                         sizeof(struct flow_dissector_key_ports));
> >> +
> >> +       if (types & VIRTIO_NET_HASH_REPORT_IPv6)
> >> +               len = max(len,
> >> +                         sizeof(struct flow_dissector_key_ipv6_addrs));
> >> +
> >> +       if (types &
> >> +           (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
> >> +               len = max(len,
> >> +                         sizeof(struct flow_dissector_key_ipv6_addrs) +
> >> +                         sizeof(struct flow_dissector_key_ports));
> >> +
> >> +       return len + sizeof(u32);
> >> +}
> >> +
> >> +static inline u32 virtio_net_hash_report(u32 types,
> >> +                                        const struct flow_keys_basic *keys)
> >> +{
> >> +       switch (keys->basic.n_proto) {
> >> +       case cpu_to_be16(ETH_P_IP):
> >> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
> >> +                               return VIRTIO_NET_HASH_REPORT_TCPv4;
> >> +
> >> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
> >> +                               return VIRTIO_NET_HASH_REPORT_UDPv4;
> >> +               }
> >> +
> >> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
> >> +                       return VIRTIO_NET_HASH_REPORT_IPv4;
> >> +
> >> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >> +
> >> +       case cpu_to_be16(ETH_P_IPV6):
> >> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
> >> +                               return VIRTIO_NET_HASH_REPORT_TCPv6;
> >> +
> >> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
> >> +                               return VIRTIO_NET_HASH_REPORT_UDPv6;
> >> +               }
> >> +
> >> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
> >> +                       return VIRTIO_NET_HASH_REPORT_IPv6;
> >> +
> >> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >> +
> >> +       default:
> >> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >> +       }
> >> +}
> >> +
> >> +static inline void virtio_net_hash_rss(const struct sk_buff *skb,
> >> +                                      u32 types, const u32 *key,
> >> +                                      struct virtio_net_hash *hash)
> >> +{
> >> +       struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
> >> +       struct flow_keys flow;
> >> +       struct flow_keys_basic flow_basic;
> >> +       u16 report;
> >> +
> >> +       if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
> >> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >> +               return;
> >> +       }
> >> +
> >> +       flow_basic = (struct flow_keys_basic) {
> >> +               .control = flow.control,
> >> +               .basic = flow.basic
> >> +       };
> >> +
> >> +       report = virtio_net_hash_report(types, &flow_basic);
> >> +
> >> +       switch (report) {
> >> +       case VIRTIO_NET_HASH_REPORT_IPv4:
> >> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >> +                                        (__be32 *)&flow.addrs.v4addrs,
> >> +                                        sizeof(flow.addrs.v4addrs));
> >> +               break;
> >> +
> >> +       case VIRTIO_NET_HASH_REPORT_TCPv4:
> >> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >> +                                        (__be32 *)&flow.addrs.v4addrs,
> >> +                                        sizeof(flow.addrs.v4addrs));
> >> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >> +                                        sizeof(flow.ports.ports));
> >> +               break;
> >> +
> >> +       case VIRTIO_NET_HASH_REPORT_UDPv4:
> >> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >> +                                        (__be32 *)&flow.addrs.v4addrs,
> >> +                                        sizeof(flow.addrs.v4addrs));
> >> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >> +                                        sizeof(flow.ports.ports));
> >> +               break;
> >> +
> >> +       case VIRTIO_NET_HASH_REPORT_IPv6:
> >> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >> +                                        (__be32 *)&flow.addrs.v6addrs,
> >> +                                        sizeof(flow.addrs.v6addrs));
> >> +               break;
> >> +
> >> +       case VIRTIO_NET_HASH_REPORT_TCPv6:
> >> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >> +                                        (__be32 *)&flow.addrs.v6addrs,
> >> +                                        sizeof(flow.addrs.v6addrs));
> >> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >> +                                        sizeof(flow.ports.ports));
> >> +               break;
> >> +
> >> +       case VIRTIO_NET_HASH_REPORT_UDPv6:
> >> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >> +                                        (__be32 *)&flow.addrs.v6addrs,
> >> +                                        sizeof(flow.addrs.v6addrs));
> >> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >> +                                        sizeof(flow.ports.ports));
> >> +               break;
> >> +
> >> +       default:
> >> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >> +               return;
> >
> > So I still think we need a comment here to explain why this is not an
> > issue if the device can report HASH_XXX_EX. Or we need to add the
> > support, since this is the code from the driver side, I don't think we
> > need to worry about the device implementation issues.
>
> This is on the device side, and don't report HASH_TYPE_XXX_EX.
>
> >
> > For the issue of the number of options, does the spec forbid fallback
> > to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that.
>
> 5.1.6.4.3.4 "IPv6 packets with extension header" says:
>  > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
>  > header, the hash is calculated over the following fields:
>  > - Home address from the home address option in the IPv6 destination
>  >   options header. If the extension header is not present, use the
>  >   Source IPv6 address.
>  > - IPv6 address that is contained in the Routing-Header-Type-2 from the
>  >   associated extension header. If the extension header is not present,
>  >   use the Destination IPv6 address.
>  > - Source TCP port
>  > - Destination TCP port
>
> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6
> and an home address option in the IPv6 destination options header is
> present, the hash is calculated over the home address. If the hash is
> not calculated over the home address in such a case, the device is
> contradicting with this section and violating the spec. The same goes
> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2.

Just to make sure we are one the same page. I meant:

1) If the hash is not calculated over the home address (in the case of
IPv6 destination destination), it can still report
VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your
series. So the device can simply fallback to e.g TCPv6 if it can't
understand all or part of the IPv6 options.
2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the
tun_vnet_ioctl_sethash(), so userspace may set
VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by
tun_vnet_ioctl_gethashtypes(). In this case they won't get
VIRTIO_NET_HASH_TYPE_TCP_EX.
3) implementing part of the hash types might complicate the migration
or at least we need to describe the expectations of libvirt or other
management in this case. For example, do we plan to have a dedicated
Qemu command line like:

-device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z?

Thanks

>
> Regards,
> Akihiko Odaki
>
Jason Wang June 5, 2025, 1:53 a.m. UTC | #2
On Wed, Jun 4, 2025 at 3:20 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2025/06/04 10:18, Jason Wang wrote:
> > On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>
> >> On 2025/06/03 12:19, Jason Wang wrote:
> >>> On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>
> >>>> They are useful to implement VIRTIO_NET_F_RSS and
> >>>> VIRTIO_NET_F_HASH_REPORT.
> >>>>
> >>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> >>>> Tested-by: Lei Yang <leiyang@redhat.com>
> >>>> ---
> >>>>    include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>    1 file changed, 188 insertions(+)
> >>>>
> >>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
> >>>> index 02a9f4dc594d..426f33b4b824 100644
> >>>> --- a/include/linux/virtio_net.h
> >>>> +++ b/include/linux/virtio_net.h
> >>>> @@ -9,6 +9,194 @@
> >>>>    #include <uapi/linux/tcp.h>
> >>>>    #include <uapi/linux/virtio_net.h>
> >>>>
> >>>> +struct virtio_net_hash {
> >>>> +       u32 value;
> >>>> +       u16 report;
> >>>> +};
> >>>> +
> >>>> +struct virtio_net_toeplitz_state {
> >>>> +       u32 hash;
> >>>> +       const u32 *key;
> >>>> +};
> >>>> +
> >>>> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
> >>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
> >>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
> >>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
> >>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
> >>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
> >>>> +
> >>>> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
> >>>> +
> >>>> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
> >>>> +{
> >>>> +       while (len >= sizeof(*input)) {
> >>>> +               *input = be32_to_cpu((__force __be32)*input);
> >>>> +               input++;
> >>>> +               len -= sizeof(*input);
> >>>> +       }
> >>>> +}
> >>>> +
> >>>> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
> >>>> +                                           const __be32 *input, size_t len)
> >>>> +{
> >>>> +       while (len >= sizeof(*input)) {
> >>>> +               for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
> >>>> +                       u32 i = ffs(map);
> >>>> +
> >>>> +                       state->hash ^= state->key[0] << (32 - i) |
> >>>> +                                      (u32)((u64)state->key[1] >> i);
> >>>> +               }
> >>>> +
> >>>> +               state->key++;
> >>>> +               input++;
> >>>> +               len -= sizeof(*input);
> >>>> +       }
> >>>> +}
> >>>> +
> >>>> +static inline u8 virtio_net_hash_key_length(u32 types)
> >>>> +{
> >>>> +       size_t len = 0;
> >>>> +
> >>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv4)
> >>>> +               len = max(len,
> >>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs));
> >>>> +
> >>>> +       if (types &
> >>>> +           (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
> >>>> +               len = max(len,
> >>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs) +
> >>>> +                         sizeof(struct flow_dissector_key_ports));
> >>>> +
> >>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv6)
> >>>> +               len = max(len,
> >>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs));
> >>>> +
> >>>> +       if (types &
> >>>> +           (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
> >>>> +               len = max(len,
> >>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs) +
> >>>> +                         sizeof(struct flow_dissector_key_ports));
> >>>> +
> >>>> +       return len + sizeof(u32);
> >>>> +}
> >>>> +
> >>>> +static inline u32 virtio_net_hash_report(u32 types,
> >>>> +                                        const struct flow_keys_basic *keys)
> >>>> +{
> >>>> +       switch (keys->basic.n_proto) {
> >>>> +       case cpu_to_be16(ETH_P_IP):
> >>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
> >>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv4;
> >>>> +
> >>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
> >>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv4;
> >>>> +               }
> >>>> +
> >>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
> >>>> +                       return VIRTIO_NET_HASH_REPORT_IPv4;
> >>>> +
> >>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>> +
> >>>> +       case cpu_to_be16(ETH_P_IPV6):
> >>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
> >>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv6;
> >>>> +
> >>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
> >>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv6;
> >>>> +               }
> >>>> +
> >>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
> >>>> +                       return VIRTIO_NET_HASH_REPORT_IPv6;
> >>>> +
> >>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>> +
> >>>> +       default:
> >>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>> +       }
> >>>> +}
> >>>> +
> >>>> +static inline void virtio_net_hash_rss(const struct sk_buff *skb,
> >>>> +                                      u32 types, const u32 *key,
> >>>> +                                      struct virtio_net_hash *hash)
> >>>> +{
> >>>> +       struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
> >>>> +       struct flow_keys flow;
> >>>> +       struct flow_keys_basic flow_basic;
> >>>> +       u16 report;
> >>>> +
> >>>> +       if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
> >>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >>>> +               return;
> >>>> +       }
> >>>> +
> >>>> +       flow_basic = (struct flow_keys_basic) {
> >>>> +               .control = flow.control,
> >>>> +               .basic = flow.basic
> >>>> +       };
> >>>> +
> >>>> +       report = virtio_net_hash_report(types, &flow_basic);
> >>>> +
> >>>> +       switch (report) {
> >>>> +       case VIRTIO_NET_HASH_REPORT_IPv4:
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>> +               break;
> >>>> +
> >>>> +       case VIRTIO_NET_HASH_REPORT_TCPv4:
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>> +                                        sizeof(flow.ports.ports));
> >>>> +               break;
> >>>> +
> >>>> +       case VIRTIO_NET_HASH_REPORT_UDPv4:
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>> +                                        sizeof(flow.ports.ports));
> >>>> +               break;
> >>>> +
> >>>> +       case VIRTIO_NET_HASH_REPORT_IPv6:
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>> +               break;
> >>>> +
> >>>> +       case VIRTIO_NET_HASH_REPORT_TCPv6:
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>> +                                        sizeof(flow.ports.ports));
> >>>> +               break;
> >>>> +
> >>>> +       case VIRTIO_NET_HASH_REPORT_UDPv6:
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>> +                                        sizeof(flow.ports.ports));
> >>>> +               break;
> >>>> +
> >>>> +       default:
> >>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >>>> +               return;
> >>>
> >>> So I still think we need a comment here to explain why this is not an
> >>> issue if the device can report HASH_XXX_EX. Or we need to add the
> >>> support, since this is the code from the driver side, I don't think we
> >>> need to worry about the device implementation issues.
> >>
> >> This is on the device side, and don't report HASH_TYPE_XXX_EX.
> >>
> >>>
> >>> For the issue of the number of options, does the spec forbid fallback
> >>> to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that.
> >>
> >> 5.1.6.4.3.4 "IPv6 packets with extension header" says:
> >>   > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
> >>   > header, the hash is calculated over the following fields:
> >>   > - Home address from the home address option in the IPv6 destination
> >>   >   options header. If the extension header is not present, use the
> >>   >   Source IPv6 address.
> >>   > - IPv6 address that is contained in the Routing-Header-Type-2 from the
> >>   >   associated extension header. If the extension header is not present,
> >>   >   use the Destination IPv6 address.
> >>   > - Source TCP port
> >>   > - Destination TCP port
> >>
> >> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6
> >> and an home address option in the IPv6 destination options header is
> >> present, the hash is calculated over the home address. If the hash is
> >> not calculated over the home address in such a case, the device is
> >> contradicting with this section and violating the spec. The same goes
> >> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2.
> >
> > Just to make sure we are one the same page. I meant:
> >
> > 1) If the hash is not calculated over the home address (in the case of
> > IPv6 destination destination), it can still report
> > VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your
> > series. So the device can simply fallback to e.g TCPv6 if it can't
> > understand all or part of the IPv6 options.
>
> The spec says it can fallback if "the extension header is not present",
> not if the device can't understand the extension header.

I don't think so,

1) spec had a condition beforehand:

"""
If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
header, the hash is calculated over the following fields:
...
If the extension header is not present ...
"""

So the device can choose not to set VIRTIO_NET_HASH_TYPE_TCP_EX as
spec doesn't say device MUST set VIRTIO_NET_HASH_TYPE_TCP_EX if ...

2) implementation wise, since device has limited resources, we can't
expect the device can parse arbitrary number of ipv6 options

3) if 1) and 2) not the case, we need fix the spec otherwise implement
a spec compliant device is impractical

>
> > 2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the
> > tun_vnet_ioctl_sethash(), so userspace may set
> > VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by
> > tun_vnet_ioctl_gethashtypes(). In this case they won't get
> > VIRTIO_NET_HASH_TYPE_TCP_EX.
>
> That's right. It's the responsibility of the userspace to set only the
> supported hash types.

Well, the kernel should filter out the unsupported one to have a
robust uAPI. Otherwise, we give green light to the buggy userspace
which will have unexpected results.

>
> > 3) implementing part of the hash types might complicate the migration
> > or at least we need to describe the expectations of libvirt or other
> > management in this case. For example, do we plan to have a dedicated
> > Qemu command line like:
> >
> > -device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z?
>
> I posted a patch series to implement such a command line for vDPA[1].
> The patch series that wires this tuntap feature up[2] reuses the
> infrastructure so it doesn't bring additional complexity.
>
> [1]
> https://lore.kernel.org/qemu-devel/20250530-vdpa-v1-0-5af4109b1c19@daynix.com/
> [2]
> https://lore.kernel.org/qemu-devel/20250530-hash-v5-0-343d7d7a8200@daynix.com/

I meant, if we implement a full hash report feature, it means a single
hash cmdline option is more than sufficient and so compatibility code
can just turn it off when dealing with machine types. This is much
more simpler than

1) having both hash as well as supported_hash_features
2) dealing both hash as well as supported_hash_features in compatibility codes
3) libvirt will be happy

For [1], it seems it introduces a per has type option, this seems to
be a burden to the management layer as it need to learn new option
everytime a new hash type is supported

Thanks

>
> Regards,
> Akihiko Odaki
>
Akihiko Odaki June 5, 2025, 7:57 a.m. UTC | #3
On 2025/06/05 10:53, Jason Wang wrote:
> On Wed, Jun 4, 2025 at 3:20 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> On 2025/06/04 10:18, Jason Wang wrote:
>>> On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>
>>>> On 2025/06/03 12:19, Jason Wang wrote:
>>>>> On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>
>>>>>> They are useful to implement VIRTIO_NET_F_RSS and
>>>>>> VIRTIO_NET_F_HASH_REPORT.
>>>>>>
>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>>>>>> Tested-by: Lei Yang <leiyang@redhat.com>
>>>>>> ---
>>>>>>     include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>     1 file changed, 188 insertions(+)
>>>>>>
>>>>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
>>>>>> index 02a9f4dc594d..426f33b4b824 100644
>>>>>> --- a/include/linux/virtio_net.h
>>>>>> +++ b/include/linux/virtio_net.h
>>>>>> @@ -9,6 +9,194 @@
>>>>>>     #include <uapi/linux/tcp.h>
>>>>>>     #include <uapi/linux/virtio_net.h>
>>>>>>
>>>>>> +struct virtio_net_hash {
>>>>>> +       u32 value;
>>>>>> +       u16 report;
>>>>>> +};
>>>>>> +
>>>>>> +struct virtio_net_toeplitz_state {
>>>>>> +       u32 hash;
>>>>>> +       const u32 *key;
>>>>>> +};
>>>>>> +
>>>>>> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
>>>>>> +
>>>>>> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
>>>>>> +
>>>>>> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
>>>>>> +{
>>>>>> +       while (len >= sizeof(*input)) {
>>>>>> +               *input = be32_to_cpu((__force __be32)*input);
>>>>>> +               input++;
>>>>>> +               len -= sizeof(*input);
>>>>>> +       }
>>>>>> +}
>>>>>> +
>>>>>> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
>>>>>> +                                           const __be32 *input, size_t len)
>>>>>> +{
>>>>>> +       while (len >= sizeof(*input)) {
>>>>>> +               for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
>>>>>> +                       u32 i = ffs(map);
>>>>>> +
>>>>>> +                       state->hash ^= state->key[0] << (32 - i) |
>>>>>> +                                      (u32)((u64)state->key[1] >> i);
>>>>>> +               }
>>>>>> +
>>>>>> +               state->key++;
>>>>>> +               input++;
>>>>>> +               len -= sizeof(*input);
>>>>>> +       }
>>>>>> +}
>>>>>> +
>>>>>> +static inline u8 virtio_net_hash_key_length(u32 types)
>>>>>> +{
>>>>>> +       size_t len = 0;
>>>>>> +
>>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv4)
>>>>>> +               len = max(len,
>>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs));
>>>>>> +
>>>>>> +       if (types &
>>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
>>>>>> +               len = max(len,
>>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs) +
>>>>>> +                         sizeof(struct flow_dissector_key_ports));
>>>>>> +
>>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv6)
>>>>>> +               len = max(len,
>>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs));
>>>>>> +
>>>>>> +       if (types &
>>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
>>>>>> +               len = max(len,
>>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs) +
>>>>>> +                         sizeof(struct flow_dissector_key_ports));
>>>>>> +
>>>>>> +       return len + sizeof(u32);
>>>>>> +}
>>>>>> +
>>>>>> +static inline u32 virtio_net_hash_report(u32 types,
>>>>>> +                                        const struct flow_keys_basic *keys)
>>>>>> +{
>>>>>> +       switch (keys->basic.n_proto) {
>>>>>> +       case cpu_to_be16(ETH_P_IP):
>>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
>>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv4;
>>>>>> +
>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
>>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv4;
>>>>>> +               }
>>>>>> +
>>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
>>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv4;
>>>>>> +
>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
>>>>>> +
>>>>>> +       case cpu_to_be16(ETH_P_IPV6):
>>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
>>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv6;
>>>>>> +
>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
>>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv6;
>>>>>> +               }
>>>>>> +
>>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
>>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv6;
>>>>>> +
>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
>>>>>> +
>>>>>> +       default:
>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
>>>>>> +       }
>>>>>> +}
>>>>>> +
>>>>>> +static inline void virtio_net_hash_rss(const struct sk_buff *skb,
>>>>>> +                                      u32 types, const u32 *key,
>>>>>> +                                      struct virtio_net_hash *hash)
>>>>>> +{
>>>>>> +       struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
>>>>>> +       struct flow_keys flow;
>>>>>> +       struct flow_keys_basic flow_basic;
>>>>>> +       u16 report;
>>>>>> +
>>>>>> +       if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
>>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
>>>>>> +               return;
>>>>>> +       }
>>>>>> +
>>>>>> +       flow_basic = (struct flow_keys_basic) {
>>>>>> +               .control = flow.control,
>>>>>> +               .basic = flow.basic
>>>>>> +       };
>>>>>> +
>>>>>> +       report = virtio_net_hash_report(types, &flow_basic);
>>>>>> +
>>>>>> +       switch (report) {
>>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv4:
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
>>>>>> +                                        sizeof(flow.addrs.v4addrs));
>>>>>> +               break;
>>>>>> +
>>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv4:
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
>>>>>> +                                        sizeof(flow.addrs.v4addrs));
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>> +               break;
>>>>>> +
>>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv4:
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
>>>>>> +                                        sizeof(flow.addrs.v4addrs));
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>> +               break;
>>>>>> +
>>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv6:
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
>>>>>> +                                        sizeof(flow.addrs.v6addrs));
>>>>>> +               break;
>>>>>> +
>>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv6:
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
>>>>>> +                                        sizeof(flow.addrs.v6addrs));
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>> +               break;
>>>>>> +
>>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv6:
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
>>>>>> +                                        sizeof(flow.addrs.v6addrs));
>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>> +               break;
>>>>>> +
>>>>>> +       default:
>>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
>>>>>> +               return;
>>>>>
>>>>> So I still think we need a comment here to explain why this is not an
>>>>> issue if the device can report HASH_XXX_EX. Or we need to add the
>>>>> support, since this is the code from the driver side, I don't think we
>>>>> need to worry about the device implementation issues.
>>>>
>>>> This is on the device side, and don't report HASH_TYPE_XXX_EX.
>>>>
>>>>>
>>>>> For the issue of the number of options, does the spec forbid fallback
>>>>> to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that.
>>>>
>>>> 5.1.6.4.3.4 "IPv6 packets with extension header" says:
>>>>    > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
>>>>    > header, the hash is calculated over the following fields:
>>>>    > - Home address from the home address option in the IPv6 destination
>>>>    >   options header. If the extension header is not present, use the
>>>>    >   Source IPv6 address.
>>>>    > - IPv6 address that is contained in the Routing-Header-Type-2 from the
>>>>    >   associated extension header. If the extension header is not present,
>>>>    >   use the Destination IPv6 address.
>>>>    > - Source TCP port
>>>>    > - Destination TCP port
>>>>
>>>> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6
>>>> and an home address option in the IPv6 destination options header is
>>>> present, the hash is calculated over the home address. If the hash is
>>>> not calculated over the home address in such a case, the device is
>>>> contradicting with this section and violating the spec. The same goes
>>>> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2.
>>>
>>> Just to make sure we are one the same page. I meant:
>>>
>>> 1) If the hash is not calculated over the home address (in the case of
>>> IPv6 destination destination), it can still report
>>> VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your
>>> series. So the device can simply fallback to e.g TCPv6 if it can't
>>> understand all or part of the IPv6 options.
>>
>> The spec says it can fallback if "the extension header is not present",
>> not if the device can't understand the extension header.
> 
> I don't think so,
> 
> 1) spec had a condition beforehand:
> 
> """
> If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
> header, the hash is calculated over the following fields:
> ...
> If the extension header is not present ...
> """
> 
> So the device can choose not to set VIRTIO_NET_HASH_TYPE_TCP_EX as
> spec doesn't say device MUST set VIRTIO_NET_HASH_TYPE_TCP_EX if ...
> 
> 2) implementation wise, since device has limited resources, we can't
> expect the device can parse arbitrary number of ipv6 options
> 
> 3) if 1) and 2) not the case, we need fix the spec otherwise implement
> a spec compliant device is impractical

The statement is preceded by the following:
 >  The device calculates the hash on IPv4 packets according to
 > ’Enabled hash types’ bitmask as follows:

The 'Enabled hash types' bitmask is specified by the device.

I think the spec needs amendment.

I wonder if there are any people interested in the feature though. 
Looking at virtnet_set_hashflow() in drivers/net/virtio_net.c, the 
driver of Linux does not let users configure HASH_TYPE_XXX_EX. I suppose 
Windows supports HASH_TYPE_XXX_EX, but those who care network 
performance most would use Linux so HASH_TYPE_XXX_EX support without 
Linux driver's support may not be useful.

> 
>>
>>> 2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the
>>> tun_vnet_ioctl_sethash(), so userspace may set
>>> VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by
>>> tun_vnet_ioctl_gethashtypes(). In this case they won't get
>>> VIRTIO_NET_HASH_TYPE_TCP_EX.
>>
>> That's right. It's the responsibility of the userspace to set only the
>> supported hash types.
> 
> Well, the kernel should filter out the unsupported one to have a
> robust uAPI. Otherwise, we give green light to the buggy userspace
> which will have unexpected results.

My reasoning was that it may be fine for some use cases other than VM 
(e.g., DPDK); in such a use case, it is fine as long as the UAPI works 
in the best-effort basis.

For example, suppose a userspace program that processes TCP packets; the 
program can enable: HASH_TYPE_IPv4, HASH_TYPE_TCPv4, HASH_TYPE_IPv6, and 
HASH_TYPE_TCPv6. Ideally, the kernel should support all the hash types, 
but, even if e.g., HASH_TYPE_TCPv6 is not available, it will fall back 
to HASH_TYPE_IPv6, which still does something good and may be acceptable.

That said, for a use case that involves VM and implements virtio-net 
(e.g., QEMU), setting an unsupported hash type here is definitely a bug. 
Catching the bug may outweigh the extra trouble for other use cases.

> 
>>
>>> 3) implementing part of the hash types might complicate the migration
>>> or at least we need to describe the expectations of libvirt or other
>>> management in this case. For example, do we plan to have a dedicated
>>> Qemu command line like:
>>>
>>> -device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z?
>>
>> I posted a patch series to implement such a command line for vDPA[1].
>> The patch series that wires this tuntap feature up[2] reuses the
>> infrastructure so it doesn't bring additional complexity.
>>
>> [1]
>> https://lore.kernel.org/qemu-devel/20250530-vdpa-v1-0-5af4109b1c19@daynix.com/
>> [2]
>> https://lore.kernel.org/qemu-devel/20250530-hash-v5-0-343d7d7a8200@daynix.com/
> 
> I meant, if we implement a full hash report feature, it means a single
> hash cmdline option is more than sufficient and so compatibility code
> can just turn it off when dealing with machine types. This is much
> more simpler than
> 
> 1) having both hash as well as supported_hash_features
> 2) dealing both hash as well as supported_hash_features in compatibility codes
> 3) libvirt will be happy
> 
> For [1], it seems it introduces a per has type option, this seems to
> be a burden to the management layer as it need to learn new option
> everytime a new hash type is supported

Even with the command line you proposed (supported_hash_types=X,Y,Z), it 
is still necessary to know the values the supported_hash_types property 
accepts (X.Y,Z), so I don't think it makes difference.

The burden to the management layer is already present for features, so 
it is an existing problem (or its mere extension).

This problem was discussed in the following thread in the past, but no 
solution is implemented yet, and probably solving it will be difficult.
https://lore.kernel.org/qemu-devel/20230731223148.1002258-5-yuri.benditovich@daynix.com/

Regards,
Akihiko Odaki
Jason Wang June 6, 2025, 12:48 a.m. UTC | #4
On Thu, Jun 5, 2025 at 3:58 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2025/06/05 10:53, Jason Wang wrote:
> > On Wed, Jun 4, 2025 at 3:20 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>
> >> On 2025/06/04 10:18, Jason Wang wrote:
> >>> On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>
> >>>> On 2025/06/03 12:19, Jason Wang wrote:
> >>>>> On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>
> >>>>>> They are useful to implement VIRTIO_NET_F_RSS and
> >>>>>> VIRTIO_NET_F_HASH_REPORT.
> >>>>>>
> >>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> >>>>>> Tested-by: Lei Yang <leiyang@redhat.com>
> >>>>>> ---
> >>>>>>     include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>     1 file changed, 188 insertions(+)
> >>>>>>
> >>>>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
> >>>>>> index 02a9f4dc594d..426f33b4b824 100644
> >>>>>> --- a/include/linux/virtio_net.h
> >>>>>> +++ b/include/linux/virtio_net.h
> >>>>>> @@ -9,6 +9,194 @@
> >>>>>>     #include <uapi/linux/tcp.h>
> >>>>>>     #include <uapi/linux/virtio_net.h>
> >>>>>>
> >>>>>> +struct virtio_net_hash {
> >>>>>> +       u32 value;
> >>>>>> +       u16 report;
> >>>>>> +};
> >>>>>> +
> >>>>>> +struct virtio_net_toeplitz_state {
> >>>>>> +       u32 hash;
> >>>>>> +       const u32 *key;
> >>>>>> +};
> >>>>>> +
> >>>>>> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
> >>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
> >>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
> >>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
> >>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
> >>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
> >>>>>> +
> >>>>>> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
> >>>>>> +
> >>>>>> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
> >>>>>> +{
> >>>>>> +       while (len >= sizeof(*input)) {
> >>>>>> +               *input = be32_to_cpu((__force __be32)*input);
> >>>>>> +               input++;
> >>>>>> +               len -= sizeof(*input);
> >>>>>> +       }
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
> >>>>>> +                                           const __be32 *input, size_t len)
> >>>>>> +{
> >>>>>> +       while (len >= sizeof(*input)) {
> >>>>>> +               for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
> >>>>>> +                       u32 i = ffs(map);
> >>>>>> +
> >>>>>> +                       state->hash ^= state->key[0] << (32 - i) |
> >>>>>> +                                      (u32)((u64)state->key[1] >> i);
> >>>>>> +               }
> >>>>>> +
> >>>>>> +               state->key++;
> >>>>>> +               input++;
> >>>>>> +               len -= sizeof(*input);
> >>>>>> +       }
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline u8 virtio_net_hash_key_length(u32 types)
> >>>>>> +{
> >>>>>> +       size_t len = 0;
> >>>>>> +
> >>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv4)
> >>>>>> +               len = max(len,
> >>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs));
> >>>>>> +
> >>>>>> +       if (types &
> >>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
> >>>>>> +               len = max(len,
> >>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs) +
> >>>>>> +                         sizeof(struct flow_dissector_key_ports));
> >>>>>> +
> >>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv6)
> >>>>>> +               len = max(len,
> >>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs));
> >>>>>> +
> >>>>>> +       if (types &
> >>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
> >>>>>> +               len = max(len,
> >>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs) +
> >>>>>> +                         sizeof(struct flow_dissector_key_ports));
> >>>>>> +
> >>>>>> +       return len + sizeof(u32);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline u32 virtio_net_hash_report(u32 types,
> >>>>>> +                                        const struct flow_keys_basic *keys)
> >>>>>> +{
> >>>>>> +       switch (keys->basic.n_proto) {
> >>>>>> +       case cpu_to_be16(ETH_P_IP):
> >>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
> >>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv4;
> >>>>>> +
> >>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
> >>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv4;
> >>>>>> +               }
> >>>>>> +
> >>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
> >>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv4;
> >>>>>> +
> >>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>> +
> >>>>>> +       case cpu_to_be16(ETH_P_IPV6):
> >>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
> >>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
> >>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
> >>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv6;
> >>>>>> +
> >>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
> >>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
> >>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv6;
> >>>>>> +               }
> >>>>>> +
> >>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
> >>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv6;
> >>>>>> +
> >>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>> +
> >>>>>> +       default:
> >>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>> +       }
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline void virtio_net_hash_rss(const struct sk_buff *skb,
> >>>>>> +                                      u32 types, const u32 *key,
> >>>>>> +                                      struct virtio_net_hash *hash)
> >>>>>> +{
> >>>>>> +       struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
> >>>>>> +       struct flow_keys flow;
> >>>>>> +       struct flow_keys_basic flow_basic;
> >>>>>> +       u16 report;
> >>>>>> +
> >>>>>> +       if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
> >>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>> +               return;
> >>>>>> +       }
> >>>>>> +
> >>>>>> +       flow_basic = (struct flow_keys_basic) {
> >>>>>> +               .control = flow.control,
> >>>>>> +               .basic = flow.basic
> >>>>>> +       };
> >>>>>> +
> >>>>>> +       report = virtio_net_hash_report(types, &flow_basic);
> >>>>>> +
> >>>>>> +       switch (report) {
> >>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv4:
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>>>> +               break;
> >>>>>> +
> >>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv4:
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>> +               break;
> >>>>>> +
> >>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv4:
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
> >>>>>> +                                        sizeof(flow.addrs.v4addrs));
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>> +               break;
> >>>>>> +
> >>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv6:
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>>>> +               break;
> >>>>>> +
> >>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv6:
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>> +               break;
> >>>>>> +
> >>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv6:
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
> >>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
> >>>>>> +                                        sizeof(flow.addrs.v6addrs));
> >>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
> >>>>>> +                                        sizeof(flow.ports.ports));
> >>>>>> +               break;
> >>>>>> +
> >>>>>> +       default:
> >>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
> >>>>>> +               return;
> >>>>>
> >>>>> So I still think we need a comment here to explain why this is not an
> >>>>> issue if the device can report HASH_XXX_EX. Or we need to add the
> >>>>> support, since this is the code from the driver side, I don't think we
> >>>>> need to worry about the device implementation issues.
> >>>>
> >>>> This is on the device side, and don't report HASH_TYPE_XXX_EX.
> >>>>
> >>>>>
> >>>>> For the issue of the number of options, does the spec forbid fallback
> >>>>> to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that.
> >>>>
> >>>> 5.1.6.4.3.4 "IPv6 packets with extension header" says:
> >>>>    > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
> >>>>    > header, the hash is calculated over the following fields:
> >>>>    > - Home address from the home address option in the IPv6 destination
> >>>>    >   options header. If the extension header is not present, use the
> >>>>    >   Source IPv6 address.
> >>>>    > - IPv6 address that is contained in the Routing-Header-Type-2 from the
> >>>>    >   associated extension header. If the extension header is not present,
> >>>>    >   use the Destination IPv6 address.
> >>>>    > - Source TCP port
> >>>>    > - Destination TCP port
> >>>>
> >>>> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6
> >>>> and an home address option in the IPv6 destination options header is
> >>>> present, the hash is calculated over the home address. If the hash is
> >>>> not calculated over the home address in such a case, the device is
> >>>> contradicting with this section and violating the spec. The same goes
> >>>> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2.
> >>>
> >>> Just to make sure we are one the same page. I meant:
> >>>
> >>> 1) If the hash is not calculated over the home address (in the case of
> >>> IPv6 destination destination), it can still report
> >>> VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your
> >>> series. So the device can simply fallback to e.g TCPv6 if it can't
> >>> understand all or part of the IPv6 options.
> >>
> >> The spec says it can fallback if "the extension header is not present",
> >> not if the device can't understand the extension header.
> >
> > I don't think so,
> >
> > 1) spec had a condition beforehand:
> >
> > """
> > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
> > header, the hash is calculated over the following fields:
> > ...
> > If the extension header is not present ...
> > """
> >
> > So the device can choose not to set VIRTIO_NET_HASH_TYPE_TCP_EX as
> > spec doesn't say device MUST set VIRTIO_NET_HASH_TYPE_TCP_EX if ...
> >
> > 2) implementation wise, since device has limited resources, we can't
> > expect the device can parse arbitrary number of ipv6 options
> >
> > 3) if 1) and 2) not the case, we need fix the spec otherwise implement
> > a spec compliant device is impractical
>
> The statement is preceded by the following:
>  >  The device calculates the hash on IPv4 packets according to
>  > ’Enabled hash types’ bitmask as follows:
>
> The 'Enabled hash types' bitmask is specified by the device.
>
> I think the spec needs amendment.

Michael, can you help to clarify here?

>
> I wonder if there are any people interested in the feature though.
> Looking at virtnet_set_hashflow() in drivers/net/virtio_net.c, the
> driver of Linux does not let users configure HASH_TYPE_XXX_EX. I suppose
> Windows supports HASH_TYPE_XXX_EX, but those who care network
> performance most would use Linux so HASH_TYPE_XXX_EX support without
> Linux driver's support may not be useful.

It might be still interesting for example for the hardware virtio
vendors to support windows etc.

>
> >
> >>
> >>> 2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the
> >>> tun_vnet_ioctl_sethash(), so userspace may set
> >>> VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by
> >>> tun_vnet_ioctl_gethashtypes(). In this case they won't get
> >>> VIRTIO_NET_HASH_TYPE_TCP_EX.
> >>
> >> That's right. It's the responsibility of the userspace to set only the
> >> supported hash types.
> >
> > Well, the kernel should filter out the unsupported one to have a
> > robust uAPI. Otherwise, we give green light to the buggy userspace
> > which will have unexpected results.
>
> My reasoning was that it may be fine for some use cases other than VM
> (e.g., DPDK); in such a use case, it is fine as long as the UAPI works
> in the best-effort basis.

Best-effort might increase the chance for user visisable changes after
migration.

>
> For example, suppose a userspace program that processes TCP packets; the
> program can enable: HASH_TYPE_IPv4, HASH_TYPE_TCPv4, HASH_TYPE_IPv6, and
> HASH_TYPE_TCPv6. Ideally, the kernel should support all the hash types,
> but, even if e.g., HASH_TYPE_TCPv6 is not available,

For "available" did you mean it is not supported by the device?

> it will fall back
> to HASH_TYPE_IPv6, which still does something good and may be acceptable.

This fallback is exactly the same as I said above, let
VIRTIO_NET_HASH_TYPE_TCP_EX to fallback.

My point is that, the implementation should either:

1) allow fallback so it can claim to support all hash types

or

2) don't allow fallback so it can only support a part of the hash types

If we're doing something in the middle, for example, allow part of the
type to fallback.

>
> That said, for a use case that involves VM and implements virtio-net
> (e.g., QEMU), setting an unsupported hash type here is definitely a bug.
> Catching the bug may outweigh the extra trouble for other use cases.
>
> >
> >>
> >>> 3) implementing part of the hash types might complicate the migration
> >>> or at least we need to describe the expectations of libvirt or other
> >>> management in this case. For example, do we plan to have a dedicated
> >>> Qemu command line like:
> >>>
> >>> -device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z?
> >>
> >> I posted a patch series to implement such a command line for vDPA[1].
> >> The patch series that wires this tuntap feature up[2] reuses the
> >> infrastructure so it doesn't bring additional complexity.
> >>
> >> [1]
> >> https://lore.kernel.org/qemu-devel/20250530-vdpa-v1-0-5af4109b1c19@daynix.com/
> >> [2]
> >> https://lore.kernel.org/qemu-devel/20250530-hash-v5-0-343d7d7a8200@daynix.com/
> >
> > I meant, if we implement a full hash report feature, it means a single
> > hash cmdline option is more than sufficient and so compatibility code
> > can just turn it off when dealing with machine types. This is much
> > more simpler than
> >
> > 1) having both hash as well as supported_hash_features
> > 2) dealing both hash as well as supported_hash_features in compatibility codes
> > 3) libvirt will be happy
> >
> > For [1], it seems it introduces a per has type option, this seems to
> > be a burden to the management layer as it need to learn new option
> > everytime a new hash type is supported
>
> Even with the command line you proposed (supported_hash_types=X,Y,Z), it
> is still necessary to know the values the supported_hash_types property
> accepts (X.Y,Z), so I don't think it makes difference.

It could be a uint32_t.

>
> The burden to the management layer is already present for features, so
> it is an existing problem (or its mere extension).

Yes, but since this feature is new it's better to try our best to avoid that.

>
> This problem was discussed in the following thread in the past, but no
> solution is implemented yet, and probably solving it will be difficult.
> https://lore.kernel.org/qemu-devel/20230731223148.1002258-5-yuri.benditovich@daynix.com/

It's a similar issue but not the same, it looks more like a discussion
on whether the fallback from vhost-net to qemu works for missing
features etc.

Thanks

>
> Regards,
> Akihiko Odaki
>
Akihiko Odaki June 6, 2025, 9:10 a.m. UTC | #5
On 2025/06/06 9:48, Jason Wang wrote:
> On Thu, Jun 5, 2025 at 3:58 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> On 2025/06/05 10:53, Jason Wang wrote:
>>> On Wed, Jun 4, 2025 at 3:20 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>
>>>> On 2025/06/04 10:18, Jason Wang wrote:
>>>>> On Tue, Jun 3, 2025 at 1:31 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>
>>>>>> On 2025/06/03 12:19, Jason Wang wrote:
>>>>>>> On Fri, May 30, 2025 at 12:50 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>>>
>>>>>>>> They are useful to implement VIRTIO_NET_F_RSS and
>>>>>>>> VIRTIO_NET_F_HASH_REPORT.
>>>>>>>>
>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com>
>>>>>>>> ---
>>>>>>>>      include/linux/virtio_net.h | 188 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>      1 file changed, 188 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
>>>>>>>> index 02a9f4dc594d..426f33b4b824 100644
>>>>>>>> --- a/include/linux/virtio_net.h
>>>>>>>> +++ b/include/linux/virtio_net.h
>>>>>>>> @@ -9,6 +9,194 @@
>>>>>>>>      #include <uapi/linux/tcp.h>
>>>>>>>>      #include <uapi/linux/virtio_net.h>
>>>>>>>>
>>>>>>>> +struct virtio_net_hash {
>>>>>>>> +       u32 value;
>>>>>>>> +       u16 report;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +struct virtio_net_toeplitz_state {
>>>>>>>> +       u32 hash;
>>>>>>>> +       const u32 *key;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
>>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
>>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
>>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
>>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
>>>>>>>> +                                        VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
>>>>>>>> +
>>>>>>>> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
>>>>>>>> +
>>>>>>>> +static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
>>>>>>>> +{
>>>>>>>> +       while (len >= sizeof(*input)) {
>>>>>>>> +               *input = be32_to_cpu((__force __be32)*input);
>>>>>>>> +               input++;
>>>>>>>> +               len -= sizeof(*input);
>>>>>>>> +       }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
>>>>>>>> +                                           const __be32 *input, size_t len)
>>>>>>>> +{
>>>>>>>> +       while (len >= sizeof(*input)) {
>>>>>>>> +               for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
>>>>>>>> +                       u32 i = ffs(map);
>>>>>>>> +
>>>>>>>> +                       state->hash ^= state->key[0] << (32 - i) |
>>>>>>>> +                                      (u32)((u64)state->key[1] >> i);
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>> +               state->key++;
>>>>>>>> +               input++;
>>>>>>>> +               len -= sizeof(*input);
>>>>>>>> +       }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline u8 virtio_net_hash_key_length(u32 types)
>>>>>>>> +{
>>>>>>>> +       size_t len = 0;
>>>>>>>> +
>>>>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv4)
>>>>>>>> +               len = max(len,
>>>>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs));
>>>>>>>> +
>>>>>>>> +       if (types &
>>>>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
>>>>>>>> +               len = max(len,
>>>>>>>> +                         sizeof(struct flow_dissector_key_ipv4_addrs) +
>>>>>>>> +                         sizeof(struct flow_dissector_key_ports));
>>>>>>>> +
>>>>>>>> +       if (types & VIRTIO_NET_HASH_REPORT_IPv6)
>>>>>>>> +               len = max(len,
>>>>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs));
>>>>>>>> +
>>>>>>>> +       if (types &
>>>>>>>> +           (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
>>>>>>>> +               len = max(len,
>>>>>>>> +                         sizeof(struct flow_dissector_key_ipv6_addrs) +
>>>>>>>> +                         sizeof(struct flow_dissector_key_ports));
>>>>>>>> +
>>>>>>>> +       return len + sizeof(u32);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline u32 virtio_net_hash_report(u32 types,
>>>>>>>> +                                        const struct flow_keys_basic *keys)
>>>>>>>> +{
>>>>>>>> +       switch (keys->basic.n_proto) {
>>>>>>>> +       case cpu_to_be16(ETH_P_IP):
>>>>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
>>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
>>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
>>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv4;
>>>>>>>> +
>>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
>>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
>>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv4;
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
>>>>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv4;
>>>>>>>> +
>>>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
>>>>>>>> +
>>>>>>>> +       case cpu_to_be16(ETH_P_IPV6):
>>>>>>>> +               if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
>>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_TCP &&
>>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
>>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_TCPv6;
>>>>>>>> +
>>>>>>>> +                       if (keys->basic.ip_proto == IPPROTO_UDP &&
>>>>>>>> +                           (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
>>>>>>>> +                               return VIRTIO_NET_HASH_REPORT_UDPv6;
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>> +               if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
>>>>>>>> +                       return VIRTIO_NET_HASH_REPORT_IPv6;
>>>>>>>> +
>>>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
>>>>>>>> +
>>>>>>>> +       default:
>>>>>>>> +               return VIRTIO_NET_HASH_REPORT_NONE;
>>>>>>>> +       }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline void virtio_net_hash_rss(const struct sk_buff *skb,
>>>>>>>> +                                      u32 types, const u32 *key,
>>>>>>>> +                                      struct virtio_net_hash *hash)
>>>>>>>> +{
>>>>>>>> +       struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
>>>>>>>> +       struct flow_keys flow;
>>>>>>>> +       struct flow_keys_basic flow_basic;
>>>>>>>> +       u16 report;
>>>>>>>> +
>>>>>>>> +       if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
>>>>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
>>>>>>>> +               return;
>>>>>>>> +       }
>>>>>>>> +
>>>>>>>> +       flow_basic = (struct flow_keys_basic) {
>>>>>>>> +               .control = flow.control,
>>>>>>>> +               .basic = flow.basic
>>>>>>>> +       };
>>>>>>>> +
>>>>>>>> +       report = virtio_net_hash_report(types, &flow_basic);
>>>>>>>> +
>>>>>>>> +       switch (report) {
>>>>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv4:
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
>>>>>>>> +                                        sizeof(flow.addrs.v4addrs));
>>>>>>>> +               break;
>>>>>>>> +
>>>>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv4:
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
>>>>>>>> +                                        sizeof(flow.addrs.v4addrs));
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>>>> +               break;
>>>>>>>> +
>>>>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv4:
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>>>> +                                        (__be32 *)&flow.addrs.v4addrs,
>>>>>>>> +                                        sizeof(flow.addrs.v4addrs));
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>>>> +               break;
>>>>>>>> +
>>>>>>>> +       case VIRTIO_NET_HASH_REPORT_IPv6:
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
>>>>>>>> +                                        sizeof(flow.addrs.v6addrs));
>>>>>>>> +               break;
>>>>>>>> +
>>>>>>>> +       case VIRTIO_NET_HASH_REPORT_TCPv6:
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
>>>>>>>> +                                        sizeof(flow.addrs.v6addrs));
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>>>> +               break;
>>>>>>>> +
>>>>>>>> +       case VIRTIO_NET_HASH_REPORT_UDPv6:
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state,
>>>>>>>> +                                        (__be32 *)&flow.addrs.v6addrs,
>>>>>>>> +                                        sizeof(flow.addrs.v6addrs));
>>>>>>>> +               virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
>>>>>>>> +                                        sizeof(flow.ports.ports));
>>>>>>>> +               break;
>>>>>>>> +
>>>>>>>> +       default:
>>>>>>>> +               hash->report = VIRTIO_NET_HASH_REPORT_NONE;
>>>>>>>> +               return;
>>>>>>>
>>>>>>> So I still think we need a comment here to explain why this is not an
>>>>>>> issue if the device can report HASH_XXX_EX. Or we need to add the
>>>>>>> support, since this is the code from the driver side, I don't think we
>>>>>>> need to worry about the device implementation issues.
>>>>>>
>>>>>> This is on the device side, and don't report HASH_TYPE_XXX_EX.
>>>>>>
>>>>>>>
>>>>>>> For the issue of the number of options, does the spec forbid fallback
>>>>>>> to VIRTIO_NET_HASH_REPORT_NONE? If not, we can do that.
>>>>>>
>>>>>> 5.1.6.4.3.4 "IPv6 packets with extension header" says:
>>>>>>     > If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
>>>>>>     > header, the hash is calculated over the following fields:
>>>>>>     > - Home address from the home address option in the IPv6 destination
>>>>>>     >   options header. If the extension header is not present, use the
>>>>>>     >   Source IPv6 address.
>>>>>>     > - IPv6 address that is contained in the Routing-Header-Type-2 from the
>>>>>>     >   associated extension header. If the extension header is not present,
>>>>>>     >   use the Destination IPv6 address.
>>>>>>     > - Source TCP port
>>>>>>     > - Destination TCP port
>>>>>>
>>>>>> Therefore, if VIRTIO_NET_HASH_TYPE_TCP_EX is set, the packet has a TCPv6
>>>>>> and an home address option in the IPv6 destination options header is
>>>>>> present, the hash is calculated over the home address. If the hash is
>>>>>> not calculated over the home address in such a case, the device is
>>>>>> contradicting with this section and violating the spec. The same goes
>>>>>> for the other HASH_TYPE_XXX_EX types and Routing-Header-Type-2.
>>>>>
>>>>> Just to make sure we are one the same page. I meant:
>>>>>
>>>>> 1) If the hash is not calculated over the home address (in the case of
>>>>> IPv6 destination destination), it can still report
>>>>> VIRTIO_NET_RSS_HASH_TYPE_IPv6. This is what you implemented in your
>>>>> series. So the device can simply fallback to e.g TCPv6 if it can't
>>>>> understand all or part of the IPv6 options.
>>>>
>>>> The spec says it can fallback if "the extension header is not present",
>>>> not if the device can't understand the extension header.
>>>
>>> I don't think so,
>>>
>>> 1) spec had a condition beforehand:
>>>
>>> """
>>> If VIRTIO_NET_HASH_TYPE_TCP_EX is set and the packet has a TCPv6
>>> header, the hash is calculated over the following fields:
>>> ...
>>> If the extension header is not present ...
>>> """
>>>
>>> So the device can choose not to set VIRTIO_NET_HASH_TYPE_TCP_EX as
>>> spec doesn't say device MUST set VIRTIO_NET_HASH_TYPE_TCP_EX if ...
>>>
>>> 2) implementation wise, since device has limited resources, we can't
>>> expect the device can parse arbitrary number of ipv6 options
>>>
>>> 3) if 1) and 2) not the case, we need fix the spec otherwise implement
>>> a spec compliant device is impractical
>>
>> The statement is preceded by the following:
>>   >  The device calculates the hash on IPv4 packets according to
>>   > ’Enabled hash types’ bitmask as follows:
>>
>> The 'Enabled hash types' bitmask is specified by the device.
>>
>> I think the spec needs amendment.
> 
> Michael, can you help to clarify here?
> 
>>
>> I wonder if there are any people interested in the feature though.
>> Looking at virtnet_set_hashflow() in drivers/net/virtio_net.c, the
>> driver of Linux does not let users configure HASH_TYPE_XXX_EX. I suppose
>> Windows supports HASH_TYPE_XXX_EX, but those who care network
>> performance most would use Linux so HASH_TYPE_XXX_EX support without
>> Linux driver's support may not be useful.
> 
> It might be still interesting for example for the hardware virtio
> vendors to support windows etc.

I don't know if Windows needs them for e.g., device/driver certification 
so surveying Windows makes sense.

> 
>>
>>>
>>>>
>>>>> 2) the VIRTIO_NET_SUPPORTED_HASH_TYPES is not checked against the
>>>>> tun_vnet_ioctl_sethash(), so userspace may set
>>>>> VIRTIO_NET_HASH_TYPE_TCP_EX regardless of what has been returned by
>>>>> tun_vnet_ioctl_gethashtypes(). In this case they won't get
>>>>> VIRTIO_NET_HASH_TYPE_TCP_EX.
>>>>
>>>> That's right. It's the responsibility of the userspace to set only the
>>>> supported hash types.
>>>
>>> Well, the kernel should filter out the unsupported one to have a
>>> robust uAPI. Otherwise, we give green light to the buggy userspace
>>> which will have unexpected results.
>>
>> My reasoning was that it may be fine for some use cases other than VM
>> (e.g., DPDK); in such a use case, it is fine as long as the UAPI works
>> in the best-effort basis.
> 
> Best-effort might increase the chance for user visisable changes after
> migration.

It is a trade-off between catching a migration bug for VMM and making 
life a bit easier for userspace programs other than VMM.

> 
>>
>> For example, suppose a userspace program that processes TCP packets; the
>> program can enable: HASH_TYPE_IPv4, HASH_TYPE_TCPv4, HASH_TYPE_IPv6, and
>> HASH_TYPE_TCPv6. Ideally, the kernel should support all the hash types,
>> but, even if e.g., HASH_TYPE_TCPv6 is not available,
> 
> For "available" did you mean it is not supported by the device?
> 
>> it will fall back
>> to HASH_TYPE_IPv6, which still does something good and may be acceptable.
> 
> This fallback is exactly the same as I said above, let
> VIRTIO_NET_HASH_TYPE_TCP_EX to fallback.
> 
> My point is that, the implementation should either:
> 
> 1) allow fallback so it can claim to support all hash types
> 
> or
> 
> 2) don't allow fallback so it can only support a part of the hash types
> 
> If we're doing something in the middle, for example, allow part of the
> type to fallback.

1) or the middle will make it unsuitable for VM because it violates the 
virtio spec. 2) makes sense though the trade-off I mentioned should be 
taken into consideration.

> 
>>
>> That said, for a use case that involves VM and implements virtio-net
>> (e.g., QEMU), setting an unsupported hash type here is definitely a bug.
>> Catching the bug may outweigh the extra trouble for other use cases.
>>
>>>
>>>>
>>>>> 3) implementing part of the hash types might complicate the migration
>>>>> or at least we need to describe the expectations of libvirt or other
>>>>> management in this case. For example, do we plan to have a dedicated
>>>>> Qemu command line like:
>>>>>
>>>>> -device virtio-net-pci,hash_report=on,supported_hash_types=X,Y,Z?
>>>>
>>>> I posted a patch series to implement such a command line for vDPA[1].
>>>> The patch series that wires this tuntap feature up[2] reuses the
>>>> infrastructure so it doesn't bring additional complexity.
>>>>
>>>> [1]
>>>> https://lore.kernel.org/qemu-devel/20250530-vdpa-v1-0-5af4109b1c19@daynix.com/
>>>> [2]
>>>> https://lore.kernel.org/qemu-devel/20250530-hash-v5-0-343d7d7a8200@daynix.com/
>>>
>>> I meant, if we implement a full hash report feature, it means a single
>>> hash cmdline option is more than sufficient and so compatibility code
>>> can just turn it off when dealing with machine types. This is much
>>> more simpler than
>>>
>>> 1) having both hash as well as supported_hash_features
>>> 2) dealing both hash as well as supported_hash_features in compatibility codes
>>> 3) libvirt will be happy
>>>
>>> For [1], it seems it introduces a per has type option, this seems to
>>> be a burden to the management layer as it need to learn new option
>>> everytime a new hash type is supported
>>
>> Even with the command line you proposed (supported_hash_types=X,Y,Z), it
>> is still necessary to know the values the supported_hash_types property
>> accepts (X.Y,Z), so I don't think it makes difference.
> 
> It could be a uint32_t.

The management layer will need to know what bits are accepted even with 
uint32_t.

> 
>>
>> The burden to the management layer is already present for features, so
>> it is an existing problem (or its mere extension).
> 
> Yes, but since this feature is new it's better to try our best to avoid that.
> 
>>
>> This problem was discussed in the following thread in the past, but no
>> solution is implemented yet, and probably solving it will be difficult.
>> https://lore.kernel.org/qemu-devel/20230731223148.1002258-5-yuri.benditovich@daynix.com/
> 
> It's a similar issue but not the same, it looks more like a discussion
> on whether the fallback from vhost-net to qemu works for missing
> features etc.

Perhaps we may be able to do better since this feature is new as you say 
and we don't have to worry much about breaking change. I don't have an 
idea for that yet.

Regards,
Akihiko Odaki
diff mbox series

Patch

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 02a9f4dc594d..426f33b4b824 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -9,6 +9,194 @@ 
 #include <uapi/linux/tcp.h>
 #include <uapi/linux/virtio_net.h>
 
+struct virtio_net_hash {
+	u32 value;
+	u16 report;
+};
+
+struct virtio_net_toeplitz_state {
+	u32 hash;
+	const u32 *key;
+};
+
+#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
+					 VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
+					 VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
+					 VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
+					 VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
+					 VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
+
+#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
+
+static inline void virtio_net_toeplitz_convert_key(u32 *input, size_t len)
+{
+	while (len >= sizeof(*input)) {
+		*input = be32_to_cpu((__force __be32)*input);
+		input++;
+		len -= sizeof(*input);
+	}
+}
+
+static inline void virtio_net_toeplitz_calc(struct virtio_net_toeplitz_state *state,
+					    const __be32 *input, size_t len)
+{
+	while (len >= sizeof(*input)) {
+		for (u32 map = be32_to_cpu(*input); map; map &= (map - 1)) {
+			u32 i = ffs(map);
+
+			state->hash ^= state->key[0] << (32 - i) |
+				       (u32)((u64)state->key[1] >> i);
+		}
+
+		state->key++;
+		input++;
+		len -= sizeof(*input);
+	}
+}
+
+static inline u8 virtio_net_hash_key_length(u32 types)
+{
+	size_t len = 0;
+
+	if (types & VIRTIO_NET_HASH_REPORT_IPv4)
+		len = max(len,
+			  sizeof(struct flow_dissector_key_ipv4_addrs));
+
+	if (types &
+	    (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
+		len = max(len,
+			  sizeof(struct flow_dissector_key_ipv4_addrs) +
+			  sizeof(struct flow_dissector_key_ports));
+
+	if (types & VIRTIO_NET_HASH_REPORT_IPv6)
+		len = max(len,
+			  sizeof(struct flow_dissector_key_ipv6_addrs));
+
+	if (types &
+	    (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
+		len = max(len,
+			  sizeof(struct flow_dissector_key_ipv6_addrs) +
+			  sizeof(struct flow_dissector_key_ports));
+
+	return len + sizeof(u32);
+}
+
+static inline u32 virtio_net_hash_report(u32 types,
+					 const struct flow_keys_basic *keys)
+{
+	switch (keys->basic.n_proto) {
+	case cpu_to_be16(ETH_P_IP):
+		if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
+			if (keys->basic.ip_proto == IPPROTO_TCP &&
+			    (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
+				return VIRTIO_NET_HASH_REPORT_TCPv4;
+
+			if (keys->basic.ip_proto == IPPROTO_UDP &&
+			    (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
+				return VIRTIO_NET_HASH_REPORT_UDPv4;
+		}
+
+		if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
+			return VIRTIO_NET_HASH_REPORT_IPv4;
+
+		return VIRTIO_NET_HASH_REPORT_NONE;
+
+	case cpu_to_be16(ETH_P_IPV6):
+		if (!(keys->control.flags & FLOW_DIS_IS_FRAGMENT)) {
+			if (keys->basic.ip_proto == IPPROTO_TCP &&
+			    (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
+				return VIRTIO_NET_HASH_REPORT_TCPv6;
+
+			if (keys->basic.ip_proto == IPPROTO_UDP &&
+			    (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
+				return VIRTIO_NET_HASH_REPORT_UDPv6;
+		}
+
+		if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
+			return VIRTIO_NET_HASH_REPORT_IPv6;
+
+		return VIRTIO_NET_HASH_REPORT_NONE;
+
+	default:
+		return VIRTIO_NET_HASH_REPORT_NONE;
+	}
+}
+
+static inline void virtio_net_hash_rss(const struct sk_buff *skb,
+				       u32 types, const u32 *key,
+				       struct virtio_net_hash *hash)
+{
+	struct virtio_net_toeplitz_state toeplitz_state = { .key = key };
+	struct flow_keys flow;
+	struct flow_keys_basic flow_basic;
+	u16 report;
+
+	if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) {
+		hash->report = VIRTIO_NET_HASH_REPORT_NONE;
+		return;
+	}
+
+	flow_basic = (struct flow_keys_basic) {
+		.control = flow.control,
+		.basic = flow.basic
+	};
+
+	report = virtio_net_hash_report(types, &flow_basic);
+
+	switch (report) {
+	case VIRTIO_NET_HASH_REPORT_IPv4:
+		virtio_net_toeplitz_calc(&toeplitz_state,
+					 (__be32 *)&flow.addrs.v4addrs,
+					 sizeof(flow.addrs.v4addrs));
+		break;
+
+	case VIRTIO_NET_HASH_REPORT_TCPv4:
+		virtio_net_toeplitz_calc(&toeplitz_state,
+					 (__be32 *)&flow.addrs.v4addrs,
+					 sizeof(flow.addrs.v4addrs));
+		virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
+					 sizeof(flow.ports.ports));
+		break;
+
+	case VIRTIO_NET_HASH_REPORT_UDPv4:
+		virtio_net_toeplitz_calc(&toeplitz_state,
+					 (__be32 *)&flow.addrs.v4addrs,
+					 sizeof(flow.addrs.v4addrs));
+		virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
+					 sizeof(flow.ports.ports));
+		break;
+
+	case VIRTIO_NET_HASH_REPORT_IPv6:
+		virtio_net_toeplitz_calc(&toeplitz_state,
+					 (__be32 *)&flow.addrs.v6addrs,
+					 sizeof(flow.addrs.v6addrs));
+		break;
+
+	case VIRTIO_NET_HASH_REPORT_TCPv6:
+		virtio_net_toeplitz_calc(&toeplitz_state,
+					 (__be32 *)&flow.addrs.v6addrs,
+					 sizeof(flow.addrs.v6addrs));
+		virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
+					 sizeof(flow.ports.ports));
+		break;
+
+	case VIRTIO_NET_HASH_REPORT_UDPv6:
+		virtio_net_toeplitz_calc(&toeplitz_state,
+					 (__be32 *)&flow.addrs.v6addrs,
+					 sizeof(flow.addrs.v6addrs));
+		virtio_net_toeplitz_calc(&toeplitz_state, &flow.ports.ports,
+					 sizeof(flow.ports.ports));
+		break;
+
+	default:
+		hash->report = VIRTIO_NET_HASH_REPORT_NONE;
+		return;
+	}
+
+	hash->value = toeplitz_state.hash;
+	hash->report = report;
+}
+
 static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
 {
 	switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {