Message ID | 20210426170411.1789186-7-tobias@waldekranz.com |
---|---|
State | New |
Headers | show |
Series | net: bridge: Forward offloading | expand |
On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote: > Allow DSA drivers to support forward offloading from a bridge by: > > - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. > > - Recording the subordinate device of offloaded skbs in the control > buffer so that the tagger can take the appropriate action. > > Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> > --- > include/net/dsa.h | 7 +++++++ > net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- > 2 files changed, 41 insertions(+), 2 deletions(-) > > diff --git a/include/net/dsa.h b/include/net/dsa.h > index 1f9ba9889034..77d4df819299 100644 > --- a/include/net/dsa.h > +++ b/include/net/dsa.h > @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { > > struct dsa_skb_cb { > struct sk_buff *clone; > + struct net_device *sb_dev; > }; > > struct __dsa_skb_cb { > @@ -828,6 +829,12 @@ struct dsa_switch_ops { > const struct switchdev_obj_ring_role_mrp *mrp); > int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, > const struct switchdev_obj_ring_role_mrp *mrp); > + > + /* L2 forward offloading */ > + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, > + struct net_device *sb_dev); > + void (*dfwd_del_station)(struct dsa_switch *ds, int port, > + struct net_device *sb_dev); > }; > > #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ > diff --git a/net/dsa/slave.c b/net/dsa/slave.c > index 77b33bd161b8..3689ffa2dbb8 100644 > --- a/net/dsa/slave.c > +++ b/net/dsa/slave.c > @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) > return dsa_enqueue_skb(nskb, dev); > } > > +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb, > + struct net_device *sb_dev) > +{ > + DSA_SKB_CB(skb)->sb_dev = sb_dev; > + return netdev_pick_tx(dev, skb, sb_dev); > +} > + DSA_SKB_CB is going away: https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/ Let's either 
negotiate with Yangbo on keeping it, or make .ndo_select_queue a bypass towards the tagger, where it can use its own SKB_CB structure and be more flexible in general (I think I'm leaning towards the latter).
On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote: > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote: >> Allow DSA drivers to support forward offloading from a bridge by: >> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. >> >> - Recording the subordinate device of offloaded skbs in the control >> buffer so that the tagger can take the appropriate action. >> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> >> --- >> include/net/dsa.h | 7 +++++++ >> net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- >> 2 files changed, 41 insertions(+), 2 deletions(-) >> >> diff --git a/include/net/dsa.h b/include/net/dsa.h >> index 1f9ba9889034..77d4df819299 100644 >> --- a/include/net/dsa.h >> +++ b/include/net/dsa.h >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { >> >> struct dsa_skb_cb { >> struct sk_buff *clone; >> + struct net_device *sb_dev; >> }; >> >> struct __dsa_skb_cb { >> @@ -828,6 +829,12 @@ struct dsa_switch_ops { >> const struct switchdev_obj_ring_role_mrp *mrp); >> int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, >> const struct switchdev_obj_ring_role_mrp *mrp); >> + >> + /* L2 forward offloading */ >> + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, >> + struct net_device *sb_dev); >> + void (*dfwd_del_station)(struct dsa_switch *ds, int port, >> + struct net_device *sb_dev); >> }; >> >> #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c >> index 77b33bd161b8..3689ffa2dbb8 100644 >> --- a/net/dsa/slave.c >> +++ b/net/dsa/slave.c >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) >> return dsa_enqueue_skb(nskb, dev); >> } >> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb, >> + struct net_device *sb_dev) >> +{ >> + DSA_SKB_CB(skb)->sb_dev = sb_dev; >> + return netdev_pick_tx(dev, skb, sb_dev); >> +} >> + > > 
DSA_SKB_CB is going away: > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/ > > Let's either negotiate with Yangbo on keeping it, or make > .ndo_select_queue a bypass towards the tagger, where it can use its own > SKB_CB structure and be more flexible in general (I think I'm leaning > towards the latter). Thus far, Yangbo is a tough negotiator, giving me the silent treatment: https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/ :) That memset is giving me a hard time. I have just disabled it on my branch at the moment. Any ideas on how to get rid of it without breaking timestamping?
On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote: > On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote: > > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote: > >> Allow DSA drivers to support forward offloading from a bridge by: > >> > >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. > >> > >> - Recording the subordinate device of offloaded skbs in the control > >> buffer so that the tagger can take the appropriate action. > >> > >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> > >> --- > >> include/net/dsa.h | 7 +++++++ > >> net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- > >> 2 files changed, 41 insertions(+), 2 deletions(-) > >> > >> diff --git a/include/net/dsa.h b/include/net/dsa.h > >> index 1f9ba9889034..77d4df819299 100644 > >> --- a/include/net/dsa.h > >> +++ b/include/net/dsa.h > >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { > >> > >> struct dsa_skb_cb { > >> struct sk_buff *clone; > >> + struct net_device *sb_dev; > >> }; > >> > >> struct __dsa_skb_cb { > >> @@ -828,6 +829,12 @@ struct dsa_switch_ops { > >> const struct switchdev_obj_ring_role_mrp *mrp); > >> int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, > >> const struct switchdev_obj_ring_role_mrp *mrp); > >> + > >> + /* L2 forward offloading */ > >> + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, > >> + struct net_device *sb_dev); > >> + void (*dfwd_del_station)(struct dsa_switch *ds, int port, > >> + struct net_device *sb_dev); > >> }; > >> > >> #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ > >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c > >> index 77b33bd161b8..3689ffa2dbb8 100644 > >> --- a/net/dsa/slave.c > >> +++ b/net/dsa/slave.c > >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) > >> return dsa_enqueue_skb(nskb, dev); > >> } > >> > >> +static u16 dsa_slave_select_queue(struct net_device *dev, 
struct sk_buff *skb, > >> + struct net_device *sb_dev) > >> +{ > >> + DSA_SKB_CB(skb)->sb_dev = sb_dev; > >> + return netdev_pick_tx(dev, skb, sb_dev); > >> +} > >> + > > > > DSA_SKB_CB is going away: > > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/ > > > > Let's either negotiate with Yangbo on keeping it, or make > > .ndo_select_queue a bypass towards the tagger, where it can use its own > > SKB_CB structure and be more flexible in general (I think I'm leaning > > towards the latter). > > Thus far, Yangbo is a tough negotiator, giving me the silent treatment: > > https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/ > > :) > > That memset is giving me a hard time. I have just disabled it on my > branch at the moment. Any ideas on how to get rid of it without breaking > timestamping? :) Is there any guarantee written somewhere that the ownership of skb->cb belongs to the NIC driver at the time of the ndo_select_queue call? If there is, then the trivial solution is to just move the memset in ndo_select_queue. If there isn't, then we've got bigger issues (such as, for example, the qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev).
On Tue, May 04, 2021 at 18:21, Vladimir Oltean <olteanv@gmail.com> wrote: > On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote: >> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote: >> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote: >> >> Allow DSA drivers to support forward offloading from a bridge by: >> >> >> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. >> >> >> >> - Recording the subordinate device of offloaded skbs in the control >> >> buffer so that the tagger can take the appropriate action. >> >> >> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> >> >> --- >> >> include/net/dsa.h | 7 +++++++ >> >> net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- >> >> 2 files changed, 41 insertions(+), 2 deletions(-) >> >> >> >> diff --git a/include/net/dsa.h b/include/net/dsa.h >> >> index 1f9ba9889034..77d4df819299 100644 >> >> --- a/include/net/dsa.h >> >> +++ b/include/net/dsa.h >> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { >> >> >> >> struct dsa_skb_cb { >> >> struct sk_buff *clone; >> >> + struct net_device *sb_dev; >> >> }; >> >> >> >> struct __dsa_skb_cb { >> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops { >> >> const struct switchdev_obj_ring_role_mrp *mrp); >> >> int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, >> >> const struct switchdev_obj_ring_role_mrp *mrp); >> >> + >> >> + /* L2 forward offloading */ >> >> + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, >> >> + struct net_device *sb_dev); >> >> + void (*dfwd_del_station)(struct dsa_switch *ds, int port, >> >> + struct net_device *sb_dev); >> >> }; >> >> >> >> #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ >> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c >> >> index 77b33bd161b8..3689ffa2dbb8 100644 >> >> --- a/net/dsa/slave.c >> >> +++ b/net/dsa/slave.c >> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device 
*dev) >> >> return dsa_enqueue_skb(nskb, dev); >> >> } >> >> >> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb, >> >> + struct net_device *sb_dev) >> >> +{ >> >> + DSA_SKB_CB(skb)->sb_dev = sb_dev; >> >> + return netdev_pick_tx(dev, skb, sb_dev); >> >> +} >> >> + >> > >> > DSA_SKB_CB is going away: >> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/ >> > >> > Let's either negotiate with Yangbo on keeping it, or make >> > .ndo_select_queue a bypass towards the tagger, where it can use its own >> > SKB_CB structure and be more flexible in general (I think I'm leaning >> > towards the latter). >> >> Thus far, Yangbo is a tough negotiator, giving me the silent treatment: >> >> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/ >> >> :) >> >> That memset is giving me a hard time. I have just disabled it on my >> branch at the moment. Any ideas on how to get rid of it without breaking >> timestamping? > > :) > > Is there any guarantee written somewhere that the ownership of skb->cb > belongs to the NIC driver at the time of the ndo_select_queue call? > > If there is, then the trivial solution is to just move the memset in > ndo_select_queue. > > If there isn't, then we've got bigger issues (such as, for example, the > qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev). The comment says: "This is owned by whoever has the skb queued ATM." But qdisc_skb_cb is a thing as it turns out - so I think I can kiss the idea of stashing the pointer in the CB goodbye. Looking at some of the other users of .ndo_select_queue, I get the feeling that we should really: - Pre-generate a FROM_CPU tag template and store it under "TxQ 0" - Pre-generate a FORWARD tag template and store it under "TxQ 1" - Redefine tag_dsa's .ndo_select_queue to be: `return sb_dev ? 1 : 0;` - Fetch the template using skb_queue_mapping, fill in the VID, and send it. 
There is really no need to recompute the static parts of the tags on each skb. It would mean moving some knowledge of the tagging format to the driver. But that boundary is pretty artificial for mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does not work with any other tagger. I suppose you could even move the whole tagger to drivers/net/dsa/mv88e6xxx/? What do you think? Andrew?
> There is really no need to recompute the static parts of the tags on > each skb. It would mean moving some knowledge of the tagging format to > the driver. But that boundary is pretty artificial for > mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does > not work with any other tagger. I suppose you could even move the whole > tagger to drivers/net/dsa/mv88e6xxx/? > > What do you think? > > Andrew? We have resisted this before. What information do you actually need to share between the tagger and the driver? Both tag_lan9303.c and tag_ocelot_8021q.c do reference their switch driver data structures, so some sharing is allowed. But please try to keep the surface areas down. Andrew
On Tue, May 04, 2021 at 10:07:14PM +0200, Tobias Waldekranz wrote: > On Tue, May 04, 2021 at 18:21, Vladimir Oltean <olteanv@gmail.com> wrote: > > On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote: > >> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote: > >> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote: > >> >> Allow DSA drivers to support forward offloading from a bridge by: > >> >> > >> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. > >> >> > >> >> - Recording the subordinate device of offloaded skbs in the control > >> >> buffer so that the tagger can take the appropriate action. > >> >> > >> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> > >> >> --- > >> >> include/net/dsa.h | 7 +++++++ > >> >> net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- > >> >> 2 files changed, 41 insertions(+), 2 deletions(-) > >> >> > >> >> diff --git a/include/net/dsa.h b/include/net/dsa.h > >> >> index 1f9ba9889034..77d4df819299 100644 > >> >> --- a/include/net/dsa.h > >> >> +++ b/include/net/dsa.h > >> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { > >> >> > >> >> struct dsa_skb_cb { > >> >> struct sk_buff *clone; > >> >> + struct net_device *sb_dev; > >> >> }; > >> >> > >> >> struct __dsa_skb_cb { > >> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops { > >> >> const struct switchdev_obj_ring_role_mrp *mrp); > >> >> int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, > >> >> const struct switchdev_obj_ring_role_mrp *mrp); > >> >> + > >> >> + /* L2 forward offloading */ > >> >> + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, > >> >> + struct net_device *sb_dev); > >> >> + void (*dfwd_del_station)(struct dsa_switch *ds, int port, > >> >> + struct net_device *sb_dev); > >> >> }; > >> >> > >> >> #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ > >> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c > >> >> index 77b33bd161b8..3689ffa2dbb8 100644 > 
>> >> --- a/net/dsa/slave.c > >> >> +++ b/net/dsa/slave.c > >> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) > >> >> return dsa_enqueue_skb(nskb, dev); > >> >> } > >> >> > >> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb, > >> >> + struct net_device *sb_dev) > >> >> +{ > >> >> + DSA_SKB_CB(skb)->sb_dev = sb_dev; > >> >> + return netdev_pick_tx(dev, skb, sb_dev); > >> >> +} > >> >> + > >> > > >> > DSA_SKB_CB is going away: > >> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/ > >> > > >> > Let's either negotiate with Yangbo on keeping it, or make > >> > .ndo_select_queue a bypass towards the tagger, where it can use its own > >> > SKB_CB structure and be more flexible in general (I think I'm leaning > >> > towards the latter). > >> > >> Thus far, Yangbo is a tough negotiator, giving me the silent treatment: > >> > >> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/ > >> > >> :) > >> > >> That memset is giving me a hard time. I have just disabled it on my > >> branch at the moment. Any ideas on how to get rid of it without breaking > >> timestamping? > > > > :) > > > > Is there any guarantee written somewhere that the ownership of skb->cb > > belongs to the NIC driver at the time of the ndo_select_queue call? > > > > If there is, then the trivial solution is to just move the memset in > > ndo_select_queue. > > > > If there isn't, then we've got bigger issues (such as, for example, the > > qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev). > > The comment says: > > "This is owned by whoever has the skb queued ATM." > > But qdisc_skb_cb is a thing as it turns out - so I think I can kiss the > idea of stashing the pointer in the CB goodbye. 
> > Looking at some of the other users of .ndo_select_queue, I get the > feeling that we should really: > > - Pre-generate a FROM_CPU tag template and store it under "TxQ 0" > - Pre-generate a FORWARD tag template and store it under "TxQ 1" > - Redfine tag_dsa's .ndo_select_queue to be: `return sb_dev ? 1 : 0;` > - Fetch the template using skb_queue_mapping, fill in the VID, and send > it. Different drivers use TX queues in different ways. For example, for the switches with TSN offloads, we set ds->num_tx_queues to a value equal to the number of hardware traffic classes, so that the CPU can inject packets with a specific QOS_CLASS field in the DSA header (think VLAN PCP). This is really visible with tc-taprio where some traffic classes can be completely turned off, so you can easily tell which TC was a packet enqueued to. Other switches use TX queues in other ways. Some Broadcom tagging protocols use the skb queue_mapping to direct the packets to one of multiple TX queues of the DSA master, in order to apply backpressure in case there is congestion on the front port. Selecting a TX queue based on which upper netdev the packet is coming from sounds to me like the oddest of the bunch. It really adds one more dimension to the existing uses, I am not sure that this is how it was intended to be done [ and why, for example, if the sb_dev was propagated so deeply into dev_queue_xmit, why was it not propagated all the way to .ndo_start_xmit ], but on the other hand, you have more working experience with the dev_queue_xmit_accel API than the zero I have. By the way (to show how little I know) what does "d" in "dfwd" stand for? It almost sounds to me like a typo that was carried along from NETIF_F_HW_L2FW_DOFFLOAD_BIT. We might need to ask for the input of some people from Intel who worked on this offload framework. For example, I just added Alexander Duyck hoping he can provide some suggestions. 
We just want the sb_dev in ndo_start_xmit, and abusing ndo_select_queue seems like a huge hack just to obtain that. > There is really no need to recompute the static parts of the tags on > each skb. It would mean moving some knowledge of the tagging format to > the driver. But that boundary is pretty artificial for > mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does > not work with any other tagger. I suppose you could even move the whole > tagger to drivers/net/dsa/mv88e6xxx/? > > What do you think? > > Andrew? [ not Andrew, but ] I made that mistake so that you don't have to. You don't actually gain as much as you think (performance is about the same, what you win in instruction count and conditionals you lose in the memcpy), and you create a dependency between the tagger and the switch driver which was supposed by design to not exist. For my drivers I tried to remove this dependency - see commit 7c4bb540e917 ("net: dsa: tag_ocelot: create separate tagger for Seville"). Also, in the case of Ocelot switches, a template was used to mask out handling differences between switch generations, and present them to user space as "the same tagger". Another bad idea. In general, if a tagging protocol is testable with dsa_loop this is a plus. People at NXP wanted to see how their drivers perform with Marvell switches (what are their options for balancing with RFS/RSS) and this is what they did, changed DSA_TAG_PROTO_NONE from what dsa_loop advertises. If they need the actual switch driver to initialize the tagger's template, suddenly it's not so fun anymore. If it ever becomes important enough, I suppose dsa_loop could even gain support for the new .change_tag_protocol API to advertise the feasibility of the idea in general, although given how DIY dsa_loop is in general, maybe changing the tag protocol at runtime isn't so important.
On Tue, May 04, 2021 at 22:33, Andrew Lunn <andrew@lunn.ch> wrote: >> There is really no need to recompute the static parts of the tags on >> each skb. It would mean moving some knowledge of the tagging format to >> the driver. But that boundary is pretty artificial for >> mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does >> not work with any other tagger. I suppose you could even move the whole >> tagger to drivers/net/dsa/mv88e6xxx/? >> >> What do you think? >> >> Andrew? > > We have resisted this before. > > What information do you actually need to share between the tagger and > the driver? So far: - Trunk/LAG ID to netdev mappings (this is stored on the dst now, but I think I have seen the light and agree with Vladimir that it really has no business there). - DSA dev/port to bridge netdev mappings for the forwarding offloading in this RFC (or preferably the actual tag templates to use on egress since that would probably give you better performance) In the future: - Completions for in-flight remote management operations. - FlowID to TC rule mappings (from the "Switch Egress header" when we enable that) - In-band signaling between firmware running on the IMP and the driver for things like MRP and CFM offloading. > Both tag_lan9303.c and tag_ocelot_8021q.c do reference > their switch driver data structures, so some sharing is allowed. But > please try to keep the surface areas down. If you have a surface area keep it small, yes, agreed. I guess my question is more why we should have any surface area at all? What do we gain by the tagger/driver separation in the case of mv88e6xxx? > Andrew
On Tue, May 04, 2021 at 23:58, Vladimir Oltean <olteanv@gmail.com> wrote: > On Tue, May 04, 2021 at 10:07:14PM +0200, Tobias Waldekranz wrote: >> On Tue, May 04, 2021 at 18:21, Vladimir Oltean <olteanv@gmail.com> wrote: >> > On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote: >> >> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote: >> >> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote: >> >> >> Allow DSA drivers to support forward offloading from a bridge by: >> >> >> >> >> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. >> >> >> >> >> >> - Recording the subordinate device of offloaded skbs in the control >> >> >> buffer so that the tagger can take the appropriate action. >> >> >> >> >> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> >> >> >> --- >> >> >> include/net/dsa.h | 7 +++++++ >> >> >> net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- >> >> >> 2 files changed, 41 insertions(+), 2 deletions(-) >> >> >> >> >> >> diff --git a/include/net/dsa.h b/include/net/dsa.h >> >> >> index 1f9ba9889034..77d4df819299 100644 >> >> >> --- a/include/net/dsa.h >> >> >> +++ b/include/net/dsa.h >> >> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { >> >> >> >> >> >> struct dsa_skb_cb { >> >> >> struct sk_buff *clone; >> >> >> + struct net_device *sb_dev; >> >> >> }; >> >> >> >> >> >> struct __dsa_skb_cb { >> >> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops { >> >> >> const struct switchdev_obj_ring_role_mrp *mrp); >> >> >> int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, >> >> >> const struct switchdev_obj_ring_role_mrp *mrp); >> >> >> + >> >> >> + /* L2 forward offloading */ >> >> >> + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, >> >> >> + struct net_device *sb_dev); >> >> >> + void (*dfwd_del_station)(struct dsa_switch *ds, int port, >> >> >> + struct net_device *sb_dev); >> >> >> }; >> >> >> >> >> >> #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, 
_cmodes) \ >> >> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c >> >> >> index 77b33bd161b8..3689ffa2dbb8 100644 >> >> >> --- a/net/dsa/slave.c >> >> >> +++ b/net/dsa/slave.c >> >> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) >> >> >> return dsa_enqueue_skb(nskb, dev); >> >> >> } >> >> >> >> >> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb, >> >> >> + struct net_device *sb_dev) >> >> >> +{ >> >> >> + DSA_SKB_CB(skb)->sb_dev = sb_dev; >> >> >> + return netdev_pick_tx(dev, skb, sb_dev); >> >> >> +} >> >> >> + >> >> > >> >> > DSA_SKB_CB is going away: >> >> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/ >> >> > >> >> > Let's either negotiate with Yangbo on keeping it, or make >> >> > .ndo_select_queue a bypass towards the tagger, where it can use its own >> >> > SKB_CB structure and be more flexible in general (I think I'm leaning >> >> > towards the latter). >> >> >> >> Thus far, Yangbo is a tough negotiator, giving me the silent treatment: >> >> >> >> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/ >> >> >> >> :) >> >> >> >> That memset is giving me a hard time. I have just disabled it on my >> >> branch at the moment. Any ideas on how to get rid of it without breaking >> >> timestamping? >> > >> > :) >> > >> > Is there any guarantee written somewhere that the ownership of skb->cb >> > belongs to the NIC driver at the time of the ndo_select_queue call? >> > >> > If there is, then the trivial solution is to just move the memset in >> > ndo_select_queue. >> > >> > If there isn't, then we've got bigger issues (such as, for example, the >> > qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev). >> >> The comment says: >> >> "This is owned by whoever has the skb queued ATM." 
>> >> But qdisc_skb_cb is a thing as it turns out - so I think I can kiss the >> idea of stashing the pointer in the CB goodbye. >> >> Looking at some of the other users of .ndo_select_queue, I get the >> feeling that we should really: >> >> - Pre-generate a FROM_CPU tag template and store it under "TxQ 0" >> - Pre-generate a FORWARD tag template and store it under "TxQ 1" >> - Redfine tag_dsa's .ndo_select_queue to be: `return sb_dev ? 1 : 0;` >> - Fetch the template using skb_queue_mapping, fill in the VID, and send >> it. > > Different drivers use TX queues in different ways. For example, for the > switches with TSN offloads, we set ds->num_tx_queues to a value equal to > the number of hardware traffic classes, so that the CPU can inject > packets with a specific QOS_CLASS field in the DSA header (think VLAN PCP). > This is really visible with tc-taprio where some traffic classes can be > completely turned off, so you can easily tell which TC was a packet > enqueued to. Other switches use TX queues in other ways. Some Broadcom > tagging protocols use the skb queue_mapping to direct the packets to one > of multiple TX queues of the DSA master, in order to apply backpressure > in case there is congestion on the front port. > > Selecting a TX queue based on which upper netdev the packet is coming > form sounds to me like the oddest of the bunch. It really adds one more > dimension to the existing uses, I am not sure that this is how it was > intended to be done [ and why, for example, if the sb_dev was propagated > so deeply into dev_queue_xmit, why was it not propagated all the way to > .ndo_start_xmit ], but on the other hand, you have more working > experience with the dev_queue_xmit_accel API than the zero I have. Yeah it does not feel right. I expect mv88e6xxx will also want to expose the real number of queues in the future. Some of the newer devices have support for time aware shapers for example. 
As for why sb_dev is not propagated to .ndo_start_xmit: I chalked it up to the existing users managing the macvlan offloads by directing those flows to a particular TxQ. I.e. they simply had no need for it. Or perhaps they did not have the nerve to send the commit that changed the signature of _every_ driver's .ndo_start_xmit :) > By the way (to show how little I know) what does "d" in "dfwd" stand for? > It almost sounds to me like a typo that was carried along from > NETIF_F_HW_L2FW_DOFFLOAD_BIT. That has been bugging me as well! I have no idea. > We might need to ask for the input of some people from Intel who worked > on this offload framework. For example, I just added Alexander Duyck > hoping he can provide some suggestions. We just want the sb_dev in > ndo_start_xmit, and abusing ndo_select_queue seems like a huge hack just > to obtain that. I think you are right. >> There is really no need to recompute the static parts of the tags on >> each skb. It would mean moving some knowledge of the tagging format to >> the driver. But that boundary is pretty artificial for >> mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does >> not work with any other tagger. I suppose you could even move the whole >> tagger to drivers/net/dsa/mv88e6xxx/? >> >> What do you think? >> >> Andrew? > > [ not Andrew, but ] > > I made that mistake so that you don't have to. You don't actually gain > as much as you think (performance is about the same, what you win in > instruction count and conditionals you lose in the memcpy), That is valuable info, thank you. But I think the most important improvement I see would be the ability to couple the tagger tighter to the driver when we add more complicated features. > and you > create a dependency between the tagger and the switch driver which was > supposed by design to not exist. Sure, but _why_ should it not exist? 
Many fields in the tag can only be correctly generated/interpreted in combination with knowledge of the current configuration, which is the driver's domain. The dependency is already there, etched in silicon. > For my drivers I tried to remove this > dependency - see commit 7c4bb540e917 ("net: dsa: tag_ocelot: create > separate tagger for Seville"). Also, in the case of Ocelot switches, > a template was used to mask out handling differences between switch > generations, and present them to user space as "the same tagger". > Another bad idea. In general, if a tagging protocol is testable with > dsa_loop this is a plus. People at NXP wanted to see how their drivers > perform with Marvell switches (what are their options for balancing with > RFS/RSS) and this is what they did, changed DSA_TAG_PROTO_NONE from what > dsa_loop advertises. If they need the actual switch driver to initialize > the tagger's template, suddenly it's not so fun anymore. I shall have to look more closely at dsa_loop, so far I have just seen the name float by on a few occasions. > If it ever becomes important enough, I suppose dsa_loop could even gain > support for the new .change_tag_protocol API to advertise the > feasibility of the idea in general, although given how DYI dsa_loop is > in general, maybe changing the tag protocol at runtime isn't so > important.
On Wed, May 05, 2021 at 12:12:15AM +0200, Tobias Waldekranz wrote: > > and you create a dependency between the tagger and the switch driver > > which was supposed by design to not exist. > > Sure, but _why_ should it not exist? Many fields in the tag can only be > correctly generated/interpreted in combination with knowledge of the > current configuration, which is the driver's domain. The dependency is > already there, etched in silicon. I'm a bit more of a pragmatic person, it's not so much that I think that Lennert Buytenhek's original DSA design from 2008 was the holy grail and that we should do everything we can to preserve it intact. Far from it. But I actually like having the option to inject a DSA-tagged packet using Spirent TestCenter and measure IP forwarding between dsa_loop "switch" ports (actually a one-armed router is what it is). I also like, as a reviewer, to be able to test, if I want to, how a tail tagger behaves even if I don't own a switch with tail tagging. And this separation between the switch driver and the tag protocol driver makes that possible, just see it as a nice perk which we don't want to lose. As for more advanced features, like "the hardware requires me to invent a unique number based on a rolling counter, call it a TX timestamp ID, put it in the DSA header, then when transmission is done, an IRQ will be raised, and I need to match that TX timestamp that just became available to me, which is identifiable via the timestamp ID that I put in the DSA header, with the original skb", of course you can't do that without communication between the tagger and the driver itself, unless you make the tagger handle interrupts (and then there's the whole issue that the tagging protocol driver needs to be instantiated per switch, if it's going to be stateful), or the switch driver send packets. As a general rule of thumb, just don't break dsa_loop and we should be fine. 
For example, yes, PTP requires driver <-> tagger communication, but PTP timestamping is also not enabled by default, and guarded by an ioctl which dsa_loop doesn't implement. So the tagger can never trigger faulty code, dereferencing a ds->priv pointer which it thinks is "struct mv88e6xxx_chip" but is actually "struct dsa_loop_priv".
On Wed, May 05, 2021 at 02:04, Vladimir Oltean <olteanv@gmail.com> wrote: > On Wed, May 05, 2021 at 12:12:15AM +0200, Tobias Waldekranz wrote: >> > and you create a dependency between the tagger and the switch driver >> > which was supposed by design to not exist. >> >> Sure, but _why_ should it not exist? Many fields in the tag can only be >> correctly generated/interpreted in combination with knowledge of the >> current configuration, which is the driver's domain. The dependency is >> already there, etched in silicon. > > I'm a bit more of a pragmatic person, Excuse me sir, I believe you left your dagger IN MY HEART :) > it's not so much that I think that > Lennert Buytenhek's original DSA design from 2008 was the holy grail and > that we should do everything we can to preserve it intact. Far from it. > But I actually like having the option to inject a DSA-tagged packet > using Spirent TestCenter and measure IP forwarding between dsa_loop > "switch" ports (actually a one-armed router is what it is). I also like, > as a reviewer, to be able to test, if I want to, how a tail tagger > behaves even if I don't own a switch with tail tagging. And this > separation between the switch driver and the tag protocol driver makes > that possible, just see it as a nice perk which we don't want to lose. Completely understandable. I was trying to extrapolate where we will end up with this separation as we add more and more features and couple the tagger closer to the driver, and see if the current architecture was still the optimal one. Trying to be ...pragmatic, if you will. 
> As for more advanced features, like "the hardware requires me to invent > a unique number based on a rolling counter, call it a TX timestamp ID, > put it in the DSA header, then when transmission is done, an IRQ will be > raised, and I need to match that TX timestamp that just became available > to me, which is identifiable via the timestamp ID that I put in the DSA > header, with the original skb", of course you can't do that without > communication between the tagger and the driver itself, unless you make > the tagger handle interrupts (and then there's the whole issue that the > tagging protocol driver needs to be instantiated per switch, if it's > going to be stateful), or the switch driver send packets. As a general > rule of thumb, just don't break dsa_loop and we should be fine. For > example, yes, PTP requires driver <-> tagger communication, but PTP > timestamping is also not enabled by default, and guarded by an ioctl > which dsa_loop doesn't implement. So the tagger can never trigger faulty > code, dereferencing a ds->priv pointer which it thinks is "struct > mv88e6xxx_chip" but is actually "struct dsa_loop_priv". This should also hold for forward offloading, since dsa_loop would not implement .ndo_dfwd_{add,del}_station. Alright, include/linux/dsa/mv88e6xxx.h here I come!
On Wed, May 05, 2021 at 11:01:09AM +0200, Tobias Waldekranz wrote: > On Wed, May 05, 2021 at 02:04, Vladimir Oltean <olteanv@gmail.com> wrote: > > On Wed, May 05, 2021 at 12:12:15AM +0200, Tobias Waldekranz wrote: > >> > and you create a dependency between the tagger and the switch driver > >> > which was supposed by design to not exist. > >> > >> Sure, but _why_ should it not exist? Many fields in the tag can only be > >> correctly generated/interpreted in combination with knowledge of the > >> current configuration, which is the driver's domain. The dependency is > >> already there, etched in silicon. > > > > I'm a bit more of a pragmatic person, > > Excuse me sir, I believe you left your dagger IN MY HEART :) You might have misinterpreted my words, I did not mean to say "look what a good quality I have and you don't", in fact I don't view pragmatism as much of a desirable quality at all. What I meant to say in the context is that, even though in general I value functionality more than how it is implemented, I would still like to keep the separation between taggers and switch drivers at least at the most basic RX/TX level, for the reasons explained.
diff --git a/include/net/dsa.h b/include/net/dsa.h index 1f9ba9889034..77d4df819299 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -119,6 +119,7 @@ struct dsa_netdevice_ops { struct dsa_skb_cb { struct sk_buff *clone; + struct net_device *sb_dev; }; struct __dsa_skb_cb { @@ -828,6 +829,12 @@ struct dsa_switch_ops { const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); + + /* L2 forward offloading */ + void * (*dfwd_add_station)(struct dsa_switch *ds, int port, + struct net_device *sb_dev); + void (*dfwd_del_station)(struct dsa_switch *ds, int port, + struct net_device *sb_dev); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 77b33bd161b8..3689ffa2dbb8 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return dsa_enqueue_skb(nskb, dev); } +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + DSA_SKB_CB(skb)->sb_dev = sb_dev; + return netdev_pick_tx(dev, skb, sb_dev); +} + /* ethtool operations *******************************************************/ static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -1708,10 +1715,33 @@ static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx, return 0; } +static void *dsa_slave_dfwd_add_station(struct net_device *dev, + struct net_device *sb_dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->dfwd_add_station) + return ds->ops->dfwd_add_station(ds, dp->index, sb_dev); + + return ERR_PTR(-EOPNOTSUPP); +} + +static void dsa_slave_dfwd_del_station(struct net_device *dev, + void *sb_dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->dfwd_del_station) + 
ds->ops->dfwd_del_station(ds, dp->index, sb_dev); +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = dsa_slave_xmit, + .ndo_select_queue = dsa_slave_select_queue, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, @@ -1734,6 +1764,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_get_devlink_port = dsa_slave_get_devlink_port, .ndo_change_mtu = dsa_slave_change_mtu, .ndo_fill_forward_path = dsa_slave_fill_forward_path, + .ndo_dfwd_add_station = dsa_slave_dfwd_add_station, + .ndo_dfwd_del_station = dsa_slave_dfwd_del_station, }; static struct device_type dsa_type = { @@ -1914,8 +1946,8 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->features = master->vlan_features | NETIF_F_HW_TC; if (ds->ops->port_vlan_add && ds->ops->port_vlan_del) slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; - slave_dev->hw_features |= NETIF_F_HW_TC; - slave_dev->features |= NETIF_F_LLTX; + slave_dev->hw_features |= NETIF_F_HW_TC | NETIF_F_HW_L2FW_DOFFLOAD; + slave_dev->features |= NETIF_F_LLTX | NETIF_F_HW_L2FW_DOFFLOAD; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; if (!is_zero_ether_addr(port->mac)) ether_addr_copy(slave_dev->dev_addr, port->mac);
Allow DSA drivers to support forward offloading from a bridge by: - Passing calls to .ndo_dfwd_{add,del}_station to the drivers. - Recording the subordinate device of offloaded skbs in the control buffer so that the tagger can take the appropriate action. Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> --- include/net/dsa.h | 7 +++++++ net/dsa/slave.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-)