Message ID | 20210213214421.226357-5-xiyou.wangcong@gmail.com |
---|---|
State | New |
Headers | show |
Series | sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT | expand |
Cong Wang wrote: > From: Cong Wang <cong.wang@bytedance.com> > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > does not work for any other non-TCP protocols. We can move them to > skb ext instead of playing with skb cb, which is harder to make > correct. > > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: Jakub Sitnicki <jakub@cloudflare.com> > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > --- I'm not seeing the advantage of doing this at the moment. We can continue to use cb[] here, which is simpler IMO and use the ext if needed for the other use cases. This is adding a per packet alloc cost that we don't have at the moment as I understand it. [...] > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > index e3bb712af257..d5c711ef6d4b 100644 > --- a/include/linux/skmsg.h > +++ b/include/linux/skmsg.h > @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) > return false; > return !!psock->saved_data_ready; > } > + > +struct skb_bpf_ext { > + __u32 flags; > + struct sock *sk_redir; > +}; > + > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) > +static inline > +bool skb_bpf_ext_ingress(const struct sk_buff *skb) > +{ > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > + > + return ext->flags & BPF_F_INGRESS; > +} > + > +static inline > +void skb_bpf_ext_set_ingress(const struct sk_buff *skb) > +{ > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > + > + ext->flags |= BPF_F_INGRESS; > +} > + > +static inline > +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb) > +{ > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > + > + return ext->sk_redir; > +} > + > +static inline > +void skb_bpf_ext_redirect_clear(struct sk_buff *skb) > +{ > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > + > + ext->flags = 0; > + ext->sk_redir = NULL; > +} > +#endif /* CONFIG_NET_SOCK_MSG */ So we will have some slight duplication for cb[] variant and ext variant above. I'm OK with that to avoid an allocation. [...] > @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, > goto out; > } > skb_set_owner_r(skb, sk); > + if (!skb_ext_add(skb, SKB_EXT_BPF)) { > + len = 0; > + kfree_skb(skb); > + goto out; > + } > + per packet cost here. Perhaps you can argue small alloc will usually not be noticable in such a large stack, but once we convert over it will be very hard to go back. And I'm looking at optimizing this path now. > prog = READ_ONCE(psock->progs.skb_verdict); > if (likely(prog)) { > - tcp_skb_bpf_redirect_clear(skb); > + skb_bpf_ext_redirect_clear(skb); > ret = sk_psock_bpf_run(psock, prog, skb); > - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); > + ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb)); > } > sk_psock_verdict_apply(psock, skb, ret); Thanks for the series Cong. Drop this patch and resubmit carry ACKs forward and then lets revisit this later. Thanks, John
On Mon, Feb 15, 2021 at 11:20 AM John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > From: Cong Wang <cong.wang@bytedance.com> > > > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > > does not work for any other non-TCP protocols. We can move them to > > skb ext instead of playing with skb cb, which is harder to make > > correct. > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > --- > > I'm not seeing the advantage of doing this at the moment. We can > continue to use cb[] here, which is simpler IMO and use the ext > if needed for the other use cases. This is adding a per packet > alloc cost that we don't have at the moment as I understand it. Hmm? How can we continue using TCP_SKB_CB() for UDP or AF_UNIX? I am not sure I get your "at the moment" correctly, do you mean I should move this patch to a later patchset, maybe the UDP patchset? At least this patch is needed, no matter by which patchset, so it should not be dropped. > > [...] > > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > > index e3bb712af257..d5c711ef6d4b 100644 > > --- a/include/linux/skmsg.h > > +++ b/include/linux/skmsg.h > > @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) > > return false; > > return !!psock->saved_data_ready; > > } > > + > > +struct skb_bpf_ext { > > + __u32 flags; > > + struct sock *sk_redir; > > +}; > > + > > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) > > +static inline > > +bool skb_bpf_ext_ingress(const struct sk_buff *skb) > > +{ > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > + > > + return ext->flags & BPF_F_INGRESS; > > +} > > + > > +static inline > > +void skb_bpf_ext_set_ingress(const struct sk_buff *skb) > > +{ > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > + > > + ext->flags |= BPF_F_INGRESS; > > +} > > + > > +static inline > > +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb) > > +{ > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > + > > + return ext->sk_redir; > > +} > > + > > +static inline > > +void skb_bpf_ext_redirect_clear(struct sk_buff *skb) > > +{ > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > + > > + ext->flags = 0; > > + ext->sk_redir = NULL; > > +} > > +#endif /* CONFIG_NET_SOCK_MSG */ > > So we will have some slight duplication for cb[] variant and ext > variant above. I'm OK with that to avoid an allocation. Not sure what you mean by "duplication", these are removed from TCP_SKB_CB(), so there is clearly no duplication. > > [...] > > > @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, > > goto out; > > } > > skb_set_owner_r(skb, sk); > > + if (!skb_ext_add(skb, SKB_EXT_BPF)) { > > + len = 0; > > + kfree_skb(skb); > > + goto out; > > + } > > + > > per packet cost here. Perhaps you can argue small alloc will usually not be > noticable in such a large stack, but once we convert over it will be very > hard to go back. And I'm looking at optimizing this path now. This is a price we need to pay to avoid CB, and skb_ext_add() has been used on other fast paths too, for example, tcf_classify_ingress() and mptcp_incoming_options(). So, it is definitely acceptable. > > > prog = READ_ONCE(psock->progs.skb_verdict); > > if (likely(prog)) { > > - tcp_skb_bpf_redirect_clear(skb); > > + skb_bpf_ext_redirect_clear(skb); > > ret = sk_psock_bpf_run(psock, prog, skb); > > - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); > > + ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb)); > > } > > sk_psock_verdict_apply(psock, skb, ret); > > Thanks for the series Cong. Drop this patch and resubmit carry ACKs forward > and then lets revisit this later. I still believe it is best to stay in this patchset, as it does not change any functionality and is a preparation too. And the next patchset will be UDP/AF_UNIX changes as you suggested, it is very awkward to put this patch into either UDP or AF_UNIX changes. So, let's keep it in this patchset, and I am happy to address any concerns and open to other ideas than using skb ext. Thanks.
Cong Wang wrote: > On Mon, Feb 15, 2021 at 11:20 AM John Fastabend > <john.fastabend@gmail.com> wrote: > > > > Cong Wang wrote: > > > From: Cong Wang <cong.wang@bytedance.com> > > > > > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > > > does not work for any other non-TCP protocols. We can move them to > > > skb ext instead of playing with skb cb, which is harder to make > > > correct. > > > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com> > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > > --- > > > > I'm not seeing the advantage of doing this at the moment. We can > > continue to use cb[] here, which is simpler IMO and use the ext > > if needed for the other use cases. This is adding a per packet > > alloc cost that we don't have at the moment as I understand it. > > Hmm? How can we continue using TCP_SKB_CB() for UDP or > AF_UNIX? > > I am not sure I get your "at the moment" correctly, do you mean > I should move this patch to a later patchset, maybe the UDP > patchset? At least this patch is needed, no matter by which patchset, > so it should not be dropped. Agree, the skb_bpf_ext{} pieces are needed for UDP and AF_UNIX. Its not required for TCP side though. What I'm suggesting is leave the TCP side as-is, using the cb[] fields. Then use the skb_bpf_ext fields from UDP and AF_UNIX. > > > > > > [...] > > > > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > > > index e3bb712af257..d5c711ef6d4b 100644 > > > --- a/include/linux/skmsg.h > > > +++ b/include/linux/skmsg.h > > > @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) > > > return false; > > > return !!psock->saved_data_ready; > > > } > > > + > > > +struct skb_bpf_ext { > > > + __u32 flags; > > > + struct sock *sk_redir; > > > +}; > > > + > > > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) > > > +static inline > > > +bool skb_bpf_ext_ingress(const struct sk_buff *skb) > > > +{ > > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > > + > > > + return ext->flags & BPF_F_INGRESS; > > > +} > > > + > > > +static inline > > > +void skb_bpf_ext_set_ingress(const struct sk_buff *skb) > > > +{ > > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > > + > > > + ext->flags |= BPF_F_INGRESS; > > > +} > > > + > > > +static inline > > > +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb) > > > +{ > > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > > + > > > + return ext->sk_redir; > > > +} > > > + > > > +static inline > > > +void skb_bpf_ext_redirect_clear(struct sk_buff *skb) > > > +{ > > > + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); > > > + > > + ext->flags = 0; > > > + ext->sk_redir = NULL; > > > +} > > > +#endif /* CONFIG_NET_SOCK_MSG */ > > > > So we will have some slight duplication for cb[] variant and ext > > variant above. I'm OK with that to avoid an allocation. > > Not sure what you mean by "duplication", these are removed from > TCP_SKB_CB(), so there is clearly no duplication. In this patch yes, no duplication. But, I want to leave TCP alone and have it continue to use cb[] to avoid alloc per packet. > > > > > [...] > > > > > @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, > > > goto out; > > > } > > > skb_set_owner_r(skb, sk); > > > + if (!skb_ext_add(skb, SKB_EXT_BPF)) { > > > + len = 0; > > > + kfree_skb(skb); > > > + goto out; > > > + } > > > + > > > > per packet cost here. Perhaps you can argue small alloc will usually not be > > noticable in such a large stack, but once we convert over it will be very > > hard to go back. And I'm looking at optimizing this path now. > > This is a price we need to pay to avoid CB, and skb_ext_add() has been > used on other fast paths too, for example, tcf_classify_ingress() and > mptcp_incoming_options(). So, it is definitely acceptable. For TCP case we can continue to use CB and not pay the price. For UDP and AF_UNIX we can do the extra alloc. The use in tcf_classify_ingress is a miss case so not the common path. If it is/was in the common path I would suggest we rip it out. > > > > > > prog = READ_ONCE(psock->progs.skb_verdict); > > > if (likely(prog)) { > > > - tcp_skb_bpf_redirect_clear(skb); > > > + skb_bpf_ext_redirect_clear(skb); > > > ret = sk_psock_bpf_run(psock, prog, skb); > > > - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); > > > + ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb)); > > > } > > > sk_psock_verdict_apply(psock, skb, ret); > > > > Thanks for the series Cong. Drop this patch and resubmit carry ACKs forward > > and then lets revisit this later. > > I still believe it is best to stay in this patchset, as it does not change > any functionality and is a preparation too. And the next patchset will be > UDP/AF_UNIX changes as you suggested, it is very awkward to put this > patch into either UDP or AF_UNIX changes. Disagree. It adds extra code to the TCP side that I think is not needed. Any reason the TCP implementation can't continue to use cb[]? > > So, let's keep it in this patchset, and I am happy to address any concerns > and open to other ideas than using skb ext. The idea here is to just use cb[] in TCP case per above. I only scanned your other patches, but presumably this can be patch 1 with just the functions skb_bpf_ext_ingress() skb_bpf_ext_set_ingress() skb_bpf_ext_redirect_fetch() skb_bpf_ext_redirect_clear() And none of the removals from TCP side. .John
On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote: > > For TCP case we can continue to use CB and not pay the price. For UDP > and AF_UNIX we can do the extra alloc. I see your point, but specializing TCP case does not give much benefit here, the skmsg code would have to check skb->protocol etc. to decide whether to use TCP_SKB_CB() or skb_ext: if (skb->protocol == ...) TCP_SKB_CB(skb) = ...; else ext = skb_ext_find(skb); which looks ugly to me. And I doubt skb->protocol alone is sufficient to distinguish TCP, so we may end up having more checks above. So do you really want to trade code readability with an extra alloc? > > The use in tcf_classify_ingress is a miss case so not the common path. If > it is/was in the common path I would suggest we rip it out. > Excellent point, what about nf_bridge_unshare()? It is a common path for bridge netfilter, which is also probably why skb ext was introduced (IIRC). secpath_set() seems on a common path for XFRM too. Are you suggesting to remove them all? ;) Thanks.
Cong Wang wrote: > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > For TCP case we can continue to use CB and not pay the price. For UDP > > and AF_UNIX we can do the extra alloc. > > I see your point, but specializing TCP case does not give much benefit > here, the skmsg code would have to check skb->protocol etc. to decide > whether to use TCP_SKB_CB() or skb_ext: > > if (skb->protocol == ...) > TCP_SKB_CB(skb) = ...; > else > ext = skb_ext_find(skb); > > which looks ugly to me. And I doubt skb->protocol alone is sufficient to > distinguish TCP, so we may end up having more checks above. > > So do you really want to trade code readability with an extra alloc? Above is ugly. So I look at where the patch replaces things, sk_psock_tls_strp_read(), this is TLS specific read hook so can't really work in generic case anyways. sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these even work outside TCP cases. For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(), sk_psock_backlog(), can't we just do some refactoring around their hook points so we know the context. For example sk_psock_tls_verdict_apply is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect() and a sk_psock_udp_redirect(). There are likely some optimizations we can deploy this way. We've already don this for tls and sk_msg types for example. Then the helpers will know their types by program type, just use the right variants. So not suggestiong if/else the checks so much as having per type hooks. > > > > > The use in tcf_classify_ingress is a miss case so not the common path. If > > it is/was in the common path I would suggest we rip it out. > > > > Excellent point, what about nf_bridge_unshare()? It is a common path > for bridge netfilter, which is also probably why skb ext was introduced > (IIRC). secpath_set() seems on a common path for XFRM too. Yeah not nice, but we don't use nf_bridge so doesn't bother me. > > Are you suggesting to remove them all? ;) From the hotpath where I care about perfromance yes. > > Thanks.
On Mon, Feb 15, 2021 at 4:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > > > For TCP case we can continue to use CB and not pay the price. For UDP > > > and AF_UNIX we can do the extra alloc. > > > > I see your point, but specializing TCP case does not give much benefit > > here, the skmsg code would have to check skb->protocol etc. to decide > > whether to use TCP_SKB_CB() or skb_ext: > > > > if (skb->protocol == ...) > > TCP_SKB_CB(skb) = ...; > > else > > ext = skb_ext_find(skb); > > > > which looks ugly to me. And I doubt skb->protocol alone is sufficient to > > distinguish TCP, so we may end up having more checks above. > > > > So do you really want to trade code readability with an extra alloc? > > Above is ugly. So I look at where the patch replaces things, > > sk_psock_tls_strp_read(), this is TLS specific read hook so can't really > work in generic case anyways. > > sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these > even work outside TCP cases. > > For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(), > sk_psock_backlog(), can't we just do some refactoring around their > hook points so we know the context. For example sk_psock_tls_verdict_apply > is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect() > and a sk_psock_udp_redirect(). There are likely some optimizations we can > deploy this way. We've already don this for tls and sk_msg types for example. > > Then the helpers will know their types by program type, just use the right > variants. > > So not suggestiong if/else the checks so much as having per type hooks. > Hmm, but sk_psock_backlog() is still the only one that handles all three above cases, right? It uses TCP_SKB_CB() too and more importantly it is also why we can't use a per-cpu struct here (see bpf_redirect_info). Thanks.
Cong Wang wrote: > On Mon, Feb 15, 2021 at 4:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > Cong Wang wrote: > > > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > > > > > For TCP case we can continue to use CB and not pay the price. For UDP > > > > and AF_UNIX we can do the extra alloc. > > > > > > I see your point, but specializing TCP case does not give much benefit > > > here, the skmsg code would have to check skb->protocol etc. to decide > > > whether to use TCP_SKB_CB() or skb_ext: > > > > > > if (skb->protocol == ...) > > > TCP_SKB_CB(skb) = ...; > > > else > > > ext = skb_ext_find(skb); > > > > > > which looks ugly to me. And I doubt skb->protocol alone is sufficient to > > > distinguish TCP, so we may end up having more checks above. > > > > > > So do you really want to trade code readability with an extra alloc? > > > > Above is ugly. So I look at where the patch replaces things, > > > > sk_psock_tls_strp_read(), this is TLS specific read hook so can't really > > work in generic case anyways. > > > > sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these > > even work outside TCP cases. > > > > For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(), > > sk_psock_backlog(), can't we just do some refactoring around their > > hook points so we know the context. For example sk_psock_tls_verdict_apply > > is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect() > > and a sk_psock_udp_redirect(). There are likely some optimizations we can > > deploy this way. We've already don this for tls and sk_msg types for example. > > > > Then the helpers will know their types by program type, just use the right > > variants. > > > > So not suggestiong if/else the checks so much as having per type hooks. > > > > Hmm, but sk_psock_backlog() is still the only one that handles all three > above cases, right? It uses TCP_SKB_CB() too and more importantly it > is also why we can't use a per-cpu struct here (see bpf_redirect_info). Right, but the workqueue is created at init time where we will know the socket type based on the program/map types so can build the redirect backlog queue there based on the type needed. I also have a patch in mind that would do more specific TCP things in that code anyways. I can flush it out this week if anyone cares. The idea is we are wasting lots of cycles using skb_send_sock_locked when we can just inject the packet directlyy into the tcp stack. Also on the original patch here, we can't just kfree_skb() on alloc errors because that will look like a data drop. Errors need to be handled gracefully without dropping data. At least in the TCP case, but probably also in UDP and AF_UNIX cases as well. Original code was pretty loose in this regard, but it caused users to write bug reports and then I've been fixing most of them. If you see more cases let me know. > > Thanks.
On Mon, Feb 15, 2021 at 5:50 PM John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > On Mon, Feb 15, 2021 at 4:54 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > > > Cong Wang wrote: > > > > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote: > > > > > > > > > > For TCP case we can continue to use CB and not pay the price. For UDP > > > > > and AF_UNIX we can do the extra alloc. > > > > > > > > I see your point, but specializing TCP case does not give much benefit > > > > here, the skmsg code would have to check skb->protocol etc. to decide > > > > whether to use TCP_SKB_CB() or skb_ext: > > > > > > > > if (skb->protocol == ...) > > > > TCP_SKB_CB(skb) = ...; > > > > else > > > > ext = skb_ext_find(skb); > > > > > > > > which looks ugly to me. And I doubt skb->protocol alone is sufficient to > > > > distinguish TCP, so we may end up having more checks above. > > > > > > > > So do you really want to trade code readability with an extra alloc? > > > > > > Above is ugly. So I look at where the patch replaces things, > > > > > > sk_psock_tls_strp_read(), this is TLS specific read hook so can't really > > > work in generic case anyways. > > > > > > sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these > > > even work outside TCP cases. > > > > > > For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(), > > > sk_psock_backlog(), can't we just do some refactoring around their > > > hook points so we know the context. For example sk_psock_tls_verdict_apply > > > is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect() > > > and a sk_psock_udp_redirect(). There are likely some optimizations we can > > > deploy this way. We've already don this for tls and sk_msg types for example. > > > > > > Then the helpers will know their types by program type, just use the right > > > variants. > > > > > > So not suggestiong if/else the checks so much as having per type hooks. > > > > > > > Hmm, but sk_psock_backlog() is still the only one that handles all three > > above cases, right? It uses TCP_SKB_CB() too and more importantly it > > is also why we can't use a per-cpu struct here (see bpf_redirect_info). > > Right, but the workqueue is created at init time where we will know the > socket type based on the program/map types so can build the redirect > backlog queue there based on the type needed. I also have a patch in Hmm? How could a socket type match the skb type when we redirect across-protocol? In my use case, I want to redirect an AF_UNIX skb to a UDP socket, clearly checking the UDP socket workqueue can't find out it is an AF_UNIX skb. It has to be a per-skb check. > mind that would do more specific TCP things in that code anyways. I > can flush it out this week if anyone cares. The idea is we are wasting > lots of cycles using skb_send_sock_locked when we can just inject > the packet directlyy into the tcp stack. Actually I did try the same, it clearly doesn't work for cross-protocol. Anyway, please let me know what your suggestion for skb ext here? It looks like we either have some ugly packet type checks, or just use the skb ext. I don't see any other way yet, I also explored the struct sk_buff again and still can not find anything we can reuse. (_skb_refdst can only be reused very briefly with tcp_skb_tsorted_save().) Therefore, I believe using skb ext is still the best solution here. > > Also on the original patch here, we can't just kfree_skb() on alloc > errors because that will look like a data drop. Errors need to be > handled gracefully without dropping data. At least in the TCP case, > but probably also in UDP and AF_UNIX cases as well. Original code > was pretty loose in this regard, but it caused users to write bug > reports and then I've been fixing most of them. If you see more > cases let me know. What's your suggestion here? Return -EAGAIN? But it requires the caller put it in a loop to be graceful, but we can't do it in, for example, sk_psock_tls_strp_read(). Thanks.
On Mon, 15 Feb 2021 at 19:20, John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > From: Cong Wang <cong.wang@bytedance.com> > > > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > > does not work for any other non-TCP protocols. We can move them to > > skb ext instead of playing with skb cb, which is harder to make > > correct. > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > --- > > I'm not seeing the advantage of doing this at the moment. We can > continue to use cb[] here, which is simpler IMO and use the ext > if needed for the other use cases. This is adding a per packet > alloc cost that we don't have at the moment as I understand it. John, do you have a benchmark we can look at? Right now we're arguing in the abstract.
Lorenz Bauer wrote: > On Mon, 15 Feb 2021 at 19:20, John Fastabend <john.fastabend@gmail.com> wrote: > > > > Cong Wang wrote: > > > From: Cong Wang <cong.wang@bytedance.com> > > > > > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > > > does not work for any other non-TCP protocols. We can move them to > > > skb ext instead of playing with skb cb, which is harder to make > > > correct. > > > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com> > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > > --- > > > > I'm not seeing the advantage of doing this at the moment. We can > > continue to use cb[] here, which is simpler IMO and use the ext > > if needed for the other use cases. This is adding a per packet > > alloc cost that we don't have at the moment as I understand it. > > John, do you have a benchmark we can look at? Right now we're arguing > in the abstract. Sure, but looks like Cong found some spare fields in sk_buff so that looks much nicer. I'll mess aound a bit with our benchmarks and see where we can publish them. It would be good to have some repeatable tests here folks can use. Thanks, John
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46f901adf1a8..2d4ffe77ef47 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4166,6 +4166,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, +#endif +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + SKB_EXT_BPF, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e3bb712af257..d5c711ef6d4b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +struct skb_bpf_ext { + __u32 flags; + struct sock *sk_redir; +}; + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) +static inline +bool skb_bpf_ext_ingress(const struct sk_buff *skb) +{ + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); + + return ext->flags & BPF_F_INGRESS; +} + +static inline +void skb_bpf_ext_set_ingress(const struct sk_buff *skb) +{ + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); + + ext->flags |= BPF_F_INGRESS; +} + +static inline +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb) +{ + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); + + return ext->sk_redir; +} + +static inline +void skb_bpf_ext_redirect_clear(struct sk_buff *skb) +{ + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); + + ext->flags = 0; + ext->sk_redir = NULL; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 697712178eff..e35881f837b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -882,30 +882,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/Kconfig b/net/Kconfig index a4f60d0c630f..9b4dd1ad2188 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -419,6 +419,7 @@ config SOCK_VALIDATE_XMIT config NET_SOCK_MSG bool + select SKB_EXTENSIONS default n help The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 145503d3f06b..7695a2b65832 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -60,6 +60,7 @@ #include <linux/prefetch.h> #include <linux/if_vlan.h> #include <linux/mpls.h> +#include <linux/skmsg.h> #include <net/protocol.h> #include <net/dst.h> @@ -4259,6 +4260,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + [SKB_EXT_BPF] = SKB_EXT_CHUNKSIZEOF(struct skb_bpf_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4275,6 +4279,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #if IS_ENABLED(CONFIG_MPTCP) skb_ext_type_len[SKB_EXT_MPTCP] + +#endif +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + skb_ext_type_len[SKB_EXT_BPF] + #endif 0; } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2d8bbb3fd87c..9404dbf5d57b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ext_ingress(skb); + skb_ext_del(skb, SKB_EXT_BPF); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -752,7 +753,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_ext_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -794,6 +795,9 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) struct bpf_prog *prog; int ret = __SK_PASS; + if (!skb_ext_add(skb, SKB_EXT_BPF)) + return __SK_DROP; + rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { @@ -802,9 +806,9 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_bpf_ext_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -816,7 +820,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -828,9 +831,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; - + skb_bpf_ext_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to * queue work otherwise we may get OOO data. Otherwise, @@ -888,11 +889,15 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); + if (!skb_ext_add(skb, SKB_EXT_BPF)) { + kfree_skb(skb); + goto out; + } prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_bpf_ext_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } skb_set_owner_r(skb, sk); + if (!skb_ext_add(skb, SKB_EXT_BPF)) { + len = 0; + kfree_skb(skb); + goto out; + } + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_bpf_ext_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..e9f2a17fb665 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,7 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +667,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + ext->flags = flags; + ext->sk_redir = sk; return SK_PASS; } @@ -1250,7 +1250,7 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1260,8 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + ext->flags = flags; + ext->sk_redir = sk; return SK_PASS; }