[net-next,v3,5/6] net: devmem: Implement TX path

Message ID 20250203223916.1064540-6-almasrymina@google.com
State New
Series [net-next,v3,1/6] net: add devmem TCP TX documentation

Commit Message

Mina Almasry Feb. 3, 2025, 10:39 p.m. UTC
Augment the dmabuf binding to be able to handle TX. In addition to all the
RX binding state, we also create the tx_vec needed for the TX path.

Provide an API for sendmsg to be able to send from dmabufs bound to this device:

- Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
- MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf.
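
For illustration, a minimal userspace sketch of the intended use (not part
of the patch; sock_fd, dmabuf_id, dmabuf_offset and send_len are assumed to
be set up by the caller, the dmabuf already bound for TX via netlink,
SO_ZEROCOPY enabled on the socket as for regular MSG_ZEROCOPY, and error
handling omitted):

	struct dmabuf_tx_cmsg ddmabuf = { .dmabuf_id = dmabuf_id };
	char ctrl[CMSG_SPACE(sizeof(ddmabuf))] = {};
	/* iov_base is interpreted as the offset into the dmabuf to send from */
	struct iovec iov = { .iov_base = (void *)dmabuf_offset,
			     .iov_len = send_len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_DEVMEM_DMABUF;
	cm->cmsg_len = CMSG_LEN(sizeof(ddmabuf));
	memcpy(CMSG_DATA(cm), &ddmabuf, sizeof(ddmabuf));

	sendmsg(sock_fd, &msg, MSG_ZEROCOPY);

Completions are reported via the usual MSG_ZEROCOPY error-queue
notifications, since we piggyback on that machinery.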

Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY
implementation, while disabling instances where MSG_ZEROCOPY falls back
to copying.

We additionally pipe the binding down to the new
zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems
instead of the traditional page netmems.

We also special case skb_frag_dma_map to return the dma-address of these
dmabuf net_iovs instead of attempting to map pages.

Based on work by Stanislav Fomichev <sdf@fomichev.me>. A lot of the meat
of the implementation came from devmem TCP RFC v1[1], which included the
TX path, but Stan did all the rebasing on top of netmem/net_iov.

Cc: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>


---

v3:
- Use kvmalloc_array instead of kcalloc (Stan).
- Fix unreachable code warning (Simon).

v2:
- Remove dmabuf_offset from the dmabuf cmsg.
- Update zerocopy_fill_skb_from_devmem to interpret the
  iov_base/iter_iov_addr as the offset into the dmabuf to send from
  (Stan).
- Remove the confusing binding->tx_iter which is not needed if we
  interpret the iov_base/iter_iov_addr as offset into the dmabuf (Stan).
- Remove check for binding->sgt and binding->sgt->nents in dmabuf
  binding.
- Simplify the calculation of binding->tx_vec.
- Check in net_devmem_get_binding that the binding we're returning
  has ifindex matching the sending socket (Willem).
---
 include/linux/skbuff.h                  | 15 +++-
 include/net/sock.h                      |  1 +
 include/uapi/linux/uio.h                |  6 +-
 net/core/datagram.c                     | 41 ++++++++++-
 net/core/devmem.c                       | 97 +++++++++++++++++++++++--
 net/core/devmem.h                       | 42 ++++++++++-
 net/core/netdev-genl.c                  | 64 +++++++++++++++-
 net/core/skbuff.c                       |  6 +-
 net/core/sock.c                         |  8 ++
 net/ipv4/tcp.c                          | 36 ++++++---
 net/vmw_vsock/virtio_transport_common.c |  3 +-
 11 files changed, 285 insertions(+), 34 deletions(-)

Comments

Pavel Begunkov Feb. 5, 2025, 12:20 p.m. UTC | #1
On 2/3/25 22:39, Mina Almasry wrote:
...
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index bb2b751d274a..3ff8f568c382 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1711,9 +1711,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
...
>   int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>   				struct iov_iter *from, size_t length);
> @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>   static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
>   					  struct msghdr *msg, int len)
>   {
> -	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
> +	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
> +				       NULL);

Instead of propagating it all the way down and carving a new path, why
not reuse the existing infra? You already hook into where ubuf is
allocated, you can stash the binding in there. And
zerocopy_fill_skb_from_devmem can implement ->sg_from_iter,
see __zerocopy_sg_from_iter().
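
Roughly, with made-up names (ubuf_info_devmem / devmem_sg_from_iter are
just for illustration, and this assumes the callback gets handed the
ubuf_info, e.g. once it's moved into ubuf_info ops):

struct ubuf_info_devmem {
	struct ubuf_info ubuf;
	struct net_devmem_dmabuf_binding *binding;
};

static int devmem_sg_from_iter(struct ubuf_info *uarg, struct sk_buff *skb,
			       struct iov_iter *from, size_t length)
{
	struct ubuf_info_devmem *zc;

	/* the binding was stashed when the devmem uarg was allocated */
	zc = container_of(uarg, struct ubuf_info_devmem, ubuf);
	return zerocopy_fill_skb_from_devmem(skb, from, length, zc->binding);
}

Then tcp_sendmsg_locked() only needs to pick the devmem uarg when the
dmabuf cmsg is present, instead of threading the binding through
skb_zerocopy_iter_stream() and __zerocopy_sg_from_iter().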

...
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index f0693707aece..c989606ff58d 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -63,6 +63,8 @@
> +static int
> +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
> +			      int length,
> +			      struct net_devmem_dmabuf_binding *binding)
> +{
> +	int i = skb_shinfo(skb)->nr_frags;
> +	size_t virt_addr, size, off;
> +	struct net_iov *niov;
> +
> +	while (length && iov_iter_count(from)) {
> +		if (i == MAX_SKB_FRAGS)
> +			return -EMSGSIZE;
> +
> +		virt_addr = (size_t)iter_iov_addr(from);

Unless I missed it somewhere it needs to check that the iter
is iovec based.
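
Something like this before the loop would do, I think:

	if (!iter_is_iovec(from) && !iter_is_ubuf(from))
		return -EFAULT;

iter_iov_addr()/iter_iov_len() only make sense for user-backed iterators,
so bvec/kvec iters should be rejected up front.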

> +		niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
> +		if (!niov)
> +			return -EFAULT;
> +
> +		size = min_t(size_t, size, length);
> +		size = min_t(size_t, size, iter_iov_len(from));
> +
> +		get_netmem(net_iov_to_netmem(niov));
> +		skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
> +				       size, PAGE_SIZE);
> +		iov_iter_advance(from, size);
> +		length -= size;
> +		i++;
> +	}
> +
> +	return 0;
> +}
> +
>   int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
>   			    struct sk_buff *skb, struct iov_iter *from,
> -			    size_t length)
> +			    size_t length,
> +			    struct net_devmem_dmabuf_binding *binding)
>   {
>   	unsigned long orig_size = skb->truesize;
>   	unsigned long truesize;
> @@ -702,6 +737,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
>   
>   	if (msg && msg->msg_ubuf && msg->sg_from_iter)
>   		ret = msg->sg_from_iter(skb, from, length);

As mentioned above, you can implement this callback. The callback can
also be moved into ubuf_info ops if that's more convenient, I had
patches stashed for that.

> +	else if (unlikely(binding))
> +		ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
>   	else
>   		ret = zerocopy_fill_skb_from_iter(skb, from, length);
>   
> @@ -735,7 +772,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
>   	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
>   		return -EFAULT;

...

> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 0d704bda6c41..44198ae7e44c 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1051,6 +1051,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
>   
>   int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
>   {
> +	struct net_devmem_dmabuf_binding *binding = NULL;
>   	struct tcp_sock *tp = tcp_sk(sk);
>   	struct ubuf_info *uarg = NULL;
>   	struct sk_buff *skb;
> @@ -1063,6 +1064,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
>   
>   	flags = msg->msg_flags;
>   
> +	sockcm_init(&sockc, sk);
> +	if (msg->msg_controllen) {
> +		err = sock_cmsg_send(sk, msg, &sockc);
> +		if (unlikely(err)) {
> +			err = -EINVAL;
> +			goto out_err;
> +		}
> +	}
> +
>   	if ((flags & MSG_ZEROCOPY) && size) {
>   		if (msg->msg_ubuf) {
>   			uarg = msg->msg_ubuf;
> @@ -1080,6 +1090,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
>   			else
>   				uarg_to_msgzc(uarg)->zerocopy = 0;
>   		}
> +
> +		if (sockc.dmabuf_id != 0) {

It's better to be mutually exclusive with msg->msg_ubuf, the callers
have expectations about the buffers used. And you likely don't want
to mix it with normal MSG_ZEROCOPY in a single skb and/or ubuf_info,
you can force reallocation of ubuf_info here.
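
I.e. something along these lines (sketch only; forcing a fresh uarg for
the devmem case is not shown):

	if (sockc.dmabuf_id != 0) {
		/* don't mix devmem with a caller-provided msg_ubuf */
		if (msg->msg_ubuf) {
			err = -EINVAL;
			goto out_err;
		}

		binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
		if (IS_ERR(binding)) {
			err = PTR_ERR(binding);
			binding = NULL;
			goto out_err;
		}
	}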

> +			binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
> +			if (IS_ERR(binding)) {
> +				err = PTR_ERR(binding);
> +				binding = NULL;
> +				goto out_err;
> +			}
> +		}
Willem de Bruijn Feb. 5, 2025, 9:56 p.m. UTC | #2
Mina Almasry wrote:
> Augment dmabuf binding to be able to handle TX. Additional to all the RX
> binding, we also create tx_vec needed for the TX path.
> 
> Provide API for sendmsg to be able to send dmabufs bound to this device:
> 
> - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
> - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf.
> 
> Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY
> implementation, while disabling instances where MSG_ZEROCOPY falls back
> to copying.
> 
> We additionally pipe the binding down to the new
> zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems
> instead of the traditional page netmems.
> 
> We also special case skb_frag_dma_map to return the dma-address of these
> dmabuf net_iovs instead of attempting to map pages.
> 
> Based on work by Stanislav Fomichev <sdf@fomichev.me>. A lot of the meat
> of the implementation came from devmem TCP RFC v1[1], which included the
> TX path, but Stan did all the rebasing on top of netmem/net_iov.
> 
> Cc: Stanislav Fomichev <sdf@fomichev.me>
> Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> 
> 
> ---
> 
> v3:
> - Use kvmalloc_array instead of kcalloc (Stan).
> - Fix unreachable code warning (Simon).
> 
> v2:
> - Remove dmabuf_offset from the dmabuf cmsg.
> - Update zerocopy_fill_skb_from_devmem to interpret the
>   iov_base/iter_iov_addr as the offset into the dmabuf to send from
>   (Stan).
> - Remove the confusing binding->tx_iter which is not needed if we
>   interpret the iov_base/iter_iov_addr as offset into the dmabuf (Stan).
> - Remove check for binding->sgt and binding->sgt->nents in dmabuf
>   binding.
> - Simplify the calculation of binding->tx_vec.
> - Check in net_devmem_get_binding that the binding we're returning
>   has ifindex matching the sending socket (Willem).
> ---
>  include/linux/skbuff.h                  | 15 +++-
>  include/net/sock.h                      |  1 +
>  include/uapi/linux/uio.h                |  6 +-
>  net/core/datagram.c                     | 41 ++++++++++-
>  net/core/devmem.c                       | 97 +++++++++++++++++++++++--
>  net/core/devmem.h                       | 42 ++++++++++-
>  net/core/netdev-genl.c                  | 64 +++++++++++++++-
>  net/core/skbuff.c                       |  6 +-
>  net/core/sock.c                         |  8 ++
>  net/ipv4/tcp.c                          | 36 ++++++---
>  net/vmw_vsock/virtio_transport_common.c |  3 +-
>  11 files changed, 285 insertions(+), 34 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index bb2b751d274a..3ff8f568c382 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1711,9 +1711,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
>  
>  void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
>  
> +struct net_devmem_dmabuf_binding;
> +
>  int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
>  			    struct sk_buff *skb, struct iov_iter *from,
> -			    size_t length);
> +			    size_t length,
> +			    struct net_devmem_dmabuf_binding *binding);
>  
>  int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>  				struct iov_iter *from, size_t length);
> @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>  static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
>  					  struct msghdr *msg, int len)
>  {
> -	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
> +	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
> +				       NULL);
>  }
>  
>  int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
>  			     struct msghdr *msg, int len,
> -			     struct ubuf_info *uarg);
> +			     struct ubuf_info *uarg,
> +			     struct net_devmem_dmabuf_binding *binding);
>  
>  /* Internal */
>  #define skb_shinfo(SKB)	((struct skb_shared_info *)(skb_end_pointer(SKB)))
> @@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
>  					    size_t offset, size_t size,
>  					    enum dma_data_direction dir)
>  {
> +	if (skb_frag_is_net_iov(frag)) {
> +		return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
> +		       frag->offset;
> +	}
>  	return dma_map_page(dev, skb_frag_page(frag),
>  			    skb_frag_off(frag) + offset, size, dir);
>  }
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8036b3b79cd8..09eb918525b6 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1822,6 +1822,7 @@ struct sockcm_cookie {
>  	u32 tsflags;
>  	u32 ts_opt_id;
>  	u32 priority;
> +	u32 dmabuf_id;
>  };
>  
>  static inline void sockcm_init(struct sockcm_cookie *sockc,
> diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
> index 649739e0c404..866bd5dfe39f 100644
> --- a/include/uapi/linux/uio.h
> +++ b/include/uapi/linux/uio.h
> @@ -38,10 +38,14 @@ struct dmabuf_token {
>  	__u32 token_count;
>  };
>  
> +struct dmabuf_tx_cmsg {
> +	__u32 dmabuf_id;
> +};
> +

Why a wrapper struct instead of just __u32?
Pavel Begunkov Feb. 12, 2025, 3:53 p.m. UTC | #3
On 2/10/25 21:09, Mina Almasry wrote:
> On Wed, Feb 5, 2025 at 4:20 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 2/3/25 22:39, Mina Almasry wrote:
>> ...
>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>> index bb2b751d274a..3ff8f568c382 100644
>>> --- a/include/linux/skbuff.h
>>> +++ b/include/linux/skbuff.h
>>> @@ -1711,9 +1711,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
>> ...
>>>    int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>>>                                struct iov_iter *from, size_t length);
>>> @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>>>    static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
>>>                                          struct msghdr *msg, int len)
>>>    {
>>> -     return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
>>> +     return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
>>> +                                    NULL);
>>
>> Instead of propagating it all the way down and carving a new path, why
>> not reuse the existing infra? You already hook into where ubuf is
>> allocated, you can stash the binding in there. And
> 
> It looks like it's not possible to increase the size of ubuf_info at
> all, otherwise the BUILD_BUG_ON in msg_zerocopy_alloc() fires.
> 
> It's asserting that sizeof(ubuf_info_msgzc) <= sizeof(skb->cb), and
> I'm guessing increasing skb->cb size is not really the way to go.
> 
> What I may be able to do here is stash the binding somewhere in
> ubuf_info_msgzc via union with fields we don't need for devmem, and/or

It doesn't need to account the memory against the user, and you
actually don't want that because dmabuf should take care of that.
So, it should be fine to reuse ->mmp.

It's also not a real sk_buff, so maybe maintainers wouldn't mind
reusing some more space out of it, if that would even be needed.

> stashing the binding in ubuf_info_ops (very hacky). Neither approach
> seems ideal, but the former may work and may be cleaner.
> 
> I'll take a deeper look here. I had looked before and concluded that
> we're piggybacking devmem TX on MSG_ZEROCOPY path, because we need
> almost all of the functionality there (no copying, send complete
> notifications, etc), with one minor change in the skb filling. I had
> concluded that if MSG_ZEROCOPY was never updated to use the existing
> infra, then it's appropriate for devmem TX piggybacking on top of it

MSG_ZEROCOPY does use the common infra, i.e. passing ubuf_info,
but doesn't need ->sg_from_iter since zerocopy_fill_skb_from_iter()
is the default and it's what was there first.

> to follow that. I would not want to get into a refactor of
> MSG_ZEROCOPY for no real reason.
> 
> But I'll take a deeper look here and see if I can make something
> slightly cleaner work.
> 
>> zerocopy_fill_skb_from_devmem can implement ->sg_from_iter,
>> see __zerocopy_sg_from_iter().
>>
>> ...
>>> diff --git a/net/core/datagram.c b/net/core/datagram.c
>>> index f0693707aece..c989606ff58d 100644
>>> --- a/net/core/datagram.c
>>> +++ b/net/core/datagram.c
>>> @@ -63,6 +63,8 @@
>>> +static int
>>> +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
>>> +                           int length,
>>> +                           struct net_devmem_dmabuf_binding *binding)
>>> +{
>>> +     int i = skb_shinfo(skb)->nr_frags;
>>> +     size_t virt_addr, size, off;
>>> +     struct net_iov *niov;
>>> +
>>> +     while (length && iov_iter_count(from)) {
>>> +             if (i == MAX_SKB_FRAGS)
>>> +                     return -EMSGSIZE;
>>> +
>>> +             virt_addr = (size_t)iter_iov_addr(from);
>>
>> Unless I missed it somewhere it needs to check that the iter
>> is iovec based.
>>
> 
> How do we end up here with an iterator that is not iovec based? Is the
> user able to trigger that somehow and I missed it?

Hopefully not, but for example io_uring passes bvecs for a number of
requests that can end up in tcp_sendmsg_locked(). Those probably
would work with the current patch, but change the order of some of the
checks and it will break. And once io_uring starts passing bvecs for
normal send[msg] requests, it'd definitely be possible. And there
are other in-kernel users apart from the send(2) path, so who knows.

The API allows it and therefore it should be checked; it's better to
avoid quite possible latent bugs.
Pavel Begunkov Feb. 13, 2025, 1:18 p.m. UTC | #4
On 2/12/25 19:18, Mina Almasry wrote:
> On Wed, Feb 12, 2025 at 7:52 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 2/10/25 21:09, Mina Almasry wrote:
>>> On Wed, Feb 5, 2025 at 4:20 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>>
>>>> On 2/3/25 22:39, Mina Almasry wrote:
>>>> ...
>>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>>>> index bb2b751d274a..3ff8f568c382 100644
>>>>> --- a/include/linux/skbuff.h
>>>>> +++ b/include/linux/skbuff.h
>>>>> @@ -1711,9 +1711,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
>>>> ...
>>>>>     int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>>>>>                                 struct iov_iter *from, size_t length);
>>>>> @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
>>>>>     static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
>>>>>                                           struct msghdr *msg, int len)
>>>>>     {
>>>>> -     return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
>>>>> +     return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
>>>>> +                                    NULL);
>>>>
>>>> Instead of propagating it all the way down and carving a new path, why
>>>> not reuse the existing infra? You already hook into where ubuf is
>>>> allocated, you can stash the binding in there. And
>>>
>>> It looks like it's not possible to increase the size of ubuf_info at
>>> all, otherwise the BUILD_BUG_ON in msg_zerocopy_alloc() fires.
>>>
>>> It's asserting that sizeof(ubuf_info_msgzc) <= sizeof(skb->cb), and
>>> I'm guessing increasing skb->cb size is not really the way to go.
>>>
>>> What I may be able to do here is stash the binding somewhere in
>>> ubuf_info_msgzc via union with fields we don't need for devmem, and/or
>>
>> It doesn't need to account the memory against the user, and you
>> actually don't want that because dmabuf should take care of that.
>> So, it should be fine to reuse ->mmp.
>>
>> It's also not a real sk_buff, so maybe maintainers wouldn't mind
>> reusing some more space out of it, if that would even be needed.
>>
> 
> netmem skb are real sk_buff, with the modification that frags are not

We were discussing ubuf_info allocation, take a look at
msg_zerocopy_alloc(), it has nothing to do with netmems and all that.

> readable, only in the case that the netmem is unreadable. I would not
> approve of considering netmem/devmem skbs "not real skbs", and start
> messing with the semantics of skb fields for devmem skbs, and having
> to start adding skb_is_devmem() checks through all code in the skb
> handlers that touch the fields being overwritten in the devmem case.
> No, I don't think we can re-use random fields in the skb for devmem.
> 
>>> stashing the binding in ubuf_info_ops (very hacky). Neither approach
>>> seems ideal, but the former may work and may be cleaner.
>>>
>>> I'll take a deeper look here. I had looked before and concluded that
>>> we're piggybacking devmem TX on MSG_ZEROCOPY path, because we need
>>> almost all of the functionality there (no copying, send complete
>>> notifications, etc), with one minor change in the skb filling. I had
>>> concluded that if MSG_ZEROCOPY was never updated to use the existing
>>> infra, then it's appropriate for devmem TX piggybacking on top of it
>>
>> MSG_ZEROCOPY does use the common infra, i.e. passing ubuf_info,
>> but doesn't need ->sg_from_iter as zerocopy_fill_skb_from_iter()
>> and it's what was there first.
>>
> 
> But MSG_ZEROCOPY doesn't set msg->msg_ubuf. And not setting
> msg->msg_ubuf fails to trigger msg->sg_from_iter altogether.
> 
> And also currently sg_from_iter isn't set up to take in a ubuf_info.
> We'd need that if we stash the binding in the ubuf_info.

https://github.com/isilence/linux.git sg-iter-ops

I have old patches for all of that, they even rebased cleanly. That
should do it for you, and I need to send them regardless of devmem.


> All in all I think I wanna prototype an msg->sg_from_iter approach and
> make a judgement call on whether it's cleaner than just passing the
> binding through a couple of helpers just as I'm doing here. My feeling
> is that the implementation in this patch may be cleaner than
> refactoring the entire msg_ubuf/sg_from_iter flows so we can sort of
> use it for MSG_ZEROCOPY with devmem when it currently doesn't use it.
> 
>>> to follow that. I would not want to get into a refactor of
>>> MSG_ZEROCOPY for no real reason.
>>>
>>> But I'll take a deeper look here and see if I can make something
>>> slightly cleaner work.
Mina Almasry Feb. 17, 2025, 11:26 p.m. UTC | #5
On Thu, Feb 13, 2025 at 5:17 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 2/12/25 19:18, Mina Almasry wrote:
> > On Wed, Feb 12, 2025 at 7:52 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 2/10/25 21:09, Mina Almasry wrote:
> >>> On Wed, Feb 5, 2025 at 4:20 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>>>
> >>>> On 2/3/25 22:39, Mina Almasry wrote:
> >>>> ...
> >>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> >>>>> index bb2b751d274a..3ff8f568c382 100644
> >>>>> --- a/include/linux/skbuff.h
> >>>>> +++ b/include/linux/skbuff.h
> >>>>> @@ -1711,9 +1711,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
> >>>> ...
> >>>>>     int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
> >>>>>                                 struct iov_iter *from, size_t length);
> >>>>> @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
> >>>>>     static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
> >>>>>                                           struct msghdr *msg, int len)
> >>>>>     {
> >>>>> -     return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
> >>>>> +     return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
> >>>>> +                                    NULL);
> >>>>
> >>>> Instead of propagating it all the way down and carving a new path, why
> >>>> not reuse the existing infra? You already hook into where ubuf is
> >>>> allocated, you can stash the binding in there. And
> >>>
> >>> It looks like it's not possible to increase the size of ubuf_info at
> >>> all, otherwise the BUILD_BUG_ON in msg_zerocopy_alloc() fires.
> >>>
> >>> It's asserting that sizeof(ubuf_info_msgzc) <= sizeof(skb->cb), and
> >>> I'm guessing increasing skb->cb size is not really the way to go.
> >>>
> >>> What I may be able to do here is stash the binding somewhere in
> >>> ubuf_info_msgzc via union with fields we don't need for devmem, and/or
> >>
> >> It doesn't need to account the memory against the user, and you
> >> actually don't want that because dmabuf should take care of that.
> >> So, it should be fine to reuse ->mmp.
> >>
> >> It's also not a real sk_buff, so maybe maintainers wouldn't mind
> >> reusing some more space out of it, if that would even be needed.
> >>
> >
> > netmem skb are real sk_buff, with the modification that frags are not
>
> We were discussing ubuf_info allocation, take a look at
> msg_zerocopy_alloc(), it has nothing to do with netmems and all that.
>

Yes. My response was regarding the suggestion that we can use space in
devmem skbs however we want though.

> > readable, only in the case that the netmem is unreadable. I would not
> > approve of considering netmem/devmem skbs "not real skbs", and start
> > messing with the semantics of skb fields for devmem skbs, and having
> > to start adding skb_is_devmem() checks through all code in the skb
> > handlers that touch the fields being overwritten in the devmem case.
> > No, I don't think we can re-use random fields in the skb for devmem.
> >
> >>> stashing the binding in ubuf_info_ops (very hacky). Neither approach
> >>> seems ideal, but the former may work and may be cleaner.
> >>>
> >>> I'll take a deeper look here. I had looked before and concluded that
> >>> we're piggybacking devmem TX on MSG_ZEROCOPY path, because we need
> >>> almost all of the functionality there (no copying, send complete
> >>> notifications, etc), with one minor change in the skb filling. I had
> >>> concluded that if MSG_ZEROCOPY was never updated to use the existing
> >>> infra, then it's appropriate for devmem TX piggybacking on top of it
> >>
> >> MSG_ZEROCOPY does use the common infra, i.e. passing ubuf_info,
> >> but doesn't need ->sg_from_iter as zerocopy_fill_skb_from_iter()
> >> and it's what was there first.
> >>
> >
> > But MSG_ZEROCOPY doesn't set msg->msg_ubuf. And not setting
> > msg->msg_ubuf fails to trigger msg->sg_from_iter altogether.
> >
> > And also currently sg_from_iter isn't set up to take in a ubuf_info.
> > We'd need that if we stash the binding in the ubuf_info.
>
> https://github.com/isilence/linux.git sg-iter-ops
>
> I have old patches for all of that, they even rebased cleanly. That
> should do it for you, and I need to send then regardless of devmem.
>
>

These patches help a bit, but do not make any meaningful dent in
addressing the concern I raised in the earlier emails.

The concern is that we're piggybacking devmem TX on MSG_ZEROCOPY, and
currently the MSG_ZEROCOPY code carefully avoids any code paths
setting msg->[sg_from_iter|msg_ubuf].

If we want devmem to reuse both the MSG_ZEROCOPY mechanisms and the
msg->[sg_from_iter|ubuf_info] mechanism, I have to dissect the
MSG_ZEROCOPY code carefully so that it works with and without
setting msg->[ubuf_info|msg->sg_from_iter]. Having gone through this
rabbit hole so far I see that it complicates the implementation and
adds more checks to the fast MSG_ZEROCOPY paths.

The complication could be worth it if there was some upside, but I
don't see one tbh. Passing the binding down to
zerocopy_fill_skb_from_devmem seems like a better approach to my eye
so far.

I'm afraid I'm going to table this for now. If there is overwhelming
consensus that msg->sg_from_iter is the right approach here I will
revisit, but it seems to me to complicate code without a significant
upside.


--
Thanks,
Mina
Pavel Begunkov Feb. 19, 2025, 10:41 p.m. UTC | #6
On 2/17/25 23:26, Mina Almasry wrote:
> On Thu, Feb 13, 2025 at 5:17 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>>>>> It's asserting that sizeof(ubuf_info_msgzc) <= sizeof(skb->cb), and
>>>>> I'm guessing increasing skb->cb size is not really the way to go.
>>>>>
>>>>> What I may be able to do here is stash the binding somewhere in
>>>>> ubuf_info_msgzc via union with fields we don't need for devmem, and/or
>>>>
>>>> It doesn't need to account the memory against the user, and you
>>>> actually don't want that because dmabuf should take care of that.
>>>> So, it should be fine to reuse ->mmp.
>>>>
>>>> It's also not a real sk_buff, so maybe maintainers wouldn't mind
>>>> reusing some more space out of it, if that would even be needed.
>>>>
>>>
>>> netmem skb are real sk_buff, with the modification that frags are not
>>
>> We were discussing ubuf_info allocation, take a look at
>> msg_zerocopy_alloc(), it has nothing to do with netmems and all that.
>>
> 
> Yes. My response was regarding the suggestion that we can use space in
> devmem skbs however we want though.

Well, at least I didn't suggest that, assuming "devmem skbs" are skbs
filled with devmem frags. I think the confusion here is thinking
that the skb->cb you mentioned above is about "devmem skbs", while it's
a special skb without data, used only to piggyback the ubuf allocation.
Functionally speaking, it'd be perfectly fine to get rid of the
warning and allocate it with kmalloc().

...
>>> But MSG_ZEROCOPY doesn't set msg->msg_ubuf. And not setting
>>> msg->msg_ubuf fails to trigger msg->sg_from_iter altogether.
>>>
>>> And also currently sg_from_iter isn't set up to take in a ubuf_info.
>>> We'd need that if we stash the binding in the ubuf_info.
>>
>> https://github.com/isilence/linux.git sg-iter-ops
>>
>> I have old patches for all of that, they even rebased cleanly. That
>> should do it for you, and I need to send then regardless of devmem.
>>
>>
> 
> These patches help a bit, but do not make any meaningful dent in
> addressing the concern I have in the earlier emails.
> 
> The concern is that we're piggybacking devmem TX on MSG_ZEROCOPY, and
> currently the MSG_ZEROCOPY code carefully avoids any code paths
> setting msg->[sg_from_iter|msg_ubuf].

Fwiw, with that branch you don't need ->msg_ubuf at all, just pass
it as an argument from tcp_sendmsg_locked() as usual, and
->sg_from_iter is gone from there as well.

> If we want devmem to reuse both the MSG_ZEROCOPY mechanisms and the
> msg->[sg_from_iter|ubuf_info] mechanism, I have to dissect the
> MSG_ZEROCOPY code carefully so that it works with and without
> setting msg->[ubuf_info|msg->sg_from_iter]. Having gone through this
> rabbit hole so far I see that it complicates the implementation and
> adds more checks to the fast MSG_ZEROCOPY paths.

If you've already done it, maybe you can post it as a draft? At least
it'll be obvious why you say it's more complicated.

> The complication could be worth it if there was some upside, but I
> don't see one tbh. Passing the binding down to
> zerocopy_fill_skb_from_devmem seems like a better approach to my eye
> so far

The upside is that 1) you currently add overhead to the common
path (incl. copy), 2) passing it down through all the functions also
adds overhead to the zerocopy and MSG_ZEROCOPY paths, which I'd
assume is comparable to those extra checks you have. 3) tcp would
need to know about devmem tcp and its bindings, while it all could
be in one spot under the MSG_ZEROCOPY check. 4) When you'd want
another protocol to support that, instead of a simple

ubuf = get_devmem_ubuf();

you'd need to plumb the binding passing through the stack there as
well.

5) And keeping it in one place makes it easier to keep around.

I just don't see why it'd be complicated, but maybe I'm missing
something, which is why a draft prototype would explain it
better than any words.

> I'm afraid I'm going to table this for now. If there is overwhelming
> consensus that msg->sg_from_iter is the right approach here I will
> revisit, but it seems to me to complicate code without a significant
> upside.
Pavel Begunkov Feb. 20, 2025, 2:35 p.m. UTC | #7
On 2/20/25 01:46, Mina Almasry wrote:
> On Wed, Feb 19, 2025 at 2:40 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 2/17/25 23:26, Mina Almasry wrote:
>>> On Thu, Feb 13, 2025 at 5:17 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>> ...
>>>>>>> It's asserting that sizeof(ubuf_info_msgzc) <= sizeof(skb->cb), and
>>>>>>> I'm guessing increasing skb->cb size is not really the way to go.
>>>>>>>
>>>>>>> What I may be able to do here is stash the binding somewhere in
>>>>>>> ubuf_info_msgzc via union with fields we don't need for devmem, and/or
>>>>>>
>>>>>> It doesn't need to account the memory against the user, and you
>>>>>> actually don't want that because dmabuf should take care of that.
>>>>>> So, it should be fine to reuse ->mmp.
>>>>>>
>>>>>> It's also not a real sk_buff, so maybe maintainers wouldn't mind
>>>>>> reusing some more space out of it, if that would even be needed.
>>>>>>
>>>>>
>>>>> netmem skb are real sk_buff, with the modification that frags are not
>>>>
>>>> We were discussing ubuf_info allocation, take a look at
>>>> msg_zerocopy_alloc(), it has nothing to do with netmems and all that.
>>>>
>>>
>>> Yes. My response was regarding the suggestion that we can use space in
>>> devmem skbs however we want though.
>>
>> Well, at least I didn't suggest that, assuming "devmem skbs" are skbs
>> filled with devmem frags. I think the confusion here is thinking
>> that skb->cb you mentioned above is about "devmem skbs", while it's
>> special skbs without data used only to piggy back ubuf allocation.
> 
> Ah, I see. I still don't see how we can just increase the size of
> skb->cb when it's shared between these special skbs and regular skbs.

The approach was not to increase ->cb but rather to reuse some other
sk_buff fields that are unused in this path. Though, looking at __msg_zerocopy_callback(),
maybe it's better not to entertain that, as the skb is queued into the
error queue. But again, not like you need it.

>> Functionally speaking, it'd be perfectly fine to get rid of the
>> warning and allocate it with kmalloc().
>>
> 
> More suggestions to refactor unrelated things to force through a
> msg->sg_from_iter approach.

Mina, you're surprising me. I neither suggested doing that (I was just
trying to help with your confusion using analogies), nor said that
it'd be welcome, nor is it somehow "unrelated". And "forcing"
is a misstatement; so far I've been extending a recommendation
on how to make it better.

...
>> If you've already done, maybe you can post it as a draft? At least
>> it'll be obvious why you say it's more complicated.
>>
> 
> I don't have anything worth sharing. Just went down this rabbit hole
> and saw a bunch of MSG_ZEROCOPY checks (!msg->msg_ubuf checks around
> MSG_ZEROCOPY code) and restrictions (skb->cb size) need to be
> addressed and checks to be added. From this thread you seem to be
> suggesting more changes to force in a msg->sg_from_iter approach
> adding to the complications.

To sum up, you haven't tried it.

>>> The complication could be worth it if there was some upside, but I
>>> don't see one tbh. Passing the binding down to
>>> zerocopy_fill_skb_from_devmem seems like a better approach to my eye
>>> so far
>>
>> The upside is that 1) you currently you add overhead to common
>> path (incl copy),
> 
> You mean the unlikely() check for devmem before delegating to
> skb_zerocopy_fill_from_devmem? Should be minimal.

Like keeping the binding in tcp_sendmsg_locked(). The point is,
since you mentioned overhead ("adds more checks to the fast
MSG_ZEROCOPY paths"), that, all things included, the current approach
will be adding more of it to MSG_ZEROCOPY and also to other users.

>> 2) passing it down through all the function also
>> have overhead to the zerocopy and MSG_ZEROCOPY path, which I'd
>> assume is comparable to those extra checks you have.
> 
> Complicating/refactoring existing code for devmem TCP to force in a
> msg->sg_from_iter and save 1 arg passed down a couple of functions
> doesn't seem like a good tradeoff IMO.
> 
>> 3) tcp would
>> need to know about devmem tcp and its bindings, while it all could
>> be in one spot under the MSG_ZEROCOPY check.
> 
> I don't see why this is binding to tcp somehow. If anything it makes

I don't get what you're saying, but it refers to the devmem binding,
which you add to the TCP path, and so tcp now has to know how to work
with devmem instead of all of it being hidden behind the curtains
of ubuf_info. And it sticks out not only for tcp, but for all
zerocopy users, by virtue of dragging it down through all
helpers.

> the devmem TX implementation follow closely MSG_ZEROCOPY, and existing

Following closely would be passing ubuf just like MSG_ZEROCOPY does,
and not plumbing devmem all the way through all helpers.

> MSG_ZEROCOPY code would be easily extended for devmem TX without
> having to also carry refactors to migrate to msg->sg_from_iter

Don't be afraid of refactoring when it makes things better. We're
talking about minor changes touching only bits in the direct
vicinity of your set.

> approach (just grab the binding and pass it to
> skb_zerocopy_iter_stream).
> 
>> 4) When you'd want
>> another protocol to support that, instead of a simple
>>
>> ubuf = get_devmem_ubuf();
>>
>> You'd need to plumb binding passing through the stack there as
>> well.
>>
> 
> Similar to above, I think this approach will actually extend easier to
> any protocol already using MSG_ZEROCOPY, because we follow that
> closely instead of requiring refactors to force msg->sg_from_iter
> approach.

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb2b751d274a..3ff8f568c382 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1711,9 +1711,12 @@  struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
+struct net_devmem_dmabuf_binding;
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
-			    size_t length);
+			    size_t length,
+			    struct net_devmem_dmabuf_binding *binding);
 
 int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 				struct iov_iter *from, size_t length);
@@ -1721,12 +1724,14 @@  int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
 					  struct msghdr *msg, int len)
 {
-	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+				       NULL);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
-			     struct ubuf_info *uarg);
+			     struct ubuf_info *uarg,
+			     struct net_devmem_dmabuf_binding *binding);
 
 /* Internal */
 #define skb_shinfo(SKB)	((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@  static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
 					    size_t offset, size_t size,
 					    enum dma_data_direction dir)
 {
+	if (skb_frag_is_net_iov(frag)) {
+		return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+		       frag->offset;
+	}
 	return dma_map_page(dev, skb_frag_page(frag),
 			    skb_frag_off(frag) + offset, size, dir);
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 8036b3b79cd8..09eb918525b6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1822,6 +1822,7 @@  struct sockcm_cookie {
 	u32 tsflags;
 	u32 ts_opt_id;
 	u32 priority;
+	u32 dmabuf_id;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 649739e0c404..866bd5dfe39f 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -38,10 +38,14 @@  struct dmabuf_token {
 	__u32 token_count;
 };
 
+struct dmabuf_tx_cmsg {
+	__u32 dmabuf_id;
+};
+
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
- 
+
 #define UIO_FASTIOV	8
 #define UIO_MAXIOV	1024
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f0693707aece..c989606ff58d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -63,6 +63,8 @@ 
 #include <net/busy_poll.h>
 #include <crypto/hash.h>
 
+#include "devmem.h"
+
 /*
  *	Is a socket 'connection oriented' ?
  */
@@ -692,9 +694,42 @@  int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 	return 0;
 }
 
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+			      int length,
+			      struct net_devmem_dmabuf_binding *binding)
+{
+	int i = skb_shinfo(skb)->nr_frags;
+	size_t virt_addr, size, off;
+	struct net_iov *niov;
+
+	while (length && iov_iter_count(from)) {
+		if (i == MAX_SKB_FRAGS)
+			return -EMSGSIZE;
+
+		virt_addr = (size_t)iter_iov_addr(from);
+		niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+		if (!niov)
+			return -EFAULT;
+
+		size = min_t(size_t, size, length);
+		size = min_t(size_t, size, iter_iov_len(from));
+
+		get_netmem(net_iov_to_netmem(niov));
+		skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+				       size, PAGE_SIZE);
+		iov_iter_advance(from, size);
+		length -= size;
+		i++;
+	}
+
+	return 0;
+}
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
-			    size_t length)
+			    size_t length,
+			    struct net_devmem_dmabuf_binding *binding)
 {
 	unsigned long orig_size = skb->truesize;
 	unsigned long truesize;
@@ -702,6 +737,8 @@  int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 
 	if (msg && msg->msg_ubuf && msg->sg_from_iter)
 		ret = msg->sg_from_iter(skb, from, length);
+	else if (unlikely(binding))
+		ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
 	else
 		ret = zerocopy_fill_skb_from_iter(skb, from, length);
 
@@ -735,7 +772,7 @@  int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 		return -EFAULT;
 
-	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 20985a570662..5de887545f5e 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -16,6 +16,7 @@ 
 #include <net/netdev_queues.h>
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/helpers.h>
+#include <net/sock.h>
 #include <trace/events/page_pool.h>
 
 #include "devmem.h"
@@ -64,8 +65,10 @@  void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
 	dma_buf_detach(binding->dmabuf, binding->attachment);
 	dma_buf_put(binding->dmabuf);
 	xa_destroy(&binding->bound_rxqs);
+	kvfree(binding->tx_vec);
 	kfree(binding);
 }
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
 
 struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -110,6 +113,13 @@  void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 	unsigned long xa_idx;
 	unsigned int rxq_idx;
 
+	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+	/* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+	 * erase.
+	 */
+	synchronize_net();
+
 	if (binding->list.next)
 		list_del(&binding->list);
 
@@ -123,8 +133,6 @@  void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 		WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx));
 	}
 
-	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
 	net_devmem_dmabuf_binding_put(binding);
 }
 
@@ -185,8 +193,9 @@  int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
-		       struct netlink_ext_ack *extack)
+net_devmem_bind_dmabuf(struct net_device *dev,
+		       enum dma_data_direction direction,
+		       unsigned int dmabuf_fd, struct netlink_ext_ack *extack)
 {
 	struct net_devmem_dmabuf_binding *binding;
 	static u32 id_alloc_next;
@@ -229,7 +238,7 @@  net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 	}
 
 	binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
-						       DMA_FROM_DEVICE);
+						       direction);
 	if (IS_ERR(binding->sgt)) {
 		err = PTR_ERR(binding->sgt);
 		NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
@@ -240,13 +249,23 @@  net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 	 * binding can be much more flexible than that. We may be able to
 	 * allocate MTU sized chunks here. Leave that for future work...
 	 */
-	binding->chunk_pool =
-		gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+	binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+					      dev_to_node(&dev->dev));
 	if (!binding->chunk_pool) {
 		err = -ENOMEM;
 		goto err_unmap;
 	}
 
+	if (direction == DMA_TO_DEVICE) {
+		binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+						 sizeof(struct net_iov *),
+						 GFP_KERNEL);
+		if (!binding->tx_vec) {
+			err = -ENOMEM;
+			goto err_free_chunks;
+		}
+	}
+
 	virtual = 0;
 	for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
 		dma_addr_t dma_addr = sg_dma_address(sg);
@@ -288,6 +307,8 @@  net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 			niov->owner = owner;
 			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
 						      net_devmem_get_dma_addr(niov));
+			if (direction == DMA_TO_DEVICE)
+				binding->tx_vec[owner->base_virtual / PAGE_SIZE + i] = niov;
 		}
 
 		virtual += len;
@@ -313,6 +334,21 @@  net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 	return ERR_PTR(err);
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+	struct net_devmem_dmabuf_binding *binding;
+
+	rcu_read_lock();
+	binding = xa_load(&net_devmem_dmabuf_bindings, id);
+	if (binding) {
+		if (!net_devmem_dmabuf_binding_get(binding))
+			binding = NULL;
+	}
+	rcu_read_unlock();
+
+	return binding;
+}
+
 void dev_dmabuf_uninstall(struct net_device *dev)
 {
 	struct net_devmem_dmabuf_binding *binding;
@@ -343,6 +379,53 @@  void net_devmem_put_net_iov(struct net_iov *niov)
 	net_devmem_dmabuf_binding_put(niov->owner->binding);
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+							 unsigned int dmabuf_id)
+{
+	struct net_devmem_dmabuf_binding *binding;
+	struct dst_entry *dst = __sk_dst_get(sk);
+	int err = 0;
+
+	binding = net_devmem_lookup_dmabuf(dmabuf_id);
+	if (!binding || !binding->tx_vec) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	/* The dma-addrs in this binding are only reachable to the corresponding
+	 * net_device.
+	 */
+	if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+		err = -ENODEV;
+		goto out_err;
+	}
+
+	return binding;
+
+out_err:
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
+
+	return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+		       size_t virt_addr, size_t *off, size_t *size)
+{
+	size_t idx;
+
+	if (virt_addr >= binding->dmabuf->size)
+		return NULL;
+
+	idx = virt_addr / PAGE_SIZE;
+
+	*off = virt_addr % PAGE_SIZE;
+	*size = PAGE_SIZE - *off;
+
+	return binding->tx_vec[idx];
+}
+
 /*** "Dmabuf devmem memory provider" ***/
 
 int mp_dmabuf_devmem_init(struct page_pool *pool)
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 8b51caff5a0e..874e891e70e0 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -46,6 +46,12 @@  struct net_devmem_dmabuf_binding {
 	 * active.
 	 */
 	u32 id;
+
+	/* Array of net_iov pointers for this binding, sorted by virtual
+	 * address. This array is convenient to map the virtual addresses to
+	 * net_iovs in the TX path.
+	 */
+	struct net_iov **tx_vec;
 };
 
 #if defined(CONFIG_NET_DEVMEM)
@@ -70,12 +76,15 @@  struct dmabuf_genpool_chunk_owner {
 
 void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
-		       struct netlink_ext_ack *extack);
+net_devmem_bind_dmabuf(struct net_device *dev,
+		       enum dma_data_direction direction,
+		       unsigned int dmabuf_fd, struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
 int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 				    struct net_devmem_dmabuf_binding *binding,
 				    struct netlink_ext_ack *extack);
+void net_devmem_bind_tx_release(struct sock *sk);
 void dev_dmabuf_uninstall(struct net_device *dev);
 
 static inline struct dmabuf_genpool_chunk_owner *
@@ -108,10 +117,10 @@  static inline u32 net_iov_binding_id(const struct net_iov *niov)
 	return net_iov_owner(niov)->binding->id;
 }
 
-static inline void
+static inline bool
 net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
 {
-	refcount_inc(&binding->ref);
+	return refcount_inc_not_zero(&binding->ref);
 }
 
 static inline void
@@ -130,6 +139,12 @@  struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
 void net_devmem_free_dmabuf(struct net_iov *ppiov);
 
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+		       size_t *off, size_t *size);
+
 #else
 struct net_devmem_dmabuf_binding;
 
@@ -153,11 +168,17 @@  __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
 
 static inline struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+		       enum dma_data_direction direction,
 		       struct netlink_ext_ack *extack)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+	return NULL;
+}
+
 static inline void
 net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 {
@@ -195,6 +216,19 @@  static inline u32 net_iov_binding_id(const struct net_iov *niov)
 {
 	return 0;
 }
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+		       size_t *off, size_t *size)
+{
+	return NULL;
+}
 #endif
 
 #endif /* _NET_DEVMEM_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 0e41699df419..3ecb3a6d3913 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -854,7 +854,8 @@  int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock;
 	}
 
-	binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
+	binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
+					 info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_unlock;
@@ -911,10 +912,67 @@  int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
-/* stub */
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 {
-	return 0;
+	struct net_devmem_dmabuf_binding *binding;
+	struct list_head *sock_binding_list;
+	struct net_device *netdev;
+	u32 ifindex, dmabuf_fd;
+	struct sk_buff *rsp;
+	int err = 0;
+	void *hdr;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+		return -EINVAL;
+
+	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+	sock_binding_list = genl_sk_priv_get(&netdev_nl_family,
+					     NETLINK_CB(skb).sk);
+	if (IS_ERR(sock_binding_list))
+		return PTR_ERR(sock_binding_list);
+
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_genlmsg_free;
+	}
+
+	rtnl_lock();
+
+	netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+	if (!netdev || !netif_device_present(netdev)) {
+		err = -ENODEV;
+		goto err_unlock;
+	}
+
+	binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd,
+					 info->extack);
+	if (IS_ERR(binding)) {
+		err = PTR_ERR(binding);
+		goto err_unlock;
+	}
+
+	list_add(&binding->list, sock_binding_list);
+
+	nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+	genlmsg_end(rsp, hdr);
+
+	rtnl_unlock();
+
+	return genlmsg_reply(rsp, info);
+
+err_unlock:
+	rtnl_unlock();
+err_genlmsg_free:
+	nlmsg_free(rsp);
+	return err;
 }
 
 void netdev_nl_sock_priv_init(struct list_head *priv)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 815245d5c36b..6289ffcbb20b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1882,7 +1882,8 @@  EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
-			     struct ubuf_info *uarg)
+			     struct ubuf_info *uarg,
+			     struct net_devmem_dmabuf_binding *binding)
 {
 	int err, orig_len = skb->len;
 
@@ -1901,7 +1902,8 @@  int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			return -EEXIST;
 	}
 
-	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+				      binding);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 		struct sock *save_sk = skb->sk;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index eae2ae70a2e0..353669f124ab 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2911,6 +2911,7 @@  EXPORT_SYMBOL(sock_alloc_send_pskb);
 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 		     struct sockcm_cookie *sockc)
 {
+	struct dmabuf_tx_cmsg dmabuf_tx;
 	u32 tsflags;
 
 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
@@ -2964,6 +2965,13 @@  int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
 			return -EPERM;
 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+		break;
+	case SCM_DEVMEM_DMABUF:
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct dmabuf_tx_cmsg)))
+			return -EINVAL;
+		dmabuf_tx = *(struct dmabuf_tx_cmsg *)CMSG_DATA(cmsg);
+		sockc->dmabuf_id = dmabuf_tx.dmabuf_id;
+
 		break;
 	default:
 		return -EINVAL;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d704bda6c41..44198ae7e44c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1051,6 +1051,7 @@  int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 {
+	struct net_devmem_dmabuf_binding *binding = NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct ubuf_info *uarg = NULL;
 	struct sk_buff *skb;
@@ -1063,6 +1064,15 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 	flags = msg->msg_flags;
 
+	sockcm_init(&sockc, sk);
+	if (msg->msg_controllen) {
+		err = sock_cmsg_send(sk, msg, &sockc);
+		if (unlikely(err)) {
+			err = -EINVAL;
+			goto out_err;
+		}
+	}
+
 	if ((flags & MSG_ZEROCOPY) && size) {
 		if (msg->msg_ubuf) {
 			uarg = msg->msg_ubuf;
@@ -1080,6 +1090,15 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			else
 				uarg_to_msgzc(uarg)->zerocopy = 0;
 		}
+
+		if (sockc.dmabuf_id != 0) {
+			binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+			if (IS_ERR(binding)) {
+				err = PTR_ERR(binding);
+				binding = NULL;
+				goto out_err;
+			}
+		}
 	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
 		if (sk->sk_route_caps & NETIF_F_SG)
 			zc = MSG_SPLICE_PAGES;
@@ -1123,15 +1142,6 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		/* 'common' sending to sendq */
 	}
 
-	sockcm_init(&sockc, sk);
-	if (msg->msg_controllen) {
-		err = sock_cmsg_send(sk, msg, &sockc);
-		if (unlikely(err)) {
-			err = -EINVAL;
-			goto out_err;
-		}
-	}
-
 	/* This should be in poll */
 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
@@ -1248,7 +1258,8 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 					goto wait_for_space;
 			}
 
-			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+						       binding);
 			if (err == -EMSGSIZE || err == -EEXIST) {
 				tcp_mark_push(tp, skb);
 				goto new_segment;
@@ -1329,6 +1340,8 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
 	if (uarg && !msg->msg_ubuf)
 		net_zcopy_put(uarg);
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
 	return copied + copied_syn;
 
 do_error:
@@ -1346,6 +1359,9 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		sk->sk_write_space(sk);
 		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7f7de6d88096..f6d4bb798517 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -107,8 +107,7 @@  static int virtio_transport_fill_skb(struct sk_buff *skb,
 {
 	if (zcopy)
 		return __zerocopy_sg_from_iter(info->msg, NULL, skb,
-					       &info->msg->msg_iter,
-					       len);
+					       &info->msg->msg_iter, len, NULL);
 
 	return memcpy_from_msg(skb_put(skb, len), info->msg, len);
 }