Message ID | 20210805185750.4522-3-xiyou.wangcong@gmail.com
---|---
State | New
Series | net: add more tracepoints to TCP/IP stack
On 8/5/21 8:57 PM, Cong Wang wrote:
> From: Qitao Xu <qitao.xu@bytedance.com>
>
> Tracepoint trace_ip_queue_xmit() is introduced to trace skb
> at the entrance of IP layer on TX side.
>
> Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> ---
>  include/trace/events/ip.h | 42 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/ip_output.c      | 10 +++++++++-
>  2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
> index 008f821ebc50..553ae7276732 100644
> --- a/include/trace/events/ip.h
> +++ b/include/trace/events/ip.h
> @@ -41,6 +41,48 @@
>  	TP_STORE_V4MAPPED(__entry, saddr, daddr)
>  #endif
>
> +TRACE_EVENT(ip_queue_xmit,
> +
> +	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> +
> +	TP_ARGS(sk, skb),
> +
> +	TP_STRUCT__entry(
> +		__field(const void *, skbaddr)
> +		__field(const void *, skaddr)
> +		__field(__u16, sport)
> +		__field(__u16, dport)
> +		__array(__u8, saddr, 4)
> +		__array(__u8, daddr, 4)
> +		__array(__u8, saddr_v6, 16)
> +		__array(__u8, daddr_v6, 16)
> +	),
> +
> +	TP_fast_assign(
> +		struct inet_sock *inet = inet_sk(sk);
> +		__be32 *p32;
> +
> +		__entry->skbaddr = skb;
> +		__entry->skaddr = sk;
> +
> +		__entry->sport = ntohs(inet->inet_sport);
> +		__entry->dport = ntohs(inet->inet_dport);
> +
> +		p32 = (__be32 *) __entry->saddr;
> +		*p32 = inet->inet_saddr;
> +
> +		p32 = (__be32 *) __entry->daddr;
> +		*p32 = inet->inet_daddr;
> +
> +		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> +			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +	),
> +
> +	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
> +		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +		  __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
> +);
> +
>  #endif /* _TRACE_IP_H */
>
>  /* This part must be outside protection */
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 6b04a88466b2..dcf94059112e 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -82,6 +82,7 @@
>  #include <linux/netfilter_bridge.h>
>  #include <linux/netlink.h>
>  #include <linux/tcp.h>
> +#include <trace/events/ip.h>
>
>  static int
>  ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
> @@ -536,7 +537,14 @@ EXPORT_SYMBOL(__ip_queue_xmit);
>
>  int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
>  {
> -	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	int ret;
> +
> +	ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	if (!ret)
> +		trace_ip_queue_xmit(sk, skb);
> +
> +	return ret;
> +
>  }
>  EXPORT_SYMBOL(ip_queue_xmit);

While it is useful to have stuff like this,
adding so many trace points has a certain cost.

I fear that you have not determined this cost
on workloads where we enter these functions with cold caches.

For instance, before this patch, the compiler gives us:

2e10 <ip_queue_xmit>:
2e10:	e8 00 00 00 00       	callq  2e15 <ip_queue_xmit+0x5>  (__fentry__-0x4)
2e15:	0f b6 8f 1c 03 00 00 	movzbl 0x31c(%rdi),%ecx
2e1c:	e9 ef fb ff ff       	jmpq   2a10 <__ip_queue_xmit>

After the patch, we see the compiler had to save/restore registers and no
longer tail-jumps to __ip_queue_xmit. The code is bigger even when the
tracepoint is not enabled:
2e10:	e8 00 00 00 00       	callq  2e15 <ip_queue_xmit+0x5>
		2e11: R_X86_64_PLT32	__fentry__-0x4
2e15:	41 55                	push   %r13
2e17:	49 89 f5             	mov    %rsi,%r13
2e1a:	41 54                	push   %r12
2e1c:	55                   	push   %rbp
2e1d:	0f b6 8f 1c 03 00 00 	movzbl 0x31c(%rdi),%ecx
2e24:	48 89 fd             	mov    %rdi,%rbp
2e27:	e8 00 00 00 00       	callq  2e2c <ip_queue_xmit+0x1c>
		2e28: R_X86_64_PLT32	__ip_queue_xmit-0x4
2e2c:	41 89 c4             	mov    %eax,%r12d
2e2f:	85 c0                	test   %eax,%eax
2e31:	74 09                	je     2e3c <ip_queue_xmit+0x2c>
2e33:	44 89 e0             	mov    %r12d,%eax
2e36:	5d                   	pop    %rbp
2e37:	41 5c                	pop    %r12
2e39:	41 5d                	pop    %r13
2e3b:	c3                   	retq
2e3c:	66 90                	xchg   %ax,%ax
2e3e:	44 89 e0             	mov    %r12d,%eax
2e41:	5d                   	pop    %rbp
2e42:	41 5c                	pop    %r12
2e44:	41 5d                	pop    %r13
2e46:	c3                   	retq

---- tracing code ---

2e47:	65 8b 05 00 00 00 00 	mov    %gs:0x0(%rip),%eax        # 2e4e <ip_queue_xmit+0x3e>
		2e4a: R_X86_64_PC32	cpu_number-0x4
2e4e:	89 c0                	mov    %eax,%eax
2e50:	48 0f a3 05 00 00 00 	bt     %rax,0x0(%rip)        # 2e58 <ip_queue_xmit+0x48>
2e57:	00
		2e54: R_X86_64_PC32	__cpu_online_mask-0x4
2e58:	73 d9                	jae    2e33 <ip_queue_xmit+0x23>
2e5a:	48 8b 05 00 00 00 00 	mov    0x0(%rip),%rax        # 2e61 <ip_queue_xmit+0x51>
		2e5d: R_X86_64_PC32	__tracepoint_ip_queue_xmit+0x3c
2e61:	48 85 c0             	test   %rax,%rax
2e64:	74 0f                	je     2e75 <ip_queue_xmit+0x65>
2e66:	48 8b 78 08          	mov    0x8(%rax),%rdi
2e6a:	4c 89 ea             	mov    %r13,%rdx
2e6d:	48 89 ee             	mov    %rbp,%rsi
2e70:	e8 00 00 00 00       	callq  2e75 <ip_queue_xmit+0x65>
		2e71: R_X86_64_PLT32	__SCT__tp_func_ip_queue_xmit-0x4
2e75:	44 89 e0             	mov    %r12d,%eax
2e78:	5d                   	pop    %rbp
2e79:	41 5c                	pop    %r12
2e7b:	41 5d                	pop    %r13
2e7d:	c3                   	retq
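The fixed cost Eric points out comes from the tracepoint's static-key guard:
even with no probe attached, the call site keeps a patched-out branch plus the
register shuffling around it. As a rough, hand-simplified sketch of what
trace_ip_queue_xmit() expands to (the real expansion in
include/linux/tracepoint.h goes through __DO_TRACE() and, per the
__SCT__tp_func_ip_queue_xmit relocation above, a static call; this only shows
the shape):

static inline void trace_ip_queue_xmit(const struct sock *sk,
				       const struct sk_buff *skb)
{
	/* Jump label: assembled as a nop while no probe is attached,
	 * live-patched into a jump to the tracing slow path otherwise. */
	if (static_key_false(&__tracepoint_ip_queue_xmit.key))
		/* Simplified: the kernel really dispatches to the attached
		 * probes through a static call, not a direct call here. */
		__traceiter_ip_queue_xmit(NULL, sk, skb);
}

Even when the branch is a nop, sk and skb must stay live across the call to
__ip_queue_xmit() so the slow path can use them, which is exactly the
save/restore visible in Eric's disassembly.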
On Fri, Aug 6, 2021 at 3:09 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> While it is useful to have stuff like this,
> adding so many trace points has a certain cost.
>
> I fear that you have not determined this cost
> on workloads where we enter these functions with cold caches.
>
> For instance, before this patch, the compiler gives us:
>
> 2e10 <ip_queue_xmit>:
> 2e10:	e8 00 00 00 00       	callq  2e15 <ip_queue_xmit+0x5>  (__fentry__-0x4)
> 2e15:	0f b6 8f 1c 03 00 00 	movzbl 0x31c(%rdi),%ecx
> 2e1c:	e9 ef fb ff ff       	jmpq   2a10 <__ip_queue_xmit>
>
> After the patch, we see the compiler had to save/restore registers and no
> longer tail-jumps to __ip_queue_xmit. The code is bigger even when the
> tracepoint is not enabled.

Interesting, I didn't pay attention to the binary code generated by
compilers. Let me check it, as I have moved the trace function before
__ip_queue_xmit() (otherwise the order is reversed).

Thanks!
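For illustration, the reordering Cong mentions would look something like the
sketch below. This is not the posted code: tracing before the call gives up
the !ret check (the event fires even when __ip_queue_xmit() fails), and
whether the compiler then actually restores the tail jump depends on how it
spills registers for the disabled branch.

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
	/* Trace before handing the skb off; with no work left after the
	 * call below, the compiler is free to emit a tail jump again. */
	trace_ip_queue_xmit(sk, skb);
	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
}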
On Thu, Aug 05, 2021 at 11:57:39AM -0700, Cong Wang wrote:
> From: Qitao Xu <qitao.xu@bytedance.com>
>
> Tracepoint trace_ip_queue_xmit() is introduced to trace skb
> at the entrance of IP layer on TX side.
>
> Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> ---
>  include/trace/events/ip.h | 42 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/ip_output.c      | 10 +++++++++-
>  2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
> index 008f821ebc50..553ae7276732 100644
> --- a/include/trace/events/ip.h
> +++ b/include/trace/events/ip.h
> @@ -41,6 +41,48 @@
>  	TP_STORE_V4MAPPED(__entry, saddr, daddr)
>  #endif
>
> +TRACE_EVENT(ip_queue_xmit,
> +
> +	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> +
> +	TP_ARGS(sk, skb),
> +
> +	TP_STRUCT__entry(
> +		__field(const void *, skbaddr)
> +		__field(const void *, skaddr)
> +		__field(__u16, sport)
> +		__field(__u16, dport)
> +		__array(__u8, saddr, 4)
> +		__array(__u8, daddr, 4)
> +		__array(__u8, saddr_v6, 16)
> +		__array(__u8, daddr_v6, 16)
> +	),
> +
> +	TP_fast_assign(
> +		struct inet_sock *inet = inet_sk(sk);
> +		__be32 *p32;
> +
> +		__entry->skbaddr = skb;
> +		__entry->skaddr = sk;
> +
> +		__entry->sport = ntohs(inet->inet_sport);
> +		__entry->dport = ntohs(inet->inet_dport);
> +
> +		p32 = (__be32 *) __entry->saddr;
> +		*p32 = inet->inet_saddr;
> +
> +		p32 = (__be32 *) __entry->daddr;
> +		*p32 = inet->inet_daddr;
> +
> +		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> +			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +	),
> +
> +	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
> +		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +		  __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
> +);
> +
>  #endif /* _TRACE_IP_H */
>
>  /* This part must be outside protection */
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 6b04a88466b2..dcf94059112e 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -82,6 +82,7 @@
>  #include <linux/netfilter_bridge.h>
>  #include <linux/netlink.h>
>  #include <linux/tcp.h>
> +#include <trace/events/ip.h>
>
>  static int
>  ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
> @@ -536,7 +537,14 @@ EXPORT_SYMBOL(__ip_queue_xmit);
>
>  int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
>  {
> -	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	int ret;
> +
> +	ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	if (!ret)
> +		trace_ip_queue_xmit(sk, skb);
Instead of adding tracepoints, the bpf fexit prog can be used here and
the bpf prog will have the sk, skb, and ret available (example in fexit_test.c).
Some tracepoints in this set can also be done with bpf fentry/fexit.
Does bpf fentry/fexit work for your use case?
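For readers unfamiliar with the suggestion, a minimal sketch of such a fexit
program follows, modelled on tools/testing/selftests/bpf/progs/fexit_test.c;
the program and section names here are illustrative, and a kernel built with
BTF (CONFIG_DEBUG_INFO_BTF) is assumed:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical fexit program attached to ip_queue_xmit(). */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fexit/ip_queue_xmit")
int BPF_PROG(ip_queue_xmit_exit, struct sock *sk, struct sk_buff *skb,
	     struct flowi *fl, int ret)
{
	/* The traced function's arguments and its return value arrive as
	 * BPF_PROG() parameters, so no in-tree tracepoint is required. */
	bpf_printk("ip_queue_xmit ret=%d", ret);
	return 0;
}

The attach point is the function symbol itself, resolved through BTF at load
time; that property is what the rest of the thread argues about.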
On Wed, Aug 11, 2021 at 2:23 PM Martin KaFai Lau <kafai@fb.com> wrote:
> Instead of adding tracepoints, the bpf fexit prog can be used here and
> the bpf prog will have the sk, skb, and ret available (example in fexit_test.c).
> Some tracepoints in this set can also be done with bpf fentry/fexit.
> Does bpf fentry/fexit work for your use case?

Well, a kprobe works too from this perspective. The problem with kprobes
or fexit is that there is no guarantee the function still exists in the
kernel from release to release; the kernel is free to delete or rename it.
With a tracepoint, even if ip_queue_xmit() were renamed, the same
tracepoint must remain in the kernel.

Thanks.
On Wed, Aug 11, 2021 at 03:48:36PM -0700, Cong Wang wrote:
> On Wed, Aug 11, 2021 at 2:23 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > Instead of adding tracepoints, the bpf fexit prog can be used here and
> > the bpf prog will have the sk, skb, and ret available (example in fexit_test.c).
> > Some tracepoints in this set can also be done with bpf fentry/fexit.
> > Does bpf fentry/fexit work for your use case?
>
> Well, a kprobe works too from this perspective. The problem with kprobes
> or fexit is that there is no guarantee the function still exists in the
> kernel from release to release; the kernel is free to delete or rename it.
> With a tracepoint, even if ip_queue_xmit() were renamed, the same
> tracepoint must remain in the kernel.

Some of the function names hardly ever change. Considering it is not always
cost free, per the other thread, this is not a strong enough reason to add
so many tracepoints while other options are available.
On Wed, Aug 11, 2021 at 4:08 PM Martin KaFai Lau <kafai@fb.com> wrote:
> Some of the function names hardly ever change.
This is obviously wrong for two reasons:
1. Kernel developers did change them. As a quick example,
tcp_retransmit_skb() has been changed, so we do have reasons to trace
only __tcp_retransmit_skb() instead.
2. Even if kernel developers never did, compilers can inline functions too.
For example, I see nothing to stop the compiler from inlining
tcp_transmit_skb(), which is static and only calls __tcp_transmit_skb().
You explicitly mark bpf_fentry_test1() as noinline, don't you?
I understand you are eager to promote eBPF, however, please stay
reasonable about the facts.
Thanks.
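For reference, the noinline marker Cong alludes to: the bpf_fentry_test*()
helpers are deliberately kept out-of-line so fentry/fexit programs have a
stable symbol to attach to. The pattern, as it appears (modulo details) in
net/bpf/test_run.c:

int noinline bpf_fentry_test1(int a)
{
	/* noinline keeps the symbol from being folded into callers, which
	 * is exactly the property ordinary static helpers are not
	 * guaranteed to have. */
	return a + 1;
}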
On Wed, Aug 11, 2021 at 05:37:35PM -0700, Cong Wang wrote:
> On Wed, Aug 11, 2021 at 4:08 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > Some of the function names hardly ever change.
>
> This is obviously wrong for two reasons:
>
> 1. Kernel developers did change them. As a quick example,
> tcp_retransmit_skb() has been changed, so we do have reasons to trace
> only __tcp_retransmit_skb() instead.

I did not say it has never changed. How often? I don't believe it changes
often enough.

> 2. Even if kernel developers never did, compilers can inline functions too.
> For example, I see nothing to stop the compiler from inlining
> tcp_transmit_skb(), which is static and only calls __tcp_transmit_skb().
> You explicitly mark bpf_fentry_test1() as noinline, don't you?

Yes, correct, inlining is a legitimate reason. Another one is tracking local
variables in the function stack. However, how many of the functions you need
here are actually inlined by the compiler, or need their local variables
tracked?

> I understand you are eager to promote eBPF, however, please stay
> reasonable about the facts.

Absolutely not. There is no need. There are enough bpf scripts doing useful
network tracing without adding so many tracepoints in the fast path. I am
only exploring other options and trying to understand why they do not work
in your case before adding all of these tracepoints to the fast path. It is
sad that you take it this way.
diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
index 008f821ebc50..553ae7276732 100644
--- a/include/trace/events/ip.h
+++ b/include/trace/events/ip.h
@@ -41,6 +41,48 @@
 	TP_STORE_V4MAPPED(__entry, saddr, daddr)
 #endif
 
+TRACE_EVENT(ip_queue_xmit,
+
+	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		__field(const void *, skbaddr)
+		__field(const void *, skaddr)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		__be32 *p32;
+
+		__entry->skbaddr = skb;
+		__entry->skaddr = sk;
+
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+	),
+
+	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
+		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
+);
+
 #endif /* _TRACE_IP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6b04a88466b2..dcf94059112e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/netlink.h>
 #include <linux/tcp.h>
+#include <trace/events/ip.h>
 
 static int
 ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -536,7 +537,14 @@ EXPORT_SYMBOL(__ip_queue_xmit);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 {
-	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+	int ret;
+
+	ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+	if (!ret)
+		trace_ip_queue_xmit(sk, skb);
+
+	return ret;
+
 }
 EXPORT_SYMBOL(ip_queue_xmit);