diff --git a/net/Kconfig b/net/Kconfig index 092a1c0902ac9205ad25831cf09be5064786be6e..e9600faa440432a05e071606c9813c97639ba2ea 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -422,6 +422,7 @@ source "net/ceph/Kconfig" source "net/nfc/Kconfig" source "net/psample/Kconfig" source "net/ife/Kconfig" +source "net/nettrace/Kconfig" config LWTUNNEL bool "Network light weight tunnels" diff --git a/net/Makefile b/net/Makefile index 45f3fbaae644e167fde0e2b63b935a21745f0fe7..0c8990098eb97484ab0feef3e4548807b3c8af2b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -80,3 +80,4 @@ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_MCTP) += mctp/ obj-$(CONFIG_NET_HANDSHAKE) += handshake/ +obj-$(CONFIG_NET_TRACE) += nettrace/ diff --git a/net/nettrace/Kconfig b/net/nettrace/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..99203c71729a3092cc6dc1a3e6a2fccbe57a1d19 --- /dev/null +++ b/net/nettrace/Kconfig @@ -0,0 +1,6 @@ +config NET_TRACE + tristate "Net trace" + depends on KPROBES + default n + help + Trace network packets as they pass through the kernel network protocol stack. diff --git a/net/nettrace/Makefile b/net/nettrace/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..a57c128fdba56eb8cf8e01ad1d78ec0944e66f1b --- /dev/null +++ b/net/nettrace/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for nettrace. +# +nettrace-objs := core.o kprobe.o parser.o dump.o help.o group.o handler.o procfs.o utils.o mm.o +obj-$(CONFIG_NET_TRACE) += nettrace.o + diff --git a/net/nettrace/Makefile.alone b/net/nettrace/Makefile.alone new file mode 100644 index 0000000000000000000000000000000000000000..5f9a0aae28bfefd5aa764ec01be0ce6769429cdf --- /dev/null +++ b/net/nettrace/Makefile.alone @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile. 
+#
+
+nettrace-objs := core.o kprobe.o parser.o dump.o help.o group.o handler.o procfs.o utils.o mm.o
+obj-m := nettrace.o
+
+all: clean build
+
+build:
+	$(MAKE) -C $(KERNEL_DIR) M=$(shell pwd) modules
+
+clean:
+	$(MAKE) -C $(KERNEL_DIR) M=$(shell pwd) clean
+
+.PHONY: all build clean
diff --git a/net/nettrace/core.c b/net/nettrace/core.c
new file mode 100644
index 0000000000000000000000000000000000000000..fdb909fc4f964b5b641133f8b334954a3d6073d9
--- /dev/null
+++ b/net/nettrace/core.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+#include "core.h"
+#include "help.h"
+#include "group.h"
+#include "procfs.h"
+
+enum nettrace_status nt_status = NT_INIT;
+
+#define MAX_DUMP_QUEUE_MEM_DEFAULT (SK_RMEM_MAX * 200)
+int max_dump_queue_mem = MAX_DUMP_QUEUE_MEM_DEFAULT;
+#define MAX_DUMP_SKB_CNT_DEFAULT (100 * 1000)
+unsigned int max_dump_skb_cnt = MAX_DUMP_SKB_CNT_DEFAULT;
+#define MAX_DUMP_FILE_SIZE_DEFAULT (100 * 1024 * 1024)
+unsigned int max_dump_file_size = MAX_DUMP_FILE_SIZE_DEFAULT;
+
+MODULE_DESCRIPTION("Network debugging tools \n \
+	insmod nettrace.ko \[probe=\\] \n \
+	\[output=\\] \[flag=\] \n \
+	\[dump=\\] \n \
+	\[trace=\\] \[proto=\] \n \
+	\[saddr=\] \[daddr=\] \[addr=\] \n \
+	\[sport=\] \[dport=\] \[port=\] \n \
+	\[stack=\<0 or 1\>\] \[ustack=\<0 or 1\>\] \[mm=\]");
+/*
+ * Module init: build the group tree (init_group), parse the module
+ * arguments (init_args), register the probes (trace_register) and call
+ * ntrace_proc_init().  Each failure path undoes what was set up before it.
+ */
+KPROBE_INIT {
+	int err;
+
+	init_group();
+	/* init_group() returns nothing; without a root group nothing can be registered. */
+	if (!all_group)
+		return -ENOMEM;
+
+	err = init_args();
+	if (err)
+		goto out_free_groups;
+
+	err = trace_register();
+	if (err)
+		goto out_free_rules;
+
+	err = ntrace_proc_init();
+	if (err)
+		goto out_free_rules;
+
+	WRITE_ONCE(nt_status, NT_RUNNING);
+	return 0;
+
+out_free_rules:
+	/* Same order as module exit: drop the probes first, then the rules. */
+	free_all_group();
+	free_rules();
+	return err;
+out_free_groups:
+	free_all_group();
+	return err;
+}
+
+/* Module exit: flag the shutdown, then release the probes, the rules and the procfs entries. */
+KPROBE_EXIT {
+	WRITE_ONCE(nt_status, NT_EXITING);
+	/* Now start to free all tracepoints */
+	free_all_group();
+	free_rules();
+	ntrace_proc_exit();
+}
diff --git a/net/nettrace/core.h b/net/nettrace/core.h
new file mode 100644
index 0000000000000000000000000000000000000000..286c68ce3b8656af3bde06c11f8da763c2fc40f7
--- /dev/null
+++ b/net/nettrace/core.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ * Author:
+ * Menglong Dong
+ * Migrator:
+ * xu xin
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef NETDUMP_CORE_H
+#define NETDUMP_CORE_H
+
+#include "kprobe.h"
+#include "parser.h"
+
+/* Internal status of nettrace */
+enum nettrace_status {
+	/* nettrace is initializing related tracepoints and its dump files */
+	NT_INIT,
+	/* nettrace is ready and tracing */
+	NT_RUNNING,
+	/* somebody is removing nettrace.ko (rmmod) */
+	NT_EXITING
+};
+
+extern enum nettrace_status nt_status;
+
+#endif //NETDUMP_CORE_H
diff --git a/net/nettrace/dump.c b/net/nettrace/dump.c
new file mode 100644
index 0000000000000000000000000000000000000000..1f6a9e9f34179731638a2899ee9c8edc65f56b92
--- /dev/null
+++ b/net/nettrace/dump.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+
+#include "dump.h"
+#include "utils.h"
+#include "group.h"
+#include "core.h"
+#include "mm.h"
+
+/* Number of packets not dumped into pcap because memory was insufficient. */
+unsigned int dump_loss_due_to_no_memory;
+
+/* Number of packets not dumped into pcap because of the max_dump_skb_cnt limit. */
+unsigned int dump_skb_over_cnt;
+
+/* Number of packets not dumped into pcap because of the max_dump_file_size limit. */
+unsigned int dump_skb_over_size;
+
+/*
+ * Append one captured packet to the pcap file @to: a record header stamped
+ * with the current wall-clock time, then skb->data and the data of every
+ * entry on skb->frag_list.  Always returns 0.
+ */
+static int dump_skb(struct sk_buff_nettrace *skb, struct file *to)
+{
+	struct pcap_pkthdr phdr;
+	struct timespec64 tv;
+	struct sk_buff *frag, *tmp;
+
+	ktime_get_real_ts64(&tv);
+	phdr.ts.tv_sec = tv.tv_sec;
+	phdr.ts.tv_usec = tv.tv_nsec / 1000;
+
+	phdr.len = skb->total_len;
+	phdr.caplen = skb->total_len;
+
+	file_append(to, &phdr, sizeof(phdr));
+	file_append(to, skb->data, skb->len);
+
+	if (!skb_queue_empty(&skb->frag_list)) {
+		skb_queue_walk_safe(&skb->frag_list, frag, tmp)
+			file_append(to, ((struct sk_buff_nettrace *)frag)->data,
+				    ((struct sk_buff_nettrace *)frag)->len);
+	}
+
+	return 0;
+}
+
+/* Write the pcap global file header to @f.  Always returns 0. */
+int init_pcap(struct file *f)
+{
+	struct pcap_file_header hdr;
+
+	hdr.magic = PCAP_MAGIC;
+	hdr.version_major = PCAP_VERSION_MAJOR;
+	hdr.version_minor = PCAP_VERSION_MINOR;
+	/* Record timestamps come from ktime_get_real_ts64(), i.e. UTC: no zone offset. */
+	hdr.thiszone = 0;
+	hdr.sigfigs = 0;
+	hdr.snaplen = DEFAULT_SNAPLEN;
+	hdr.linktype = LINKTYPE_ETHERNET;
+
+	file_append(f, &hdr, sizeof(hdr));
+
+	return 0;
+}
+
+/*
+ * Lock @dump_queue with local interrupts disabled.  The saved interrupt
+ * state is written through @flag so that dump_queue_unlock() restores the
+ * caller's state; a by-value parameter would lose it.
+ */
+static __always_inline void dump_queue_lock(struct sk_buff_head *dump_queue, unsigned long *flag)
+{
+	spin_lock_irqsave(&dump_queue->lock, *flag);
+}
+
+/* Unlock @dump_queue and restore the interrupt state saved in @flag. */
+static __always_inline void dump_queue_unlock(struct sk_buff_head *dump_queue, unsigned long *flag)
+{
+	spin_unlock_irqrestore(&dump_queue->lock, *flag);
+}
+
+/*
+ * Work handler: move everything queued on the trace point to a private
+ * list, then write each packet to the pcap file and release it.
+ */
+static void dump_skb_work(struct work_struct *work)
+{
+	TRACE_POINT *tp = container_of(work, TRACE_POINT, dump_work);
+	struct sk_buff_nettrace *skb;
+	struct sk_buff_head list;
+	unsigned long flag;
+
+	__skb_queue_head_init(&list);
+
+	dump_queue_lock(&tp->dump_queue, &flag);
+	skb_queue_splice_tail_init(&tp->dump_queue, &list);
+	dump_queue_unlock(&tp->dump_queue, &flag);
+
+	while ((skb = (struct sk_buff_nettrace *) __skb_dequeue(&list))) {
+		/* Don't waste time on writing dump_file when exit */
+		if (likely(READ_ONCE(nt_status) != NT_EXITING)) {
+			dump_skb(skb, tp->dump_file);
+			tp->dump_cnt++;
+		}
+
+		release_skb_nettrace(skb, tp);
+	}
+}
+
+/*
+ * Hand @skb to @tp's dump worker: the object returned by skb_copy_nettrace()
+ * is queued and dump_skb_work() is scheduled to write it out.  Packets over
+ * the count or file-size limit, or that cannot be copied, are only counted.
+ */
+void try_dump_skb(struct sk_buff *skb, TRACE_POINT *tp)
+{
+	struct sk_buff_nettrace *new_skb;
+	unsigned long flag;
+
+	/* if dump_skb_cnt exceeds the upper limit, drop it */
+	if (tp->dump_cnt > max_dump_skb_cnt) {
+		dump_skb_over_cnt++;
+		return;
+	}
+
+	/* if dump_file_size exceeds the upper limit, drop it */
+	if (tp->dump_file->f_pos + skb->truesize +
+	    sizeof(struct pcap_pkthdr) >= max_dump_file_size) {
+		dump_skb_over_size++;
+		return;
+	}
+
+	new_skb = skb_copy_nettrace(skb, tp);
+	if (!new_skb) {
+		dump_loss_due_to_no_memory++;
+		return;
+	}
+
+	dump_queue_lock(&tp->dump_queue, &flag);
+	__skb_queue_tail(&tp->dump_queue, (struct sk_buff *)new_skb);
+	dump_queue_unlock(&tp->dump_queue, &flag);
+
+	schedule_work(&tp->dump_work);
+}
+
+/* Prepare @tp's dump work item and its packet queue. */
+void init_dump_work(TRACE_POINT *tp)
+{
+	INIT_WORK(&tp->dump_work, dump_skb_work);
+	skb_queue_head_init(&tp->dump_queue);
+}
diff --git a/net/nettrace/dump.h b/net/nettrace/dump.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c74d4022f78bd98e176625850bb59bc53d79382
--- /dev/null
+++ b/net/nettrace/dump.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef NETDUMP_DUMP_H
+#define NETDUMP_DUMP_H
+
+#include
+#include "group.h"
+
+#define PCAP_MAGIC 0xa1b2c3d4	/* stored in pcap_file_header.magic by init_pcap() */
+#define PCAP_VERSION_MAJOR 2
+#define PCAP_VERSION_MINOR 4
+
+#define DEFAULT_SNAPLEN 0x40000	/* stored in pcap_file_header.snaplen by init_pcap() */
+
+#define LINKTYPE_NULL 0
+#define LINKTYPE_ETHERNET 1 /* also for 100Mb and up */
+#define LINKTYPE_EXP_ETHERNET 2 /* 3Mb experimental Ethernet */
+#define LINKTYPE_AX25 3
+#define LINKTYPE_PRONET 4
+#define LINKTYPE_CHAOS 5
+#define LINKTYPE_TOKEN_RING 6 /* DLT_IEEE802 is used for Token Ring */
+#define LINKTYPE_ARCNET 7
+#define LINKTYPE_SLIP 8
+#define LINKTYPE_PPP 9
+#define LINKTYPE_FDDI 10
+#define LINKTYPE_PPP_HDLC 50 /* PPP in HDLC-like framing */
+#define LINKTYPE_PPP_ETHER 51 /* NetBSD PPP-over-Ethernet */
+#define LINKTYPE_ATM_RFC1483 100 /* LLC/SNAP-encapsulated ATM */
+#define LINKTYPE_RAW 101 /* raw IP */
+#define LINKTYPE_SLIP_BSDOS 102 /* BSD/OS SLIP BPF header */
+#define LINKTYPE_PPP_BSDOS 103 /* BSD/OS PPP BPF header */
+#define LINKTYPE_C_HDLC 104 /* Cisco HDLC */
+#define LINKTYPE_IEEE802_11 105 /* IEEE 802.11 (wireless) */
+#define LINKTYPE_ATM_CLIP 106 /* Linux Classical IP over ATM */
+#define LINKTYPE_LOOP 108 /* OpenBSD loopback */
+#define LINKTYPE_LINUX_SLL 113 /* Linux cooked socket capture */
+#define LINKTYPE_LTALK 114 /* Apple LocalTalk hardware */
+#define LINKTYPE_ECONET 115 /* Acorn Econet */
+#define LINKTYPE_CISCO_IOS 118 /* For Cisco-internal use */
+#define LINKTYPE_PRISM_HEADER 119 /* 802.11+Prism II monitor mode */
+#define LINKTYPE_AIRONET_HEADER 120 /* FreeBSD Aironet driver stuff */
+
+struct pcap_file_header {	/* file header, written once per dump file by init_pcap() */
+	uint32_t magic;
+	uint16_t version_major;
+	uint16_t version_minor;
+	int32_t thiszone; /* gmt to local correction */
+	uint32_t sigfigs; /* accuracy of timestamps */
+	uint32_t snaplen; /* max length saved portion of each pkt */
+	uint32_t linktype; /* data link type (LINKTYPE_*) */
+};
+
+struct timeval_compat {	/* timestamp with two 32-bit fields, as used in pcap_pkthdr */
+	uint32_t tv_sec; /* seconds */
+	uint32_t tv_usec; /* microseconds */
+};
+
+struct pcap_pkthdr {	/* per-packet record header, written by dump_skb() in dump.c */
+	struct timeval_compat ts; /* time stamp using 32 bits fields */
+	uint32_t caplen; /* length of portion present */
+	uint32_t len; /* length this packet (off wire) */
+};
+
+extern unsigned int dump_loss_due_to_no_memory;	/* counter of packets lost to failed copies */
+extern unsigned int max_dump_skb_cnt, max_dump_file_size, dump_skb_over_cnt, dump_skb_over_size;
+
+extern void try_dump_skb(struct sk_buff *skb, TRACE_POINT *tp);
+
+extern int init_pcap(struct file *f);
+
+extern void init_dump_work(TRACE_POINT *tp);
+
+#endif //NETDUMP_DUMP_H
diff --git a/net/nettrace/group.c b/net/nettrace/group.c
new file mode 100644
index 0000000000000000000000000000000000000000..5cb1edc34111a28889cc1c13aa956fa8b14c3cf5
--- /dev/null
+++ b/net/nettrace/group.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+#include "group.h"
+#include "utils.h"
+#include "kprobe.h"
+#include "help.h"
+#include "handler.h"
+#include "dump.h"
+#include "mm.h"
+
+/*
+ * The kernel functions that we are interested in. They are grouped into the
+ * link layer (Ethernet II), the IP layer, the UDP layer, the TCP layer and
+ * some common functions, such as kfree_skb().
+ */
+struct trace_point all_tp[] = {
+
+#define SKB_TP(i, g, n) {.skb_index = i, .groups = g, .name = n}
+#define PSKB_TP(i, g, n) {.pskb_index = i, .groups = g, .name = n}
+#define SK_TP(i, g, n) {.sock_index = i, .groups = g, .name = n}
+#define SS_TP(skb, sk, g, n) \
+	{.skb_index = skb, .sock_index = sk, .groups = g, .name = n}
+
+	/* link layer trace points. */
+	PSKB_TP(1, "link_input", "__netif_receive_skb_core"),
+	SKB_TP(2, "link_input", "napi_gro_receive"),
+	SKB_TP(1, "link_input", "netif_receive_skb_internal"),
+	SKB_TP(1, "link_input", "__netif_receive_skb"),
+	SKB_TP(1, "link_input", "netif_rx"),
+	SKB_TP(1, "link_input", "enqueue_to_backlog"),
+	SKB_TP(1, "link_output", "__dev_queue_xmit"),
+	SKB_TP(1, "link_output", "dev_hard_start_xmit"),
+	SKB_TP(1, "link_output", "dev_queue_xmit_accel"),
+	SKB_TP(2, "link_output", "dev_forward_skb"),
+	SKB_TP(1, "link_output", "skb_do_redirect"),
+	//tc_classify is discarded in Linux 4.19 and later version.
+	//{.skb_index = 1, .groups = "link_input", .name = "tc_classify", .is_ret = true},
+
+	/* ip layer trace points. */
+	SKB_TP(1, "ip_input", "ip_rcv"),
+	SKB_TP(3, "ip_input", "ip_rcv_finish"),
+	SKB_TP(1, "ip_input", "ip_route_input_noref"),
+	{.skb_index = 1, .groups = "ip_input", .name = "fib_validate_source", .is_ret = true},
+	SKB_TP(2, "ip_input", "ip_rcv_finish_core"),
+	SKB_TP(1, "ip_input", "ip_local_deliver"),
+	SKB_TP(3, "ip_input", "ip_local_deliver_finish"),
+	SKB_TP(1, "ip_input", "ip_forward"),
+	SKB_TP(3, "ip_input", "ip_forward_finish"),
+	SKB_TP(2, "ip_input", "ip_send_skb"),
+	SS_TP(3, 2, "ip_output", "__ip_local_out"),
+	SKB_TP(3, "ip_output", "ip_output"),
+	SKB_TP(3, "ip_output", "ip_finish_output"),
+	SKB_TP(3, "ip_output", "ip_finish_output2"),
+
+	/* udp layer trace points. */
+	SKB_TP(1, "udp_input", "__udp4_lib_rcv"),
+	SKB_TP(2, "udp_input", "udp_queue_rcv_skb"),
+	SKB_TP(2, "udp_input", "__udp_enqueue_schedule_skb"),
+	SK_TP(1, "udp_input", "__skb_recv_udp"),
+	SK_TP(1, "udp_input", "udp_recvmsg"),
+
+	/* tcp layer trace points. */
+	SKB_TP(1, "tcp_input", "tcp_v4_rcv"),
+	SS_TP(2, 1, "tcp_input", "tcp_v4_do_rcv"),
+	SS_TP(2, 1, "tcp_input", "tcp_rcv_established"),
+	SS_TP(2, 1, "tcp_input", "tcp_rcv_state_process"),
+	SS_TP(2, 1, "tcp_input", "tcp_data_queue"),
+	SS_TP(2, 1, "tcp_input", "tcp_queue_rcv"),
+	SK_TP(1, "tcp_input", "tcp_recvmsg"),
+
+	SK_TP(1, "tcp_output", "tcp_sendmsg"),
+	SK_TP(1, "tcp_output", "tcp_push"),
+	SK_TP(1, "tcp_output", "tcp_write_xmit"),
+	SK_TP(1, "tcp_output", "tcp_set_state"),
+	SS_TP(2, 1, "tcp_output", "__tcp_transmit_skb"),
+
+	/* common skb trace points. */
+	SKB_TP(1, "error", "kfree_skb_reason"),
+	SKB_TP(1, "normal", "consume_skb"),
+
+	/* Compile when VNet MACVLAN=y */
+	/* macvlan drop stat. */
+	SKB_TP(1, "macvlan", "macvlan_start_xmit"),
+	SKB_TP(1, "macvlan", "macvlan_handle_frame"),
+
+#undef SKB_TP
+#undef PSKB_TP
+#undef SK_TP
+#undef SS_TP
+};
+const int all_tp_len = sizeof(all_tp) / sizeof(all_tp[0]);
+TRACE_GROUP *all_group;
+
+static TRACE_GROUP *query_group(char *name, TRACE_GROUP *tp);
+
+/****************************************************************************************************
+ *
+ * This is the part for trace point and group query.
+ *
+ ****************************************************************************************************/
+
+/* Depth-first search below @parent for the group whose child list holds @tg. */
+static TRACE_GROUP
+*parent_group(TRACE_GROUP *tg, TRACE_GROUP *parent)
+{
+	TRACE_GROUP *child, *found;
+
+	list_for_each_entry(child, &parent->groups, list) {
+		found = (child == tg) ? parent : parent_group(tg, child);
+		if (found)
+			return found;
+	}
+	return NULL;
+}
+
+static inline
+TRACE_GROUP *parent_group_all(TRACE_GROUP *tg)
+{
+	return parent_group(tg, all_group);
+}
+
+/* Find the all_tp[] entry called @name, or NULL. */
+static TRACE_POINT *query_tp(char *name)
+{
+	TRACE_POINT *tp;
+	int i;
+
+	for_each_tp(i, tp)
+		if (streq(name, tp->name))
+			return tp;
+	return NULL;
+}
+
+/* Handler of @tg or of its nearest ancestor that has one; the root's otherwise. */
+static void *query_handler(TRACE_GROUP *tg)
+{
+	for (; tg; tg = parent_group_all(tg))
+		if (tg->handler)
+			return tg->handler;
+	return all_group->handler;
+}
+
+/* Same lookup as query_handler(), for the return handler. */
+static void *query_ret_handler(TRACE_GROUP *tg)
+{
+	for (; tg; tg = parent_group_all(tg))
+		if (tg->ret_handler)
+			return tg->ret_handler;
+	return all_group->ret_handler;
+}
+
+/* Find the group called @name in the subtree rooted at @tp, or NULL. */
+static TRACE_GROUP
+*query_group(char *name, TRACE_GROUP *tp)
+{
+	TRACE_GROUP *child, *hit;
+
+	if (streq(tp->name, name))
+		return tp;
+	list_for_each_entry(child, &tp->groups, list) {
+		hit = query_group(name, child);
+		if (hit)
+			return hit;
+	}
+	return NULL;
+}
+
+/****************************************************************************************************
+ *
+ * This is the part for trace point register.
+ *
+ ****************************************************************************************************/
+
+/* Register the probe of @tp (kretprobe if tp->is_ret, else kprobe) using the
+ * handler looked up from @tg and, if print_dump is set, create its pcap file.
+ * Returns 0 on success or when @tp is already registered, -1 on failure.
+ */
+static int
+trace_point_register(TRACE_POINT *tp, TRACE_GROUP *tg)
+{
+	char path[MAX_FILE_NAME] = {};
+	struct kprobe *p;
+	struct file *dump_file;
+
+	if (tp->kprobe)
+		return 0;
+
+	if (tp->skb_index <= 0 && tp->sock_index <= 0 && tp->pskb_index <= 0) {
+		log_err("kprobe %s has no index!\n", tp->name);
+		goto out_err;
+	}
+
+	if (tp->is_ret) {
+		p = (struct kprobe *) kretprobe_declare(tp->name,
+				query_ret_handler(tg),
+				entry_handler_general);
+		if (!p)
+			goto out_err;
+		((struct kretprobe *)p)->data_size = sizeof(RET_DATA);
+	}
+	else
+		p = kprobe_declare(tp->name, query_handler(tg));
+
+	if (!p) {
+		log_err("kprobe declare failed: %s\n", tp->name);
+		goto out_err;
+	}
+
+	/* NOTE:
+	 * The given symbol name may be missing from kallsyms: the compiler can
+	 * rename e.g. "__netif_receive_skb_core" into
+	 * "__netif_receive_skb_core.constprop.0".  In that case we finally trace
+	 * __netif_receive_skb_core.constprop.0 and use it as tp->name instead of
+	 * __netif_receive_skb_core!
+	 */
+	if ((tp->is_ret && !c_register_kretprobe((struct kretprobe *) p)) ||
+	    (!tp->is_ret && !c_register_kprobe(p)))
+		tp->kprobe = p;
+	else {
+		if (strcmp(tp->name, "macvlan_start_xmit") == 0 ||
+		    strcmp(tp->name, "macvlan_handle_frame") == 0) {
+			log_err(" Please confirm if the macvlan module is inserted, no %s\n", tp->name);
+		} else {
+			log_err("kprobe register failed: %s\n", tp->name);
+		}
+		goto out_free_err;
+	}
+
+	if (print_dump) {
+		snprintf(path, sizeof(path), "%s/%s.pcap", print_dump, tp->name);
+		dump_file = file_create(path);
+		if (!dump_file) {
+			log_err("failed to create dump file: %s\n", path);
+			goto out_err;	/* NOTE(review): tp->kprobe stays registered here; free_group() releases it later */
+		}
+		init_pcap(dump_file);
+		init_dump_work(tp);
+		init_dump_mm(tp);
+		/* Write barrier: tp->dump_file is tested before try_dump_skb() is
+		 * called, so it must only be published after the init calls above.
+		 */
+		wmb();
+		tp->dump_file = dump_file;
+	}
+
+	return 0;
+out_free_err:
+	kfree(p);
+out_err:
+	return -1;
+}
+
+/* Register all trace points of @tg and, recursively, of its sub-groups; failures are not propagated. */
+static void trace_reg_group(TRACE_GROUP *tg)
+{
+	TRACE_GROUP *tmp_tg;
+	TP_LIST *tpl;
+
+	log_info("begin register group: %s\n", tg->name);
+
+	list_for_each_entry(tpl, &tg->traces, list) trace_point_register(tpl->tp, tg);
+
+	list_for_each_entry(tmp_tg, &tg->groups, list) trace_reg_group(tmp_tg);
+
+	log_info("end register group: %s\n", tg->name);
+}
+
+/* Register every group named in filter_trace[] and every trace point named in
+ * filter_probe[].  Returns 0, or -EINVAL when a name is unknown. */
+int trace_register(void)
+{
+	int t = 0;
+
+	for (; t < filter_trace_len; t++) {
+		char *ft = filter_trace[t];
+		TRACE_GROUP *tg = query_group(ft, all_group);
+		if (!tg) {
+			log_err("trace: %s not founded!\n", ft);
+			return -EINVAL;
+		}
+		trace_reg_group(tg);
+	}
+
+	for (t = 0; t < filter_probe_len; t++) {
+		char *name = filter_probe[t];
+		TRACE_POINT *tp = query_tp(name);
+		if (!tp) {
+			log_err("probe: %s not founded!\n", name);
+			return -EINVAL;
+		}
+		trace_point_register(tp, all_group);
+	}
+
+	return 0;
+}
+
+/****************************************************************************************************
+ *
+ * This is the part for trace group functions.
+ *
+ * The trace groups are organized as a tree.
+ *
+ ****************************************************************************************************/
+
+/*
+ * Add @tp to every group named in the comma-separated @groups string, which
+ * strsep() splits in place.  Unknown group names are logged and skipped.
+ */
+static int add2group(TRACE_POINT *tp, char *groups)
+{
+	char *group;
+	TP_LIST *tpl;
+	TRACE_GROUP *g;
+
+	while ((group = strsep(&groups, ",")) != NULL) {
+		g = query_group(group, all_group);
+		if (!g) {
+			log_err("group: %s not exits!", group);
+			continue;
+		}
+		tpl = kzalloc(sizeof(*tpl), GFP_KERNEL);
+		if (!tpl)
+			return -ENOMEM;
+
+		tpl->tp = tp;
+		INIT_LIST_HEAD(&tpl->list);
+		list_add_tail(&tpl->list, &g->traces);
+	}
+
+	return 0;
+}
+
+/* File each entry of all_tp[] under the groups named in its ->groups string. */
+static void init_tp(void)
+{
+	TRACE_POINT *tp;
+	int i = 0;
+
+	for_each_tp(i, tp) add2group(tp, tp->groups);
+}
+
+/*
+ * Allocate a trace group and link it under @parent; a NULL @parent creates
+ * the root, all_group.  Returns the new group, or NULL on failure.
+ */
+static TRACE_GROUP
+*add_trace_group(char *name, char *desc, enum trace_group_level lev,
+		 TRACE_GROUP *parent)
+{
+	TRACE_GROUP *tmp_tp;
+
+	/* Only the first parentless group may become the root. */
+	if (!parent && all_group)
+		return NULL;
+
+	tmp_tp = kzalloc(sizeof(*tmp_tp), GFP_KERNEL);
+	if (!tmp_tp)
+		return NULL;
+
+	INIT_LIST_HEAD(&tmp_tp->list);
+	INIT_LIST_HEAD(&tmp_tp->groups);
+	INIT_LIST_HEAD(&tmp_tp->traces);
+
+	/* The zeroed buffers stay NUL-terminated: at most size - 1 bytes are copied. */
+	strncpy(tmp_tp->name, name, MAX_TP_NAME - 1);
+	strncpy(tmp_tp->desc, desc, MAX_TP_DESC - 1);
+	tmp_tp->level = lev;
+
+	if (parent != NULL)
+		list_add_tail(&tmp_tp->list, &parent->groups);
+	else
+		all_group = tmp_tp;
+
+	return tmp_tp;
+}
+
+#define ADD_TRACE_GROUP(name, lev, parent, desc) \
+	(name = add_trace_group(#name, desc, lev, parent))
+#define ADD_ANNOY_TRACE_GROUP(name, lev, parent, desc) \
+	add_trace_group(#name, desc, lev, parent)
+
+/* Build the trace group tree and then populate it through init_tp(). */
+void init_group(void)
+{
+	/* define all trace groups. */
+	TRACE_GROUP *all, *link, *ip, *tcp, *udp, *error, *macvlan;
+
+	ADD_TRACE_GROUP(all, BASIC, NULL, "the root trace");
+	ADD_TRACE_GROUP(link, BASIC, all, "the link layer.");
+	ADD_TRACE_GROUP(ip, BASIC, all, "the ip layer");
+	ADD_TRACE_GROUP(tcp, BASIC, all, "the tcp layer");
+	ADD_TRACE_GROUP(udp, BASIC, all, "the udp layer");
+	ADD_TRACE_GROUP(error, BASIC, all, "the scene that free error package.");
+	ADD_TRACE_GROUP(macvlan, BASIC, all, "macvlan receive and send skb.");
+
+	/* The root allocation may have failed: nothing to attach handlers to. */
+	if (!all)
+		return;
+	all->handler = post_handler_general;
+	all->ret_handler = ret_handler_general;
+
+	/* general network protocol stack. */
+	ADD_ANNOY_TRACE_GROUP(link_input, BASIC, link, "the link layer that receive package");
+	ADD_ANNOY_TRACE_GROUP(link_output, BASIC, link, "the link layer that send package");
+
+	ADD_ANNOY_TRACE_GROUP(ip_input, BASIC, ip, "the ip layer that receive package");
+	ADD_ANNOY_TRACE_GROUP(ip_output, BASIC, ip, "the ip layer that send package");
+
+	ADD_ANNOY_TRACE_GROUP(tcp_input, BASIC, tcp, "the tcp layer that receive package");
+	ADD_ANNOY_TRACE_GROUP(tcp_output, BASIC, tcp, "the tcp layer that send package");
+
+	ADD_ANNOY_TRACE_GROUP(udp_input, BASIC, udp, "the udp layer that receive package");
+
+	ADD_ANNOY_TRACE_GROUP(normal, BASIC, all, "the scene that free normal package.");
+
+	init_tp();
+}
+
+/* Recursively release @tg: its sub-groups, its trace point list, then @tg itself. */
+static void free_group(TRACE_GROUP *tg)
+{
+	TP_LIST *tp_pos, *tp_next;
+	TRACE_GROUP *pos, *next;
+	TRACE_POINT *tp;
+
+	list_for_each_entry_safe(pos, next, &tg->groups, list) free_group(pos);
+
+	list_for_each_entry_safe(tp_pos, tp_next, &tg->traces, list) {
+		tp = tp_pos->tp;
+		kfree(tp_pos);
+
+		if (tp->kprobe) {
+			if (tp->is_ret)
+				c_unregister_kretprobe((struct kretprobe *) tp->kprobe);
+			else
+				c_unregister_kprobe(tp->kprobe);
+			kfree(tp->kprobe);
+			tp->kprobe = NULL;
+		}
+
+		/* Dump skb work must be done before tp is removed */
+		if (print_dump && tp->dump_work.func)
+			flush_work(&tp->dump_work);
+
+		if (tp->dump_file) {
+			file_close(tp->dump_file);
+			tp->dump_file = NULL;
+			release_dump_mm(tp);
+		}
+	}
+	kfree(tg);
+}
+
+/* Free the whole tree; a no-op when there is none, and all_group ends up NULL. */
+void free_all_group(void)
+{
+	if (!all_group)
+		return;
+
+	free_group(all_group);
+	all_group = NULL;
+}
diff --git a/net/nettrace/group.h b/net/nettrace/group.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cc54f1fd394da2c30c1d7dbd7f56871f205af21
--- /dev/null
+++ b/net/nettrace/group.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef NETDUMP_GROUP_H
+#define NETDUMP_GROUP_H
+
+#include
+#include
+#include "mm.h"
+
+#define MAX_TP_NAME KSYM_NAME_LEN
+/* The length of group name */
+#define MAX_TP_GROUP 128
+/* The length of TP description */
+#define MAX_TP_DESC 256
+
+#define for_each_tp(i, p) \
+	for (i = 0; i < all_tp_len && ({ p = &all_tp[i]; 1;}); i++)	/* walks all_tp[], p = &all_tp[i] */
+
+/* Definition of a kprobe point that we predefine (one all_tp[] entry) */
+typedef struct trace_point {
+	/* The traced function name */
+	char name[MAX_TP_NAME];
+	/* To define the position index of struct sk_buff *skb in a certain
+	 * trace_point function
+	 */
+	int skb_index;
+	/* To define the position index of struct sk_buff **pskb in a certain
+	 * trace_point function
+	 */
+	int pskb_index;
+
+	int sock_index;	/* presumably the position index of struct sock *sk - TODO confirm */
+	/* Count the number of skbs dumped for the current TP. */
+	unsigned int dump_cnt;
+	char desc[MAX_TP_DESC];
+	char groups[MAX_TP_GROUP];	/* comma-separated group names, split by add2group() */
+	bool is_ret;	/* true: registered as a kretprobe instead of a kprobe */
+	struct kprobe *kprobe;	/* the registered probe; NULL while not registered */
+	struct file *dump_file;	/* pcap file of this TP; set only when print_dump is given */
+	struct work_struct dump_work;	/* runs dump_skb_work() in dump.c */
+	struct sk_buff_head dump_queue;	/* packets queued by try_dump_skb() for dump_work */
+	struct llist_head skbcache;
+	int nr_skb_objs;
+	struct llist_head datacache;
+	int nr_data_objs;
+} TRACE_POINT;
+
+typedef struct tp_list {	/* links one trace point into a group's 'traces' list */
+	TRACE_POINT *tp;
+	struct list_head list;
+} TP_LIST;
+
+enum trace_group_level {
+	BASIC,
+	MOD
+};
+
+struct kretprobe_instance;
+
+typedef struct ret_data {	/* sizeof() of this is used as the kretprobe data_size */
+	struct sk_buff *skb;
+	struct sock *sk;
+} RET_DATA;
+
+typedef struct trace_group {	/* one node of the group tree rooted at all_group */
+	char name[MAX_TP_NAME];
+	char desc[MAX_TP_DESC];
+	enum trace_group_level level;
+	struct list_head list;	/* entry in the parent's 'groups' list */
+	struct list_head groups;	/* child groups */
+	struct list_head traces;	/* TP_LIST entries of this group */
+	bool activated;
+
+	void (*handler)(struct kprobe *p,
+			struct pt_regs *regs,
+			unsigned long flags);
+	int (*ret_handler)(struct kretprobe_instance *ri,
+			   struct pt_regs *regs);
+} TRACE_GROUP;
+
+extern const int all_tp_len;
+extern struct trace_point all_tp[];
+
+extern TRACE_GROUP *all_group;
+
+extern void init_group(void);
+
+extern int trace_register(void);
+
+extern void free_all_group(void);
+
+#endif
//NETDUMP_GROUP_H diff --git a/net/nettrace/handler.c b/net/nettrace/handler.c new file mode 100644 index 0000000000000000000000000000000000000000..5a4d87d76967eb1d8b1979b3e7941dcf64ab2750 --- /dev/null +++ b/net/nettrace/handler.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "handler.h" +#include "kprobe.h" +#include "group.h" +#include "parser.h" +#include "help.h" +#include "dump.h" +#include "utils.h" +#include "core.h" + +/**************************************************************************************************** + * + * This is the part for filter, by skb or sock + * + ****************************************************************************************************/ + +static int filter_skb(struct sk_buff *skb, struct sock *sk, TRACE_POINT *tp, SKB_OUTPUT *skb_output); + +static int filter_sock(struct sock *sk, TRACE_POINT *tp, SKB_OUTPUT *output); + +/* + * This is the general skb info print function. + * + * The info in 'output' will be printed by 'log_data' log level. 
+ */ +static void general_print(SKB_OUTPUT *output, TRACE_POINT *tp) +{ + char saddr[IP_ADDR_LEN] = {}, daddr[IP_ADDR_LEN] = {}, + output_str[MAX_OUTPUT_LEN] = {}; + COMMON_RULE *rule = &output->rule; + struct icmphdr *icmp; + struct sk_buff *skb = output->skb; + struct net_device *dev = skb ? skb->dev : NULL; + char *type = ""; + + str_append(output_str, "[%d]", dev ? dev->ifindex : 0); + if (tp->is_ret) + str_append(output_str, "[%s,ret:%d]:", output->sym_name, output->ret_val); + else + str_append(output_str, "[%s]:", output->sym_name); + if (rule->proto_3 != ETH_P_IP) { + str_append(output_str, "proto: %s", proto3tostr(rule->proto_3)); + goto begin_print; + } + + if (i2ip(rule->saddr, saddr) || i2ip(rule->daddr, daddr)) { + log_err("parse ip addr error!"); + return; + } + str_append(output_str, "IP %s>%s", saddr, daddr); + + switch (rule->proto_4) { + case IPPROTO_TCP: + str_append(output_str, + " // TCP %d>%d,%s", + ntohs(rule->sport), ntohs(rule->dport), output->flags); + break; + case IPPROTO_UDP: + str_append(output_str, + " // UDP %d>%d", + ntohs(rule->sport), ntohs(rule->dport)); + break; + case IPPROTO_ICMP: + if (!output->skb) { + str_append(output_str, " // ICMP"); + break; + } + + if (!skb_transport_header_was_set(skb)) + icmp = (struct icmphdr *) (skb_network_header(skb) + sizeof(struct iphdr)); + else + icmp = icmp_hdr(skb); + + if (icmp->code == 0 && icmp->type == 8) + type = "request"; + if (icmp->code == 0 && icmp->type == 0) + type = "response"; + + str_append(output_str, + " // ICMP %s %u", + type, + ntohs(icmp->un.echo.sequence)); + break; + default: + str_append(output_str, + " // %s", + proto4tostr(rule->proto_4)); + break; + } + +begin_print: + log_data("%s\n", output_str); + +#if defined(CONFIG_BACKTRACE_USRSTACK_ARM64) || defined(CONFIG_BACKTRACE_USRSTACK_X86_64) + if (print_ustack) + backtrace_usrstack(); +#endif + + if (output->skb && tp->dump_file) + try_dump_skb(output->skb, tp); + + if (print_stack) + dump_stack(); +} + +/* + * Print 
the network information by print the skb. + * + * Note that this is most about the kernel function that receive skb, + * as the package header in skb is not completed in skb send function. + */ +static int filter_skb(struct sk_buff *skb, struct sock *sk, + TRACE_POINT *tp, SKB_OUTPUT *skb_output) +{ + int proto_3, proto_4, sport, dport; + struct ethhdr *eth; + struct tcphdr *tcp; + struct udphdr *udp; + struct iphdr *ip; + + COMMON_RULE *rule = &skb_output->rule; + skb_output->sym_name = tp->name; + rule->s_mask = 0xffffffff; + rule->d_mask = 0xffffffff; + eth = eth_hdr(skb); + if (!sk) + sk = skb->sk; + + if (!skb_mac_header_was_set(skb)) { + proto_3 = ntohs(skb->protocol); + if (proto_3) + goto parse_network; + + if (!sk) + goto error; + + if (sk->sk_family == PF_INET && skb->network_header) { + proto_3 = ETH_P_IP; + goto parse_network; + } + + return filter_sock(sk, tp, skb_output); + } + skb_output->skb = skb; + proto_3 = ntohs(eth->h_proto); + +parse_network: + SET_RULE_FLAGS(rule, proto_3, proto_3); + if (proto_3 != ETH_P_IP) + goto do_match; + + ip = ip_hdr(skb); + if (likely((u8 *)ip >= skb->head && + (u8 *)ip + sizeof(struct iphdr) <= skb_tail_pointer(skb))) { + proto_4 = ip->protocol; + SET_RULE_FLAGS(rule, proto_4, proto_4); + SET_RULE_FLAGS(rule, saddr, ip->saddr); + SET_RULE_FLAGS(rule, daddr, ip->daddr); + } else { + proto_4 = 0; + } + + switch (proto_4) { + case IPPROTO_TCP: + tcp = tcp_hdr(skb); + if (likely((u8 *)tcp >= skb->head && + (u8 *)tcp + sizeof(struct tcphdr) <= skb_tail_pointer(skb))) { + sport = tcp->source; + dport = tcp->dest; + flag2str(tcp, skb_output->flags); + } else { + sport = 0; + dport = 0; + } + goto flag_port; + + case IPPROTO_UDP: + udp = udp_hdr(skb); + if (likely((u8 *)udp >= skb->head && + (u8 *)udp + sizeof(struct udphdr) <= skb_tail_pointer(skb))) { + sport = udp->source; + dport = udp->dest; + } else { + sport = 0; + dport = 0; + } + goto flag_port; + default: + break; + } + +do_match: + if (match_all_rule(rule)) + 
return 0; + return -1; + +flag_port: + SET_RULE_FLAGS(rule, sport, sport); + SET_RULE_FLAGS(rule, dport, dport); + goto do_match; + +error: + return -1; +} + +/* + * Print the network information by print the sock. + * + * Note this is most about the process that send skb, during which + * skb headers is not ready and we can not get information from it, + * such ip addr or tcp port. + */ +static int filter_sock(struct sock *sk, TRACE_POINT *tp, SKB_OUTPUT *output) +{ + const struct inet_sock *inet; + int sport = 0, dport = 0; + + COMMON_RULE *rule = &output->rule; + rule->s_mask = 0xffffffff; + rule->d_mask = 0xffffffff; + output->sym_name = tp->name; + + SET_RULE_FLAGS(rule, proto_4, sk->sk_protocol); + + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + goto do_filter; + + inet = inet_sk(sk); + + SET_RULE_FLAGS(rule, proto_3, ETH_P_IP); + SET_RULE_FLAGS(rule, saddr, inet->inet_saddr); + SET_RULE_FLAGS(rule, daddr, inet->inet_daddr); + + sport = inet->inet_sport; + dport = inet->inet_dport; + + SET_RULE_FLAGS(rule, sport, sport); + SET_RULE_FLAGS(rule, dport, dport); + +do_filter: + if (match_all_rule(rule)) + return 0; + return -1; +} + +/**************************************************************************************************** + * + * This is the part for all kind of handlers. 
+ * + ****************************************************************************************************/ + +/*the function that handle skb and sock.*/ +void __post_handler_general(struct kprobe *p, struct pt_regs *regs, unsigned long flags) +{ + + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + SKB_OUTPUT skb_output = {}; + TRACE_POINT *tp = (TRACE_POINT *) p->symbol_name; + + if (tp->sock_index > 0) { + sk = (struct sock *) kprobe_parm(regs, tp->sock_index); + } + + if (tp->skb_index > 0) { + skb = (struct sk_buff *) kprobe_parm(regs, tp->skb_index); + goto do_print_skb; + } + + if (tp->pskb_index > 0) { + skb = *((struct sk_buff **) kprobe_parm(regs, tp->pskb_index)); + goto do_print_skb; + } + + if (sk && !filter_sock(sk, tp, &skb_output)) + general_print(&skb_output, tp); + return; + +do_print_skb: + if (skb && !filter_skb(skb, sk, tp, &skb_output)) + general_print(&skb_output, tp); +} + +void post_handler_general(struct kprobe *p, struct pt_regs *regs, unsigned long flags) +{ + /* If being removed, nettrace should stop tracing as soon as possible */ + if (unlikely(READ_ONCE(nt_status) == NT_EXITING)) + return; + + /* If insmoding nettrace is not completed, don't start to parse */ + if (unlikely(READ_ONCE(nt_status) == NT_INIT)) + return; + + __post_handler_general(p, regs, flags); +} + +int entry_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + TRACE_POINT *tp = NULL; + RET_DATA *data = (RET_DATA *) ri->data; + +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct kretprobe *rp = get_kretprobe(ri); + if (unlikely(!rp)) + return 1; + tp = (TRACE_POINT *) rp->kp.symbol_name; +#else + tp = (TRACE_POINT *) ri->rph->rp->kp.symbol_name; +#endif + if (tp->sock_index > 0) { + sk = (struct sock *) kprobe_parm(regs, tp->sock_index); + } + + if (tp->skb_index > 0) { + skb = (struct sk_buff *) kprobe_parm(regs, tp->skb_index); + } + + if (tp->pskb_index > 0) { + skb = *((struct sk_buff **) 
kprobe_parm(regs, tp->pskb_index)); + } + + data->sk = sk; + data->skb = skb; + + if (sk == NULL && skb == NULL) + return 1; + return 0; +} + +int __ret_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + SKB_OUTPUT skb_output = {}; + TRACE_POINT *tp = NULL; + RET_DATA *data = (RET_DATA *) ri->data; + +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct kretprobe *rp = get_kretprobe(ri); + if (unlikely(!rp)) + return 1; + tp = (TRACE_POINT *) rp->kp.symbol_name; +#else + tp = (TRACE_POINT *) ri->rph->rp->kp.symbol_name; +#endif + sk = data->sk; + skb_output.ret_val = KPROBE_RET_PARM; + + if ((skb = data->skb) != NULL) + goto do_print_skb; + + if (sk && !filter_sock(sk, tp, &skb_output)) + general_print(&skb_output, tp); + return 0; + +do_print_skb: + if (skb && !filter_skb(skb, sk, tp, &skb_output)) + general_print(&skb_output, tp); + return 0; +} + +int ret_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + /* If being removed, nettrace should stop tracing as soon as possible */ + if (unlikely(READ_ONCE(nt_status) == NT_EXITING)) + return 0; + + /* If insmoding nettrace is not completed, don't start to parse */ + if (unlikely(READ_ONCE(nt_status) == NT_INIT)) + return 0; + + return __ret_handler_general(ri, regs); +} + +/*the function that handle skb and sock.*/ +void post_handler_udp_tracer(struct kprobe *p, struct pt_regs *regs, unsigned long flags) +{ + + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + SKB_OUTPUT skb_output = {}; + + TRACE_POINT *tp = (TRACE_POINT *) p->symbol_name; + + if (tp->sock_index > 0) { + sk = (struct sock *) kprobe_parm(regs, tp->sock_index); + } + + if (tp->skb_index > 0) { + skb = (struct sk_buff *) kprobe_parm(regs, tp->skb_index); + goto do_print_skb; + } + + if (tp->pskb_index > 0) { + skb = *((struct sk_buff **) kprobe_parm(regs, tp->pskb_index)); + goto do_print_skb; + } + + if (sk && !filter_sock(sk, tp, &skb_output)) + goto 
do_print; + return; + +do_print_skb: + if (skb && !filter_skb(skb, sk, tp, &skb_output)) + goto do_print; + return; + +do_print: + general_print(&skb_output, tp); + + if (!print_stack && streq(tp->name, "kfree_skb_reason")) + dump_stack(); +} + diff --git a/net/nettrace/handler.h b/net/nettrace/handler.h new file mode 100644 index 0000000000000000000000000000000000000000..fea3f390a1df4277c338713e1533902bac71aafc --- /dev/null +++ b/net/nettrace/handler.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef NETDUMP_HANDLER_H +#define NETDUMP_HANDLER_H + +#include "group.h" +#include "parser.h" + +#define MAX_OUTPUT_LEN 300 + +#if defined(CONFIG_BACKTRACE_USRSTACK_ARM64) || defined(CONFIG_BACKTRACE_USRSTACK_X86_64) +extern void backtrace_usrstack(void); +#endif + +struct kretprobe_instance; + +typedef struct skb_output { + COMMON_RULE rule; + struct sk_buff *skb; + char *sym_name; + char flags[TCP_FLAG_LEN]; + int ret_val; +} SKB_OUTPUT; + +extern TRACE_GROUP *all_group; + +extern void +post_handler_general(struct kprobe *p, struct pt_regs *regs, unsigned long flags); + +extern void +post_handler_udp_tracer(struct kprobe *p, struct pt_regs *regs, unsigned long flags); + +extern int +ret_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs); + +extern int +entry_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs); + +#endif //NETDUMP_HANDLER_H diff --git a/net/nettrace/help.c b/net/nettrace/help.c new file mode 100644 index 0000000000000000000000000000000000000000..e029d035a5d9c8786306d16278b454d48a869f52 --- /dev/null +++ b/net/nettrace/help.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include "help.h" + +#include "utils.h" +#include "kprobe.h" +#include "parser.h" +#include "group.h" + +/*param for package filter*/ +MODULE_PARM_DESC(saddr, "filter source ip address,e.g. insmod nettrace.ko saddr=172.16.6.62"); +PARAM_STRING_NAMED(filter_saddr, saddr, NULL) +MODULE_PARM_DESC(daddr, "filter destination ip address,e.g. insmod nettrace.ko daddr=172.16.6.74"); +PARAM_STRING_NAMED(filter_daddr, daddr, NULL) +MODULE_PARM_DESC(addr, "filter source or destination ip address,e.g. insmod nettrace.ko addr=192.168.2.11"); +PARAM_STRING_NAMED(filter_addr, addr, NULL) +MODULE_PARM_DESC(proto, "filter 3 layer or 4 layer net protocol,e.g. insmod nettrace.ko proto=arp"); +PARAM_STRING_NAMED(filter_proto, proto, NULL) +MODULE_PARM_DESC(port, "filter source port or destination port, e.g. insmod nettrace.ko port=1234"); +PARAM_INT_NAMED(filter_port, port, -1) +MODULE_PARM_DESC(sport, "filter source port,e.g. insmod nettrace.ko port=1234"); +PARAM_INT_NAMED(filter_sport, sport, -1) +MODULE_PARM_DESC(dport, "filter destination port,e.g. insmod nettrace.ko port=1234"); +PARAM_INT_NAMED(filter_dport, dport, -1) + +MODULE_PARM_DESC(trace, "a trace is a serial of kernel tracing for special scene,We support various of trace, such as ip,upd,tcp,macvlan,link,link_input,link_output\n \ + e.g. insmod nettrace.ko trace=link"); +PARAM_STRING_ARRAY(filter_trace, trace, 10) +MODULE_PARM_DESC(probe, "this is a list of kernel function where you want to dump network package info. Use 'probe=?' to see all supported kernel functions"); +PARAM_STRING_ARRAY(filter_probe, probe, 10) + +MODULE_PARM_DESC(stack, "print the kernel function call stack,e.g. insmod nettrace.ko stack=1"); +PARAM_INT_NAMED(print_stack, stack, 0) +MODULE_PARM_DESC(ustack, "print the user space call stack,e.g. 
insmod nettrace.ko ustack=1"); +PARAM_INT_NAMED(print_ustack, ustack, 0) + +MODULE_PARM_DESC(output, "three kind of output supported: ftrace, kernel and file,When comes up with file, it should be a file path, such as /ntrace.log\n \ + e.g. insmod nettrace.ko output=ftrace"); +PARAM_STRING_NAMED(print_output, output, NULL) +MODULE_PARM_DESC(dump, "the directory where you want to put pcap file in. Once thisoption is set, all package filtered will be saved\n \ + e.g. insmod nettrace.ko trace=error dump=./output"); +PARAM_STRING_NAMED(print_dump, dump, NULL) + +MODULE_PARM_DESC(flag, "addition flags supported values v: print addition info,e.g. insmod nettrace.ko flag=v"); +PARAM_STRING_NAMED(param_flag, flag, NULL) +MODULE_PARM_DESC(mm, "the number of pages reserved for each TP during the initialization phase"); +PARAM_INT_NAMED(param_mm, mm, 100) + +output_type op_type; +int if_print_info = 0; + +typedef struct { + struct list_head list; + char *msg; +} PRINT_ENTRY; + +static struct work_struct print_work; +static LIST_HEAD(print_list); +static spinlock_t print_lock; + +static struct file *output_file = NULL; +static char output_index[64]; + +static void print_process(struct work_struct *work) +{ + PRINT_ENTRY *file, *next; + unsigned long flag; + LIST_HEAD(head); + + spin_lock_irqsave(&print_lock, flag); + list_splice_init(&print_list, &head); + spin_unlock_irqrestore(&print_lock, flag); + + list_for_each_entry_safe(file, next, &head, list) { + file_append(output_file, file->msg, (unsigned int) strlen(file->msg)); + + kfree(file->msg); + kfree(file); + } +} + +int init_output_file(char *path) +{ + output_file = file_create(path); + + if (output_file == NULL) + return -1; + + INIT_WORK(&print_work, print_process); + spin_lock_init(&print_lock); + + return 0; +} + +static void print_enqueue(char *fmt, va_list ap) +{ + char buf[MAX_LOG_BUF]; + char *print_buf; + PRINT_ENTRY *print_file; + unsigned long flag; + + vsnprintf(buf, sizeof(buf), fmt, ap); + print_buf = 
kcalloc(1, strlen(buf) + 1, GFP_KERNEL); + if (!print_buf) + return; + + strncpy(print_buf, buf, strlen(buf)); + + print_file = kcalloc(1, sizeof(PRINT_ENTRY), GFP_KERNEL); + if (!print_file) { + kfree(print_buf); + return; + } + + print_file->msg = print_buf; + + spin_lock_irqsave(&print_lock, flag); + list_add_tail(&print_file->list, &print_list); + spin_unlock_irqrestore(&print_lock, flag); + + schedule_work(&print_work); +} + +void log_base(char *fmt, ...) +{ + + va_list argptr; + va_start(argptr, fmt); + + switch (op_type) { + default: + vprintk(fmt, argptr); + break; + case OUTPUT_FTRACE: + ftrace_vprintk(fmt, argptr); + break; + case OUTPUT_FILE: + print_enqueue(fmt, argptr); + break; + } + + va_end(argptr); +} + +static void print_help(void) +{ + log_data("============================Netdump========================\n"); + log_data("\n"); + log_data("Welcome to use netdump! This is a tool based on kprobe for\n"); + log_data("network bag grab in kernel.\n"); + log_data("\n"); + log_data("Basic usage:\n"); + log_data("\n"); + log_data(" insmod netdump.ko [probe=] \n"); + log_data(" [output=] [flag=]\n"); + log_data(" [dump=]\n"); + log_data(" [trace=] [proto=]\n"); + log_data(" [saddr=] [daddr=] [addr=] \n"); + log_data(" [sport=] [dport=] [port=] \n"); + log_data(" [stack=<0 or 1>] [ustack=<0 or 1>]\n"); + log_data("\n"); + log_data("trace: a trace is a serial of kernel tracing for special scene.\n"); + log_data(" We support various of trace, such ip, tcp, macvlan, etc.\n"); + log_data(" Trace can have children trace, use 'trace=?' to see all supported\n"); + log_data(" trace.\n"); + log_data("\n"); + log_data("probe: this is a list of kernel function where you want\n"); + log_data(" to dump network package info. Use 'probe=?' to see all\n"); + log_data(" supported kernel functions.\n"); + log_data("\n"); + log_data("dump: the directory where you want to put pcap file in. 
Once this\n"); + log_data(" option is set, all package filtered will be saved.\n"); + log_data("\n"); + log_data("output: three kind of output supported: ftrace, kernel and file.\n"); + log_data(" When comes up with file, it should be a file path, such as /ntrace.log.\n"); + log_data("\n"); + log_data("flag: addition flags. Supported values:\n"); + log_data(" v: print addition info.\n"); + log_data("\n"); + log_data("mm: the number of pages reserved for each TP during the initialization phase,\n"); + log_data(" which is used for temporarily caching messages.\n"); + log_data(" [mm=100] means that each TP initialization will reserve 2 * 100 pages,\n"); + log_data(" 100 for storing SKBs, and 100 for storing data."); + log_data("\n"); + + log_data("stack: print the kernel function call stack.\n"); + log_data("ustack: print the user spack call stack.\n"); + log_data("\n"); + log_data("-----------------------package filter-------------------\n"); + log_data("saddr: source ip addr\n"); + log_data("daddr: dest ip addr\n"); + log_data("addr: source or dest ip addr\n"); + log_data("\n"); + log_data("sport: source udp or tcp port\n"); + log_data("dport: dest udp or tcp port\n"); + log_data("port: source or dest udp or tcp port\n"); + log_data("proto: the network protocol\n"); + + log_data("\n"); + log_data("\n"); + log_data("============================Netdump========================\n"); +} + +static void print_group(TRACE_GROUP *tg) +{ + TRACE_GROUP *tmp_tg; + char tab[] = " "; + ulong cur_len = strlen(output_index); + + log_data("%s%s: %s\n", output_index, tg->name, tg->desc); + if (cur_len + sizeof(tab) > sizeof(output_index)) + return; + strncpy(output_index + cur_len, tab, strlen(tab)); + list_for_each_entry(tmp_tg, &tg->groups, list) { + print_group(tmp_tg); + } + output_index[cur_len] = '\0'; +} + +static void print_trace(void) +{ + memset(output_index, 0, sizeof(output_index)); + print_group(all_group); +} + +static void print_probe(void) +{ + TRACE_POINT *tp; + int 
i = 0; + + log_data("all supported kprobe:\n\n"); + for_each_tp(i, tp) { + log_data("\t%s\n", tp->name); + } +} + +static int init_rule(void) +{ + COMMON_RULE *rule; + u32 addr_i = 0; + int proto = 0; + + rule = kcalloc(1, sizeof(COMMON_RULE), GFP_KERNEL); + if (!rule) + return -ENOMEM; + + INIT_LIST_HEAD(&rule->list); + rule->s_mask = rule->d_mask = 0xffffffff; + + if (filter_addr) { + if (ip2i(filter_addr, &addr_i)) { + log_err("ip addr format error!\n"); + goto error; + } + addr_i = htonl(addr_i); + rule->saddr = addr_i; + rule->__flags |= FLAG_addr; + } + + if (filter_saddr) { + if (ip2i(filter_saddr, &addr_i)) { + log_err("ip addr format error!\n"); + goto error; + } + addr_i = htonl(addr_i); + SET_RULE_FLAGS(rule, saddr, addr_i); + } + + if (filter_daddr) { + if (ip2i(filter_daddr, &addr_i)) { + log_err("ip addr format error!\n"); + goto error; + } + addr_i = htonl(addr_i); + SET_RULE_FLAGS(rule, daddr, addr_i); + } + + if (filter_sport > 0xffff || filter_dport > 0xffff || filter_port > 0xffff) { + log_err("port range error!\n"); + goto error; + } + + if (filter_port > 0) { + filter_port = (int) htons((u16) filter_port); + rule->sport = filter_port; + rule->__flags |= FLAG_port; + } + + if (filter_sport > 0) { + filter_sport = (int) htons((u16) filter_sport); + SET_RULE_FLAGS(rule, sport, filter_sport); + } + + if (filter_dport > 0) { + filter_dport = (int) htons((u16) filter_dport); + SET_RULE_FLAGS(rule, dport, filter_dport); + } + + if (filter_proto) { + if ((proto = str2proto3(filter_proto)) >= 0) { + SET_RULE_FLAGS(rule, proto_3, proto); + } else if ((proto = str2proto4(filter_proto)) >= 0) { + SET_RULE_FLAGS(rule, proto_4, proto); + } else { + log_err("proto not found!\n"); + goto error; + } + } + + if (add_rule(rule)) + kfree(rule); + + return 0; +error: + kfree(rule); + return -EINVAL; +} + +int init_args(void) +{ + int err = -EINVAL; + + if (print_output != NULL) { + if (streq(print_output, "ftrace")) + op_type = OUTPUT_FTRACE; + else if 
(streq(print_output, "kernel")) + op_type = OUTPUT_KERNEL; + else if (!init_output_file(print_output)) + op_type = OUTPUT_FILE; + else { + log_err("output type error!\n"); + goto error; + } + } + + if (print_dump != NULL) { + err = access_path(print_dump); + if (err) { + log_err("[nettrace] access_path err:%d\n", err); + log_err("[nettrace] failed to access the dump output directory: %s\n", + print_dump); + goto error; + } + } + + if (filter_trace_len == 0 && filter_probe_len == 0) { + pr_alert("Don't worry. execute 'dmesg' and see usage of nettrace.ko\n"); + print_help(); + goto error; + } + + if (filter_trace_len == 1 && streq(filter_trace[0], "?")) { + pr_alert("Execute 'dmesg' and see available parameters of 'trace='\n"); + print_trace(); + goto error; + } + + if (filter_probe_len == 1 && streq(filter_probe[0], "?")) { + pr_alert("Execute 'dmesg' and see available parameters of 'probe='\n"); + print_probe(); + goto error; + } + + if (param_flag) { + char *tmp_flag; + while ((tmp_flag = strsep(¶m_flag, ",")) != NULL) { + switch (*tmp_flag) { + case 'v': + if_print_info = 1; + break; + default: + log_err("flags:%c not supported\n", *tmp_flag); + goto error; + } + } + } + + if(init_rule()) + goto error; + + return 0; +error: + return err; +} diff --git a/net/nettrace/help.h b/net/nettrace/help.h new file mode 100644 index 0000000000000000000000000000000000000000..c9b9471d7841f7a43a1e7c3f6a3036d3f836c823 --- /dev/null +++ b/net/nettrace/help.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef NETDUMP_HELP_H +#define NETDUMP_HELP_H + +#include +#include + +#define MAX_LOG_BUF 200 + +typedef enum { + OUTPUT_KERNEL, + OUTPUT_FTRACE, + OUTPUT_FILE +} output_type; + +extern char *output_file_path; +extern output_type op_type; +extern int if_print_info; + +extern char *filter_saddr; +extern char *filter_daddr; +extern char *filter_addr; +extern char *filter_proto; + +extern int filter_port; +extern int filter_sport; +extern int filter_dport; + +extern char *filter_trace[]; +extern char *filter_probe[]; +extern int filter_trace_len; +extern int filter_probe_len; + +extern int print_stack; +extern int print_ustack; + +extern char *print_output; +extern char *print_dump; +extern char *param_flag; +extern int param_mm; + +extern int init_output_file(char *path); + +extern void log_base(char *fmt, ...); + +extern int init_args(void); + +#define log_info(fmt, args...) {if(if_print_info)\ + log_base(fmt, ##args);} +#define log_data(fmt, args...) log_base(fmt, ##args) +#define log_err(fmt, args...) log_base(fmt, ##args) +#define log_debug(fmt, args...) log_base(fmt, ##args) + +static inline void print_leave(void) { + log_info("you just exited netdump, welcome back~\n"); +} + +#endif //NETDUMP_HELP_H diff --git a/net/nettrace/kprobe.c b/net/nettrace/kprobe.c new file mode 100644 index 0000000000000000000000000000000000000000..92fe02f6dbd5994ce563922f9c1fbf5bdad57de6 --- /dev/null +++ b/net/nettrace/kprobe.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include "kprobe.h" +#include "help.h" +#include "group.h" + +static kprobe_opcode_t *query_kallsym_addr; + +/* + * Get the function arg with index of 'index' from 'regs'. + * + * This is just another version of the 'KPROBE_PARM'. + */ +unsigned long kprobe_parm(struct pt_regs *regs, int index) +{ + switch (index) { + case 1: + return PT_REGS_PARM1(regs); + case 2: + return PT_REGS_PARM2(regs); + case 3: + return PT_REGS_PARM3(regs); + case 4: + return PT_REGS_PARM4(regs); + case 5: + return PT_REGS_PARM5(regs); + default: + return 0; + } +} + +/* + * Declare the kprobe with function call. + */ +struct kprobe *kprobe_declare(const char *sym, void *handler) +{ + struct kprobe *p = kcalloc(1, sizeof(struct kprobe), GFP_KERNEL); + + if (!p) + return NULL; + + p->symbol_name = sym; + p->post_handler = handler; + + return p; +} + +static int skip_entry_handler(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + return 0; +} + +/* + * Declare the kretprobe with function call. 
+ */ +struct kretprobe *kretprobe_declare(const char *sym, void *handler, + void *entry_handler) +{ + struct kretprobe *p = kcalloc(1, sizeof(struct kretprobe), GFP_KERNEL); + + if (!p) + return NULL; + + if (entry_handler == NULL) + entry_handler = skip_entry_handler; + + p->entry_handler = entry_handler; + p->kp.symbol_name = sym; + p->handler = handler; + + return p; +} + +int filter_syms(void *data, const char *name_buf, unsigned long address) +{ + char *name = data; + + if (strstarts(name_buf, name)) { + if (strlen(name_buf) > MAX_TP_NAME) { + log_err("[nettrace: %s] func name is too long: %s\n", __func__, name_buf); + return -1; + } + strncpy(name, name_buf, MAX_TP_NAME); + query_kallsym_addr = (kprobe_opcode_t *) address; + return -1; + } + return 0; +} + +/* + * query the address of syms in kallsyms. + */ +int query_kallsyms(char *name) +{ + if (strlen(name) + 1 >= MAX_TP_NAME) { + log_err("[nettrace: %s] func name is too long: %s\n", __func__, name); + return -1; + } + strncat(name, ".", 1); + return kallsyms_on_each_symbol(filter_syms, name); +} + +/* + * register the kprobe with function call. + */ +int c_register_kprobe(struct kprobe *p) +{ + if (register_kprobe(p) < 0) { + char name[MAX_TP_NAME] = {}; + strncpy(name, p->symbol_name, sizeof(name) - 1); + if (query_kallsyms(name)) { + pr_alert("[nettrace] Note: The function we want to trace (%s) has become " + "%s bacause the kernel compiler had added a suffix in its symbol! " + "There may be several versions of %s, please check /proc/kallsyms\n", + (char *) p->symbol_name, name, (char *) p->symbol_name); + strncpy((char *) p->symbol_name, name, sizeof(name) - 1); + if (register_kprobe(p) < 0) + goto on_err; + } else { + goto on_err; + } + } + + log_info(" planted kprobe at %p, name %s\n", p->addr, p->symbol_name); + return 0; + +on_err: + return -1; +} + +/* + * register the kprobe with function call. 
+ */ +int c_register_kretprobe(struct kretprobe *p) +{ + if (register_kretprobe(p) < 0) { + char name[MAX_TP_NAME] = {}; + strncpy(name, p->kp.symbol_name, sizeof(name) - 1); + if (query_kallsyms(name)) { + pr_alert("[nettrace] Note: The function we want to trace (%s) has become " + "%s bacause the kernel compiler had added a suffix in its symbol! " + "There may be several versions of %s, please check /proc/kallsyms\n", + (char *) p->kp.symbol_name, name, (char *) p->kp.symbol_name); + strncpy((char *) p->kp.symbol_name, name, sizeof(name) - 1); + if (register_kretprobe(p) < 0) + goto on_err; + } else { + goto on_err; + } + } + + log_info(" planted kretprobe at %p, name %s\n", p->kp.addr, p->kp.symbol_name); + return 0; + +on_err: + log_err(" register kretprobe failed: %s\n", p->kp.symbol_name); + return -1; +} + +/* + * unregister the krpobe with function call. + */ +void c_unregister_kprobe(struct kprobe *p) +{ + unregister_kprobe(p); + log_info("kprobe at %s unregistered\n", p->symbol_name); +} + +/* + * unregister the kretprobe with function call. + */ +void c_unregister_kretprobe(struct kretprobe *p) +{ + unregister_kretprobe(p); + log_info("kretprobe at %s unregistered\n", p->kp.symbol_name); +} diff --git a/net/nettrace/kprobe.h b/net/nettrace/kprobe.h new file mode 100644 index 0000000000000000000000000000000000000000..9f38e788e85f8b95a8895f71009ac6d11b291694 --- /dev/null +++ b/net/nettrace/kprobe.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef KPROBE_COMMON_H +#define KPROBE_COMMON_H + +#include +#include +#include +#include + +#define DECLARE_HANDLER_POST(func) \ + static void post_handler_##func( \ + struct kprobe *p, \ + struct pt_regs *regs, \ + unsigned long flags) +#define DECLARE_HANDLER_RET(func) \ + static int ret_handler##func( \ + struct kretprobe_instance *ri, \ + struct pt_regs *regs) +#define DECLARE_HANDLER_ENTRY(func) \ + static int entry_handler##func( \ + struct kretprobe_instance *ri, \ + struct pt_regs *regs) + +#define DECLARE_KPROBE(func) \ + DECLARE_HANDLER_POST(func); \ + static struct kprobe kprobe_##func = { \ + .symbol_name = #func, \ + .post_handler = post_handler_##func}; \ + DECLARE_HANDLER_POST(func) + +#define DECLARE_RETKPROBE(func) \ + DECLARE_HANDLER_ENTRY(func); \ + DECLARE_HANDLER_RET(func); \ + static struct kretprobe kretprobe_##func = { \ + .handler = ret_handler##func, \ + .entry_handler = entry_handler##func, \ + .kp = { .symbol_name = #func } \ + }; \ + DECLARE_HANDLER_RET(func) + +#define REGISTER_KPROBE(func) \ + if (register_kprobe(&kprobe_##func) < 0) \ + { \ + printk(KERN_INFO "register_kprobe failed:" #func "\n"); \ + return 0; \ + } \ + printk(KERN_INFO "Planted kprobe at %p, handler addr %p, name %s\n",\ + kprobe_##func.addr, kprobe_##func.post_handler, \ + kprobe_##func.symbol_name); + +#define UNREGISTER_KPROBE(func) \ + unregister_kprobe(&kprobe_##func); \ + printk(KERN_INFO "kprobe at %p unregistered\n", kprobe_##func.addr); + +#define PARAM_STRING(name, def) \ + char *name = def; \ + module_param(name, charp, 0); + +#define PARAM_STRING_ARRAY(name, cmdname, length) \ + char *name[length]; \ + int name##_len = 0; \ + module_param_array_named(cmdname, name, charp, &name##_len, 0); + +#define PARAM_STRING_RW(name, default) \ + char *name = default; \ + 
module_param(name, charp, 0644); + +#define PARAM_STRING_NAMED(name, cmdname, default) \ + char *name = default; \ + module_param_named(cmdname, name, charp, 0644); + +#define PARAM_INT(name, default) \ + int name = default; \ + module_param(name, int, 0); + +#define PARAM_INT_RW(name, default) \ + int name = default; \ + module_param(name, int, 0644); + +#define PARAM_INT_NAMED(name, cmdname, default) \ + int name = default; \ + module_param_named(cmdname, name, int, 0644); + +#define KPROBE_INIT \ + static int __init kprobe_init(void);\ + module_init(kprobe_init); \ + static int __init kprobe_init(void) + +#define KPROBE_EXIT \ + static void __exit kprobe_exit(void); \ + module_exit(kprobe_exit); \ + static void __exit kprobe_exit(void) + +#if defined(__x86_64__) + +#define PT_REGS_PARM1(x) ((x)->di) +#define PT_REGS_PARM2(x) ((x)->si) +#define PT_REGS_PARM3(x) ((x)->dx) +#define PT_REGS_PARM4(x) ((x)->cx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->sp) +#define PT_REGS_FP(x) ((x)->bp) +#define PT_REGS_RC(x) ((x)->ax) +#define PT_REGS_SP(x) ((x)->sp) + +#elif defined(__s390x__) + +#define PT_REGS_PARM1(x) ((x)->gprs[2]) +#define PT_REGS_PARM2(x) ((x)->gprs[3]) +#define PT_REGS_PARM3(x) ((x)->gprs[4]) +#define PT_REGS_PARM4(x) ((x)->gprs[5]) +#define PT_REGS_PARM5(x) ((x)->gprs[6]) +#define PT_REGS_RET(x) ((x)->gprs[14]) +#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->gprs[2]) +#define PT_REGS_SP(x) ((x)->gprs[15]) + +#elif defined(__aarch64__) + +#define PT_REGS_PARM1(x) ((x)->regs[0]) +#define PT_REGS_PARM2(x) ((x)->regs[1]) +#define PT_REGS_PARM3(x) ((x)->regs[2]) +#define PT_REGS_PARM4(x) ((x)->regs[3]) +#define PT_REGS_PARM5(x) ((x)->regs[4]) +#define PT_REGS_RET(x) ((x)->regs[30]) +#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[0]) +#define PT_REGS_SP(x) ((x)->sp) + +#elif defined(__arm__) + +#define 
PT_REGS_PARM1(x) ((x)->uregs[0]) +#define PT_REGS_PARM2(x) ((x)->uregs[1]) +#define PT_REGS_PARM3(x) ((x)->uregs[2]) +#define PT_REGS_PARM4(x) ((x)->uregs[3]) +#define PT_REGS_PARM5(x) ((x)->uregs[4]) +#define PT_REGS_RET(x) ((x)->uregs[14]) +#define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->uregs[0]) +#define PT_REGS_SP(x) ((x)->uregs[13]) +#define PT_REGS_IP(x) ((x)->uregs[12]) + +#elif defined(__mips__) + +#define PT_REGS_PARM1(x) ((x)->regs[4]) +#define PT_REGS_PARM2(x) ((x)->regs[5]) +#define PT_REGS_PARM3(x) ((x)->regs[6]) +#define PT_REGS_PARM4(x) ((x)->regs[7]) +#define PT_REGS_PARM5(x) ((x)->regs[8]) +#define PT_REGS_RET(x) ((x)->regs[31]) +#define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[2]) +#define PT_REGS_SP(x) ((x)->regs[29]) +#define PT_REGS_IP(x) ((x)->cp0_epc) + +#elif defined(__powerpc__) + +#define PT_REGS_PARM1(x) ((x)->gpr[3]) +#define PT_REGS_PARM2(x) ((x)->gpr[4]) +#define PT_REGS_PARM3(x) ((x)->gpr[5]) +#define PT_REGS_PARM4(x) ((x)->gpr[6]) +#define PT_REGS_PARM5(x) ((x)->gpr[7]) +#define PT_REGS_RC(x) ((x)->gpr[3]) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->nip) + +#endif + +#define KPROBE_PARM(type, name, index) type name = (type)PT_REGS_PARM##index(regs); +#define KPROBE_RET_PARM regs_return_value(regs); + +//Below is the function encapsulation + +extern unsigned long kprobe_parm(struct pt_regs *regs, int index); + +extern struct kprobe *kprobe_declare(const char *sym, void *handler); + +extern struct kretprobe *kretprobe_declare(const char *sym, void *handler, void *entry_handler); + +extern int c_register_kprobe(struct kprobe *p); + +extern int c_register_kretprobe(struct kretprobe *p); + +extern void c_unregister_kprobe(struct kprobe *p); + +extern void c_unregister_kretprobe(struct kretprobe *p); + +MODULE_LICENSE("GPL"); +#endif diff --git a/net/nettrace/mm.c b/net/nettrace/mm.c new file 
mode 100644 index 0000000000000000000000000000000000000000..2b4e88e967268437ef2f76e88dc9c8d1d4dc3778 --- /dev/null +++ b/net/nettrace/mm.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2024 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include "group.h" +#include "help.h" +#include "mm.h" + +#define MIN_CACHED_SKB_OBJS 100 + +struct sk_buff_nettrace *get_cached_skb(TRACE_POINT *tp) +{ + if (!tp->nr_skb_objs) + return NULL; + + WRITE_ONCE(tp->nr_skb_objs, tp->nr_skb_objs - 1); + return (struct sk_buff_nettrace *)llist_del_first(&tp->skbcache); +} + +bool put_cached_skb(TRACE_POINT *tp, struct sk_buff_nettrace *skb) +{ + if (tp->nr_skb_objs >= param_mm) + return false; + + llist_add((struct llist_node *) skb, &tp->skbcache); + WRITE_ONCE(tp->nr_skb_objs, tp->nr_skb_objs + 1); + return true; +} + +u8 *get_cached_data(TRACE_POINT *tp) +{ + if (!tp->nr_data_objs) + return NULL; + + WRITE_ONCE(tp->nr_data_objs, tp->nr_data_objs - 1); + return (u8 *)llist_del_first(&tp->datacache); +} + +bool put_cached_data(TRACE_POINT *tp, u8 *data) +{ + if (tp->nr_data_objs >= param_mm) + return false; + + llist_add((struct llist_node *) data, &tp->datacache); + WRITE_ONCE(tp->nr_data_objs, tp->nr_data_objs + 1); + return true; +} + +void init_dump_mm(TRACE_POINT *tp) +{ + int i; + struct sk_buff_nettrace *skb; + u8 
*data; + + if (param_mm < MIN_CACHED_SKB_OBJS) + param_mm = MIN_CACHED_SKB_OBJS; + + for (i = 0; i < param_mm; i++) { + skb = (struct sk_buff_nettrace *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (skb) + put_cached_skb(tp, skb); + else + pr_err("Failed to preallocate for nettrace dump skb! number:%d\n", i); + } + + for (i = 0; i < param_mm; i++) { + data = (u8 *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (data) + put_cached_data(tp, data); + else + pr_err("Failed to preallocate for nettrace dump data! number:%d\n", i); + } + + /* Partial failure does not affect usage. */ +} + +void release_dump_mm(TRACE_POINT *tp) +{ + int i; + struct sk_buff_nettrace *skb; + u8 *data; + + for (i = 0; i < param_mm; i++) { + skb = get_cached_skb(tp); + if (skb) + free_page((unsigned long)skb); + } + + for (i = 0; i < param_mm; i++) { + data = get_cached_data(tp); + free_page((unsigned long)data); + } +} + +static struct sk_buff_nettrace *__alloc_skb_nettrace(TRACE_POINT *tp) +{ + struct sk_buff_nettrace *skb; + u8 *data; + + skb = get_cached_skb(tp); + if (!skb) + goto out; + prefetchw(skb); + memset(skb, 0, sizeof(struct sk_buff_nettrace)); + + data = get_cached_data(tp); + if (!data) + goto nodata; + + skb->data = data; + memset(data, 0, PAGE_SIZE); + + skb_queue_head_init(&skb->frag_list); +out: + return skb; +nodata: + put_cached_skb(tp, skb); + skb = NULL; + goto out; +} + +void release_skb_nettrace(struct sk_buff_nettrace *skb, TRACE_POINT *tp) +{ + struct sk_buff *frag, *tmp; + if (!skb_queue_empty(&skb->frag_list)) { + skb_queue_walk_safe(&skb->frag_list, frag, tmp) { + put_cached_data(tp, ((struct sk_buff_nettrace *)frag)->data); + put_cached_skb(tp, (struct sk_buff_nettrace *)frag); + } + } + put_cached_data(tp, skb->data); + put_cached_skb(tp, skb); +} + +struct sk_buff_nettrace *skb_copy_nettrace(const struct sk_buff *skb, TRACE_POINT *tp) +{ + int headerlen = skb_headroom(skb) - skb->mac_header; + unsigned int size = skb->len + headerlen; + 
unsigned int frag_num = size / PAGE_SIZE + 1; + unsigned int i; + struct sk_buff_nettrace *n; + unsigned int dump_size = 0; + + /* Alloc skb. */ + for (i = 0; i < frag_num; i++) { + if (!i) { + n = __alloc_skb_nettrace(tp); + if (!n) + return NULL; + } else { + struct sk_buff_nettrace *frag = __alloc_skb_nettrace(tp); + if (!frag) + goto nofrag; + __skb_queue_tail(&n->frag_list, (struct sk_buff *)frag); + } + } + + /* Copy first page. */ + n->len = size > PAGE_SIZE ? PAGE_SIZE : headerlen + skb->len; + if (n->data) { + if (skb_copy_bits(skb, -headerlen, n->data, n->len)) + pr_warn("[nettrace]: Copy SKB failed.\n"); + } + dump_size += n->len; + /* Only one. */ + if (size <= PAGE_SIZE) { + n->total_len = dump_size; + return n; + } + + /* Copy frag pages. */ + if (!skb_queue_empty(&n->frag_list)) { + struct sk_buff *frag, *tmp; + unsigned int i = 0; + + skb_queue_walk_safe(&n->frag_list, frag, tmp) { + i++; + ((struct sk_buff_nettrace *)frag)->len = size - PAGE_SIZE * i > PAGE_SIZE ? + PAGE_SIZE : size - PAGE_SIZE * i; + if (((struct sk_buff_nettrace *)frag)->data) { + if (skb_copy_bits(skb, -headerlen + PAGE_SIZE * i, + ((struct sk_buff_nettrace *)frag)->data, + ((struct sk_buff_nettrace *)frag)->len)) + pr_warn("[nettrace]: Copy SKB failed.\n"); + } + dump_size += ((struct sk_buff_nettrace *)frag)->len; + } + } + if (n) + n->total_len = dump_size; + return n; + +nofrag: + release_skb_nettrace(n, tp); + return NULL; +} + diff --git a/net/nettrace/mm.h b/net/nettrace/mm.h new file mode 100644 index 0000000000000000000000000000000000000000..00d386b6097324ebec40784715849c9bf63990d1 --- /dev/null +++ b/net/nettrace/mm.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2024 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef NETDUMP_MM_H +#define NETDUMP_MM_H + +#include "group.h" + +struct sk_buff_nettrace { + struct sk_buff_nettrace *next; + struct sk_buff_nettrace *prev; + struct sk_buff_head frag_list; + unsigned int len; + unsigned int total_len; + unsigned char *data; +}; + +typedef struct trace_point TRACE_POINT; +struct sk_buff_nettrace *skb_copy_nettrace(const struct sk_buff *skb, TRACE_POINT *tp); +void init_dump_mm(TRACE_POINT *tp); +void release_dump_mm(TRACE_POINT *tp); +void release_skb_nettrace(struct sk_buff_nettrace *skb, TRACE_POINT *tp); + +struct sk_buff_nettrace *get_cached_skb(TRACE_POINT *tp); +bool put_cached_skb(TRACE_POINT *tp, struct sk_buff_nettrace *skb); +u8 *get_cached_data(TRACE_POINT *tp); +bool put_cached_data(TRACE_POINT *tp, u8 *data); + +#endif //NETDUMP_MM_H diff --git a/net/nettrace/parser.c b/net/nettrace/parser.c new file mode 100644 index 0000000000000000000000000000000000000000..dfe06c065f57b88572b605e9d38d43e5536adae7 --- /dev/null +++ b/net/nettrace/parser.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include "parser.h" +#include "help.h" +#include "utils.h" + +LIST_HEAD(rule_list); + +typedef struct inet_proto { + int number; + char *name; +} INET_PROTO; + +const INET_PROTO proto4[] = { + {.number = 0, .name = "ip"}, + {.number = 1, .name = "icmp"}, + {.number = 2, .name = "igmp"}, + {.number = 4, .name = "ipip"}, + {.number = 6, .name = "tcp"}, + {.number = 8, .name = "egp"}, + {.number = 12, .name = "pup"}, + {.number = 17, .name = "udp"}, + {.number = 22, .name = "idp"}, + {.number = 29, .name = "tp"}, + {.number = 33, .name = "dccp"}, + {.number = 41, .name = "ipv6"}, + {.number = 46, .name = "rsvp"}, + {.number = 47, .name = "gre"}, + {.number = 50, .name = "esp"}, + {.number = 51, .name = "ah"}, + {.number = 92, .name = "mtp"}, + {.number = 94, .name = "beetph"}, + {.number = 98, .name = "encap"}, + {.number = 103, .name = "pim"}, + {.number = 108, .name = "comp"}, + {.number = 132, .name = "sctp"}, + {.number = 136, .name = "udplite"}, + {.number = 137, .name = "mpls"}, + {.number = 255, .name = "raw"} +}; +const int proto4_len = sizeof(proto4) / sizeof(INET_PROTO); + +const INET_PROTO proto3[] = { + {.number = 0x0060, .name = "loop"}, + {.number = 0x0200, .name = "pup"}, + {.number = 0x0201, .name = "pupat"}, + {.number = 0x22F0, .name = "tsn"}, + {.number = 0x22EB, .name = "erspan2"}, + {.number = 0x0800, .name = "ip"}, + {.number = 0x0805, .name = "x25"}, + {.number = 0x0806, .name = "arp"}, + {.number = 0x08FF, .name = "bpq"}, + {.number = 0x0a00, .name = "ieeepup"}, + {.number = 0x0a01, .name = "ieeepupat"}, + {.number = 0x4305, .name = "batman"}, + {.number = 0x6000, .name = "dec"}, + {.number = 0x6001, .name = "dna_dl"}, + {.number = 0x6002, .name = "dna_rc"}, + {.number = 0x6003, .name = "dna_rt"}, + {.number = 0x6004, .name = "lat"}, + {.number = 0x6005, .name = "diag"}, + {.number = 0x6006, 
.name = "cust"}, + {.number = 0x6007, .name = "sca"}, + {.number = 0x6558, .name = "teb"}, + {.number = 0x8035, .name = "rarp"}, + {.number = 0x809B, .name = "atalk"}, + {.number = 0x80F3, .name = "aarp"}, + {.number = 0x8100, .name = "8021q"}, + {.number = 0x88BE, .name = "erspan"}, + {.number = 0x8137, .name = "ipx"}, + {.number = 0x86DD, .name = "ipv6"}, + {.number = 0x8808, .name = "pause"}, + {.number = 0x8809, .name = "slow"}, + {.number = 0x883E, .name = "wccp"}, + {.number = 0x8847, .name = "mpls_uc"}, + {.number = 0x8848, .name = "mpls_mc"}, + {.number = 0x884c, .name = "atmmpoa"}, + {.number = 0x8863, .name = "ppp_disc"}, + {.number = 0x8864, .name = "ppp_ses"}, + {.number = 0x886c, .name = "link_ctl"}, + {.number = 0x8884, .name = "atmfate"}, + {.number = 0x888E, .name = "pae"}, + {.number = 0x88A2, .name = "aoe"}, + {.number = 0x88A8, .name = "8021ad"}, + {.number = 0x88B5, .name = "802_ex1"}, + {.number = 0x88C7, .name = "preauth"}, + {.number = 0x88CA, .name = "tipc"}, + {.number = 0x88CC, .name = "lldp"}, + {.number = 0x88E5, .name = "macsec"}, + {.number = 0x88E7, .name = "8021ah"}, + {.number = 0x88F5, .name = "mvrp"}, + {.number = 0x88F7, .name = "1588"}, + {.number = 0x88F8, .name = "ncsi"}, + {.number = 0x88FB, .name = "prp"}, + {.number = 0x8906, .name = "fcoe"}, + {.number = 0x8915, .name = "iboe"}, + {.number = 0x890D, .name = "tdls"}, + {.number = 0x8914, .name = "fip"}, + {.number = 0x8917, .name = "80221"}, + {.number = 0x892F, .name = "hsr"}, + {.number = 0x894F, .name = "nsh"}, + {.number = 0x9000, .name = "loopback"}, + {.number = 0x9100, .name = "qinq1"}, + {.number = 0x9200, .name = "qinq2"}, + {.number = 0x9300, .name = "qinq3"}, + {.number = 0xDADA, .name = "edsa"}, + {.number = 0xDADB, .name = "dsa_8021q"}, + {.number = 0xED3E, .name = "ife"}, + {.number = 0xFBFB, .name = "af_iucv"}, + {.number = 0x0600, .name = "802_3_min"} +}; +const int proto3_len = sizeof(proto3) / sizeof(INET_PROTO); + + 
+/**************************************************************************************************** + * + * This is the part for skb bag parse + * + ****************************************************************************************************/ + +/*parse u32 to ip addr string.*/ +int i2ip(__be32 ip, char *dest) +{ + u8 *tmp = (u8 *) &ip; + + return sprintf(dest, "%u.%u.%u.%u", + *(tmp), + *(tmp + 1), + *(tmp + 2), + *(tmp + 3)) < 0; +} + +/*parse ip addr string to u32.*/ +int ip2i(char *ip_str, u32 *ip) +{ + int ip_v[4]; + int i = 0; + u32 ip_tmp = 0; + int tmp; + + if (sscanf(ip_str, "%d.%d.%d.%d", + &ip_v[0], + &ip_v[1], + &ip_v[2], + &ip_v[3]) < 4) + return -1; + + for (; i < 4; i++) { + tmp = ip_v[i]; + if (tmp < 0 || tmp > 255) + return -1; + ip_tmp += (((u32)tmp) << ((3 - i) * 8)); + } + *ip = ip_tmp; + return 0; +} + +static +int str2proto(char *proto_str, INET_PROTO proto[], int len) +{ + int i = 0; + + for (; i < len; i++) { + if (streq(proto[i].name, proto_str)) + return proto[i].number; + } + return -1; +} + +static +char *proto2str(int proto, INET_PROTO protos[], int len) +{ + int i = 0; + + for (; i < len; i++) { + if (protos[i].number == proto) + return protos[i].name; + } + return NULL; +} + +int str2proto3(char *proto_str) +{ + return str2proto(proto_str, (INET_PROTO *) proto3, proto3_len); +} + +int str2proto4(char *proto_str) +{ + return str2proto(proto_str, (INET_PROTO *) proto4, proto4_len); +} + +char *proto3tostr(int p) +{ + return proto2str(p, (INET_PROTO *) proto3, proto3_len); +} + +char *proto4tostr(int p) +{ + return proto2str(p, (INET_PROTO *) proto4, proto4_len); +} + +void flag2str(struct tcphdr *tcp, char *str) +{ + if (strlen(str) + 1 > TCP_FLAG_LEN) { + log_err("%s: string length(%s) exceeds TCP_FLAG_LEN\n", __func__, str); + return; + } + + if (tcp->psh) + strncat(str, "P", 1); + if (tcp->rst) + strncat(str, "R", 1); + if (tcp->syn) + strncat(str, "S", 1); + if (tcp->fin) + strncat(str, "F", 1); +} + 
+/**************************************************************************************************** + * + * This is the part for ip bag match + * + ****************************************************************************************************/ + +/* + * 'remote' is the match rule that generated form ip package, 'local' is the + * match rule that user define. + * + * The return value is 0 if matched, and -1 otherwise. + */ +bool match_rule(COMMON_RULE *remote, COMMON_RULE *local) +{ + + if ((local->__flags & FLAG_proto_3) && local->proto_3 != remote->proto_3) + return false; + if ((local->__flags & FLAG_proto_4) && local->proto_4 != remote->proto_4) + return false; + + if (local->__flags & FLAG_port && + local->sport != remote->sport && + local->sport != remote->dport) + return false; + + if ((local->__flags & FLAG_dport) && local->dport != remote->dport) + return false; + if ((local->__flags & FLAG_sport) && local->sport != remote->sport) + return false; + + if (local->__flags & FLAG_addr && ( + local->saddr != (remote->saddr & local->s_mask) && + local->saddr != (remote->daddr & local->s_mask))) + return false; + + if ((local->__flags & FLAG_saddr) && + local->saddr != (remote->saddr & local->s_mask)) + return false; + if ((local->__flags & FLAG_daddr) && + local->daddr != (remote->daddr & local->d_mask)) + return false; + + return true; +} + +bool match_all_rule(COMMON_RULE *remote) +{ + bool is_empty = true; + COMMON_RULE *rule; + + list_for_each_entry(rule, &rule_list, list) { + is_empty = false; + if (match_rule(remote, rule)) + return true; + } + + if (is_empty) + return true; + return false; +} + +int add_rule(COMMON_RULE *rule) +{ + if (rule->__flags) { + list_add_tail(&rule->list, &rule_list); + return 0; + } + return -1; +} + +void free_rules(void) { + COMMON_RULE *rule, *pre; + list_for_each_entry_safe(rule, pre, &rule_list, list) { + kfree(rule); + } +} diff --git a/net/nettrace/parser.h b/net/nettrace/parser.h new file mode 100644 index 
0000000000000000000000000000000000000000..edbb714a6f43b06d336799e5718df6d951a9cda0 --- /dev/null +++ b/net/nettrace/parser.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef IP_PARSER_H +#define IP_PARSER_H + +#include +#include + +#define IP_ADDR_LEN 16 + +#define TCP_FLAG_LEN 6 + +/* flags for network package. */ +#define FLAG_proto_3 (1UL << 0) +#define FLAG_proto_4 (1UL << 1) +#define FLAG_sport (1UL << 2) +#define FLAG_dport (1UL << 3) +#define FLAG_saddr (1UL << 4) +#define FLAG_daddr (1UL << 5) + +/* flags for match. 
*/ +#define FLAG_addr (1UL << 6) +#define FLAG_port (1UL << 7) + +typedef +struct common_rule { + struct list_head list; + __u16 __flags; + __u16 proto_3; + __u8 proto_4; + __u16 sport; + __u16 dport; + __u32 saddr; + __u32 s_mask; + __u32 daddr; + __u32 d_mask; + __u8 flags; +} COMMON_RULE; + +#define SET_RULE_FLAGS(rule, flags, value) \ +{\ + (rule)->flags = value;\ + (rule)->__flags |= FLAG_##flags;\ +} + +extern int i2ip(u32 ip, char *dest); + +extern int ip2i(char *ip_str, u32 *ip); + +extern int str2proto4(char *proto_str); + +extern int str2proto3(char *proto_str); + +extern char *proto3tostr(int p); + +extern char *proto4tostr(int p); + +extern bool match_rule(COMMON_RULE *remote, COMMON_RULE *local); + +extern bool match_all_rule(COMMON_RULE *remote); + +extern int add_rule(COMMON_RULE *rule); + +extern void free_rules(void); + +extern void flag2str(struct tcphdr *tcp, char *str); + +#endif //IP_PARSER_H diff --git a/net/nettrace/procfs.c b/net/nettrace/procfs.c new file mode 100644 index 0000000000000000000000000000000000000000..ae63fd076d00d3464c1c86530c07e13cc099c081 --- /dev/null +++ b/net/nettrace/procfs.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "procfs.h" +#include "group.h" +#include "dump.h" + +static struct proc_dir_entry *ntrace_proc; + +static int kprobe_tp_info_seq_show(struct seq_file *seq, void *v) +{ + TRACE_POINT *tp; + int i = 0; + + seq_printf(seq, "current registered kernel function:\n"); + for_each_tp(i, tp) { + if (!tp->kprobe) + continue; + seq_printf(seq, "%30s:%p\n", tp->name, tp->kprobe->addr); + } + + return 0; +} + +static int kprobe_tp_info_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, kprobe_tp_info_seq_show, NULL); +} + +static const struct proc_ops kprobe_tp_info_ops = { + .proc_open = kprobe_tp_info_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int nettrace_statistics_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "The total dump_loss: %u\n", dump_loss_due_to_no_memory + + dump_skb_over_cnt + dump_skb_over_size); + seq_printf(seq, "dump_queue_no_memory: %u\n", dump_loss_due_to_no_memory); + seq_printf(seq, "dump_file_over_cnt: %u\n", dump_skb_over_cnt); + seq_printf(seq, "dump_file_over_size:%u\n", dump_skb_over_size); + + return 0; +} + +static int nettrace_statistics_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, nettrace_statistics_seq_show, NULL); +} + +static const struct proc_ops nettrace_statistics_ops = { + .proc_open = nettrace_statistics_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int max_dump_skb_cnt_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", max_dump_skb_cnt); + return 0; +} + +static int max_dump_skb_cnt_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, max_dump_skb_cnt_seq_show, NULL); +} + +static ssize_t max_dump_skb_cnt_proc_write(struct file *file, + const char __user *buf, size_t count, loff_t *pos) +{ + char 
buffer[32]; + int temp_value; + int err = 0; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &temp_value); + if (err) + goto out; + if (temp_value < 0) { + err = -EINVAL; + goto out; + } + + max_dump_skb_cnt = temp_value; +out: + return err < 0 ? err : count; +} + +static const struct proc_ops max_dump_skb_cnt_ops = { + .proc_open = max_dump_skb_cnt_proc_open, + .proc_read = seq_read, + .proc_write = max_dump_skb_cnt_proc_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int max_dump_file_size_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", max_dump_file_size); + return 0; +} + +static int max_dump_file_size_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, max_dump_file_size_seq_show, NULL); +} + +static ssize_t max_dump_file_size_proc_write(struct file *file, + const char __user *buf, size_t count, loff_t *pos) +{ + char buffer[32]; + int temp_value; + int err = 0; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &temp_value); + if (err) + goto out; + if (temp_value < 0) { + err = -EINVAL; + goto out; + } + + max_dump_file_size = temp_value; +out: + return err < 0 ? 
err : count; +} + +static const struct proc_ops max_dump_file_size_ops = { + .proc_open = max_dump_file_size_proc_open, + .proc_read = seq_read, + .proc_write = max_dump_file_size_proc_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +int __net_init ntrace_proc_init(void) +{ + + ntrace_proc = proc_mkdir("ntrace", init_net.proc_net); + + if (ntrace_proc == NULL) + return -ENOMEM; + + if (!proc_create("kprobe", S_IRUGO, ntrace_proc, &kprobe_tp_info_ops)) + goto err_rmdir_ntrace; + if (!proc_create("max_dump_skb_cnt", 0644, ntrace_proc, &max_dump_skb_cnt_ops)) + goto err_rmdir_ntrace; + if (!proc_create("max_dump_file_size", 0644, ntrace_proc, &max_dump_file_size_ops)) + goto err_rmdir_ntrace; + if (!proc_create("statistics", 0444, ntrace_proc, &nettrace_statistics_ops)) + goto err_rmdir_ntrace; + + return 0; + +err_rmdir_ntrace: + remove_proc_subtree("ntrace", init_net.proc_net); + return -ENOMEM; +} + +void __net_exit ntrace_proc_exit(void) +{ + remove_proc_entry("kprobe", ntrace_proc); + proc_remove(ntrace_proc); +} diff --git a/net/nettrace/procfs.h b/net/nettrace/procfs.h new file mode 100644 index 0000000000000000000000000000000000000000..f61f85b6a4a4b8bd3450fd41b0272f56c0a8bf5f --- /dev/null +++ b/net/nettrace/procfs.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef NETDUMP_PROCFS_H +#define NETDUMP_PROCFS_H + +extern int __net_init ntrace_proc_init(void); + +extern void __net_exit ntrace_proc_exit(void); + +#endif //NETDUMP_PROCFS_H diff --git a/net/nettrace/utils.c b/net/nettrace/utils.c new file mode 100644 index 0000000000000000000000000000000000000000..3263f0cbc12e9d7db17c54ac6c7d37ca3eda4286 --- /dev/null +++ b/net/nettrace/utils.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include + + +int file_append(struct file *file, void *data, unsigned int size) +{ + loff_t pos = file->f_pos; + int n = 1; + + while (size > 0 && n > 0) { + n = __kernel_write(file, data, size, &pos); + size -= n; + } + file->f_pos = pos; + return 0; +} + +struct file *file_create(const char *path) +{ + struct file *file = NULL; + + file = filp_open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (IS_ERR(file)) + return NULL; + + return file; +} + +int access_path(const char *path) +{ + struct file *file = NULL; + + file = filp_open(path, O_DIRECTORY, 0644); + if (IS_ERR(file)) + return PTR_ERR(file); + filp_close(file, NULL); + return 0; +} + +void file_close(struct file *file) +{ + filp_close(file, NULL); +} diff --git a/net/nettrace/utils.h b/net/nettrace/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e9df0cf834a357142240cb696a61a7631531cf9e --- /dev/null +++ b/net/nettrace/utils.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef NETDUMP_UTILS_H +#define NETDUMP_UTILS_H + +#include +#include + +#define MAX_FILE_NAME 256 + +#define str_append(dest, fmt, args...) 
/* Only pointers to struct file are used below, so a forward declaration suffices. */
struct file;

/*
 * str_append - append formatted text to the NUL-terminated string @dest.
 *
 * The write is NOT bounded: the caller must guarantee that @dest has room
 * for the formatted output. @dest is evaluated twice, so it must not have
 * side effects.
 */
#define str_append(dest, fmt, args...)					\
	sprintf((dest) + strlen(dest), fmt, ##args)

extern int file_append(struct file *file, void *data, unsigned int size);

extern struct file *file_create(const char *path);

extern void file_close(struct file *file);

/* streq - return 1 if the strings @a and @b are equal, 0 otherwise. */
static inline int streq(const char *a, const char *b)
{
	return strcmp(a, b) == 0;
}

extern int access_path(const char *path);
/* Netlink attribute helpers: next attribute, attribute payload, payload size. */
#define NLA_NEXT(na)		((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_PAYLOAD(len)	((len) - NLA_HDRLEN)

/* Generic netlink helpers: start and size of the data after the genl header. */
#define GENLMSG_DATA(glh)	((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)

#define TASK_COMM_LEN	16
#define MAX_MSG_SIZE	1024
#define MAX_TASKS	1000

/*
 * Copy one field from the local variable 'stats' into tasks[task_count].
 * Relies on both 'tasks' and 'stats' being in scope at the point of use.
 */
#define SET_TASK_STAT(task_count, field) \
	(tasks[(task_count)].field = stats.field)

/* Program settings structure */
struct config {
	int delay;		/* Update interval in seconds */
	int iterations;		/* Number of iterations, 0 == infinite */
	int max_processes;	/* Maximum number of processes to show */
	char sort_field;	/* Field to sort by */
	int output_one_time;	/* Output once and exit */
	int monitor_pid;	/* Monitor specific PID */
	char *container_path;	/* Path to container cgroup */
};

/* Task delay information structure: one count/total pair per delay source */
struct task_info {
	int pid;
	int tgid;
	char command[TASK_COMM_LEN];
	unsigned long long cpu_count;
	unsigned long long cpu_delay_total;
	unsigned long long blkio_count;
	unsigned long long blkio_delay_total;
	unsigned long long swapin_count;
	unsigned long long swapin_delay_total;
	unsigned long long freepages_count;
	unsigned long long freepages_delay_total;
	unsigned long long thrashing_count;
	unsigned long long thrashing_delay_total;
	unsigned long long compact_count;
	unsigned long long compact_delay_total;
	unsigned long long wpcopy_count;
	unsigned long long wpcopy_delay_total;
	unsigned long long irq_count;
	unsigned long long irq_delay_total;
};
+ int nr_sleeping; /* Number of sleeping processes */ + int nr_running; /* Number of running processes */ + int nr_stopped; /* Number of stopped processes */ + int nr_uninterruptible; /* Number of uninterruptible processes */ + int nr_io_wait; /* Number of processes in IO wait */ +}; + +/* Global variables */ +static struct config cfg; +static struct task_info tasks[MAX_TASKS]; +static int task_count; +static int running = 1; +static struct container_stats container_stats; + +/* Netlink socket variables */ +static int nl_sd = -1; +static int family_id; + +/* Set terminal to non-canonical mode for q-to-quit */ +static struct termios orig_termios; +static void enable_raw_mode(void) +{ + struct termios raw; + + tcgetattr(STDIN_FILENO, &orig_termios); + raw = orig_termios; + raw.c_lflag &= ~(ICANON | ECHO); + tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw); +} +static void disable_raw_mode(void) +{ + tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); +} + +/* Display usage information and command line options */ +static void usage(void) +{ + printf("Usage: delaytop [Options]\n" + "Options:\n" + " -h, --help Show this help message and exit\n" + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" + " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" + " -o, --once Display once and exit\n" + " -p, --pid=PID Monitor only the specified PID\n" + " -C, --container=PATH Monitor the container at specified cgroup path\n"); + exit(0); +} + +/* Parse command line arguments and set configuration */ +static void parse_args(int argc, char **argv) +{ + int c; + struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"delay", required_argument, 0, 'd'}, + {"iterations", required_argument, 0, 'n'}, + {"pid", required_argument, 0, 'p'}, + {"once", no_argument, 0, 'o'}, + {"processes", required_argument, 0, 'P'}, + {"container", 
required_argument, 0, 'C'}, + {0, 0, 0, 0} + }; + + /* Set defaults */ + cfg.delay = 2; + cfg.iterations = 0; + cfg.max_processes = 20; + cfg.sort_field = 'c'; /* Default sort by CPU delay */ + cfg.output_one_time = 0; + cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ + cfg.container_path = NULL; + + while (1) { + int option_index = 0; + + c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + usage(); + break; + case 'd': + cfg.delay = atoi(optarg); + if (cfg.delay < 1) { + fprintf(stderr, "Error: delay must be >= 1.\n"); + exit(1); + } + break; + case 'n': + cfg.iterations = atoi(optarg); + if (cfg.iterations < 0) { + fprintf(stderr, "Error: iterations must be >= 0.\n"); + exit(1); + } + break; + case 'p': + cfg.monitor_pid = atoi(optarg); + if (cfg.monitor_pid < 1) { + fprintf(stderr, "Error: pid must be >= 1.\n"); + exit(1); + } + break; + case 'o': + cfg.output_one_time = 1; + break; + case 'P': + cfg.max_processes = atoi(optarg); + if (cfg.max_processes < 1) { + fprintf(stderr, "Error: processes must be >= 1.\n"); + exit(1); + } + if (cfg.max_processes > MAX_TASKS) { + fprintf(stderr, "Warning: processes capped to %d.\n", + MAX_TASKS); + cfg.max_processes = MAX_TASKS; + } + break; + case 'C': + cfg.container_path = strdup(optarg); + break; + default: + fprintf(stderr, "Try 'delaytop --help' for more information.\n"); + exit(1); + } + } +} + +/* Create a raw netlink socket and bind */ +static int create_nl_socket(void) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd < 0) + return -1; + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) { + close(fd); + return -1; + } + + return fd; +} + +/* Send a command via netlink */ +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) 
+{ + struct sockaddr_nl nladdr; + struct nlattr *na; + int r, buflen; + char *buf; + + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + +/* Get family ID for taskstats via netlink */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + char name[100]; + + strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1); + name[sizeof(name) - 1] = '\0'; + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) + return 0; + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + return id; +} + +static int read_comm(int pid, char *comm_buf, size_t buf_size) +{ + char path[64]; + size_t len; + FILE *fp; + + snprintf(path, sizeof(path), "/proc/%d/comm", pid); + fp = fopen(path, "r"); + if (!fp) + return -1; + if 
/*
 * Read the command name of @pid from /proc/<pid>/comm.
 *
 * @pid:      process ID to look up
 * @comm_buf: destination buffer; receives the name without the trailing
 *            newline
 * @buf_size: size of @comm_buf in bytes
 *
 * Returns 0 on success, -1 if the buffer is unusable or the file cannot be
 * opened or read. @comm_buf is left untouched when the file cannot be opened.
 */
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
	char path[64];
	FILE *fp;
	int rc = -1;

	if (!comm_buf || buf_size == 0)
		return -1;

	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
	fp = fopen(path, "r");
	if (!fp)
		return -1;

	if (fgets(comm_buf, (int)buf_size, fp)) {
		/* Cut at the newline, if there is one. */
		comm_buf[strcspn(comm_buf, "\n")] = '\0';
		rc = 0;
	}

	fclose(fp);
	return rc;
}
SET_TASK_STAT(task_count, irq_delay_total); + task_count++; + } + break; + } + nested_len -= NLA_ALIGN(nested->nla_len); + nested = NLA_NEXT(nested); + } + } + nl_len -= NLA_ALIGN(na->nla_len); + na = NLA_NEXT(na); + } + return 0; +} + +static void get_task_delays(void) +{ + char comm[TASK_COMM_LEN]; + struct dirent *entry; + DIR *dir; + int pid; + + task_count = 0; + if (cfg.monitor_pid > 0) { + if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0) + fetch_and_fill_task_info(cfg.monitor_pid, comm); + return; + } + + dir = opendir("/proc"); + if (!dir) { + fprintf(stderr, "Error opening /proc directory\n"); + return; + } + + while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) { + if (!isdigit(entry->d_name[0])) + continue; + pid = atoi(entry->d_name); + if (pid == 0) + continue; + if (read_comm(pid, comm, sizeof(comm)) != 0) + continue; + fetch_and_fill_task_info(pid, comm); + } + closedir(dir); +} + +/* Calculate average delay in milliseconds */ +static double average_ms(unsigned long long total, unsigned long long count) +{ + if (count == 0) + return 0; + return (double)total / 1000000.0 / count; +} + +/* Comparison function for sorting tasks */ +static int compare_tasks(const void *a, const void *b) +{ + const struct task_info *t1 = (const struct task_info *)a; + const struct task_info *t2 = (const struct task_info *)b; + double avg1, avg2; + + switch (cfg.sort_field) { + case 'c': /* CPU */ + avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); + avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); + if (avg1 != avg2) + return avg2 > avg1 ? 1 : -1; + return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + + default: + return t2->cpu_delay_total > t1->cpu_delay_total ? 
1 : -1; + } +} + +/* Sort tasks by selected field */ +static void sort_tasks(void) +{ + if (task_count > 0) + qsort(tasks, task_count, sizeof(struct task_info), compare_tasks); +} + +/* Get container statistics via cgroupstats */ +static void get_container_stats(void) +{ + int rc, cfd; + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } req, resp; + struct nlattr *na; + int nl_len; + struct cgroupstats stats; + + /* Check if container path is set */ + if (!cfg.container_path) + return; + + /* Open container cgroup */ + cfd = open(cfg.container_path, O_RDONLY); + if (cfd < 0) { + fprintf(stderr, "Error opening container path: %s\n", cfg.container_path); + return; + } + + /* Send request for container stats */ + if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) { + fprintf(stderr, "Failed to send request for container stats\n"); + close(cfd); + return; + } + + /* Receive response */ + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { + fprintf(stderr, "Failed to receive response for container stats\n"); + close(cfd); + return; + } + + /* Parse response */ + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) { + /* Get the cgroupstats structure */ + memcpy(&stats, NLA_DATA(na), sizeof(stats)); + + /* Fill container stats */ + container_stats.nr_sleeping = stats.nr_sleeping; + container_stats.nr_running = stats.nr_running; + container_stats.nr_stopped = stats.nr_stopped; + container_stats.nr_uninterruptible = stats.nr_uninterruptible; + container_stats.nr_io_wait = stats.nr_io_wait; + break; + } + nl_len -= NLA_ALIGN(na->nla_len); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + } + + close(cfd); +} + +/* Display results to stdout or log file */ +static void display_results(void) +{ + time_t now = 
time(NULL); + struct tm *tm_now = localtime(&now); + char timestamp[32]; + int i, count; + FILE *out = stdout; + + fprintf(out, "\033[H\033[J"); + + if (cfg.container_path) { + fprintf(out, "Container Information (%s):\n", cfg.container_path); + fprintf(out, "Processes: running=%d, sleeping=%d, ", + container_stats.nr_running, container_stats.nr_sleeping); + fprintf(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n", + container_stats.nr_stopped, container_stats.nr_uninterruptible, + container_stats.nr_io_wait); + } + fprintf(out, "Top %d processes (sorted by CPU delay):\n\n", + cfg.max_processes); + fprintf(out, " PID TGID COMMAND CPU(ms) IO(ms) "); + fprintf(out, "SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms)\n"); + fprintf(out, "-----------------------------------------------"); + fprintf(out, "----------------------------------------------\n"); + count = task_count < cfg.max_processes ? task_count : cfg.max_processes; + + for (i = 0; i < count; i++) { + fprintf(out, "%5d %5d %-15s ", + tasks[i].pid, tasks[i].tgid, tasks[i].command); + fprintf(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", + average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), + average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), + average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), + average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), + average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), + average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), + average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), + average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); + } + + fprintf(out, "\n"); +} + +/* Main function */ +int main(int argc, char **argv) +{ + int iterations = 0; + int use_q_quit = 0; + + /* Parse command line arguments */ + parse_args(argc, argv); + + /* Setup netlink socket */ + nl_sd = create_nl_socket(); + if (nl_sd < 0) { + fprintf(stderr, "Error creating netlink socket\n"); + exit(1); 
+ } + + /* Get family ID for taskstats via netlink */ + family_id = get_family_id(nl_sd); + if (!family_id) { + fprintf(stderr, "Error getting taskstats family ID\n"); + close(nl_sd); + exit(1); + } + + if (!cfg.output_one_time) { + use_q_quit = 1; + enable_raw_mode(); + printf("Press 'q' to quit.\n"); + fflush(stdout); + } + + /* Main loop */ + while (running) { + /* Get container stats if container path provided */ + if (cfg.container_path) + get_container_stats(); + + /* Get task delays */ + get_task_delays(); + + /* Sort tasks */ + sort_tasks(); + + /* Display results to stdout or log file */ + display_results(); + + /* Check for iterations */ + if (cfg.iterations > 0 && ++iterations >= cfg.iterations) + break; + + /* Exit if output_one_time is set */ + if (cfg.output_one_time) + break; + + /* Check for 'q' key to quit */ + if (use_q_quit) { + struct timeval tv = {cfg.delay, 0}; + fd_set readfds; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); + + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { + char ch = 0; + + read(STDIN_FILENO, &ch, 1); + if (ch == 'q' || ch == 'Q') { + running = 0; + break; + } + } + } else { + sleep(cfg.delay); + } + } + + /* Restore terminal mode */ + if (use_q_quit) + disable_raw_mode(); + + /* Cleanup */ + close(nl_sd); + if (cfg.container_path) + free(cfg.container_path); + + return 0; +}