From b45e49472b93dd33f2e6b8b4f483e1bf26e297ba Mon Sep 17 00:00:00 2001 From: Yaxin Wang Date: Thu, 19 Jun 2025 21:18:43 +0800 Subject: [PATCH 1/2] tools/accounting/delaytop: add delaytop to record top-n task delay ANBZ: #22226 commit dd6197b9b193f82f5adc5ca5cb6abf20622a8491 upstream. Problem ======= The "getdelays" can only display the latency of a single task by specifying a PID, but it has the following limitations: 1. single-task perspective: only supports querying the latency (CPU, I/O, memory, etc.) of an individual task via PID and cannot provide a global analysis of high-latency processes across the system. 2. lack of High-Latency process awareness: when the overall system latency is high (e.g., a spike in CPU latency), there is no way to quickly identify the top N processes contributing to the highest latency. 3. poor interactivity: It lacks dynamic sorting and refresh capabilities (similar to top), making it difficult to monitor latency changes in real time. Solution ======== To address these limitations, we introduce the "delaytop" with the following capabilities: 1. system view: monitors latency metrics (CPU, I/O, memory, IRQ, etc.) for all system processes 2. supports field-based sorting (e.g., default sort by CPU latency in descending order) 3. 
dynamic interactive interface: focus on specific processes with --pid; limit displayed entries with --processes 20; control monitoring duration with --iterations; Use case ======== bash# ./delaytop Top 20 processes (sorted by CPU delay): PID TGID COMMAND CPU(ms) IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms) --------------------------------------------------------------------------------------------- 26 26 kworker/1:0H 5.55 0.00 0.00 0.00 0.00 0.00 0.00 0.00 32 32 kworker/2:0H-kb 2.93 0.00 0.00 0.00 0.00 0.00 0.00 0.00 38 38 kworker/3:0H-ev 2.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 84 84 kworker/R-vfio- 1.62 0.00 0.00 0.00 0.00 0.00 0.00 0.00 24 24 ksoftirqd/1 1.43 0.00 0.00 0.00 0.00 0.00 0.00 0.00 19 19 idle_inject/0 0.99 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16 16 rcu_exp_par_gp_ 0.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 11 11 kworker/0:1 0.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 22 22 idle_inject/1 0.80 0.00 0.00 0.00 0.00 0.00 0.00 0.00 3 3 pool_workqueue_ 0.74 0.00 0.00 0.00 0.00 0.00 0.00 0.00 81 81 scsi_eh_1 0.59 0.00 0.00 0.00 0.00 0.00 0.00 0.00 30 30 ksoftirqd/2 0.42 0.00 0.00 0.00 0.00 0.00 0.00 0.00 36 36 ksoftirqd/3 0.37 0.00 0.00 0.00 0.00 0.00 0.00 0.00 9 9 kworker/0:0-eve 0.36 0.00 0.00 0.00 0.00 0.00 0.00 0.00 8 8 kworker/R-netns 0.34 0.00 0.00 0.00 0.00 0.00 0.00 0.00 76 76 kworker/1:1-pm 0.32 0.00 0.00 0.00 0.00 0.00 0.00 0.00 21 21 cpuhp/1 0.30 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4 4 kworker/R-rcu_g 0.21 0.00 0.00 0.00 0.00 0.00 0.00 0.00 12 12 kworker/u16:0-i 0.20 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1 1 init 0.18 0.00 0.00 0.00 0.00 0.00 0.08 0.00 Link: https://lkml.kernel.org/r/20250619211843633h05gWrBDMFkEH6xAVm_5y@zte.com.cn Co-developed-by: Fan Yu Signed-off-by: Fan Yu Signed-off-by: Yaxin Wang Cc: Balbir Singh Cc: David Hildenbrand Cc: Peilin He Cc: Qiang Tu Cc: wangyong Cc: xu xin Cc: Yang Yang Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- tools/accounting/delaytop.c | 673 ++++++++++++++++++++++++++++++++++++ 1 file 
changed, 673 insertions(+)
 create mode 100644 tools/accounting/delaytop.c

diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c
new file mode 100644
index 000000000000..23e38f39e97d
--- /dev/null
+++ b/tools/accounting/delaytop.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * delaytop.c - task delay monitoring tool.
+ *
+ * This tool provides real-time monitoring and statistics of
+ * system, container, and task-level delays, including CPU,
+ * memory, IO, and IRQ delay accounting. It supports an
+ * interactive (top-like) mode and a one-shot mode, and can
+ * output delay information for the whole system, specific
+ * containers (cgroups), or individual tasks (PIDs).
+ *
+ * Key features:
+ * - Collects per-task delay accounting statistics via taskstats.
+ * - Supports sorting, filtering.
+ * - Supports interactive use (periodic screen refresh).
+ *
+ * Copyright (C) Fan Yu, ZTE Corp. 2025
+ * Copyright (C) Wang Yaxin, ZTE Corp. 2025
+ *
+ * Compile with
+ * gcc -I/usr/src/linux/include delaytop.c -o delaytop
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define NLA_NEXT(na)		((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
+#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len)	((len) - NLA_HDRLEN)
+
+/* Byte-wise pointer arithmetic: do not rely on void * arithmetic */
+#define GENLMSG_DATA(glh)	((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+
+#define TASK_COMM_LEN	16
+#define MAX_MSG_SIZE	1024
+#define MAX_TASKS	1000
+/* Copy one field from a local "struct taskstats stats" into tasks[] */
+#define SET_TASK_STAT(task_count, field) \
+	(tasks[task_count].field = stats.field)
+
+/* Program settings structure */
+struct config {
+	int delay;			/* Update interval in seconds */
+	int iterations;			/* Number of iterations, 0 == infinite */
+	int max_processes;		/* Maximum number of processes to show */
+	char sort_field;		/* Field to sort by */
+	int output_one_time;		/* Output once and exit */
+	int monitor_pid;		/* Monitor specific PID */
+	char *container_path;		/* Path to container cgroup */
+};
+
+/* Task delay information structure */
+struct task_info {
+	int pid;
+	int tgid;
+	char command[TASK_COMM_LEN];
+	unsigned long long cpu_count;
+	unsigned long long cpu_delay_total;
+	unsigned long long blkio_count;
+	unsigned long long blkio_delay_total;
+	unsigned long long swapin_count;
+	unsigned long long swapin_delay_total;
+	unsigned long long freepages_count;
+	unsigned long long freepages_delay_total;
+	unsigned long long thrashing_count;
+	unsigned long long thrashing_delay_total;
+	unsigned long long compact_count;
+	unsigned long long compact_delay_total;
+	unsigned long long wpcopy_count;
+	unsigned long long wpcopy_delay_total;
+	unsigned long long irq_count;
+	unsigned long long irq_delay_total;
+};
+
+/* Container statistics structure */
+struct container_stats {
+	int nr_sleeping;		/* Number of sleeping processes */
+	int nr_running;			/* Number of running processes */
+	int nr_stopped;			/* Number of stopped processes */
+	int nr_uninterruptible;		/* Number of uninterruptible processes */
+	int nr_io_wait;			/* Number of processes in IO wait */
+};
+
+/* Global variables */
+static struct config cfg;
+static struct task_info tasks[MAX_TASKS];
+static int task_count;
+static int running = 1;
+static struct container_stats container_stats;
+
+/* Netlink socket variables */
+static int nl_sd = -1;
+static int family_id;
+
+/* Terminal settings saved by enable_raw_mode() for disable_raw_mode() */
+static struct termios orig_termios;
+
+/*
+ * Set terminal to non-canonical, no-echo mode for q-to-quit.
+ *
+ * Returns 0 on success, or -1 when stdin is not a terminal or its
+ * settings cannot be read or changed; the caller must then not poll
+ * stdin for keys and must not call disable_raw_mode().
+ */
+static int enable_raw_mode(void)
+{
+	struct termios raw;
+
+	if (!isatty(STDIN_FILENO) ||
+	    tcgetattr(STDIN_FILENO, &orig_termios) < 0)
+		return -1;
+
+	raw = orig_termios;
+	raw.c_lflag &= ~(ICANON | ECHO);
+	if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw) < 0)
+		return -1;
+
+	return 0;
+}
+
+/* Restore the terminal settings saved by enable_raw_mode() */
+static void disable_raw_mode(void)
+{
+	tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
+}
+
+/* Display usage information and command line options, then exit(0) */
+static void usage(void)
+{
+	printf("Usage: delaytop [Options]\n"
+	"Options:\n"
+	" -h, --help Show this help message and exit\n"
+	" -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n"
+	" -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n"
+	" -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n"
+	" -o, --once Display once and exit\n"
+	" -p, --pid=PID Monitor only the specified PID\n"
+	" -C, --container=PATH Monitor the container at specified cgroup path\n");
+	exit(0);
+}
+
+/*
+ * Parse command line arguments and set configuration.
+ *
+ * Fills the global cfg with defaults and then applies the options.
+ * Exits with status 1 on an invalid value or unknown option.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int c;
+	static const struct option long_options[] = {
+		{"help", no_argument, 0, 'h'},
+		{"delay", required_argument, 0, 'd'},
+		{"iterations", required_argument, 0, 'n'},
+		{"pid", required_argument, 0, 'p'},
+		{"once", no_argument, 0, 'o'},
+		{"processes", required_argument, 0, 'P'},
+		{"container", required_argument, 0, 'C'},
+		{0, 0, 0, 0}
+	};
+
+	/* Set defaults */
+	cfg.delay = 2;
+	cfg.iterations = 0;
+	cfg.max_processes = 20;
+	cfg.sort_field = 'c';	/* Default sort by CPU delay */
+	cfg.output_one_time = 0;
+	cfg.monitor_pid = 0;	/* 0 means monitor all PIDs */
+	cfg.container_path = NULL;
+
+	while (1) {
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'h':
+			usage();
+			break;
+		case 'd':
+			cfg.delay = atoi(optarg);
+			if (cfg.delay < 1) {
+				fprintf(stderr, "Error: delay must be >= 1.\n");
+				exit(1);
+			}
+			break;
+		case 'n':
+			cfg.iterations = atoi(optarg);
+			if (cfg.iterations < 0) {
+				fprintf(stderr, "Error: iterations must be >= 0.\n");
+				exit(1);
+			}
+			break;
+		case 'p':
+			cfg.monitor_pid = atoi(optarg);
+			if (cfg.monitor_pid < 1) {
+				fprintf(stderr, "Error: pid must be >= 1.\n");
+				exit(1);
+			}
+			break;
+		case 'o':
+			cfg.output_one_time = 1;
+			break;
+		case 'P':
+			cfg.max_processes = atoi(optarg);
+			if (cfg.max_processes < 1) {
+				fprintf(stderr, "Error: processes must be >= 1.\n");
+				exit(1);
+			}
+			if (cfg.max_processes > MAX_TASKS) {
+				fprintf(stderr, "Warning: processes capped to %d.\n",
+					MAX_TASKS);
+				cfg.max_processes = MAX_TASKS;
+			}
+			break;
+		case 'C':
+			/* Replace any earlier -C value instead of leaking it */
+			free(cfg.container_path);
+			cfg.container_path = strdup(optarg);
+			if (!cfg.container_path) {
+				fprintf(stderr, "Error: out of memory.\n");
+				exit(1);
+			}
+			break;
+		default:
+			fprintf(stderr, "Try 'delaytop --help' for more information.\n");
+			exit(1);
+		}
+	}
+}
+
+/* Create a raw netlink socket and bind; returns the fd or -1 */
+static int create_nl_socket(void)
+{
+	int fd;
+	struct sockaddr_nl local;
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	if (fd < 0)
+		return -1;
+
+	memset(&local, 0, sizeof(local));
+	local.nl_family = AF_NETLINK;
+
+	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Send a generic netlink command carrying a single attribute.
+ *
+ * nla_data/nla_len describe the attribute payload; it must fit in
+ * MAX_MSG_SIZE together with its attribute header.
+ * Returns 0 on success, -1 on an oversized payload or a send error.
+ */
+static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+		    __u8 genl_cmd, __u16 nla_type,
+		    void *nla_data, int nla_len)
+{
+	struct sockaddr_nl nladdr;
+	struct nlattr *na;
+	int r, buflen;
+	char *buf;
+
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[MAX_MSG_SIZE];
+	} msg;
+
+	if (nla_len < 0 || nla_len > MAX_MSG_SIZE - NLA_HDRLEN)
+		return -1;
+
+	/* Do not send uninitialized header fields or padding bytes */
+	memset(&msg, 0, sizeof(msg));
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + NLA_HDRLEN;
+	memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len;
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+			   sizeof(nladdr))) < buflen) {
+		if (r > 0) {
+			buf += r;
+			buflen -= r;
+		} else if (errno != EAGAIN)
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Return non-zero when attribute @na, with @remaining bytes left in the
+ * enclosing payload, has a complete header and a length that stays
+ * inside the payload. A zero or oversized nla_len fails the check, which
+ * keeps the attribute walks below from looping forever or reading past
+ * the received data.
+ */
+static int attr_ok(const struct nlattr *na, int remaining)
+{
+	return remaining >= NLA_HDRLEN &&
+	       na->nla_len >= NLA_HDRLEN &&
+	       na->nla_len <= remaining;
+}
+
+/* Get family ID for taskstats via netlink; returns 0 on failure */
+static int get_family_id(int sd)
+{
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[256];
+	} ans;
+
+	int id = 0, rc;
+	struct nlattr *na;
+	int rep_len, len;
+	char name[100];
+
+	strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
+	name[sizeof(name) - 1] = '\0';
+	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+		      CTRL_ATTR_FAMILY_NAME, (void *)name,
+		      strlen(TASKSTATS_GENL_NAME)+1);
+	if (rc < 0)
+		return 0;
+
+	/* Check the receive result before looking at the buffer */
+	rep_len = recv(sd, &ans, sizeof(ans), 0);
+	if (rep_len < 0 || !NLMSG_OK((&ans.n), rep_len) ||
+	    ans.n.nlmsg_type == NLMSG_ERROR)
+		return 0;
+
+	/* Search for the family id instead of assuming its position */
+	len = GENLMSG_PAYLOAD(&ans.n);
+	na = (struct nlattr *) GENLMSG_DATA(&ans);
+	while (attr_ok(na, len)) {
+		if (na->nla_type == CTRL_ATTR_FAMILY_ID &&
+		    NLA_PAYLOAD(na->nla_len) >= (int)sizeof(__u16)) {
+			__u16 value;
+
+			memcpy(&value, NLA_DATA(na), sizeof(value));
+			id = value;
+			break;
+		}
+		len -= NLA_ALIGN(na->nla_len);
+		na = NLA_NEXT(na);
+	}
+	return id;
+}
+
+/*
+ * Read /proc/<pid>/comm into comm_buf (at most buf_size bytes, trailing
+ * newline removed). Returns 0 on success, -1 if the file cannot be read.
+ */
+static int read_comm(int pid, char *comm_buf, size_t buf_size)
+{
+	char path[64];
+	size_t len;
+	FILE *fp;
+
+	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
+	fp = fopen(path, "r");
+	if (!fp)
+		return -1;
+	if (!fgets(comm_buf, buf_size, fp)) {
+		fclose(fp);
+		return -1;
+	}
+	fclose(fp);
+
+	len = strlen(comm_buf);
+	if (len > 0 && comm_buf[len - 1] == '\n')
+		comm_buf[len - 1] = '\0';
+	return 0;
+}
+
+/*
+ * Append one entry to tasks[] from a TASKSTATS_TYPE_STATS attribute.
+ * Does nothing once MAX_TASKS entries are stored.
+ */
+static void store_task_stats(int pid, const char *comm,
+			     const struct nlattr *attr)
+{
+	struct taskstats stats;
+	size_t len = NLA_PAYLOAD(attr->nla_len);
+
+	if (task_count >= MAX_TASKS)
+		return;
+
+	/*
+	 * Copy no more than the attribute actually carries; fields that
+	 * are not covered by the payload read as zero.
+	 */
+	if (len > sizeof(stats))
+		len = sizeof(stats);
+	memset(&stats, 0, sizeof(stats));
+	memcpy(&stats, NLA_DATA(attr), len);
+
+	tasks[task_count].pid = pid;
+	/* NOTE(review): tgid is the queried pid, not stats.ac_tgid - confirm */
+	tasks[task_count].tgid = pid;
+	strncpy(tasks[task_count].command, comm, TASK_COMM_LEN - 1);
+	tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
+	SET_TASK_STAT(task_count, cpu_count);
+	SET_TASK_STAT(task_count, cpu_delay_total);
+	SET_TASK_STAT(task_count, blkio_count);
+	SET_TASK_STAT(task_count, blkio_delay_total);
+	SET_TASK_STAT(task_count, swapin_count);
+	SET_TASK_STAT(task_count, swapin_delay_total);
+	SET_TASK_STAT(task_count, freepages_count);
+	SET_TASK_STAT(task_count, freepages_delay_total);
+	SET_TASK_STAT(task_count, thrashing_count);
+	SET_TASK_STAT(task_count, thrashing_delay_total);
+	SET_TASK_STAT(task_count, compact_count);
+	SET_TASK_STAT(task_count, compact_delay_total);
+	SET_TASK_STAT(task_count, wpcopy_count);
+	SET_TASK_STAT(task_count, wpcopy_delay_total);
+	SET_TASK_STAT(task_count, irq_count);
+	SET_TASK_STAT(task_count, irq_delay_total);
+	task_count++;
+}
+
+/*
+ * Query taskstats for @pid and, if the reply carries statistics, append
+ * them to tasks[] under the command name @comm.
+ * Returns 0 when a reply was received, -1 on a send/receive error, an
+ * error reply, or a truncated/malformed message.
+ */
+static int fetch_and_fill_task_info(int pid, const char *comm)
+{
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[MAX_MSG_SIZE];
+	} resp;
+	struct nlattr *nested;
+	struct nlattr *na;
+	int nested_len;
+	int nl_len;
+	int rc;
+
+	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
+		     TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
+		return -1;
+	}
+	rc = recv(nl_sd, &resp, sizeof(resp), 0);
+	if (rc < 0 || !NLMSG_OK((&resp.n), rc) ||
+	    resp.n.nlmsg_type == NLMSG_ERROR)
+		return -1;
+	nl_len = GENLMSG_PAYLOAD(&resp.n);
+	na = (struct nlattr *) GENLMSG_DATA(&resp);
+	while (attr_ok(na, nl_len)) {
+		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
+			nested = (struct nlattr *) NLA_DATA(na);
+			nested_len = NLA_PAYLOAD(na->nla_len);
+			while (attr_ok(nested, nested_len)) {
+				if (nested->nla_type == TASKSTATS_TYPE_STATS) {
+					store_task_stats(pid, comm, nested);
+					break;
+				}
+				nested_len -= NLA_ALIGN(nested->nla_len);
+				nested = NLA_NEXT(nested);
+			}
+		}
+		nl_len -= NLA_ALIGN(na->nla_len);
+		na = NLA_NEXT(na);
+	}
+	return 0;
+}
+
+/*
+ * Rebuild tasks[] from scratch: either for the single PID given with
+ * --pid, or for every numeric entry under /proc (up to MAX_TASKS).
+ * Tasks whose comm or statistics cannot be read are skipped.
+ */
+static void get_task_delays(void)
+{
+	char comm[TASK_COMM_LEN];
+	struct dirent *entry;
+	DIR *dir;
+	int pid;
+
+	task_count = 0;
+	if (cfg.monitor_pid > 0) {
+		if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
+			fetch_and_fill_task_info(cfg.monitor_pid, comm);
+		return;
+	}
+
+	dir = opendir("/proc");
+	if (!dir) {
+		fprintf(stderr, "Error opening /proc directory\n");
+		return;
+	}
+
+	while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
+		/* ctype functions need an unsigned char value */
+		if (!isdigit((unsigned char)entry->d_name[0]))
+			continue;
+		pid = atoi(entry->d_name);
+		if (pid == 0)
+			continue;
+		if (read_comm(pid, comm, sizeof(comm)) != 0)
+			continue;
+		fetch_and_fill_task_info(pid, comm);
+	}
+	closedir(dir);
+}
+
+/* Calculate average delay in milliseconds (total is in nanoseconds) */
+static double average_ms(unsigned long long total, unsigned long long count)
+{
+	if (count == 0)
+		return 0;
+	return (double)total / 1000000.0 / count;
+}
+
+/* Three-way comparison that orders larger values first */
+static int cmp_desc_ull(unsigned long long a, unsigned long long b)
+{
+	return (a < b) - (a > b);
+}
+
+/*
+ * Comparison function for sorting tasks, largest delay first.
+ *
+ * Returns 0 for tasks that compare equal: qsort() requires a consistent
+ * ordering, which "1 or -1, never 0" does not provide.
+ */
+static int compare_tasks(const void *a, const void *b)
+{
+	const struct task_info *t1 = (const struct task_info *)a;
+	const struct task_info *t2 = (const struct task_info *)b;
+	double avg1, avg2;
+
+	switch (cfg.sort_field) {
+	case 'c': /* CPU */
+		avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count);
+		avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count);
+		if (avg1 != avg2)
+			return avg2 > avg1 ? 1 : -1;
+		return cmp_desc_ull(t1->cpu_delay_total, t2->cpu_delay_total);
+
+	default:
+		return cmp_desc_ull(t1->cpu_delay_total, t2->cpu_delay_total);
+	}
+}
+
+/* Sort tasks by selected field */
+static void sort_tasks(void)
+{
+	if (task_count > 0)
+		qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
+}
+
+/*
+ * Get container statistics via cgroupstats.
+ *
+ * Opens cfg.container_path, asks the kernel for its cgroupstats and
+ * stores the result in the global container_stats. On any failure a
+ * message is printed and container_stats is left unchanged.
+ */
+static void get_container_stats(void)
+{
+	int rc, cfd;
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[MAX_MSG_SIZE];
+	} resp;
+	struct nlattr *na;
+	int nl_len;
+	struct cgroupstats stats;
+
+	/* Check if container path is set */
+	if (!cfg.container_path)
+		return;
+
+	/* Open container cgroup */
+	cfd = open(cfg.container_path, O_RDONLY);
+	if (cfd < 0) {
+		fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
+		return;
+	}
+
+	/* Send request for container stats */
+	if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
+		     CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
+		fprintf(stderr, "Failed to send request for container stats\n");
+		goto out;
+	}
+
+	/* Receive response */
+	rc = recv(nl_sd, &resp, sizeof(resp), 0);
+	if (rc < 0 || !NLMSG_OK((&resp.n), rc) ||
+	    resp.n.nlmsg_type == NLMSG_ERROR) {
+		fprintf(stderr, "Failed to receive response for container stats\n");
+		goto out;
+	}
+
+	/* Parse response */
+	nl_len = GENLMSG_PAYLOAD(&resp.n);
+	na = (struct nlattr *) GENLMSG_DATA(&resp);
+	while (attr_ok(na, nl_len)) {
+		if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
+			size_t len = NLA_PAYLOAD(na->nla_len);
+
+			/* Get the cgroupstats structure, bounded by the payload */
+			if (len > sizeof(stats))
+				len = sizeof(stats);
+			memset(&stats, 0, sizeof(stats));
+			memcpy(&stats, NLA_DATA(na), len);
+
+			/* Fill container stats */
+			container_stats.nr_sleeping = stats.nr_sleeping;
+			container_stats.nr_running = stats.nr_running;
+			container_stats.nr_stopped = stats.nr_stopped;
+			container_stats.nr_uninterruptible = stats.nr_uninterruptible;
+			container_stats.nr_io_wait = stats.nr_io_wait;
+			break;
+		}
+		nl_len -= NLA_ALIGN(na->nla_len);
+		na = NLA_NEXT(na);
+	}
+
+out:
+	close(cfd);
+}
+
+/*
+ * Clear the screen and print the container summary (if any) followed by
+ * the first cfg.max_processes entries of the already sorted tasks[].
+ */
+static void display_results(void)
+{
+	int i, count;
+	FILE *out = stdout;
+
+	fprintf(out, "\033[H\033[J");
+
+	if (cfg.container_path) {
+		fprintf(out, "Container Information (%s):\n", cfg.container_path);
+		fprintf(out, "Processes: running=%d, sleeping=%d, ",
+			container_stats.nr_running, container_stats.nr_sleeping);
+		fprintf(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
+			container_stats.nr_stopped, container_stats.nr_uninterruptible,
+			container_stats.nr_io_wait);
+	}
+	fprintf(out, "Top %d processes (sorted by CPU delay):\n\n",
+		cfg.max_processes);
+	fprintf(out, " PID TGID COMMAND CPU(ms) IO(ms) ");
+	fprintf(out, "SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms)\n");
+	fprintf(out, "-----------------------------------------------");
+	fprintf(out, "----------------------------------------------\n");
+	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
+
+	for (i = 0; i < count; i++) {
+		fprintf(out, "%5d %5d %-15s ",
+			tasks[i].pid, tasks[i].tgid, tasks[i].command);
+		fprintf(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n",
+			average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count),
+			average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count),
+			average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count),
+			average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count),
+			average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count),
+			average_ms(tasks[i].compact_delay_total, tasks[i].compact_count),
+			average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count),
+			average_ms(tasks[i].irq_delay_total, tasks[i].irq_count));
+	}
+
+	fprintf(out, "\n");
+}
+
+/* Main function */
+int main(int argc, char **argv)
+{
+	int iterations = 0;
+	int raw_mode = 0;	/* terminal was switched and must be restored */
+	int use_q_quit = 0;	/* stdin is polled for the 'q' key */
+
+	/* Parse command line arguments */
+	parse_args(argc, argv);
+
+	/* Setup netlink socket */
+	nl_sd = create_nl_socket();
+	if (nl_sd < 0) {
+		fprintf(stderr, "Error creating netlink socket\n");
+		exit(1);
+	}
+
+	/* Get family ID for taskstats via netlink */
+	family_id = get_family_id(nl_sd);
+	if (!family_id) {
+		fprintf(stderr, "Error getting taskstats family ID\n");
+		close(nl_sd);
+		exit(1);
+	}
+
+	/*
+	 * Only poll for 'q' when stdin really is a terminal; otherwise
+	 * fall back to plain sleeping between refreshes.
+	 */
+	if (!cfg.output_one_time && enable_raw_mode() == 0) {
+		raw_mode = 1;
+		use_q_quit = 1;
+		printf("Press 'q' to quit.\n");
+		fflush(stdout);
+	}
+
+	/* Main loop */
+	while (running) {
+		/* Get container stats if container path provided */
+		if (cfg.container_path)
+			get_container_stats();
+
+		/* Get task delays */
+		get_task_delays();
+
+		/* Sort tasks */
+		sort_tasks();
+
+		/* Display results to stdout or log file */
+		display_results();
+
+		/* Check for iterations */
+		if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
+			break;
+
+		/* Exit if output_one_time is set */
+		if (cfg.output_one_time)
+			break;
+
+		/* Check for 'q' key to quit */
+		if (use_q_quit) {
+			struct timeval tv = {cfg.delay, 0};
+			fd_set readfds;
+			int r;
+
+			FD_ZERO(&readfds);
+			FD_SET(STDIN_FILENO, &readfds);
+			r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv);
+
+			if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
+				char ch = 0;
+				ssize_t n = read(STDIN_FILENO, &ch, 1);
+
+				if (n == 1 && (ch == 'q' || ch == 'Q')) {
+					running = 0;
+					break;
+				}
+				/*
+				 * EOF or a hard error: stdin would stay
+				 * "readable" forever and turn this loop into
+				 * a busy loop, so stop polling it.
+				 */
+				if (n == 0 || (n < 0 && errno != EINTR &&
+					       errno != EAGAIN))
+					use_q_quit = 0;
+			}
+		} else {
+			sleep(cfg.delay);
+		}
+	}
+
+	/* Restore terminal mode */
+	if (raw_mode)
+		disable_raw_mode();
+
+	/* Cleanup */
+	close(nl_sd);
+	free(cfg.container_path);
+
+	return 0;
+}
-- 
Gitee


From 47005f68839ae74e12ac90d99d82885df96ecc70 Mon Sep 17 00:00:00 2001
From: xu xin
Date: Tue, 23 Sep 2025 11:53:09 +0800
Subject: [PATCH 2/2] nettrace: Add kernel packet tracing support

ANBZ: #25465

Nettrace is a new kernel module for network packet capture and tracking.
It can capture packets from common protocol stacks and various points
within the kernel, detect abnormal packets, and print kernel and
user-mode stacks at tracepoints. It is designed to be scalable and to
have minimal impact on system performance.

What can this do?
=================
1) Packet Capture in kernel network stack
 * Capable of capturing packets at various layers of the kernel network
   stack, such as the link layer (L2), IP layer (L3), UDP/TCP modules
   (L4), and socket layer.
 * Enables network troubleshooting by tracing the path of packets
   through the protocol stack.
2) support saved as pcap file like tcpdump
 * Supports capturing network packets at multiple points in the kernel
   and saving them in PCAP format.
3) insmod nettrace.ko without any parameter, which will print more
   detailed usage in kernel dmesg.

effects consideration
===============
Considering the relatively large code volume and the novelty of the
functionality, reviewers might be concerned about potential negative
impacts. However, I think it may introduce some runtime network
performance overhead, as it observes the traversal path of every packet.
Apart from this, there will be no other negative impacts.
First, this module is a completely independent and decoupled component that does not alter any existing functional flows in the kernel. Moreover, the module can only be compiled as a loadable kernel module (.ko), meaning it is inactive unless explicitly inserted into the running kernel. Second, this functionality has already been deployed and used in production environments at ZTE for several years, proving its stability and effectiveness in real-world scenarios. Co-developed-by: He Peilin Signed-off-by: He Peilin Signed-off-by: xu xin --- net/Kconfig | 1 + net/Makefile | 1 + net/nettrace/Kconfig | 6 + net/nettrace/Makefile | 7 + net/nettrace/Makefile.alone | 17 ++ net/nettrace/core.c | 78 ++++++ net/nettrace/core.h | 42 ++++ net/nettrace/dump.c | 154 ++++++++++++ net/nettrace/dump.h | 90 +++++++ net/nettrace/group.c | 482 ++++++++++++++++++++++++++++++++++++ net/nettrace/group.h | 109 ++++++++ net/nettrace/handler.c | 442 +++++++++++++++++++++++++++++++++ net/nettrace/handler.h | 56 +++++ net/nettrace/help.c | 402 ++++++++++++++++++++++++++++++ net/nettrace/help.h | 76 ++++++ net/nettrace/kprobe.c | 196 +++++++++++++++ net/nettrace/kprobe.h | 207 ++++++++++++++++ net/nettrace/mm.c | 221 +++++++++++++++++ net/nettrace/mm.h | 45 ++++ net/nettrace/parser.c | 314 +++++++++++++++++++++++ net/nettrace/parser.h | 85 +++++++ net/nettrace/procfs.c | 206 +++++++++++++++ net/nettrace/procfs.h | 27 ++ net/nettrace/utils.c | 64 +++++ net/nettrace/utils.h | 44 ++++ 25 files changed, 3372 insertions(+) create mode 100644 net/nettrace/Kconfig create mode 100644 net/nettrace/Makefile create mode 100644 net/nettrace/Makefile.alone create mode 100644 net/nettrace/core.c create mode 100644 net/nettrace/core.h create mode 100644 net/nettrace/dump.c create mode 100644 net/nettrace/dump.h create mode 100644 net/nettrace/group.c create mode 100644 net/nettrace/group.h create mode 100644 net/nettrace/handler.c create mode 100644 net/nettrace/handler.h create mode 100644 
net/nettrace/help.c create mode 100644 net/nettrace/help.h create mode 100644 net/nettrace/kprobe.c create mode 100644 net/nettrace/kprobe.h create mode 100644 net/nettrace/mm.c create mode 100644 net/nettrace/mm.h create mode 100644 net/nettrace/parser.c create mode 100644 net/nettrace/parser.h create mode 100644 net/nettrace/procfs.c create mode 100644 net/nettrace/procfs.h create mode 100644 net/nettrace/utils.c create mode 100644 net/nettrace/utils.h diff --git a/net/Kconfig b/net/Kconfig index 092a1c0902ac..e9600faa4404 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -422,6 +422,7 @@ source "net/ceph/Kconfig" source "net/nfc/Kconfig" source "net/psample/Kconfig" source "net/ife/Kconfig" +source "net/nettrace/Kconfig" config LWTUNNEL bool "Network light weight tunnels" diff --git a/net/Makefile b/net/Makefile index 45f3fbaae644..0c8990098eb9 100644 --- a/net/Makefile +++ b/net/Makefile @@ -80,3 +80,4 @@ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_MCTP) += mctp/ obj-$(CONFIG_NET_HANDSHAKE) += handshake/ +obj-$(CONFIG_NET_TRACE) += nettrace/ diff --git a/net/nettrace/Kconfig b/net/nettrace/Kconfig new file mode 100644 index 000000000000..99203c71729a --- /dev/null +++ b/net/nettrace/Kconfig @@ -0,0 +1,6 @@ +config NET_TRACE + tristate "Net trace" + depends on KPROBES + default n + help + Trace net package from the kernel network proto stack. diff --git a/net/nettrace/Makefile b/net/nettrace/Makefile new file mode 100644 index 000000000000..a57c128fdba5 --- /dev/null +++ b/net/nettrace/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for nettrace. 
+#
+nettrace-objs := core.o kprobe.o parser.o dump.o help.o group.o handler.o procfs.o utils.o mm.o
+obj-$(CONFIG_NET_TRACE) += nettrace.o
+
diff --git a/net/nettrace/Makefile.alone b/net/nettrace/Makefile.alone
new file mode 100644
index 000000000000..5f9a0aae28bf
--- /dev/null
+++ b/net/nettrace/Makefile.alone
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Standalone (out-of-tree) build of nettrace.ko; needs KERNEL_DIR set to a kernel build tree.
+#
+
+nettrace-objs := core.o kprobe.o parser.o dump.o help.o group.o handler.o procfs.o utils.o mm.o
+obj-m := nettrace.o
+
+all: clean build
+
+build:
+	$(MAKE) -C $(KERNEL_DIR) M=$(shell pwd) modules
+
+clean:
+	$(MAKE) -C $(KERNEL_DIR) M=$(shell pwd) clean
+
+.PHONY: all build clean
diff --git a/net/nettrace/core.c b/net/nettrace/core.c
new file mode 100644
index 000000000000..fdb909fc4f96
--- /dev/null
+++ b/net/nettrace/core.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include
+#include
+#include
+#include "core.h"
+#include "help.h"
+#include "group.h"
+#include "procfs.h"
+
+/* Module life-cycle state; set with WRITE_ONCE() in the init and exit paths below */
+enum nettrace_status nt_status = NT_INIT;
+
+#define MAX_DUMP_QUEUE_MEM_DEFAULT (SK_RMEM_MAX * 200)
+int max_dump_queue_mem = MAX_DUMP_QUEUE_MEM_DEFAULT;
+#define MAX_DUMP_SKB_CNT_DEFAULT (100 * 1000)
+unsigned int max_dump_skb_cnt = MAX_DUMP_SKB_CNT_DEFAULT;
+#define MAX_DUMP_FILE_SIZE_DEFAULT (100 * 1024 * 1024)
+unsigned int max_dump_file_size = MAX_DUMP_FILE_SIZE_DEFAULT;
+
+MODULE_DESCRIPTION("Network debugging tools \n \
+ insmod nettrace.ko \[probe=\\] \n \
+ \[output=\\] \[flag=\] \n \
+ \[dump=\\] \n \
+ \[trace=\\] \[proto=\] \n \
+ \[saddr=\] \[daddr=\] \[addr=\] \n \
+ \[sport=\] \[dport=\] \[port=\] \n \
+ \[stack=\<0 or 1\>\] \[ustack=\<0 or 1\>\] \[mm=\]");
+/*
+ * Module init: set up the groups, parse the module arguments, register
+ * the trace points and create the procfs entries, in that order. Only
+ * when every step succeeded is nt_status switched to NT_RUNNING.
+ *
+ * Returns 0 on success or the error code of the first failing step.
+ */
+KPROBE_INIT {
+	int err = -EINVAL;
+
+	init_group();
+	err = init_args();
+	if (err)
+		goto on_err;
+
+	err = trace_register();
+	if (err)
+		goto on_init_err;
+
+	err = ntrace_proc_init();
+	if (err)
+		goto on_init_err;
+
+	WRITE_ONCE(nt_status, NT_RUNNING);
+	return 0;
+
+	/*
+	 * NOTE(review): the exit path also calls free_rules(), but neither
+	 * error label does - confirm init_args() leaves nothing to free.
+	 */
+on_init_err:
+	free_all_group();
+on_err:
+	return err;
+}
+
+/*
+ * Module exit: publish NT_EXITING first, then release the groups, the
+ * rules and finally the procfs entries.
+ *
+ * NOTE(review): procfs is removed last, after the groups were freed -
+ * confirm a concurrent procfs reader cannot touch freed group data.
+ */
+KPROBE_EXIT {
+	WRITE_ONCE(nt_status, NT_EXITING);
+	/* Now start to free all tracepoints */
+	free_all_group();
+	free_rules();
+	ntrace_proc_exit();
+}
diff --git a/net/nettrace/core.h b/net/nettrace/core.h
new file mode 100644
index 000000000000..286c68ce3b86
--- /dev/null
+++ b/net/nettrace/core.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ * Author:
+ * Menglong Dong
+ * Migrator:
+ * xu xin
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef NETDUMP_CORE_H
+#define NETDUMP_CORE_H
+
+#include "kprobe.h"
+#include "parser.h"
+
+/* Internal status of nettrace */
+enum nettrace_status {
+	/* nettrace is initializing its tracepoints and dump files */
+	NT_INIT,
+	/* nettrace is ready and tracing */
+	NT_RUNNING,
+	/* nettrace.ko is being removed (rmmod) */
+	NT_EXITING
+};
+
+extern enum nettrace_status nt_status;
+
+#endif /* NETDUMP_CORE_H */
diff --git a/net/nettrace/dump.c b/net/nettrace/dump.c
new file mode 100644
index 000000000000..1f6a9e9f3417
--- /dev/null
+++ b/net/nettrace/dump.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include
+#include
+#include
+
+#include "dump.h"
+#include "utils.h"
+#include "group.h"
+#include "core.h"
+#include "mm.h"
+
+/* The number of packets which is not dumped into pcap due to insufficient memory.
*/ +unsigned int dump_loss_due_to_no_memory; + +/* The number of packets which is not dumped into pcap due to the limit of max_dump_skb_cnt. */ +unsigned int dump_skb_over_cnt; + +/* The number of packets which is not dumped into pcap due to the limit of max_dump_file_size. */ +unsigned int dump_skb_over_size; + +static int dump_skb(struct sk_buff_nettrace *skb, struct file *to) +{ + struct pcap_pkthdr phdr; + struct timespec64 tv; + struct sk_buff *frag, *tmp; + + ktime_get_real_ts64(&tv); + phdr.ts.tv_sec = tv.tv_sec; + phdr.ts.tv_usec = tv.tv_nsec / 1000; + + phdr.len = skb->total_len; + phdr.caplen = skb->total_len; + + file_append(to, &phdr, sizeof(phdr)); + file_append(to, skb->data, skb->len); + + if (!skb_queue_empty(&skb->frag_list)) { + skb_queue_walk_safe(&skb->frag_list, frag, tmp) + file_append(to, ((struct sk_buff_nettrace *)frag)->data, + ((struct sk_buff_nettrace *)frag)->len); + } + + return 0; +} + +int init_pcap(struct file *f) +{ + struct pcap_file_header hdr; + + hdr.magic = PCAP_MAGIC; + hdr.version_major = PCAP_VERSION_MAJOR; + hdr.version_minor = PCAP_VERSION_MINOR; + hdr.thiszone = sys_tz.tz_dsttime; + hdr.sigfigs = 0; + hdr.snaplen = DEFAULT_SNAPLEN; + hdr.linktype = LINKTYPE_ETHERNET; + + file_append(f, &hdr, sizeof(hdr)); + + return 0; +} + +static __always_inline void dump_queue_lock(struct sk_buff_head *dump_queue, unsigned long flag) +{ + spin_lock_irqsave(&dump_queue->lock, flag); +} + +static __always_inline void dump_queue_unlock(struct sk_buff_head *dump_queue, unsigned long flag) +{ + spin_unlock_irqrestore(&dump_queue->lock, flag); +} + +static void dump_skb_work(struct work_struct *work) +{ + TRACE_POINT *tp = container_of(work, TRACE_POINT, dump_work); + + struct sk_buff_nettrace *skb; + struct sk_buff_head list; + unsigned long flag = 0; + + __skb_queue_head_init(&list); + + dump_queue_lock(&tp->dump_queue, flag); + skb_queue_splice_tail_init(&tp->dump_queue, &list); + dump_queue_unlock(&tp->dump_queue, flag); + + while ((skb 
= (struct sk_buff_nettrace *) __skb_dequeue(&list))) { + + /* Don't waste time on writing dump_file when exit */ + if (likely(READ_ONCE(nt_status) != NT_EXITING)) { + dump_skb(skb, tp->dump_file); + tp->dump_cnt++; + } + + release_skb_nettrace(skb, tp); + } + return; +} + +void try_dump_skb(struct sk_buff *skb, TRACE_POINT *tp) +{ + struct sk_buff_nettrace *new_skb; + unsigned long flag; + + /* if dump_skb_cnt exceed the uppper limit, drop it */ + if (tp->dump_cnt > max_dump_skb_cnt) { + dump_skb_over_cnt++; + return; + } + + /* if dump_file_size exceed the uppper limit, drop it */ + if (tp->dump_file->f_pos + skb->truesize + + sizeof(struct pcap_pkthdr) >= max_dump_file_size) { + dump_skb_over_size++; + return; + } + + new_skb = skb_copy_nettrace(skb, tp); + + if (!new_skb) { + dump_loss_due_to_no_memory++; + return; + } + + dump_queue_lock(&tp->dump_queue, flag); + __skb_queue_tail(&tp->dump_queue, (struct sk_buff *)new_skb); + dump_queue_unlock(&tp->dump_queue, flag); + + schedule_work(&tp->dump_work); +} + +void init_dump_work(TRACE_POINT *tp) +{ + INIT_WORK(&tp->dump_work, dump_skb_work); + skb_queue_head_init(&tp->dump_queue); +} diff --git a/net/nettrace/dump.h b/net/nettrace/dump.h new file mode 100644 index 000000000000..8c74d4022f78 --- /dev/null +++ b/net/nettrace/dump.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
/*
 * Global header of a pcap file. init_pcap() fills one of these and appends
 * it to a newly created dump file before any packet record is written.
 */
struct pcap_file_header {
	uint32_t magic;		/* PCAP_MAGIC */
	uint16_t version_major;	/* PCAP_VERSION_MAJOR */
	uint16_t version_minor;	/* PCAP_VERSION_MINOR */
	int32_t thiszone;	/* gmt to local correction */
	uint32_t sigfigs;	/* accuracy of timestamps */
	uint32_t snaplen;	/* max length saved portion of each pkt */
	uint32_t linktype;	/* data link type (LINKTYPE_*) */
};

/* Timestamp with 32-bit fields, as used inside the per-packet pcap header. */
struct timeval_compat {
	uint32_t tv_sec;	/* seconds */
	uint32_t tv_usec;	/* microseconds */
};

/*
 * Per-packet record header. dump_skb() writes one of these in front of the
 * data of every dumped packet.
 */
struct pcap_pkthdr {
	struct timeval_compat ts;	/* time stamp using 32 bits fields */
	uint32_t caplen;		/* length of portion present */
	uint32_t len;			/* length this packet (off wire) */
};
+ */ +struct trace_point all_tp[] = { + +#define SKB_TP(i, g, n) {.skb_index = i, .groups = g, .name = n} +#define PSKB_TP(i, g, n) {.pskb_index = i, .groups = g, .name = n} +#define SK_TP(i, g, n) {.sock_index = i, .groups = g, .name = n} +#define SS_TP(skb, sk, g, n) \ + {.skb_index = skb, .sock_index = sk, .groups = g, .name = n} + + /* net link layout trace points. */ + PSKB_TP(1, "link_input", "__netif_receive_skb_core"), + SKB_TP(2, "link_input", "napi_gro_receive"), + SKB_TP(1, "link_input", "netif_receive_skb_internal"), + SKB_TP(1, "link_input", "__netif_receive_skb"), + SKB_TP(1, "link_input", "netif_rx"), + SKB_TP(1, "link_input", "enqueue_to_backlog"), + SKB_TP(1, "link_output", "__dev_queue_xmit"), + SKB_TP(1, "link_output", "dev_hard_start_xmit"), + SKB_TP(1, "link_output", "dev_queue_xmit_accel"), + SKB_TP(2, "link_output", "dev_forward_skb"), + SKB_TP(1, "link_output", "skb_do_redirect"), + //tc_classify is discarded in Linux 4.19 and later version. + //{.skb_index = 1, .groups = "link_input", .name = "tc_classify", .is_ret = true}, + + /* ip layout trace points. */ + SKB_TP(1, "ip_input", "ip_rcv"), + SKB_TP(3, "ip_input", "ip_rcv_finish"), + SKB_TP(1, "ip_input", "ip_route_input_noref"), + {.skb_index = 1, .groups = "ip_input", .name = "fib_validate_source", .is_ret = true}, + SKB_TP(2, "ip_input", "ip_rcv_finish_core"), + SKB_TP(1, "ip_input", "ip_local_deliver"), + SKB_TP(3, "ip_input", "ip_local_deliver_finish"), + SKB_TP(1, "ip_input", "ip_forward"), + SKB_TP(3, "ip_input", "ip_forward_finish"), + SKB_TP(2, "ip_input", "ip_send_skb"), + SS_TP(3, 2, "ip_output", "__ip_local_out"), + SKB_TP(3, "ip_output", "ip_output"), + SKB_TP(3, "ip_output", "ip_finish_output"), + SKB_TP(3, "ip_output", "ip_finish_output2"), + + /* udp layout trace points. 
*/ + SKB_TP(1, "udp_input", "__udp4_lib_rcv"), + SKB_TP(2, "udp_input", "udp_queue_rcv_skb"), + SKB_TP(2, "udp_input", "__udp_enqueue_schedule_skb"), + SK_TP(1, "udp_input", "__skb_recv_udp"), + SK_TP(1, "udp_input", "udp_recvmsg"), + + /* tcp layout trace points. */ + SKB_TP(1, "tcp_input", "tcp_v4_rcv"), + SS_TP(2, 1, "tcp_input", "tcp_v4_do_rcv"), + SS_TP(2, 1, "tcp_input", "tcp_rcv_established"), + SS_TP(2, 1, "tcp_input", "tcp_rcv_state_process"), + SS_TP(2, 1, "tcp_input", "tcp_data_queue"), + SS_TP(2, 1, "tcp_input", "tcp_queue_rcv"), + SK_TP(1, "tcp_input", "tcp_recvmsg"), + + SK_TP(1, "tcp_output", "tcp_sendmsg"), + SK_TP(1, "tcp_output", "tcp_push"), + SK_TP(1, "tcp_output", "tcp_write_xmit"), + SK_TP(1, "tcp_output", "tcp_set_state"), + SS_TP(2, 1, "tcp_output", "__tcp_transmit_skb"), + + + /* common skb trace points. */ + SKB_TP(1, "error", "kfree_skb_reason"), + SKB_TP(1, "normal", "consume_skb"), + + /* Compile when VNet MACVLAN=y */ + /* macvlan drop stat. */ + SKB_TP(1, "macvlan", "macvlan_start_xmit"), + SKB_TP(1, "macvlan", "macvlan_handle_frame"), + +#undef SKB_TP +#undef SK_TP +#undef SS_TP +}; +const int all_tp_len = sizeof(all_tp) / sizeof(struct trace_point); +TRACE_GROUP *all_group; + +static TRACE_GROUP *query_group(char *name, TRACE_GROUP *tp); + +/**************************************************************************************************** + * + * This is the part for trace point and group query. 
+ * + ****************************************************************************************************/ + +static TRACE_GROUP +*parent_group(TRACE_GROUP *tg, TRACE_GROUP *parent) +{ + TRACE_GROUP *tmp, *tmp2; + list_for_each_entry(tmp, &parent->groups, list) { + if (tmp == tg) + return parent; + if ((tmp2 = parent_group(tg, tmp)) != NULL) + return tmp2; + } + return NULL; +} + +static inline +TRACE_GROUP *parent_group_all(TRACE_GROUP *tg) +{ + return parent_group(tg, all_group); +} + +static TRACE_POINT *query_tp(char *name) +{ + int i = 0; + for (; i < all_tp_len; i++) { + if (streq(name, all_tp[i].name)) + return &all_tp[i]; + } + return NULL; +} + +static void *query_handler(TRACE_GROUP *tg) +{ + if (tg == NULL) + return all_group->handler; + + if (tg->handler) + return tg->handler; + + return query_handler(parent_group_all(tg)); +} + +static void *query_ret_handler(TRACE_GROUP *tg) +{ + if (tg == NULL) + return all_group->ret_handler; + + if (tg->ret_handler) + return tg->ret_handler; + + return query_ret_handler(parent_group_all(tg)); +} + +static TRACE_GROUP +*query_group(char *name, TRACE_GROUP *tp) +{ + TRACE_GROUP *pos, *tmp; + + if (streq(tp->name, name)) + return tp; + + list_for_each_entry(pos, &tp->groups, list) { + if ((tmp = query_group(name, pos)) != NULL) + return tmp; + } + return NULL; +} + +/**************************************************************************************************** + * + * This is the part for trace point register. 
+ * + ****************************************************************************************************/ + +static int +trace_point_register(TRACE_POINT *tp, TRACE_GROUP *tg) +{ + char path[MAX_FILE_NAME] = {}; + struct kprobe *p; + struct file *dump_file; + + if (tp->kprobe) + return 0; + + if (tp->skb_index <= 0 && tp->sock_index <= 0 && tp->pskb_index <= 0) { + log_err("kprobe %s has no index!\n", tp->name); + goto out_err; + } + + if (tp->is_ret) { + p = (struct kprobe *) kretprobe_declare(tp->name, + query_ret_handler(tg), + entry_handler_general); + if (!p) + goto out_err; + ((struct kretprobe *)p)->data_size = sizeof(RET_DATA); + } + else + p = kprobe_declare(tp->name, query_handler(tg)); + + if (!p) { + log_err("kprobe declare failed: %s\n", tp->name); + goto out_err; + } + + /* NOTE: + * When register kprobe, the given SYMBOL NAME may be not found in kallsyms + * for example, suppose we want to trace the kernel func: + * + * "__netif_receive_skb_core", + * + * HOWEVER, that symbol name might be changed by compilers into + * + * "__netif_receive_skb_core.constprop.0" + * + * So, we finally trace __netif_receive_skb_core.constprop.0 and use it as + * tp->name instead of __netif_receive_skb_core! 
+ */ + if ((tp->is_ret && !c_register_kretprobe((struct kretprobe *) p)) || + (!tp->is_ret && !c_register_kprobe(p))) + tp->kprobe = p; + else { + if (strcmp(tp->name, "macvlan_start_xmit") == 0 || + strcmp(tp->name, "macvlan_handle_frame") == 0) { + log_err(" Please confirm if the macvlan module is inserted, no %s\n", tp->name); + } else { + log_err("kprobe register failed: %s\n", tp->name); + } + goto out_free_err; + } + + if (print_dump) { + snprintf(path, sizeof(path), "%s/%s.pcap", print_dump, tp->name); + dump_file = file_create(path); + if (!dump_file) { + log_err("failed to create dump file: %s\n", path); + goto out_err; + } + init_pcap(dump_file); + init_dump_work(tp); + init_dump_mm(tp); + /* Add write barrier to avoid the compiler instruction recombination. + * We must guarantee that tp->dump_file is init after init_dump_work. + */ + wmb(); + tp->dump_file = dump_file; + } + + return 0; +out_free_err: + kfree(p); +out_err: + return -1; +} + +static void trace_reg_group(TRACE_GROUP *tg) +{ + TRACE_GROUP *tmp_tg; + TP_LIST *tpl; + + log_info("begin register group: %s\n", tg->name); + + list_for_each_entry(tpl, &tg->traces, list) trace_point_register(tpl->tp, tg); + + list_for_each_entry(tmp_tg, &tg->groups, list) trace_reg_group(tmp_tg); + + log_info("end register group: %s\n", tg->name); +} + +/*register the kprobe that defined in 'global_kprobe_list'.*/ +int trace_register(void) +{ + int t = 0; + + for (; t < filter_trace_len; t++) { + char *ft = filter_trace[t]; + TRACE_GROUP *tg = query_group(ft, all_group); + if (!tg) { + log_err("trace: %s not founded!\n", ft); + return -EINVAL; + } + trace_reg_group(tg); + } + + for (t = 0; t < filter_probe_len; t++) { + char *name = filter_probe[t]; + TRACE_POINT *tp = query_tp(name); + if (!tp) { + log_err("probe: %s not founded!\n", name); + return -EINVAL; + } + trace_point_register(tp, all_group); + } + + return 0; +} + 
+/**************************************************************************************************** + * + * This is the part for trace group functions. + * + * In fact, trace group is organized in form of tree. + * + ****************************************************************************************************/ + +/* + * add trace point to group. + */ +static int add2group(TRACE_POINT *tp, char *groups) +{ + char *group; + TP_LIST *tpl; + TRACE_GROUP *g; + + while ((group = strsep(&groups, ",")) != NULL) { + g = query_group(group, all_group); + if (!g) { + log_err("group: %s not exits!", group); + continue; + } + tpl = kmalloc(sizeof(TP_LIST), GFP_KERNEL); + if (!tpl) + return -ENOMEM; + memset(tpl, 1, sizeof(TP_LIST)); + + tpl->tp = tp; + INIT_LIST_HEAD(&tpl->list); + list_add_tail(&tpl->list, &g->traces); + } + + return 0; +} + +static void init_tp(void) +{ + TRACE_POINT *tp; + int i = 0; + + for_each_tp(i, tp) add2group(tp, tp->groups); +} + +static TRACE_GROUP +*add_trace_group(char *name, char *desc, enum trace_group_level lev, + TRACE_GROUP *parent) +{ + TRACE_GROUP *tmp_tp = kmalloc(sizeof(TRACE_GROUP), GFP_KERNEL); + + if (!tmp_tp) + return NULL; + memset(tmp_tp, 0, sizeof(TRACE_GROUP)); + + INIT_LIST_HEAD(&tmp_tp->list); + INIT_LIST_HEAD(&tmp_tp->groups); + INIT_LIST_HEAD(&tmp_tp->traces); + + strncpy(tmp_tp->name, name, MAX_TP_NAME - 1); + strncpy(tmp_tp->desc, desc, MAX_TP_DESC -1); + tmp_tp->level = lev; + + if (parent != NULL) + list_add_tail(&tmp_tp->list, &parent->groups); + else + all_group = tmp_tp; + + return tmp_tp; +} + +/* This function has been discarded */ +/* +static +void copy_tg(TRACE_GROUP *tg, TRACE_GROUP *child) +{ + TRACE_GROUP *tmp_tg = NULL; + TP_LIST *tpl = NULL; + + if (!tg || !child) + return; + + list_for_each_entry(tpl, &child->traces, list) add2group(tpl->tp, tg->name); + + list_for_each_entry(tmp_tg, &child->groups, list) copy_tg(tg, tmp_tg); +} +*/ + +#define ADD_TRACE_GROUP(name, lev, parent, desc) \ + name = 
add_trace_group(#name, desc, lev, parent); +#define ADD_ANNOY_TRACE_GROUP(name, lev, parent, desc) \ + add_trace_group(#name, desc, lev, parent); + +void init_group(void) +{ + /* define all trace groups. */ + TRACE_GROUP *all, *link, *ip, *tcp, *udp, *error, *macvlan; + + ADD_TRACE_GROUP(all, BASIC, NULL, "the root trace") + ADD_TRACE_GROUP(link, BASIC, all, "the link layer.") + ADD_TRACE_GROUP(ip, BASIC, all, "the ip layer"); + ADD_TRACE_GROUP(tcp, BASIC, all, "the tcp layer"); + ADD_TRACE_GROUP(udp, BASIC, all, "the udp layer"); + ADD_TRACE_GROUP(error, BASIC, all, "the scene that free error package."); + ADD_TRACE_GROUP(macvlan, BASIC, all, "macvlan receive and send skb."); + + /* for kw scan. */ + if (!all) + return; + all->handler = post_handler_general; + all->ret_handler = ret_handler_general; + + /* general network protocol stack. */ + ADD_ANNOY_TRACE_GROUP(link_input, BASIC, link, "the link layer that receive package"); + ADD_ANNOY_TRACE_GROUP(link_output, BASIC, link, "the link layer that send package"); + + ADD_ANNOY_TRACE_GROUP(ip_input, BASIC, ip, "the ip layer that receive package"); + ADD_ANNOY_TRACE_GROUP(ip_output, BASIC, ip, "the ip layer that send package"); + + ADD_ANNOY_TRACE_GROUP(tcp_input, BASIC, tcp, "the tcp layer that receive package"); + ADD_ANNOY_TRACE_GROUP(tcp_output, BASIC, tcp, "the tcp layer that send package"); + + ADD_ANNOY_TRACE_GROUP(udp_input, BASIC, udp, "the udp layer that receive package"); + + ADD_ANNOY_TRACE_GROUP(normal, BASIC, all, "the scene that free normal package."); + + init_tp(); +} + +static void free_group(TRACE_GROUP *tg) +{ + TP_LIST *tp_pos, *tp_next; + TRACE_GROUP *pos, *next; + TRACE_POINT *tp; + + list_for_each_entry_safe(pos, next, &tg->groups, list) free_group(pos); + + list_for_each_entry_safe(tp_pos, tp_next, &tg->traces, list) { + tp = tp_pos->tp; + kfree(tp_pos); + + if (tp->kprobe) { + if (tp->is_ret) + c_unregister_kretprobe((struct kretprobe *) tp->kprobe); + else + 
c_unregister_kprobe(tp->kprobe); + kfree(tp->kprobe); + tp->kprobe = NULL; + } + + /* Dump skb work must be done before tp is removed */ + if (print_dump && tp->dump_work.func) + flush_work(&tp->dump_work); + + if (tp->dump_file) { + file_close(tp->dump_file); + tp->dump_file = NULL; + release_dump_mm(tp); + } + } + kfree(tg); +} + +void free_all_group(void) +{ + free_group(all_group); +} diff --git a/net/nettrace/group.h b/net/nettrace/group.h new file mode 100644 index 000000000000..1cc54f1fd394 --- /dev/null +++ b/net/nettrace/group.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef NETDUMP_GROUP_H +#define NETDUMP_GROUP_H + +#include +#include +#include "mm.h" + +#define MAX_TP_NAME KSYM_NAME_LEN +/* The length of group name */ +#define MAX_TP_GROUP 128 +/* The length of TP description */ +#define MAX_TP_DESC 256 + +#define for_each_tp(i, p) \ + for (i = 0; i < all_tp_len && ({ p = &all_tp[i]; 1;}); i++) + +/*Definition of kprobe point that we predefined*/ +typedef struct trace_point { + /* The traced function name */ + char name[MAX_TP_NAME]; + /* To define the position index of struct sk_buff *skb in a certain + * trace_point funcion + */ + int skb_index; + /* To define the position index of struct sk_buff **pskb in a certain + * trace_point funcion + */ + int pskb_index; + + int sock_index; + /* Count the number of skb when dumpping in the current TP. */ + unsigned int dump_cnt; + char desc[MAX_TP_DESC]; + char groups[MAX_TP_GROUP]; + bool is_ret; + struct kprobe *kprobe; + struct file *dump_file; + struct work_struct dump_work; + struct sk_buff_head dump_queue; + struct llist_head skbcache; + int nr_skb_objs; + struct llist_head datacache; + int nr_data_objs; +} TRACE_POINT; + +typedef struct tp_list { + TRACE_POINT *tp; + struct list_head list; +} TP_LIST; + +enum trace_group_level { + BASIC, + MOD +}; + +struct kretprobe_instance; + +typedef struct ret_data { + struct sk_buff *skb; + struct sock *sk; +} RET_DATA; + +typedef struct trace_group { + char name[MAX_TP_NAME]; + char desc[MAX_TP_DESC]; + enum trace_group_level level; + struct list_head list; + struct list_head groups; + struct list_head traces; + bool activated; + + void (*handler)(struct kprobe *p, + struct pt_regs *regs, + unsigned long flags); + int (*ret_handler)(struct kretprobe_instance *ri, + struct pt_regs *regs); +} TRACE_GROUP; + +extern const int all_tp_len; +extern struct trace_point all_tp[]; + +extern TRACE_GROUP *all_group; + +extern void init_group(void); + +extern int trace_register(void); + +extern void free_all_group(void); + +#endif 
//NETDUMP_GROUP_H diff --git a/net/nettrace/handler.c b/net/nettrace/handler.c new file mode 100644 index 000000000000..5a4d87d76967 --- /dev/null +++ b/net/nettrace/handler.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "handler.h" +#include "kprobe.h" +#include "group.h" +#include "parser.h" +#include "help.h" +#include "dump.h" +#include "utils.h" +#include "core.h" + +/**************************************************************************************************** + * + * This is the part for filter, by skb or sock + * + ****************************************************************************************************/ + +static int filter_skb(struct sk_buff *skb, struct sock *sk, TRACE_POINT *tp, SKB_OUTPUT *skb_output); + +static int filter_sock(struct sock *sk, TRACE_POINT *tp, SKB_OUTPUT *output); + +/* + * This is the general skb info print function. + * + * The info in 'output' will be printed by 'log_data' log level. 
+ */ +static void general_print(SKB_OUTPUT *output, TRACE_POINT *tp) +{ + char saddr[IP_ADDR_LEN] = {}, daddr[IP_ADDR_LEN] = {}, + output_str[MAX_OUTPUT_LEN] = {}; + COMMON_RULE *rule = &output->rule; + struct icmphdr *icmp; + struct sk_buff *skb = output->skb; + struct net_device *dev = skb ? skb->dev : NULL; + char *type = ""; + + str_append(output_str, "[%d]", dev ? dev->ifindex : 0); + if (tp->is_ret) + str_append(output_str, "[%s,ret:%d]:", output->sym_name, output->ret_val); + else + str_append(output_str, "[%s]:", output->sym_name); + if (rule->proto_3 != ETH_P_IP) { + str_append(output_str, "proto: %s", proto3tostr(rule->proto_3)); + goto begin_print; + } + + if (i2ip(rule->saddr, saddr) || i2ip(rule->daddr, daddr)) { + log_err("parse ip addr error!"); + return; + } + str_append(output_str, "IP %s>%s", saddr, daddr); + + switch (rule->proto_4) { + case IPPROTO_TCP: + str_append(output_str, + " // TCP %d>%d,%s", + ntohs(rule->sport), ntohs(rule->dport), output->flags); + break; + case IPPROTO_UDP: + str_append(output_str, + " // UDP %d>%d", + ntohs(rule->sport), ntohs(rule->dport)); + break; + case IPPROTO_ICMP: + if (!output->skb) { + str_append(output_str, " // ICMP"); + break; + } + + if (!skb_transport_header_was_set(skb)) + icmp = (struct icmphdr *) (skb_network_header(skb) + sizeof(struct iphdr)); + else + icmp = icmp_hdr(skb); + + if (icmp->code == 0 && icmp->type == 8) + type = "request"; + if (icmp->code == 0 && icmp->type == 0) + type = "response"; + + str_append(output_str, + " // ICMP %s %u", + type, + ntohs(icmp->un.echo.sequence)); + break; + default: + str_append(output_str, + " // %s", + proto4tostr(rule->proto_4)); + break; + } + +begin_print: + log_data("%s\n", output_str); + +#if defined(CONFIG_BACKTRACE_USRSTACK_ARM64) || defined(CONFIG_BACKTRACE_USRSTACK_X86_64) + if (print_ustack) + backtrace_usrstack(); +#endif + + if (output->skb && tp->dump_file) + try_dump_skb(output->skb, tp); + + if (print_stack) + dump_stack(); +} + +/* + * Print 
the network information by print the skb. + * + * Note that this is most about the kernel function that receive skb, + * as the package header in skb is not completed in skb send function. + */ +static int filter_skb(struct sk_buff *skb, struct sock *sk, + TRACE_POINT *tp, SKB_OUTPUT *skb_output) +{ + int proto_3, proto_4, sport, dport; + struct ethhdr *eth; + struct tcphdr *tcp; + struct udphdr *udp; + struct iphdr *ip; + + COMMON_RULE *rule = &skb_output->rule; + skb_output->sym_name = tp->name; + rule->s_mask = 0xffffffff; + rule->d_mask = 0xffffffff; + eth = eth_hdr(skb); + if (!sk) + sk = skb->sk; + + if (!skb_mac_header_was_set(skb)) { + proto_3 = ntohs(skb->protocol); + if (proto_3) + goto parse_network; + + if (!sk) + goto error; + + if (sk->sk_family == PF_INET && skb->network_header) { + proto_3 = ETH_P_IP; + goto parse_network; + } + + return filter_sock(sk, tp, skb_output); + } + skb_output->skb = skb; + proto_3 = ntohs(eth->h_proto); + +parse_network: + SET_RULE_FLAGS(rule, proto_3, proto_3); + if (proto_3 != ETH_P_IP) + goto do_match; + + ip = ip_hdr(skb); + if (likely((u8 *)ip >= skb->head && + (u8 *)ip + sizeof(struct iphdr) <= skb_tail_pointer(skb))) { + proto_4 = ip->protocol; + SET_RULE_FLAGS(rule, proto_4, proto_4); + SET_RULE_FLAGS(rule, saddr, ip->saddr); + SET_RULE_FLAGS(rule, daddr, ip->daddr); + } else { + proto_4 = 0; + } + + switch (proto_4) { + case IPPROTO_TCP: + tcp = tcp_hdr(skb); + if (likely((u8 *)tcp >= skb->head && + (u8 *)tcp + sizeof(struct tcphdr) <= skb_tail_pointer(skb))) { + sport = tcp->source; + dport = tcp->dest; + flag2str(tcp, skb_output->flags); + } else { + sport = 0; + dport = 0; + } + goto flag_port; + + case IPPROTO_UDP: + udp = udp_hdr(skb); + if (likely((u8 *)udp >= skb->head && + (u8 *)udp + sizeof(struct udphdr) <= skb_tail_pointer(skb))) { + sport = udp->source; + dport = udp->dest; + } else { + sport = 0; + dport = 0; + } + goto flag_port; + default: + break; + } + +do_match: + if (match_all_rule(rule)) + 
return 0; + return -1; + +flag_port: + SET_RULE_FLAGS(rule, sport, sport); + SET_RULE_FLAGS(rule, dport, dport); + goto do_match; + +error: + return -1; +} + +/* + * Print the network information by print the sock. + * + * Note this is most about the process that send skb, during which + * skb headers is not ready and we can not get information from it, + * such ip addr or tcp port. + */ +static int filter_sock(struct sock *sk, TRACE_POINT *tp, SKB_OUTPUT *output) +{ + const struct inet_sock *inet; + int sport = 0, dport = 0; + + COMMON_RULE *rule = &output->rule; + rule->s_mask = 0xffffffff; + rule->d_mask = 0xffffffff; + output->sym_name = tp->name; + + SET_RULE_FLAGS(rule, proto_4, sk->sk_protocol); + + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + goto do_filter; + + inet = inet_sk(sk); + + SET_RULE_FLAGS(rule, proto_3, ETH_P_IP); + SET_RULE_FLAGS(rule, saddr, inet->inet_saddr); + SET_RULE_FLAGS(rule, daddr, inet->inet_daddr); + + sport = inet->inet_sport; + dport = inet->inet_dport; + + SET_RULE_FLAGS(rule, sport, sport); + SET_RULE_FLAGS(rule, dport, dport); + +do_filter: + if (match_all_rule(rule)) + return 0; + return -1; +} + +/**************************************************************************************************** + * + * This is the part for all kind of handlers. 
+ * + ****************************************************************************************************/ + +/*the function that handle skb and sock.*/ +void __post_handler_general(struct kprobe *p, struct pt_regs *regs, unsigned long flags) +{ + + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + SKB_OUTPUT skb_output = {}; + TRACE_POINT *tp = (TRACE_POINT *) p->symbol_name; + + if (tp->sock_index > 0) { + sk = (struct sock *) kprobe_parm(regs, tp->sock_index); + } + + if (tp->skb_index > 0) { + skb = (struct sk_buff *) kprobe_parm(regs, tp->skb_index); + goto do_print_skb; + } + + if (tp->pskb_index > 0) { + skb = *((struct sk_buff **) kprobe_parm(regs, tp->pskb_index)); + goto do_print_skb; + } + + if (sk && !filter_sock(sk, tp, &skb_output)) + general_print(&skb_output, tp); + return; + +do_print_skb: + if (skb && !filter_skb(skb, sk, tp, &skb_output)) + general_print(&skb_output, tp); +} + +void post_handler_general(struct kprobe *p, struct pt_regs *regs, unsigned long flags) +{ + /* If being removed, nettrace should stop tracing as soon as possible */ + if (unlikely(READ_ONCE(nt_status) == NT_EXITING)) + return; + + /* If insmoding nettrace is not completed, don't start to parse */ + if (unlikely(READ_ONCE(nt_status) == NT_INIT)) + return; + + __post_handler_general(p, regs, flags); +} + +int entry_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + TRACE_POINT *tp = NULL; + RET_DATA *data = (RET_DATA *) ri->data; + +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct kretprobe *rp = get_kretprobe(ri); + if (unlikely(!rp)) + return 1; + tp = (TRACE_POINT *) rp->kp.symbol_name; +#else + tp = (TRACE_POINT *) ri->rph->rp->kp.symbol_name; +#endif + if (tp->sock_index > 0) { + sk = (struct sock *) kprobe_parm(regs, tp->sock_index); + } + + if (tp->skb_index > 0) { + skb = (struct sk_buff *) kprobe_parm(regs, tp->skb_index); + } + + if (tp->pskb_index > 0) { + skb = *((struct sk_buff **) 
kprobe_parm(regs, tp->pskb_index)); + } + + data->sk = sk; + data->skb = skb; + + if (sk == NULL && skb == NULL) + return 1; + return 0; +} + +int __ret_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + SKB_OUTPUT skb_output = {}; + TRACE_POINT *tp = NULL; + RET_DATA *data = (RET_DATA *) ri->data; + +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct kretprobe *rp = get_kretprobe(ri); + if (unlikely(!rp)) + return 1; + tp = (TRACE_POINT *) rp->kp.symbol_name; +#else + tp = (TRACE_POINT *) ri->rph->rp->kp.symbol_name; +#endif + sk = data->sk; + skb_output.ret_val = KPROBE_RET_PARM; + + if ((skb = data->skb) != NULL) + goto do_print_skb; + + if (sk && !filter_sock(sk, tp, &skb_output)) + general_print(&skb_output, tp); + return 0; + +do_print_skb: + if (skb && !filter_skb(skb, sk, tp, &skb_output)) + general_print(&skb_output, tp); + return 0; +} + +int ret_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + /* If being removed, nettrace should stop tracing as soon as possible */ + if (unlikely(READ_ONCE(nt_status) == NT_EXITING)) + return 0; + + /* If insmoding nettrace is not completed, don't start to parse */ + if (unlikely(READ_ONCE(nt_status) == NT_INIT)) + return 0; + + return __ret_handler_general(ri, regs); +} + +/*the function that handle skb and sock.*/ +void post_handler_udp_tracer(struct kprobe *p, struct pt_regs *regs, unsigned long flags) +{ + + struct sk_buff *skb = NULL; + struct sock *sk = NULL; + SKB_OUTPUT skb_output = {}; + + TRACE_POINT *tp = (TRACE_POINT *) p->symbol_name; + + if (tp->sock_index > 0) { + sk = (struct sock *) kprobe_parm(regs, tp->sock_index); + } + + if (tp->skb_index > 0) { + skb = (struct sk_buff *) kprobe_parm(regs, tp->skb_index); + goto do_print_skb; + } + + if (tp->pskb_index > 0) { + skb = *((struct sk_buff **) kprobe_parm(regs, tp->pskb_index)); + goto do_print_skb; + } + + if (sk && !filter_sock(sk, tp, &skb_output)) + goto 
do_print; + return; + +do_print_skb: + if (skb && !filter_skb(skb, sk, tp, &skb_output)) + goto do_print; + return; + +do_print: + general_print(&skb_output, tp); + + if (!print_stack && streq(tp->name, "kfree_skb_reason")) + dump_stack(); +} + diff --git a/net/nettrace/handler.h b/net/nettrace/handler.h new file mode 100644 index 000000000000..fea3f390a1df --- /dev/null +++ b/net/nettrace/handler.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef NETDUMP_HANDLER_H +#define NETDUMP_HANDLER_H + +#include "group.h" +#include "parser.h" + +#define MAX_OUTPUT_LEN 300 + +#if defined(CONFIG_BACKTRACE_USRSTACK_ARM64) || defined(CONFIG_BACKTRACE_USRSTACK_X86_64) +extern void backtrace_usrstack(void); +#endif + +struct kretprobe_instance; + +typedef struct skb_output { + COMMON_RULE rule; + struct sk_buff *skb; + char *sym_name; + char flags[TCP_FLAG_LEN]; + int ret_val; +} SKB_OUTPUT; + +extern TRACE_GROUP *all_group; + +extern void +post_handler_general(struct kprobe *p, struct pt_regs *regs, unsigned long flags); + +extern void +post_handler_udp_tracer(struct kprobe *p, struct pt_regs *regs, unsigned long flags); + +extern int +ret_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs); + +extern int +entry_handler_general(struct kretprobe_instance *ri, struct pt_regs *regs); + +#endif //NETDUMP_HANDLER_H diff --git a/net/nettrace/help.c b/net/nettrace/help.c new file mode 100644 index 000000000000..e029d035a5d9 --- /dev/null +++ b/net/nettrace/help.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include "help.h" + +#include "utils.h" +#include "kprobe.h" +#include "parser.h" +#include "group.h" + +/*param for package filter*/ +MODULE_PARM_DESC(saddr, "filter source ip address,e.g. 
insmod nettrace.ko saddr=172.16.6.62"); +PARAM_STRING_NAMED(filter_saddr, saddr, NULL) +MODULE_PARM_DESC(daddr, "filter destination ip address,e.g. insmod nettrace.ko daddr=172.16.6.74"); +PARAM_STRING_NAMED(filter_daddr, daddr, NULL) +MODULE_PARM_DESC(addr, "filter source or destination ip address,e.g. insmod nettrace.ko addr=192.168.2.11"); +PARAM_STRING_NAMED(filter_addr, addr, NULL) +MODULE_PARM_DESC(proto, "filter 3 layer or 4 layer net protocol,e.g. insmod nettrace.ko proto=arp"); +PARAM_STRING_NAMED(filter_proto, proto, NULL) +MODULE_PARM_DESC(port, "filter source port or destination port, e.g. insmod nettrace.ko port=1234"); +PARAM_INT_NAMED(filter_port, port, -1) +MODULE_PARM_DESC(sport, "filter source port,e.g. insmod nettrace.ko port=1234"); +PARAM_INT_NAMED(filter_sport, sport, -1) +MODULE_PARM_DESC(dport, "filter destination port,e.g. insmod nettrace.ko port=1234"); +PARAM_INT_NAMED(filter_dport, dport, -1) + +MODULE_PARM_DESC(trace, "a trace is a serial of kernel tracing for special scene,We support various of trace, such as ip,upd,tcp,macvlan,link,link_input,link_output\n \ + e.g. insmod nettrace.ko trace=link"); +PARAM_STRING_ARRAY(filter_trace, trace, 10) +MODULE_PARM_DESC(probe, "this is a list of kernel function where you want to dump network package info. Use 'probe=?' to see all supported kernel functions"); +PARAM_STRING_ARRAY(filter_probe, probe, 10) + +MODULE_PARM_DESC(stack, "print the kernel function call stack,e.g. insmod nettrace.ko stack=1"); +PARAM_INT_NAMED(print_stack, stack, 0) +MODULE_PARM_DESC(ustack, "print the user space call stack,e.g. insmod nettrace.ko ustack=1"); +PARAM_INT_NAMED(print_ustack, ustack, 0) + +MODULE_PARM_DESC(output, "three kind of output supported: ftrace, kernel and file,When comes up with file, it should be a file path, such as /ntrace.log\n \ + e.g. insmod nettrace.ko output=ftrace"); +PARAM_STRING_NAMED(print_output, output, NULL) +MODULE_PARM_DESC(dump, "the directory where you want to put pcap file in. 
Once thisoption is set, all package filtered will be saved\n \ + e.g. insmod nettrace.ko trace=error dump=./output"); +PARAM_STRING_NAMED(print_dump, dump, NULL) + +MODULE_PARM_DESC(flag, "addition flags supported values v: print addition info,e.g. insmod nettrace.ko flag=v"); +PARAM_STRING_NAMED(param_flag, flag, NULL) +MODULE_PARM_DESC(mm, "the number of pages reserved for each TP during the initialization phase"); +PARAM_INT_NAMED(param_mm, mm, 100) + +output_type op_type; +int if_print_info = 0; + +typedef struct { + struct list_head list; + char *msg; +} PRINT_ENTRY; + +static struct work_struct print_work; +static LIST_HEAD(print_list); +static spinlock_t print_lock; + +static struct file *output_file = NULL; +static char output_index[64]; + +static void print_process(struct work_struct *work) +{ + PRINT_ENTRY *file, *next; + unsigned long flag; + LIST_HEAD(head); + + spin_lock_irqsave(&print_lock, flag); + list_splice_init(&print_list, &head); + spin_unlock_irqrestore(&print_lock, flag); + + list_for_each_entry_safe(file, next, &head, list) { + file_append(output_file, file->msg, (unsigned int) strlen(file->msg)); + + kfree(file->msg); + kfree(file); + } +} + +int init_output_file(char *path) +{ + output_file = file_create(path); + + if (output_file == NULL) + return -1; + + INIT_WORK(&print_work, print_process); + spin_lock_init(&print_lock); + + return 0; +} + +static void print_enqueue(char *fmt, va_list ap) +{ + char buf[MAX_LOG_BUF]; + char *print_buf; + PRINT_ENTRY *print_file; + unsigned long flag; + + vsnprintf(buf, sizeof(buf), fmt, ap); + print_buf = kcalloc(1, strlen(buf) + 1, GFP_KERNEL); + if (!print_buf) + return; + + strncpy(print_buf, buf, strlen(buf)); + + print_file = kcalloc(1, sizeof(PRINT_ENTRY), GFP_KERNEL); + if (!print_file) { + kfree(print_buf); + return; + } + + print_file->msg = print_buf; + + spin_lock_irqsave(&print_lock, flag); + list_add_tail(&print_file->list, &print_list); + spin_unlock_irqrestore(&print_lock, flag); + + 
schedule_work(&print_work); +} + +void log_base(char *fmt, ...) +{ + + va_list argptr; + va_start(argptr, fmt); + + switch (op_type) { + default: + vprintk(fmt, argptr); + break; + case OUTPUT_FTRACE: + ftrace_vprintk(fmt, argptr); + break; + case OUTPUT_FILE: + print_enqueue(fmt, argptr); + break; + } + + va_end(argptr); +} + +static void print_help(void) +{ + log_data("============================Netdump========================\n"); + log_data("\n"); + log_data("Welcome to use netdump! This is a tool based on kprobe for\n"); + log_data("network bag grab in kernel.\n"); + log_data("\n"); + log_data("Basic usage:\n"); + log_data("\n"); + log_data(" insmod netdump.ko [probe=] \n"); + log_data(" [output=] [flag=]\n"); + log_data(" [dump=]\n"); + log_data(" [trace=] [proto=]\n"); + log_data(" [saddr=] [daddr=] [addr=] \n"); + log_data(" [sport=] [dport=] [port=] \n"); + log_data(" [stack=<0 or 1>] [ustack=<0 or 1>]\n"); + log_data("\n"); + log_data("trace: a trace is a serial of kernel tracing for special scene.\n"); + log_data(" We support various of trace, such ip, tcp, macvlan, etc.\n"); + log_data(" Trace can have children trace, use 'trace=?' to see all supported\n"); + log_data(" trace.\n"); + log_data("\n"); + log_data("probe: this is a list of kernel function where you want\n"); + log_data(" to dump network package info. Use 'probe=?' to see all\n"); + log_data(" supported kernel functions.\n"); + log_data("\n"); + log_data("dump: the directory where you want to put pcap file in. Once this\n"); + log_data(" option is set, all package filtered will be saved.\n"); + log_data("\n"); + log_data("output: three kind of output supported: ftrace, kernel and file.\n"); + log_data(" When comes up with file, it should be a file path, such as /ntrace.log.\n"); + log_data("\n"); + log_data("flag: addition flags. 
Supported values:\n"); + log_data(" v: print addition info.\n"); + log_data("\n"); + log_data("mm: the number of pages reserved for each TP during the initialization phase,\n"); + log_data(" which is used for temporarily caching messages.\n"); + log_data(" [mm=100] means that each TP initialization will reserve 2 * 100 pages,\n"); + log_data(" 100 for storing SKBs, and 100 for storing data."); + log_data("\n"); + + log_data("stack: print the kernel function call stack.\n"); + log_data("ustack: print the user spack call stack.\n"); + log_data("\n"); + log_data("-----------------------package filter-------------------\n"); + log_data("saddr: source ip addr\n"); + log_data("daddr: dest ip addr\n"); + log_data("addr: source or dest ip addr\n"); + log_data("\n"); + log_data("sport: source udp or tcp port\n"); + log_data("dport: dest udp or tcp port\n"); + log_data("port: source or dest udp or tcp port\n"); + log_data("proto: the network protocol\n"); + + log_data("\n"); + log_data("\n"); + log_data("============================Netdump========================\n"); +} + +static void print_group(TRACE_GROUP *tg) +{ + TRACE_GROUP *tmp_tg; + char tab[] = " "; + ulong cur_len = strlen(output_index); + + log_data("%s%s: %s\n", output_index, tg->name, tg->desc); + if (cur_len + sizeof(tab) > sizeof(output_index)) + return; + strncpy(output_index + cur_len, tab, strlen(tab)); + list_for_each_entry(tmp_tg, &tg->groups, list) { + print_group(tmp_tg); + } + output_index[cur_len] = '\0'; +} + +static void print_trace(void) +{ + memset(output_index, 0, sizeof(output_index)); + print_group(all_group); +} + +static void print_probe(void) +{ + TRACE_POINT *tp; + int i = 0; + + log_data("all supported kprobe:\n\n"); + for_each_tp(i, tp) { + log_data("\t%s\n", tp->name); + } +} + +static int init_rule(void) +{ + COMMON_RULE *rule; + u32 addr_i = 0; + int proto = 0; + + rule = kcalloc(1, sizeof(COMMON_RULE), GFP_KERNEL); + if (!rule) + return -ENOMEM; + + INIT_LIST_HEAD(&rule->list); + 
rule->s_mask = rule->d_mask = 0xffffffff; + + if (filter_addr) { + if (ip2i(filter_addr, &addr_i)) { + log_err("ip addr format error!\n"); + goto error; + } + addr_i = htonl(addr_i); + rule->saddr = addr_i; + rule->__flags |= FLAG_addr; + } + + if (filter_saddr) { + if (ip2i(filter_saddr, &addr_i)) { + log_err("ip addr format error!\n"); + goto error; + } + addr_i = htonl(addr_i); + SET_RULE_FLAGS(rule, saddr, addr_i); + } + + if (filter_daddr) { + if (ip2i(filter_daddr, &addr_i)) { + log_err("ip addr format error!\n"); + goto error; + } + addr_i = htonl(addr_i); + SET_RULE_FLAGS(rule, daddr, addr_i); + } + + if (filter_sport > 0xffff || filter_dport > 0xffff || filter_port > 0xffff) { + log_err("port range error!\n"); + goto error; + } + + if (filter_port > 0) { + filter_port = (int) htons((u16) filter_port); + rule->sport = filter_port; + rule->__flags |= FLAG_port; + } + + if (filter_sport > 0) { + filter_sport = (int) htons((u16) filter_sport); + SET_RULE_FLAGS(rule, sport, filter_sport); + } + + if (filter_dport > 0) { + filter_dport = (int) htons((u16) filter_dport); + SET_RULE_FLAGS(rule, dport, filter_dport); + } + + if (filter_proto) { + if ((proto = str2proto3(filter_proto)) >= 0) { + SET_RULE_FLAGS(rule, proto_3, proto); + } else if ((proto = str2proto4(filter_proto)) >= 0) { + SET_RULE_FLAGS(rule, proto_4, proto); + } else { + log_err("proto not found!\n"); + goto error; + } + } + + if (add_rule(rule)) + kfree(rule); + + return 0; +error: + kfree(rule); + return -EINVAL; +} + +int init_args(void) +{ + int err = -EINVAL; + + if (print_output != NULL) { + if (streq(print_output, "ftrace")) + op_type = OUTPUT_FTRACE; + else if (streq(print_output, "kernel")) + op_type = OUTPUT_KERNEL; + else if (!init_output_file(print_output)) + op_type = OUTPUT_FILE; + else { + log_err("output type error!\n"); + goto error; + } + } + + if (print_dump != NULL) { + err = access_path(print_dump); + if (err) { + log_err("[nettrace] access_path err:%d\n", err); + 
log_err("[nettrace] failed to access the dump output directory: %s\n", + print_dump); + goto error; + } + } + + if (filter_trace_len == 0 && filter_probe_len == 0) { + pr_alert("Don't worry. execute 'dmesg' and see usage of nettrace.ko\n"); + print_help(); + goto error; + } + + if (filter_trace_len == 1 && streq(filter_trace[0], "?")) { + pr_alert("Execute 'dmesg' and see available parameters of 'trace='\n"); + print_trace(); + goto error; + } + + if (filter_probe_len == 1 && streq(filter_probe[0], "?")) { + pr_alert("Execute 'dmesg' and see available parameters of 'probe='\n"); + print_probe(); + goto error; + } + + if (param_flag) { + char *tmp_flag; + while ((tmp_flag = strsep(¶m_flag, ",")) != NULL) { + switch (*tmp_flag) { + case 'v': + if_print_info = 1; + break; + default: + log_err("flags:%c not supported\n", *tmp_flag); + goto error; + } + } + } + + if(init_rule()) + goto error; + + return 0; +error: + return err; +} diff --git a/net/nettrace/help.h b/net/nettrace/help.h new file mode 100644 index 000000000000..c9b9471d7841 --- /dev/null +++ b/net/nettrace/help.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef NETDUMP_HELP_H +#define NETDUMP_HELP_H + +#include +#include + +#define MAX_LOG_BUF 200 + +typedef enum { + OUTPUT_KERNEL, + OUTPUT_FTRACE, + OUTPUT_FILE +} output_type; + +extern char *output_file_path; +extern output_type op_type; +extern int if_print_info; + +extern char *filter_saddr; +extern char *filter_daddr; +extern char *filter_addr; +extern char *filter_proto; + +extern int filter_port; +extern int filter_sport; +extern int filter_dport; + +extern char *filter_trace[]; +extern char *filter_probe[]; +extern int filter_trace_len; +extern int filter_probe_len; + +extern int print_stack; +extern int print_ustack; + +extern char *print_output; +extern char *print_dump; +extern char *param_flag; +extern int param_mm; + +extern int init_output_file(char *path); + +extern void log_base(char *fmt, ...); + +extern int init_args(void); + +#define log_info(fmt, args...) {if(if_print_info)\ + log_base(fmt, ##args);} +#define log_data(fmt, args...) log_base(fmt, ##args) +#define log_err(fmt, args...) log_base(fmt, ##args) +#define log_debug(fmt, args...) log_base(fmt, ##args) + +static inline void print_leave(void) { + log_info("you just exited netdump, welcome back~\n"); +} + +#endif //NETDUMP_HELP_H diff --git a/net/nettrace/kprobe.c b/net/nettrace/kprobe.c new file mode 100644 index 000000000000..92fe02f6dbd5 --- /dev/null +++ b/net/nettrace/kprobe.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include "kprobe.h" +#include "help.h" +#include "group.h" + +static kprobe_opcode_t *query_kallsym_addr; + +/* + * Get the function arg with index of 'index' from 'regs'. + * + * This is just another version of the 'KPROBE_PARM'. + */ +unsigned long kprobe_parm(struct pt_regs *regs, int index) +{ + switch (index) { + case 1: + return PT_REGS_PARM1(regs); + case 2: + return PT_REGS_PARM2(regs); + case 3: + return PT_REGS_PARM3(regs); + case 4: + return PT_REGS_PARM4(regs); + case 5: + return PT_REGS_PARM5(regs); + default: + return 0; + } +} + +/* + * Declare the kprobe with function call. + */ +struct kprobe *kprobe_declare(const char *sym, void *handler) +{ + struct kprobe *p = kcalloc(1, sizeof(struct kprobe), GFP_KERNEL); + + if (!p) + return NULL; + + p->symbol_name = sym; + p->post_handler = handler; + + return p; +} + +static int skip_entry_handler(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + return 0; +} + +/* + * Declare the kretprobe with function call. + */ +struct kretprobe *kretprobe_declare(const char *sym, void *handler, + void *entry_handler) +{ + struct kretprobe *p = kcalloc(1, sizeof(struct kretprobe), GFP_KERNEL); + + if (!p) + return NULL; + + if (entry_handler == NULL) + entry_handler = skip_entry_handler; + + p->entry_handler = entry_handler; + p->kp.symbol_name = sym; + p->handler = handler; + + return p; +} + +int filter_syms(void *data, const char *name_buf, unsigned long address) +{ + char *name = data; + + if (strstarts(name_buf, name)) { + if (strlen(name_buf) > MAX_TP_NAME) { + log_err("[nettrace: %s] func name is too long: %s\n", __func__, name_buf); + return -1; + } + strncpy(name, name_buf, MAX_TP_NAME); + query_kallsym_addr = (kprobe_opcode_t *) address; + return -1; + } + return 0; +} + +/* + * query the address of syms in kallsyms. 
+ */ +int query_kallsyms(char *name) +{ + if (strlen(name) + 1 >= MAX_TP_NAME) { + log_err("[nettrace: %s] func name is too long: %s\n", __func__, name); + return -1; + } + strncat(name, ".", 1); + return kallsyms_on_each_symbol(filter_syms, name); +} + +/* + * register the kprobe with function call. + */ +int c_register_kprobe(struct kprobe *p) +{ + if (register_kprobe(p) < 0) { + char name[MAX_TP_NAME] = {}; + strncpy(name, p->symbol_name, sizeof(name) - 1); + if (query_kallsyms(name)) { + pr_alert("[nettrace] Note: The function we want to trace (%s) has become " + "%s bacause the kernel compiler had added a suffix in its symbol! " + "There may be several versions of %s, please check /proc/kallsyms\n", + (char *) p->symbol_name, name, (char *) p->symbol_name); + strncpy((char *) p->symbol_name, name, sizeof(name) - 1); + if (register_kprobe(p) < 0) + goto on_err; + } else { + goto on_err; + } + } + + log_info(" planted kprobe at %p, name %s\n", p->addr, p->symbol_name); + return 0; + +on_err: + return -1; +} + +/* + * register the kprobe with function call. + */ +int c_register_kretprobe(struct kretprobe *p) +{ + if (register_kretprobe(p) < 0) { + char name[MAX_TP_NAME] = {}; + strncpy(name, p->kp.symbol_name, sizeof(name) - 1); + if (query_kallsyms(name)) { + pr_alert("[nettrace] Note: The function we want to trace (%s) has become " + "%s bacause the kernel compiler had added a suffix in its symbol! " + "There may be several versions of %s, please check /proc/kallsyms\n", + (char *) p->kp.symbol_name, name, (char *) p->kp.symbol_name); + strncpy((char *) p->kp.symbol_name, name, sizeof(name) - 1); + if (register_kretprobe(p) < 0) + goto on_err; + } else { + goto on_err; + } + } + + log_info(" planted kretprobe at %p, name %s\n", p->kp.addr, p->kp.symbol_name); + return 0; + +on_err: + log_err(" register kretprobe failed: %s\n", p->kp.symbol_name); + return -1; +} + +/* + * unregister the krpobe with function call. 
+ */ +void c_unregister_kprobe(struct kprobe *p) +{ + unregister_kprobe(p); + log_info("kprobe at %s unregistered\n", p->symbol_name); +} + +/* + * unregister the kretprobe with function call. + */ +void c_unregister_kretprobe(struct kretprobe *p) +{ + unregister_kretprobe(p); + log_info("kretprobe at %s unregistered\n", p->kp.symbol_name); +} diff --git a/net/nettrace/kprobe.h b/net/nettrace/kprobe.h new file mode 100644 index 000000000000..9f38e788e85f --- /dev/null +++ b/net/nettrace/kprobe.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2025 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef KPROBE_COMMON_H +#define KPROBE_COMMON_H + +#include +#include +#include +#include + +#define DECLARE_HANDLER_POST(func) \ + static void post_handler_##func( \ + struct kprobe *p, \ + struct pt_regs *regs, \ + unsigned long flags) +#define DECLARE_HANDLER_RET(func) \ + static int ret_handler##func( \ + struct kretprobe_instance *ri, \ + struct pt_regs *regs) +#define DECLARE_HANDLER_ENTRY(func) \ + static int entry_handler##func( \ + struct kretprobe_instance *ri, \ + struct pt_regs *regs) + +#define DECLARE_KPROBE(func) \ + DECLARE_HANDLER_POST(func); \ + static struct kprobe kprobe_##func = { \ + .symbol_name = #func, \ + .post_handler = post_handler_##func}; \ + DECLARE_HANDLER_POST(func) + +#define DECLARE_RETKPROBE(func) \ + DECLARE_HANDLER_ENTRY(func); \ + DECLARE_HANDLER_RET(func); \ + static struct kretprobe kretprobe_##func = { \ + .handler = ret_handler##func, \ + .entry_handler = entry_handler##func, \ + .kp = { .symbol_name = #func } \ + }; \ + DECLARE_HANDLER_RET(func) + +#define REGISTER_KPROBE(func) \ + if (register_kprobe(&kprobe_##func) < 0) \ + { \ + printk(KERN_INFO "register_kprobe failed:" #func "\n"); \ + return 0; \ + } \ + printk(KERN_INFO "Planted kprobe at %p, handler addr %p, name %s\n",\ + kprobe_##func.addr, kprobe_##func.post_handler, \ + kprobe_##func.symbol_name); + +#define UNREGISTER_KPROBE(func) \ + unregister_kprobe(&kprobe_##func); \ + printk(KERN_INFO "kprobe at %p unregistered\n", kprobe_##func.addr); + +#define PARAM_STRING(name, def) \ + char *name = def; \ + module_param(name, charp, 0); + +#define PARAM_STRING_ARRAY(name, cmdname, length) \ + char *name[length]; \ + int name##_len = 0; \ + module_param_array_named(cmdname, name, charp, &name##_len, 0); + +#define PARAM_STRING_RW(name, default) \ + char *name = default; \ + module_param(name, charp, 0644); + +#define PARAM_STRING_NAMED(name, cmdname, default) \ + char *name = default; \ + module_param_named(cmdname, name, charp, 0644); + +#define 
PARAM_INT(name, default) \ + int name = default; \ + module_param(name, int, 0); + +#define PARAM_INT_RW(name, default) \ + int name = default; \ + module_param(name, int, 0644); + +#define PARAM_INT_NAMED(name, cmdname, default) \ + int name = default; \ + module_param_named(cmdname, name, int, 0644); + +#define KPROBE_INIT \ + static int __init kprobe_init(void);\ + module_init(kprobe_init); \ + static int __init kprobe_init(void) + +#define KPROBE_EXIT \ + static void __exit kprobe_exit(void); \ + module_exit(kprobe_exit); \ + static void __exit kprobe_exit(void) + +#if defined(__x86_64__) + +#define PT_REGS_PARM1(x) ((x)->di) +#define PT_REGS_PARM2(x) ((x)->si) +#define PT_REGS_PARM3(x) ((x)->dx) +#define PT_REGS_PARM4(x) ((x)->cx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->sp) +#define PT_REGS_FP(x) ((x)->bp) +#define PT_REGS_RC(x) ((x)->ax) +#define PT_REGS_SP(x) ((x)->sp) + +#elif defined(__s390x__) + +#define PT_REGS_PARM1(x) ((x)->gprs[2]) +#define PT_REGS_PARM2(x) ((x)->gprs[3]) +#define PT_REGS_PARM3(x) ((x)->gprs[4]) +#define PT_REGS_PARM4(x) ((x)->gprs[5]) +#define PT_REGS_PARM5(x) ((x)->gprs[6]) +#define PT_REGS_RET(x) ((x)->gprs[14]) +#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->gprs[2]) +#define PT_REGS_SP(x) ((x)->gprs[15]) + +#elif defined(__aarch64__) + +#define PT_REGS_PARM1(x) ((x)->regs[0]) +#define PT_REGS_PARM2(x) ((x)->regs[1]) +#define PT_REGS_PARM3(x) ((x)->regs[2]) +#define PT_REGS_PARM4(x) ((x)->regs[3]) +#define PT_REGS_PARM5(x) ((x)->regs[4]) +#define PT_REGS_RET(x) ((x)->regs[30]) +#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[0]) +#define PT_REGS_SP(x) ((x)->sp) + +#elif defined(__arm__) + +#define PT_REGS_PARM1(x) ((x)->uregs[0]) +#define PT_REGS_PARM2(x) ((x)->uregs[1]) +#define PT_REGS_PARM3(x) ((x)->uregs[2]) +#define PT_REGS_PARM4(x) ((x)->uregs[3]) +#define 
PT_REGS_PARM5(x) ((x)->uregs[4]) +#define PT_REGS_RET(x) ((x)->uregs[14]) +#define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->uregs[0]) +#define PT_REGS_SP(x) ((x)->uregs[13]) +#define PT_REGS_IP(x) ((x)->uregs[12]) + +#elif defined(__mips__) + +#define PT_REGS_PARM1(x) ((x)->regs[4]) +#define PT_REGS_PARM2(x) ((x)->regs[5]) +#define PT_REGS_PARM3(x) ((x)->regs[6]) +#define PT_REGS_PARM4(x) ((x)->regs[7]) +#define PT_REGS_PARM5(x) ((x)->regs[8]) +#define PT_REGS_RET(x) ((x)->regs[31]) +#define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[2]) +#define PT_REGS_SP(x) ((x)->regs[29]) +#define PT_REGS_IP(x) ((x)->cp0_epc) + +#elif defined(__powerpc__) + +#define PT_REGS_PARM1(x) ((x)->gpr[3]) +#define PT_REGS_PARM2(x) ((x)->gpr[4]) +#define PT_REGS_PARM3(x) ((x)->gpr[5]) +#define PT_REGS_PARM4(x) ((x)->gpr[6]) +#define PT_REGS_PARM5(x) ((x)->gpr[7]) +#define PT_REGS_RC(x) ((x)->gpr[3]) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->nip) + +#endif + +#define KPROBE_PARM(type, name, index) type name = (type)PT_REGS_PARM##index(regs); +#define KPROBE_RET_PARM regs_return_value(regs); + +//Below is the function encapsulation + +extern unsigned long kprobe_parm(struct pt_regs *regs, int index); + +extern struct kprobe *kprobe_declare(const char *sym, void *handler); + +extern struct kretprobe *kretprobe_declare(const char *sym, void *handler, void *entry_handler); + +extern int c_register_kprobe(struct kprobe *p); + +extern int c_register_kretprobe(struct kretprobe *p); + +extern void c_unregister_kprobe(struct kprobe *p); + +extern void c_unregister_kretprobe(struct kretprobe *p); + +MODULE_LICENSE("GPL"); +#endif diff --git a/net/nettrace/mm.c b/net/nettrace/mm.c new file mode 100644 index 000000000000..2b4e88e96726 --- /dev/null +++ b/net/nettrace/mm.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nettrace support. 
+ * + * Copyright (C) 2024 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include "group.h" +#include "help.h" +#include "mm.h" + +#define MIN_CACHED_SKB_OBJS 100 + +struct sk_buff_nettrace *get_cached_skb(TRACE_POINT *tp) +{ + if (!tp->nr_skb_objs) + return NULL; + + WRITE_ONCE(tp->nr_skb_objs, tp->nr_skb_objs - 1); + return (struct sk_buff_nettrace *)llist_del_first(&tp->skbcache); +} + +bool put_cached_skb(TRACE_POINT *tp, struct sk_buff_nettrace *skb) +{ + if (tp->nr_skb_objs >= param_mm) + return false; + + llist_add((struct llist_node *) skb, &tp->skbcache); + WRITE_ONCE(tp->nr_skb_objs, tp->nr_skb_objs + 1); + return true; +} + +u8 *get_cached_data(TRACE_POINT *tp) +{ + if (!tp->nr_data_objs) + return NULL; + + WRITE_ONCE(tp->nr_data_objs, tp->nr_data_objs - 1); + return (u8 *)llist_del_first(&tp->datacache); +} + +bool put_cached_data(TRACE_POINT *tp, u8 *data) +{ + if (tp->nr_data_objs >= param_mm) + return false; + + llist_add((struct llist_node *) data, &tp->datacache); + WRITE_ONCE(tp->nr_data_objs, tp->nr_data_objs + 1); + return true; +} + +void init_dump_mm(TRACE_POINT *tp) +{ + int i; + struct sk_buff_nettrace *skb; + u8 *data; + + if (param_mm < MIN_CACHED_SKB_OBJS) + param_mm = MIN_CACHED_SKB_OBJS; + + for (i = 0; i < param_mm; i++) { + skb = (struct sk_buff_nettrace *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (skb) + 
put_cached_skb(tp, skb); + else + pr_err("Failed to preallocate for nettrace dump skb! number:%d\n", i); + } + + for (i = 0; i < param_mm; i++) { + data = (u8 *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (data) + put_cached_data(tp, data); + else + pr_err("Failed to preallocate for nettrace dump data! number:%d\n", i); + } + + /* Partial failure does not affect usage. */ +} + +void release_dump_mm(TRACE_POINT *tp) +{ + int i; + struct sk_buff_nettrace *skb; + u8 *data; + + for (i = 0; i < param_mm; i++) { + skb = get_cached_skb(tp); + if (skb) + free_page((unsigned long)skb); + } + + for (i = 0; i < param_mm; i++) { + data = get_cached_data(tp); + free_page((unsigned long)data); + } +} + +static struct sk_buff_nettrace *__alloc_skb_nettrace(TRACE_POINT *tp) +{ + struct sk_buff_nettrace *skb; + u8 *data; + + skb = get_cached_skb(tp); + if (!skb) + goto out; + prefetchw(skb); + memset(skb, 0, sizeof(struct sk_buff_nettrace)); + + data = get_cached_data(tp); + if (!data) + goto nodata; + + skb->data = data; + memset(data, 0, PAGE_SIZE); + + skb_queue_head_init(&skb->frag_list); +out: + return skb; +nodata: + put_cached_skb(tp, skb); + skb = NULL; + goto out; +} + +void release_skb_nettrace(struct sk_buff_nettrace *skb, TRACE_POINT *tp) +{ + struct sk_buff *frag, *tmp; + if (!skb_queue_empty(&skb->frag_list)) { + skb_queue_walk_safe(&skb->frag_list, frag, tmp) { + put_cached_data(tp, ((struct sk_buff_nettrace *)frag)->data); + put_cached_skb(tp, (struct sk_buff_nettrace *)frag); + } + } + put_cached_data(tp, skb->data); + put_cached_skb(tp, skb); +} + +struct sk_buff_nettrace *skb_copy_nettrace(const struct sk_buff *skb, TRACE_POINT *tp) +{ + int headerlen = skb_headroom(skb) - skb->mac_header; + unsigned int size = skb->len + headerlen; + unsigned int frag_num = size / PAGE_SIZE + 1; + unsigned int i; + struct sk_buff_nettrace *n; + unsigned int dump_size = 0; + + /* Alloc skb. 
*/ + for (i = 0; i < frag_num; i++) { + if (!i) { + n = __alloc_skb_nettrace(tp); + if (!n) + return NULL; + } else { + struct sk_buff_nettrace *frag = __alloc_skb_nettrace(tp); + if (!frag) + goto nofrag; + __skb_queue_tail(&n->frag_list, (struct sk_buff *)frag); + } + } + + /* Copy first page. */ + n->len = size > PAGE_SIZE ? PAGE_SIZE : headerlen + skb->len; + if (n->data) { + if (skb_copy_bits(skb, -headerlen, n->data, n->len)) + pr_warn("[nettrace]: Copy SKB failed.\n"); + } + dump_size += n->len; + /* Only one. */ + if (size <= PAGE_SIZE) { + n->total_len = dump_size; + return n; + } + + /* Copy frag pages. */ + if (!skb_queue_empty(&n->frag_list)) { + struct sk_buff *frag, *tmp; + unsigned int i = 0; + + skb_queue_walk_safe(&n->frag_list, frag, tmp) { + i++; + ((struct sk_buff_nettrace *)frag)->len = size - PAGE_SIZE * i > PAGE_SIZE ? + PAGE_SIZE : size - PAGE_SIZE * i; + if (((struct sk_buff_nettrace *)frag)->data) { + if (skb_copy_bits(skb, -headerlen + PAGE_SIZE * i, + ((struct sk_buff_nettrace *)frag)->data, + ((struct sk_buff_nettrace *)frag)->len)) + pr_warn("[nettrace]: Copy SKB failed.\n"); + } + dump_size += ((struct sk_buff_nettrace *)frag)->len; + } + } + if (n) + n->total_len = dump_size; + return n; + +nofrag: + release_skb_nettrace(n, tp); + return NULL; +} + diff --git a/net/nettrace/mm.h b/net/nettrace/mm.h new file mode 100644 index 000000000000..00d386b60973 --- /dev/null +++ b/net/nettrace/mm.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Nettrace support. + * + * Copyright (C) 2024 ZTE Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef NETDUMP_MM_H
+#define NETDUMP_MM_H
+
+#include "group.h"
+
+struct sk_buff_nettrace {	/* one dumped chunk: descriptor + data buffer */
+	struct sk_buff_nettrace *next;	/* mm.c casts this struct to struct sk_buff for the skb_queue_* helpers; */
+	struct sk_buff_nettrace *prev;	/* presumably next/prev must stay first to match it - TODO confirm */
+	struct sk_buff_head frag_list;	/* further chunks of the same packet (head descriptor only) */
+	unsigned int len;	/* bytes assigned to this chunk */
+	unsigned int total_len;	/* sum of ->len over head and frags, set on the head */
+	unsigned char *data;	/* buffer the chunk is copied into */
+};
+
+typedef struct trace_point TRACE_POINT;
+struct sk_buff_nettrace *skb_copy_nettrace(const struct sk_buff *skb, TRACE_POINT *tp);	/* NULL on failure */
+void init_dump_mm(TRACE_POINT *tp);
+void release_dump_mm(TRACE_POINT *tp);
+void release_skb_nettrace(struct sk_buff_nettrace *skb, TRACE_POINT *tp);
+
+struct sk_buff_nettrace *get_cached_skb(TRACE_POINT *tp);	/* callers in mm.c treat NULL as "none available" */
+bool put_cached_skb(TRACE_POINT *tp, struct sk_buff_nettrace *skb);
+u8 *get_cached_data(TRACE_POINT *tp);	/* callers in mm.c treat NULL as "none available" */
+bool put_cached_data(TRACE_POINT *tp, u8 *data);
+
+#endif //NETDUMP_MM_H
diff --git a/net/nettrace/parser.c b/net/nettrace/parser.c
new file mode 100644
index 000000000000..dfe06c065f57
--- /dev/null
+++ b/net/nettrace/parser.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include	/* NOTE(review): the header names of these three includes are missing in this copy of the patch */
+#include
+#include
+#include "parser.h"
+#include "help.h"
+#include "utils.h"
+
+LIST_HEAD(rule_list);	/* user-defined COMMON_RULE entries, see add_rule()/free_rules() */
+
+typedef struct inet_proto {	/* one number <-> name mapping */
+	int number;
+	char *name;
+} INET_PROTO;
+
+const INET_PROTO proto4[] = {	/* searched by str2proto4()/proto4tostr(); numbers look like IP protocol numbers (IPPROTO_*) - confirm */
+	{.number = 0, .name = "ip"},
+	{.number = 1, .name = "icmp"},
+	{.number = 2, .name = "igmp"},
+	{.number = 4, .name = "ipip"},
+	{.number = 6, .name = "tcp"},
+	{.number = 8, .name = "egp"},
+	{.number = 12, .name = "pup"},
+	{.number = 17, .name = "udp"},
+	{.number = 22, .name = "idp"},
+	{.number = 29, .name = "tp"},
+	{.number = 33, .name = "dccp"},
+	{.number = 41, .name = "ipv6"},
+	{.number = 46, .name = "rsvp"},
+	{.number = 47, .name = "gre"},
+	{.number = 50, .name = "esp"},
+	{.number = 51, .name = "ah"},
+	{.number = 92, .name = "mtp"},
+	{.number = 94, .name = "beetph"},
+	{.number = 98, .name = "encap"},
+	{.number = 103, .name = "pim"},
+	{.number = 108, .name = "comp"},
+	{.number = 132, .name = "sctp"},
+	{.number = 136, .name = "udplite"},
+	{.number = 137, .name = "mpls"},
+	{.number = 255, .name = "raw"}
+};
+const int proto4_len = sizeof(proto4) / sizeof(INET_PROTO);	/* entry count of proto4[] */
+
+const INET_PROTO proto3[] = {	/* searched by str2proto3()/proto3tostr(); numbers look like EtherType values (ETH_P_*) - confirm */
+	{.number = 0x0060, .name = "loop"},
+	{.number = 0x0200, .name = "pup"},
+	{.number = 0x0201, .name = "pupat"},
+	{.number = 0x22F0, .name = "tsn"},
+	{.number = 0x22EB, .name = "erspan2"},
+	{.number = 0x0800, .name = "ip"},
+	{.number = 0x0805, .name = "x25"},
+	{.number = 0x0806, .name = "arp"},
+	{.number = 0x08FF, .name = "bpq"},
+	{.number = 0x0a00, .name = "ieeepup"},
+	{.number = 0x0a01, .name = "ieeepupat"},
+	{.number = 0x4305, .name = "batman"},
+	{.number = 0x6000, .name = "dec"},
+	{.number = 0x6001, .name = "dna_dl"},
+	{.number = 0x6002, .name = "dna_rc"},
+	{.number = 0x6003, .name = "dna_rt"},
+	{.number = 0x6004, .name = "lat"},
+	{.number = 0x6005, .name = "diag"},
+	{.number = 0x6006, .name = "cust"},
+	{.number = 0x6007, .name = "sca"},
+	{.number = 0x6558, .name = "teb"},
+	{.number = 0x8035, .name = "rarp"},
+	{.number = 0x809B, .name = "atalk"},
+	{.number = 0x80F3, .name = "aarp"},
+	{.number = 0x8100, .name = "8021q"},
+	{.number = 0x88BE, .name = "erspan"},
+	{.number = 0x8137, .name = "ipx"},
+	{.number = 0x86DD, .name = "ipv6"},
+	{.number = 0x8808, .name = "pause"},
+	{.number = 0x8809, .name = "slow"},
+	{.number = 0x883E, .name = "wccp"},
+	{.number = 0x8847, .name = "mpls_uc"},
+	{.number = 0x8848, .name = "mpls_mc"},
+	{.number = 0x884c, .name = "atmmpoa"},
+	{.number = 0x8863, .name = "ppp_disc"},
+	{.number = 0x8864, .name = "ppp_ses"},
+	{.number = 0x886c, .name = "link_ctl"},
+	{.number = 0x8884, .name = "atmfate"},
+	{.number = 0x888E, .name = "pae"},
+	{.number = 0x88A2, .name = "aoe"},
+	{.number = 0x88A8, .name = "8021ad"},
+	{.number = 0x88B5, .name = "802_ex1"},
+	{.number = 0x88C7, .name = "preauth"},
+	{.number = 0x88CA, .name = "tipc"},
+	{.number = 0x88CC, .name = "lldp"},
+	{.number = 0x88E5, .name = "macsec"},
+	{.number = 0x88E7, .name = "8021ah"},
+	{.number = 0x88F5, .name = "mvrp"},
+	{.number = 0x88F7, .name = "1588"},
+	{.number = 0x88F8, .name = "ncsi"},
+	{.number = 0x88FB, .name = "prp"},
+	{.number = 0x8906, .name = "fcoe"},
+	{.number = 0x8915, .name = "iboe"},
+	{.number = 0x890D, .name = "tdls"},
+	{.number = 0x8914, .name = "fip"},
+	{.number = 0x8917, .name = "80221"},
+	{.number = 0x892F, .name = "hsr"},
+	{.number = 0x894F, .name = "nsh"},
+	{.number = 0x9000, .name = "loopback"},
+	{.number = 0x9100, .name = "qinq1"},
+	{.number = 0x9200, .name = "qinq2"},
+	{.number = 0x9300, .name = "qinq3"},
+	{.number = 0xDADA, .name = "edsa"},
+	{.number = 0xDADB, .name = "dsa_8021q"},
+	{.number = 0xED3E, .name = "ife"},
+	{.number = 0xFBFB, .name = "af_iucv"},
+	{.number = 0x0600, .name = "802_3_min"}
+};
+const int proto3_len = sizeof(proto3) / sizeof(INET_PROTO);	/* entry count of proto3[] */
+
+
+/****************************************************************************************************
+ *
+ * This is the part for skb bag parse
+ *
+
****************************************************************************************************/
+
+/*
+ * Format the four bytes of @ip, in memory order, as a dotted quad into @dest.
+ *
+ * @dest must have room for 16 bytes ("255.255.255.255" plus the NUL).
+ * Returns 0 on success and 1 if sprintf() reports an error.
+ */
+int i2ip(__be32 ip, char *dest)
+{
+	const u8 *octet = (const u8 *)&ip;
+
+	return sprintf(dest, "%u.%u.%u.%u",
+		       octet[0], octet[1], octet[2], octet[3]) < 0;
+}
+
+/*
+ * Parse the dotted-quad string @ip_str into *@ip; the first octet ends up in
+ * the most significant byte of the result.
+ *
+ * Returns 0 on success, -1 when fewer than four numbers could be read or one
+ * of them is outside 0..255.  *@ip is left untouched on failure.
+ * NOTE(review): trailing characters after the fourth number are not rejected.
+ */
+int ip2i(char *ip_str, u32 *ip)
+{
+	int ip_v[4];
+	u32 ip_tmp = 0;
+	int i;
+
+	if (sscanf(ip_str, "%d.%d.%d.%d",
+		   &ip_v[0], &ip_v[1], &ip_v[2], &ip_v[3]) < 4)
+		return -1;
+
+	for (i = 0; i < 4; i++) {
+		if (ip_v[i] < 0 || ip_v[i] > 255)
+			return -1;
+		ip_tmp += ((u32)ip_v[i]) << ((3 - i) * 8);
+	}
+	*ip = ip_tmp;
+	return 0;
+}
+
+/* Look @proto_str up in @proto[0..len); return its number, or -1 if absent. */
+static
+int str2proto(char *proto_str, const INET_PROTO proto[], int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (streq(proto[i].name, proto_str))
+			return proto[i].number;
+	}
+	return -1;
+}
+
+/* Look @proto up in @protos[0..len); return its name, or NULL if absent. */
+static
+char *proto2str(int proto, const INET_PROTO protos[], int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (protos[i].number == proto)
+			return protos[i].name;
+	}
+	return NULL;
+}
+
+/* Name -> number in the proto3[] table; -1 when the name is unknown. */
+int str2proto3(char *proto_str)
+{
+	return str2proto(proto_str, proto3, proto3_len);
+}
+
+/* Name -> number in the proto4[] table; -1 when the name is unknown. */
+int str2proto4(char *proto_str)
+{
+	return str2proto(proto_str, proto4, proto4_len);
+}
+
+/* Number -> name in the proto3[] table; NULL when the number is unknown. */
+char *proto3tostr(int p)
+{
+	return proto2str(p, proto3, proto3_len);
+}
+
+/* Number -> name in the proto4[] table; NULL when the number is unknown. */
+char *proto4tostr(int p)
+{
+	return proto2str(p, proto4, proto4_len);
+}
+
+/*
+ * Append one letter per set TCP flag to the NUL-terminated string @str:
+ * 'P' (psh), 'R' (rst), 'S' (syn), 'F' (fin), in that order.
+ *
+ * @str is assumed to be a TCP_FLAG_LEN byte buffer - that is the bound the
+ * check below compares against.  If the existing text plus the letters to be
+ * added plus the NUL would not fit, an error is logged and @str is left
+ * unchanged.
+ */
+void flag2str(struct tcphdr *tcp, char *str)
+{
+	size_t len = strlen(str);
+	size_t need = !!tcp->psh + !!tcp->rst + !!tcp->syn + !!tcp->fin;
+
+	/* Account for every letter about to be written, not only the NUL. */
+	if (len + need + 1 > TCP_FLAG_LEN) {
+		log_err("%s: string length(%s) exceeds TCP_FLAG_LEN\n", __func__, str);
+		return;
+	}
+
+	if (tcp->psh)
+		str[len++] = 'P';
+	if (tcp->rst)
+		str[len++] = 'R';
+	if (tcp->syn)
+		str[len++] = 'S';
+	if (tcp->fin)
+		str[len++] = 'F';
+	str[len] = '\0';
+}
+
+/****************************************************************************************************
+ *
+ * This is the part for ip bag match
+ *
+
****************************************************************************************************/
+
+/*
+ * Check one packet against one user rule.
+ *
+ * 'remote' is the rule generated from the IP packet, 'local' is the rule the
+ * user defined.  Only the fields whose FLAG_* bit is set in local->__flags
+ * are compared:
+ *   FLAG_proto_3 / FLAG_proto_4 - exact protocol match;
+ *   FLAG_port  - local->sport must equal the packet's sport or dport;
+ *   FLAG_sport / FLAG_dport - exact port match;
+ *   FLAG_addr  - local->saddr must equal the packet's saddr or daddr after
+ *                masking with local->s_mask;
+ *   FLAG_saddr / FLAG_daddr - masked address match (s_mask / d_mask).
+ *
+ * Returns true if every selected field matches, false otherwise.
+ */
+bool match_rule(COMMON_RULE *remote, COMMON_RULE *local)
+{
+	if ((local->__flags & FLAG_proto_3) && local->proto_3 != remote->proto_3)
+		return false;
+	if ((local->__flags & FLAG_proto_4) && local->proto_4 != remote->proto_4)
+		return false;
+
+	if ((local->__flags & FLAG_port) &&
+	    local->sport != remote->sport &&
+	    local->sport != remote->dport)
+		return false;
+
+	if ((local->__flags & FLAG_dport) && local->dport != remote->dport)
+		return false;
+	if ((local->__flags & FLAG_sport) && local->sport != remote->sport)
+		return false;
+
+	if ((local->__flags & FLAG_addr) &&
+	    local->saddr != (remote->saddr & local->s_mask) &&
+	    local->saddr != (remote->daddr & local->s_mask))
+		return false;
+
+	if ((local->__flags & FLAG_saddr) &&
+	    local->saddr != (remote->saddr & local->s_mask))
+		return false;
+	if ((local->__flags & FLAG_daddr) &&
+	    local->daddr != (remote->daddr & local->d_mask))
+		return false;
+
+	return true;
+}
+
+/*
+ * Check @remote against every rule on rule_list.
+ *
+ * Returns true when the list is empty (no filter configured) or when at
+ * least one rule matches; false otherwise.
+ */
+bool match_all_rule(COMMON_RULE *remote)
+{
+	COMMON_RULE *rule;
+
+	if (list_empty(&rule_list))
+		return true;
+
+	list_for_each_entry(rule, &rule_list, list) {
+		if (match_rule(remote, rule))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Append @rule to rule_list.  A rule that selects no field (__flags == 0) is
+ * rejected with -1; 0 is returned on success.  free_rules() later kfree()s
+ * every queued rule.
+ */
+int add_rule(COMMON_RULE *rule)
+{
+	if (!rule->__flags)
+		return -1;
+
+	list_add_tail(&rule->list, &rule_list);
+	return 0;
+}
+
+/*
+ * Free every rule on rule_list.  Each entry is unlinked before it is freed
+ * so the list head is left empty and valid instead of pointing at freed
+ * memory.
+ */
+void free_rules(void)
+{
+	COMMON_RULE *rule, *next;
+
+	list_for_each_entry_safe(rule, next, &rule_list, list) {
+		list_del(&rule->list);
+		kfree(rule);
+	}
+}
diff --git a/net/nettrace/parser.h b/net/nettrace/parser.h
new file mode 100644
index 000000000000..edbb714a6f43
--- /dev/null
+++ b/net/nettrace/parser.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef IP_PARSER_H
+#define IP_PARSER_H
+
+#include
+#include
+
+/* Buffer size for a dotted-quad string: "255.255.255.255" plus the NUL. */
+#define IP_ADDR_LEN 16
+
+/* Buffer size flag2str() checks its output string against. */
+#define TCP_FLAG_LEN 6
+
+/* flags for network package. */
+#define FLAG_proto_3 (1UL << 0)
+#define FLAG_proto_4 (1UL << 1)
+#define FLAG_sport (1UL << 2)
+#define FLAG_dport (1UL << 3)
+#define FLAG_saddr (1UL << 4)
+#define FLAG_daddr (1UL << 5)
+
+/* flags for match. */
+#define FLAG_addr (1UL << 6)
+#define FLAG_port (1UL << 7)
+
+/*
+ * One match rule.  __flags is a bitmask of FLAG_* values telling which of
+ * the other members are meaningful; match_rule() ignores the rest.
+ */
+typedef
+struct common_rule {
+	struct list_head list;
+	__u16 __flags;
+	__u16 proto_3;
+	__u8 proto_4;
+	__u16 sport;
+	__u16 dport;
+	__u32 saddr;
+	__u32 s_mask;
+	__u32 daddr;
+	__u32 d_mask;
+	__u8 flags;
+} COMMON_RULE;
+
+/*
+ * Store @value in member @flags of @rule and set the matching FLAG_<flags>
+ * bit.  @flags is pasted in as the member name, so
+ * SET_RULE_FLAGS(r, sport, 80) assigns r->sport and sets FLAG_sport.
+ *
+ * Wrapped in do { } while (0) so it acts as a single statement (safe in an
+ * unbraced if/else); invoke it with a trailing semicolon.
+ */
+#define SET_RULE_FLAGS(rule, flags, value) \
+do {\
+	(rule)->flags = (value);\
+	(rule)->__flags |= FLAG_##flags;\
+} while (0)
+
+/* Same parameter type as the definition in parser.c. */
+extern int i2ip(__be32 ip, char *dest);
+
+extern int ip2i(char *ip_str, u32 *ip);
+
+extern int str2proto4(char *proto_str);
+
+extern int str2proto3(char *proto_str);
+
+extern char *proto3tostr(int p);
+
+extern char *proto4tostr(int p);
+
+extern bool match_rule(COMMON_RULE *remote, COMMON_RULE *local);
+
+extern bool match_all_rule(COMMON_RULE *remote);
+
+extern int add_rule(COMMON_RULE *rule);
+
+extern void free_rules(void);
+
+extern void flag2str(struct tcphdr *tcp, char *str);
+
+#endif //IP_PARSER_H
diff --git a/net/nettrace/procfs.c b/net/nettrace/procfs.c
new file
mode 100644
index 000000000000..ae63fd076d00
--- /dev/null
+++ b/net/nettrace/procfs.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "procfs.h"
+#include "group.h"
+#include "dump.h"
+
+/* The "ntrace" directory created by ntrace_proc_init(); NULL when absent. */
+static struct proc_dir_entry *ntrace_proc;
+
+/* "kprobe": list name and probe address of every trace point with a kprobe. */
+static int kprobe_tp_info_seq_show(struct seq_file *seq, void *v)
+{
+	TRACE_POINT *tp;
+	int i = 0;
+
+	seq_puts(seq, "current registered kernel function:\n");
+	for_each_tp(i, tp) {
+		if (!tp->kprobe)
+			continue;
+		seq_printf(seq, "%30s:%p\n", tp->name, tp->kprobe->addr);
+	}
+
+	return 0;
+}
+
+static int kprobe_tp_info_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kprobe_tp_info_seq_show, NULL);
+}
+
+/*
+ * All files below are opened with single_open(), so they must be released
+ * with single_release(): seq_release() alone would leak the seq_operations
+ * that single_open() allocates.
+ */
+static const struct proc_ops kprobe_tp_info_ops = {
+	.proc_open = kprobe_tp_info_seq_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/* "statistics": the three dump loss counters and their sum. */
+static int nettrace_statistics_seq_show(struct seq_file *seq, void *v)
+{
+	seq_printf(seq, "The total dump_loss: %u\n", dump_loss_due_to_no_memory
+			+ dump_skb_over_cnt + dump_skb_over_size);
+	seq_printf(seq, "dump_queue_no_memory: %u\n", dump_loss_due_to_no_memory);
+	seq_printf(seq, "dump_file_over_cnt: %u\n", dump_skb_over_cnt);
+	seq_printf(seq, "dump_file_over_size:%u\n", dump_skb_over_size);
+
+	return 0;
+}
+
+static int nettrace_statistics_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nettrace_statistics_seq_show, NULL);
+}
+
+static const struct proc_ops nettrace_statistics_ops = {
+	.proc_open = nettrace_statistics_seq_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/*
+ * Shared write handler for the integer tunables.
+ *
+ * Copies at most 31 bytes from @buf, strips surrounding whitespace and parses
+ * the text with kstrtoint() (base auto-detected).  A non-negative value is
+ * stored in *@dst.  Returns the number of bytes consumed, -EFAULT if the user
+ * buffer cannot be read, the kstrtoint() error for unparsable input, or
+ * -EINVAL for a negative value.
+ */
+static ssize_t ntrace_write_nonneg_int(const char __user *buf, size_t count,
+				       int *dst)
+{
+	char buffer[32] = { 0 };
+	int value;
+	int err;
+
+	/* Keep the last byte as the NUL terminator. */
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = kstrtoint(strstrip(buffer), 0, &value);
+	if (err)
+		return err;
+	if (value < 0)
+		return -EINVAL;
+
+	*dst = value;
+	return count;
+}
+
+static int max_dump_skb_cnt_seq_show(struct seq_file *seq, void *v)
+{
+	seq_printf(seq, "%d\n", max_dump_skb_cnt);
+	return 0;
+}
+
+static int max_dump_skb_cnt_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, max_dump_skb_cnt_seq_show, NULL);
+}
+
+static ssize_t max_dump_skb_cnt_proc_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *pos)
+{
+	return ntrace_write_nonneg_int(buf, count, &max_dump_skb_cnt);
+}
+
+static const struct proc_ops max_dump_skb_cnt_ops = {
+	.proc_open = max_dump_skb_cnt_proc_open,
+	.proc_read = seq_read,
+	.proc_write = max_dump_skb_cnt_proc_write,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+static int max_dump_file_size_seq_show(struct seq_file *seq, void *v)
+{
+	seq_printf(seq, "%d\n", max_dump_file_size);
+	return 0;
+}
+
+static int max_dump_file_size_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, max_dump_file_size_seq_show, NULL);
+}
+
+static ssize_t max_dump_file_size_proc_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *pos)
+{
+	return ntrace_write_nonneg_int(buf, count, &max_dump_file_size);
+}
+
+static const struct proc_ops max_dump_file_size_ops = {
+	.proc_open = max_dump_file_size_proc_open,
+	.proc_read = seq_read,
+	.proc_write = max_dump_file_size_proc_write,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/*
+ * Create the "ntrace" directory under init_net.proc_net with the entries
+ * kprobe, max_dump_skb_cnt, max_dump_file_size and statistics.
+ *
+ * Returns 0 on success.  If the directory or any entry cannot be created,
+ * whatever was created is removed again and -ENOMEM is returned.
+ */
+int __net_init ntrace_proc_init(void)
+{
+	ntrace_proc = proc_mkdir("ntrace", init_net.proc_net);
+	if (ntrace_proc == NULL)
+		return -ENOMEM;
+
+	if (!proc_create("kprobe", 0444, ntrace_proc, &kprobe_tp_info_ops))
+		goto err_rmdir_ntrace;
+	if (!proc_create("max_dump_skb_cnt", 0644, ntrace_proc, &max_dump_skb_cnt_ops))
+		goto err_rmdir_ntrace;
+	if (!proc_create("max_dump_file_size", 0644, ntrace_proc, &max_dump_file_size_ops))
+		goto err_rmdir_ntrace;
+	if (!proc_create("statistics", 0444, ntrace_proc, &nettrace_statistics_ops))
+		goto err_rmdir_ntrace;
+
+	return 0;
+
+err_rmdir_ntrace:
+	remove_proc_subtree("ntrace", init_net.proc_net);
+	/* Do not keep a pointer to the directory that was just removed. */
+	ntrace_proc = NULL;
+	return -ENOMEM;
+}
+
+/* Remove the "ntrace" directory together with every entry below it. */
+void __net_exit ntrace_proc_exit(void)
+{
+	proc_remove(ntrace_proc);
+	ntrace_proc = NULL;
+}
diff --git a/net/nettrace/procfs.h b/net/nettrace/procfs.h
new file mode 100644
index 000000000000..f61f85b6a4a4
--- /dev/null
+++ b/net/nettrace/procfs.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef NETDUMP_PROCFS_H
+#define NETDUMP_PROCFS_H
+
+extern int __net_init ntrace_proc_init(void);	/* creates the "ntrace" proc directory; 0 or -ENOMEM */
+
+extern void __net_exit ntrace_proc_exit(void);	/* removes what ntrace_proc_init() created */
+
+#endif //NETDUMP_PROCFS_H
diff --git a/net/nettrace/utils.c b/net/nettrace/utils.c
new file mode 100644
index 000000000000..3263f0cbc12e
--- /dev/null
+++ b/net/nettrace/utils.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+
+
+/*
+ * Write @size bytes from @data at the current position of @file and advance
+ * file->f_pos by the number of bytes actually written.
+ *
+ * Short writes are continued from where the previous one stopped.  Returns 0
+ * once everything has been written, the negative error from
+ * __kernel_write() if a write fails, or -EIO if a write makes no progress.
+ */
+int file_append(struct file *file, void *data, unsigned int size)
+{
+	loff_t pos = file->f_pos;
+	const char *cur = data;
+	ssize_t n;
+	int ret = 0;
+
+	while (size > 0) {
+		n = __kernel_write(file, cur, size, &pos);
+		if (n <= 0) {
+			ret = n < 0 ? (int)n : -EIO;
+			break;
+		}
+		/* Continue after the bytes that made it out. */
+		cur += n;
+		size -= n;
+	}
+	file->f_pos = pos;
+	return ret;
+}
+
+/*
+ * Open @path write-only, creating it (mode 0644) or truncating it if it
+ * already exists.  Returns the file, or NULL if filp_open() fails; the
+ * error code is not passed on.
+ */
+struct file *file_create(const char *path)
+{
+	struct file *file;
+
+	file = filp_open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (IS_ERR(file))
+		return NULL;
+
+	return file;
+}
+
+/*
+ * Check that @path can be opened as a directory.  Returns 0 if it can,
+ * otherwise the negative error from filp_open().
+ */
+int access_path(const char *path)
+{
+	struct file *file;
+
+	file = filp_open(path, O_DIRECTORY, 0644);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	filp_close(file, NULL);
+	return 0;
+}
+
+/*
+ * Close a file obtained from file_create().  NULL (file_create()'s failure
+ * value) and error pointers are ignored.
+ */
+void file_close(struct file *file)
+{
+	if (IS_ERR_OR_NULL(file))
+		return;
+	filp_close(file, NULL);
+}
diff --git a/net/nettrace/utils.h b/net/nettrace/utils.h
new file mode 100644
index 000000000000..e9df0cf834a3
--- /dev/null
+++ b/net/nettrace/utils.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Nettrace support.
+ *
+ * Copyright (C) 2025 ZTE Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef NETDUMP_UTILS_H
+#define NETDUMP_UTILS_H
+
+#include
+#include
+
+#define MAX_FILE_NAME 256
+
+/*
+ * Append formatted text to the NUL-terminated string @dest.
+ * @dest is evaluated twice, so it must be free of side effects, and the
+ * write is unbounded: the caller must guarantee that the result fits.
+ */
+#define str_append(dest, fmt, args...) \
+	sprintf((dest) + strlen(dest), fmt, ##args)
+
+extern int file_append(struct file *file, void *data, unsigned int size);
+
+extern struct file *file_create(const char *path);
+
+extern void file_close(struct file *file);
+
+/* Non-zero when the two NUL-terminated strings are equal. */
+static inline int streq(const char *a, const char *b)
+{
+	return strcmp(a, b) == 0;
+}
+
+extern int access_path(const char *path);
+
+#endif //NETDUMP_UTILS_H
--
Gitee