diff --git a/.gitignore b/.gitignore index 0eb1688d64905713d38df17451fb1c8355a84388..2075465eb113360be9e6c52ca62efdc7ff07d0ec 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,8 @@ docs/_build lwcb/pylwcb/.env lwcb/.vscode lwcb/target -lwcb/pylwcb/target \ No newline at end of file +lwcb/pylwcb/target + +src/bpf/*.skel.h +src/profiler/*.skel.h +src/security/*.skel.h \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c2e6b10f7b48f73506a4203710850b3682878814..46c7e98e7073ab788b4cc3ae5012d12bba1a411f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,11 +14,16 @@ option(BUILD_TESTING "Build test cases" OFF) option(BUILD_EXAMPLE "Build example cases" OFF) option(BUILD_LCC "Build lcc project" OFF) OPTION(ENABLE_GCOV "Enable gcov" OFF) +option(ENABLE_ASAN "Enable asan" OFF) +option(ENABLE_PROFILE "Enable profile" ON) +option(ENABLE_STATIC_LINK_ELF "Enable static link libelf" OFF) IF (ENABLE_GCOV) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage") ENDIF() +message(STATUS "coolbpf C compile flags: ${CMAKE_C_FLAGS}") + # compile coolbpf library add_subdirectory(src) # test cases @@ -29,4 +34,4 @@ add_subdirectory(tools) if(BUILD_LCC) add_subdirectory(lcc) add_subdirectory(third) -endif() \ No newline at end of file +endif() diff --git a/scripts/cmake/genskel.cmake b/scripts/cmake/genskel.cmake index 325b581ca9dc0f31860395b78afc4b0f4c2d23c5..32de7dcfa29078380291fc566c525ded4e6de620 100644 --- a/scripts/cmake/genskel.cmake +++ b/scripts/cmake/genskel.cmake @@ -19,6 +19,11 @@ endforeach() message(STATUS "Include Directories: ${include_dirs}") macro(genskel name) + message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") + if (CMAKE_BUILD_TYPE MATCHES Debug) + add_definitions(-COOLBPF_DEBUG) + message(STATUS "add definition: -DCOOLBPF_DEBUG") + endif () SET(BPF_C_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${name}.bpf.c) SET(BPF_O_FILE ${CMAKE_CURRENT_BINARY_DIR}/${name}.bpf.o) SET(BPF_S_FILE ${CMAKE_CURRENT_BINARY_DIR}/${name}.skel.h) @@ -30,6 +35,7 @@ macro(genskel name) COMMENT "Generating BPF object: ${BPF_O_FILE}" ) + message(STATUS "gen skel CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") add_custom_command( OUTPUT ${BPF_S_FILE} COMMAND ${BPFTOOL} gen skeleton ${BPF_O_FILE} > ${BPF_S_FILE} @@ -41,5 +47,7 @@ macro(genskel name) ${name}_skel DEPENDS ${BPF_S_FILE} ) -endmacro() - + # install skeleton headers + message(STATUS "gen skel INSTALL_INCLUDE_DIR: ${INSTALL_INCLUDE_DIR}") + install(FILES ${BPF_S_FILE} DESTINATION ${INSTALL_INCLUDE_DIR}/coolbpf) +endmacro() \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 315f585a417ae64f18b1bf69f73902415c4efde5..0365b5878e39310d381dfbe73571788d6328fda6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,15 +18,6 @@ else(ELF_LIBRARY) message("Not found libelf library: ${ELF_LIBRARY}") endif(ELF_LIBRARY) -add_subdirectory(bpf) -add_subdirectory(profiler) - -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/coolbpf.pc.in - ${CMAKE_CURRENT_BINARY_DIR}/coolbpf.pc @ONLY) - -include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/bpf) - if(NOT DEFINED INSTALL_LIB_DIR) set(INSTALL_LIB_DIR ${CMAKE_INSTALL_PREFIX}/lib) endif() @@ -35,11 +26,24 @@ if(NOT DEFINED INSTALL_INCLUDE_DIR) set(INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_PREFIX}/include) endif() +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/coolbpf.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/coolbpf.pc @ONLY) + +include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} 
${CMAKE_CURRENT_SOURCE_DIR}/bpf + ${CMAKE_CURRENT_BINARY_DIR}/bpf) + +add_subdirectory(bpf) +add_subdirectory(security) +if (ENABLE_PROFILE) + add_subdirectory(profiler) +endif() + file(GLOB sources ${PROJECT_SOURCE_DIR}/third/libbpf/src/*.c ${CMAKE_CURRENT_SOURCE_DIR}/*.c) # share library add_library(coolbpf SHARED ${sources}) add_dependencies(coolbpf net_skel) +add_dependencies(coolbpf security_skel) target_link_libraries(coolbpf PRIVATE ${ELF_LIBRARY} ${Z_LIBRARY} pthread) set_target_properties(coolbpf PROPERTIES VERSION ${VERSION} SOVERSION ${VERSION}) @@ -53,6 +57,15 @@ set_target_properties(coolbpf_static PROPERTIES OUTPUT_NAME "coolbpf") install(TARGETS coolbpf LIBRARY DESTINATION ${INSTALL_LIB_DIR}) install(TARGETS coolbpf_static ARCHIVE DESTINATION ${INSTALL_LIB_DIR}) +# file(GLOB skel_headers ${CMAKE_CURRENT_BINARY_DIR}/bpf/*.skel.h ${CMAKE_CURRENT_BINARY_DIR}/security/*.skel.h) +# install(FILES ${skel_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/coolbpf) + +message(STATUS "in coolbpf/src INSTALL_INCLUDE_DIR: ${INSTALL_INCLUDE_DIR}") + +# install security headers +file(GLOB security_headers security/*.h) +install(FILES ${security_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/coolbpf/security) + # install coolbpf headers file(GLOB headers "*.h") install(FILES ${headers} DESTINATION ${INSTALL_INCLUDE_DIR}/coolbpf) @@ -73,6 +86,8 @@ install(FILES ${uapi_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/coolbpf/linux) install(FILES ${PROJECT_SOURCE_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/vmlinux.h DESTINATION ${INSTALL_INCLUDE_DIR}/coolbpf) +if (ENABLE_PROFILE) # install libprofiler install(FILES ${CMAKE_CURRENT_BINARY_DIR}/profiler/release/libprofiler.so - DESTINATION ${INSTALL_LIB_DIR}) \ No newline at end of file + DESTINATION ${INSTALL_LIB_DIR}) +endif() diff --git a/src/bpf/CMakeLists.txt b/src/bpf/CMakeLists.txt index 5b8ef33a8d13d06b661c02d0cae07583ea6a0d2b..bbb98024018d79236e144d4db5697be36649eb90 100644 --- a/src/bpf/CMakeLists.txt +++ b/src/bpf/CMakeLists.txt @@ -1,3 +1,4 @@ include(${PROJECT_SOURCE_DIR}/scripts/cmake/genskel.cmake) +message(STATUS "net bpf INSTALL_INCLUDE_DIR: ${INSTALL_INCLUDE_DIR}") genskel(net) \ No newline at end of file diff --git a/src/bpf/net.bpf.c b/src/bpf/net.bpf.c index 36837dbb953877c5ccafb6aedd7a699d79917261..1c5d5c90819de89800ba8f29c2bf781c748f1561 100644 --- a/src/bpf/net.bpf.c +++ b/src/bpf/net.bpf.c @@ -1,6 +1,7 @@ #include "vmlinux.h" #include "../coolbpf.h" #include "../net.h" +#include "../ebpf_log.h" #define AF_UNIX 1 #define AF_INET 2 /* Internet IP Protocol */ @@ -169,6 +170,38 @@ struct __uint(max_entries, 1); } connect_info_heap SEC(".maps"); +struct +{ + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct container_id_key); + __uint(max_entries, 1); +} container_id_heap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(max_entries, 1024); + __type(key, __u8[sizeof(struct container_id_key)]); // Need to specify as byte array as wouldn't take struct as key type + __type(value, __u64); + __uint(map_flags, BPF_F_NO_PREALLOC); +} enable_container_ids SEC(".maps"); + +struct +{ + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct self_runtime_info); + __uint(max_entries, 1); + __uint(map_flags, BPF_F_NO_PREALLOC); +} self_runtime_info_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct self_runtime_info); +} self_runtime_info_heap SEC(".maps"); + struct trace_event_raw_sys_enter_comp { 
struct trace_entry ent; @@ -217,6 +250,45 @@ static __always_inline void set_addr_pair_by_sock(struct sock *sk, struct addr_p ap->dport = bpf_ntohs(ap->dport); } +static __always_inline bool match_container_id(struct connect_info_t* conn_info) +{ + u32 index = ContainerIdIndex; + int64_t *cid_prefix_length = bpf_map_lookup_elem(&config_tgid_map, &index); + if (cid_prefix_length == NULL) { + BPF_DEBUG("cid_prefix_length null! pid:%u\n", conn_info->conn_id.tgid); + return true; + } + + u32 trim_len = *cid_prefix_length; + if (trim_len <= 0 || trim_len > KN_NAME_LENGTH) { + BPF_DEBUG("trim_len invalid! pid:%u trim_len:%u\n", conn_info->conn_id.tgid, trim_len); + return false; + } + + if (conn_info->docker_id_length == 0) { + BPF_DEBUG("dockerid length is zero! pid:%u docker_id_length:%u\n", conn_info->conn_id.tgid, conn_info->docker_id_length); + return false; + } + + // check config + u32 zero = 0; + struct container_id_key* prefix = bpf_map_lookup_elem(&container_id_heap, &zero); + if (!prefix) return false; + __builtin_memset(prefix, 0, sizeof(struct container_id_key)); + BPF_DEBUG("after memset! pid:%u, cgroup:%s, real_length:%u \n", conn_info->conn_id.tgid, prefix->data, conn_info->docker_id_length); + bpf_probe_read(prefix->data, CONTAINER_ID_MAX_LENGTH, conn_info->docker_id + trim_len); + prefix->prefixlen = CONTAINER_ID_MAX_LENGTH << 3; + __u64* cid_key = bpf_map_lookup_elem(&enable_container_ids, prefix); + if (cid_key) { + BPF_DEBUG("bingo! pid:%u, cgroup:%s, prefix:%u \n", conn_info->conn_id.tgid, prefix->data, prefix->prefixlen); + // in whitelist + conn_info->cid_key = *cid_key; + return true; + } + BPF_DEBUG("blacklist! pid:%u, cgroup:%s, prefix:%u \n", conn_info->conn_id.tgid, prefix->data, prefix->prefixlen); + return false; +} + static __always_inline enum support_tgid_e match_tgid(const uint32_t tgid) { u32 index = TgidIndex; @@ -249,6 +321,83 @@ static __always_inline enum support_tgid_e match_tgid(const uint32_t tgid) return TgidUnmatch; } +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +static __always_inline const char *get_cgroup_name(const struct cgroup *cgrp) +{ + const char *name; + + if (unlikely(!cgrp)) + return NULL; + + if (BPF_CORE_READ_INTO(&name, cgrp, kn, name) != 0) + return NULL; + + return name; +} + +#define EVENT_ERROR_CGROUP_NAME 0x010000 +#define EVENT_ERROR_CGROUPS 0x100000 +#define EVENT_ERROR_CGROUP_SUBSYSCGRP 0x040000 +#define EVENT_ERROR_CGROUP_SUBSYS 0x080000 +#define VALID_HEX_LENGTH 64 + +// Function to check if a character is a hex digit [a-f0-9] +static __always_inline bool is_hex_digit(char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); +} + +/* Gather current task cgroup name */ +static __always_inline __u32 __event_get_current_cgroup_name(struct cgroup *cgrp, struct connect_info_t * conn_info) +{ + const char *name; + + name = get_cgroup_name(cgrp); + conn_info->docker_id_length = 0; + if (!name) return EVENT_ERROR_CGROUP_NAME; + + int ret = bpf_probe_read_str(conn_info->docker_id, KN_NAME_LENGTH, name); + BPF_DEBUG("pid:%u docker_id:%s ret:%u \n", conn_info->conn_id.tgid, conn_info->docker_id, ret); + conn_info->docker_id_length = ret; + + return name ? 
0 : EVENT_ERROR_CGROUP_NAME; +} + +static __always_inline struct cgroup * +get_task_cgroup(struct task_struct *task) +{ + __u32 subsys_idx = 0; + __u32 flags = 0; + struct cgroup_subsys_state *subsys; + struct css_set *cgroups; + struct cgroup *cgrp = NULL; + + bpf_probe_read(&cgroups, sizeof(cgroups), __builtin_preserve_access_index(&task->cgroups)); + if (unlikely(!cgroups)) { + flags |= EVENT_ERROR_CGROUPS; + return cgrp; + } + + if (unlikely(subsys_idx > pids_cgrp_id)) { + flags |= EVENT_ERROR_CGROUP_SUBSYS; + return cgrp; + } + + bpf_probe_read(&subsys, sizeof(subsys), __builtin_preserve_access_index(&cgroups->subsys[subsys_idx])); + if (unlikely(!subsys)) { + flags |= EVENT_ERROR_CGROUP_SUBSYS; + return cgrp; + } + + bpf_probe_read(&cgrp, sizeof(cgrp), __builtin_preserve_access_index(&subsys->cgroup)); + if (!cgrp) + flags |= EVENT_ERROR_CGROUP_SUBSYSCGRP; + + return cgrp; +} + static __always_inline uint64_t get_start_time() { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); @@ -258,11 +407,24 @@ static __always_inline uint64_t get_start_time() sizeof(struct task_struct *), (uint8_t *)task + gl_off); - uint64_t st_off = offsetof(struct task_struct, start_time); uint64_t start_time = 0; - bpf_probe_read(&start_time, + + if (bpf_core_field_exists(group_leader_ptr->start_time)) + { + uint64_t st_off = offsetof(struct task_struct, start_time); + bpf_probe_read(&start_time, + sizeof(uint64_t), + (uint8_t *)group_leader_ptr + st_off); + } + else if (bpf_core_field_exists(group_leader_ptr->start_boottime)) + { + uint64_t st_off = offsetof(struct task_struct, start_boottime); + bpf_probe_read(&start_time, sizeof(uint64_t), (uint8_t *)group_leader_ptr + st_off); + } else { + start_time = bpf_ktime_get_ns(); + } return start_time; // return nsec_to_clock_t(start_time); @@ -281,7 +443,7 @@ static __always_inline void init_conn_id(uint32_t tgid, conn_id->fd = fd; // currently use kernel time for connection id. 
conn_id->start = bpf_ktime_get_ns(); - ; + // conn_id->start = get_start_time(); } static __always_inline void init_conn_info(uint32_t tgid, @@ -293,6 +455,12 @@ static __always_inline void init_conn_info(uint32_t tgid, conn_info->addr.sa.sa_family = AF_UNKNOWN; conn_info->is_sample = true; conn_info->protocol = ProtoUnknown; + conn_info->cid_key = 0; + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + struct cgroup *cgrp = get_task_cgroup(task); + if (!cgrp) + return; + __event_get_current_cgroup_name(cgrp, conn_info); } static __always_inline int32_t get_buf_32(const char *buf) @@ -912,7 +1080,9 @@ static __always_inline void try_event_output(void *ctx, struct connect_info_t *i struct conn_data_event_t *data = &info->wr_min_ts; data->conn_id = info->conn_id; u64 total_size = (u64)(&data->msg[0]) - (u64)data + info->request_len + info->response_len; - bpf_perf_event_output(ctx, &connect_data_events_map, BPF_F_CURRENT_CPU, data, total_size & (PACKET_MAX_SIZE * 2 - 1)); + if (match_container_id(info)) { + bpf_perf_event_output(ctx, &connect_data_events_map, BPF_F_CURRENT_CPU, data, total_size & (PACKET_MAX_SIZE * 2 - 1)); + } } reset_sock_info(info); } @@ -946,6 +1116,8 @@ static __always_inline struct conn_stats_event_t *add_conn_stats(struct connect_ } event->conn_id = conn_info->conn_id; + event->protocol = conn_info->protocol; + bpf_probe_read_str(event->docker_id, KN_NAME_LENGTH, conn_info->docker_id); event->addr = conn_info->addr; event->role = conn_info->role; event->wr_bytes = conn_info->wr_bytes; @@ -1212,6 +1384,43 @@ static __always_inline enum support_role_e get_sock_role(const struct socket *so return max_ack_backlog == 0 ? IsClient : IsServer; } + +static __always_inline void output_conn_stats(struct trace_event_raw_sys_exit_comp *ctx, + struct connect_info_t *conn_info, + enum support_direction_e direction, + ssize_t return_bytes, bool force) +{ + switch (direction) + { + case DirEgress: + conn_info->wr_bytes += return_bytes; + conn_info->wr_pkts++; + break; + case DirIngress: + conn_info->rd_bytes += return_bytes; + conn_info->rd_pkts++; + break; + } + + uint64_t total_bytes = conn_info->wr_bytes + conn_info->rd_bytes; + uint32_t total_pkts = conn_info->wr_pkts + conn_info->rd_pkts; + + bool real_threshold = (total_bytes >= conn_info->last_output_rd_bytes + conn_info->last_output_wr_bytes + ConnStatsBytesThreshold) || (total_pkts >= conn_info->last_output_rd_pkts + conn_info->last_output_wr_pkts + ConnStatsPacketsThreshold); + if (match_container_id(conn_info) && (real_threshold || force || !conn_info->ever_sent)) + { + struct conn_stats_event_t *event = add_conn_stats(conn_info); + if (event != NULL) + { + bpf_perf_event_output(ctx, &connect_stats_events_map, BPF_F_CURRENT_CPU, event, sizeof(struct conn_stats_event_t)); + } + conn_info->last_output_wr_bytes = conn_info->wr_bytes; + conn_info->last_output_rd_bytes = conn_info->rd_bytes; + conn_info->last_output_wr_pkts = conn_info->wr_pkts; + conn_info->last_output_rd_pkts = conn_info->rd_pkts; + conn_info->ever_sent = true; + } +} + static __always_inline void add_one_conn(struct trace_event_raw_sys_exit_comp *ctx, const struct sockaddr *addr, const struct socket *socket, @@ -1224,6 +1433,10 @@ static __always_inline void add_one_conn(struct trace_event_raw_sys_exit_comp *c return; } + conn_info->ever_sent = false; + + // __builtin_memset(conn_info, 0, sizeof(struct connect_info_t)); + uint32_t tgid = tg_role->tgid; int32_t fd = tg_role->fd; enum support_role_e role = tg_role->role; @@ -1261,6 
+1474,7 @@ static __always_inline void add_one_conn(struct trace_event_raw_sys_exit_comp *c uint64_t tgid_fd = combine_tgid_fd(tgid, fd); // net_bpf_print("start ====add_conn\n"); bpf_map_update_elem(&connect_info_map, &tgid_fd, conn_info, BPF_ANY); + output_conn_stats(ctx, conn_info, DirUnknown, 0, true); bpf_map_update_elem(&socket_pidfd_map, &socket, &tgid_fd, BPF_ANY); if (!need_trace_family(conn_info->addr.sa.sa_family)) { @@ -1282,40 +1496,6 @@ static __always_inline void add_one_conn(struct trace_event_raw_sys_exit_comp *c #endif } -static __always_inline void output_conn_stats(struct trace_event_raw_sys_exit_comp *ctx, - struct connect_info_t *conn_info, - enum support_direction_e direction, - ssize_t return_bytes) -{ - switch (direction) - { - case DirEgress: - conn_info->wr_bytes += return_bytes; - conn_info->wr_pkts++; - break; - case DirIngress: - conn_info->rd_bytes += return_bytes; - conn_info->rd_pkts++; - break; - } - - uint64_t total_bytes = conn_info->wr_bytes + conn_info->rd_bytes; - uint32_t total_pkts = conn_info->wr_pkts + conn_info->rd_pkts; - - bool real_threshold = (total_bytes >= conn_info->last_output_rd_bytes + conn_info->last_output_wr_bytes + ConnStatsBytesThreshold) || (total_pkts >= conn_info->last_output_rd_pkts + conn_info->last_output_wr_pkts + ConnStatsPacketsThreshold); - if (real_threshold) - { - struct conn_stats_event_t *event = add_conn_stats(conn_info); - if (event != NULL) - { - bpf_perf_event_output(ctx, &connect_stats_events_map, BPF_F_CURRENT_CPU, event, sizeof(struct conn_stats_event_t)); - } - conn_info->last_output_wr_bytes = conn_info->wr_bytes; - conn_info->last_output_rd_bytes = conn_info->rd_bytes; - conn_info->last_output_wr_pkts = conn_info->wr_pkts; - conn_info->last_output_rd_pkts = conn_info->rd_pkts; - } -} static __always_inline void add_close_event(struct trace_event_raw_sys_exit_comp *ctx, struct connect_info_t *conn_info) { @@ -1325,7 +1505,7 @@ static __always_inline void add_close_event(struct trace_event_raw_sys_exit_comp ctrl_event.conn_id = conn_info->conn_id; ctrl_event.close.rd_bytes = conn_info->rd_bytes; ctrl_event.close.wr_bytes = conn_info->wr_bytes; - if (conn_info->is_sample) + // if (conn_info->is_sample) { bpf_perf_event_output(ctx, &connect_ctrl_events_map, BPF_F_CURRENT_CPU, &ctrl_event, sizeof(struct conn_ctrl_event_t)); @@ -1641,8 +1821,8 @@ static __always_inline void trace_exit_data(struct trace_event_raw_sys_exit_comp handle_server_recv_request(conn_info); } } + output_conn_stats(ctx, conn_info, direction, return_bytes, false); try_event_output(ctx, conn_info, direction); - output_conn_stats(ctx, conn_info, direction, return_bytes); // if (!conn_info->is_sample) // { // return; @@ -1894,11 +2074,8 @@ int BPF_KPROBE(tcp_close, struct sock *sk) * only family is AF_UNIX and no data will no report, but the bytes will be * recorded in first data event and report to user */ - if (need_trace_family(conn_info->addr.sa.sa_family) || - conn_info->wr_bytes != 0 || - conn_info->rd_bytes != 0) + if (match_container_id(conn_info) && need_trace_family(conn_info->addr.sa.sa_family)) { - add_close_event(ctx, conn_info); if (conn_info->last_output_rd_pkts + conn_info->last_output_wr_pkts != conn_info->rd_pkts + conn_info->wr_pkts) { @@ -2444,6 +2621,38 @@ int tp_sys_enter_recvmsg(struct trace_event_raw_sys_enter_comp *ctx) return 0; } +SEC("uprobe/ebpf_get_self_runtime_info") +int ebpf_get_self_runtime_info(struct pt_regs *ctx) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + struct cgroup 
*cgrp = get_task_cgroup(task); + if (!cgrp) + return 1; + + const char *name; + + name = get_cgroup_name(cgrp); + if (!name) return EVENT_ERROR_CGROUP_NAME; + + int key = 0; + struct self_runtime_info *runtime_info; + runtime_info = bpf_map_lookup_elem(&self_runtime_info_heap, &key); + if (runtime_info == NULL) { + return 1; + } + + int ret = bpf_probe_read_str(runtime_info->docker_id, KN_NAME_LENGTH, name); + runtime_info->docker_id_length = ret; + BPF_DEBUG("[uprobe][ebpf_get_self_runtime_info] docker_id:%s ret:%u \n", cgroup_name, ret); + runtime_info->pid = bpf_get_current_pid_tgid() >> 32; + + + bpf_map_update_elem(&self_runtime_info_map, &key, runtime_info, BPF_ANY); + + // not found + return 0; +} + SEC("uprobe/ebpf_cleanup_dog") int cleanup_dog_probe(struct pt_regs *ctx) { diff --git a/src/coolbpf.c b/src/coolbpf.c index ca39be99fc1f28071b4c3d814482f369aaa3a426..df3cc32e37ff996a0396df5fb1830dbcde4cc01d 100644 --- a/src/coolbpf.c +++ b/src/coolbpf.c @@ -68,7 +68,7 @@ struct coolbpf_object *__coolbpf_object_open(skel_open open, skel_load load, void *skel_obj = open(); if (!skel_obj) { free(obj); - error("failed to open skeleton object\n"); + log_error("failed to open skeleton object\n"); return NULL; } @@ -135,14 +135,14 @@ void *perf_thread_worker(void *ctx) err = libbpf_get_error(pb); if (err) { - error("error new perf buffer: %s\n", strerror(-err)); + log_error("error new perf buffer: %s\n", strerror(-err)); return NULL; } if (!pb) { err = -errno; - error("failed to open perf buffer: %d\n", err); + log_error("failed to open perf buffer: %d\n", err); return NULL; } @@ -151,7 +151,7 @@ void *perf_thread_worker(void *ctx) err = perf_buffer__poll(pb, timeout_ms); if (err < 0 && err != -EINTR) { - error("error polling perf buffer: %s\n", strerror(-err)); + log_error("error polling perf buffer: %s\n", strerror(-err)); goto cleanup; } diff --git a/src/coolbpf.h b/src/coolbpf.h index ea30c97fa9b37e19654a4b4d70b04771a0a0ae71..98f4d97394617a2e8166b0c82fd2940736866057 100644 --- a/src/coolbpf.h +++ b/src/coolbpf.h @@ -54,12 +54,12 @@ struct coolbpf_object int __err = 0; \ struct coolbpf_object *__cb = calloc(1, sizeof(struct coolbpf_object)); \ if (!__cb) { \ - error("failed to allocate memory for coolbpf_object\n"); \ + log_error("failed to allocate memory for coolbpf_object\n"); \ goto __real_out; \ } \ struct skel##_bpf *skel_obj = skel##_bpf__open(); \ if (!skel_obj) { \ - error("failed to open CoolBPF object\n"); \ + log_error("failed to open CoolBPF object\n"); \ goto __failed_out; \ } \ __cb->skel_load = skel##_bpf__load; \ @@ -71,13 +71,13 @@ struct coolbpf_object __cb->ctx = _ctx; \ __err = coolbpf_object_load(__cb); \ if (__err) { \ - error("failed to load CoolBPF object: %d\n", __err); \ + log_error("failed to load CoolBPF object: %d\n", __err); \ coolbpf_object_destroy(__cb); \ goto __failed_out; \ } \ __err = coolbpf_object_attach(__cb); \ if (__err) { \ - error("failed to attach CoolBPF object: %d\n", __err); \ + log_error("failed to attach CoolBPF object: %d\n", __err); \ coolbpf_object_destroy(__cb); \ goto __failed_out; \ } \ diff --git a/src/ebpf_log.h b/src/ebpf_log.h new file mode 100644 index 0000000000000000000000000000000000000000..fdca4685f1d4c3feddfede20b1e1b0d6b8a8f40d --- /dev/null +++ b/src/ebpf_log.h @@ -0,0 +1,13 @@ +#pragma once + +#define BPF_NO_GLOBAL_DATA + +/* Macro to output debug logs to /sys/kernel/debug/tracing/trace_pipe + */ +#ifdef COOLBPF_DEBUG +#include +#define BPF_DEBUG(__fmt, ...) 
bpf_printk(__fmt, ##__VA_ARGS__) +#else +// No op +#define BPF_DEBUG(__fmt, ...) +#endif \ No newline at end of file diff --git a/src/log.h b/src/log.h index 34735c55efa8b6728b010ed6eea433df612f8b59..656b8ef1377bc3d18687bf8e28b9e7765e384ab1 100644 --- a/src/log.h +++ b/src/log.h @@ -36,12 +36,12 @@ enum { LOG_TRACE, LOG_DEBUG, LOG_INFO, LOG_WARN, LOG_ERROR, LOG_FATAL }; COOLBPF_API void coolbpf_set_loglevel(int level); -#define trace(...) log_log(LOG_TRACE, __FILE__, __LINE__, __VA_ARGS__) -#define debug(...) log_log(LOG_DEBUG, __FILE__, __LINE__, __VA_ARGS__) -#define info(...) log_log(LOG_INFO, __FILE__, __LINE__, __VA_ARGS__) -#define warn(...) log_log(LOG_WARN, __FILE__, __LINE__, __VA_ARGS__) -#define error(...) log_log(LOG_ERROR, __FILE__, __LINE__, __VA_ARGS__) -#define fatal(...) log_log(LOG_FATAL, __FILE__, __LINE__, __VA_ARGS__) +#define log_trace(...) log_log(LOG_TRACE, __FILE__, __LINE__, __VA_ARGS__) +#define log_debug(...) log_log(LOG_DEBUG, __FILE__, __LINE__, __VA_ARGS__) +#define log_info(...) log_log(LOG_INFO, __FILE__, __LINE__, __VA_ARGS__) +#define log_warn(...) log_log(LOG_WARN, __FILE__, __LINE__, __VA_ARGS__) +#define log_error(...) log_log(LOG_ERROR, __FILE__, __LINE__, __VA_ARGS__) +#define log_fatal(...) log_log(LOG_FATAL, __FILE__, __LINE__, __VA_ARGS__) const char* log_level_string(int level); void log_set_lock(log_LockFn fn, void *udata); diff --git a/src/net.c b/src/net.c index 19a5416724fb0ba8ba5f39cfb2075ae5b6e7c495..3aadb24d59f38401c4453969352cb19d4f980685 100644 --- a/src/net.c +++ b/src/net.c @@ -87,6 +87,7 @@ static struct net_env_t struct perf_buffer *pbs[MAX_HAND]; struct callback_t callback[MAX_HAND]; int32_t page_count[MAX_HAND]; + int32_t cid_prefix_len; struct lost_callback_t lost_callback; net_print_fn_t libbpf_print; char version[64]; @@ -252,6 +253,19 @@ static void handle_lost_stat_event(void *ctx, int cpu, __u64 lost_cnt) } } +static int user_config_cid(int config_fd) +{ + int ret; + uint32_t index = ContainerIdIndex; + ret = bpf_map_update_elem(config_fd, &index, &env.cid_prefix_len, BPF_ANY); + if (ret) + net_log(LOG_TYPE_WARN, "Could not update map for cid prefix len %d: %s\n", env.cid_prefix_len, strerror(-ret)); + else + net_log(LOG_TYPE_INFO, "success to update map for cid prefix len: %d\n", env.cid_prefix_len); + + return ret; +} + static int user_config_tgid(int config_fd) { int ret; @@ -324,6 +338,38 @@ static void get_btf_path(void) pclose(fp); } +int32_t ebpf_init_self_runtime_info(char *so, long offset, struct self_runtime_info* info) { + // attach + struct net_bpf *obj = env.obj; + int ret; + + obj->links.ebpf_get_self_runtime_info = bpf_program__attach_uprobe(obj->progs.ebpf_get_self_runtime_info, false, + 0, so, offset); // 0 for self + ret = libbpf_get_error(obj->links.ebpf_get_self_runtime_info); + if (ret != 0) + { + net_log(LOG_TYPE_WARN, "uprobe get_self_runtime_info failed\n"); + return ret; + } + + net_log(LOG_TYPE_INFO, "successfully attach uprobe get_self_runtime_info\n"); + + // trigger + get_self_runtime_info(); + + // read from bpf maps ... 
+ int map_fd = bpf_map__fd(obj->maps.self_runtime_info_map); + + int key = 0; + ret = bpf_map_lookup_elem(map_fd, &key, info); + if (ret && errno != ENOENT) { + net_log(LOG_TYPE_WARN, "failed to lookup element in self_runtime_info_map: %s\n", strerror(errno)); + return ret; + } + + return 0; +} + int32_t ebpf_init(char *btf, int32_t btf_size, char *so, int32_t so_size, long uprobe_offset, long upca_offset, long upps_offset, long upcr_offset) { @@ -355,6 +401,7 @@ int32_t ebpf_init(char *btf, int32_t btf_size, char *so, int32_t so_size, long u bpf_program__set_autoattach(obj->progs.disable_process_probe, false); bpf_program__set_autoattach(obj->progs.update_conn_role_probe, false); bpf_program__set_autoattach(obj->progs.update_conn_addr_probe, false); + bpf_program__set_autoattach(obj->progs.ebpf_get_self_runtime_info, false); err = net_bpf__attach(obj); if (err) { @@ -505,6 +552,11 @@ void ebpf_config(int32_t opt1, int32_t opt2, int32_t params_count, value = (int32_t *)(params[0]); env.page_count[opt2] = *value; break; + case CONTAINER_ID_FILTER: + value = (int32_t *)(params[0]); + env.cid_prefix_len = *value; + user_config_cid(bpf_map__fd(obj->maps.config_tgid_map)); + break; defaults: user_config_proto(bpf_map__fd(obj->maps.config_protocol_map)); user_config_tgid(bpf_map__fd(obj->maps.config_tgid_map)); @@ -513,7 +565,7 @@ void ebpf_config(int32_t opt1, int32_t opt2, int32_t params_count, } } -int32_t ebpf_poll_events(int32_t max_events, int32_t *stop_flag) +int32_t ebpf_poll_events(int32_t max_events, int32_t *stop_flag, int timeout_ms) { int j; /* 100 times one by one ?*/ @@ -522,7 +574,7 @@ int32_t ebpf_poll_events(int32_t max_events, int32_t *stop_flag) { if (g_poll_callback_count < max_events && !*stop_flag) { - int rst = perf_buffer__poll(env.pbs[j], 0); + int rst = perf_buffer__poll(env.pbs[j], timeout_ms); if (rst < 0 && errno != EINTR) { net_log(LOG_TYPE_WARN, "Error polling perf buffer: %d, hand_type:%d\n", @@ -660,3 +712,35 @@ void ebpf_disable_process(uint32_t pid, bool drop) void ebpf_update_conn_role(struct connect_id_t *conn_id, enum support_role_e role_type) { } + +void get_self_runtime_info() {} + +bool ebpf_set_cid_filter(const char* container_id, size_t length, uint64_t cid_key, bool update) +{ + struct net_bpf *obj = env.obj; + int map_fd = bpf_map__fd(obj->maps.enable_container_ids); + + // Prepare the key for update/delete + struct container_id_key key = { + .prefixlen = CONTAINER_ID_MAX_LENGTH * 8 // Full length as prefix length in bits + }; + memset(key.data, 0, CONTAINER_ID_MAX_LENGTH); + memcpy(key.data, container_id, length); + bool ret; + + if (update) { + ret = bpf_map_update_elem(map_fd, &key, &cid_key, BPF_ANY); + if (ret) { + net_log(LOG_TYPE_WARN, "Failed to update element: %s\n", strerror(errno)); + return false; + } + } else { + ret = bpf_map_delete_elem(map_fd, &key); + if (ret) { + net_log(LOG_TYPE_WARN, "Failed to delete element: %s\n", strerror(errno)); + return false; + } + } + + return true; +} diff --git a/src/net.h b/src/net.h index e38dc0a418ba4c85ccc560251c11d422a8686fbe..dfc4d08ac975e89647ea69efd94caeb0425599ba 100644 --- a/src/net.h +++ b/src/net.h @@ -6,13 +6,14 @@ #ifndef COOLBPF_NET_H #define COOLBPF_NET_H +#if defined(__linux__) #ifndef __VMLINUX_H__ #include #include #include #include #endif - +#endif // request or reponse #define PACKET_MAX_SIZE 8192 @@ -43,6 +44,7 @@ enum support_role_e enum tgid_config_e { TgidIndex = 0, + ContainerIdIndex = 1, TgidNum, }; @@ -151,6 +153,7 @@ union sockaddr_t struct sockaddr_in6 in6; }; +#define 
KN_NAME_LENGTH 128 struct connect_id_t { int32_t fd; @@ -188,7 +191,9 @@ struct conn_stats_event_t struct connect_id_t conn_id; union sockaddr_t addr; struct socket_info si; + enum support_proto_e protocol; enum support_role_e role; + char docker_id[KN_NAME_LENGTH]; int64_t wr_bytes; int64_t rd_bytes; int32_t wr_pkts; @@ -203,9 +208,11 @@ struct conn_stats_event_t struct conn_data_event_t { struct connect_id_t conn_id; + uint64_t cid_key; uint64_t start_ts; uint64_t end_ts; enum support_proto_e protocol; + enum support_role_e role; uint16_t request_len; uint16_t response_len; #ifdef __VMLINUX_H__ @@ -215,6 +222,12 @@ struct conn_data_event_t #endif }; +struct self_runtime_info { + uint32_t pid; + char docker_id[KN_NAME_LENGTH]; + int32_t docker_id_length; +}; + #ifdef __VMLINUX_H__ struct connect_info_t @@ -222,8 +235,9 @@ struct connect_info_t struct connect_id_t conn_id; union sockaddr_t addr; struct socket_info si; - enum support_role_e role; enum support_type_e type; + int32_t docker_id_length; + char docker_id[KN_NAME_LENGTH]; int64_t wr_bytes; int64_t rd_bytes; int32_t wr_pkts; @@ -237,6 +251,7 @@ struct connect_info_t size_t prev_count; char prev_buf[4]; bool try_to_prepend; + bool ever_sent; bool is_sample; uint64_t rt; @@ -244,9 +259,11 @@ struct connect_info_t uint64_t rd_max_ts; uint64_t wr_min_ts; uint64_t wr_max_ts; + uint64_t cid_key; uint64_t start_ts; uint64_t end_ts; enum support_proto_e protocol; + enum support_role_e role; uint16_t request_len; uint16_t response_len; char msg[PACKET_MAX_SIZE * 3]; @@ -302,13 +319,19 @@ struct config_info_t int32_t data_sample; }; +#define CONTAINER_ID_MAX_LENGTH 64 +struct container_id_key { + uint32_t prefixlen; + uint8_t data[CONTAINER_ID_MAX_LENGTH]; +}; + #ifndef __VMLINUX_H__ enum callback_type_e { - CTRL_HAND = 0, + STAT_HAND = 0, INFO_HANDLE, - STAT_HAND, + CTRL_HAND, #ifdef NET_TEST TEST_HAND, #endif @@ -344,6 +367,7 @@ enum ebpf_config_primary_e // 采样的策略:tcp的包,连接建立的ns时间 % 100, 小于采样率即为需要上传,大于的话对该连接进行标记,不上传Data、Ctrl(统计数据还是要上传) // udp的包,接收到数据包的ns时间 % 100, 小于采样率即为需要上传,大于的话不上传Data(统计数据还是要上传 @note 要注意统计数据Map的清理策略) PERF_BUFFER_PAGE, // ring buffer page count, 默认128个页,也就是512KB, opt2 的类型是 callback_type_e + CONTAINER_ID_FILTER, // container id filter, 不配置则全部采集,如果需要开启,则 value 需要设置为 cgroup name 的前缀长度 }; // opt1 列表: // AddProtocolFilter、RemoveProtocolFilter @@ -380,7 +404,7 @@ void ebpf_config(int32_t opt1, int32_t opt2, int32_t params_count, void **params * @param stop_flag 是否需要立即退出 * @return int32_t 正数,返回处理的事件数; -100,stop_flag触发;其他,错误码 */ -int32_t ebpf_poll_events(int32_t max_events, int32_t *stop_flag); +int32_t ebpf_poll_events(int32_t max_events, int32_t *stop_flag, int timeout_ms); // 启动时,会调用init,然后调用start /* @@ -425,8 +449,15 @@ void ebpf_update_conn_addr(struct connect_id_t *conn_id, union sockaddr_t *dest_ // 更新process 观察范围,动态增加pid,drop 为true 是进行删除操作。 void ebpf_disable_process(uint32_t pid, bool drop); +// 更新containerid 观察范围,动态增加 container id,drop 为true 是进行删除操作。 +bool ebpf_set_cid_filter(const char* container_id, size_t length, uint64_t cid_key, bool update); + // 更新conn对应的角色,某些协议内核态无法判断角色 void ebpf_update_conn_role(struct connect_id_t *conn_id, enum support_role_e role_type); +void get_self_runtime_info(); + +int32_t ebpf_init_self_runtime_info(char *so, long offset, struct self_runtime_info* info); + #endif #endif diff --git a/src/profiler/Cargo.lock b/src/profiler/Cargo.lock index 9b6b5695097aaf7720534bb4c5f4683f86e7b741..80738045c896046a9b28fd7be111d7dcd6649a0f 100644 --- a/src/profiler/Cargo.lock +++ b/src/profiler/Cargo.lock 
@@ -1127,6 +1127,17 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "pagemap" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9b10bd736861cab1e4a800e7547d08f5103929b3549a45a23408aaff0c1ee71" +dependencies = [ + "bitflags 1.3.2", + "libc", + "thiserror", +] + [[package]] name = "parking_lot" version = "0.12.3" @@ -1280,6 +1291,7 @@ dependencies = [ "num_cpus", "object", "once_cell", + "pagemap", "perf-event-open-sys", "procfs", "protobuf", diff --git a/src/profiler/Cargo.toml b/src/profiler/Cargo.toml index 34983b66caf1d392e96805817c94f387c68e594e..02fed57ec69a86ca743099b4440ec049afe3cc64 100644 --- a/src/profiler/Cargo.toml +++ b/src/profiler/Cargo.toml @@ -29,6 +29,7 @@ moka = { version = "0.12.10", features = ["sync"] } num_cpus = "1.16.0" object = "0.36.1" once_cell = "1.19.0" +pagemap = "0.1.0" perf-event-open-sys = "4.0.0" procfs = "0.16.0" protobuf = "3.5.1" diff --git a/src/profiler/build.rs b/src/profiler/build.rs index 5666c0c42b6edf3bbc53a7b4c9f82b5e7627a85f..abe508d977c3796d40577e8e4ec45459c570f65b 100644 --- a/src/profiler/build.rs +++ b/src/profiler/build.rs @@ -8,6 +8,7 @@ fn generate_skeleton(out: &mut PathBuf, name: &str) { out.push(&rs_name); SkeletonBuilder::new() .source(&c_path) + .clang_args(["-DHAS_APM"]) .build_and_generate(&out) .unwrap(); @@ -22,6 +23,7 @@ fn generate_header(out: &mut PathBuf, name: &str) { out.push(&rs_name); let bindings = bindgen::Builder::default() .header(&header_path) + .clang_args(["-DHAS_APM"]) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .generate() .unwrap(); diff --git a/src/profiler/src/bin/heatmap.rs b/src/profiler/src/bin/heatmap.rs index a8ac780c9501f2eed7d091793b88686749f328b0..dde5b58010a68bfdc0db36f8defd241e4ecf4a69 100644 --- a/src/profiler/src/bin/heatmap.rs +++ b/src/profiler/src/bin/heatmap.rs @@ -7,7 +7,7 @@ use std::ffi::CString; use structopt::StructOpt; #[derive(Debug, StructOpt)] -#[structopt(name = "rtrace", about = "Diagnosing tools of kernel network")] +#[structopt(name = "heatmap", about = "show cpu heatmap")] pub struct Command { #[structopt(long, help = "Specify the Pid of the tracking process")] pid: u32, diff --git a/src/profiler/src/bpf/interpreter_dispatcher.bpf.c b/src/profiler/src/bpf/interpreter_dispatcher.bpf.c index 4308cdeab033f0b6ece3b2834ab9bba7420f6877..12c90fd2a7e640b8b954afb22b20db52d30e16d3 100644 --- a/src/profiler/src/bpf/interpreter_dispatcher.bpf.c +++ b/src/profiler/src/bpf/interpreter_dispatcher.bpf.c @@ -34,6 +34,27 @@ void maybe_add_apm_info(Trace *trace) { DEBUG_PRINT("Trace is within a process with APM integration enabled"); + if (proc->tracing_type == TRACE_GO_AGENT) { + const struct task_struct* task_ptr = (struct task_struct*)bpf_get_current_task(); + const void* fs_base; + bpf_probe_read(&fs_base, sizeof(void *), &task_ptr->thread.fsbase); + + size_t g_addr; // address of struct runtime.g + bpf_probe_read_user(&g_addr, sizeof(void*), (void*)(fs_base + (-8))); + + size_t go_string_addr; // address of field traceId in runtime.g + bpf_probe_read_user(&go_string_addr, sizeof(void*), (void*)(g_addr + proc->tracing_field_offset + 8)); + + size_t trace_id_addr; + bpf_probe_read_user(&trace_id_addr, sizeof(void*), (void*)(go_string_addr + 0)); + + const char trace_id[32]; + bpf_probe_read_user(trace_id, sizeof(trace_id), (void*)(trace_id_addr)); + + __builtin_memcpy(trace->trace_id, trace_id, 
sizeof(trace->trace_id)); + return; + } + u64 tsd_base; if (tsd_get_base((void **)&tsd_base) != 0) { increment_metric(metricID_UnwindApmIntErrReadTsdBase); diff --git a/src/profiler/src/bpf/native_stack.bpf.c b/src/profiler/src/bpf/native_stack.bpf.c index e8c7edbe09332facbc3c7d4ef6909d65d1ec3d4a..305f8df9e7a5f9d1f1eb305971c4fdb027b7418d 100644 --- a/src/profiler/src/bpf/native_stack.bpf.c +++ b/src/profiler/src/bpf/native_stack.bpf.c @@ -668,10 +668,20 @@ static inline ErrorCode copy_state_regs(UnwindState *state, // Check if the process is running in 32-bit mode on the x86_64 system. // This check follows the Linux kernel implementation of user_64bit_mode() in // arch/x86/include/asm/ptrace.h. - if (regs->cs == __USER32_CS) - { - return ERR_NATIVE_X64_32BIT_COMPAT_MODE; + if (bpf_core_field_size(regs->cs) == 2) { + u16 cs; + bpf_probe_read_kernel(&cs, sizeof(cs), ®s->cs); + if (cs == __USER32_CS) + { + return ERR_NATIVE_X64_32BIT_COMPAT_MODE; + } + } else { + if (regs->cs == __USER32_CS) + { + return ERR_NATIVE_X64_32BIT_COMPAT_MODE; + } } + state->pc = regs->ip; state->sp = regs->sp; state->fp = regs->bp; @@ -731,9 +741,18 @@ static inline bool ptregs_is_usermode(struct pt_regs *regs) { #if defined(__x86_64__) // On x86_64 the user mode SS should always be __USER_DS. - if (regs->ss != __USER_DS) - { - return false; + if (bpf_core_field_size(regs->ss) == 2) { + u16 ss; + bpf_probe_read_kernel(&ss, sizeof(ss), ®s->ss); + if (ss != __USER_DS) + { + return false; + } + } else { + if (regs->ss != __USER_DS) + { + return false; + } } return true; #elif defined(__aarch64__) diff --git a/src/profiler/src/bpf/tracemgmt.h b/src/profiler/src/bpf/tracemgmt.h index b222ae793d68d805fd681dddacd187d9ec3b6172..8b0a5c23d43b1ff3a99d4cd26b3dedf2d8fb58de 100644 --- a/src/profiler/src/bpf/tracemgmt.h +++ b/src/profiler/src/bpf/tracemgmt.h @@ -248,6 +248,7 @@ static inline PerCPURecord *get_pristine_per_cpu_record() trace->apm_trace_id.as_int.hi = 0; trace->apm_trace_id.as_int.lo = 0; trace->apm_transaction_id.as_int = 0; + __builtin_memset(trace->trace_id, 0, TRACE_ID_LEN); #endif return record; } diff --git a/src/profiler/src/bpf/types.h b/src/profiler/src/bpf/types.h index 1111abc30e5ec87055c69831491f657b26e99ade..8b302e422fe7fc0d7e44b98443e25fb8c3bbd782 100644 --- a/src/profiler/src/bpf/types.h +++ b/src/profiler/src/bpf/types.h @@ -501,6 +501,8 @@ typedef struct V8ProcInfo { // COMM_LEN defines the maximum length we will receive for the comm of a task. #define COMM_LEN 16 +#define TRACE_ID_LEN 32 + #ifdef HAS_APM // 128-bit APM trace ID. typedef union ApmTraceID { @@ -550,6 +552,8 @@ typedef struct Trace { ApmSpanID apm_transaction_id; // APM trace ID or all-zero if not present. ApmTraceID apm_trace_id; + // General trace id + unsigned char trace_id[TRACE_ID_LEN]; #endif // The kernel stack ID. 
s32 kernel_stack_id; @@ -883,8 +887,15 @@ typedef struct SystemConfig { #define PSR_MODE_MASK 0x0000000f #define PSR_MODE_EL0t 0x00000000 +typedef enum TracingType { + TRACE_NONE, + TRACE_GO_AGENT, +} TracingType; + typedef struct ApmIntProcInfo { u64 tls_offset; + TracingType tracing_type; + u64 tracing_field_offset; } ApmIntProcInfo; #endif diff --git a/src/profiler/src/heatmap.rs b/src/profiler/src/heatmap.rs index 369179b705e3dcdbaff25554256da94e9b5ec2d4..bf632de51f3bcad1e10354365f127d4e68147173 100644 --- a/src/profiler/src/heatmap.rs +++ b/src/profiler/src/heatmap.rs @@ -1,3 +1,4 @@ +use crate::get_host_root_path; use std::collections::HashMap; use std::collections::LinkedList; use std::fs::read_to_string; @@ -34,7 +35,7 @@ pub struct ProcessHeatMap { impl ProcessHeatMap { pub fn add_process(&mut self, pid: u32) { self.heat_maps.entry(pid).or_insert_with(|| { - let comm = match read_to_string(format!("/proc/{pid}/comm")) { + let comm = match read_to_string(format!("{}/proc/{pid}/comm", get_host_root_path())) { Ok(mut comm) => { comm.pop(); comm @@ -57,7 +58,7 @@ impl ProcessHeatMap { // find it in previous heatmap if base < heat.base { for single in self.done.iter_mut() { - if single.base == base { + if pid == single.pid && single.base == base { single.inc(slot); return; } diff --git a/src/profiler/src/lib.rs b/src/profiler/src/lib.rs index 308ec0f7f188206d3460a11634b54c991e43c9f6..2ef76fff50ebca11dacd3d39665781da1ae4a720 100644 --- a/src/profiler/src/lib.rs +++ b/src/profiler/src/lib.rs @@ -5,6 +5,7 @@ use std::ffi::CString; use std::sync::atomic::AtomicBool; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; +use std::sync::OnceLock; pub mod error; pub mod executable; pub mod interpreter; @@ -29,6 +30,12 @@ pub static ENABLE_SYMBOLIZER: AtomicBool = AtomicBool::new(true); pub static SYMBOL_FILE_MAX_SIZE: AtomicU64 = AtomicU64::new(u64::MAX); pub static LIVETRACE_ENABLE_CPU_INFO: AtomicBool = AtomicBool::new(false); pub static LIVETRACE_ENABLE_FUNCTION_OFFSET: AtomicBool = AtomicBool::new(false); +pub static HOST_ROOT_PATH: OnceLock = OnceLock::new(); +pub static ENABLE_TRACING: AtomicBool = AtomicBool::new(false); + +pub fn get_host_root_path() -> &'static str { + HOST_ROOT_PATH.get().map(|s| s.as_str()).unwrap_or("/") +} pub fn is_enable_cpuno() -> bool { LIVETRACE_ENABLE_CPU_INFO.load(Ordering::SeqCst) @@ -50,6 +57,10 @@ pub fn symbol_file_max_size() -> u64 { SYMBOL_FILE_MAX_SIZE.load(Ordering::SeqCst) } +pub fn is_enable_tracing() -> bool { + ENABLE_TRACING.load(Ordering::SeqCst) +} + pub fn symbol_file_max_symbols() -> u64 { let sz = symbol_file_max_size(); if sz == u64::MAX { @@ -102,6 +113,25 @@ pub extern "C" fn livetrace_disable_symbolizer() { ENABLE_SYMBOLIZER.store(false, Ordering::SeqCst); } +#[no_mangle] +pub extern "C" fn livetrace_set_host_root_path(path: *const libc::c_char) -> i32 { + let path = unsafe { CStr::from_ptr(path) }; + let path = match path.to_str() { + Ok(path) => path, + Err(_e) => { + return -1; + } + }; + let path = path.to_string(); + HOST_ROOT_PATH.set(path); + 0 +} + +#[no_mangle] +pub extern "C" fn livetrace_enable_tracing() { + ENABLE_TRACING.store(true, Ordering::SeqCst); +} + #[no_mangle] pub extern "C" fn livetrace_profiler_create() -> *mut Profiler<'static> { Box::into_raw(Box::new(Profiler::new())) @@ -110,7 +140,7 @@ pub extern "C" fn livetrace_profiler_create() -> *mut Profiler<'static> { #[no_mangle] pub extern "C" fn livetrace_profiler_destroy(profiler: *mut Profiler) { if !profiler.is_null() { - unsafe { 
std::ptr::drop_in_place(profiler) } + unsafe { Box::from_raw(profiler); } } } diff --git a/src/profiler/src/probes/event.rs b/src/profiler/src/probes/event.rs index 4c951457ecb3ef36bf56b285b9315163638ae99b..a51cabb0c7c69a596ea94f85d04a456390ed395d 100644 --- a/src/profiler/src/probes/event.rs +++ b/src/profiler/src/probes/event.rs @@ -68,6 +68,7 @@ pub struct RawStack { pub time: u64, pub kernel: Vec, pub user: RawUserStack, + pub trace_id: Option, } impl RawStack {} diff --git a/src/profiler/src/probes/probes.rs b/src/profiler/src/probes/probes.rs index 9429d7d966ee7913426fe20adafac71b67b2f137..0ac430315cbbadb39712b274a136a469326ee8b1 100644 --- a/src/profiler/src/probes/probes.rs +++ b/src/profiler/src/probes/probes.rs @@ -1,5 +1,6 @@ use crate::is_system_profiling; use crate::process::maps::ProcessMaps; +use crate::HOST_ROOT_PATH; use super::event::ProbeEvent; use super::event::RawStack; @@ -46,10 +47,12 @@ use perf_event_open_sys::bindings::PERF_TYPE_SOFTWARE; use perf_event_open_sys::perf_event_open; use std::collections::HashMap; use std::env::current_exe; +use std::ffi::CStr; use std::ffi::CString; use std::os::fd::AsFd; use std::os::fd::AsRawFd; use std::path; +use std::path::Path; use std::path::PathBuf; use std::str::FromStr; use std::sync::atomic::AtomicBool; @@ -96,9 +99,12 @@ pub static SYSAK_BTF_PATH: Lazy> = Lazy::new(|| { if info.release.starts_with("5.10") { return None; } - return Some( - CString::new(format!("{}/tools/vmlinux-{}", sysak, info.release)).unwrap(), - ); + let path = format!("{}/tools/vmlinux-{}", sysak, info.release); + if !Path::new(&path).exists() { + log::warn!("failed to find custom btf on path: {}", path); + return None; + } + return Some(CString::new(path).unwrap()); } } None @@ -149,7 +155,7 @@ fn set_thread_need_exit() { fn get_self_path() -> PathBuf { let pid = unsafe { libc::getpid() }; - let pm = ProcessMaps::new(pid as u32).unwrap(); + let pm = ProcessMaps::new_local(pid as u32).unwrap(); if let Some(p) = pm.find_so("libmullprof.so") { return PathBuf::from(p); } @@ -158,6 +164,10 @@ fn get_self_path() -> PathBuf { return PathBuf::from(p); } + if let Some(p) = pm.find_so("libprofiler.so") { + return PathBuf::from(p); + } + current_exe().expect("failed to find executable name") } @@ -172,7 +182,7 @@ pub struct Probes<'a> { sched_skel: sched::SchedMonitorSkel<'a>, pub hotspot_skel: hotspot::HotspotSkel<'a>, pub python_skel: python::PythonSkel<'a>, - interpreter_dispatcher_skel: dispatcher::InterpreterDispatcherSkel<'a>, + pub interpreter_dispatcher_skel: dispatcher::InterpreterDispatcherSkel<'a>, links: Vec, pub rx: Receiver, pub pid_maps_info_map: PidMapsInfoMap, @@ -267,7 +277,11 @@ impl<'a> Probes<'a> { let hotspot_skel = load_skel!(maps, hotspot::HotspotSkelBuilder); let python_skel = load_skel!(maps, python::PythonSkelBuilder); - let (tx, rx) = crossbeam_channel::unbounded(); + let ms = profile_period() as usize; + let sample_per_sec = 1000 / ms; + let ten_sec_samples = sample_per_sec * 10 * num_possible_cpus().unwrap_or(1); + log::info!("cache max stack samples: {}", ten_sec_samples); + let (tx, rx) = crossbeam_channel::bounded(ten_sec_samples); let trace_thread_handle = { let mut cloned_tx = tx.clone(); @@ -445,7 +459,7 @@ impl<'a> Probes<'a> { let ret_value = SystemAnalysis::from(value); assert!(ret_value.raw.pid == 0); sc.set_stack_ptregs_offset((ret_value.raw.address - ret_value.code_u64()) as u32); - sc.set_has_pid_namespace(self.pid != self.nspid); + sc.set_has_pid_namespace(self.pid != self.nspid && HOST_ROOT_PATH.get().is_none()); 
system_config_skel .maps_mut() @@ -562,9 +576,9 @@ fn thread_poll_trace_event(map: &StackMap, tx: &mut Sender, cpu: i32 let user_stackid = (*raw).user_stack_id; let user_stack = if user_stackid == i32::MAX { - RawUserStack::Native((*raw).__bindgen_anon_1.user_stack[..stack_len].to_vec()) + RawUserStack::Native((&(*raw).__bindgen_anon_1.user_stack)[..stack_len].to_vec()) } else { - RawUserStack::Dynamic((*raw).__bindgen_anon_1.frames[..stack_len].to_vec()) + RawUserStack::Dynamic((&(*raw).__bindgen_anon_1.frames)[..stack_len].to_vec()) }; let kernel_stack = if kernel_stackid >= 0 { @@ -573,12 +587,20 @@ fn thread_poll_trace_event(map: &StackMap, tx: &mut Sender, cpu: i32 vec![] }; + let trace_id = if (*raw).trace_id[0] != 0 { + let data = std::slice::from_raw_parts((*raw).trace_id.as_ptr() as *const u8, 32); + Some(unsafe { std::str::from_utf8_unchecked(data) }.to_owned()) + } else { + None + }; + RawStack { cpu: cpu as u32, pid, time: (*raw).ktime, kernel: kernel_stack, user: user_stack, + trace_id, } }; let comm = unsafe { diff --git a/src/profiler/src/probes/stack_delta.rs b/src/profiler/src/probes/stack_delta.rs index a9ca5b31cfbd2edfc97b006d8a7612174f1c9e22..c2b20fca5ad1f333216ae49aef3753090946c851 100644 --- a/src/profiler/src/probes/stack_delta.rs +++ b/src/profiler/src/probes/stack_delta.rs @@ -131,17 +131,15 @@ impl StackDeltaMap { pub fn update(&self, file_id: FileId64, deltas: Vec) -> Result { let map_id = get_map_id(deltas.len() as u32)?; - let inner = self - .create_inner_map(map_id) - .expect("failed to create inner map"); + let inner = self.create_inner_map(map_id)?; let outer = self.outer_map(map_id); if self.batch { - update_batch_inner_map(&inner, deltas); + update_batch_inner_map(&inner, deltas)?; } else { - update_inner_map(&inner, deltas).expect("failed to update inner map"); + update_inner_map(&inner, deltas)?; } - update_outer_map(outer, file_id, &inner).expect("failed to update outer map"); + update_outer_map(outer, file_id, &inner)?; Ok(map_id) } @@ -194,7 +192,7 @@ fn update_inner_map(inner: &MapHandle, deltas: Vec) -> Result<()> { Ok(()) } -fn update_batch_inner_map(inner: &MapHandle, deltas: Vec) { +fn update_batch_inner_map(inner: &MapHandle, deltas: Vec) -> Result<()> { let mut batch_key = Vec::with_capacity(deltas.len() * 4); let mut batch_val: Vec = Vec::with_capacity(deltas.len() * deltas[0].raw_size()); @@ -203,15 +201,14 @@ fn update_batch_inner_map(inner: &MapHandle, deltas: Vec) { batch_key.extend(idx.to_ne_bytes()); batch_val.extend(delta.slice()); } - inner - .update_batch( - &batch_key, - &batch_val, - deltas.len() as u32, - MapFlags::ANY, - MapFlags::ANY, - ) - .expect("failed to update inner map") + inner.update_batch( + &batch_key, + &batch_val, + deltas.len() as u32, + MapFlags::ANY, + MapFlags::ANY, + )?; + Ok(()) } pub fn create_inner_map(map_id: u32) -> Result { diff --git a/src/profiler/src/probes/system_config.rs b/src/profiler/src/probes/system_config.rs index 7aba761a4b1f1251aa7ac0485a14ee5bd356eba7..c87f1ea57408cc54e8f4f9ad60f9abc055dfa82d 100644 --- a/src/profiler/src/probes/system_config.rs +++ b/src/profiler/src/probes/system_config.rs @@ -1,5 +1,6 @@ use crate::SYSTEM_PROFILING; +use std::path::Path; use super::types::SystemConfig; use libbpf_rs::btf::types::Composite; use libbpf_rs::btf::types::MemberAttr; @@ -12,10 +13,15 @@ pub fn get_system_config() -> SystemConfig { let btf_path: Option = { if let Ok(sysak) = std::env::var("SYSAK_WORK_PATH") { if let Ok(info) = uname::uname() { - if !info.release.starts_with("5.10") { - 
Some(format!("{}/tools/vmlinux-{}", sysak, info.release)) - } else { + if info.release.starts_with("5.10") { None + } else { + let path = format!("{}/tools/vmlinux-{}", sysak, info.release); + if !Path::new(&path).exists() { + None + } else { + Some(path) + } } } else { None diff --git a/src/profiler/src/probes/types.rs b/src/profiler/src/probes/types.rs index 19d5ecc5322765f73fff1093a08fbb9e18d53864..09184480780f6fb97068377cf5f42904fa61f0ab 100644 --- a/src/profiler/src/probes/types.rs +++ b/src/profiler/src/probes/types.rs @@ -156,3 +156,17 @@ pub struct HotspotProcInfo { } impl_default!(HotspotProcInfo); + + +#[repr(u32)] +#[derive(PartialEq)] +pub enum TracingType { + TraceNone = bpf::TracingType_TRACE_NONE, + TraceGoAgent = bpf::TracingType_TRACE_GO_AGENT, +} + +pub struct ApmIntProcInfo { + pub raw: bpf::ApmIntProcInfo, +} + +impl_default!(ApmIntProcInfo); \ No newline at end of file diff --git a/src/profiler/src/process/maps.rs b/src/profiler/src/process/maps.rs index b8bc647b542422767d6a9883a729216de1df2c29..e51400b4e9ec3bde8e86be7515b77f38b3281d58 100644 --- a/src/profiler/src/process/maps.rs +++ b/src/profiler/src/process/maps.rs @@ -1,4 +1,5 @@ use crate::symbollizer::file_id::FileId64; +use crate::get_host_root_path; use anyhow::Result; use std::cmp::Ordering; use std::collections::HashMap; @@ -9,6 +10,7 @@ use std::io::BufReader; use std::ops::Deref; use std::ops::DerefMut; use std::path::Path; +use std::path::PathBuf; #[derive(Debug, Eq, Hash, PartialEq, Clone, Copy)] pub struct DiskFileKey { @@ -114,7 +116,7 @@ impl ProcessMapsEntry { if self.is_anonymous() || self.is_vdso() { "".to_owned() } else { - format!("/proc/{}/root/{}", pid, self.path.as_ref().unwrap()) + format!("{}/proc/{}/root/{}", get_host_root_path(), pid, self.path.as_ref().unwrap()) } } } @@ -142,8 +144,7 @@ impl DerefMut for ProcessMaps { } impl ProcessMaps { - pub fn new(pid: u32) -> Result { - let maps_path = Path::new("/proc").join(pid.to_string()).join("maps"); + fn new_inner(maps_path: PathBuf) -> Result { let file = File::open(maps_path)?; let reader = BufReader::new(file); @@ -188,6 +189,16 @@ impl ProcessMaps { Ok(Self { entries }) } + pub fn new(pid: u32) -> Result { + let maps_path = Path::new(get_host_root_path()).join("proc").join(pid.to_string()).join("maps"); + Self::new_inner(maps_path) + } + + pub fn new_local(pid: u32) -> Result { + let maps_path = Path::new("/proc").join(pid.to_string()).join("maps"); + Self::new_inner(maps_path) + } + /// Compares two `ProcessMaps` instances and returns the added and removed entries. 
pub fn diff(&self, other: &Self) -> (Vec, Vec) { let mut added = Vec::new(); diff --git a/src/profiler/src/profiler.rs b/src/profiler/src/profiler.rs index 612d0988dd663fe113eed7c16740147168ccbb5d..32156adeefa8852fff16065239f20827883b1eeb 100644 --- a/src/profiler/src/profiler.rs +++ b/src/profiler/src/profiler.rs @@ -1,24 +1,31 @@ use crate::executable::ExecutableCache; +use crate::get_host_root_path; use crate::heatmap::ProcessHeatMap; use crate::heatmap::TenSecHeatMap; use crate::interpreter::Interpreter; use crate::is_enable_symbolizer; +use crate::is_enable_tracing; use crate::is_system_profiling; use crate::probes::event::ProbeEvent; use crate::probes::probes::Probes; +use crate::probes::types::ApmIntProcInfo; +use crate::probes::types::TracingType; use crate::process::maps::ExeMapsEntry; use crate::process::maps::ProcessMaps; use crate::process::process::Process; use crate::stack::Stack; use crate::stack::StackAggregator; use crate::stack::SymbolizedStack; +use crate::symbollizer::elf::ElfFile; use crate::symbollizer::file_cache::FileCache; use crate::symbollizer::symbolizer::Symbolizer; use crate::utils::lpm::Prefix; +use crate::utils::process::get_comm_by_pid; use crate::utils::time::init_tstamp; use crate::utils::time::time_delta; use crate::MIN_PROCESS_SAMPLES; use anyhow::Result; +use libbpf_rs::MapFlags; use std::collections::HashMap; use std::time::Instant; @@ -27,7 +34,7 @@ pub struct Profiler<'a> { probes: Probes<'a>, caches: FileCache, executables: ExecutableCache, - symbolizer: Symbolizer, + pub symbolizer: Symbolizer, interpreters: HashMap, all_system_profiling: bool, @@ -41,7 +48,8 @@ pub struct Profiler<'a> { impl<'a> Profiler<'a> { pub fn new() -> Self { let mut symer = Symbolizer::new(); - symer.add_kernel("/proc/kallsyms"); + let kallsyms_path = format!("{}/proc/kallsyms", get_host_root_path()); + symer.add_kernel(kallsyms_path.as_str()); init_tstamp(); Profiler { pids: HashMap::new(), @@ -169,6 +177,12 @@ impl<'a> Profiler<'a> { proc.exit(&mut self.probes, &mut self.executables)?; } + self.probes + .interpreter_dispatcher_skel + .maps_mut() + .apm_int_procs() + .delete(&pid.to_ne_bytes())?; + if let Some(mut int) = self.interpreters.remove(&pid) { int.exit(&mut self.probes)?; } @@ -241,6 +255,52 @@ impl<'a> Profiler<'a> { } }; + if is_enable_tracing() { + let mut trace_type = TracingType::TraceNone; + let mut field_offset = 0; + if let Ok(true) = ElfFile::check_section_exist(&info.file, ".go.buildinfo") { + log::debug!( + "found .go.buildinfo section in pid: {pid}, exe: {}", + map.file_path(pid) + ); + if let Ok(Some(offset)) = + ElfFile::extract_field_offset(&info.file, "runtime.g", "traceId") + { + log::info!( + "found go traceId field offset: {offset} in pid: {pid}, exe: {}", + map.file_path(pid) + ); + trace_type = TracingType::TraceGoAgent; + field_offset = offset; + } + } + + if trace_type != TracingType::TraceNone { + let mut pinfo = ApmIntProcInfo::default(); + pinfo.raw.tracing_type = trace_type as u32; + pinfo.raw.tracing_field_offset = field_offset as u64; + match self + .probes + .interpreter_dispatcher_skel + .maps_mut() + .apm_int_procs() + .update(&pid.to_ne_bytes(), pinfo.slice(), MapFlags::ANY) + { + Ok(_) => { + log::info!( + "update apm_int_procs map for pid: {pid} with type: {:?}", + pinfo.raw.tracing_type as u32 + ); + } + Err(e) => { + log::warn!( + "failed to update apm_int_procs map for pid: {pid}, error: {e}" + ); + } + } + } + } + let va = match info.file_offset_to_virtual_address(map.offset) { Some(x) => x, None => { @@ -261,7 +321,11 @@ 
impl<'a> Profiler<'a> { Ok(Some(a)) => a, Ok(None) => continue, Err(e) => { - log::error!("failed to get executable: {e}"); + log::error!( + "failed to get executable for comm {}: {:?}, err: {e}", + get_comm_by_pid(pid), + map + ); continue; } }; diff --git a/src/profiler/src/stack.rs b/src/profiler/src/stack.rs index cb6ed96ada2d3193d373931b10688fceac375186..12539b6494ee6e175566ae2b8a4e78733f715119 100644 --- a/src/profiler/src/stack.rs +++ b/src/profiler/src/stack.rs @@ -1,4 +1,5 @@ use crate::interpreter::Interpreter; +use crate::is_enable_tracing; use crate::pb::LivetraceCell; use crate::pb::LivetraceList; use crate::pb::Ustack; @@ -60,6 +61,7 @@ pub enum Frame { pub struct Stack { pub count: u32, pub frames: Vec, + pub trace_id: Option, } impl Stack { @@ -105,6 +107,7 @@ impl Stack { } stack.count = cnt; + stack.trace_id = raw.trace_id.clone(); Ok(stack) } @@ -138,7 +141,11 @@ impl ToString for Stack { .collect::>() .join(";"); - format!("{} {}", s, self.count) + if is_enable_tracing() { + return format!("{} {} {}", s, self.count, self.trace_id.as_deref().unwrap_or("null")); + } else { + return format!("{} {}", s, self.count); + } } } diff --git a/src/profiler/src/symbollizer/elf.rs b/src/profiler/src/symbollizer/elf.rs index 66e0a0b532b86d9ab108799f65fe06f3c1a57eef..d088339c81300ef09687eddda341931b9e3f4832 100644 --- a/src/profiler/src/symbollizer/elf.rs +++ b/src/profiler/src/symbollizer/elf.rs @@ -381,6 +381,99 @@ impl ElfFile { .ok_or(anyhow!("symbol {} not found", name)) } + pub fn extract_field_offset( + file: &File, + struct_name: &str, + field_name: &str, + ) -> Result> { + let mmap_ref = unsafe { memmap2::Mmap::map(file)? }; + let elf = object::File::parse(&*mmap_ref)?; + let endian = if elf.is_little_endian() { + gimli::RunTimeEndian::Little + } else { + gimli::RunTimeEndian::Big + }; + let arena_data = Arena::new(); + let arena_relocations = Arena::new(); + let mut load_section = |id: gimli::SectionId| -> Result<_> { + load_file_section(id, &elf, endian, false, &arena_data, &arena_relocations) + }; + + let mut dwarf = gimli::Dwarf::load(load_section).unwrap(); + + // iterate over all compilation units + let mut iter = dwarf.units(); + while let Some(header) = iter.next()? { + let unit = dwarf.unit(header)?; + + let mut entries = unit.entries(); + while let Some((depth, entry)) = entries.next_dfs()? { + if entry.tag() == gimli::DW_TAG_structure_type { + if let Some(attr) = entry.attr(gimli::DW_AT_name)? { + let string = dwarf.attr_string(&unit, attr.value())?; + let actual_string = string.to_string_lossy()?.into_owned(); + if actual_string == struct_name { + // now iterate over the children to find field + let mut children = entries.clone(); + while let Some((child_depth, child)) = children.next_dfs()? { + if child.tag() == gimli::DW_TAG_member { + if let Some(attr) = child.attr(gimli::DW_AT_name)? { + let string = dwarf.attr_string(&unit, attr.value())?; + let actual_string = string.to_string_lossy()?.into_owned(); + if actual_string == field_name { + // find the field + // extract the offset from DW_AT_data_member_location + if let Some(attr) = + child.attr(gimli::DW_AT_data_member_location)? 
+ { + match attr.value() { + gimli::AttributeValue::Udata(offset) => { + return Ok(Some(offset as usize)); + } + gimli::AttributeValue::Data1(data) => { + return Ok(Some(data as usize)); + } + gimli::AttributeValue::Data2(data) => { + return Ok(Some(data as usize)); + } + gimli::AttributeValue::Data4(data) => { + return Ok(Some(data as usize)); + } + gimli::AttributeValue::Data8(data) => { + return Ok(Some(data as usize)); + } + _ => { + return Ok(None); + } + } + } else { + return Ok(None); + } + } + } + } else { + break; + } + } + break; + } + } + } + } + } + + return Ok(None); + } + + pub fn check_section_exist(file: &File, section_name: &str) -> Result { + let mmap_ref = unsafe { memmap2::Mmap::map(file)? }; + let elf = object::File::parse(&*mmap_ref)?; + if elf.section_by_name(section_name).is_some() { + return Ok(true); + } + Ok(false) + } + // parse eh_frame and return stack_deltas pub fn parse_eh_frame(file: &File) -> Result> { let mmap_ref = unsafe { memmap2::Mmap::map(file)? }; diff --git a/src/profiler/src/symbollizer/lru_process_files.rs b/src/profiler/src/symbollizer/lru_process_files.rs index 712825522284062db18d311029bf267356f6fd36..c8e169a2e2f414736b2e41012ed196c34dcf663c 100644 --- a/src/profiler/src/symbollizer/lru_process_files.rs +++ b/src/profiler/src/symbollizer/lru_process_files.rs @@ -87,11 +87,17 @@ impl ProcessFiles { } syms } + + pub fn cache(&self, lru_files: &mut LruFileSymbols) { + for file in &self.files { + let _ = lru_files.symbolize_with_path(file.file_id, 0, &file.path); + } + } } #[derive(Debug)] pub struct LruProcessFiles { - lru: LruCache, + pub(crate) lru: LruCache, } impl LruProcessFiles { @@ -124,6 +130,18 @@ impl LruProcessFiles { } } } + + pub fn cache(&mut self, pid: u32, lru_files: &mut LruFileSymbols) { + match self + .lru + .try_get_or_insert(pid, || -> Result { ProcessFiles::new(pid) }) + { + Ok(pf) => pf.cache(lru_files), + Err(e) => { + log::warn!("failed to add process files for pid {pid}: {e}"); + } + } + } } impl Deref for LruProcessFiles { diff --git a/src/profiler/src/symbollizer/symbolizer.rs b/src/profiler/src/symbollizer/symbolizer.rs index 9ebe1e63e4c2951d94151cdaafaf734339f512f5..7fe576767043f326937a8f9502bdf520a09da128 100644 --- a/src/profiler/src/symbollizer/symbolizer.rs +++ b/src/profiler/src/symbollizer/symbolizer.rs @@ -2,6 +2,7 @@ use crate::is_enable_cpuno; use crate::is_enable_function_offset; use crate::process::maps::ProcessMaps; use crate::MAX_NUM_OF_PROCESSES; +use crate::get_host_root_path; use anyhow::bail; use anyhow::Result; use lru::LruCache; @@ -125,7 +126,7 @@ impl Symbolizer { pub fn proc_comm(&mut self, pid: u32) -> Result<&String> { let get_comm = || { - let mut comm = read_to_string(format!("/proc/{pid}/comm"))?; + let mut comm = read_to_string(format!("{}/proc/{pid}/comm", get_host_root_path()))?; comm.pop(); Ok(comm) }; @@ -133,7 +134,7 @@ impl Symbolizer { self.procs .try_get_or_insert(pid, || -> Result { let comm = if let Some(reg) = &self.adb_regex { - let cmdline = read_to_string(format!("/proc/{pid}/cmdline"))?; + let cmdline = read_to_string(format!("{}/proc/{pid}/cmdline", get_host_root_path()))?; reg.find(&cmdline) .map_or_else(|| get_comm(), |x| Ok(x.as_str().to_owned())) } else { @@ -183,6 +184,11 @@ impl Symbolizer { } syms } + + pub fn cache_process(&mut self, pid: u32) { + let _ = self.proc_comm(pid); + self.proc_files.cache(pid, &mut self.file_symbols); + } } #[cfg(test)] diff --git a/src/profiler/src/tpbase/libc.rs b/src/profiler/src/tpbase/libc.rs index 
d9bb7b6c77e1d0edebf946ef9026b76eb1d1f31d..e416006d136cfe9c3f7202a9d05d752aa39222ad 100644 --- a/src/profiler/src/tpbase/libc.rs +++ b/src/profiler/src/tpbase/libc.rs @@ -17,7 +17,7 @@ use super::libc_decode::extract_tsd_info_native; /// Determines if the DSO filename potentially contains pthread code pub fn is_potential_tsd_dso(filename: &str) -> bool { - let libc_regex: Regex = Regex::new(r".*/(ld-musl|libpthread)([-.].*)?\.so").unwrap(); + let libc_regex: Regex = Regex::new(r".*/(ld-musl|libc|libpthread)([-.].*)?\.so").unwrap(); libc_regex.is_match(filename) } diff --git a/src/profiler/src/utils/mod.rs b/src/profiler/src/utils/mod.rs index 7ce9d57a5032b9229a6a33e3a5cbe185912113f2..6dd49765db10e389af20bdc7a4751416f88216ab 100644 --- a/src/profiler/src/utils/mod.rs +++ b/src/profiler/src/utils/mod.rs @@ -3,3 +3,4 @@ pub mod process; pub mod remote_reader; pub mod safe_reader; pub mod time; +pub mod v2p; diff --git a/src/profiler/src/utils/process.rs b/src/profiler/src/utils/process.rs index b1b323af18369b42dc2e9159fd6c586e9b122caa..cdd00c3d5587bd8edc526d76b116e0697cdc5dd7 100644 --- a/src/profiler/src/utils/process.rs +++ b/src/profiler/src/utils/process.rs @@ -14,3 +14,43 @@ pub fn find_processes_by_comm(pat: &str) -> Vec { ret } + +pub fn get_comm_by_pid(pid: u32) -> String { + let path = format!("/proc/{}/comm", pid); + match std::fs::read_to_string(path) { + Ok(mut s) => { + s.pop(); // strip the trailing newline + s + } + Err(_) => "Unknown".to_string(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_valid_pid() { + // get the PID of the current process + let current_pid = std::process::id(); + // read the comm of the current process + let comm = get_comm_by_pid(current_pid); + // verify that comm is non-empty and not "Unknown" + assert!( + !comm.is_empty(), + "Expected non-empty comm for current process" + ); + assert_ne!(comm, "Unknown", "Expected valid comm for current process"); + log::info!("Current process (PID {}): {}", current_pid, comm); + } + + #[test] + fn test_invalid_pid() { + // use a PID that cannot exist + let invalid_pid = u32::MAX; // 4294967295 + let comm = get_comm_by_pid(invalid_pid); + assert_eq!(comm, "Unknown", "Expected 'Unknown' for invalid PID"); + log::info!("Invalid process (PID {}): {}", invalid_pid, comm); + } +} diff --git a/src/profiler/src/utils/v2p.rs b/src/profiler/src/utils/v2p.rs new file mode 100644 index 0000000000000000000000000000000000000000..6bb66980738d5e2fa2371da9ba309149a7a33757 --- /dev/null +++ b/src/profiler/src/utils/v2p.rs @@ -0,0 +1,33 @@ +use anyhow::bail; +use anyhow::Result; +use pagemap::MemoryRegion; +use procfs::page_size; + +// user virtual address to page frame number (pfn) +pub fn v2p(pid: u32, virt: u64) -> Result<u64> { + let mut maps = pagemap::PageMap::new(pid as u64)?; + let page = page_size(); + + let start = virt & !(page - 1); + let end = start + page; + + let entries = maps.pagemap_region(&MemoryRegion::from((start, end)))?; + + if entries.len() != 1 { + bail!("Number of entries is not 1") + } + + let pfn = entries[0].pfn()?; + Ok(pfn) +} + +// user virtual address to kernel virtual address +// page_kv = (pfn << 12) + page_offset_base +// kv = page_kv + offset in page +pub fn v2kv(pid: u32, virt: u64, page_offset_base: u64) -> Result<u64> { + let pfn = v2p(pid, virt)?; + let page_kv = (pfn << 12) + page_offset_base; + let page = page_size(); + let off = virt & (page - 1); + Ok(page_kv + off) +} diff --git a/src/security/CMakeLists.txt b/src/security/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..28906f8cb7f7cfdc5a7ca4715a2a8a6ddd8c5a79 --- /dev/null +++
b/src/security/CMakeLists.txt @@ -0,0 +1,4 @@ +include(${PROJECT_SOURCE_DIR}/scripts/cmake/genskel.cmake) + +message(STATUS "security bpf INSTALL_INCLUDE_DIR: ${INSTALL_INCLUDE_DIR}") +genskel(security) \ No newline at end of file diff --git a/src/security/addr_lpm_maps.h b/src/security/addr_lpm_maps.h new file mode 100644 index 0000000000000000000000000000000000000000..78cb3eea44c1b4bb6ccaebf337692d798a153a2f --- /dev/null +++ b/src/security/addr_lpm_maps.h @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright Authors of Cilium */ + +#pragma once + +#ifdef __cplusplus +#include +#endif +#include +#include +#include +#include +#include "../coolbpf.h" +#include "type.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, ADDR_LPM_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(max_entries, 1); + __type(key, __u8[8]); // Need to specify as byte array as wouldn't take struct as key type + __type(value, __u8); + __uint(map_flags, BPF_F_NO_PREALLOC); + }); +} addr4lpm_maps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, ADDR_LPM_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(max_entries, 1); + __type(key, __u8[20]); // Need to specify as byte array as wouldn't take struct as key type + __type(value, __u8); + __uint(map_flags, BPF_F_NO_PREALLOC); + }); +} addr6lpm_maps SEC(".maps"); + + +// struct { +// __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +// __uint(max_entries, ADDR_LPM_MAPS_OUTER_MAX_ENTRIES); +// __uint(key_size, sizeof(__u32)); +// __array( +// values, struct { +// __uint(type, BPF_MAP_TYPE_LPM_TRIE); +// __uint(max_entries, 16); +// __type(key, __u8[8]); // Need to specify as byte array as wouldn't take struct as key type +// __type(value, __u8); +// __uint(map_flags, BPF_F_NO_PREALLOC); +// }); +// } daddr4lpm_maps SEC(".maps"); + +// struct { +// __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +// __uint(max_entries, ADDR_LPM_MAPS_OUTER_MAX_ENTRIES); +// __uint(key_size, sizeof(__u32)); +// __array( +// values, struct { +// __uint(type, BPF_MAP_TYPE_LPM_TRIE); +// __uint(max_entries, 16); +// __type(key, __u8[20]); // Need to specify as byte array as wouldn't take struct as key type +// __type(value, __u8); +// __uint(map_flags, BPF_F_NO_PREALLOC); +// }); +// } daddr6lpm_maps SEC(".maps"); diff --git a/src/security/api.h b/src/security/api.h new file mode 100644 index 0000000000000000000000000000000000000000..daf9f740115dd814b770fc8ab970ee2e3bbd6487 --- /dev/null +++ b/src/security/api.h @@ -0,0 +1,316 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_API_H +#define SYSAK_API_H + +/* Note: + * + * This file can be included into eBPF kernel programs. It contains + * a couple of useful helper functions, map/section ABI (bpf_elf.h), + * misc macros and some eBPF specific LLVM built-ins. + */ +#include "bpf_elf.h" + +#ifndef TC_ACT_OK +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#endif +#define TC_ACT_UNSPEC -1 + +/** Misc macros. 
*/ + +#ifndef __stringify +# define __stringify(X) #X +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + +#ifndef likely +# define likely(X) __builtin_expect(!!(X), 1) +#endif + +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +#ifndef __inline__ +# define __inline__ __attribute__((always_inline)) +#endif + +/** Section helper macros. */ + +#ifndef __section +# define __section(NAME) \ + __attribute__((section(NAME), used)) +#endif + +#ifndef __section_tail +# define __section_tail(ID, KEY) \ + __section(__stringify(ID) "/" __stringify(KEY)) +#endif + +#ifndef __section_cls_entry +# define __section_cls_entry \ + __section(ELF_SECTION_CLASSIFIER) +#endif + +#ifndef __section_act_entry +# define __section_act_entry \ + __section(ELF_SECTION_ACTION) +#endif + +#ifndef __section_license +# define __section_license \ + __section(ELF_SECTION_LICENSE) +#endif + +#ifndef __section_maps +# define __section_maps \ + __section(ELF_SECTION_MAPS) +#endif + +/** Declaration helper macros. */ + +#ifndef BPF_LICENSE +# define BPF_LICENSE(NAME) \ + char ____license[] __section_license = NAME +#endif + +/** Classifier helper */ + +#ifndef BPF_H_DEFAULT +# define BPF_H_DEFAULT -1 +#endif +// +///** BPF helper functions for tc. Individual flags are in linux/bpf.h */ +// +//#ifndef BPF_FUNC +//# define BPF_FUNC(NAME, ...) \ +// (* NAME)(__VA_ARGS__) __maybe_unused = (void *) bpf_##NAME +//#endif +// +//#ifndef BPF_FUNC2 +//# define BPF_FUNC2(NAME, ...) \ +// (* NAME)(__VA_ARGS__) __maybe_unused +//#endif +// +///* Map access/manipulation */ +//static void *BPF_FUNC(map_lookup_elem, void *map, const void *key); +//static int BPF_FUNC(map_update_elem, void *map, const void *key, +// const void *value, uint32_t flags); +//static int BPF_FUNC(map_delete_elem, void *map, const void *key); +// +///* Memory reads */ +//static int BPF_FUNC(bpf_probe_read, void *dst, uint32_t size, const void *src); +//static int BPF_FUNC(bpf_probe_read_str, void *dst, int size, const void *src); +//static int BPF_FUNC(bpf_probe_read_kernel, void *dst, uint32_t size, const void *src); +// +///* Time access */ +//static uint64_t BPF_FUNC(ktime_get_ns); +//static uint64_t BPF_FUNC(ktime_get_boot_ns); +//static uint64_t BPF_FUNC(ktime_get_coarse_ns); +//static uint64_t BPF_FUNC(jiffies64); +// +///* Platform */ +//static uint64_t BPF_FUNC(get_numa_node_id); +// +///* Timer Callbacks */ +//static long BPF_FUNC(timer_init, struct bpf_timer *timer, void *map, uint64_t flags); +//static long BPF_FUNC(timer_set_callback, struct bpf_timer *timer, void *callback_fun); +//static long BPF_FUNC(timer_start, struct bpf_timer *timer, uint64_t nsecs, uint64_t flags); +//static long BPF_FUNC(timer_cancel, struct bpf_timer *timer); +// +///* Sockets */ +//static uint64_t BPF_FUNC(get_socket_cookie, void *ctx); +// +//static struct bpf_sock *BPF_FUNC(sk_lookup_tcp, void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags); +//static struct bpf_sock *BPF_FUNC(sk_lookup_udp, void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags); +//static uint64_t BPF_FUNC(sk_release, void *sock); +//static struct bpf_sock *BPF_FUNC(sk_fullsock, struct bpf_sock *sk); +//static struct bpf_tcp_sock *BPF_FUNC(tcp_sock, struct bpf_sock *sk); +//static struct bpf_sock *BPF_FUNC(get_listener_sock, struct bpf_sock *sk); +//static struct bpf_sock 
*BPF_FUNC(skc_lookup_tcp, void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags); +//static void *BPF_FUNC(sk_storage_get, struct bpf_map *map, void *sk, void *value, u64 flags); +//static void *BPF_FUNC(sk_storage_delete, struct bpf_map *map, void *sk); +//static struct tcp6_sock *BPF_FUNC(skc_to_tcp6_sock, void *sk); +//static struct tcp_sock *BPF_FUNC(skc_to_tcp_sock, void *sk); +//static struct tcp_timewait_sock *BPF_FUNC(skc_to_tcp_timewait_sock, void *sk); +//static struct tcp_request_sock *BPF_FUNC(skc_to_tcp_request_sock, void *sk); +//static struct udp6_sock *BPF_FUNC(skc_to_udp6_sock, void *sk); +//static struct socket *BPF_FUNC(sock_from_file, struct file *file); +// +///* Debugging */ +//__attribute__((__format__(__printf__, 1, 0))) +//static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...); +//static long BPF_FUNC(trace_vprintk, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len); +// +// +///* Random numbers */ +//static uint32_t BPF_FUNC(get_prandom_u32); +// +///* Tail calls */ +//static void BPF_FUNC(tail_call, void *ctx, void *map, uint32_t index); +// +///* System helpers */ +//static uint32_t BPF_FUNC(get_smp_processor_id); +// +///* Packet misc meta data */ +//static uint32_t BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb); +//static uint32_t BPF_FUNC(get_route_realm, struct __sk_buff *skb); +//static uint32_t BPF_FUNC(get_hash_recalc, struct __sk_buff *skb); +//static uint32_t BPF_FUNC(set_hash_invalid, struct __sk_buff *skb); +// +//static int BPF_FUNC(skb_under_cgroup, void *map, uint32_t index); +// +///* Packet redirection */ +//static int BPF_FUNC(redirect, int ifindex, uint32_t flags); +//static int BPF_FUNC(clone_redirect, struct __sk_buff *skb, int ifindex, +// uint32_t flags); +// +///* Packet manipulation */ +//static int BPF_FUNC(skb_load_bytes_relative, struct __sk_buff *skb, uint32_t off, +// void *to, uint32_t len, uint32_t hdr); +//static int BPF_FUNC(skb_load_bytes, struct __sk_buff *skb, uint32_t off, +// void *to, uint32_t len); +//static int BPF_FUNC(skb_store_bytes, struct __sk_buff *skb, uint32_t off, +// const void *from, uint32_t len, uint32_t flags); +// +//static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, uint32_t off, +// uint32_t from, uint32_t to, uint32_t flags); +//static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off, +// uint32_t from, uint32_t to, uint32_t flags); +//static int BPF_FUNC(csum_diff, void *from, uint32_t from_size, void *to, +// uint32_t to_size, uint32_t seed); +// +//static int BPF_FUNC(skb_change_type, struct __sk_buff *skb, uint32_t type); +//static int BPF_FUNC(skb_change_proto, struct __sk_buff *skb, uint32_t proto, +// uint32_t flags); +//static int BPF_FUNC(skb_change_tail, struct __sk_buff *skb, uint32_t nlen, +// uint32_t flags); +//static int BPF_FUNC(skb_adjust_room, struct __sk_buff *skb, int32_t len_diff, +// uint32_t mode, uint64_t flags); +//static int BPF_FUNC(skb_pull_data, struct __sk_buff *skb, uint32_t len); +// +///* Packet vlan encap/decap */ +//static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto, +// uint16_t vlan_tci); +//static int BPF_FUNC(skb_vlan_pop, struct __sk_buff *skb); +// +///* Packet tunnel encap/decap */ +//static int BPF_FUNC(skb_get_tunnel_key, struct __sk_buff *skb, +// struct bpf_tunnel_key *to, uint32_t size, uint32_t flags); +//static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb, +// const struct bpf_tunnel_key *from, uint32_t size, +// uint32_t flags); +// 
+//static int BPF_FUNC(skb_get_tunnel_opt, struct __sk_buff *skb, +// void *to, uint32_t size); +//static int BPF_FUNC(skb_set_tunnel_opt, struct __sk_buff *skb, +// const void *from, uint32_t size); +// +///* Events for user space */ +//static int BPF_FUNC2(skb_event_output, struct __sk_buff *skb, void *map, uint64_t index, +// const void *data, uint32_t size) = (void *)BPF_FUNC_perf_event_output; +// +///* Sockops and SK_MSG helpers */ +//static int BPF_FUNC(sock_map_update, struct bpf_sock_ops *skops, void *map, uint32_t key, uint64_t flags); +//static int BPF_FUNC(sock_hash_update, struct bpf_sock_ops *skops, void *map, void *key, uint64_t flags); +//static int BPF_FUNC(msg_redirect_hash, struct sk_msg_md *md, void *map, void *key, uint64_t flags); +//static int BPF_FUNC(msg_pull_data, struct sk_msg_md *md, __u32 start, __u32 end, __u64 flags); +//static int BPF_FUNC(msg_apply_bytes, struct sk_msg_md *md, __u32 bytes); +//static int BPF_FUNC(msg_cork_bytes, struct sk_msg_md *md, __u32 bytes); +// +//static int BPF_FUNC(fib_lookup, void *ctx, struct bpf_fib_lookup *params, uint32_t plen, uint32_t flags); +// +// +///* Current Process Info */ +//static uint64_t BPF_FUNC(bpf_get_current_task); +//static uint64_t BPF_FUNC(get_current_cgroup_id); +//static uint64_t BPF_FUNC(get_current_ancestor_cgroup_id); +//static uint64_t BPF_FUNC(get_current_uid_gid); +//static uint64_t BPF_FUNC(get_current_pid_tgid); +// +//static int BPF_FUNC(get_current_comm, char *buf, uint32_t size); +// +//static int BPF_FUNC(send_signal, uint32_t sig); +//static int BPF_FUNC(override_return, void *regs, uint64_t rc); +//static long BPF_FUNC(get_stackid, void *ctx, void *map, uint64_t flags); +//static long BPF_FUNC(loop, __u32 nr_loops, void *callback_fn, void *callback_ctx, __u64 flags); +//static __u64 BPF_FUNC(get_attach_cookie, void *ctx); +// +///* Perf and Rignbuffer */ +//static int BPF_FUNC(perf_event_output, void *ctx, void *map, uint64_t flags, void *data, uint64_t size); +// +//static int BPF_FUNC(get_stack, void *ctx, void *buf, uint32_t size, uint64_t flags); +//static long BPF_FUNC(ringbuf_output, void *data, uint64_t size, uint64_t flags); +//static void *BPF_FUNC(ringbuf_reserve, void *ringbuf, uint64_t size, uint64_t flags); +//static void BPF_FUNC(ringbuf_submit, void *data, uint64_t flags); +//static void BPF_FUNC(ringbuf_discard, void *data, uint64_t flags); +//static long BPF_FUNC(ringbuf_query, void *ringbuf, uint64_t flags); +// +//static long BPF_FUNC(ringbuf_reserve_dynptr, void *ringbuf, uint32_t size, uint64_t flags, struct bpf_dynptr *ptr); +//static void BPF_FUNC(ringbuf_submit_dynptr, struct bpf_dynptr *ptr, uint64_t flags); +//static void BPF_FUNC(ringbuf_discard_dynptr, struct bpf_dynptr *ptr, uint64_t flags); +// +//static long BPF_FUNC(dynptr_from_mem, void *data, uint32_t size, uint64_t flags, struct bpf_dynptr *ptr); +//static long BPF_FUNC(dynptr_read, void *dst, uint32_t len, const struct bpf_dynptr *src, uint32_t offset, uint64_t flags); +//static long BPF_FUNC(dynptr_write, const struct bpf_dynptr *dst, uint32_t offset, void *src, uint32_t len, uint64_t flags); +//static void BPF_FUNC(dynptr_data, const struct bpf_dynptr *ptr, uint32_t offset, uint32_t len); + +/** LLVM built-ins, mem*() routines work for constant size */ + +#ifndef lock_xadd +# define lock_xadd(ptr, val) ((void) __sync_fetch_and_add(ptr, val)) +#endif + +#ifndef memset +# define memset(s, c, n) __builtin_memset((s), (c), (n)) +#endif + +#ifndef memcpy +# define memcpy(d, s, n) __builtin_memcpy((d), (s), 
(n)) +#endif + +#ifndef memmove +# define memmove(d, s, n) __builtin_memmove((d), (s), (n)) +#endif + +/* FIXME: __builtin_memcmp() is not yet fully useable unless llvm bug + * https://llvm.org/bugs/show_bug.cgi?id=26218 gets resolved. Also + * this one would generate a reloc entry (non-map), otherwise. + */ +#if 0 +#ifndef memcmp +# define memcmp(a, b, n) __builtin_memcmp((a), (b), (n)) +#endif +#endif + +#include "compiler.h" +FUNC_INLINE void compiler_barrier(void) +{ + asm volatile("" :: + : "memory"); +} + +#ifndef memset +# define memset(s, c, n) __builtin_memset((s), (c), (n)) +#endif + +#define _(P) (__builtin_preserve_access_index(P)) + + +#endif //SYSAK_API_H diff --git a/src/security/bpf_cgroup.h b/src/security/bpf_cgroup.h new file mode 100644 index 0000000000000000000000000000000000000000..1cd4088c9f6580220cdff44545849df3c4befb3f --- /dev/null +++ b/src/security/bpf_cgroup.h @@ -0,0 +1,429 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_CGROUP_H +#define SYSAK_BPF_CGROUP_H + +#include "../coolbpf.h" +#include + +#include "bpf_event.h" +#include "environ_conf.h" +#include "bpf_common.h" +#include "process.h" + +#define NULL ((void *)0) + +#ifndef CGROUP_SUPER_MAGIC +#define CGROUP_SUPER_MAGIC 0x27e0eb /* Cgroupv1 pseudo FS */ +#endif + +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC 0x63677270 /* Cgroupv2 pseudo FS */ +#endif + +/* Our kernfs node name length, can be made 256? */ + +/* Max nested cgroups that are tracked. Arbitrary value, nested cgroups + * that are at a level greater than 32 will be attached to the cgroup + * at level 32. + */ +#define CGROUP_MAX_NESTED_LEVEL 32 + +typedef enum { + CGROUP_UNTRACKED = 0, /* Cgroup was created but we did not track it */ + CGROUP_NEW = 1, /* Cgroup was just created */ + CGROUP_RUNNING = 2, /* new => running (fork,exec task inside) */ + CGROUP_RUNNING_PROC = 3, /* Generated from pids of procfs */ +} cgroup_state; + +/* Represent old kernfs node with the kernfs_node_id + * union to read the id in 5.4 kernels and older + */ +//struct kernfs_node___old { +// union kernfs_node_id id; +//}; + +struct cgroup_tracking_value { + /* State of cgroup */ + cgroup_state state; + + /* Unique id for the hierarchy this is mostly for cgroupv1 */ + __u32 hierarchy_id; + + /* The depth this cgroup is at */ + __u32 level; + + __u32 pad; + + /* Cgroup kernfs_node name */ + char name[KN_NAME_LENGTH]; +}; // All fields aligned so no 'packed' attribute. + +struct msg_cgroup_event { + struct msg_common common; + struct msg_execve_key parent; + __u32 cgrp_op; /* Current cgroup operation */ + __u32 pid; + __u32 nspid; + __u32 flags; + __u64 ktime; + __u64 cgrpid_tracker; /* Cgroup ID that is used as a tracker for the current cgroup */ + __u64 cgrpid; /* Current cgroup ID */ + struct cgroup_tracking_value cgrp_data; /* Current cgroup data */ + char path[PATH_MAP_SIZE]; /* Current cgroup path */ +}; // All fields aligned so no 'packed' attribute. 
+ +/* Map to track cgroups per IDs */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 32768); + __type(key, __u64); /* Key is the cgrpid */ + __type(value, struct cgroup_tracking_value); +} tg_cgrps_tracking_map SEC(".maps"); + +/* Heap used to construct a cgroup_tracking_value */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __s32); + __type(value, struct cgroup_tracking_value); +} tg_cgrps_tracking_heap SEC(".maps"); + +/* Heap used to construct a msg_cgroup_event */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct msg_cgroup_event); +} tg_cgrps_msg_heap SEC(".maps"); + +/** + * get_cgroup_kn_name() Returns a pointer to the kernfs node name + * @cgrp: target kernfs node + * + * Returns a pointer to the kernfs node name on success, NULL on failures. + */ +FUNC_INLINE const char *__get_cgroup_kn_name(const struct kernfs_node *kn) +{ + const char *name = NULL; + + if (kn) + bpf_probe_read(&name, sizeof(name), _(&kn->name)); + + return name; +} + +/** + * get_cgroup_kn_id() Returns the kernfs node id + * @cgrp: target kernfs node + * + * Returns the kernfs node id on success, zero on failures. + */ +FUNC_INLINE __u64 __get_cgroup_kn_id(const struct kernfs_node *kn) +{ + __u64 id = 0; + + if (!kn) + return id; + + /* Kernels prior to 5.5 have the kernfs_node_id, but distros (RHEL) + * seem to have kernfs_node_id defined for UAPI reasons even though + * its not used here directly. To resolve this walk struct for id.id + */ +// if (bpf_core_field_exists(((struct kernfs_node___old *)0)->id.id)) { +// struct kernfs_node___old *old_kn; +// +// old_kn = (void *)kn; +// if (BPF_CORE_READ_INTO(&id, old_kn, id.id) != 0) +// return 0; +// } else { +// bpf_probe_read(&id, sizeof(id), _(&kn->id)); +// } + + return id; +} + +/** + * __get_cgroup_kn() Returns the kernfs_node of the cgroup + * @cgrp: target cgroup + * + * Returns the kernfs_node of the cgroup on success, NULL on failures. + */ +FUNC_INLINE struct kernfs_node *__get_cgroup_kn(const struct cgroup *cgrp) +{ + struct kernfs_node *kn = NULL; + + if (cgrp) + bpf_probe_read(&kn, sizeof(cgrp->kn), _(&cgrp->kn)); + + return kn; +} + +/** + * get_cgroup_hierarchy_id() Returns the cgroup hierarchy id + * @cgrp: target cgroup + * + * Returns the cgroup hierarchy id. Make sure you pass a valid + * cgroup, this can not fail. + * + * Returning zero means the cgroup is running on the default + * hierarchy. + */ +FUNC_INLINE __u32 get_cgroup_hierarchy_id(const struct cgroup *cgrp) +{ + __u32 id; + + BPF_CORE_READ_INTO(&id, cgrp, root, hierarchy_id); + + return id; +} + +/** + * get_cgroup_name() Returns a pointer to the cgroup name + * @cgrp: target cgroup + * + * Returns a pointer to the cgroup node name on success that can + * be read with bpf_probe_read(). NULL on failures. + */ +FUNC_INLINE const char *get_cgroup_name(const struct cgroup *cgrp) +{ + const char *name; + + if (unlikely(!cgrp)) + return NULL; + + if (BPF_CORE_READ_INTO(&name, cgrp, kn, name) != 0) + return NULL; + + return name; +} + +/** + * get_cgroup_level() Returns the cgroup level + * @cgrp: target cgroup + * + * Returns the cgroup level, or 0 if it can not be retrieved. 
+ */ +FUNC_INLINE __u32 get_cgroup_level(const struct cgroup *cgrp) +{ + __u32 level = 0; + + bpf_probe_read(&level, sizeof(level), _(&cgrp->level)); + return level; +} + +/** + * get_cgroup_id() Returns cgroup id + * @cgrp: target cgroup + * + * Returns the cgroup id of the target cgroup on success, zero on failures. + */ +FUNC_INLINE __u64 get_cgroup_id(const struct cgroup *cgrp) +{ + struct kernfs_node *kn; + + kn = __get_cgroup_kn(cgrp); + return __get_cgroup_kn_id(kn); +} + +/** + * get_task_cgroup() Returns the accurate or desired cgroup of the css of + * current task that we want to operate on. + * @task: must be current task. + * @subsys_idx: index of the desired cgroup_subsys_state part of css_set. + * Passing a zero as a subsys_idx is fine assuming you want that. + * @error_flags: error flags that will be ORed to indicate errors on + * failures. + * + * Returns the cgroup of the css part of css_set of current task and is + * indexed at subsys_idx on success. NULL on failures, and the error_flags + * will be ORed to indicate the corresponding error. + * + * To get cgroup and kernfs node information we want to operate on the right + * cgroup hierarchy which is setup by user space. However due to the + * incompatibility between cgroup v1 and v2; how user space initialize and + * install cgroup controllers, etc, it can be difficult. + * + * Use this helper and pass the css index that you consider accurate and + * which can be discovered at runtime in user space. + * Usually it is the 'memory' or 'pids' indexes by reading /proc/cgroups + * file where each line number is the index starting from zero without + * counting first comment line. + */ +FUNC_INLINE struct cgroup * +get_task_cgroup(struct task_struct *task, __u32 subsys_idx, __u32 *error_flags) +{ + struct cgroup_subsys_state *subsys; + struct css_set *cgroups; + struct cgroup *cgrp = NULL; + + bpf_probe_read(&cgroups, sizeof(cgroups), _(&task->cgroups)); + if (unlikely(!cgroups)) { + *error_flags |= EVENT_ERROR_CGROUPS; + return cgrp; + } + + /* We are interested only in the cpuset, memory or pids controllers + * which are indexed at 0, 4 and 11 respectively assuming all controllers + * are compiled in. + * When we use the controllers indexes we will first discover these indexes + * dynamically in user space which will work on all setups from reading + * file: /proc/cgroups. If we fail to discover the indexes then passing + * a default index zero should be fine assuming we also want that. + * + * Reference: https://elixir.bootlin.com/linux/v5.19/source/include/linux/cgroup_subsys.h + * + * Notes: + * Newer controllers should be appended at the end. controllers + * that are not upstreamed may mess the calculation here + * especially if they happen to be before the desired subsys_idx, + * we fail. + */ + if (unlikely(subsys_idx > pids_cgrp_id)) { + *error_flags |= EVENT_ERROR_CGROUP_SUBSYS; + return cgrp; + } + + /* Read css from the passed subsys index to ensure that we operate + * on the desired controller. This allows user space to be flexible + * and chose the right per cgroup subsystem to use in order to + * support as much as workload as possible. It also reduces errors + * in a significant way. 
+ */ + bpf_probe_read(&subsys, sizeof(subsys), _(&cgroups->subsys[subsys_idx])); + if (unlikely(!subsys)) { + *error_flags |= EVENT_ERROR_CGROUP_SUBSYS; + return cgrp; + } + + bpf_probe_read(&cgrp, sizeof(cgrp), _(&subsys->cgroup)); + if (!cgrp) + *error_flags |= EVENT_ERROR_CGROUP_SUBSYSCGRP; + + return cgrp; +} + +/** + * __tg_get_current_cgroup_id() Returns the accurate cgroup id of current task. + * @cgrp: cgroup target of current task. + * @cgrpfs_ver: Cgroupfs Magic number either Cgroupv1 or Cgroupv2 + * + * It handles both cgroupv2 and cgroupv1. + * If @cgrpfs_ver is default cgroupv2 hierarchy, then it uses the bpf + * helper bpf_get_current_cgroup_id() to retrieve the cgroup id. Otherwise + * it falls back on using the passed @cgrp + * + * Returns the cgroup id of current task on success, zero on failures. + */ +FUNC_INLINE __u64 +__tg_get_current_cgroup_id(struct cgroup *cgrp, __u64 cgrpfs_ver) +{ + return 0; + /* + * Try the bpf helper on the default hierarchy if available + * and if we are running in unified cgroupv2 + */ +// if (bpf_core_enum_value_exists(enum bpf_func_id, +// bpf_get_current_cgroup_id) && +// cgrpfs_ver == CGROUP2_SUPER_MAGIC) { +// return bpf_get_current_cgroup_id(); +// } else { +// return get_cgroup_id(cgrp); +// } +} + +/** + * tg_get_current_cgroup_id() Returns the accurate cgroup id of current task. + * + * It works similar to __tg_get_current_cgroup_id, but computes the cgrp if it is needed. + * Returns the cgroup id of current task on success, zero on failures. + */ +FUNC_INLINE __u64 tg_get_current_cgroup_id(void) +{ + __u32 error_flags; + struct cgroup *cgrp; + __u64 cgrpfs_magic = 0; + struct task_struct *task; + struct tetragon_conf *conf; + int zero = 0, subsys_idx = 0; + + conf = bpf_map_lookup_elem(&tg_conf_map, &zero); + if (conf) { + /* Select which cgroup version */ + cgrpfs_magic = conf->cgrp_fs_magic; + subsys_idx = conf->tg_cgrp_subsys_idx; + } + + /* + * Try the bpf helper on the default hierarchy if available + * and if we are running in unified cgroupv2 + */ +// if (bpf_core_enum_value_exists(enum bpf_func_id, +// bpf_get_current_cgroup_id) && +// cgrpfs_magic == CGROUP2_SUPER_MAGIC) { +// return bpf_get_current_cgroup_id(); +// } + + task = (struct task_struct *)bpf_get_current_task(); + + // NB: error_flags are ignored for now + cgrp = get_task_cgroup(task, subsys_idx, &error_flags); + if (!cgrp) + return 0; + + return get_cgroup_id(cgrp); +} + +/** + * __get_cgrp_tracking_val_heap() Get a cgroup_tracking_val from the + * tg_cgrps_tracking_heap map while setting its fields. + */ +FUNC_INLINE struct cgroup_tracking_value * +__get_cgrp_tracking_val_heap(cgroup_state state, __u32 hierarchy_id, + __u32 level) +{ + int zero = 0; + struct cgroup_tracking_value *heap; + + heap = bpf_map_lookup_elem(&tg_cgrps_tracking_heap, &zero); + if (!heap) + return heap; + + memset(heap, 0, sizeof(struct cgroup_tracking_value)); + heap->state = state; + heap->hierarchy_id = hierarchy_id; + heap->level = level; + + return heap; +} + +/** + * __init_cgrp_tracking_val_heap() Initialize a cgroup_tracking_val that is + * obtained with __get_cgrp_tracking_val_heap(). It will initialize and + * set the cgroup name too. 
+ */ +FUNC_INLINE struct cgroup_tracking_value * +__init_cgrp_tracking_val_heap(struct cgroup *cgrp, cgroup_state state) +{ + const char *name; + struct kernfs_node *kn; + __u32 level, hierarchy_id; + struct cgroup_tracking_value *heap; + + hierarchy_id = get_cgroup_hierarchy_id(cgrp); + level = get_cgroup_level(cgrp); + heap = __get_cgrp_tracking_val_heap(state, hierarchy_id, level); + if (!heap) + return heap; + + kn = __get_cgroup_kn(cgrp); + name = __get_cgroup_kn_name(kn); + if (name) + bpf_probe_read_str(&heap->name, KN_NAME_LENGTH - 1, name); + + return heap; +} + +#endif //SYSAK_BPF_CGROUP_H diff --git a/src/security/bpf_common.h b/src/security/bpf_common.h new file mode 100644 index 0000000000000000000000000000000000000000..63acceea50e827b5e4a0c55d71cf791ccc118a98 --- /dev/null +++ b/src/security/bpf_common.h @@ -0,0 +1,72 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_COMMON_H +#define SYSAK_BPF_COMMON_H + +#ifdef __cplusplus +#include +#endif + +#define KN_NAME_LENGTH 128 + +/* msg_common internal flags */ +#define MSG_COMMON_FLAG_RETURN BIT(0) +#define MSG_COMMON_FLAG_KERNEL_STACKTRACE BIT(1) +#define MSG_COMMON_FLAG_USER_STACKTRACE BIT(2) + +#define XSTR(s) STR(s) +#define STR(s) #s + +/* Msg Layout */ +struct msg_common { + __u8 op; + __u8 flags; // internal flags not exported + __u8 pad[2]; + __u32 size; + __u64 ktime; +}; + +struct msg_test { + struct msg_common common; + unsigned long arg0; + unsigned long arg1; + unsigned long arg2; + unsigned long arg3; +} __attribute__((packed)); + +#ifndef bpf_ntohs +#define bpf_ntohs(x) __builtin_bswap16(x) +#endif + +#ifndef bpf_htons +#define bpf_htons(x) __builtin_bswap16(x) +#endif + +#ifndef bpf_ntohl +#define bpf_ntohl(x) __builtin_bswap32(x) +#endif + +#ifndef bpf_htonl +#define bpf_htonl(x) __builtin_bswap32(x) +#endif + +//#ifndef bpf_map_def +//struct bpf_map_def { +// unsigned int type; +// unsigned int key_size; +// unsigned int value_size; +// unsigned int max_entries; +// unsigned int map_flags; +//}; +//#endif + +#define BIT(nr) (1 << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) + +#ifndef PATH_MAP_SIZE +#define PATH_MAP_SIZE 4096 +#endif + +#endif //SYSAK_BPF_COMMON_H diff --git a/src/security/bpf_cred.h b/src/security/bpf_cred.h new file mode 100644 index 0000000000000000000000000000000000000000..e9b861a149b2607ded4e5491df87747b3a47c9e8 --- /dev/null +++ b/src/security/bpf_cred.h @@ -0,0 +1,83 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_CRED_H +#define SYSAK_BPF_CRED_H + +#include "compiler.h" + +// NB: in some cases we want to access the capabilities via an array to simplify the BPF code, which is why we define it as a union. +struct msg_capabilities { + union { + struct { + __u64 permitted; + __u64 effective; + __u64 inheritable; + }; + __u64 c[3]; + }; +}; // All fields aligned so no 'packed' attribute. + +// indexes to access msg_capabilities's array (->c) -- should have the same order as the fields above. 
+enum { + caps_permitted = 0, + caps_effective = 1, + caps_inheritable = 2, +}; + +struct msg_user_namespace { + __s32 level; + __u32 uid; + __u32 gid; + __u32 ns_inum; +}; + +struct msg_cred { + __u32 uid; + __u32 gid; + __u32 suid; + __u32 sgid; + __u32 euid; + __u32 egid; + __u32 fsuid; + __u32 fsgid; + __u32 securebits; + __u32 pad; + struct msg_capabilities caps; + struct msg_user_namespace user_ns; +} __attribute__((packed)); + +/* Execution and cred related flags shared with userspace */ +#define EXEC_SETUID 0x01 /* This is a set-user-id execution */ +#define EXEC_SETGID 0x02 /* This is a set-group-id execution */ +#define EXEC_FILE_CAPS 0x04 /* This binary execution gained new capabilities through file capabilities execution */ +#define EXEC_SETUID_ROOT 0x08 /* This binary execution gained new privileges through setuid to root execution */ +#define EXEC_SETGID_ROOT 0x10 /* This binary execution gained new privileges through setgid to root execution */ + +/* + * Check if "a" is a subset of "set". + * return true if all of the capabilities in "a" are also in "set" + * __cap_issubset(0100, 1111) will return true + * return false if any of the capabilities in "a" are not in "set" + * __cap_issubset(1111, 0100) will return false + */ +FUNC_INLINE bool __cap_issubset(const __u64 a, const __u64 set) +{ + return !(a & ~set); +} + +#define __cap_gained(target, source) \ + !__cap_issubset(target, source) + +/* + * We check if it user id is global root. Right now we do not + * support per user namespace translation, example checking if + * root in user namespace. + */ +FUNC_INLINE bool __is_uid_global_root(__u32 uid) +{ + return uid == 0; +} + +#endif //SYSAK_BPF_CRED_H diff --git a/src/security/bpf_elf.h b/src/security/bpf_elf.h new file mode 100644 index 0000000000000000000000000000000000000000..0b4ae8f36a984a915990d67ec3a25af306a1524a --- /dev/null +++ b/src/security/bpf_elf.h @@ -0,0 +1,56 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_ELF_H +#define SYSAK_BPF_ELF_H + +/* Note: + * + * Below ELF section names and bpf_elf_map structure definition + * are not (!) kernel ABI. It's rather a "contract" between the + * application and the BPF loader in tc. For compatibility, the + * section names should stay as-is. Introduction of aliases, if + * needed, are a possibility, though. + */ + +/* ELF section names, etc */ +#define ELF_SECTION_LICENSE "license" +#define ELF_SECTION_MAPS "maps" +#define ELF_SECTION_PROG "prog" +#define ELF_SECTION_CLASSIFIER "classifier" +#define ELF_SECTION_ACTION "action" + +#define ELF_MAX_MAPS 64 +#define ELF_MAX_LICENSE_LEN 128 + +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + +/* ELF map definition */ +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; + __u32 inner_id; + __u32 inner_idx; +}; + +#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ + struct ____btf_map_##name { \ + type_key key; \ + type_val value; \ + }; \ + struct ____btf_map_##name \ + __attribute__((section(".maps." #name), used)) \ + ____btf_map_##name = {} + + + +#endif //SYSAK_BPF_ELF_H diff --git a/src/security/bpf_event.h b/src/security/bpf_event.h new file mode 100644 index 0000000000000000000000000000000000000000..31ae22e64f35608c153446dc33c20324fe760e32 --- /dev/null +++ b/src/security/bpf_event.h @@ -0,0 +1,21 @@ +// +// Created by qianlu on 2024/6/16. 
+// + +#ifndef SYSAK_BPF_EVENT_H +#define SYSAK_BPF_EVENT_H + +#include "../coolbpf.h" +#include + +struct event { + int event; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, struct event); +} tcpmon_map SEC(".maps"); + +#endif //SYSAK_BPF_EVENT_H diff --git a/src/security/bpf_exit.h b/src/security/bpf_exit.h new file mode 100644 index 0000000000000000000000000000000000000000..d47cbe1aba35efea606cfbd395699e29fb8113b6 --- /dev/null +++ b/src/security/bpf_exit.h @@ -0,0 +1,87 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_EXIT_H +#define SYSAK_BPF_EXIT_H + + +#include "vmlinux.h" +#include "api.h" + +#include "msg_type.h" +#include "bpf_event.h" +#include "bpf_task.h" +#include "bpf_rate.h" +#include "process.h" +#include "bpf_process_event.h" +#include "../ebpf_log.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct msg_exit); +} exit_heap_map SEC(".maps"); + +FUNC_INLINE void event_exit_send(void *ctx, __u32 tgid) +{ + struct execve_map_value *enter; + + /* It is safe to do a map_lookup_event() here because + * we must have captured the execve case in order for an + * exit to happen. Or in the FGS startup case we pre + * populated it before loading BPF programs. At any rate + * if the entry is _not_ in the execve_map the lookup + * will create an empty entry, the ktime check below will + * catch it and we will quickly delete the entry again. + */ + enter = execve_map_get_noinit(tgid); + if (!enter) + return; + if (enter->key.ktime) { + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + size_t size = sizeof(struct msg_exit); + struct msg_exit *exit; + struct msg_k8s kube; + int zero = 0; + + exit = bpf_map_lookup_elem(&exit_heap_map, &zero); + if (!exit) + return; + + exit->common.op = MSG_OP_EXIT; + exit->common.flags = 0; + exit->common.pad[0] = 0; + exit->common.pad[1] = 0; + exit->common.size = size; + exit->common.ktime = bpf_ktime_get_ns(); + + exit->current.pid = tgid; + exit->current.pad[0] = 0; + exit->current.pad[1] = 0; + exit->current.pad[2] = 0; + exit->current.pad[3] = 0; + exit->current.ktime = enter->key.ktime; + + /** + * Per thread tracking rules TID == PID : + * We want the exit event to match the exec one, and since during exec + * we report the thread group leader, do same here as we read the exec + * entry from the execve_map anyway and explicitly set it to the to tgid. + */ + exit->info.tid = tgid; + bpf_probe_read(&exit->info.code, sizeof(exit->info.code), + _(&task->exit_code)); + + __event_get_cgroup_info(task, &kube); + + if (cgroup_rate(ctx, &kube, exit->common.ktime)) { + perf_event_output_metric(ctx, MSG_OP_EXIT, &tcpmon_map, + BPF_F_CURRENT_CPU, exit, size); + } + } + execve_map_delete(tgid); +} + +#endif //SYSAK_BPF_EXIT_H diff --git a/src/security/bpf_head.h b/src/security/bpf_head.h new file mode 100644 index 0000000000000000000000000000000000000000..3b2c29de5d70c35bfb028efc81b052086cd338cf --- /dev/null +++ b/src/security/bpf_head.h @@ -0,0 +1,141 @@ +// +// Created by qianlu on 2024/6/16. 
+// + +#ifndef SYSAK_BPF_HEAD_H +#define SYSAK_BPF_HEAD_H + +extern "C" { +#include "../coolbpf.h" +#include +}; + +#ifdef COOLBPF_PERF_THREAD + +#define DEFINE_SEKL_OBJECT(skel_name) \ + struct skel_name##_bpf *skel_name = NULL; \ + static pthread_t perf_thread = 0; \ + int thread_worker(struct beeQ *q, void *arg) \ + { \ + perf_thread_worker(arg); \ + return 0; \ + } \ + void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) \ + { \ + printf("Lost %llu events on CPU #%d!\n", lost_cnt, cpu); \ + } + +#define LOAD_SKEL_OBJECT(skel_name, perf) \ + ( \ + { \ + __label__ load_bpf_skel_out; \ + int __ret = 0; \ + skel_name = skel_name##_bpf__open(); \ + if (!skel_name) \ + { \ + printf("failed to open BPF object\n"); \ + __ret = -1; \ + goto load_bpf_skel_out; \ + } \ + __ret = skel_name##_bpf__load(skel_name); \ + if (__ret) \ + { \ + printf("failed to load BPF object: %d\n", __ret); \ + DESTORY_SKEL_BOJECT(skel_name); \ + goto load_bpf_skel_out; \ + } \ + struct bpf_program* prog; \ + prog = bpf_object__find_program_by_name(skel_name->obj, "execve_rate"); \ + if (prog) {bpf_program__set_autoload(prog, false); \ + printf("execve_rate found and set not to autoattach"); \ + } \ + else printf("execve_rate not found "); \ + prog = bpf_object__find_program_by_name(skel_name->obj, "execve_send"); \ + if (prog) {bpf_program__set_autoload(prog, false); \ + printf("execve_send found and set not to autoattach"); \ + } \ + else printf("execve_send not found "); \ + __ret = skel_name##_bpf__attach(skel_name); \ + if (__ret) \ + { \ + printf("failed to attach BPF programs: %s\n", strerror(-__ret)); \ + DESTORY_SKEL_BOJECT(skel_name); \ + goto load_bpf_skel_out; \ + } \ + struct perf_thread_arguments *perf_args = calloc(1, sizeof(struct perf_thread_arguments)); \ + if (!perf_args) \ + { \ + __ret = -ENOMEM; \ + printf("failed to allocate memory: %s\n", strerror(-__ret)); \ + DESTORY_SKEL_BOJECT(skel_name); \ + goto load_bpf_skel_out; \ + } \ + perf_args->mapfd = bpf_map__fd(skel_name->maps.perf); \ + perf_args->sample_cb = handle_event; \ + perf_args->lost_cb = handle_lost_events; \ + perf_args->ctx = arg; \ + perf_thread = beeQ_send_thread(arg, perf_args, thread_worker); \ + load_bpf_skel_out: \ + __ret; \ + }) + +#define DESTORY_SKEL_BOJECT(skel_name) \ + if (perf_thread != 0) \ + plugin_thread_stop(perf_thread); \ + skel_name##_bpf__destroy(skel_name); +#else +#define DEFINE_SEKL_OBJECT(skel_name) \ + struct skel_name##_bpf *skel_name = NULL; + +#define LOAD_SKEL_OBJECT(skel_name, perf) \ + ( \ + { \ + __label__ load_bpf_skel_out; \ + int __ret = 0; \ + skel_name = skel_name##_bpf__open(); \ + if (!skel_name) \ + { \ + printf("failed to open BPF object\n"); \ + __ret = -1; \ + goto load_bpf_skel_out; \ + } \ + __ret = skel_name##_bpf__load(skel_name); \ + if (__ret) \ + { \ + printf("failed to load BPF object: %d\n", __ret); \ + DESTORY_SKEL_BOJECT(skel_name); \ + goto load_bpf_skel_out; \ + } \ + struct bpf_program* prog; \ + prog = bpf_object__find_program_by_name(skel_name->obj, "execve_rate"); \ + if (prog) {bpf_program__set_autoload(prog, false); \ + printf("execve_rate found and set not to autoattach\n"); \ + } \ + else printf("execve_rate not found "); \ + prog = bpf_object__find_program_by_name(skel_name->obj, "execve_send"); \ + if (prog) {bpf_program__set_autoload(prog, false); \ + printf("execve_send found and set not to autoattach\n"); \ + } \ + else printf("execve_send not found "); \ + __ret = skel_name##_bpf__attach(skel_name); \ + if (__ret) \ + { \ + printf("failed to attach BPF 
programs: %s\n", strerror(-__ret)); \ + DESTORY_SKEL_BOJECT(skel_name); \ + goto load_bpf_skel_out; \ + } \ + load_bpf_skel_out: \ + __ret; \ + }) + +#define DESTORY_SKEL_BOJECT(skel_name) \ + skel_name##_bpf__destroy(skel_name); +#endif + +#define coobpf_map_find(OBJ, NAME) bpf_object__find_map_fd_by_name(OBJ, NAME) +#define coobpf_key_next(FD, KEY, NEXT) bpf_map_get_next_key(FD, KEY, NEXT) +#define coobpf_key_value(FD, KEY, VALUE) bpf_map_lookup_elem(FD, KEY, VALUE) + + + +#endif //SYSAK_BPF_HEAD_H diff --git a/src/security/bpf_process_event.h b/src/security/bpf_process_event.h new file mode 100644 index 0000000000000000000000000000000000000000..1296c9478d1a55874c3b6dc7a6a8528401c08c6d --- /dev/null +++ b/src/security/bpf_process_event.h @@ -0,0 +1,609 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_PROCESS_EVENT_H +#define SYSAK_BPF_PROCESS_EVENT_H + +#include +#include + +#include "../coolbpf.h" +#include "api.h" +#include "bpf_cgroup.h" +#include "bpf_common.h" +#include "bpf_cred.h" +#include "compiler.h" +#include "ebpf_log.h" + +#define ENAMETOOLONG 36 /* File name too long */ + +#define MAX_BUF_LEN 4096 + +struct buffer_heap_map_value { + // Buffer is twice the needed size because of the verifier. In prepend_name + // unit tests, the verifier figures out that 255 is enough and that the + // buffer_offset will not overflow, but in the real use-case it looks like + // it's forgetting about that. + unsigned char buf[MAX_BUF_LEN + 256]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct buffer_heap_map_value); +} buffer_heap_map SEC(".maps"); + +FUNC_INLINE __u64 __get_auid(struct task_struct *task) +{ + // u64 to convince compiler to do 64bit loads early kernels do not + // support 32bit loads from stack, e.g. r1 = *(u32 *)(r10 -8). 
+ __u64 auid = 0; + + if (!task) + return auid; + + if (bpf_core_field_exists(task->loginuid)) { + bpf_probe_read(&auid, sizeof(auid), _(&task->loginuid.val)); + } else { +// struct audit_task_info *audit; +// +// if (bpf_core_field_exists(task->audit)) { +// bpf_probe_read(&audit, sizeof(audit), _(&task->audit)); +// if (audit) { +// bpf_probe_read(&auid, sizeof(__u32), +// _(&audit->loginuid)); +// } +// } + } + + return auid; +} + +FUNC_INLINE __u32 get_auid(void) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + + return __get_auid(task); +} + +#define offsetof_btf(s, memb) ((size_t)((char *)_(&((s *)0)->memb) - (char *)0)) + +#define container_of_btf(ptr, type, member) \ + ({ \ + void *__mptr = (void *)(ptr); \ + ((type *)(__mptr - offsetof_btf(type, member))); \ + }) + +FUNC_INLINE struct mount *real_mount(struct vfsmount *mnt) +{ + return container_of_btf(mnt, struct mount, mnt); +} + +FUNC_INLINE bool IS_ROOT(struct dentry *dentry) +{ + struct dentry *d_parent; + + bpf_probe_read(&d_parent, sizeof(d_parent), _(&dentry->d_parent)); + return (dentry == d_parent); +} + +FUNC_INLINE bool hlist_bl_unhashed(const struct hlist_bl_node *h) +{ + struct hlist_bl_node **pprev; + + bpf_probe_read(&pprev, sizeof(pprev), _(&h->pprev)); + return !pprev; +} + +FUNC_INLINE int d_unhashed(struct dentry *dentry) +{ + return hlist_bl_unhashed(_(&dentry->d_hash)); +} + +FUNC_INLINE int d_unlinked(struct dentry *dentry) +{ + return d_unhashed(dentry) && !IS_ROOT(dentry); +} + +FUNC_INLINE int +prepend_name(char *buf, char **bufptr, int *buflen, const char *name, u32 namelen) +{ + // contains 1 if the buffer is large enough to contain the whole name and a slash prefix + bool write_slash = 1; + + u64 buffer_offset = (u64)(*bufptr) - (u64)buf; + + // Change name and namelen to fit in the buffer. + // We prefer to store the part of it that fits rather than discard it. + if (namelen >= *buflen) { + name += namelen - *buflen; + namelen = *buflen; + write_slash = 0; + } + + *buflen -= (namelen + write_slash); + + if (namelen + write_slash > buffer_offset) + return -ENAMETOOLONG; + + buffer_offset -= (namelen + write_slash); + + // This will never happen. buffer_offset is the diff of the initial buffer pointer + // with the current buffer pointer. This will be at max 256 bytes (similar to the initial + // size). + // Needed to bound that for bpf_probe_read call. + if (buffer_offset >= MAX_BUF_LEN) + return -ENAMETOOLONG; + + if (write_slash) + buf[buffer_offset] = '/'; + + // This ensures that namelen is < 256, which is aligned with kernel's max dentry name length + // that is 255 (https://elixir.bootlin.com/linux/v5.10/source/include/uapi/linux/limits.h#L12). + // Needed to bound that for bpf_probe_read call. + asm volatile("%[namelen] &= 0xff;\n" ::[namelen] "+r"(namelen) + :); + bpf_probe_read(buf + buffer_offset + write_slash, namelen * sizeof(char), name); + + *bufptr = buf + buffer_offset; + return write_slash ? 0 : -ENAMETOOLONG; +} + +/* + * Only called from path_with_deleted function before any path traversals. + * In the current scenarios, always buflen will be 256 and namelen 10. + * For this reason I will never return -ENAMETOOLONG. 
+ */ +FUNC_INLINE int +prepend(char **buffer, int *buflen, const char *str, int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) // will never happen - check function comment + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +struct cwd_read_data { + struct dentry *root_dentry; + struct vfsmount *root_mnt; + char *bf; + struct dentry *dentry; + struct vfsmount *vfsmnt; + struct mount *mnt; + char *bptr; + int blen; + bool resolved; +}; + +FUNC_INLINE long cwd_read(struct cwd_read_data *data) +{ + struct qstr d_name; + struct dentry *parent; + struct dentry *vfsmnt_mnt_root; + struct dentry *dentry = data->dentry; + struct vfsmount *vfsmnt = data->vfsmnt; + struct mount *mnt = data->mnt; + int error; + + if (!(dentry != data->root_dentry || vfsmnt != data->root_mnt)) { + data->resolved = + true; // resolved all path components successfully + return 1; + } + + bpf_probe_read(&vfsmnt_mnt_root, sizeof(vfsmnt_mnt_root), + _(&vfsmnt->mnt_root)); + if (dentry == vfsmnt_mnt_root || IS_ROOT(dentry)) { + struct mount *parent; + + bpf_probe_read(&parent, sizeof(parent), _(&mnt->mnt_parent)); + + /* Global root? */ + if (data->mnt != parent) { + bpf_probe_read(&data->dentry, sizeof(data->dentry), + _(&mnt->mnt_mountpoint)); + data->mnt = parent; + data->vfsmnt = _(&parent->mnt); + return 0; + } + // resolved all path components successfully + data->resolved = true; + return 1; + } + bpf_probe_read(&parent, sizeof(parent), _(&dentry->d_parent)); + bpf_probe_read(&d_name, sizeof(d_name), _(&dentry->d_name)); + error = prepend_name(data->bf, &data->bptr, &data->blen, + (const char *)d_name.name, d_name.len); + // This will happen where the dentry name does not fit in the buffer. + // We will stop the loop with resolved == false and later we will + // set the proper value in error before function return. 
+ if (error) + return 1; + + data->dentry = parent; + return 0; +} + +#ifdef __V61_BPF_PROG +static long cwd_read_v61(__u32 index, void *data) +{ + return cwd_read(data); +} +#endif +FUNC_INLINE int +prepend_path(const struct path *path, const struct path *root, char *bf, + char **buffer, int *buflen) +{ + struct cwd_read_data data = { + .bf = bf, + .bptr = *buffer, + .blen = *buflen, + }; + int error = 0; + + bpf_probe_read(&data.root_dentry, sizeof(data.root_dentry), + _(&root->dentry)); + bpf_probe_read(&data.root_mnt, sizeof(data.root_mnt), _(&root->mnt)); + bpf_probe_read(&data.dentry, sizeof(data.dentry), _(&path->dentry)); + bpf_probe_read(&data.vfsmnt, sizeof(data.vfsmnt), _(&path->mnt)); + data.mnt = real_mount(data.vfsmnt); + +#ifndef __V61_BPF_PROG +#pragma unroll + for (int i = 0; i < PROBE_CWD_READ_ITERATIONS; ++i) { + if (cwd_read(&data)) + break; + } +#else + loop(PROBE_CWD_READ_ITERATIONS, cwd_read_v61, (void *)&data, 0); +#endif /* __V61_BPF_PROG */ + + if (data.bptr == *buffer) { + *buflen = 0; + return 0; + } + if (!data.resolved) + error = UNRESOLVED_PATH_COMPONENTS; + *buffer = data.bptr; + *buflen = data.blen; + return error; +} + +FUNC_INLINE int +path_with_deleted(const struct path *path, const struct path *root, char *bf, + char **buf, int *buflen) +{ + struct dentry *dentry; + + bpf_probe_read(&dentry, sizeof(dentry), _(&path->dentry)); + if (d_unlinked(dentry)) { + int error = prepend(buf, buflen, " (deleted)", 10); + if (error) // will never happen as prepend will never return a value != 0 + return error; + } + return prepend_path(path, root, bf, buf, buflen); +} + +/* + * This function returns the path of a dentry and works in a similar + * way to Linux d_path function (https://elixir.bootlin.com/linux/v5.10/source/fs/d_path.c#L262). + * + * Input variables: + * - 'path' is a pointer to a dentry path that we want to resolve + * - 'buf' is the buffer where the path will be stored (this should be always the value of 'buffer_heap_map' map) + * - 'buflen' is the available buffer size to store the path (now 256 in all cases, maybe we can increase that further) + * + * Input buffer layout: + * <-- buflen --> + * ----------------------------- + * | | + * ----------------------------- + * ^ + * | + * buf + * + * + * Output variables: + * - 'buf' is where the path is stored (>= compared to the input argument) + * - 'buflen' the size of the resolved path (0 < buflen <= 256). Will not be negative. If buflen == 0 nothing is written to the buffer. + * - 'error' 0 in case of success or UNRESOLVED_PATH_COMPONENTS in the case where the path is larger than the provided buffer. + * + * Output buffer layout: + * <-- buflen --> + * ----------------------------- + * | /etc/passwd| + * ----------------------------- + * ^ + * | + * buf + * + * ps. The size of the path will be (initial value of buflen) - (return value of buflen) if (buflen != 0) + */ +FUNC_INLINE char * +__d_path_local(const struct path *path, char *buf, int *buflen, int *error) +{ + char *res = buf + *buflen; + struct task_struct *task; + struct fs_struct *fs; + + task = (struct task_struct *)bpf_get_current_task(); + bpf_probe_read(&fs, sizeof(fs), _(&task->fs)); + *error = path_with_deleted(path, _(&fs->root), buf, &res, buflen); + return res; +} + +/* + * Entry point to the codepath used for path resolution. + * + * This function allocates a buffer from 'buffer_heap_map' map and calls + * __d_path_local. 
After __d_path_local returns, it also does the appropriate + * calculations on the buffer size (check __d_path_local comment). + * + * Returns the buffer where the path is stored. 'buflen' is the size of the + * resolved path (0 < buflen <= 256) and will not be negative. If buflen == 0 + * nothing is written to the buffer (still the value to the buffer is valid). + * 'error' is 0 in case of success or UNRESOLVED_PATH_COMPONENTS in the case + * where the path is larger than the provided buffer. + */ +FUNC_INLINE char * +d_path_local(const struct path *path, int *buflen, int *error) +{ + int zero = 0; + char *buffer = 0; + + buffer = bpf_map_lookup_elem(&buffer_heap_map, &zero); + if (!buffer) + return 0; + + *buflen = MAX_BUF_LEN; + buffer = __d_path_local(path, buffer, buflen, error); + if (*buflen > 0) + *buflen = MAX_BUF_LEN - *buflen; + + return buffer; +} + +FUNC_INLINE __u32 +getcwd(struct msg_process *curr, __u32 offset, __u32 proc_pid) +{ + struct task_struct *task = get_task_from_pid(proc_pid); + struct fs_struct *fs; + int flags = 0, size; + char *buffer; + + bpf_probe_read(&fs, sizeof(fs), _(&task->fs)); + if (!fs) { + curr->flags |= EVENT_ERROR_CWD; + return 0; + } + + buffer = d_path_local(_(&fs->pwd), &size, &flags); + if (!buffer) + return 0; + + asm volatile("%[offset] &= 0x3ff;\n" ::[offset] "+r"(offset) + :); + if (size > 255) + size = 255; + asm volatile("%[size] &= 0xff;\n" ::[size] "+r"(size) + :); + bpf_probe_read((char *)curr + offset, size, buffer); + + // Unfortunate special case for '/' where nothing was added we need + // to truncate with '\n' for parser. + if (size == 0) + curr->flags |= EVENT_ROOT_CWD; + if (flags & UNRESOLVED_PATH_COMPONENTS) + curr->flags |= EVENT_ERROR_PATH_COMPONENTS; + curr->flags = curr->flags & ~(EVENT_NEEDS_CWD | EVENT_ERROR_CWD); + return (__u32)size; +} + +FUNC_INLINE void event_set_clone(struct msg_process *pid) +{ + pid->flags |= EVENT_CLONE; +} + +FUNC_INLINE void +__get_caps(struct msg_capabilities *msg, const struct cred *cred) +{ + bpf_probe_read(&msg->effective, sizeof(__u64), _(&cred->cap_effective)); + bpf_probe_read(&msg->inheritable, sizeof(__u64), _(&cred->cap_inheritable)); + bpf_probe_read(&msg->permitted, sizeof(__u64), _(&cred->cap_permitted)); +} + +/* @get_current_subj_caps: + * Retrieve current task capabilities from the subjective credentials and + * return it into @msg. + * + * Use this function to report current task capabilities that will be used to + * calculate the security access when acting upon other objects. + * + * Special care must be taken to ensure that @task is "current". + * + * From: https://github.com/torvalds/linux/blob/v6.0/include/linux/cred.h#L88 + * " + * The security context of a task + * + * The parts of the context break down into two categories: + * + * (1) The objective context of a task. These parts are used when some other + * task is attempting to affect this one. + * + * (2) The subjective context. These details are used when the task is acting + * upon another object, be that a file, a task, a key or whatever. + * + * A task has two security pointers. task->real_cred points to the objective + * context that defines that task's actual details. The objective part of this + * context is used whenever that task is acted upon. + * + * task->cred points to the subjective context that defines the details of how + * that task is going to act upon another object. 
This may be overridden + * temporarily to point to another security context, but normally points to the + * same context as task->real_cred. + * " + */ +FUNC_INLINE void +get_current_subj_caps(struct msg_capabilities *msg, struct task_struct *task) +{ + const struct cred *cred; + + /* Get the task's subjective creds */ + bpf_probe_read(&cred, sizeof(cred), _(&task->cred)); + __get_caps(msg, cred); +} + +FUNC_INLINE void +get_current_subj_creds(struct msg_cred *info, struct task_struct *task) +{ + const struct cred *cred; + + /* Get the task's subjective creds */ + bpf_probe_read(&cred, sizeof(cred), _(&task->cred)); + + bpf_probe_read(&info->uid, sizeof(__u32), _(&cred->uid)); + bpf_probe_read(&info->gid, sizeof(__u32), _(&cred->gid)); + bpf_probe_read(&info->euid, sizeof(__u32), _(&cred->euid)); + bpf_probe_read(&info->egid, sizeof(__u32), _(&cred->egid)); + bpf_probe_read(&info->suid, sizeof(__u32), _(&cred->suid)); + bpf_probe_read(&info->sgid, sizeof(__u32), _(&cred->sgid)); + bpf_probe_read(&info->fsuid, sizeof(__u32), _(&cred->fsuid)); + bpf_probe_read(&info->fsgid, sizeof(__u32), _(&cred->fsgid)); + bpf_probe_read(&info->securebits, sizeof(__u32), _(&cred->securebits)); + + /* Get capabilities */ + __get_caps(&info->caps, cred); +} + +FUNC_INLINE void +get_namespaces(struct msg_ns *msg, struct task_struct *task) +{ + struct nsproxy *nsproxy; + struct nsproxy nsp; + + bpf_probe_read(&nsproxy, sizeof(nsproxy), _(&task->nsproxy)); + bpf_probe_read(&nsp, sizeof(nsp), _(nsproxy)); + + bpf_probe_read(&msg->uts_inum, sizeof(msg->uts_inum), + _(&nsp.uts_ns->ns.inum)); + bpf_probe_read(&msg->ipc_inum, sizeof(msg->ipc_inum), + _(&nsp.ipc_ns->ns.inum)); + bpf_probe_read(&msg->mnt_inum, sizeof(msg->mnt_inum), + _(&nsp.mnt_ns->ns.inum)); + { + struct pid *p = 0; + + bpf_probe_read(&p, sizeof(p), _(&task->thread_pid)); + if (p) { + int level = 0; + struct upid up; + + bpf_probe_read(&level, sizeof(level), _(&p->level)); + bpf_probe_read(&up, sizeof(up), _(&p->numbers[level])); + bpf_probe_read(&msg->pid_inum, sizeof(msg->pid_inum), + _(&up.ns->ns.inum)); + } else + msg->pid_inum = 0; + } + bpf_probe_read(&msg->pid_for_children_inum, + sizeof(msg->pid_for_children_inum), + _(&nsp.pid_ns_for_children->ns.inum)); + bpf_probe_read(&msg->net_inum, sizeof(msg->net_inum), + _(&nsp.net_ns->ns.inum)); + + // this also includes time_ns_for_children + if (bpf_core_field_exists(nsproxy->time_ns)) { + bpf_probe_read(&msg->time_inum, sizeof(msg->time_inum), + _(&nsp.time_ns->ns.inum)); + bpf_probe_read(&msg->time_for_children_inum, + sizeof(msg->time_for_children_inum), + _(&nsp.time_ns_for_children->ns.inum)); + } + + bpf_probe_read(&msg->cgroup_inum, sizeof(msg->cgroup_inum), + _(&nsp.cgroup_ns->ns.inum)); + { + struct mm_struct *mm; + struct user_namespace *user_ns; + + bpf_probe_read(&mm, sizeof(mm), _(&task->mm)); + bpf_probe_read(&user_ns, sizeof(user_ns), _(&mm->user_ns)); + bpf_probe_read(&msg->user_inum, sizeof(msg->user_inum), + _(&user_ns->ns.inum)); + } +} + +/* Gather current task cgroup name */ +FUNC_INLINE __u32 +__event_get_current_cgroup_name(struct cgroup *cgrp, struct msg_k8s *kube) +{ + const char *name; + + name = get_cgroup_name(cgrp); + if (name) + bpf_probe_read_str(kube->docker_id, KN_NAME_LENGTH, name); + + return name ? 0 : EVENT_ERROR_CGROUP_NAME; +} + +/** + * __event_get_cgroup_info() Collect cgroup info from current task. + * @task: must be current task. + * @msg: the msg_execve_event where to store collected information. 
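+ *
+ * Returns a flags value that callers OR into the event flags, e.g.
+ * (illustrative, mirrors event_execve() in security.bpf.c):
+ *
+ *	p->flags |= __event_get_cgroup_info(task, &event->kube);
+ *
+ * with bits such as EVENT_ERROR_CGROUP_ID or EVENT_ERROR_CGROUP_NAME set
+ * when the cgroup id or name cannot be read.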
+ * + * Checks the tg_conf_map BPF map for cgroup and runtime configurations then + * collects cgroup information from current task. This allows to operate on + * different machines and workflows. + */ +FUNC_INLINE __u32 +__event_get_cgroup_info(struct task_struct *task, struct msg_k8s *kube) +{ + __u64 cgrpfs_magic = 0; + int zero = 0, subsys_idx = 0; + struct cgroup *cgrp; + struct tetragon_conf *conf; + __u32 flags = 0; + + /* Clear cgroup info at the beginning, so if we return early we do not pass previous data */ + memset(kube, 0, sizeof(struct msg_k8s)); + + conf = bpf_map_lookup_elem(&tg_conf_map, &zero); + if (conf) { + /* Select which cgroup version */ + cgrpfs_magic = conf->cgrp_fs_magic; + subsys_idx = conf->tg_cgrp_subsys_idx; + } + + cgrp = get_task_cgroup(task, subsys_idx, &flags); + if (!cgrp) + return 0; + + /* Collect event cgroup ID */ + kube->cgrpid = __tg_get_current_cgroup_id(cgrp, cgrpfs_magic); + if (!kube->cgrpid) + flags |= EVENT_ERROR_CGROUP_ID; + + /* Get the cgroup name of this event. */ + flags |= __event_get_current_cgroup_name(cgrp, kube); + return flags; +} + +FUNC_INLINE void +set_in_init_tree(struct execve_map_value *curr, struct execve_map_value *parent) +{ + if (parent && parent->flags & EVENT_IN_INIT_TREE) { + curr->flags |= EVENT_IN_INIT_TREE; + BPF_DEBUG("%s: parent in init tree", __func__); + return; + } + + if (curr->nspid == 1) { + curr->flags |= EVENT_IN_INIT_TREE; + BPF_DEBUG("%s: nspid=1", __func__); + } +} +#endif // SYSAK_BPF_PROCESS_EVENT_H diff --git a/src/security/bpf_process_event_type.h b/src/security/bpf_process_event_type.h new file mode 100644 index 0000000000000000000000000000000000000000..13bc12420cb4809c3e6a668f7d93c6dd7b632c16 --- /dev/null +++ b/src/security/bpf_process_event_type.h @@ -0,0 +1,339 @@ +// +// Created by qianlu on 2024/6/20. +// + +#ifndef SYSAK_BPF_PROCESS_EVENT_TYPE_H +#define SYSAK_BPF_PROCESS_EVENT_TYPE_H + +#ifdef __cplusplus +#include +#endif +#include "bpf_common.h" +#include "bpf_cred.h" +#include "msg_type.h" + + +/* Max number of args to parse */ +#define MAXARGS 20 +/* Max length of any given arg */ +#define MAXARGLENGTH 256 +/* This is the absolute buffer size for args and filenames including some + * extra head room so we can append last args string to buffer. The extra + * headroom is an unfortunate result of bounds on offset/size in + * event_args_builder(). + * + * For example given an offset bounds + * + * offset <- (0, 100) + * + * We will read into the buffer using this offset giving a max offset + * of eargs + 100. + * + * args[offset] <- (0, 100) + * + * Now we want to read this with call 45 aka bpf_probe_read_str as follows, + * where 'kernel_struct_arg' is the kernel data struct we are reading. + * + * bpf_probe_read_str(args[offset], size, kernel_struct_arg) + * + * But we have a bit of a problem determining if 'size' is out of array + * range. 
The math would be, + * + * size = length - offset + * + * Giving the remainder of the buffer, + * + * args offset length + * |---------------|------------------| + * + * |-------size-------| + * + * But verifier math works on bounds so bounds analysis of size is the + * following, + * + * length = 1024 + * offset = (0, 100) + * + * size = length - offset + * size = (1024) - (0, 100) + * size <- (924, 1124) + * + * And verifier throws an error because args[offset + size] with bounds + * anaylsis, + * + * args_(max)[100 + 1024] = args_(max)[1124] + * + * To circumvent this, at least until we teach the verifier about + * dependent variables, create a maxarg value and pad arg buffer with + * it. Giving a args buffer of size 'length + pad' with above bounds + * analysis, + * + * size = length - offset + * size = (1024) - (0, 100) + * if size > pad goto done + * size <- (924, 1124) // 1124 < length + pad + * + * Phew all clear now? + */ +#define CWD_MAX 256 +#define BUFFER 1024 +#define SIZEOF_EVENT 56 +#define PADDED_BUFFER \ + (BUFFER + MAXARGLENGTH + SIZEOF_EVENT + SIZEOF_EVENT + CWD_MAX) +/* This is the usable buffer size for args and filenames. It is calculated + * as the (BUFFER SIZE - sizeof(parent) - sizeof(curr) but unfortunately + * preprocess doesn't know types so we do it manually without sizeof(). + */ +#define ARGSBUFFER (BUFFER - SIZEOF_EVENT - SIZEOF_EVENT) +#define __ASM_ARGSBUFFER 976 +#define ARGSBUFFERMASK (ARGSBUFFER - 1) +#define MAXARGMASK (MAXARG - 1) +#define PATHNAME_SIZE 256 + +/* Task flags */ +#ifndef PF_KTHREAD +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#endif + +/* Msg flags */ +#define EVENT_UNKNOWN 0x00 +#define EVENT_EXECVE 0x01 +#define EVENT_EXECVEAT 0x02 +#define EVENT_PROCFS 0x04 +#define EVENT_TRUNC_FILENAME 0x08 +#define EVENT_TRUNC_ARGS 0x10 +#define EVENT_TASK_WALK 0x20 +#define EVENT_MISS 0x40 +#define EVENT_NEEDS_AUID 0x80 +#define EVENT_ERROR_FILENAME 0x100 +#define EVENT_ERROR_ARGS 0x200 +#define EVENT_NEEDS_CWD 0x400 +#define EVENT_NO_CWD_SUPPORT 0x800 +#define EVENT_ROOT_CWD 0x1000 +#define EVENT_ERROR_CWD 0x2000 +#define EVENT_CLONE 0x4000 +#define EVENT_ERROR_SOCK 0x8000 +#define EVENT_ERROR_CGROUP_NAME 0x010000 +#define EVENT_ERROR_CGROUP_KN 0x020000 +#define EVENT_ERROR_CGROUP_SUBSYSCGRP 0x040000 +#define EVENT_ERROR_CGROUP_SUBSYS 0x080000 +#define EVENT_ERROR_CGROUPS 0x100000 +#define EVENT_ERROR_CGROUP_ID 0x200000 +#define EVENT_ERROR_PATH_COMPONENTS 0x400000 +#define EVENT_DATA_FILENAME 0x800000 +#define EVENT_DATA_ARGS 0x1000000 +#define EVENT_IN_INIT_TREE 0x2000000 + +#define EVENT_COMMON_FLAG_CLONE 0x01 + +/* Docker IDs are unique at first 12 characters, but we want to get + * 12chars plus any extra prefix used by the container environment. + * Minikube for example prepends 'docker-' to the id. So lets copy + * 32B and assume at least 12B of it is ID info. + */ +#define DOCKER_ID_LENGTH 128 + +struct msg_execve_key { + __u32 pid; // Process TGID + __u8 pad[4]; + __u64 ktime; +}; // All fields aligned so no 'packed' attribute. + +/* This is the struct stored in bpf map to share info between + * different execve hooks. + */ +struct execve_info { + /* The secureexec is to reflect the kernel bprm->secureexec that is exposed + * to userspace through auxiliary vector which can be read from + * /proc/self/auxv or https://man7.org/linux/man-pages/man3/getauxval.3.html + * + * The AT_SECURE of auxv can have a value of 1 or 0 and it is set from + * the bprm->secureexec that is a bit field. 
+ * If bprm->secureexec is 1 then it means executable should be treated securely. + * Most commonly, 1 indicates that the process is executing a set-user-ID + * or set-group-ID binary (so that its real and effective UIDs or GIDs differ + * from one another), or that it gained capabilities by executing a binary file + * that has capabilities (see capabilities(7)). + * Alternatively, a nonzero value may be triggered by a Linux Security Module. + * When this value is nonzero, the dynamic linker disables the use of certain + * environment variables. + * + * The secureexec here can have the following bit flags: + * EXEC_SETUID or EXEC_SETGID + */ + __u32 secureexec; + __u32 i_nlink; /* inode links */ + __u64 i_ino; /* inode number */ +}; + +/* process information + * + * Manually linked to ARGSBUFFER and PADDED_BUFFER if this changes then please + * also change SIZEOF_EVENT. + */ +struct msg_process { + __u32 size; + __u32 pid; // Process TGID + __u32 tid; // Process thread + __u32 nspid; + __u32 secureexec; + __u32 uid; + __u32 auid; + __u32 flags; + __u32 i_nlink; + __u32 pad; + __u64 i_ino; + __u64 ktime; + char *args; +}; // All fields aligned so no 'packed' attribute. + +/* msg_clone_event holds only the necessary fields to construct a new entry from + * the parent after a clone() event. + */ +struct msg_clone_event { + struct msg_common common; + struct msg_execve_key parent; + __u32 tgid; + __u32 tid; + __u32 nspid; + __u32 flags; + __u64 ktime; +} __attribute__((packed)); + +struct exit_info { + __u32 code; + __u32 tid; // Thread ID +}; + +struct msg_exit { + struct msg_common common; + struct msg_execve_key current; + struct exit_info info; +}; // All fields aligned so no 'packed' attribute. + +enum { + ns_uts = 0, + ns_ipc = 1, + ns_mnt = 2, + ns_pid = 3, + ns_pid_for_children = 4, + ns_net = 5, + ns_time = 6, + ns_time_for_children = 7, + ns_cgroup = 8, + ns_user = 9, + + // If you update the value of ns_max_types you + // should also update parseMatchNamespaces() + // in kernel.go + ns_max_types = 10, +}; + +struct msg_ns { + union { + struct { + __u32 uts_inum; + __u32 ipc_inum; + __u32 mnt_inum; + __u32 pid_inum; + __u32 pid_for_children_inum; + __u32 net_inum; + __u32 time_inum; + __u32 time_for_children_inum; + __u32 cgroup_inum; + __u32 user_inum; + }; + __u32 inum[ns_max_types]; + }; +}; // All fields aligned so no 'packed' attribute. + +struct msg_k8s { + __u64 cgrpid; + char docker_id[DOCKER_ID_LENGTH]; +}; // All fields aligned so no 'packed' attribute. + +#define BINARY_PATH_MAX_LEN 256 + +struct heap_exe { + char buf[BINARY_PATH_MAX_LEN]; + __u32 len; + __u32 error; +}; // All fields aligned so no 'packed' attribute. + +struct msg_execve_event { + struct msg_common common; + struct msg_k8s kube; + struct msg_execve_key parent; + __u64 parent_flags; + struct msg_cred creds; + struct msg_ns ns; + struct msg_execve_key cleanup_key; + /* if add anything above please also update the args of + * validate_msg_execve_size() in bpf_execve_event.c */ + union { + struct msg_process process; + char buffer[PADDED_BUFFER]; + }; + /* below fields are not part of the event, serve just as + * heap for execve programs + */ +#ifdef __LARGE_BPF_PROG + struct heap_exe exe; +#endif +}; // All fields aligned so no 'packed' attribute. + +// This structure stores the binary path that was recorded on execve. +// Technically PATH_MAX is 4096 but we limit the length we store since we have +// limits on the length of the string to compare: +// - Artificial limits for full string comparison. 
+// - Technical limits for prefix and postfix, using LPM_TRIE that have a 256 +// bytes size limit. +struct binary { + // length of the path stored in path, this should be < BINARY_PATH_MAX_LEN + // but can contain negative value in case of copy error. + // While s16 would be sufficient, 64 bits are handy for alignment. + __s64 path_length; + // BINARY_PATH_MAX_LEN first bytes of the path + char path[BINARY_PATH_MAX_LEN]; +}; // All fields aligned so no 'packed' attribute + +// The execve_map_value is tracked by the TGID of the thread group +// the msg_execve_key.pid. The thread IDs are recorded on the +// fly and sent with every corresponding event. +struct execve_map_value { + struct msg_execve_key key; + struct msg_execve_key pkey; + __u32 flags; + __u32 nspid; + struct msg_ns ns; + struct msg_capabilities caps; + struct binary bin; +} __attribute__((packed)) __attribute__((aligned(8))); + + +struct msg_throttle { + struct msg_common common; + struct msg_k8s kube; +}; + + +struct cgroup_rate_key { + __u64 id; +}; + +struct cgroup_rate_value { + __u64 curr; + __u64 prev; + __u64 time; + __u64 rate; + __u64 throttled; +}; + +struct cgroup_rate_options { + __u64 events; + __u64 interval; +}; + + + +#endif //SYSAK_BPF_PROCESS_EVENT_TYPE_H diff --git a/src/security/bpf_rate.h b/src/security/bpf_rate.h new file mode 100644 index 0000000000000000000000000000000000000000..c46cebf67988d45ff2144fdbb28f1929eb03a797 --- /dev/null +++ b/src/security/bpf_rate.h @@ -0,0 +1,140 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_RATE_H +#define SYSAK_BPF_RATE_H + +#include "../coolbpf.h" +#include +#include + +#include "msg_type.h" +#include "bpf_process_event_type.h" + + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 32768); + __type(key, struct cgroup_rate_key); + __type(value, struct cgroup_rate_value); +} cgroup_rate_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct cgroup_rate_options); +} cgroup_rate_options_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct msg_throttle); +} throttle_heap_map SEC(".maps"); + +FUNC_INLINE void send_throttle(void *ctx, struct msg_k8s *kube, __u64 time) +{ + struct msg_throttle *msg; + size_t size = sizeof(*msg); + + msg = bpf_map_lookup_elem(&throttle_heap_map, &(__u32){ 0 }); + if (!msg) + return; + + msg->common.size = size; + msg->common.ktime = time; + msg->common.op = MSG_OP_THROTTLE; + msg->common.flags = 0; + + __builtin_memcpy(&msg->kube, kube, sizeof(*kube)); + + perf_event_output_metric(ctx, MSG_OP_THROTTLE, &tcpmon_map, + BPF_F_CURRENT_CPU, msg, size); +} + +FUNC_INLINE bool cgroup_rate(void *ctx, struct msg_k8s *kube, __u64 time) +{ + struct cgroup_rate_options *opt; + struct cgroup_rate_key key = { + .id = kube->cgrpid, + }; + struct cgroup_rate_value *val; + __u64 delta, interval, slide; + __u32 zero = 0; + + opt = bpf_map_lookup_elem(&cgroup_rate_options_map, &zero); + if (!opt) + return true; + + interval = opt->interval; + if (!interval) + return true; + + val = bpf_map_lookup_elem(&cgroup_rate_map, &key); + if (!val) { + struct cgroup_rate_value new_value = { + .time = (time / interval) * interval, + .curr = 1, + }; + + bpf_map_update_elem(&cgroup_rate_map, &key, &new_value, 0); + return true; + } + + /* + * We split the time in interval windows and keep track of events + * of events count in current (val->curr) and previous 
(val->prev) + * intervals. + */ + + delta = time - val->time; + if (delta > interval) { + if (delta > 2 * interval) { + val->prev = 0; + val->time = (time / interval) * interval; + } else { + val->prev = val->curr; + val->time += interval; + } + val->curr = 0; + } + + val->curr++; + + /* + * We compute the size of the slide window in previous interval and + * based on that we compute partial amount of events from previous + * interval window. Then we add current interval count and we have + * rate value. + * + * val->time + * | + * <--- interval ----->|<--- interval ----->| + * | + * val->prev | val->curr + * |-------------------|----------- + * val->rate + * |-------------------| + * time + */ + + slide = interval - (time - val->time); + val->rate = (slide * val->prev) / interval + val->curr; + + if (!val->throttled && val->rate >= opt->events) { + val->throttled = time; + send_throttle(ctx, kube, time); + } + + return !val->throttled; +} + +FUNC_INLINE void cgroup_rate_del(__u64 cgroupid) +{ + bpf_map_delete_elem(&cgroup_rate_map, &cgroupid); +} + + +#endif //SYSAK_BPF_RATE_H diff --git a/src/security/bpf_task.h b/src/security/bpf_task.h new file mode 100644 index 0000000000000000000000000000000000000000..550113d6459828768ed1c65eefd4f269c6f6fea3 --- /dev/null +++ b/src/security/bpf_task.h @@ -0,0 +1,180 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_BPF_TASK_H +#define SYSAK_BPF_TASK_H + +#include "../coolbpf.h" +#include +#include + +#include "compiler.h" +#include "bpf_event.h" +#include "generic.h" + +/* __d_path_local flags */ +// #define UNRESOLVED_MOUNT_POINTS 0x01 // (deprecated) +// this error is returned by __d_path_local in the following cases: +// - the path walk did not conclude (too many dentry) +// - the path was too long to fit in the buffer +#define UNRESOLVED_PATH_COMPONENTS 0x02 + +#ifdef __LARGE_BPF_PROG +#define PROBE_CWD_READ_ITERATIONS 128 +#else +#define PROBE_CWD_READ_ITERATIONS 11 +#endif + +FUNC_INLINE struct task_struct *get_parent(struct task_struct *t) +{ + struct task_struct *task; + + /* Read the real parent */ + bpf_probe_read(&task, sizeof(task), _(&t->real_parent)); + if (!task) + return 0; + return task; +} + +FUNC_INLINE struct task_struct *get_task_from_pid(__u32 pid) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + __u32 cpid = 0; + int i; + +#define TASK_PID_LOOP 20 +#pragma unroll + for (i = 0; i < TASK_PID_LOOP; i++) { + if (!task) { + i = TASK_PID_LOOP; + continue; + } + bpf_probe_read(&cpid, sizeof(cpid), _(&task->tgid)); + if (cpid == pid) { + i = TASK_PID_LOOP; + continue; + } + task = get_parent(task); + } + if (cpid != pid) + return 0; + return task; +} + +FUNC_INLINE __u32 get_task_pid_vnr(void) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int thread_pid_exists; + unsigned int level; + struct upid upid; + struct pid *pid; + int upid_sz; + + thread_pid_exists = bpf_core_field_exists(task->thread_pid); + if (thread_pid_exists) { + bpf_probe_read(&pid, sizeof(pid), _(&task->thread_pid)); + if (!pid) + return 0; + } else { +// struct pid_link link; +// int link_sz = bpf_core_field_size(task->pids); +// +// /* 4.14 verifier did not prune this branch even though we +// * have the if (0) above after BTF exists check. So it will +// * try to run this bpf_probe_read and throw an error. So lets +// * sanitize it for the verifier. 
+// */ +// if (!thread_pid_exists) +// link_sz = +// 24; // voodoo magic, hard-code 24 to init stack +// bpf_probe_read(&link, link_sz, +// (void *)_(&task->pids) + (PIDTYPE_PID * link_sz)); +// pid = link.pid; + } + upid_sz = bpf_core_field_size(pid->numbers[0]); + bpf_probe_read(&level, sizeof(level), _(&pid->level)); + if (level < 1) + return 0; + bpf_probe_read(&upid, upid_sz, + (void *)_(&pid->numbers) + (level * upid_sz)); + return upid.nr; +} + +FUNC_INLINE __u32 event_find_parent_pid(struct task_struct *t) +{ + struct task_struct *task = get_parent(t); + __u32 pid; + + if (!task) + return 0; + bpf_probe_read(&pid, sizeof(pid), _(&task->tgid)); + return pid; +} + +FUNC_INLINE struct execve_map_value * +__event_find_parent(struct task_struct *task) +{ + __u32 pid; + struct execve_map_value *value = 0; + int i; + +#pragma unroll + for (i = 0; i < 4; i++) { + bpf_probe_read(&task, sizeof(task), _(&task->real_parent)); + if (!task) + break; + bpf_probe_read(&pid, sizeof(pid), _(&task->tgid)); + value = execve_map_get_noinit(pid); + if (value && value->key.ktime != 0) + return value; + } + return 0; +} + +FUNC_INLINE struct execve_map_value *event_find_parent(void) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + + return __event_find_parent(task); +} + +FUNC_INLINE void +event_minimal_parent(struct msg_execve_event *event, struct task_struct *task) +{ + event->parent.pid = event_find_parent_pid(task); + event->parent.ktime = 0; + event->parent_flags = EVENT_MISS; +} + +FUNC_INLINE void event_minimal_curr(struct execve_map_value *event) +{ + event->key.pid = (bpf_get_current_pid_tgid() >> 32); + event->key.ktime = 0; // should we insert a time? + event->flags = EVENT_MISS; +} + +FUNC_INLINE struct execve_map_value *event_find_curr(__u32 *ppid, bool *walked) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + struct execve_map_value *value = 0; + int i; + __u32 pid; + +#pragma unroll + for (i = 0; i < 4; i++) { + bpf_probe_read(&pid, sizeof(pid), _(&task->tgid)); + value = execve_map_get_noinit(pid); + if (value && value->key.ktime != 0) + break; + value = 0; + *walked = 1; + bpf_probe_read(&task, sizeof(task), _(&task->real_parent)); + if (!task) + break; + } + *ppid = pid; + return value; +} + +#endif //SYSAK_BPF_TASK_H diff --git a/src/security/compiler.h b/src/security/compiler.h new file mode 100644 index 0000000000000000000000000000000000000000..2b105d6cf2e059b92d7a460bfe4ff43d72062c02 --- /dev/null +++ b/src/security/compiler.h @@ -0,0 +1,17 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_COMPILER_H +#define SYSAK_COMPILER_H + +#ifdef __V61_BPF_PROG +#define FUNC_LOCAL static __attribute__((noinline)) __attribute__((__unused__)) +#define FUNC_INLINE static inline __attribute__((always_inline)) +#else +/* Older kernels have all functions inlined. */ +#define FUNC_LOCAL static inline __attribute__((always_inline)) +#define FUNC_INLINE static inline __attribute__((always_inline)) +#endif + +#endif //SYSAK_COMPILER_H diff --git a/src/security/data_event.h b/src/security/data_event.h new file mode 100644 index 0000000000000000000000000000000000000000..e34a956f0dc025b5d3bda81deee3efbe202ef466 --- /dev/null +++ b/src/security/data_event.h @@ -0,0 +1,225 @@ +// +// Created by qianlu on 2024/6/16. 
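+// Helpers to ship payloads that do not fit in the fixed-size event buffer
+// (arguments, filenames) to user space as separate MSG_OP_DATA perf events,
+// in chunks of at most MSG_DATA_ARG_LEN bytes. Typical use (illustrative,
+// mirrors read_args() in security.bpf.c):
+//
+//	size = data_event_bytes(ctx, (struct data_event_desc *)args,
+//				(unsigned long)start_stack, args_size,
+//				(struct bpf_map_def *)&data_heap);
+//	if (size > 0)
+//		p->flags |= EVENT_DATA_ARGS;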
+// + +#ifndef SYSAK_DATA_EVENT_H +#define SYSAK_DATA_EVENT_H + +#include +#include "data_msg.h" +#include "bpf_common.h" + +FUNC_LOCAL long +__do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t bytes) +{ + int err; + + /* Code movement from clang forces us to inline bounds checks here */ + asm volatile goto( + "if %[bytes] < 0 goto %l[b]\n;" + "if %[bytes] < " XSTR(MSG_DATA_ARG_LEN) " goto %l[a]\n;" + : + : [bytes] "+r"(bytes)::a, b); + bytes = MSG_DATA_ARG_LEN; + a: + // < 5.3 verifier still requires value masking like 'val &= xxx' +#ifndef __LARGE_BPF_PROG + asm volatile goto("if %[bytes] < 0x3fff goto %l[c]\n;" : : [bytes] "+r"(bytes)::c); + bytes = 0x3fff; + c: +#endif + err = bpf_probe_read(&msg->arg[0], bytes, (char *)uptr); + if (err < 0) + return err; + + msg->common.size = offsetof(struct msg_data, arg) + bytes; + err = bpf_perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, msg->common.size); + if (err < 0) + return err; + return bytes; + b: + return -1; +} + +FUNC_LOCAL long +do_bytes(void *ctx, struct msg_data *msg, unsigned long arg, size_t bytes) +{ + size_t rd_bytes = 0; + int err = 0, i __maybe_unused; + +#ifdef __LARGE_BPF_PROG + for (i = 0; i < 10; i++) { + err = __do_bytes(ctx, msg, arg + rd_bytes, bytes - rd_bytes); + if (err < 0) + goto error; + rd_bytes += err; + if (rd_bytes == bytes) + return rd_bytes; + } +#else +#define BYTES_COPY \ + err = __do_bytes(ctx, msg, arg + rd_bytes, bytes - rd_bytes); \ + if (err < 0) \ + goto error; \ + rd_bytes += err; \ + if (rd_bytes == bytes) \ + return rd_bytes; + +#define BYTES_COPY_5 BYTES_COPY BYTES_COPY BYTES_COPY BYTES_COPY BYTES_COPY + + BYTES_COPY_5 + BYTES_COPY_5 + +#undef BYTES_COPY_5 +#endif /* __LARGE_BPF_PROG */ + + /* leftover */ + return rd_bytes; +error: + perf_event_output_update_error_metric(MSG_OP_DATA, err); + return err; +} + +FUNC_LOCAL long +__do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done) +{ + size_t size, max = sizeof(msg->arg) - 1; + long ret; + + /* Code movement from clang forces us to inline bounds checks here */ + asm volatile("%[max] &= 0x7fff;\n" + "if %[max] < 32736 goto +1\n;" + "%[max] = 32736;\n" + : + : [max] "+r"(max) + :); + + ret = bpf_probe_read_str(&msg->arg[0], max, (char *)arg); + if (ret < 0) + return ret; + + *done = ret != max; + if (ret == 0) + return 0; + /* cut out the zero byte */ + ret -= 1; + + size = ret + offsetof(struct msg_data, arg); + /* Code movement from clang forces us to inline bounds checks here */ + asm volatile("%[size] &= 0x7fff;\n" + : + : [size] "+r"(size) + :); + msg->common.size = size; + perf_event_output_metric(ctx, MSG_OP_DATA, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size); + return ret; +} + +FUNC_LOCAL long +do_str(void *ctx, struct msg_data *msg, unsigned long arg, + size_t bytes __maybe_unused) +{ + size_t rd_bytes = 0; + bool done = false; + long ret; + int i; + +#define __CNT 2 +#pragma unroll + for (i = 0; i < __CNT; i++) { + ret = __do_str(ctx, msg, arg + rd_bytes, &done); + if (ret < 0) + return ret; + rd_bytes += ret; + if (done) + break; + } +#undef __CNT + + /* we have no idea what's string leftover */ + return rd_bytes; +} + +FUNC_INLINE size_t data_event( + void *ctx, struct data_event_desc *desc, unsigned long uptr, + size_t size, struct bpf_map_def *heap, + long (*do_data_event)(void *, struct msg_data *, unsigned long, size_t)) +{ + struct msg_data *msg; + int zero = 0, err; + + msg = bpf_map_lookup_elem(heap, &zero); + if (!msg) + return 0; + + msg->common.op = MSG_OP_DATA; + msg->common.flags 
= 0; + msg->common.pad[0] = 0; + msg->common.pad[1] = 0; + + msg->id.pid = bpf_get_current_pid_tgid(); + if (msg->id.pid == (__u64)-22) // -EINVAL -- current == NULL + msg->id.pid = PT_REGS_FP_CORE((struct pt_regs *)ctx); + + msg->id.time = bpf_ktime_get_ns(); + desc->id = msg->id; + + /* + * Notes: + * The @size argument is valid only for do_bytes, it's -1 * for do_str. + * The do_data_event callback returns size of posted data. + * Leftover for data_event_str is always 0, because we don't know + * how much more was there to copy. + */ + err = do_data_event(ctx, msg, uptr, size); + + if (err < 0) { + desc->error = err; + desc->pad = 0; + desc->leftover = 0; + desc->size = 0; + } else { + desc->error = 0; + desc->pad = 0; + desc->leftover = size == (size_t)-1 ? 0 : size - err; + desc->size = err; + } + return sizeof(*desc); +} + +/** + * data_event_bytes - sends data event for raw data + * + * @uptr: pointer to data + * @size: size of the data + * + * Sends data event with raw data specified by @uptr and @size and + * writes status values into @desc object. + * + * Returns size of struct @desc object or 0 in case of error. + */ +FUNC_LOCAL size_t +data_event_bytes(void *ctx, struct data_event_desc *desc, unsigned long uptr, + size_t size, struct bpf_map_def *heap) +{ + return data_event(ctx, desc, uptr, size, heap, do_bytes); +} + +/** + * data_event_str - sends data event for string + * + * @uptr: pointer to string + * + * Sends data event with string specified by @uptr and writes status + * values into @desc object. + * + * Returns size of struct @desc object or 0 in case of error. + */ +FUNC_LOCAL size_t +data_event_str(void *ctx, struct data_event_desc *desc, unsigned long uptr, + struct bpf_map_def *heap) +{ + return data_event(ctx, desc, uptr, -1, heap, do_str); +} + +#endif //SYSAK_DATA_EVENT_H diff --git a/src/security/data_msg.h b/src/security/data_msg.h new file mode 100644 index 0000000000000000000000000000000000000000..bc22848c50c86001164d9200353998a729405ebc --- /dev/null +++ b/src/security/data_msg.h @@ -0,0 +1,42 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_DATA_MSG_H +#define SYSAK_DATA_MSG_H + +#ifdef __cplusplus +#include +#endif + +#include "bpf_common.h" + +#define MSG_DATA_ARG_LEN 32736 + +struct data_event_id { + __u64 pid; + __u64 time; +} __attribute__((packed)); + +struct data_event_desc { + __s32 error; + __u32 pad; + __u32 leftover; + __u32 size; + struct data_event_id id; +} __attribute__((packed)); + +struct msg_data { + struct msg_common common; + struct data_event_id id; + /* To have a fast way to check buffer size we use 32736 (MSG_DATA_ARG_LEN) + * as arg size, which is: + * 0x8000 - offsetof(struct msg_kprobe_arg, arg) + * so we can make verifier happy with: + * 'size &= 0x7fff' check + */ + char arg[MSG_DATA_ARG_LEN]; +} __attribute__((packed)); + + +#endif //SYSAK_DATA_MSG_H diff --git a/src/security/environ_conf.h b/src/security/environ_conf.h new file mode 100644 index 0000000000000000000000000000000000000000..f0560ba43922d49fdcbd59ec6f205b564ce454d8 --- /dev/null +++ b/src/security/environ_conf.h @@ -0,0 +1,40 @@ +// +// Created by qianlu on 2024/6/16. 
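+// Runtime configuration shared with user space via the single-entry
+// tg_conf_map defined below; BPF programs look it up at key 0, e.g.
+// (illustrative, mirrors __event_get_cgroup_info()):
+//
+//	int zero = 0;
+//	struct tetragon_conf *conf = bpf_map_lookup_elem(&tg_conf_map, &zero);
+//	if (conf)
+//		cgrpfs_magic = conf->cgrp_fs_magic;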
+// + +#ifndef SYSAK_ENVIRON_CONF_H +#define SYSAK_ENVIRON_CONF_H + + +/* bpf runtime log levels that follow Golang logrus levels + * https://pkg.go.dev/github.com/sirupsen/logrus#Level + */ +enum { + LOG_ERROR_LEVEL = 2, + LOG_WARN_LEVEL = 3, + LOG_INFO_LEVEL = 4, + LOG_DEBUG_LEVEL = 5, + LOG_TRACE_LEVEL = 6, +}; + +/* Tetragon runtime configuration */ +struct tetragon_conf { + __u32 loglevel; /* Tetragon log level */ + __u32 pid; /* Tetragon pid for debugging purpose */ + __u32 nspid; /* Tetragon pid in namespace for debugging purpose */ + __u32 tg_cgrp_hierarchy; /* Tetragon tracked hierarchy ID */ + __u32 tg_cgrp_subsys_idx; /* Tetragon tracked cgroup subsystem state index at compile time */ + __u32 tg_cgrp_level; /* Tetragon cgroup level */ + __u64 tg_cgrpid; /* Tetragon current cgroup ID to avoid filtering blocking itself */ + __u64 cgrp_fs_magic; /* Cgroupv1 or Cgroupv2 */ +}; // All fields aligned so no 'packed' attribute. + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __s32); + __type(value, struct tetragon_conf); +} tg_conf_map SEC(".maps"); + + +#endif //SYSAK_ENVIRON_CONF_H diff --git a/src/security/filter.h b/src/security/filter.h new file mode 100644 index 0000000000000000000000000000000000000000..1705e1db24b5f5c9cc09ce932e72689908c3c87e --- /dev/null +++ b/src/security/filter.h @@ -0,0 +1,37 @@ +#pragma once +#include +#include +#include +#include +#include "../coolbpf.h" +#include "type.h" + + +// #define POLICY_FILTER_MAX_FILTERS 128 +// #define FILTER_SIZE 4096 + + +// struct filter_map_value { +// unsigned char buf[FILTER_SIZE]; +// }; + +/* Arrays of size 1 will be rewritten to direct loads in verifier */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, SECURE_FUNCS_MAX); + __type(key, int); + __type(value, struct selector_filters); +} filter_map SEC(".maps"); + +// struct { +// __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +// __uint(max_entries, POLICY_FILTER_MAX_FILTERS); +// __uint(key_size, sizeof(u32)); /* call name id */ +// __array( +// values, struct { +// __uint(type, BPF_MAP_TYPE_ARRAY); +// __uint(max_entries, 1); +// __type(key, __u64); +// __type(value, __u8); +// }); +// } filter_maps SEC(".maps"); \ No newline at end of file diff --git a/src/security/generic.h b/src/security/generic.h new file mode 100644 index 0000000000000000000000000000000000000000..9960c6b2c656ad1a593a00df0233f36e13163113 --- /dev/null +++ b/src/security/generic.h @@ -0,0 +1,67 @@ +// +// Created by qianlu on 2024/6/16. 
+// + +#ifndef SYSAK_GENERIC_H +#define SYSAK_GENERIC_H + + +#include "bpf_common.h" +#include "msg_type.h" +#include "process.h" + +/* The namespace and capability changes filters require later kernels */ +#ifdef __LARGE_BPF_PROG +#define __NS_CHANGES_FILTER +#define __CAP_CHANGES_FILTER +#endif + +#define FILTER_SIZE 4096 + +#define MAX_POSSIBLE_ARGS 5 +#define MAX_POSSIBLE_SELECTORS 31 +#define SELECTORS_ACTIVE 31 +#define MAX_CONFIGURED_SELECTORS MAX_POSSIBLE_SELECTORS + 1 + +struct msg_selector_data { + __u64 curr; + bool pass; + bool active[MAX_CONFIGURED_SELECTORS]; +#ifdef __NS_CHANGES_FILTER + __u64 match_ns; +#endif +#ifdef __CAP_CHANGES_FILTER + __u64 match_cap; +#endif + bool is32BitSyscall; +}; + +struct msg_generic_kprobe { + struct msg_common common; + struct msg_execve_key current; + struct msg_ns ns; + struct msg_capabilities caps; + __u64 func_id; + __u64 retprobe_id; + __u64 action; + __u32 action_arg_id; // only one URL or FQDN action can be fired per match + __u32 tid; // Thread ID that triggered the event + __u64 kernel_stack_id; // Kernel stack trace ID on u32 and potential error, see flag in msg_common.flags + __u64 user_stack_id; // User Stack trace ID + /* anything above is shared with the userspace so it should match structs MsgGenericKprobe and MsgGenericTracepoint in Go */ + char args[24000]; + unsigned long a0, a1, a2, a3, a4; + long argsoff[MAX_POSSIBLE_ARGS]; + struct msg_selector_data sel; + __u32 idx; // attach cookie index + __u32 tailcall_index_process; // recursion index for generic_process_event + __u32 tailcall_index_selector; // recursion index for filter_read_arg + int pass; +}; + +FUNC_INLINE size_t generic_kprobe_common_size(void) +{ + return offsetof(struct msg_generic_kprobe, args); +} + +#endif //SYSAK_GENERIC_H diff --git a/src/security/int_maps.h b/src/security/int_maps.h new file mode 100644 index 0000000000000000000000000000000000000000..40bfdf85dd9b3fa7e568249ad2ccf6334addd46b --- /dev/null +++ b/src/security/int_maps.h @@ -0,0 +1,38 @@ +#ifndef INT_MAPS_H__ +#define INT_MAPS_H__ + +#include +#include +#include +#include +#include "../coolbpf.h" +#include "type.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, INT_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, INT_MAPS_INNER_MAX_ENTRIES); + __type(key, __u32); + __type(value, __u8); + }); +} port_maps SEC(".maps"); + + +// struct { +// __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +// __uint(max_entries, INT_MAPS_OUTER_MAX_ENTRIES); +// __uint(key_size, sizeof(__u32)); +// __array( +// values, struct { +// __uint(type, BPF_MAP_TYPE_HASH); +// __uint(max_entries, 1); +// __type(key, __u32); +// __type(value, __u8); +// }); +// } dport_maps SEC(".maps"); + +#endif // INT_MAPS_H__ diff --git a/src/security/msg_type.h b/src/security/msg_type.h new file mode 100644 index 0000000000000000000000000000000000000000..c507ab7b9217d2ceb6fceebbce6bca5ca6bf7a92 --- /dev/null +++ b/src/security/msg_type.h @@ -0,0 +1,53 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_MSG_TYPE_H +#define SYSAK_MSG_TYPE_H + + +/* Msg Types */ +enum msg_ops { + MSG_OP_UNDEF = 0, + MSG_OP_EXECVE = 5, + MSG_OP_EXIT = 7, + MSG_OP_GENERIC_KPROBE = 13, + MSG_OP_GENERIC_TRACEPOINT = 14, + MSG_OP_GENERIC_UPROBE = 15, + + MSG_OP_TEST = 254, + + /* These ops went through a few iterations of experimentation + * and some of those experiments exist in the wild. 
So just + * bump deprecated space to some large value and start over. + * This way its easy to phase out the old ones. And any new + * ops are clear to see in database and logs. + */ + MSG_OP_DEPRECATE_SPACE = 1000, + + MSG_OP_CLONE = 23, + + MSG_OP_DATA = 24, + + MSG_OP_CGROUP = 25, + + MSG_OP_LOADER = 26, + + MSG_OP_THROTTLE = 27, + + MSG_OP_MAX, +}; + +enum msg_cgroup_ops { + MSG_OP_CGROUP_UNDEF = 0, + MSG_OP_CGROUP_MKDIR = + 1, /* cgroup_mkdir tracepoint, used for debugging */ + MSG_OP_CGROUP_RMDIR = + 2, /* cgroup_rmdir tracepoint, used for debugging */ + MSG_OP_CGROUP_RELEASE = + 3, /* cgroup_release tracepoint, used for debugging */ + MSG_OP_CGROUP_ATTACH_TASK = 10, /* cgroup_attach_task tracepoint */ +}; + + +#endif //SYSAK_MSG_TYPE_H diff --git a/src/security/process.h b/src/security/process.h new file mode 100644 index 0000000000000000000000000000000000000000..bc5b29a1f6a84eac8eeed64f78f31d54f5c35c1b --- /dev/null +++ b/src/security/process.h @@ -0,0 +1,306 @@ +// +// Created by qianlu on 2024/6/16. +// + +#ifndef SYSAK_PROCESS_H +#define SYSAK_PROCESS_H + +#include "../coolbpf.h" +#include +#include +#include "bpf_event.h" +#include "bpf_cred.h" +#include "bpf_common.h" +#include "compiler.h" +#include "api.h" + +#include "type.h" +#include "bpf_process_event_type.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct msg_execve_event); +} execve_msg_heap_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 32768); + __type(key, __u32); + __type(value, struct execve_map_value); +} execve_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, __s32); + __type(value, __s64); +} execve_map_stats SEC(".maps"); + +enum { + MAP_STATS_COUNT = 0, + MAP_STATS_ERROR = 1, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __s32); + __type(value, struct execve_map_value); +} execve_val SEC(".maps"); + +struct execve_heap { + union { + char pathname[PATHNAME_SIZE]; + char maxpath[4096]; + }; + struct execve_info info; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __s32); + __type(value, struct execve_heap); +} execve_heap SEC(".maps"); + +/* The tg_execve_joined_info_map allows to join and combine + * exec info that is gathered during different hooks + * through the execve call. The list of current hooks is: + * 1. kprobe/security_bprm_committing_creds + * For details check tg_kp_bprm_committing_creds bpf program. + * 2. tracepoint/sys_execve + * For details see event_execve bpf program. + * + * Important: the information stored here is complementary + * information only, the core logic should not depend on entries + * of this map to be present. + * + * tgid+tid is key as execve is a complex syscall where failures + * may happen at different levels and hooks, also the thread + * that triggered and succeeded at execve will be the only new + * and main thread. + */ +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 8192); + __type(key, __u64); + __type(value, struct execve_info); +} tg_execve_joined_info_map SEC(".maps"); + +/* The tg_execve_joined_info_map_stats will hold stats about + * entries and map update errors. 
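+ *
+ * Layout (editor's note, derived from the definition and accessors below):
+ * a per-cpu array of two __s64 counters indexed by MAP_STATS_COUNT (number
+ * of live entries) and MAP_STATS_ERROR (failed map updates), read e.g. as:
+ *
+ *	__s32 idx = MAP_STATS_ERROR;
+ *	__s64 *cntr = bpf_map_lookup_elem(&tg_execve_joined_info_map_stats, &idx);
+ *	if (cntr)
+ *		*cntr = *cntr + 1;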
+ */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, __s32); + __type(value, __s64); +} tg_execve_joined_info_map_stats SEC(".maps"); + +FUNC_INLINE int64_t validate_msg_execve_size(int64_t size) +{ + size_t max = sizeof(struct msg_execve_event); + + /* validate_msg_size() calls need to happen near caller using the + * size. Otherwise, depending on kernel version, the verifier may + * lose track of the size bounds. Place a compiler barrier here + * otherwise clang will likely place this check near other msg + * population calls which can be significant distance away resulting + * in losing bounds on older kernels where bounds are not tracked + * as rigorously. + */ + compiler_barrier(); + if (size > max) + size = max; + if (size < 1) + size = offsetof(struct msg_execve_event, buffer); + compiler_barrier(); + return size; +} + +// execve_map_error() will increment the map error counter +FUNC_INLINE void execve_map_error(void) +{ + int one = MAP_STATS_ERROR; + __s64 *cntr; + + cntr = bpf_map_lookup_elem(&execve_map_stats, &one); + if (cntr) + *cntr = *cntr + 1; +} + +FUNC_INLINE uint64_t get_start_time() +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + uint64_t gl_off = offsetof(struct task_struct, group_leader); + struct task_struct *group_leader_ptr; + bpf_probe_read(&group_leader_ptr, + sizeof(struct task_struct *), + (uint8_t *)task + gl_off); + + uint64_t start_time = 0; + + if (bpf_core_field_exists(group_leader_ptr->start_time)) + { + uint64_t st_off = offsetof(struct task_struct, start_time); + bpf_probe_read(&start_time, + sizeof(uint64_t), + (uint8_t *)group_leader_ptr + st_off); + } + else if (bpf_core_field_exists(group_leader_ptr->start_boottime)) + { + uint64_t st_off = offsetof(struct task_struct, start_boottime); + bpf_probe_read(&start_time, + sizeof(uint64_t), + (uint8_t *)group_leader_ptr + st_off); + } else { + start_time = bpf_ktime_get_ns(); + } + + return start_time; + // return nsec_to_clock_t(start_time); +} + +// execve_map_get will look up if pid exists and return it if it does. If it +// does not, it will create a new one and return it. 
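+// On the miss path a zero-initialized entry is inserted and the
+// MAP_STATS_COUNT counter in execve_map_stats is bumped. Typical use
+// (illustrative, mirrors event_wake_up_new_task() in security.bpf.c):
+//
+//	curr = execve_map_get(tgid);
+//	if (!curr)
+//		return 0;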
+FUNC_INLINE struct execve_map_value *execve_map_get(__u32 pid) +{ + struct execve_map_value *event; + + event = bpf_map_lookup_elem(&execve_map, &pid); + if (!event) { + struct execve_map_value *value; + int err, zero = MAP_STATS_COUNT; + __s64 *cntr; + + value = bpf_map_lookup_elem(&execve_val, &zero); + if (!value) + return 0; + + memset(value, 0, sizeof(struct execve_map_value)); + err = bpf_map_update_elem(&execve_map, &pid, value, 0); + if (!err) { + cntr = bpf_map_lookup_elem(&execve_map_stats, &zero); + if (cntr) + *cntr = *cntr + 1; + } else { + execve_map_error(); + } + event = bpf_map_lookup_elem(&execve_map, &pid); + } + return event; +} + +FUNC_INLINE struct execve_map_value *execve_map_get_noinit(__u32 pid) +{ + return bpf_map_lookup_elem(&execve_map, &pid); +} + +FUNC_INLINE void execve_map_delete(__u32 pid) +{ + int err = bpf_map_delete_elem(&execve_map, &pid); + int zero = MAP_STATS_COUNT; + __s64 *cntr; + + if (!err) { + cntr = bpf_map_lookup_elem(&execve_map_stats, &zero); + if (cntr) + *cntr = *cntr - 1; + } else { + execve_map_error(); + } +} + +// execve_joined_info_map_error() will increment the map error counter +FUNC_INLINE void execve_joined_info_map_error(void) +{ + int one = MAP_STATS_ERROR; + __s64 *cntr; + + cntr = bpf_map_lookup_elem(&tg_execve_joined_info_map_stats, &one); + if (cntr) + *cntr = *cntr + 1; +} + +FUNC_INLINE void execve_joined_info_map_set(__u64 tid, struct execve_info *info) +{ + int err, zero = MAP_STATS_COUNT; + __s64 *cntr; + + err = bpf_map_update_elem(&tg_execve_joined_info_map, &tid, info, BPF_ANY); + if (err < 0) { + /* -EBUSY or -ENOMEM with the help of the cntr error + * on the stats map this can be a good indication of + * long running workloads and if we have to make the + * map size bigger for such cases. + */ + execve_joined_info_map_error(); + return; + } + + cntr = bpf_map_lookup_elem(&tg_execve_joined_info_map_stats, &zero); + if (cntr) + *cntr = *cntr + 1; +} + +/* Clear up some space for next threads */ +FUNC_INLINE void execve_joined_info_map_clear(__u64 tid) +{ + int err, zero = MAP_STATS_COUNT; + __s64 *cntr; + + err = bpf_map_delete_elem(&tg_execve_joined_info_map, &tid); + if (!err) { + cntr = bpf_map_lookup_elem(&tg_execve_joined_info_map_stats, &zero); + if (cntr) + *cntr = *cntr - 1; +} +/* We don't care here about -ENOENT as there is no guarantee entries + * will be present anyway. + */ +} + +/* Returns an execve_info if found. A missing entry is perfectly fine as it + * could mean we are not interested into storing more information about this task. 
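+ *
+ * Typical use (illustrative, mirrors read_execve_shared_info() in
+ * security.bpf.c):
+ *
+ *	struct execve_info *info = execve_joined_info_map_get(pid);
+ *	if (info) {
+ *		p->secureexec = info->secureexec;
+ *		execve_joined_info_map_clear(pid);
+ *	}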
+ */ +FUNC_INLINE struct execve_info *execve_joined_info_map_get(__u64 tid) +{ + return bpf_map_lookup_elem(&tg_execve_joined_info_map, &tid); +} + +_Static_assert(sizeof(struct execve_map_value) % 8 == 0, + "struct execve_map_value should have size multiple of 8 bytes"); + +struct kernel_stats { + __u64 sent_failed[256]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, __u32); + __type(value, struct kernel_stats); + __uint(max_entries, 1); +} tg_stats_map SEC(".maps"); + +FUNC_INLINE void perf_event_output_update_error_metric(u8 msg_op, long err) { + struct kernel_stats *valp; + __u32 zero = 0; + + valp = bpf_map_lookup_elem(&tg_stats_map, &zero); + if (valp) { + __sync_fetch_and_add(&valp->sent_failed[msg_op], 1); + } +} + +FUNC_INLINE void perf_event_output_metric(void *ctx, u8 msg_op, void *map, + u64 flags, void *data, u64 size) { + long err; + + err = bpf_perf_event_output(ctx, map, flags, data, size); + if (err < 0) + perf_event_output_update_error_metric(msg_op, err); +} + +#endif //SYSAK_PROCESS_H diff --git a/src/security/security.bpf.c b/src/security/security.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..915f82e6b3410db6e328c80db97038dc30d7f349 --- /dev/null +++ b/src/security/security.bpf.c @@ -0,0 +1,1358 @@ +// +// Created by qianlu on 2024/6/12. +// + +#include +#include +#include +#include +#include "../coolbpf.h" + +#include "int_maps.h" +#include "filter.h" +#include "type.h" +#include "process.h" +#include "addr_lpm_maps.h" +#include "string_maps.h" +#include "bpf_exit.h" +#include "tailcall_stack.h" +//#include "bpf_execve.h" +#include "../ebpf_log.h" + +BPF_ARRAY(cidr_filter_list, struct cidr_entry, SYSAK_SECURE_MAX_CIDR_LIMIT); +BPF_ARRAY(port_filter_list, struct port_entry, SYSAK_SECURE_MAX_PORT_LIMIT); + +BPF_HASH(sock_secure_port_filter, u16, struct port_entry, 1024); +BPF_PERF_OUTPUT(sock_secure_output, 1024); +BPF_PERCPU_ARRAY(sock_secure_data_heap, struct tcp_data_t, 1); + +BPF_ARRAY(path_filter_list, struct path_entry, SYSAK_SECURE_MAX_PATH_LIMIT); +BPF_PERF_OUTPUT(file_secure_output, 1024); +BPF_PERCPU_ARRAY(file_secure_data_heap, struct file_data_t, 1); +BPF_PERCPU_ARRAY(tailcall_stack, struct secure_tailcall_stack, 1); + +struct +{ + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} file_path_filter_calls SEC(".maps"); + +struct +{ + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} secure_tailcall_map SEC(".maps"); +//////////////////////////// process //////////////////////////// +///////////////////////////////////////////////////////////////// + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} execve_calls SEC(".maps"); + +#include "data_event.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct msg_data); +} data_heap SEC(".maps"); + +FUNC_INLINE __u32 +read_args(void *ctx, struct msg_execve_event *event) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + struct msg_process *p = &event->process; + unsigned long start_stack, end_stack; + unsigned long free_size, args_size; + __u32 zero = 0, size = 0; + struct execve_heap *heap; + struct mm_struct *mm; + char *args; + long off; + int err; + + bpf_probe_read(&mm, 
sizeof(mm), _(&task->mm)); + if (!mm) + return 0; + + bpf_probe_read(&start_stack, sizeof(start_stack), + _(&mm->arg_start)); + bpf_probe_read(&end_stack, sizeof(start_stack), _(&mm->arg_end)); + + if (!start_stack || !end_stack) + return 0; + + /* skip first argument - binary path */ + heap = bpf_map_lookup_elem(&execve_heap, &zero); + if (!heap) + return 0; + + /* poor man's strlen */ + off = bpf_probe_read_str(&heap->maxpath, 4096, (char *)start_stack); + if (off < 0) + return 0; + + start_stack += off; + + size = p->size & 0x1ff /* 2*MAXARGLENGTH - 1*/; + args = (char *)p + size; + + if (args >= (char *)&event->process + BUFFER) + return 0; + + /* Read arguments either to rest of the space in the event, + * or use data event to send it separatelly. + */ + free_size = (char *)&event->process + BUFFER - args; + args_size = end_stack - start_stack; + + if (args_size < BUFFER && args_size < free_size) { + size = args_size & 0x3ff /* BUFFER - 1 */; + err = bpf_probe_read(args, size, (char *)start_stack); + if (err < 0) { + p->flags |= EVENT_ERROR_ARGS; + size = 0; + } + } else { + size = data_event_bytes(ctx, (struct data_event_desc *)args, + (unsigned long)start_stack, + args_size, + (struct bpf_map_def *)&data_heap); + if (size > 0) + p->flags |= EVENT_DATA_ARGS; + } + return size; +} + +FUNC_INLINE __u32 +read_path(void *ctx, struct msg_execve_event *event, void *filename) +{ + struct msg_process *p = &event->process; + __u32 size = 0; + __u32 flags = 0; + char *earg; + + earg = (void *)p + offsetof(struct msg_process, args); + + size = bpf_probe_read_str(earg, MAXARGLENGTH - 1, filename); + if (size < 0) { + flags |= EVENT_ERROR_FILENAME; + size = 0; + } else if (size == MAXARGLENGTH - 1) { + size = data_event_str(ctx, (struct data_event_desc *)earg, + (unsigned long)filename, + (struct bpf_map_def *)&data_heap); + if (size == 0) + flags |= EVENT_ERROR_FILENAME; + else + flags |= EVENT_DATA_FILENAME; + } + + p->flags |= flags; + return size; +} + +FUNC_INLINE __u32 +read_cwd(void *ctx, struct msg_process *p) +{ + if (p->flags & EVENT_ERROR_CWD) + return 0; + return getcwd(p, p->size, p->pid); +} + +FUNC_INLINE void +read_execve_shared_info(void *ctx, struct msg_process *p, __u64 pid) +{ + struct execve_info *info; + + info = execve_joined_info_map_get(pid); + if (!info) { + p->secureexec = 0; + p->i_ino = 0; + p->i_nlink = 0; + return; + } + + p->secureexec = info->secureexec; + p->i_ino = info->i_ino; + p->i_nlink = info->i_nlink; + execve_joined_info_map_clear(pid); +} + +/** + * read_exe() Reads the path from the backing executable file of the current + * process. + * + * The executable file of a process can change using the prctl() system call + * and PR_SET_MM_EXE_FILE. Thus, this function should only be used under the + * execve path since the executable file is locked and usually there is only + * one remaining thread at its exit path. + */ +#ifdef __LARGE_BPF_PROG +FUNC_INLINE __u32 +read_exe(struct task_struct *task, struct heap_exe *exe) +{ + struct file *file = BPF_CORE_READ(task, mm, exe_file); + struct path *path = __builtin_preserve_access_index(&file->f_path); + + // we need to walk the complete 4096 len dentry in order to have an accurate + // matching on the prefix operators, even if we only keep a subset of that + char *buffer; + + buffer = d_path_local(path, (int *)&exe->len, (int *)&exe->error); + if (!buffer) + return 0; + + // buffer used by d_path_local can contain up to MAX_BUF_LEN i.e. 
4096 we + // only keep the first 255 chars for our needs (we sacrifice one char to the + // verifier for the > 0 check) + if (exe->len > 255) + exe->len = 255; + asm volatile("%[len] &= 0xff;\n" + : [len] "+r"(exe->len)); + probe_read(exe->buf, exe->len, buffer); + + return exe->len; +} +#endif + +// int wake_up_process(struct task_struct *p) +SEC("kprobe/wake_up_new_task") +int BPF_KPROBE(event_wake_up_new_task, struct task_struct *task) +{ + struct execve_map_value *curr, *parent; + struct msg_clone_event msg; + u64 msg_size = sizeof(struct msg_clone_event); + struct msg_k8s kube; + u32 tgid = 0; + + if (!task) + return 0; + + tgid = BPF_CORE_READ(task, tgid); + + /* Do not try to create any msg or calling execve_map_get + * (that will add a new process in the execve_map) if we + * cannot find it's parent in the execve_map. + */ + parent = __event_find_parent(task); + if (!parent) + return 0; + + curr = execve_map_get(tgid); + if (!curr) + return 0; + + /* Generate an EVENT_COMMON_FLAG_CLONE event once per process, + * that is, thread group. + */ + if (curr->key.ktime != 0) + return 0; + + /* Setup the execve_map entry. */ + curr->flags = EVENT_COMMON_FLAG_CLONE; + curr->key.pid = tgid; + curr->key.ktime = bpf_ktime_get_ns(); + curr->nspid = get_task_pid_vnr(); + memcpy(&curr->bin, &parent->bin, sizeof(curr->bin)); + curr->pkey = parent->key; + + /* Store the thread leader capabilities so we can check later + * before the execve hook point if they changed or not. + * This needs to be converted later to credentials. + */ + get_current_subj_caps(&curr->caps, task); + + /* Store the thread leader namespaces so we can check later + * before the execve hook point if they changed or not. + */ + get_namespaces(&curr->ns, task); + + /* Set EVENT_IN_INIT_TREE flag on the process if its parent is in a + * container's init tree or if it has nspid=1. + */ + set_in_init_tree(curr, parent); + + /* Setup the msg_clone_event and sent to the user. */ + msg.common.op = MSG_OP_CLONE; + msg.common.size = msg_size; + msg.common.ktime = curr->key.ktime; + msg.parent = curr->pkey; + msg.tgid = curr->key.pid; + /* Per thread tracking rules TID == PID : + * Since we generate one event per thread group, then when this task + * wakes up it will be the only one in the thread group, and it is + * the leader. Ensure to pass TID to user space. 
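+	 * (Editor's note: for the thread-group leader task->pid == task->tgid,
+	 * so the tid sent below equals the tgid set above.)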
+ */ + msg.tid = BPF_CORE_READ(task, pid); + msg.ktime = curr->key.ktime; + msg.nspid = curr->nspid; + msg.flags = curr->flags; + + __event_get_cgroup_info(task, &kube); + + if (cgroup_rate(ctx, &kube, msg.ktime)) { + perf_event_output_metric(ctx, MSG_OP_CLONE, &tcpmon_map, BPF_F_CURRENT_CPU, &msg, msg_size); + } + + return 0; +} + +SEC("tracepoint/sched/sched_process_exec") +int event_execve(struct trace_event_raw_sched_process_exec *ctx) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + char *filename = (char *)ctx + (_(ctx->__data_loc_filename) & 0xFFFF); + struct msg_execve_event *event; + struct execve_map_value *parent; + struct msg_process *p; + __u32 zero = 0; + __u64 pid; + + event = bpf_map_lookup_elem(&execve_msg_heap_map, &zero); + if (!event) + return 0; + + pid = bpf_get_current_pid_tgid(); + parent = event_find_parent(); + if (parent) { + event->parent = parent->key; + } else { + event_minimal_parent(event, task); + } + + p = &event->process; + p->flags = EVENT_EXECVE; + /** + * Per thread tracking rules TID == PID : + * At exec all threads other than the calling one are destroyed, so + * current becomes the new thread leader since we hook late during + * execve. + */ + p->pid = pid >> 32; + p->tid = (__u32)pid; + p->nspid = get_task_pid_vnr(); + p->ktime = bpf_ktime_get_ns(); + p->size = offsetof(struct msg_process, args); + p->auid = get_auid(); + read_execve_shared_info(ctx, p, pid); + + p->size += read_path(ctx, event, filename); + p->size += read_args(ctx, event); + p->size += read_cwd(ctx, p); + + event->common.op = MSG_OP_EXECVE; + event->common.ktime = p->ktime; + event->common.size = offsetof(struct msg_execve_event, process) + p->size; + + get_current_subj_creds(&event->creds, task); + /** + * Instead of showing the task owner, we want to display the effective + * uid that is used to calculate the privileges of current task when + * acting upon other objects. This allows to be compatible with the 'ps' + * tool that reports snapshot of current processes. + */ + p->uid = event->creds.euid; + get_namespaces(&event->ns, task); + p->flags |= __event_get_cgroup_info(task, &event->kube); + + bpf_tail_call(ctx, &execve_calls, 0); + return 0; +} + +//__attribute__((section("tracepoint/0"), used)) +SEC("tracepoint/0") +int execve_rate(void *ctx) +{ + struct msg_execve_event *msg; + + __u32 zero = 0; + + msg = bpf_map_lookup_elem(&execve_msg_heap_map, &zero); + if (!msg) + return 0; + + if (cgroup_rate(ctx, &msg->kube, msg->common.ktime)) + bpf_tail_call(ctx, &execve_calls, 1); + return 0; +} + +///** +// * execve_send() sends the collected execve event data. +// * +// * This function is the last tail call of the execve event, its sole purpose +// * is to update the pid execve_map entry to reflect the new execve event that +// * has already been collected, then send it to the perf buffer. +// */ +////__attribute__((section("tracepoint/1"), used)) int +SEC("tracepoint/1") +int execve_send(void *ctx) +{ + struct msg_execve_event *event; + struct execve_map_value *curr; + struct msg_process *p; + __u32 zero = 0; + uint64_t size; + __u32 pid; +#if defined(__NS_CHANGES_FILTER) || defined(__CAP_CHANGES_FILTER) + bool init_curr = 0; +#endif + + event = bpf_map_lookup_elem(&execve_msg_heap_map, &zero); + if (!event) + return 0; + +#ifdef __LARGE_BPF_PROG + // Reading the absolute path of the process exe for matchBinaries. + // Historically we used the filename, a potentially relative path (maybe to + // a symlink) coming from the execve tracepoint. 
For kernels not supporting + // large BPF prog, we still use the filename. + read_exe((struct task_struct *)bpf_get_current_task(), &event->exe); +#endif + + p = &event->process; + + pid = (bpf_get_current_pid_tgid() >> 32); + + curr = execve_map_get_noinit(pid); + if (curr) { + event->cleanup_key = curr->key; +#if defined(__NS_CHANGES_FILTER) || defined(__CAP_CHANGES_FILTER) + /* if this exec event preceds a clone, initialize capabilities + * and namespaces as well. + */ + if (curr->flags == EVENT_COMMON_FLAG_CLONE) + init_curr = 1; +#endif + curr->key.pid = p->pid; + curr->key.ktime = p->ktime; + curr->nspid = p->nspid; + curr->pkey = event->parent; + if (curr->flags & EVENT_COMMON_FLAG_CLONE) { + event_set_clone(p); + } + curr->flags &= ~EVENT_COMMON_FLAG_CLONE; + /* Set EVENT_IN_INIT_TREE flag on the process if nspid=1. + */ + set_in_init_tree(curr, NULL); + if (curr->flags & EVENT_IN_INIT_TREE) { + event->process.flags |= EVENT_IN_INIT_TREE; + } +#ifdef __NS_CHANGES_FILTER + if (init_curr) + memcpy(&(curr->ns), &(event->ns), + sizeof(struct msg_ns)); +#endif +#ifdef __CAP_CHANGES_FILTER + if (init_curr) { + curr->caps.permitted = event->creds.caps.permitted; + curr->caps.effective = event->creds.caps.effective; + curr->caps.inheritable = event->creds.caps.inheritable; + } +#endif + // buffer can be written at clone stage with parent's info, if previous + // path is longer than current, we can have leftovers at the end. + memset(&curr->bin, 0, sizeof(curr->bin)); +#ifdef __LARGE_BPF_PROG + // read from proc exe stored at execve time + if (event->exe.len <= BINARY_PATH_MAX_LEN) { + curr->bin.path_length = bpf_probe_read(curr->bin.path, event->exe.len, event->exe.buf); + if (curr->bin.path_length == 0) + curr->bin.path_length = event->exe.len; + } +#else + // reuse p->args first string that contains the filename, this can't be + // above 256 in size (otherwise the complete will be send via data msg) + // which is okay because we need the 256 first bytes. + curr->bin.path_length = bpf_probe_read_str(curr->bin.path, BINARY_PATH_MAX_LEN, &p->args); + if (curr->bin.path_length > 1) { + // don't include the NULL byte in the length + curr->bin.path_length--; + } +#endif + } + + event->common.flags = 0; + size = validate_msg_execve_size( + sizeof(struct msg_common) + sizeof(struct msg_k8s) + + sizeof(struct msg_execve_key) + sizeof(__u64) + + sizeof(struct msg_cred) + sizeof(struct msg_ns) + + sizeof(struct msg_execve_key) + p->size); + perf_event_output_metric(ctx, MSG_OP_EXECVE, &tcpmon_map, BPF_F_CURRENT_CPU, event, size); + return 0; +} + +// +// +// +//// exit +// +////__attribute__((section("kprobe/acct_process"), used)) +SEC("kprobe/acct_process") +int event_exit_acct_process(struct pt_regs *ctx) +{ + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 pid = pid_tgid >> 32; + event_exit_send(ctx, pid); + return 0; +} + +/* + * Hooking on acct_process kernel function, which is called on the task's + * exit path once the task is the last one in the group. It's stable since + * v4.19, so it's safe to hook for us. + * + * It's called with on_exit argument != 0 when called from do_exit + * function with same conditions like for acct_process described above. 
+ */ +//__attribute__((section("kprobe/disassociate_ctty"), used)) int + +SEC("kprobe/disassociate_ctty") +int event_exit_disassociate_ctty(struct pt_regs *ctx) +{ + int on_exit = (int)PT_REGS_PARM1_CORE(ctx); + __u32 pid = bpf_get_current_pid_tgid() >> 32; + if (on_exit) + event_exit_send(ctx, pid); + return 0; +} + + +//////////////////////////// filters //////////////////////////// + +#define POLICY_FILTER_MAX_POLICIES 128 + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, POLICY_FILTER_MAX_POLICIES); + __uint(key_size, sizeof(u32)); /* policy id */ + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u64); /* cgroup id */ + __type(value, __u8); /* empty */ + }); +} policy_filter_maps SEC(".maps"); + +///////////////////////////////////////////////////////////////// + + +//////////////////////////// network //////////////////////////// +///////////////////////////////////////////////////////////////// + +static __always_inline u16 bpf_core_sock_sk_protocol_ak(struct sock *sk) +{ + return (u16)BPF_CORE_READ_BITFIELD_PROBED(sk, sk_protocol); +} + +static inline int cidr_match(__u32 addr, __u32 net, __u32 subnet) { + __u32 mask = subnet == 0 ? 0 : (0xFFFFFFFF << (32 - subnet)); + return (addr & mask) == (net & mask); +} + +// return value: +// 0 --- pass +// 1 --- reject +// direction: 0 for source addr, 1 for dest addr +int port_filter(__u16 port, int direction) { + int start = 0; + if (direction == 1) { + start = (SYSAK_SECURE_MAX_PORT_LIMIT >> 1); + } + + int key; + struct port_entry *entry; + // 0 for blacklist + // 1 for whitelist + int blacklist = 3; + +#pragma unroll + for (key = 0; key < SYSAK_SECURE_MAX_PORT_LIMIT; key++) { + int tmp = start + key; + entry = bpf_map_lookup_elem(&port_filter_list, &tmp); + if (!entry || entry->inited == 0) { + // need stop + break; + } + blacklist = entry->black; + BPF_DEBUG("[kprobe][port_filter] black:%u, port:%u, income_port:%u", entry->black, entry->port, port); + if (port == entry->port) { + if (blacklist == 1) { + // blacklist + BPF_DEBUG("[kprobe][port_filter] filtered by blacklist port, port:%u : disabled.", + port); + return 1; + } else if (blacklist == 0) { + // whitelist + return 0; + } + } + + return (entry->black == 0) ? 1 : 0; + } + + // blacklist + if (blacklist == 1) return 0; + if (blacklist == 0) { + // whitelist + BPF_DEBUG("[kprobe][port_filter] filtered by whitelist port, port:%u . 
disabled.", port); + return 1; + } + + // no filters + return 0; +} + +// return value: +// 0 --- pass +// 1 --- reject +// direction: 0 for source addr, 1 for dest addr +int addr_filter(__u32 addr, int direction) { + int start = 0; + if (direction == 1) { + start = (SYSAK_SECURE_MAX_CIDR_LIMIT >> 1); + } + + int key; + + // 0 for blacklist + // 1 for whitelist + int blacklist = 3; +#pragma unroll + for (key = 0; key < SYSAK_SECURE_MAX_CIDR_LIMIT; key++) { + int tmp = start + key; + struct cidr_entry *entry = bpf_map_lookup_elem(&cidr_filter_list, &tmp); + if (!entry || entry->inited == 0) break; + BPF_DEBUG("[kprobe][addr_filter] black:%u, net:%u, mask:%u", entry->black, entry->net, entry->mask); + blacklist = entry->black; + if (cidr_match(addr, entry->net, entry->mask)) { + if (blacklist == 1) { + // bingo black list + BPF_DEBUG("[kprobe][addr_filter] filtered by blacklist cidr, ip:%u net:%u mask:%u: disabled.", + addr, entry->net, entry->mask); + return 1; + } else if (blacklist == 0) { + // bingo white list + return 0; + } + } + } + + // blacklist + if (blacklist == 1) return 0; + if (blacklist == 0) { + // whitelist + BPF_DEBUG("[kprobe][addr_filter] filtered by white cidr, ip:%u disabled.", + addr); + return 1; + } + + // no filters + return 0; +} + +static __always_inline u32 get_netns(struct sock *sk) { + return BPF_CORE_READ(sk, __sk_common.skc_net.net, ns.inum); +} + +// int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +SEC("kprobe/tcp_sendmsg") +int BPF_KPROBE(kprobe_tcp_sendmsg, struct sock *sk, struct msghdr *msg, size_t size) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct execve_map_value *enter; + enter = execve_map_get_noinit(pid); + if (!enter || enter->key.ktime == 0) { + BPF_DEBUG("[kprobe][kprobe_tcp_sendmsg] pid:%u never enter. 
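To make the mask arithmetic in cidr_match() and addr_filter() above concrete, here is a standalone check, not part of the patch, using hypothetical addresses (both values assumed to be in the same byte order as the configured entries):

#include <stdint.h>
#include <stdio.h>

static int cidr_match(uint32_t addr, uint32_t net, uint32_t subnet)
{
	/* same logic as the BPF-side helper above */
	uint32_t mask = subnet == 0 ? 0 : (0xFFFFFFFFu << (32 - subnet));
	return (addr & mask) == (net & mask);
}

int main(void)
{
	uint32_t net  = 0x0A000000; /* 10.0.0.0 */
	uint32_t addr = 0x0A010203; /* 10.1.2.3 */
	printf("10.1.2.3 in 10.0.0.0/8  -> %d\n", cidr_match(addr, net, 8));  /* 1: matches */
	printf("10.1.2.3 in 10.0.0.0/16 -> %d\n", cidr_match(addr, net, 16)); /* 0: second octet differs */
	return 0;
}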
skip collect", pid); + return 0; + } + BPF_DEBUG("[kprobe][kprobe_tcp_sendmsg] pid:%u ktime:%llu already enter.", pid, enter->key.ktime); + + // define event + __u32 zero = 0; + struct tcp_data_t* data = NULL; + data = bpf_map_lookup_elem(&sock_secure_data_heap, &zero); + if (!data) return 0; + memset(data, 0, sizeof(data)); + + data->func = TRACEPOINT_FUNC_TCP_SENDMSG; + data->key = enter->key; + data->pkey = enter->pkey; + + struct inet_sock *inet = (struct inet_sock *)sk; + data->timestamp = bpf_ktime_get_ns(); + unsigned int daddr = BPF_CORE_READ(sk, __sk_common.skc_daddr); + data->daddr = bpf_htonl(daddr); + unsigned short dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + data->dport = bpf_htons(dport); + unsigned int saddr = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + data->saddr = bpf_htonl(saddr); + unsigned short sport = BPF_CORE_READ(inet, inet_sport); + data->sport = bpf_htons(sport); + data->state = BPF_CORE_READ(sk, __sk_common.skc_state); + data->family = BPF_CORE_READ(sk, __sk_common.skc_family); + data->net_ns = get_netns(sk); + data->protocol = bpf_core_sock_sk_protocol_ak(sk); + data->bytes = size; + + struct secure_tailcall_stack* stack = NULL; + stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) return 0; + memset(stack, 0, sizeof(stack)); + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_TCP_SENDMSG; + stack->tcp_data.func = TRACEPOINT_FUNC_TCP_SENDMSG; + stack->tcp_data.key = enter->key; + stack->tcp_data.pkey = enter->pkey; + stack->tcp_data.timestamp = bpf_ktime_get_ns(); + stack->tcp_data.daddr = daddr; + stack->tcp_data.dport = bpf_htons(dport); + stack->tcp_data.saddr = saddr; + stack->tcp_data.sport = bpf_htons(sport); + stack->tcp_data.state = BPF_CORE_READ(sk, __sk_common.skc_state); + stack->tcp_data.family = BPF_CORE_READ(sk, __sk_common.skc_family); + stack->tcp_data.net_ns = get_netns(sk); + stack->tcp_data.protocol = bpf_core_sock_sk_protocol_ak(sk); + stack->tcp_data.bytes = size; + BPF_DEBUG("[kprobe][kprobe_tcp_sendmsg][dump] saddr:%u, daddr:%u, family:%u", + stack->tcp_data.saddr, stack->tcp_data.daddr, data->family); + BPF_DEBUG("[kprobe][kprobe_tcp_sendmsg][dump] daddr:%u, sport:%u, state:%u", + stack->tcp_data.daddr, stack->tcp_data.sport, data->state); + + + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_FILTER_PROG); + // do filters + // int sf, df, sp, dp; + // sf = addr_filter(data->saddr, 0); + // df = addr_filter(data->daddr, 1); + // sp = port_filter(data->sport, 0); + // dp = port_filter(data->dport, 1); + // if (sf || df || sp || dp) { + // BPF_DEBUG("[kprobe][kprobe_tcp_sendmsg] skip submit because of filters."); + // return 0; + // } + + // bpf_perf_event_output(ctx, &sock_secure_output, BPF_F_CURRENT_CPU, data, sizeof(struct tcp_data_t)); + // BPF_DEBUG("[kprobe][kprobe_tcp_sendmsg] pid:%u ktime:%llu send to perfbuffer.", pid, enter->key.ktime); + return 0; +} + +// void tcp_close(struct sock *sk, long timeout); +SEC("kprobe/tcp_close") +int BPF_KPROBE(kprobe_tcp_close, struct sock *sk) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct execve_map_value *enter; + enter = execve_map_get_noinit(pid); + if (!enter || enter->key.ktime == 0) { + BPF_DEBUG("[kprobe][kprobe_tcp_close] pid:%u never enter. 
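The lookups of sock_secure_data_heap and tailcall_stack above follow a common BPF pattern: event structs far larger than the 512-byte BPF stack are built inside a one-entry per-CPU array used as scratch memory. A minimal sketch of that pattern, illustrative only, with a hypothetical struct and map name:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct big_event { char payload[2048]; }; /* hypothetical: too large for the BPF stack */

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct big_event);
} scratch_heap SEC(".maps");

static __always_inline struct big_event *get_scratch(void)
{
	__u32 zero = 0;
	/* always slot 0; the map provides one private buffer per CPU */
	return bpf_map_lookup_elem(&scratch_heap, &zero);
}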
skip collect", pid); + return 0; + } + BPF_DEBUG("[kprobe][kprobe_tcp_close] pid:%u ktime:%llu already enter.", pid, enter->key.ktime); + + __u32 zero = 0; + struct tcp_data_t* data = NULL; + data = bpf_map_lookup_elem(&sock_secure_data_heap, &zero); + if (!data) return 0; + memset(data, 0, sizeof(data)); + + data->func = TRACEPOINT_FUNC_TCP_CLOSE; + data->key = enter->key; + data->pkey = enter->pkey; + struct inet_sock *inet = (struct inet_sock *)sk; + data->timestamp = bpf_ktime_get_ns(); + unsigned int daddr = BPF_CORE_READ(sk, __sk_common.skc_daddr); + data->daddr = bpf_htonl(daddr); + unsigned short dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + data->dport = bpf_htons(dport); + unsigned int saddr = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + data->saddr = bpf_htonl(saddr); + unsigned short sport = BPF_CORE_READ(inet, inet_sport); + data->sport = bpf_htons(sport); + data->state = BPF_CORE_READ(sk, __sk_common.skc_state); + data->family = BPF_CORE_READ(sk, __sk_common.skc_family); + data->net_ns = get_netns(sk); + data->protocol = bpf_core_sock_sk_protocol_ak(sk); + + struct secure_tailcall_stack* stack = NULL; + stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) return 0; + memset(stack, 0, sizeof(stack)); + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_TCP_CLOSE; + stack->tcp_data.func = TRACEPOINT_FUNC_TCP_CLOSE; + stack->tcp_data.key = enter->key; + stack->tcp_data.pkey = enter->pkey; + stack->tcp_data.timestamp = bpf_ktime_get_ns(); + stack->tcp_data.daddr = daddr; + stack->tcp_data.dport = bpf_htons(dport); + stack->tcp_data.saddr = saddr; + stack->tcp_data.sport = bpf_htons(sport); + stack->tcp_data.state = BPF_CORE_READ(sk, __sk_common.skc_state); + stack->tcp_data.family = BPF_CORE_READ(sk, __sk_common.skc_family); + stack->tcp_data.net_ns = get_netns(sk); + stack->tcp_data.protocol = bpf_core_sock_sk_protocol_ak(sk); + BPF_DEBUG("[kprobe][kprobe_tcp_close][dump] saddr:%u, daddr:%u, family:%u", + stack->tcp_data.saddr, stack->tcp_data.daddr, data->family); + BPF_DEBUG("[kprobe][kprobe_tcp_close][dump] daddr:%u, sport:%u, state:%u", + stack->tcp_data.daddr, stack->tcp_data.sport, data->state); + + + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_FILTER_PROG); + + // do filters +// int sf, df, sp, dp; +// sf = addr_filter(data->saddr, 0); +// df = addr_filter(data->daddr, 1); +// sp = port_filter(data->sport, 0); +// dp = port_filter(data->dport, 1); +// if (sf || df || sp || dp) { +// BPF_DEBUG("[kprobe][kprobe_tcp_close] skip submit because of filters."); +// return 0; +// } + +// // BPF_DEBUG("Packet matched CIDR: %x/%x/%u/%u\n", entry->net, entry->mask, entry->enable, entry->src); +// bpf_perf_event_output(ctx, &sock_secure_output, BPF_F_CURRENT_CPU, data, sizeof(struct tcp_data_t)); +// BPF_DEBUG("[kprobe][kprobe_tcp_close] pid:%u ktime:%llu send to perfbuffer.", pid, enter->key.ktime); + return 0; +} + +// +SEC("kprobe/tcp_connect") +int BPF_KPROBE(kprobe_tcp_connect, struct sock *sk) { + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct execve_map_value *enter; + enter = execve_map_get_noinit(pid); + if (!enter || enter->key.ktime == 0) { + BPF_DEBUG("[kprobe][kprobe_tcp_connect] pid:%u never enter. 
skip collect", pid); + return 0; + } + BPF_DEBUG("[kprobe][kprobe_tcp_connect] pid:%u ktime:%llu already enter.", pid, enter->key.ktime); + + __u32 zero = 0; + struct tcp_data_t* data = NULL; + data = bpf_map_lookup_elem(&sock_secure_data_heap, &zero); + if (!data) return 0; + memset(data, 0, sizeof(data)); + + data->func = TRACEPOINT_FUNC_TCP_CONNECT; + data->key = enter->key; + data->pkey = enter->pkey; + // struct inet_sock *inet = (struct inet_sock *)sk; + // data->timestamp = bpf_ktime_get_ns(); + // data->daddr = BPF_CORE_READ(sk, __sk_common.skc_daddr); + // data->daddr = bpf_htonl(data->daddr); + // data->dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + // data->dport = bpf_htons(data->dport); + // data->saddr = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + // data->saddr = bpf_htonl(data->saddr); + // data->sport = BPF_CORE_READ(inet, inet_sport); + // data->sport = bpf_htons(data->sport); + // data->state = BPF_CORE_READ(sk, __sk_common.skc_state); + // data->family = BPF_CORE_READ(sk, __sk_common.skc_family); + // data->net_ns = get_netns(sk); + // data->protocol = bpf_core_sock_sk_protocol_ak(sk); + + struct inet_sock *inet = (struct inet_sock *)sk; + data->timestamp = bpf_ktime_get_ns(); + unsigned int daddr = BPF_CORE_READ(sk, __sk_common.skc_daddr); + data->daddr = bpf_htonl(daddr); + unsigned short dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + data->dport = bpf_htons(dport); + unsigned int saddr = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + data->saddr = bpf_htonl(saddr); + unsigned short sport = BPF_CORE_READ(inet, inet_sport); + data->sport = bpf_htons(sport); + data->state = BPF_CORE_READ(sk, __sk_common.skc_state); + data->family = BPF_CORE_READ(sk, __sk_common.skc_family); + data->net_ns = get_netns(sk); + data->protocol = bpf_core_sock_sk_protocol_ak(sk); + + + struct secure_tailcall_stack* stack = NULL; + stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) return 0; + memset(stack, 0, sizeof(stack)); + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_TCP_CONNECT; + stack->tcp_data.func = TRACEPOINT_FUNC_TCP_CONNECT; + stack->tcp_data.key = enter->key; + stack->tcp_data.pkey = enter->pkey; + stack->tcp_data.timestamp = bpf_ktime_get_ns(); + stack->tcp_data.daddr = daddr; + stack->tcp_data.dport = bpf_htons(dport); + stack->tcp_data.saddr = saddr; + stack->tcp_data.sport = bpf_htons(sport); + stack->tcp_data.state = BPF_CORE_READ(sk, __sk_common.skc_state); + stack->tcp_data.family = BPF_CORE_READ(sk, __sk_common.skc_family); + stack->tcp_data.net_ns = get_netns(sk); + stack->tcp_data.protocol = bpf_core_sock_sk_protocol_ak(sk); + BPF_DEBUG("[kprobe][kprobe_tcp_connect][dump] saddr:%u, daddr:%u, family:%u", + stack->tcp_data.saddr, stack->tcp_data.daddr, data->family); + BPF_DEBUG("[kprobe][kprobe_tcp_connect][dump] daddr:%u, sport:%u, state:%u", + stack->tcp_data.daddr, stack->tcp_data.sport, data->state); + + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_FILTER_PROG); + + // do filters +// int sf, df, sp, dp; +// sf = addr_filter(data->saddr, 0); +// df = addr_filter(data->daddr, 1); +// sp = port_filter(data->sport, 0); +// dp = port_filter(data->dport, 1); +// if (sf || df || sp || dp) { +// BPF_DEBUG("[kprobe][kprobe_tcp_connect] skip submit because of filters."); +// return 0; +// } + +// // BPF_DEBUG("Packet matched CIDR: %x/%x/%u/%u\n", entry->net, entry->mask, entry->enable, entry->src); +// bpf_perf_event_output(ctx, &sock_secure_output, BPF_F_CURRENT_CPU, data, sizeof(struct tcp_data_t)); +// BPF_DEBUG("[kprobe][kprobe_tcp_connect] 
pid:%u ktime:%llu send to perfbuffer.", pid, enter->key.ktime); + return 0; +} + + + +////// file ////// +// char _license[] SEC("license") = "GPL"; +// Function to calculate the length of the string +static inline __attribute__((always_inline)) u32 str_len(const char *str) +{ + u32 len = 0; +#pragma unroll + for (int i = 0; i < SYSAK_SECURE_MAX_PATH_LENGTH_LIMIT; i++) + { + if (str[i] == '\0') + break; + len++; + } + return len; +} + +static inline __attribute__((always_inline)) long copy_path(char *args, const struct path *arg) +{ + int *s = (int *)args; + int size = 0, flags = 0; + char *buffer; + void *curr = &args[4]; + umode_t i_mode; + buffer = d_path_local(arg, &size, &flags); + if (!buffer) + return 0; + // tips: path size between 0~255 + if (size > 255) size = 255; + asm volatile("%[size] &= 0xff;\n" ::[size] "+r"(size) + :); + bpf_probe_read(curr, size, buffer); + *s = size; + size += 4; + BPF_CORE_READ_INTO(&i_mode, arg, dentry, d_inode, i_mode); + /* + * the format of the path is: + * ----------------------------------------- + * | 4 bytes | N bytes | 4 bytes | 2 bytes | + * | pathlen | path | flags | mode | + * ----------------------------------------- + * Next we set up the flags. + */ + asm volatile goto( + "r1 = *(u64 *)%[pid];\n" + "r7 = *(u32 *)%[offset];\n" + "if r7 s< 0 goto %l[a];\n" + "if r7 s> 1188 goto %l[a];\n" + "r1 += r7;\n" + "r2 = *(u32 *)%[flags];\n" + "*(u32 *)(r1 + 0) = r2;\n" + "r2 = *(u16 *)%[mode];\n" + "*(u16 *)(r1 + 4) = r2;\n" + : + : [pid] "m"(args), [flags] "m"(flags), [offset] "+m"(size), [mode] "m"(i_mode) + : "r0", "r1", "r2", "r7", "memory" + : a); +a: + size += sizeof(u32) + sizeof(u16); // for the flags + i_mode + return size; +} + + +void write_ipv6_addr32(u32 *dest, u32 *src) +{ + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = src[3]; +} + +SEC("kprobe/tailcall_prog") +int filter_prog(struct pt_regs *ctx) { + BPF_DEBUG("[secure][tailcall] enter filter_prog"); + __u32 zero = 0; + struct secure_tailcall_stack *stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) + return 0; + + int call_name_idx = stack->func; + struct selector_filters* filters = NULL; + filters = bpf_map_lookup_elem(&filter_map, &call_name_idx); + + if (filters == NULL) { + // no filter was set ... + // should send data directly. + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_SEND); + return 0; + } + + // get data + int i = 0; + int pass = 1; + int filter_num = filters->filter_count; + #pragma unroll + for (; i < MAX_FILTER_FOR_PER_CALLNAME && i < filter_num; i ++) { + int idx = i; + struct selector_filter filter = filters->filters[idx]; + // if (filter.filter_type != FILTER_TYPE_UNKNOWN) { + // BPF_DEBUG("get file prefix filter, callname idx:%u type:%u, map index:%u", call_name_idx, filter.filter_type, filter.map_idx[0]); + // // BPF_DEBUG("get file prefix filter, vallen:%u, plus 8:%u", filter.vallen, filter.vallen << 3); + // } + struct addr4_lpm_trie arg4; + // struct addr6_lpm_trie arg6; + switch(filter.filter_type) { + case FILTER_TYPE_SADDR: { + uint32_t saddr = stack->tcp_data.saddr; + struct bpf_map* inner_map4 = NULL; + // struct bpf_map* inner_map6 = NULL; + if (filter.map_idx[0] != -1) { + inner_map4 = bpf_map_lookup_elem(&addr4lpm_maps, &filter.map_idx[0]); + } + // if (filter.map_idx[1] != -1) { + // inner_map6 = bpf_map_lookup_elem(&addr6lpm_maps, &filter.map_idx[1]); + // } + if (inner_map4 == NULL) { + BPF_DEBUG("there is something wrong with the lpm maps... callname idx:%u cannot find inner map for saddr, continue ... 
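The tcp_* kprobes above stash their data in tailcall_stack and then bpf_tail_call() into TAILCALL_FILTER_PROG, which in turn tail-calls TAILCALL_SEND. For that chain to run, user space must place the two programs into the secure_tailcall_map prog array. A hedged sketch of that wiring, not part of this patch; the program and map names come from this file, and the indices are assumed to follow enum tailcall_func in type.h:

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int setup_tailcalls(struct bpf_object *obj)
{
	struct bpf_program *filter, *send;
	int map_fd, filter_fd, send_fd;
	__u32 idx;

	map_fd = bpf_object__find_map_fd_by_name(obj, "secure_tailcall_map");
	filter = bpf_object__find_program_by_name(obj, "filter_prog");
	send = bpf_object__find_program_by_name(obj, "secure_data_send");
	if (map_fd < 0 || !filter || !send)
		return -1;

	filter_fd = bpf_program__fd(filter);
	send_fd = bpf_program__fd(send);

	idx = 0; /* TAILCALL_FILTER_PROG */
	if (bpf_map_update_elem(map_fd, &idx, &filter_fd, BPF_ANY))
		return -1;
	idx = 1; /* TAILCALL_SEND */
	return bpf_map_update_elem(map_fd, &idx, &send_fd, BPF_ANY);
}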
", call_name_idx); + continue; + } + arg4.addr = saddr; + arg4.prefix = 32; + // arg6.prefix = 128; + // write the address in as 4 u32s due to alignment + // write_ipv6_addr32(arg6.addr, (__u32 *)stack->tcp_data.saddr); + __u8 *ppass4 = NULL, *ppass6 = NULL; + if (inner_map4 != NULL) ppass4 = bpf_map_lookup_elem(inner_map4, &arg4); + + // ppass6 = bpf_map_lookup_elem(inner_map6, &arg6); + if (filter.op_type == OP_TYPE_IN) { + // not in white list + if (ppass4 == NULL) { + BPF_DEBUG("callname idx:%u arg4 saddr:%u, prefix:%u not in whitelist", call_name_idx, arg4.addr, arg4.prefix); + return 0; + } + } else if (filter.op_type == OP_TYPE_NOT_IN) { + // or in black list + if (ppass4 != NULL) { + BPF_DEBUG("callname idx:%u arg4 saddr:%u, prefix:%u in blacklist", call_name_idx, arg4.addr, arg4.prefix); + return 0; + } + } + break; + } + case FILTER_TYPE_DADDR: { + uint32_t daddr = stack->tcp_data.daddr; + arg4.addr = daddr; + arg4.prefix = 32; + struct bpf_map* inner_map = bpf_map_lookup_elem(&addr4lpm_maps, &filter.map_idx[0]); + if (inner_map == NULL) { + BPF_DEBUG("callname idx:%u cannot find inner map for daddr, continue ... ", call_name_idx); + continue; + } + __u8* ppass = NULL; + ppass = bpf_map_lookup_elem(inner_map, &arg4); + if (filter.op_type == OP_TYPE_IN) { + // not in white list + if (ppass == NULL) { + BPF_DEBUG("callname idx:%u arg4 daddr:%u, prefix:%u not in whitelist", call_name_idx, arg4.addr, arg4.prefix); + return 0; + } + } else if (filter.op_type == OP_TYPE_NOT_IN) { + // or in black list + BPF_DEBUG("callname idx:%u arg4 daddr:%u, prefix:%u in blacklist", call_name_idx, arg4.addr, arg4.prefix); + if (ppass != NULL) return 0; + } + + break; + } + case FILTER_TYPE_SPORT: { + uint32_t sport = stack->tcp_data.sport; + struct bpf_map* inner_map = bpf_map_lookup_elem(&port_maps, &filter.map_idx[0]); + if (inner_map == NULL) { + BPF_DEBUG("callname idx:%u cannot find inner map for sport, continue ... ", call_name_idx); + continue; + } + __u8* ppass = NULL; + ppass = bpf_map_lookup_elem(inner_map, &sport); + if (filter.op_type == OP_TYPE_IN) { + // not in white list + if (ppass == NULL) { + BPF_DEBUG("callname idx:%u arg4 sport:%u not in whitelist", call_name_idx, sport); + return 0; + } + } else if (filter.op_type == OP_TYPE_NOT_IN) { + // or in black list + if (ppass != NULL) { + BPF_DEBUG("callname idx:%u arg4 sport:%u in blacklist", call_name_idx, sport); + return 0; + } + } + } + case FILTER_TYPE_DPORT: { + uint32_t dport = stack->tcp_data.dport; + struct bpf_map* inner_map = bpf_map_lookup_elem(&port_maps, &filter.map_idx[0]); + if (inner_map == NULL) { + BPF_DEBUG("callname idx:%u cannot find inner map for dport, continue ... 
", call_name_idx); + continue; + } + __u8* ppass = NULL; + ppass = bpf_map_lookup_elem(inner_map, &dport); + if (filter.op_type == OP_TYPE_IN) { + // not in white list + if (ppass == NULL) { + BPF_DEBUG("callname idx:%u arg4 dport:%u not in whitelist", call_name_idx, dport); + return 0; + } + } else if (filter.op_type == OP_TYPE_NOT_IN) { + // or in black list + if (ppass != NULL) { + BPF_DEBUG("callname idx:%u arg4 dport:%u in blacklist", call_name_idx, dport); + return 0; + } + } + break; + } + case FILTER_TYPE_FILE_PREFIX: { + struct string_prefix_lpm_trie *prefix = NULL; + int zero = 0; + prefix = bpf_map_lookup_elem(&string_prefix_maps_heap, &zero); + if (prefix == NULL) { + BPF_DEBUG("[kprobe][tailcall] callname idx:%u cannot lookup string_prefix_maps_heap", call_name_idx); + break; + } + __u32 path_size = 0; + bpf_probe_read(&path_size, 4, stack->file_data.path); + prefix->prefixlen = path_size * 8; + bpf_probe_read(prefix->data, path_size & (STRING_PREFIX_MAX_LENGTH - 1), stack->file_data.path + 4); + int path_len = *(int *)stack->file_data.path; + BPF_DEBUG("[kprobe][tailcall] callname idx:%u begin to query inner map. stack path length:%d", call_name_idx, path_len); + BPF_DEBUG("[kprobe][tailcall] callname idx:%u begin to query inner map. stack path+4:%s", call_name_idx, &stack->file_data.path[4]); + BPF_DEBUG("[kprobe][tailcall] callname idx:%u begin to query inner map. prefix path:%s, path size:%u", call_name_idx, prefix->data, path_size); + + struct bpf_map* inner_map = bpf_map_lookup_elem(&string_prefix_maps, &filter.map_idx[0]); + __u8* ppass = NULL; + if (inner_map != NULL) { + ppass = bpf_map_lookup_elem(inner_map, prefix); + if (ppass == NULL || *ppass == 0) pass &= 0; + else pass &= 1; + } else { + // no filters were set ... + BPF_DEBUG("[kprobe][tailcall] callname idx:%u cannot find inner map, no filter set, pass", call_name_idx); + } + break; + } + default: + break; + } + } + + if (pass) { + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_SEND); + } else { + BPF_DEBUG("[filter_prog] skip submit due to the filter"); + } + + return 0; +} + +static __always_inline size_t file_data_common_size(void) { + return offsetof(struct file_data_t, path); +} + +SEC("kprobe/secure_data_send") +int secure_data_send(struct pt_regs *ctx) +{ + BPF_DEBUG("[secure][tailcall] enter secure_data_send"); + // the max tail call, just flush event + __u32 zero = 0; + struct secure_tailcall_stack *data = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!data) + return 0; + + switch (data->func) + { + case SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION: + case SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_MMAP_FILE: + case SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_PATH_TRUNCATE: + case SECURE_FUNC_TRACEPOINT_FUNC_SYS_WRITE: + case SECURE_FUNC_TRACEPOINT_FUNC_SYS_READ:{ + __u32 total = file_data_common_size() + data->file_data.size; + if (total > sizeof(struct file_data_t)) { + total = sizeof(struct file_data_t); + } + bpf_perf_event_output(ctx, &file_secure_output, BPF_F_CURRENT_CPU, &data->file_data, total); + BPF_DEBUG("[kprobe][secure_data_send][file] pid:%u, ktime:%u, func:%d send to perfbuffer.\n", data->file_data.key.pid, data->file_data.key.ktime, data->func); + break; + } + case SECURE_FUNC_TRACEPOINT_FUNC_TCP_CLOSE: + case SECURE_FUNC_TRACEPOINT_FUNC_TCP_CONNECT: + case SECURE_FUNC_TRACEPOINT_FUNC_TCP_SENDMSG: + bpf_perf_event_output(ctx, &sock_secure_output, BPF_F_CURRENT_CPU, &data->tcp_data, sizeof(struct tcp_data_t)); + BPF_DEBUG("[kprobe][secure_data_send][socket] pid:%u, ktime:%u, func:%d 
send to perfbuffer.\n", data->file_data.key.pid, data->file_data.key.ktime, data->func); + default: + break; + } + // bpf_perf_event_output(ctx, &file_secure_output, BPF_F_CURRENT_CPU, data, sizeof(struct secure_tailcall_stack)); + // BPF_DEBUG("[kprobe][kprobe_security_file_permission] pid:%u, ktime:%u send to perfbuffer.\n", data->key.pid, data->key.ktime); + return 0; +} + +SEC("kprobe/security_file_permission") +int kprobe_security_file_permission(struct pt_regs *ctx) +{ + BPF_DEBUG("[kprobe][kprobe_security_file_permission] enter security_file_permission."); + __u32 zero = 0; + struct secure_tailcall_stack* stack = NULL; + stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) return 0; + memset(stack, 0, sizeof(stack)); + struct file *file = (struct file *)PT_REGS_PARM1(ctx); + const struct path *path_arg = 0; + path_arg = _(&file->f_path); + long ret = copy_path(stack->file_data.path, path_arg); + stack->file_data.size = ret; + int path_len = *(int *)stack->file_data.path; + const u32 flag_prefix = 4 + path_len; + int flag = -1; + if (flag_prefix < 2000 && flag_prefix >= 0) bpf_probe_read(&flag, 4, stack->file_data.path + flag_prefix); + const u32 mode_prefix = 8 + path_len; + short mode = -1; + if (mode_prefix < 2000 && mode_prefix >= 0) bpf_probe_read(&mode, 2, stack->file_data.path + mode_prefix); + BPF_DEBUG("[kprobe][tailcall][permission] before ~ stack path length:%d, ret:%lld, flag:%d", path_len, ret, flag); + BPF_DEBUG("[kprobe][tailcall][permission] before ~ stack path+4:%s, mode:%d", &stack->file_data.path[4], mode); + + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct execve_map_value *enter; + enter = execve_map_get_noinit(pid); + if (!enter || enter->key.ktime == 0) + { + BPF_DEBUG("[kprobe][tailcall][permission] no init!!! return! stack path:%s, pid:%u", stack->file_data.path, pid); + BPF_DEBUG("[kprobe][tailcall][permission] no init!!! return! 
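For reference, the flag and mode reads at 4 + path_len and 8 + path_len above decode the buffer that copy_path() fills: a 4-byte path length, the path bytes, a 4-byte flags word and a 2-byte mode. A small user-space sketch of parsing such a buffer, not part of this patch:

#include <stdint.h>
#include <string.h>

struct path_fields {
	uint32_t len;      /* number of path bytes */
	const char *path;  /* not NUL-terminated; len bytes */
	uint32_t flags;
	uint16_t mode;
};

/* hypothetical helper: buf points at file_data.path as produced by copy_path() */
static void parse_copied_path(const char *buf, struct path_fields *out)
{
	memcpy(&out->len, buf, 4);
	out->path = buf + 4;
	memcpy(&out->flags, buf + 4 + out->len, 4);
	memcpy(&out->mode, buf + 8 + out->len, 2);
}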
stack path+4:%s, pid:%u", &stack->file_data.path[4], pid); + return 0; + } + BPF_DEBUG("[kprobe][kprobe_security_file_permission] pid:%u ktime:%llu already enter.", pid, enter->key.ktime); + // __u32 zero = 0; + // struct secure_tailcall_stack* stack = NULL; + // stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + // if (!stack) return 0; + // memset(stack, 0, sizeof(stack)); + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION; + stack->file_data.func = TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION; + stack->file_data.key = enter->key; + stack->file_data.pkey = enter->pkey; + stack->file_data.timestamp = bpf_ktime_get_ns(); + // struct file *file = (struct file *)PT_REGS_PARM1(ctx); + // const struct path *path_arg = 0; + // path_arg = _(&file->f_path); + // copy_path(stack->file_data.path, path_arg); + // obtain operation type mask information + int mask = (int) PT_REGS_PARM2(ctx); + switch (mask) { + case MAY_READ: + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION; + stack->file_data.func = TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION_READ; + break; + case MAY_WRITE: + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION; + stack->file_data.func = TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION_WRITE; + break; + default: + BPF_DEBUG("[kprobe][kprobe_security_file_permission] unknown operation"); + break; + } + + BPF_DEBUG("[kprobe][security_file_permission] after association: pid:%u ktime:%llu path:%s already enter.", pid, enter->key.ktime, &stack->file_data.path[4]); + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_FILTER_PROG); + return 0; +} + +SEC("kprobe/security_mmap_file") +int kprobe_security_mmap_file(struct pt_regs *ctx) +{ + BPF_DEBUG("[kprobe][security_mmap_file] enter security_mmap_file."); + __u32 zero = 0; + struct secure_tailcall_stack* stack = NULL; + stack = bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) return 0; + memset(stack, 0, sizeof(stack)); + struct file *file = (struct file *)PT_REGS_PARM1(ctx); + const struct path *path_arg = 0; + path_arg = _(&file->f_path); + long ret = copy_path(stack->file_data.path, path_arg); + stack->file_data.size = ret; + int path_len = *(int *)stack->file_data.path; + BPF_DEBUG("[kprobe][tailcall][mmap] before ~ stack path length:%s, ret:%lld", path_len, ret); + BPF_DEBUG("[kprobe][tailcall][mmap] before ~ stack path+4:%s", &stack->file_data.path[4]); + + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct execve_map_value *enter; + enter = execve_map_get_noinit(pid); + if (!enter || enter->key.ktime == 0) + { + return 0; + } + BPF_DEBUG("[kprobe][security_mmap_file] pid:%u ktime:%llu already enter.", pid, enter->key.ktime); + + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_MMAP_FILE; + stack->file_data.func = TRACEPOINT_FUNC_SECURITY_MMAP_FILE; + stack->file_data.key = enter->key; + stack->file_data.pkey = enter->pkey; + stack->file_data.timestamp = bpf_ktime_get_ns(); + + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_FILTER_PROG); + return 0; +} + +SEC("kprobe/security_path_truncate") +int kprobe_security_path_truncate(struct pt_regs *ctx) +{ + BPF_DEBUG("[kprobe][security_path_truncate] enter security_path_truncate."); + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct execve_map_value *enter; + enter = execve_map_get_noinit(pid); + if (!enter || enter->key.ktime == 0) + { + return 0; + } + BPF_DEBUG("[kprobe][security_path_truncate] pid:%u ktime:%llu already enter.", pid, enter->key.ktime); + __u32 zero = 0; + struct secure_tailcall_stack* stack = NULL; + stack 
= bpf_map_lookup_elem(&tailcall_stack, &zero); + if (!stack) return 0; + memset(stack, 0, sizeof(stack)); + stack->func = SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_PATH_TRUNCATE; + stack->file_data.func = TRACEPOINT_FUNC_SECURITY_PATH_TRUNCATE; + stack->file_data.key = enter->key; + stack->file_data.pkey = enter->pkey; + stack->file_data.timestamp = bpf_ktime_get_ns(); + struct path *path = (struct path *)PT_REGS_PARM1(ctx); + const struct path *path_arg = 0; + path_arg = _(path); + long ret = copy_path(stack->file_data.path, path_arg); + stack->file_data.size = ret; + bpf_tail_call(ctx, &secure_tailcall_map, TAILCALL_FILTER_PROG); + return 0; +} + +// char _license[] SEC("license") = "GPL"; diff --git a/src/security/string_maps.h b/src/security/string_maps.h new file mode 100644 index 0000000000000000000000000000000000000000..baeedbaa72f2d7acbe30638ffeffa8ea768501e3 --- /dev/null +++ b/src/security/string_maps.h @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright Authors of Cilium */ + +#ifndef STRING_MAPS_H__ +#define STRING_MAPS_H__ + +#include +#include +#include +#include +#include "../coolbpf.h" +#include "type.h" + +/* + * To facilitate an arbitrary number of strings that can be matched on, string matching + * uses a hash look up. The problem with this is that the key to a hash has to be a fixed + * size, so if the maximum string length is 128 bytes, then all stored strings will be + * 128 bytes long (padded with 0s) and the string to be looked up also has to be padded + * with 0s to 128 bytes. This means that a short string will be hashed as if it is 128 + * bytes long. + * + * The BPF hash maps use jhash for key hashing. See include/linux/jhash.h. This requires + * approximately 1 CPU cycle per byte, so in the example above, hashing every string, + * regardless of length, will take ~128 cycles, which is clearly inefficient. See + * https://fosdem.org/2023/schedule/event/bpf_hashing/ for details. + * + * jhash hashes in 12 byte blocks (3 x u32). For all lengths >12, a number of 12 byte + * blocks are hashed, and the remainder is hashed using a combination of single byte + * loads/shifts, followed by a final mix. It appears that the most efficient use of + * jhash is with lengths equal to 12k + 1, minimising the number of single byte loads/ + * shifts. + * + * In order to reduce the amount of hashing of padded 0s, we opt to store string matches + * in multiple hashes, with increasing key sizes, where the key size is one more than a + * multiple of 12. Each string to be stored is placed in the hash that has the smallest + * key size that can accommodate it (and is padded to the key size). Strings to be looked + * up are equally padded to the smallest key size that can accommodate them, and then + * looked up in the related map. + * + * The chosen key sizes are 25, 49, 73, 97, 121, 145, 258, 514, 1026, 2050, 4098 (11 maps). + * The first 6 are sized for common uses and to minimise the hashing of empty bytes. The + * following 5 maps notionally double in size, with lengths equal to 2^k + 2. On kernels + * <5.11, the last four maps are replaced with a single map with key size 512. This is due + * to key size limitations on kernels <5.11. + * + * In order to distinguish between character buffers that end in 0s and similar buffers + * that are padded with 0s, each string will be prefixed by its length stored in a + * single byte (for first 6 maps) or as a little endian u16 (latter maps). 
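A sketch of the bucketing described above, not part of this patch: a string is stored in, and looked up from, the smallest map whose key can hold its length prefix plus its bytes, padded with zeros to the key size. The sizes below mirror the STRING_MAPS_SIZE_* macros defined in type.h:

#include <stddef.h>

/* Pick the string map for a string of byte length len; returns the key size
 * and sets *map_index (0..10), or returns 0 if the string is too long. */
static size_t string_map_key_size(size_t len, int *map_index)
{
	/* maps 0..5: 24*k + 1 byte keys, 1-byte length prefix */
	static const size_t small[] = { 25, 49, 73, 97, 121, 145 };
	/* maps 6..10: 2^k + 2 byte keys, little-endian u16 length prefix */
	static const size_t large[] = { 258, 514, 1026, 2050, 4098 };
	size_t i;

	for (i = 0; i < 6; i++)
		if (len + 1 <= small[i]) { *map_index = (int)i; return small[i]; }
	for (i = 0; i < 5; i++)
		if (len + 2 <= large[i]) { *map_index = (int)(6 + i); return large[i]; }
	*map_index = -1;
	return 0;
}

For example, a 30-byte string lands in map 1 (49-byte key), so only about 49 bytes are hashed instead of a fixed multi-kilobyte key.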
+ */ + + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_0]); + __type(value, __u8); + }); +} string_maps_0 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_1]); + __type(value, __u8); + }); +} string_maps_1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_2]); + __type(value, __u8); + }); +} string_maps_2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_3]); + __type(value, __u8); + }); +} string_maps_3 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_4]); + __type(value, __u8); + }); +} string_maps_4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_5]); + __type(value, __u8); + }); +} string_maps_5 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_6]); + __type(value, __u8); + }); +} string_maps_6 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_7]); + __type(value, __u8); + }); +} string_maps_7 SEC(".maps"); + +#ifdef __LARGE_MAP_KEYS +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_8]); + __type(value, __u8); + }); +} string_maps_8 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_9]); + __type(value, __u8); + }); +} string_maps_9 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + 
__array( + values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u8[STRING_MAPS_SIZE_10]); + __type(value, __u8); + }); +} string_maps_10 SEC(".maps"); +#endif + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, STRING_MAPS_HEAP_SIZE); +} string_maps_heap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, STRING_MAPS_HEAP_SIZE); +} string_maps_ro_zero SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(max_entries, 1); + __type(key, __u8[sizeof(struct string_prefix_lpm_trie)]); // Need to specify as byte array as wouldn't take struct as key type + __type(value, __u8); + __uint(map_flags, BPF_F_NO_PREALLOC); + }); +} string_prefix_maps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct string_prefix_lpm_trie)); +} string_prefix_maps_heap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, STRING_MAPS_OUTER_MAX_ENTRIES); + __uint(key_size, sizeof(__u32)); + __array( + values, struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(max_entries, 1); + __type(key, __u8[sizeof(struct string_postfix_lpm_trie)]); // Need to specify as byte array as wouldn't take struct as key type + __type(value, __u8); + __uint(map_flags, BPF_F_NO_PREALLOC); + }); +} string_postfix_maps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct string_postfix_lpm_trie)); +} string_postfix_maps_heap SEC(".maps"); + +#endif // STRING_MAPS_H__ diff --git a/src/security/tailcall_stack.h b/src/security/tailcall_stack.h new file mode 100644 index 0000000000000000000000000000000000000000..8d6313722631518f7b29622a456a3b39d7a76546 --- /dev/null +++ b/src/security/tailcall_stack.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include +#include +#include "../coolbpf.h" +#include "type.h" + +struct secure_tailcall_stack { + enum secure_funcs func; + union { + struct tcp_data_t tcp_data; + struct file_data_t file_data; + }; +} __attribute__((packed)); \ No newline at end of file diff --git a/src/security/type.h b/src/security/type.h new file mode 100644 index 0000000000000000000000000000000000000000..75bde6fcd5ffe061a9e474e55ca02c5531290b86 --- /dev/null +++ b/src/security/type.h @@ -0,0 +1,265 @@ +// +// Created by qianlu on 2024/6/12. 
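The string_prefix_maps tries above are keyed by struct string_prefix_lpm_trie, whose prefixlen is expressed in bits (filter_prog sets path_size * 8 when it builds the lookup key). A hedged user-space sketch of loading one configured path prefix into an inner trie, assuming map_fd is that inner map's file descriptor:

#include <stdint.h>
#include <string.h>
#include <bpf/bpf.h>

#define STRING_PREFIX_MAX_LENGTH 256 /* mirrors type.h */

struct string_prefix_lpm_trie {
	uint32_t prefixlen;                     /* in bits */
	uint8_t data[STRING_PREFIX_MAX_LENGTH];
};

static int add_path_prefix(int map_fd, const char *prefix)
{
	struct string_prefix_lpm_trie key = { 0 };
	size_t len = strlen(prefix);
	uint8_t val = 1;

	if (len > STRING_PREFIX_MAX_LENGTH)
		len = STRING_PREFIX_MAX_LENGTH;
	key.prefixlen = (uint32_t)len * 8; /* LPM prefix length is counted in bits */
	memcpy(key.data, prefix, len);
	return bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
}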
+// + +#ifndef SYSAK_TYPE_H +#define SYSAK_TYPE_H + +#ifdef __cplusplus +#include +#endif + +#include "bpf_process_event_type.h" + +#ifndef AF_INET +#define AF_INET 2 +#endif +#ifndef AF_INET6 +#define AF_INET6 10 +#endif +#define IPV4LEN 4 +#define IPV6LEN 16 + +#define MAY_EXEC 0x00000001 +#define MAY_WRITE 0x00000002 +#define MAY_READ 0x00000004 +#define MAY_APPEND 0x00000008 +#define MAY_ACCESS 0x00000010 +#define MAY_OPEN 0x00000020 +#define MAY_CHDIR 0x00000040 +/* called from RCU mode, don't block */ +#define MAY_NOT_BLOCK 0x00000080 + +struct tuple_type { + __u64 saddr[2]; + __u64 daddr[2]; + __u16 sport; + __u16 dport; + __u16 protocol; + __u16 family; +}; + +/// network etc +enum sock_secure_ctrl_type { + INVALID, + PID, + CONTAINER_ID, + SOURCE_IP, + SOURCE_PORT, + DEST_IP, + DEST_PORT, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + NET_NS, + MAX, +}; + +enum secure_funcs { + + // file + SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION, + SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_MMAP_FILE, + SECURE_FUNC_TRACEPOINT_FUNC_SECURITY_PATH_TRUNCATE, + SECURE_FUNC_TRACEPOINT_FUNC_SYS_WRITE, + SECURE_FUNC_TRACEPOINT_FUNC_SYS_READ, + + // network + SECURE_FUNC_TRACEPOINT_FUNC_TCP_CLOSE, + SECURE_FUNC_TRACEPOINT_FUNC_TCP_CONNECT, + SECURE_FUNC_TRACEPOINT_FUNC_TCP_SENDMSG, + + + // process + + SECURE_FUNCS_MAX, +}; + +enum sock_secure_func { + TRACEPOINT_FUNC_TCP_CLOSE, + TRACEPOINT_FUNC_TCP_CONNECT, + TRACEPOINT_FUNC_TCP_SENDMSG, + TRACEPOINT_FUNC_MAX, +}; + +struct addr_port { + __u32 addr; + __u16 port; +}; + +struct ns_key_t { + __u32 net_ns_inum; +}; + +struct tcp_data_t { + struct msg_execve_key key; + struct msg_execve_key pkey; + enum sock_secure_func func; + __u16 protocol; + __u16 state; + __u16 family; + __u32 pid; + __u32 saddr; // Source address + __u32 daddr; // Destination address + __u16 sport; // Source port + __u16 dport; // Destination port + __u32 net_ns; // Network namespace + __u64 timestamp; + __u64 bytes; +}; + +#define SYSAK_SECURE_MAX_CIDR_LIMIT 20 +#define SYSAK_SECURE_MAX_CIDR_LIMIT_HALF 10 +#define SYSAK_SECURE_MAX_PORT_LIMIT 20 +#define SYSAK_SECURE_MAX_PORT_LIMIT_HALF 10 + +#define SYSAK_SECURE_MAX_PATH_LIMIT 2 +#define SYSAK_SECURE_MAX_PATH_LENGTH_LIMIT 256 + +#define INT_MAPS_OUTER_MAX_ENTRIES 20 +#define INT_MAPS_INNER_MAX_ENTRIES 8 +#define STRING_MAPS_OUTER_MAX_ENTRIES 20 +#define STRING_MAPS_INNER_MAX_ENTRIES 8 + +struct cidr_entry { + int inited; + int black; // black list or not + __u32 net; // Network part of CIDR + __u32 mask; // Network mask +}; + +struct port_entry { + int inited; + int black; // is black list or not + __u16 port; // is src port or not +}; + + + +/// process etc + +// file +enum file_secure_func +{ + TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION, + TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION_WRITE, + TRACEPOINT_FUNC_SECURITY_FILE_PERMISSION_READ, + TRACEPOINT_FUNC_SECURITY_MMAP_FILE, + TRACEPOINT_FUNC_SECURITY_PATH_TRUNCATE, + TRACEPOINT_FUNC_SYS_WRITE, + TRACEPOINT_FUNC_SYS_READ, +}; +struct file_data_t +{ + struct msg_execve_key key; + struct msg_execve_key pkey; + enum file_secure_func func; + __u64 timestamp; + __u32 size; + char path[2000]; +}; +struct path_entry +{ + // todo need updates + int inited; + int length; + char path[2000]; +}; + + +#define STRING_MAPS_KEY_INC_SIZE 24 +#define STRING_MAPS_SIZE_0 (1 * STRING_MAPS_KEY_INC_SIZE + 1) +#define STRING_MAPS_SIZE_1 (2 * STRING_MAPS_KEY_INC_SIZE + 1) +#define STRING_MAPS_SIZE_2 (3 * STRING_MAPS_KEY_INC_SIZE + 1) +#define STRING_MAPS_SIZE_3 (4 * 
STRING_MAPS_KEY_INC_SIZE + 1) +#define STRING_MAPS_SIZE_4 (5 * STRING_MAPS_KEY_INC_SIZE + 1) +#define STRING_MAPS_SIZE_5 (6 * STRING_MAPS_KEY_INC_SIZE + 1) +#define STRING_MAPS_SIZE_6 (256 + 2) +#ifdef __LARGE_MAP_KEYS +#define STRING_MAPS_SIZE_7 (512 + 2) +#define STRING_MAPS_SIZE_8 (1024 + 2) +#define STRING_MAPS_SIZE_9 (2048 + 2) +#define STRING_MAPS_SIZE_10 (4096 + 2) +#else +#define STRING_MAPS_SIZE_7 (512) +#endif +#define STRING_MAPS_HEAP_SIZE 16384 +#define STRING_MAPS_HEAP_MASK (8192 - 1) +#define STRING_MAPS_COPY_MASK 4095 + +#define STRING_PREFIX_MAX_LENGTH 256 + +struct string_prefix_lpm_trie { + __u32 prefixlen; + __u8 data[STRING_PREFIX_MAX_LENGTH]; +}; + +#define STRING_POSTFIX_MAX_LENGTH 128 +#define STRING_POSTFIX_MAX_MASK (STRING_POSTFIX_MAX_LENGTH - 1) +#ifdef __LARGE_BPF_PROG +#define STRING_POSTFIX_MAX_MATCH_LENGTH STRING_POSTFIX_MAX_LENGTH +#else +#define STRING_POSTFIX_MAX_MATCH_LENGTH 95 +#endif + +struct string_postfix_lpm_trie { + __u32 prefixlen; + __u8 data[STRING_POSTFIX_MAX_LENGTH]; +}; + + +#define ADDR_LPM_MAPS_OUTER_MAX_ENTRIES 20 +#define ADDR_LPM_MAPS_INNER_MAX_ENTRIES 8 + + +struct addr4_lpm_trie { + __u32 prefix; + __u32 addr; +}; + +struct addr6_lpm_trie { + __u32 prefix; + __u32 addr[4]; +}; + +enum tailcall_func { + TAILCALL_FILTER_PROG, + TAILCALL_SEND, +}; + + +enum filter_type { + FILTER_TYPE_UNKNOWN, + FILTER_TYPE_SADDR, + FILTER_TYPE_DADDR, + FILTER_TYPE_NOT_SADDR, + FILTER_TYPE_NOT_DADDR, + FILTER_TYPE_SPORT, + FILTER_TYPE_DPORT, + FILTER_TYPE_NOT_SPORT, + FILTER_TYPE_NOT_DPORT, + FILTER_TYPE_FILE_PREFIX, +}; + +enum op_type { + OP_TYPE_IN, + OP_TYPE_NOT_IN, +}; + +#define MAX_FILTER_FOR_PER_CALLNAME 8 + +struct selector_filter { + // __u32 index; + // __u32 op; + __u32 vallen; + enum filter_type filter_type; + enum op_type op_type; + __u32 map_idx[2]; + // __u8 value; +} __attribute__((packed)); + +struct selector_filters { + int filter_count; + struct selector_filter filters[MAX_FILTER_FOR_PER_CALLNAME]; +} __attribute__((packed)); + + +#endif //SYSAK_TYPE_H diff --git a/third/libbpf/src/libbpf.c b/third/libbpf/src/libbpf.c index b57f2dea59da54f545ee1b2410d64ecab9798e84..aa3b7434fb54f6260e569ad93835efc36a981c0e 100644 --- a/third/libbpf/src/libbpf.c +++ b/third/libbpf/src/libbpf.c @@ -2643,7 +2643,7 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, return -EINVAL; } - snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + snprintf(inner_map_name, sizeof(inner_map_name), "%s_inner", map_name); err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); if (err) return err; @@ -2871,10 +2871,10 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, if (map->inner_map->fd < 0) return map->inner_map->fd; map->inner_map->sec_idx = sec_idx; - map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + map->inner_map->name = malloc(strlen(map_name) + sizeof("_inner") + 1); if (!map->inner_map->name) return -ENOMEM; - sprintf(map->inner_map->name, "%s.inner", map_name); + sprintf(map->inner_map->name, "%s_inner", map_name); fill_map_from_def(map->inner_map, &inner_def); } diff --git a/tools/examples/net/net.c b/tools/examples/net/net.c index 3659e65eab5fee4b2fa851185ae1fb785f10f907..6631f372be68b6abd9f49eaa283497bd5ce7acff 100644 --- a/tools/examples/net/net.c +++ b/tools/examples/net/net.c @@ -445,7 +445,7 @@ int main(int argc, char **argv) printf("net start end...\n"); while (1) { - err = ebpf_poll_events(100, &stop_flag); + err = ebpf_poll_events(100, &stop_flag, 0); if 
(exiting) { if (env_para.file != stdout)