diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index a1149911464c3b5005714e19b2d743961005c5fb..df413d7697675fae6b3672ee010871eeb83df5a1 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -14,6 +14,7 @@ #ifndef _UAPI__ASM_ARM_UNISTD_H #define _UAPI__ASM_ARM_UNISTD_H +#define __IGNORE_kabi_reserved454 #define __NR_OABI_SYSCALL_BASE 0x900000 #define __NR_SYSCALL_MASK 0x0fffff diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 409890bad06eb924b32f9b9950a7a5725b4ab41b..6fb31f9330e1fa151c513735e86fa167c104c5c9 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -97,6 +97,7 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_DYNAMIC is not set +CONFIG_XCU_SCHEDULER=n # # CPU/Task time and stats accounting diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h index 5f84e3dc98d0d98be0794f6c710c3548ebf75839..c9993b5dc7044e0dd70a096b8cf784f2653b823a 100644 --- a/arch/powerpc/include/uapi/asm/unistd.h +++ b/arch/powerpc/include/uapi/asm/unistd.h @@ -9,6 +9,7 @@ */ #ifndef _UAPI_ASM_POWERPC_UNISTD_H_ #define _UAPI_ASM_POWERPC_UNISTD_H_ +#define __IGNORE_kabi_reserved454 #ifndef __powerpc64__ #include diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 7d030d2d7717c936302f9ac7b3652124ff3fd7d3..8c289f45bcec1ecdc50155c10222377cfc6d6f4c 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -117,6 +117,7 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_DYNAMIC is not set +CONFIG_XCU_SCHEDULER=n # # CPU/Task time and stats accounting diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f88268a37ec25d53404d225e414eefe633f5d087..162517343cb1c283234e45ef0b12809a8604cee7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -375,7 +375,7 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack -454 common kabi_reserved454 sys_ni_syscall +454 common vstream_manage sys_vstream_manage 455 common kabi_reserved455 sys_ni_syscall 456 common kabi_reserved456 sys_ni_syscall 457 common kabi_reserved457 sys_ni_syscall diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index be5e2e747f507657efc74f5ed2b68ed262103fda..5d81c4bb98036cf0c46cbaf8b22a8091f9ef8e40 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -11,6 +11,7 @@ * thing regardless. 
*/ #define __X32_SYSCALL_BIT 0x40000000 +#define __IGNORE_kabi_reserved454 #ifndef __KERNEL__ # ifdef __i386__ diff --git a/drivers/Makefile b/drivers/Makefile index f8e58f0ca2d1d10616725be02a68c046804ea541..57826d4b5cd7c5265bf4d1a2923d31dbc6bb4ff2 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -197,6 +197,7 @@ obj-$(CONFIG_GNSS) += gnss/ obj-$(CONFIG_INTERCONNECT) += interconnect/ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ +obj-$(CONFIG_XCU_SCHEDULER) += xcu/ obj-$(CONFIG_PECI) += peci/ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ diff --git a/drivers/xcu/Makefile b/drivers/xcu/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..575115b148ecb689af0cb3047e443a95958e21f7 --- /dev/null +++ b/drivers/xcu/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_XCU_SCHEDULER) += xcu_group.o diff --git a/drivers/xcu/xcu_group.c b/drivers/xcu/xcu_group.c new file mode 100644 index 0000000000000000000000000000000000000000..0cd8f535fb2b4a75be6a5e4fe02f4b625becbe97 --- /dev/null +++ b/drivers/xcu/xcu_group.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Code for NPU driver support + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include + +int num_active_xcu; +static DEFINE_SPINLOCK(xcu_mgr_lock); +struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS]; +static DECLARE_RWSEM(xcu_group_rwsem); +struct xcu_group *xcu_group_init(int id) +{ + struct xcu_group *node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!node) + return NULL; + + node->id = id; + node->type = XCU_TYPE_XPU; + idr_init(&node->next_layer); + return node; +} +EXPORT_SYMBOL(xcu_group_init); + +int __xcu_group_attach(struct xcu_group *new_group, + struct xcu_group *previous_group) +{ + int id = new_group->id; + + if (id == -1) + id = idr_alloc(&previous_group->next_layer, new_group, 0, + INT_MAX, GFP_KERNEL); + else + id = idr_alloc(&previous_group->next_layer, new_group, id, + id + 1, GFP_KERNEL); + + if (id < 0) { + XSCHED_ERR("Fail to attach xcu_group: id conflict @ %s\n", + __func__); + return -EEXIST; + } + new_group->id = id; + new_group->previous_layer = previous_group; + + return 0; +} + +int xcu_group_attach(struct xcu_group *new_group, + struct xcu_group *previous_group) +{ + int ret; + + down_write(&xcu_group_rwsem); + ret = __xcu_group_attach(new_group, previous_group); + up_write(&xcu_group_rwsem); + + return ret; +} +EXPORT_SYMBOL(xcu_group_attach); + +static inline void __xcu_group_detach(struct xcu_group *group) +{ + if (!group || !group->previous_layer) + return; + + idr_remove(&group->previous_layer->next_layer, group->id); + group->previous_layer = NULL; +} + +void xcu_group_detach(struct xcu_group *group) +{ + down_write(&xcu_group_rwsem); + __xcu_group_detach(group); + up_write(&xcu_group_rwsem); +} +EXPORT_SYMBOL(xcu_group_detach); + +void xcu_group_free(struct xcu_group *group) +{ + idr_destroy(&group->next_layer); + if (group != xcu_group_root) + kfree(group); +} +EXPORT_SYMBOL(xcu_group_free); + +static 
struct xcu_group *__xcu_group_find_nolock(struct xcu_group *group, int id) +{ + return idr_find(&group->next_layer, id); +} + +struct xcu_group *xcu_group_find(struct xcu_group *group, int id) +{ + struct xcu_group *result; + + down_read(&xcu_group_rwsem); + result = __xcu_group_find_nolock(group, id); + up_read(&xcu_group_rwsem); + + return result; +} +EXPORT_SYMBOL(xcu_group_find); + +/* This function runs "run" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object + */ +int xcu_run(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->run) { + XSCHED_ERR("No function [run] called.\n"); + return -EINVAL; + } + + return params->group->opt->run(params); +} + +/* This function runs "wait" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object + */ +int xcu_wait(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->wait) { + XSCHED_ERR("No function [wait] called.\n"); + return -EINVAL; + } + + return params->group->opt->wait(params); +} + +/* This function runs "complete" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + */ +int xcu_complete(struct xcu_op_handler_params *params) +{ + return 0; +} + +/* This function runs "finish" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement deallocation + * and freeing memory for SQ and CQ buffers. + */ +int xcu_finish(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->finish) { + XSCHED_ERR("No function [finish] called.\n"); + return -EINVAL; + } + + return params->group->opt->finish(params); +} + +/* This function runs a "alloc" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement allocation + * and registering memory for SQ and CQ buffers. + */ +int xcu_alloc(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->alloc) { + XSCHED_ERR("No function [alloc] called.\n"); + return -EINVAL; + } + + return params->group->opt->alloc(params); +} + +/* This function runs a "logic_alloc" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement allocation + * and registering memory of logic CQ buffer. + */ +int xcu_logic_alloc(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->logic_alloc) { + XSCHED_ERR("No function [logic_alloc] called.\n"); + return -EINVAL; + } + + return params->group->opt->logic_alloc(params); +} + +/* This function runs a "logic_free" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement deallocation + * and unregistering memory of a logic CQ buffer. 
+ */ +int xcu_logic_free(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->logic_free) { + XSCHED_ERR("No function [logic_free] called.\n"); + return -EINVAL; + } + + return params->group->opt->logic_free(params); +} + +/* This function runs a "sqe_op" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to set or get sqe info. + */ +int xcu_sqe_op(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->sqe_op) { + XSCHED_ERR("No function [sqe_op] called.\n"); + return -EINVAL; + } + + return params->group->opt->sqe_op(params); +} + +static struct xcu_group __xcu_group_root = { + .id = 0, + .type = XCU_TYPE_ROOT, + .next_layer = IDR_INIT(next_layer), +}; + +struct xcu_group *xcu_group_root = &__xcu_group_root; +EXPORT_SYMBOL(xcu_group_root); + +static int nr_active_cu_inc(void) +{ + int cur_num = -1; + + spin_lock(&xcu_mgr_lock); + if (num_active_xcu >= XSCHED_NR_CUS) + goto out_unlock; + + cur_num = num_active_xcu; + num_active_xcu++; + +out_unlock: + spin_unlock(&xcu_mgr_lock); + return cur_num; +} + +static int nr_active_cu_dec(void) +{ + int cur_num = -1; + + spin_lock(&xcu_mgr_lock); + if (num_active_xcu <= 0) + goto out_unlock; + + cur_num = num_active_xcu; + num_active_xcu--; + +out_unlock: + spin_unlock(&xcu_mgr_lock); + return cur_num; +} + +/* + * Initialize and register xcu in xcu_manager array. + */ +int xsched_xcu_register(struct xcu_group *group, uint32_t phys_id) +{ + int xcu_cur_num, ret = 0; + struct xsched_cu *xcu; + + if (phys_id >= XSCHED_NR_CUS) { + XSCHED_ERR("phys_id [%u] is out of valid range [0, %d).\n", + phys_id, XSCHED_NR_CUS); + return -EINVAL; + } + + if (!group) { + XSCHED_ERR("group cannot be NULL.\n"); + return -EINVAL; + } + + xcu_cur_num = nr_active_cu_inc(); + if (xcu_cur_num < 0) { + XSCHED_ERR("Number of present XCU's exceeds %d: %d.\n", + XSCHED_NR_CUS, num_active_xcu); + return -ENOSPC; + } + + xcu = kzalloc(sizeof(struct xsched_cu), GFP_KERNEL); + if (!xcu) { + if (nr_active_cu_dec() < 0) { + XSCHED_ERR("num_active_xcu [%d] must be > 0.\n", + num_active_xcu); + } + XSCHED_ERR("Fail to alloc xcu for phys_id [%u].\n", phys_id); + return -ENOMEM; + } + + group->xcu = xcu; + xsched_cu_mgr[phys_id] = xcu; + + /* Init xcu's internals. 
*/ + ret = xsched_xcu_init(xcu, group, phys_id); + if (ret != 0) { + group->xcu = NULL; + xsched_cu_mgr[phys_id] = NULL; + kfree(xcu); + XSCHED_ERR("Failed to init xcu [%u].\n", phys_id); + return ret; + } + + xcu_cfs_root_cg_init(xcu); + + return 0; +} +EXPORT_SYMBOL(xsched_xcu_register); + +int xsched_xcu_unregister(struct xcu_group *group, uint32_t phys_id) +{ + struct xsched_cu *xcu; + + if (phys_id >= XSCHED_NR_CUS) { + XSCHED_ERR("phys_id [%u] is out of valid range [0, %d).\n", + phys_id, XSCHED_NR_CUS); + return -EINVAL; + } + + if (!group || !group->xcu || group->xcu != xsched_cu_mgr[phys_id]) { + XSCHED_ERR("group is invalid or cannot be mapped to phys_id [%u].\n", phys_id); + return -EINVAL; + } + + xcu = group->xcu; + if (!xcu) { + XSCHED_ERR("xcu for phys_id [%u] not found.\n", phys_id); + return -EINVAL; + } + + if (nr_active_cu_dec() < 0) { + XSCHED_ERR("No active XCU\n"); + return -EPERM; + } + + if (xcu->worker) { + mutex_lock(&xcu->xcu_lock); + wake_up_interruptible(&xcu->wq_xcu_idle); + mutex_unlock(&xcu->xcu_lock); + + kthread_stop(xcu->worker); + xcu->worker = NULL; + } else { + XSCHED_ERR("The worker for xcu [%u] must not be NULL.\n", xcu->id); + } + + group->xcu = NULL; + xsched_cu_mgr[phys_id] = NULL; + kfree(xcu); + + return 0; +} +EXPORT_SYMBOL(xsched_xcu_unregister); + diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 85fa78049bd0fbebee59db79363c962541a15c92..e65ae90946c2d3e27f724f4b965157b404b7b849 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -61,6 +61,10 @@ SUBSYS(pids) SUBSYS(rdma) #endif +#if IS_ENABLED(CONFIG_CGROUP_XCU) +SUBSYS(xcu) +#endif + #if IS_ENABLED(CONFIG_CGROUP_MISC) SUBSYS(misc) #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 36c592e43d65208f6d1b3099fa6805a24d5961de..119aabc72a2d403596ff695208e3fc02c9de350e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -74,6 +74,7 @@ struct landlock_ruleset_attr; enum landlock_rule_type; struct cachestat_range; struct cachestat; +struct vstream_args; #include #include @@ -948,6 +949,7 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); +asmlinkage long sys_vstream_manage(struct vstream_args __user *arg, int cmd); /* * Architecture-specific system calls */ diff --git a/include/linux/vstream.h b/include/linux/vstream.h new file mode 100644 index 0000000000000000000000000000000000000000..fd393ec97a99ab2cd70d8db51828273da41d8f24 --- /dev/null +++ b/include/linux/vstream.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VSTREAM_H +#define _LINUX_VSTREAM_H + +#include +#include + +#define MAX_VSTREAM_SIZE 2048 + +/* Vstream metadata describes each incoming kick + * that gets stored into a list of pending kicks + * inside a vstream to keep track of what is left + * to be processed by a driver. + */ +typedef struct vstream_metadata { + uint32_t exec_time; + /* A value of SQ tail that has been passed with the + * kick that is described by this exact metadata object. + */ + uint32_t sq_tail; + uint32_t sqe_num; + uint32_t sq_id; + uint8_t sqe[XCU_SQE_SIZE_MAX]; + + /* Report buffer for fake read.
*/ + int8_t cqe[XCU_CQE_BUF_SIZE]; + uint32_t cqe_num; + int32_t timeout; + + /* A node for metadata list */ + struct list_head node; + + struct vstream_info *parent; + + /* Time of list insertion */ + ktime_t add_time; +} vstream_metadata_t; + +typedef int vstream_manage_t(struct vstream_args *arg); + +typedef struct vstream_info { + uint32_t user_stream_id; + uint32_t id; + uint32_t vcq_id; + uint32_t logic_vcq_id; + uint32_t dev_id; + uint32_t channel_id; + uint32_t fd; + uint32_t task_type; + int tgid; + int sqcq_type; + + void *drv_ctx; + + int inode_fd; + + /* Pointer to corresponding context. */ + struct xsched_context *ctx; + + /* List node in context's vstream list. */ + struct list_head ctx_node; + + /* Pointer to an CU object on which this + * vstream is currently being processed. + * NULL if vstream is not being processed. + */ + struct xsched_cu *xcu; + + /* List node in an CU list of vstreams that + * are currently being processed by this specific CU. + */ + struct list_head xcu_node; + + /* Private vstream data. */ + void *data; + + spinlock_t stream_lock; + + uint32_t kicks_count; + + /* List of metadata a.k.a. all recorded unprocesed + * kicks for this exact vstream. + */ + struct list_head metadata_list; +} vstream_info_t; + +int vstream_alloc(struct vstream_args *arg); +int vstream_free(struct vstream_args *arg); +int vstream_kick(struct vstream_args *arg); + +#endif /* _LINUX_VSTREAM_H */ diff --git a/include/linux/xcu_group.h b/include/linux/xcu_group.h new file mode 100644 index 0000000000000000000000000000000000000000..c129dca32c518e7f35f6b10a74f01e681e547574 --- /dev/null +++ b/include/linux/xcu_group.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __XSCHED_XCU_GROUP_H__ +#define __XSCHED_XCU_GROUP_H__ + +#include +#include + +#ifndef CONFIG_XSCHED_NR_CUS +#define CONFIG_XSCHED_NR_CUS 128 +#endif /* !CONFIG_XSCHED_NR_CUS */ +#define XSCHED_NR_CUS CONFIG_XSCHED_NR_CUS + +extern struct xcu_group *xcu_group_root; + +enum xcu_type { + XCU_TYPE_ROOT, + XCU_TYPE_XPU, +}; + +enum xcu_sqe_op_type { + SQE_SET_NOTIFY, + SQE_IS_NOTIFY, +}; + +struct xcu_op_handler_params { + int fd; + struct xcu_group *group; + void *payload; + union { + struct { + void *param_1; + void *param_2; + void *param_3; + void *param_4; + void *param_5; + void *param_6; + void *param_7; + void *param_8; + }; + }; +}; + +typedef int (*xcu_op_handler_fn_t)(struct xcu_op_handler_params *params); + +struct xcu_operation { + xcu_op_handler_fn_t run; + xcu_op_handler_fn_t finish; + xcu_op_handler_fn_t wait; + xcu_op_handler_fn_t complete; + xcu_op_handler_fn_t alloc; + xcu_op_handler_fn_t logic_alloc; + xcu_op_handler_fn_t logic_free; + xcu_op_handler_fn_t sqe_op; +}; + +struct xcu_group { + /* sq id. */ + uint32_t id; + + /* Type of XCU group. */ + enum xcu_type type; + + /* IDR for the next layer of XCU group tree. */ + struct idr next_layer; + + /* Pointer to the previous XCU group in the XCU group tree. */ + struct xcu_group *previous_layer; + + /* Pointer to operation fn pointers object describing + * this XCU group's callbacks. + */ + struct xcu_operation *opt; + + /* Pointer to the XCU related to this XCU group. 
*/ + struct xsched_cu *xcu; +}; + +int xcu_group_attach(struct xcu_group *new_group, + struct xcu_group *previous_group); +void xcu_group_detach(struct xcu_group *group); +struct xcu_group *xcu_group_find(struct xcu_group *group, int id); +struct xcu_group *xcu_group_init(int id); +void xcu_group_free(struct xcu_group *group); + +extern int xcu_run(struct xcu_op_handler_params *params); +extern int xcu_wait(struct xcu_op_handler_params *params); +extern int xcu_complete(struct xcu_op_handler_params *params); +extern int xcu_finish(struct xcu_op_handler_params *params); +extern int xcu_alloc(struct xcu_op_handler_params *params); +extern int xcu_logic_alloc(struct xcu_op_handler_params *params); +extern int xcu_logic_free(struct xcu_op_handler_params *params); +extern int xcu_sqe_op(struct xcu_op_handler_params *params); + +int xsched_xcu_register(struct xcu_group *group, uint32_t phys_id); +int xsched_xcu_unregister(struct xcu_group *group, uint32_t phys_id); +#endif /* __XSCHED_XCU_GROUP_H__ */ diff --git a/include/linux/xsched.h b/include/linux/xsched.h new file mode 100644 index 0000000000000000000000000000000000000000..d97b3beae8ad2a6bd547a8540f6efd6b87697d00 --- /dev/null +++ b/include/linux/xsched.h @@ -0,0 +1,514 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_XSCHED_H__ +#define __LINUX_XSCHED_H__ + +#include +#include +#include +#include + +#ifndef pr_fmt +#define pr_fmt(fmt) fmt +#endif + +#define XSCHED_INFO_PREFIX "XSched [INFO]: " +#define XSCHED_INFO(fmt, ...) \ + pr_info(pr_fmt(XSCHED_INFO_PREFIX fmt), ##__VA_ARGS__) + +#define XSCHED_ERR_PREFIX "XSched [ERROR]: " +#define XSCHED_ERR(fmt, ...) \ + pr_err(pr_fmt(XSCHED_ERR_PREFIX fmt), ##__VA_ARGS__) + +#define XSCHED_WARN_PREFIX "XSched [WARNING]: " +#define XSCHED_WARN(fmt, ...) \ + pr_warn(pr_fmt(XSCHED_WARN_PREFIX fmt), ##__VA_ARGS__) + +/* + * Debug specific prints for XSched + */ + +#define XSCHED_DEBUG_PREFIX "XSched [DEBUG]: " +#define XSCHED_DEBUG(fmt, ...) \ + pr_debug(pr_fmt(XSCHED_DEBUG_PREFIX fmt), ##__VA_ARGS__) + +#define XSCHED_CALL_STUB() \ + XSCHED_DEBUG(" -----* %s @ %s called *-----\n", __func__, __FILE__) + +#define XSCHED_EXIT_STUB() \ + XSCHED_DEBUG(" -----* %s @ %s exited *-----\n", __func__, __FILE__) + +#define MAX_VSTREAM_NUM 512 + +#define RUNTIME_INF ((u64)~0ULL) +#define XSCHED_TIME_INF RUNTIME_INF +#define XSCHED_CFS_WEIGHT_DFLT 1 +#define XSCHED_CFS_QUOTA_PERIOD_MS (100 * NSEC_PER_MSEC) +#define XSCHED_CFG_SHARE_DFLT 1024 + +/* + * A default kick slice for RT class XSEs. + */ +#define XSCHED_RT_KICK_SLICE 2 +/* + * A default kick slice for CFS class XSEs. + */ +#define XSCHED_CFS_KICK_SLICE 10 + +extern struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS]; + +enum xcu_sched_type { + XSCHED_TYPE_RT = 0, + XSCHED_TYPE_CFS = 1, + XSCHED_TYPE_NUM, + XSCHED_TYPE_DFLT = XSCHED_TYPE_RT +}; + +enum xse_prio { + XSE_PRIO_HIGH = 0, + XSE_PRIO_LOW = 4, + NR_XSE_PRIO, + XSE_PRIO_DFLT = XSE_PRIO_LOW +}; + +extern struct xsched_class rt_xsched_class; +extern struct xsched_class fair_xsched_class; + +#define xsched_first_class \ + list_first_entry(&(xsched_class_list), struct xsched_class, node) + +#define for_each_xsched_class(class) \ + list_for_each_entry((class), &(xsched_class_list), node) + +#define for_each_xse_prio(prio) \ + for (prio = XSE_PRIO_HIGH; prio < NR_XSE_PRIO; prio++) +#define for_each_vstream_in_ctx(vs, ctx) \ + list_for_each_entry((vs), &((ctx)->vstream_list), ctx_node) + + +/* Manages xsched RT-like class linked list based runqueue. 
+ * + * Now RT-like class runqueue structs is identical + * but will most likely grow different in the + * future as the Xsched evolves. + */ +struct xsched_rq_rt { + struct list_head rq[NR_XSE_PRIO]; + unsigned int nr_running; +}; + +/* Manages xsched CFS-like class rbtree based runqueue. */ +struct xsched_rq_cfs { + unsigned int nr_running; + unsigned int load; + u64 min_xruntime; + struct rb_root_cached ctx_timeline; +}; + +/* Base XSched runqueue object structure that contains both mutual and + * individual parameters for different scheduling classes. + */ +struct xsched_rq { + struct xsched_entity *curr_xse; + const struct xsched_class *class; + + int state; + int nr_running; + /* RT class run queue.*/ + struct xsched_rq_rt rt; + /* CFS class run queue.*/ + struct xsched_rq_cfs cfs; +}; + +enum xsched_cu_status { + /* Worker not initialized. */ + XSCHED_XCU_NONE, + + /* Worker is sleeping in idle state. */ + XSCHED_XCU_WAIT_IDLE, + + /* Worker is sleeping in running state. */ + XSCHED_XCU_WAIT_RUNNING, + + /* Worker is active but not processing anything. */ + XSCHED_XCU_ACTIVE, + + NR_XSCHED_XCU_STATUS, +}; + +/* This is the abstraction object of the xcu computing unit. */ +struct xsched_cu { + uint32_t id; + uint32_t state; + + atomic_t pending_kicks; + struct task_struct *worker; + + /* Storage list for contexts associated with this xcu */ + uint32_t nr_ctx; + struct list_head ctx_list; + struct mutex ctx_list_lock; + + vstream_info_t *vs_array[MAX_VSTREAM_NUM]; + struct mutex vs_array_lock; + + struct xsched_rq xrq; + struct list_head vsm_list; + + struct xcu_group *group; + struct mutex xcu_lock; + wait_queue_head_t wq_xcu_idle; +}; + +extern int num_active_xcu; +#define for_each_active_xcu(xcu, id) \ + for ((id) = 0, xcu = xsched_cu_mgr[(id)]; \ + (id) < num_active_xcu && (xcu = xsched_cu_mgr[(id)]); (id)++) + +struct xsched_entity_rt { + struct list_head list_node; + enum xse_prio prio; + + ktime_t timeslice; +}; + +struct xsched_entity_cfs { + struct rb_node run_node; + + /* Rq on which this entity is (to be) queued. */ + struct xsched_rq_cfs *cfs_rq; + + /* Value of "virtual" runtime to sort entities in rbtree */ + u64 xruntime; + u32 weight; + + /* Execution time of scheduling entity */ + u64 exec_start; + u64 sum_exec_runtime; +}; + +struct xsched_entity { + uint32_t task_type; + + bool on_rq; + + pid_t owner_pid; + pid_t tgid; + + /* Amount of pending kicks currently sitting on this context. */ + atomic_t kicks_pending_ctx_cnt; + + /* Amount of submitted kicks context, used for resched decision. */ + atomic_t submitted_one_kick; + + size_t total_scheduled; + size_t total_submitted; + + /* File descriptor coming from an associated context + * used for identifying a given xsched entity in + * info and error prints. + */ + uint32_t fd; + + /* Xsched class for this xse. */ + const struct xsched_class *class; + + /* RT class entity. */ + struct xsched_entity_rt rt; + /* CFS class entity. */ + struct xsched_entity_cfs cfs; + + /* Pointer to context object. */ + struct xsched_context *ctx; + + /* Xsched entity execution statistics */ + u64 last_exec_runtime; + + /* Pointer to an XCU object that represents an XCU + * on which this xse is to be processed or is being + * processed currently. + */ + struct xsched_cu *xcu; + + /* Link to list of xsched_group items */ + struct list_head group_node; + struct xsched_group *parent_grp; + bool is_group; + + /* General purpose xse lock. 
*/ + spinlock_t xse_lock; +}; + +struct xcg_attach_entry { + struct task_struct *task; + struct xsched_group *old_xcg; + struct xsched_group *new_xcg; + + struct list_head node; +}; + +/* xsched_group's xcu related stuff */ +struct xsched_group_xcu_priv { + /* Owner of this group */ + struct xsched_group *self; + + /* xcu id */ + int xcu_id; + + /* Link to scheduler */ + struct xsched_entity xse; /* xse of this group on runqueue */ + struct xsched_rq_cfs *cfs_rq; /* cfs runqueue "owned" by this group */ + struct xsched_rq_rt *rt_rq; /* rt runqueue "owned" by this group */ + /* Statistics */ + int nr_throttled; + u64 throttled_time; +}; + +enum xcu_file_type { + XCU_FILE_PERIOD_MS, + XCU_FILE_QUOTA_MS, + XCU_FILE_SHARES, + NR_XCU_FILE_TYPES, +}; + +/* Xsched scheduling control group */ +struct xsched_group { + /* Cgroups controller structure */ + struct cgroup_subsys_state css; + + /* Control group settings: */ + int sched_class; + int prio; + + /* Bandwidth setting: shares value set by user */ + u64 shares_cfg; + u64 shares_cfg_red; + u32 weight; + u64 children_shares_sum; + + /* Bandwidth setting: maximal quota in period */ + s64 quota; + /* record the runtime of operators during the period */ + s64 runtime; + s64 period; + struct hrtimer quota_timeout; + struct work_struct refill_work; + + struct xsched_group_xcu_priv perxcu_priv[XSCHED_NR_CUS]; + + /* Groups hierarchcy */ + struct xsched_group *parent; + struct list_head children_groups; + struct list_head group_node; + + spinlock_t lock; + + /* for XSE to move in perxcu */ + struct list_head members; + + /* to control the xcu.{period, quota, shares} files shown or not */ + struct cgroup_file xcu_file[NR_XCU_FILE_TYPES]; + struct work_struct file_show_work; +}; + +#define XSCHED_RQ_OF(xse) \ + (container_of(((xse)->cfs.cfs_rq), struct xsched_rq, cfs)) + +#define XSCHED_RQ_OF_CFS_XSE(cfs_xse) \ + (container_of(((cfs_xse)->cfs_rq), struct xsched_rq, cfs)) + +#define XSCHED_SE_OF(cfs_xse) \ + (container_of((cfs_xse), struct xsched_entity, cfs)) + +#define xcg_parent_grp_xcu(xcg) \ + ((xcg)->self->parent->perxcu_priv[(xcg)->xcu_id]) + +#define xse_parent_grp_xcu(xse_cfs) \ + (&((XSCHED_SE_OF(xse_cfs) \ + ->parent_grp->perxcu_priv[(XSCHED_SE_OF(xse_cfs))->xcu->id]))) + +static inline struct xsched_group_xcu_priv * +xse_this_grp_xcu(struct xsched_entity_cfs *xse_cfs) +{ + struct xsched_entity *xse; + + xse = xse_cfs ? container_of(xse_cfs, struct xsched_entity, cfs) : NULL; + return xse ? container_of(xse, struct xsched_group_xcu_priv, xse) : NULL; +} + +static inline struct xsched_group * +xse_this_grp(struct xsched_entity_cfs *xse_cfs) +{ + return xse_cfs ? xse_this_grp_xcu(xse_cfs)->self : NULL; +} + +/* Increments pending kicks counter for an XCU that the given + * xsched entity is attached to and for xsched entity's xsched + * class. + */ +static inline int xsched_inc_pending_kicks_xse(struct xsched_entity *xse) +{ + atomic_inc(&xse->xcu->pending_kicks); + /* Icrement pending kicks for current XSE. */ + atomic_inc(&xse->kicks_pending_ctx_cnt); + + return 0; +} + +/* Decrements pending kicks counter for an XCU that the given + * xsched entity is attached to and for XSched entity's sched + * class. + */ +static inline int xsched_dec_pending_kicks_xse(struct xsched_entity *xse) +{ + atomic_dec(&xse->xcu->pending_kicks); + /* Decrementing pending kicks for current XSE. */ + atomic_dec(&xse->kicks_pending_ctx_cnt); + + return 0; +} + +/* Checks if there are pending kicks left on a given XCU for all + * xsched classes. 
+ */ +static inline bool xsched_check_pending_kicks_xcu(struct xsched_cu *xcu) +{ + return atomic_read(&xcu->pending_kicks); +} + +static inline int xse_integrity_check(const struct xsched_entity *xse) +{ + if (!xse) { + XSCHED_ERR("xse is null @ %s\n", __func__); + return -EINVAL; + } + + if (!xse->class) { + XSCHED_ERR("xse->class is null @ %s\n", __func__); + return -EINVAL; + } + + return 0; +} + +struct xsched_context { + uint32_t fd; + uint32_t dev_id; + pid_t tgid; + + struct list_head vstream_list; + struct list_head ctx_node; + + struct xsched_entity xse; + + spinlock_t ctx_lock; + struct mutex ctx_mutex; + struct kref kref; +}; + +extern struct list_head xsched_ctx_list; +extern struct mutex xsched_ctx_list_mutex; + +/* Returns a pointer to xsched_context object corresponding to a given + * tgid and xcu. + */ +static inline struct xsched_context * +ctx_find_by_tgid_and_xcu(pid_t tgid, struct xsched_cu *xcu) +{ + struct xsched_context *ctx; + struct xsched_context *ret = NULL; + + list_for_each_entry(ctx, &xcu->ctx_list, ctx_node) { + if (ctx->tgid == tgid) { + ret = ctx; + break; + } + } + return ret; +} + +static inline u64 gcd(u64 a, u64 b) +{ + u64 rem; + + while (a != 0 && b != 0) { + if (a > b) { + div64_u64_rem(a, b, &rem); + a = rem; + } else { + div64_u64_rem(b, a, &rem); + b = rem; + } + } + return (a) ? a : b; +} + +struct xsched_class { + enum xcu_sched_type class_id; + size_t kick_slice; + struct list_head node; + + /* Initialize a new xsched entity */ + void (*xse_init)(struct xsched_entity *xse); + + /* Destroy XSE scheduler-specific data */ + void (*xse_deinit)(struct xsched_entity *xse); + + /* Initialize a new runqueue per xcu */ + void (*rq_init)(struct xsched_cu *xcu); + + /* Removes a given XSE from it's runqueue. */ + void (*dequeue_ctx)(struct xsched_entity *xse); + + /* Places a given XSE on a runqueue on a given XCU. */ + void (*enqueue_ctx)(struct xsched_entity *xse, struct xsched_cu *xcu); + + /* Returns a next XSE to be submitted on a given XCU. */ + struct xsched_entity *(*pick_next_ctx)(struct xsched_cu *xcu); + + /* Put a XSE back into rq during preemption. */ + void (*put_prev_ctx)(struct xsched_entity *xse); + + /* Check context preemption. 
*/ + bool (*check_preempt)(struct xsched_entity *xse); + + /* Select jobs from XSE to submit on XCU */ + size_t (*select_work)(struct xsched_cu *xcu, struct xsched_entity *xse); +}; + +static inline void xsched_init_vsm(struct vstream_metadata *vsm, + struct vstream_info *vs, vstream_args_t *arg) +{ + vsm->sq_id = arg->sq_id; + vsm->exec_time = arg->vk_args.exec_time; + vsm->sqe_num = arg->vk_args.sqe_num; + vsm->timeout = arg->vk_args.timeout; + memcpy(vsm->sqe, arg->vk_args.sqe, XCU_SQE_SIZE_MAX); + vsm->parent = vs; + INIT_LIST_HEAD(&vsm->node); +} + +int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id); +int xsched_schedule(void *input_xcu); +int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs); +int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx); +int xsched_vsm_add_tail(struct vstream_info *vs, vstream_args_t *arg); +struct vstream_metadata *xsched_vsm_fetch_first(struct vstream_info *vs); +int xsched_rt_prio_set(pid_t tgid, unsigned int prio); +void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); +void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); +int delete_ctx(struct xsched_context *ctx); + +/* Xsched group manage functions */ +void xsched_group_inherit(struct task_struct *tsk, struct xsched_entity *xse); +void xcu_cg_subsys_init(void); +void xcu_cfs_root_cg_init(struct xsched_cu *xcu); +void xcu_grp_shares_update(struct xsched_group *parent); +void xsched_group_xse_detach(struct xsched_entity *xse); + +void xsched_quota_init(void); +void xsched_quota_timeout_init(struct xsched_group *xg); +void xsched_quota_timeout_update(struct xsched_group *xg); +void xsched_quota_account(struct xsched_group *xg, s64 exec_time); +bool xsched_quota_exceed(struct xsched_group *xg); +void xsched_quota_refill(struct work_struct *work); +#endif /* !__LINUX_XSCHED_H__ */ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index bf2b30463784e026092d3dcde784237f4b5a49f0..ea50d1a3471cb6abddef8048ff54d3984f2fe427 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -826,8 +826,10 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2) #define __NR_map_shadow_stack 453 __SYSCALL(__NR_map_shadow_stack, sys_map_shadow_stack) -#define __NR_kabi_reserved454 454 -__SYSCALL(__NR_kabi_reserved454, sys_ni_syscall) +#define __IGNORE_kabi_reserved454 +#define __NR_vstream_manage 454 +__SYSCALL(__NR_vstream_manage, sys_vstream_manage) + #define __NR_kabi_reserved455 455 __SYSCALL(__NR_kabi_reserved455, sys_ni_syscall) #define __NR_kabi_reserved456 456 diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h new file mode 100644 index 0000000000000000000000000000000000000000..b60c0e0e15f575ceb4c224ef2e53719ab4e921e7 --- /dev/null +++ b/include/uapi/linux/xcu_vstream.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_XCU_VSTREAM_H +#define _UAPI_XCU_VSTREAM_H + +#include + +#define PAYLOAD_SIZE_MAX 512 +#define XCU_SQE_SIZE_MAX 64 +#define XCU_CQE_SIZE_MAX 32 +#define XCU_CQE_REPORT_NUM 4 +#define XCU_CQE_BUF_SIZE (XCU_CQE_REPORT_NUM * XCU_CQE_SIZE_MAX) + +#define KABI_RESERVE_BYTES(idx, n) \ + __u8 __kabi_reserved_##idx[n] + +/* + * VSTREAM_ALLOC: alloc a vstream, buffer for tasks + * VSTREAM_FREE: free a vstream + * VSTREAM_KICK: there are tasks to be executed in the vstream + */ +typedef enum VSTREAM_COMMAND { + VSTREAM_ALLOC = 0, + VSTREAM_FREE, + VSTREAM_KICK, + MAX_COMMAND +} 
vstream_command_t; + +typedef struct vstream_alloc_args { + __s32 type; + __u32 user_stream_id; + + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_alloc_args_t; + +typedef struct vstream_free_args { + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_free_args_t; + +typedef struct vstream_kick_args { + __u32 sqe_num; + __u32 exec_time; + __s32 timeout; + __s8 sqe[XCU_SQE_SIZE_MAX]; + + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_kick_args_t; + +typedef struct vstream_args { + __u32 channel_id; + __u32 fd; + __u32 dev_id; + __u32 task_type; + __u32 sq_id; + __u32 cq_id; + + /* Device related structures. */ + union { + vstream_alloc_args_t va_args; + vstream_free_args_t vf_args; + vstream_kick_args_t vk_args; + }; + + __u32 payload_size; + __s8 payload[PAYLOAD_SIZE_MAX]; + + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_args_t; + +#endif /* _UAPI_XCU_VSTREAM_H */ diff --git a/init/Kconfig b/init/Kconfig index 2720083aaa17d5d47cacdd37c51371e11e3aea97..b3c4487fa63146a8c2101cdb9013e29d0efdbf41 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -488,6 +488,7 @@ source "kernel/time/Kconfig" source "kernel/bpf/Kconfig" source "kernel/bpf-rvi/Kconfig" source "kernel/Kconfig.preempt" +source "kernel/xsched/Kconfig" menu "CPU/Task time and stats accounting" diff --git a/kernel/Makefile b/kernel/Makefile index da4c2d1838dc9a4dbac5757743f330fa192fe650..c6372dd5b36a9d2877e2187fca3a3e8ed7dab42e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -50,6 +50,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-$(CONFIG_XCU_SCHEDULER) += xsched/ obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 115717d58aa7ea2a2b8055b669c2cf972f4a71cd..7df73d1d66289a841494e882ad8cbfce190e2620 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6256,7 +6256,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 17); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..b7e7b222c9496bc9429df7e97f21d91ea5e57e80 --- /dev/null +++ b/kernel/xsched/Kconfig @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: GPL-2.0 + +config XCU_SCHEDULER + bool "Enable XSched functionality" + default n + select XCU_VSTREAM + select XCU_SCHED_RT + select XCU_SCHED_CFS + select CGROUP_XCU + help + This option enables the XSched scheduler, a custom scheduling mechanism + designed for heterogeneous compute units (e.g., XPUs). It provides: + - Priority-based task scheduling with latency-sensitive optimizations. + - Integration with cgroups (via CGROUP_XCU) for resource isolation. + + Enable this only if your system requires advanced scheduling for XPU workloads. + If unsure, say N. + +config XCU_VSTREAM + bool "Enable vstream SQ/CQ buffer maintenance for XPU" + default n + depends on XCU_SCHEDULER + help + This option enables virtual stream (vstream) support for XPUs, managing + submission queues (SQ) and completion queues (CQ) in kernel space. Key features: + - Zero-copy buffer management between user and kernel space.
+ - Batch processing of XPU commands to reduce MMIO overhead. + + Requires XCU_SCHEDULER to be enabled. May increase kernel memory usage. + Recommended for high-throughput XPU workloads. If unsure, say N. + +config XSCHED_NR_CUS + int "Number of CUs (a.k.a. XCUs) available to XSched mechanism" + default 128 + depends on XCU_SCHEDULER + help + This option defines the maximum number of Compute Units (CUs) that can be + managed by the XSched scheduler, consider changing this value proportionally + to the number of available XCU cores. + +config XCU_SCHED_RT + bool "XCU RT scheduling class" + default y + depends on XCU_SCHEDULER + help + Enable support for the RT scheduling class in the XCU scheduler. + + This option allows XCU to schedule tasks using real-time priorities + (XSCHED_TYPE_RT). When enabled, tasks in RT cgroups can be assigned + deterministic priorities and will be scheduled ahead of CFS tasks. + + Unless you are using RT workloads that rely on strict priority-based + scheduling within XCU, it is recommended to keep the default setting. + +config XCU_SCHED_CFS + bool "XCU CFS scheduling class" + default n + depends on XCU_SCHEDULER + help + Enable support for the CFS scheduling class in the XCU scheduler. + + This option allows the XCU scheduler to manage tasks using a fair-share + scheduling model similar to the Completely Fair Scheduler (CFS). + XCU-CFS provides proportional CPU sharing based on weights and supports + hierarchical control through cgroups. + + Enable this option if you want to run workloads that rely on fair, + weight-based CPU distribution within the XCU scheduling framework. + If your workload does not require proportional sharing or uses only the + RT scheduling class, you may leave this disabled. + +config CGROUP_XCU + bool "XCU bandwidth control and group scheduling for xsched_cfs" + default n + depends on XCU_SCHEDULER + help + This option enables the extended Compute Unit (XCU) resource controller for + CFS task groups, providing hierarchical scheduling and fine-grained bandwidth + allocation capabilities. Key features include: + - Proportional XCU time distribution across cgroups based on shares/quotas + - Nested group scheduling with latency isolation + - Integration with xsched_cfs for fair CPU resource management + + Required for systems requiring fine-grained resource control in cgroups. + If unsure, say N. diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ffa0bb98538c871aedf8f1d44404c7caa4f7ece9 --- /dev/null +++ b/kernel/xsched/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += vstream.o +obj-$(CONFIG_XCU_SCHEDULER) += core.o +obj-$(CONFIG_XCU_SCHED_RT) += rt.o +obj-$(CONFIG_XCU_SCHED_CFS) += cfs.o cfs_quota.o +obj-$(CONFIG_CGROUP_XCU) += cgroup.o diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c new file mode 100644 index 0000000000000000000000000000000000000000..1cbfd5f0e5869d9d6a08b98a0d6494c0a9eb3ebb --- /dev/null +++ b/kernel/xsched/cfs.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Completely Fair Scheduling (CFS) Class for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include + +#define CFS_INNER_RQ_EMPTY(cfs_xse) \ + ((cfs_xse)->xruntime == XSCHED_TIME_INF) + +void xs_rq_add(struct xsched_entity_cfs *xse) +{ + struct xsched_rq_cfs *cfs_rq = xse->cfs_rq; + struct rb_node **link = &cfs_rq->ctx_timeline.rb_root.rb_node; + struct rb_node *parent = NULL; + struct xsched_entity_cfs *entry; + bool leftmost = true; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct xsched_entity_cfs, run_node); + if (xse->xruntime <= entry->xruntime) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(&xse->run_node, parent, link); + rb_insert_color_cached(&xse->run_node, &cfs_rq->ctx_timeline, leftmost); +} + +void xs_rq_remove(struct xsched_entity_cfs *xse) +{ + struct xsched_rq_cfs *cfs_rq = xse->cfs_rq; + + rb_erase_cached(&xse->run_node, &cfs_rq->ctx_timeline); +} + +/** + * xs_cfs_rq_update() - Update entity's runqueue position with new xruntime + */ +static void xs_cfs_rq_update(struct xsched_entity_cfs *xse_cfs, u64 new_xrt) +{ + xs_rq_remove(xse_cfs); + xse_cfs->xruntime = new_xrt; + xs_rq_add(xse_cfs); +} + +static inline struct xsched_entity_cfs * +xs_pick_first(struct xsched_rq_cfs *cfs_rq) +{ + struct xsched_entity_cfs *xse_cfs; + struct rb_node *left = rb_first_cached(&cfs_rq->ctx_timeline); + + if (!left) + return NULL; + + xse_cfs = rb_entry(left, struct xsched_entity_cfs, run_node); + return xse_cfs; +} + +/** + * xs_update() - Account xruntime and runtime metrics. + * @xse_cfs: Point to CFS scheduling entity. + * @delta: Execution time in last period + */ +static void xs_update(struct xsched_entity_cfs *xse_cfs, u64 delta) +{ + struct xsched_group_xcu_priv *xg = xse_parent_grp_xcu(xse_cfs); + + for (; xg; xse_cfs = &xg->xse.cfs, xg = &xcg_parent_grp_xcu(xg)) { + u64 new_xrt = xse_cfs->xruntime + delta * xse_cfs->weight; + + xs_cfs_rq_update(xse_cfs, new_xrt); + xse_cfs->sum_exec_runtime += delta; + + if (xg->self->parent == NULL) + break; + } +} + +/** + * xg_update() - Update container group's xruntime + * @gxcu: Descendant xsched group's private xcu control structure + * + * No locks required to access xsched_group_xcu_priv members, + * because only one worker thread works for one XCU. + */ +static void xg_update(struct xsched_group_xcu_priv *xg, int task_delta) +{ + u64 new_xrt; + struct xsched_entity_cfs *entry; + + for (; xg; xg = &xcg_parent_grp_xcu(xg)) { + xg->cfs_rq->nr_running += task_delta; + entry = xs_pick_first(xg->cfs_rq); + new_xrt = entry ? entry->xruntime * xg->xse.cfs.weight : XSCHED_TIME_INF; + + xg->cfs_rq->min_xruntime = new_xrt; + xg->xse.cfs.xruntime = new_xrt; + + if (!xg->xse.on_rq) + break; + if (!xg->self->parent) + break; + + xs_cfs_rq_update(&xg->xse.cfs, new_xrt); + } +} + +/* + * Xsched Fair class methods + * For rq manipulation we rely on root runqueue lock already acquired in core. + * Access xsched_group_xcu_priv requires no locks because one thread per XCU. + */ +static void dequeue_ctx_fair(struct xsched_entity *xse) +{ + int task_delta; + struct xsched_cu *xcu = xse->xcu; + struct xsched_entity_cfs *first; + struct xsched_entity_cfs *xse_cfs = &xse->cfs; + + task_delta = + (xse->is_group) ? 
-(xse_this_grp_xcu(xse_cfs)->cfs_rq->nr_running) : -1; + + xs_rq_remove(xse_cfs); + xg_update(xse_parent_grp_xcu(xse_cfs), task_delta); + + first = xs_pick_first(&xcu->xrq.cfs); + xcu->xrq.cfs.min_xruntime = (first) ? first->xruntime : XSCHED_TIME_INF; +} + +/** + * enqueue_ctx_fair() - Add context to the runqueue + * @xse: xsched entity of context + * @xcu: executor + * + * In contrast to enqueue_task, it is called once on context init. + * Although groups reside in the tree, their nodes are not counted in nr_running. + * The xruntime of a group xsched entity is represented by the min xruntime inside it. + */ +static void enqueue_ctx_fair(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + int task_delta; + struct xsched_entity_cfs *first; + struct xsched_rq_cfs *rq; + struct xsched_entity_cfs *xse_cfs = &xse->cfs; + + rq = xse_cfs->cfs_rq = xse_parent_grp_xcu(xse_cfs)->cfs_rq; + task_delta = + (xse->is_group) ? xse_this_grp_xcu(xse_cfs)->cfs_rq->nr_running : 1; + + /* If no XSE or only empty groups */ + if (xs_pick_first(rq) == NULL || rq->min_xruntime == XSCHED_TIME_INF) + rq->min_xruntime = xse_cfs->xruntime; + else + xse_cfs->xruntime = max(xse_cfs->xruntime, rq->min_xruntime); + + xs_rq_add(xse_cfs); + xg_update(xse_parent_grp_xcu(xse_cfs), task_delta); + + first = xs_pick_first(&xcu->xrq.cfs); + xcu->xrq.cfs.min_xruntime = (first) ? first->xruntime : XSCHED_TIME_INF; +} + +static struct xsched_entity *pick_next_ctx_fair(struct xsched_cu *xcu) +{ + struct xsched_entity_cfs *xse; + struct xsched_rq_cfs *rq = &xcu->xrq.cfs; + + xse = xs_pick_first(rq); + if (!xse) + return NULL; + + for (; XSCHED_SE_OF(xse)->is_group; xse = xs_pick_first(rq)) { + if (!xse || CFS_INNER_RQ_EMPTY(xse)) + return NULL; + rq = xse_this_grp_xcu(xse)->cfs_rq; + } + + return container_of(xse, struct xsched_entity, cfs); +} + +static inline bool +xs_should_preempt_fair(struct xsched_entity *xse) +{ + return (atomic_read(&xse->submitted_one_kick) >= XSCHED_CFS_KICK_SLICE); +} + +static void put_prev_ctx_fair(struct xsched_entity *xse) +{ + struct xsched_entity_cfs *prev = &xse->cfs; + + xsched_quota_account(xse->parent_grp, (s64)xse->last_exec_runtime); + xs_update(prev, xse->last_exec_runtime); +} + +void rq_init_fair(struct xsched_cu *xcu) +{ + xcu->xrq.cfs.ctx_timeline = RB_ROOT_CACHED; +} + +void xse_init_fair(struct xsched_entity *xse) +{ + xse->cfs.weight = XSCHED_CFS_WEIGHT_DFLT; +} + +void xse_deinit_fair(struct xsched_entity *xse) +{ + /* TODO Cgroup exit */ +} + +struct xsched_class fair_xsched_class = { + .class_id = XSCHED_TYPE_CFS, + .kick_slice = XSCHED_CFS_KICK_SLICE, + .rq_init = rq_init_fair, + .xse_init = xse_init_fair, + .xse_deinit = xse_deinit_fair, + .dequeue_ctx = dequeue_ctx_fair, + .enqueue_ctx = enqueue_ctx_fair, + .pick_next_ctx = pick_next_ctx_fair, + .put_prev_ctx = put_prev_ctx_fair, + .check_preempt = xs_should_preempt_fair, +}; diff --git a/kernel/xsched/cfs_quota.c b/kernel/xsched/cfs_quota.c new file mode 100644 index 0000000000000000000000000000000000000000..2b516ab5592f6063bd347e261e94231a3165ea38 --- /dev/null +++ b/kernel/xsched/cfs_quota.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Bandwidth provisioning for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation.
* + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include + +static struct workqueue_struct *quota_workqueue; + +void xsched_quota_refill(struct work_struct *work) +{ + uint32_t id; + struct xsched_cu *xcu; + struct xsched_group *xg; + + xg = container_of(work, struct xsched_group, refill_work); + + spin_lock(&xg->lock); + xg->runtime = max((xg->runtime - xg->quota), (s64)0); + hrtimer_start(&xg->quota_timeout, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT); + spin_unlock(&xg->lock); + + if (xg->runtime >= xg->quota) { + XSCHED_DEBUG("xcu_cgroup [css=0x%lx] is still throttled @ %s\n", + (uintptr_t)&xg->css, __func__); + return; + } + + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->xcu_lock); + if (!READ_ONCE(xg->perxcu_priv[id].xse.on_rq)) { + enqueue_ctx(&xg->perxcu_priv[id].xse, xcu); + wake_up_interruptible(&xcu->wq_xcu_idle); + } + mutex_unlock(&xcu->xcu_lock); + } +} + +static enum hrtimer_restart quota_timer_cb(struct hrtimer *hrtimer) +{ + struct xsched_group *xg; + + xg = container_of(hrtimer, struct xsched_group, quota_timeout); + queue_work(quota_workqueue, &xg->refill_work); + + return HRTIMER_NORESTART; +} + +void xsched_quota_account(struct xsched_group *xg, s64 exec_time) +{ + spin_lock(&xg->lock); + xg->runtime += exec_time; + spin_unlock(&xg->lock); +} + +bool xsched_quota_exceed(struct xsched_group *xg) +{ + bool ret; + + spin_lock(&xg->lock); + ret = (xg->quota > 0) ? (xg->runtime >= xg->quota) : false; + spin_unlock(&xg->lock); + + return ret; +} + +void xsched_quota_init(void) +{ + quota_workqueue = create_singlethread_workqueue("xsched_quota_workqueue"); +} + +void xsched_quota_timeout_init(struct xsched_group *xg) +{ + hrtimer_init(&xg->quota_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + xg->quota_timeout.function = quota_timer_cb; +} + +void xsched_quota_timeout_update(struct xsched_group *xg) +{ + struct hrtimer *t = &xg->quota_timeout; + + hrtimer_cancel(t); + + if (xg->quota > 0 && xg->period > 0) + hrtimer_start(t, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT); +} diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c new file mode 100644 index 0000000000000000000000000000000000000000..f7eeedc80fc38be0acdb13d7f850ca375316d9eb --- /dev/null +++ b/kernel/xsched/cgroup.c @@ -0,0 +1,775 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Support cgroup for xpu device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include +#include + +static struct xsched_group root_xsched_group; +struct xsched_group *root_xcg = &root_xsched_group; + +/* + * Cacheline aligned slab cache for xsched_group, + * to replace kzalloc with kmem_cache_alloc.
+ */ +static struct kmem_cache *xsched_group_cache __read_mostly; +static struct kmem_cache *xcg_attach_entry_cache __read_mostly; +static LIST_HEAD(xcg_attach_list); + +static const char xcu_sched_name[XSCHED_TYPE_NUM][4] = { + [XSCHED_TYPE_RT] = "rt", + [XSCHED_TYPE_CFS] = "cfs" +}; + +static int xcu_cg_set_file_show(struct xsched_group *xg) +{ + if (!xg) { + XSCHED_ERR("xsched_group is NULL.\n"); + return -EINVAL; + } + + /* Update visibility of related files based on sched_class */ + for (int type_name = XCU_FILE_PERIOD_MS; type_name < NR_XCU_FILE_TYPES; type_name++) { + if (unlikely(!xg->xcu_file[type_name].kn)) { + XSCHED_ERR("Fail to control the file [%d] to be %s @ %s.\n", + type_name, + xg->sched_class == XSCHED_TYPE_CFS ? "visible" : "invisible", + __func__); + return -EBUSY; + } + + cgroup_file_show(&xg->xcu_file[type_name], xg->sched_class == XSCHED_TYPE_CFS); + } + + return 0; +} + +/** + * @brief Initialize the core components of an xsched_group. + * + * This function initializes the essential components of an xsched_group, + * including the spin lock, member list, children groups list, quota timeout + * mechanism, and refill work queue. These components are necessary for the + * proper functioning of the xsched_group. + * + * @param xcg Pointer to the xsched_group to be initialized. + */ +static void xcu_cg_initialize_components(struct xsched_group *xcg) +{ + spin_lock_init(&xcg->lock); + INIT_LIST_HEAD(&xcg->members); + INIT_LIST_HEAD(&xcg->children_groups); + xsched_quota_timeout_init(xcg); + INIT_WORK(&xcg->refill_work, xsched_quota_refill); +} + +void xcu_cg_subsys_init(void) +{ + xcu_cg_initialize_components(root_xcg); + + root_xcg->sched_class = XSCHED_TYPE_DFLT; + root_xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS; + root_xcg->quota = XSCHED_TIME_INF; + root_xcg->runtime = 0; + xsched_quota_init(); + + xsched_group_cache = KMEM_CACHE(xsched_group, 0); + xcg_attach_entry_cache = KMEM_CACHE(xcg_attach_entry, 0); +} + +void xcu_cfs_root_cg_init(struct xsched_cu *xcu) +{ + int id = xcu->id; + + root_xcg->perxcu_priv[id].xcu_id = id; + root_xcg->perxcu_priv[id].self = root_xcg; + root_xcg->perxcu_priv[id].cfs_rq = &xcu->xrq.cfs; + root_xcg->perxcu_priv[id].xse.cfs.weight = XSCHED_CFS_WEIGHT_DFLT; +} + +/** + * xcu_cfs_cg_init() - Initialize xsched_group cfs runqueues and bw control. + * @xcg: new xsched_cgroup + * @parent_xg: parent's group + * + * One xsched_group can host many processes with contexts on different devices. + * Function creates xsched_entity for every XCU, and places it in runqueue + * of parent group. Create new cfs rq for xse inside group. 
+ */ +static int xcu_cfs_cg_init(struct xsched_group *xcg, + struct xsched_group *parent_xg) +{ + int id = 0, err, i; + struct xsched_cu *xcu; + struct xsched_rq_cfs *sub_cfs_rq; + + for_each_active_xcu(xcu, id) { + xcg->perxcu_priv[id].xcu_id = id; + xcg->perxcu_priv[id].self = xcg; + + sub_cfs_rq = kzalloc(sizeof(struct xsched_rq_cfs), GFP_KERNEL); + if (!sub_cfs_rq) { + XSCHED_ERR("Fail to alloc cfs runqueue on xcu %d\n", id); + err = -ENOMEM; + goto alloc_error; + } + xcg->perxcu_priv[id].cfs_rq = sub_cfs_rq; + xcg->perxcu_priv[id].cfs_rq->ctx_timeline = RB_ROOT_CACHED; + + xcg->perxcu_priv[id].xse.is_group = true; + xcg->perxcu_priv[id].xse.xcu = xcu; + xcg->perxcu_priv[id].xse.class = &fair_xsched_class; + + /* Put new empty groups to the right in parent's rbtree: */ + xcg->perxcu_priv[id].xse.cfs.xruntime = XSCHED_TIME_INF; + xcg->perxcu_priv[id].xse.cfs.weight = XSCHED_CFS_WEIGHT_DFLT; + xcg->perxcu_priv[id].xse.parent_grp = parent_xg; + + mutex_lock(&xcu->xcu_lock); + enqueue_ctx(&xcg->perxcu_priv[id].xse, xcu); + mutex_unlock(&xcu->xcu_lock); + } + + xcg->shares_cfg = XSCHED_CFG_SHARE_DFLT; + xcu_grp_shares_update(parent_xg); + xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS; + xcg->quota = XSCHED_TIME_INF; + xcg->runtime = 0; + + return 0; + +alloc_error: + for (i = 0; i < id; i++) { + xcu = xsched_cu_mgr[i]; + mutex_lock(&xcu->xcu_lock); + dequeue_ctx(&xcg->perxcu_priv[i].xse, xcu); + mutex_unlock(&xcu->xcu_lock); + + kfree(xcg->perxcu_priv[i].cfs_rq); + } + + return err; +} + +static void xcu_cfs_cg_deinit(struct xsched_group *xcg) +{ + uint32_t id; + struct xsched_cu *xcu; + + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->xcu_lock); + dequeue_ctx(&xcg->perxcu_priv[id].xse, xcu); + mutex_unlock(&xcu->xcu_lock); + kfree(xcg->perxcu_priv[id].cfs_rq); + } + xcu_grp_shares_update(xcg->parent); +} + +/** + * xcu_cg_init() - Initialize non-root xsched_group structure. + * @xcg: new xsched_cgroup + * @parent_xg: parent's group + */ +static int xcu_cg_init(struct xsched_group *xcg, + struct xsched_group *parent_xg) +{ + xcu_cg_initialize_components(xcg); + xcg->parent = parent_xg; + list_add_tail(&xcg->group_node, &parent_xg->children_groups); + xcg->sched_class = parent_xg->sched_class; + + switch (xcg->sched_class) { + case XSCHED_TYPE_CFS: + return xcu_cfs_cg_init(xcg, parent_xg); + default: + XSCHED_INFO("xcu_cgroup: init RT group css=0x%lx\n", + (uintptr_t)&xcg->css); + break; + } + + return 0; +} + +inline struct xsched_group *xcu_cg_from_css(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct xsched_group, css) : NULL; +} + +/* + * Determine whether the given css corresponds to root_xsched_group.css. + * + * Parameter only_css_self: + * - true : Only check whether the css pointer itself is NULL + * (i.e., the subsystem root). Do not dereference xg->parent. + * Used in the allocation path (css_alloc). + * - false : Further check whether the associated xsched_group + * has no parent (i.e., a normal root check). + */ +static inline bool xsched_group_css_is_root(struct cgroup_subsys_state *css, bool only_css_self) +{ + struct xsched_group *xg; + + /* NULL indicates the subsystem root */ + if (!css) + return true; + + /* + * During the allocation phase, + * cannot find its parent xsched_group via xg->parent, + * so can only determine on the css itself. + */ + if (only_css_self) + return false; + + xg = xcu_cg_from_css(css); + + return xg && !xg->parent; +} + +/** + * xcu_css_alloc() - Allocate and init xcu cgroup. 
+ * @parent_css: css of parent xcu cgroup + * + * Called from kernel/cgroup.c with cgroup_lock() held. + * First called in subsys initialization to create root xcu cgroup, when + * XCUs haven't been initialized yet. Func used on every new cgroup creation, + * on second call to set root xsched_group runqueue. + * + * Return: pointer of new xcu cgroup css on success, -ENOMEM otherwise. + */ +static struct cgroup_subsys_state * +xcu_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct xsched_group *xg; + + if (xsched_group_css_is_root(parent_css, true)) + return &root_xsched_group.css; + + xg = kmem_cache_alloc(xsched_group_cache, GFP_KERNEL | __GFP_ZERO); + if (!xg) + return ERR_PTR(-ENOMEM); + + return &xg->css; +} + +static void xcu_css_free(struct cgroup_subsys_state *css) +{ + struct xsched_group *xcg = xcu_cg_from_css(css); + + kmem_cache_free(xsched_group_cache, xcg); +} + + +static void delay_xcu_cg_set_file_show_workfn(struct work_struct *work) +{ + struct xsched_group *xg; + int retry = 50; + + xg = container_of(work, struct xsched_group, file_show_work); + + for (int i = 0; i < retry; i++) { + if (!xcu_cg_set_file_show(xg)) + return; + + mdelay(10); + } + + XSCHED_ERR("Failed to control the files xcu.{quota, period, shares} visibility after\n" + "%d retries, sched_class=%d, css=0x%lx\n", + retry, xg->sched_class, (uintptr_t)&xg->css); +} + +static int xcu_css_online(struct cgroup_subsys_state *css) +{ + struct xsched_group *xg = xcu_cg_from_css(css); + struct cgroup_subsys_state *parent_css = css->parent; + struct xsched_group *parent_xg; + int err; + + if (!parent_css) + return 0; + + parent_xg = xcu_cg_from_css(parent_css); + err = xcu_cg_init(xg, parent_xg); + if (err) { + kmem_cache_free(xsched_group_cache, xg); + XSCHED_ERR("Failed to initialize new xsched_group @ %s.\n", __func__); + return err; + } + + INIT_WORK(&xg->file_show_work, delay_xcu_cg_set_file_show_workfn); + schedule_work(&xg->file_show_work); + + return 0; +} + +static void xcu_css_offline(struct cgroup_subsys_state *css) +{ + struct xsched_group *xcg; + + xcg = xcu_cg_from_css(css); + if (!xsched_group_css_is_root(css, false)) { + switch (xcg->sched_class) { + case XSCHED_TYPE_CFS: + xcu_cfs_cg_deinit(xcg); + break; + default: + XSCHED_INFO("xcu_cgroup: deinit RT group css=0x%lx\n", + (uintptr_t)&xcg->css); + break; + } + } + hrtimer_cancel(&xcg->quota_timeout); + cancel_work_sync(&xcg->refill_work); + list_del(&xcg->group_node); +} + +static void xsched_group_xse_attach(struct xsched_group *xg, + struct xsched_entity *xse) +{ + spin_lock(&xg->lock); + list_add_tail(&xse->group_node, &xg->members); + spin_unlock(&xg->lock); + xse->parent_grp = xg; +} + +void xsched_group_xse_detach(struct xsched_entity *xse) +{ + struct xsched_group *xcg = xse->parent_grp; + + spin_lock(&xcg->lock); + list_del(&xse->group_node); + spin_unlock(&xcg->lock); +} + +static int xcu_task_can_attach(struct task_struct *task, + struct xsched_group *old) +{ + struct xsched_entity *xse; + bool has_xse = false; + + spin_lock(&old->lock); + list_for_each_entry(xse, &old->members, group_node) { + if (xse->owner_pid == task_pid_nr(task)) { + has_xse = true; + break; + } + } + spin_unlock(&old->lock); + + return has_xse ? 
-EINVAL : 0; +} + +static int xcu_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css, *old_css; + struct xsched_group *old_xcg, *dst_xcg; + struct xcg_attach_entry *entry; + int ret = 0; + + cgroup_taskset_for_each(task, dst_css, tset) { + rcu_read_lock(); + old_css = task_css(task, xcu_cgrp_id); + rcu_read_unlock(); + dst_xcg = xcu_cg_from_css(dst_css); + old_xcg = xcu_cg_from_css(old_css); + + ret = xcu_task_can_attach(task, old_xcg); + if (ret) + break; + + /* record entry for this task */ + entry = kmem_cache_alloc(xcg_attach_entry_cache, GFP_KERNEL | __GFP_ZERO); + entry->task = task; + entry->old_xcg = old_xcg; + entry->new_xcg = dst_xcg; + list_add_tail(&entry->node, &xcg_attach_list); + } + + return ret; +} + +static void xcu_cancel_attach(struct cgroup_taskset *tset) +{ + struct xcg_attach_entry *entry, *tmp; + + /* error: clear all entries */ + list_for_each_entry_safe(entry, tmp, &xcg_attach_list, node) { + list_del(&entry->node); + kmem_cache_free(xcg_attach_entry_cache, entry); + } +} + +void xcu_move_task(struct task_struct *task, struct xsched_group *old_xcg, + struct xsched_group *new_xcg) +{ + struct xsched_entity *xse, *tmp; + struct xsched_cu *xcu; + + spin_lock(&old_xcg->lock); + list_for_each_entry_safe(xse, tmp, &old_xcg->members, group_node) { + if (xse->owner_pid != task_pid_nr(task)) + continue; + + xcu = xse->xcu; + + if (old_xcg != xse->parent_grp) { + WARN_ON(old_xcg != xse->parent_grp); + return; + } + + /* delete from the old_xcg */ + list_del(&xse->group_node); + + mutex_lock(&xcu->xcu_lock); + /* dequeue from the current runqueue */ + dequeue_ctx(xse, xcu); + /* attach to the new_xcg */ + xsched_group_xse_attach(new_xcg, xse); + /* enqueue to the runqueue in new_xcg */ + enqueue_ctx(xse, xcu); + mutex_unlock(&xcu->xcu_lock); + } + spin_unlock(&old_xcg->lock); +} + +static void xcu_attach(struct cgroup_taskset *tset) +{ + struct xcg_attach_entry *entry, *tmp; + + list_for_each_entry(entry, &xcg_attach_list, node) { + xcu_move_task(entry->task, entry->old_xcg, entry->new_xcg); + } + + /* cleanup */ + list_for_each_entry_safe(entry, tmp, &xcg_attach_list, node) { + list_del(&entry->node); + kmem_cache_free(xcg_attach_entry_cache, entry); + } +} + +/** + * xsched_group_inherit() - Attach new entity to task's xsched_group. + * @task: task_struct + * @xse: xsched entity + * + * Called in xsched context initialization to attach xse to task's group + * and inherit its xse scheduling class and bandwidth control policy. + * + * Return: Zero on success. + */ +void xsched_group_inherit(struct task_struct *task, struct xsched_entity *xse) +{ + struct cgroup_subsys_state *css; + struct xsched_group *xg; + + xse->owner_pid = task_pid_nr(task); + css = task_get_css(task, xcu_cgrp_id); + xg = xcu_cg_from_css(css); + xsched_group_xse_attach(xg, xse); + css_put(css); +} + +static int xcu_sched_class_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct xsched_group *xg = xcu_cg_from_css(css); + + seq_printf(sf, "%s\n", xcu_sched_name[xg->sched_class]); + return 0; +} + +/** + * xcu_cg_set_sched_class() - Set scheduling type for group. + * @xg: xsched group + * @type: scheduler type + * + * Scheduler type can be changed if task is child of root group + * and haven't got scheduling entities. 
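+ *
+ * Leaving the CFS class tears the per-XCU entities down via
+ * xcu_cfs_cg_deinit(); switching to CFS re-creates them with
+ * xcu_cfs_cg_init() against the group's parent. A group that still
+ * has members is refused with -EBUSY.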
+ * + * Return: Zero on success or -EINVAL + */ +static int xcu_cg_set_sched_class(struct xsched_group *xg, int type) +{ + if (type == xg->sched_class) + return 0; + + /* can't change scheduler when there are running members */ + if (!list_empty(&xg->members)) + return -EBUSY; + + /* deinit old type if necessary */ + switch (xg->sched_class) { + case XSCHED_TYPE_CFS: + xcu_cfs_cg_deinit(xg); + break; + default: + XSCHED_INFO("xcu_cgroup: the original sched_class is RT, css=0x%lx\n", + (uintptr_t)&xg->css); + break; + } + + /* update type */ + xg->sched_class = type; + xcu_cg_set_file_show(xg); + + /* init new type if necessary */ + switch (type) { + case XSCHED_TYPE_CFS: + return xcu_cfs_cg_init(xg, xg->parent); + default: + XSCHED_INFO("xcu_cgroup: the target sched_class is RT, css=0x%lx\n", + (uintptr_t)&xg->css); + return 0; + } +} + +static ssize_t xcu_sched_class_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct xsched_group *xg = xcu_cg_from_css(css); + char type_name[4]; + int type = -1; + + ssize_t ret = sscanf(buf, "%3s", type_name); + + if (ret < 1) + return -EINVAL; + + for (type = 0; type < XSCHED_TYPE_NUM; type++) { + if (!strcmp(type_name, xcu_sched_name[type])) + break; + } + + if (type == XSCHED_TYPE_NUM) + return -EINVAL; + + if (!list_empty(&css->children)) + return -EBUSY; + + /* only root child can switch scheduler type */ + if (!xg->parent || !xsched_group_css_is_root(&xg->parent->css, false)) + return -EINVAL; + + ret = xcu_cg_set_sched_class(xg, type); + + return (ret) ? ret : nbytes; +} + +static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + s64 ret = 0; + struct xsched_group *xcucg = xcu_cg_from_css(css); + + switch (cft->private) { + case XCU_FILE_PERIOD_MS: + ret = div_s64(xcucg->period, NSEC_PER_MSEC); + break; + case XCU_FILE_QUOTA_MS: + ret = (xcucg->quota > 0) ? 
div_s64(xcucg->quota, NSEC_PER_MSEC) + : xcucg->quota; + break; + case XCU_FILE_SHARES: + ret = xcucg->shares_cfg; + break; + default: + XSCHED_ERR("invalid operation %lu @ %s\n", cft->private, __func__); + break; + } + + return ret; +} + +void xcu_grp_shares_update(struct xsched_group *parent) +{ + int id; + struct xsched_cu *xcu; + struct xsched_group *children; + u64 rem, sh_sum = 0, sh_gcd = 0, w_gcd = 0, sh_prod_red = 1; + + spin_lock(&parent->lock); + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) + sh_gcd = gcd(sh_gcd, children->shares_cfg); + } + + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) { + sh_sum += children->shares_cfg; + children->shares_cfg_red = div64_u64(children->shares_cfg, sh_gcd); + div64_u64_rem(sh_prod_red, children->shares_cfg_red, &rem); + if (rem) + sh_prod_red *= children->shares_cfg_red; + } + } + + parent->children_shares_sum = sh_sum; + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) { + children->weight = div64_u64(sh_prod_red, children->shares_cfg_red); + w_gcd = gcd(w_gcd, children->weight); + } + } + + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) { + children->weight = div64_u64(children->weight, w_gcd); + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->xcu_lock); + children->perxcu_priv[id].xse.cfs.weight = children->weight; + mutex_unlock(&xcu->xcu_lock); + } + } + } + spin_unlock(&parent->lock); +} + +static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + int ret = 0; + struct xsched_group *xcucg = xcu_cg_from_css(css); + s64 quota_ns; + + switch (cft->private) { + case XCU_FILE_PERIOD_MS: + if (val < 1 || val > (S64_MAX / NSEC_PER_MSEC)) { + ret = -EINVAL; + break; + } + xcucg->period = val * NSEC_PER_MSEC; + xsched_quota_timeout_update(xcucg); + break; + case XCU_FILE_QUOTA_MS: + if (val < -1 || val > (S64_MAX / NSEC_PER_MSEC)) { + ret = -EINVAL; + break; + } + /* Runtime should be updated when modifying quota_ms configuration */ + quota_ns = (val > 0) ? 
val * NSEC_PER_MSEC : val; + if (xcucg->quota > 0 && quota_ns > 0) + xcucg->runtime = max((xcucg->runtime - quota_ns), (s64)0); + else + xcucg->runtime = 0; + xcucg->quota = quota_ns; + xsched_quota_timeout_update(xcucg); + break; + case XCU_FILE_SHARES: + if (val <= 0 || val > U64_MAX) { + ret = -EINVAL; + break; + } + xcucg->shares_cfg = val; + xcu_grp_shares_update(xcucg->parent); + break; + default: + XSCHED_ERR("invalid operation %lu @ %s\n", cft->private, __func__); + ret = -EINVAL; + break; + } + + return ret; +} + +static int xcu_stat(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct xsched_group *xcucg = xcu_cg_from_css(css); + u64 nr_throttled = 0; + u64 throttled_time = 0; + u64 exec_runtime = 0; + int xcu_id; + struct xsched_cu *xcu; + + if (xcucg->sched_class == XSCHED_TYPE_RT) { + seq_printf(sf, "RT group stat is not supported @ %s.\n", __func__); + return 0; + } + + for_each_active_xcu(xcu, xcu_id) { + nr_throttled += xcucg->perxcu_priv[xcu_id].nr_throttled; + throttled_time += xcucg->perxcu_priv[xcu_id].throttled_time; + exec_runtime += + xcucg->perxcu_priv[xcu_id].xse.cfs.sum_exec_runtime; + } + + seq_printf(sf, "exec_runtime: %llu\n", exec_runtime); + seq_printf(sf, "shares cfg: %llu/%llu x%u\n", xcucg->shares_cfg, + xcucg->parent->children_shares_sum, xcucg->weight); + seq_printf(sf, "quota: %lld\n", xcucg->quota); + seq_printf(sf, "used: %lld\n", xcucg->runtime); + seq_printf(sf, "period: %lld\n", xcucg->period); + seq_printf(sf, "nr_throttled: %lld\n", nr_throttled); + seq_printf(sf, "throttled_time: %lld\n", throttled_time); + + return 0; +} + +static struct cftype xcu_cg_files[] = { + { + .name = "period_ms", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_PERIOD_MS, + .file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_PERIOD_MS]), + }, + { + .name = "quota_ms", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_QUOTA_MS, + .file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_QUOTA_MS]), + }, + { + .name = "shares", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_SHARES, + .file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_SHARES]), + }, + { + .name = "stat", + .seq_show = xcu_stat, + }, + { + .name = "sched_class", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = xcu_sched_class_show, + .write = xcu_sched_class_write, + }, + {} /* terminate */ +}; + +struct cgroup_subsys xcu_cgrp_subsys = { + .css_alloc = xcu_css_alloc, + .css_online = xcu_css_online, + .css_offline = xcu_css_offline, + .css_free = xcu_css_free, + .can_attach = xcu_can_attach, + .cancel_attach = xcu_cancel_attach, + .attach = xcu_attach, + .dfl_cftypes = xcu_cg_files, + .legacy_cftypes = xcu_cg_files, + .early_init = false, +}; diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c new file mode 100644 index 0000000000000000000000000000000000000000..b920a79239998bdbcf0be4ae4b46a2ea3ca2d280 --- /dev/null +++ b/kernel/xsched/core.c @@ -0,0 +1,526 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Core kernel scheduler code for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * Author: Konstantin Meskhidze + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include +#include + +/* List of scheduling classes available */ +struct list_head xsched_class_list; + +static void put_prev_ctx(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + + lockdep_assert_held(&xcu->xcu_lock); + xse->class->put_prev_ctx(xse); + xse->last_exec_runtime = 0; + atomic_set(&xse->submitted_one_kick, 0); + XSCHED_DEBUG("Put current xse %d @ %s\n", xse->tgid, __func__); +} + +static size_t select_work_def(struct xsched_cu *xcu, struct xsched_entity *xse) +{ + int kick_count, scheduled = 0, not_empty; + struct vstream_info *vs; + struct xcu_op_handler_params params; + struct vstream_metadata *vsm; + size_t kick_slice = xse->class->kick_slice; + + kick_count = atomic_read(&xse->kicks_pending_ctx_cnt); + XSCHED_DEBUG("Before decrement XSE kick_count=%d @ %s\n", + kick_count, __func__); + + if (kick_count == 0) { + XSCHED_WARN("Try to select xse that has 0 kicks @ %s\n", + __func__); + return 0; + } + + do { + not_empty = 0; + for_each_vstream_in_ctx(vs, xse->ctx) { + spin_lock(&vs->stream_lock); + vsm = xsched_vsm_fetch_first(vs); + spin_unlock(&vs->stream_lock); + if (!vsm) + continue; + list_add_tail(&vsm->node, &xcu->vsm_list); + scheduled++; + xsched_dec_pending_kicks_xse(xse); + not_empty++; + } + } while ((scheduled < kick_slice) && (not_empty)); + + /* + * Iterate over all vstreams in context: + * Set wr_cqe bit in last computing task in vsm_list + */ + for_each_vstream_in_ctx(vs, xse->ctx) { + list_for_each_entry_reverse(vsm, &xcu->vsm_list, node) { + if (vsm->parent == vs) { + params.group = vsm->parent->xcu->group; + params.param_1 = &(int){SQE_SET_NOTIFY}; + params.param_2 = &vsm->sqe; + xcu_sqe_op(¶ms); + break; + } + } + } + + kick_count = atomic_read(&xse->kicks_pending_ctx_cnt); + XSCHED_DEBUG("After decrement XSE kick_count=%d @ %s\n", + kick_count, __func__); + + xse->total_scheduled += scheduled; + return scheduled; +} + +static struct xsched_entity *__raw_pick_next_ctx(struct xsched_cu *xcu) +{ + const struct xsched_class *class; + struct xsched_entity *next = NULL; + size_t scheduled; + + lockdep_assert_held(&xcu->xcu_lock); + for_each_xsched_class(class) { + next = class->pick_next_ctx(xcu); + if (next) { + scheduled = class->select_work ? 
+ class->select_work(xcu, next) : select_work_def(xcu, next); + + XSCHED_DEBUG("xse %d scheduled=%zu total=%zu @ %s\n", + next->tgid, scheduled, next->total_scheduled, __func__); + break; + } + } + + return next; +} + +void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + lockdep_assert_held(&xcu->xcu_lock); + + if (xse_integrity_check(xse)) { + XSCHED_ERR("Fail to check xse integrity @ %s\n", __func__); + return; + } + + if (!xse->on_rq) { + xse->on_rq = true; + xse->class->enqueue_ctx(xse, xcu); + XSCHED_DEBUG("Enqueue xse %d @ %s\n", xse->tgid, __func__); + } +} + +void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + lockdep_assert_held(&xcu->xcu_lock); + + if (xse_integrity_check(xse)) { + XSCHED_ERR("Fail to check xse integrity @ %s\n", __func__); + return; + } + + if (xse->on_rq) { + xse->class->dequeue_ctx(xse); + xse->on_rq = false; + XSCHED_DEBUG("Dequeue xse %d @ %s\n", xse->tgid, __func__); + } +} + +int delete_ctx(struct xsched_context *ctx) +{ + struct xsched_cu *xcu = ctx->xse.xcu; + struct xsched_entity *curr_xse = xcu->xrq.curr_xse; + struct xsched_entity *xse = &ctx->xse; + + if (xse_integrity_check(xse)) { + XSCHED_ERR("Fail to check xse integrity @ %s\n", __func__); + return -EINVAL; + } + + if (!xse->xcu) { + XSCHED_ERR("Try to delete ctx that is not attached to xcu @ %s\n", + __func__); + return -EINVAL; + } + + /* Wait till context has been submitted. */ + while (atomic_read(&xse->kicks_pending_ctx_cnt)) { + XSCHED_DEBUG("Deleting ctx %d, xse->kicks_pending_ctx_cnt=%d @ %s\n", + xse->tgid, atomic_read(&xse->kicks_pending_ctx_cnt), + __func__); + usleep_range(100, 200); + } + + mutex_lock(&xcu->xcu_lock); + if (curr_xse == xse) + xcu->xrq.curr_xse = NULL; + dequeue_ctx(xse, xcu); + --xcu->nr_ctx; + mutex_unlock(&xcu->xcu_lock); + XSCHED_DEBUG("Deleting ctx %d, pending kicks left=%d @ %s\n", xse->tgid, + atomic_read(&xse->kicks_pending_ctx_cnt), __func__); + + xse->class->xse_deinit(xse); + +#ifdef CONFIG_CGROUP_XCU + xsched_group_xse_detach(xse); +#endif + + return 0; +} + +int xsched_xse_set_class(struct xsched_entity *xse) +{ + struct xsched_class *sched = xsched_first_class; + +#ifdef CONFIG_CGROUP_XCU + xsched_group_inherit(current, xse); + for_each_xsched_class(sched) { + if (sched->class_id == xse->parent_grp->sched_class) + break; + } +#endif + + xse->class = sched; + return 0; +} + +static void submit_kick(struct vstream_metadata *vsm) +{ + struct vstream_info *vs = vsm->parent; + struct xcu_op_handler_params params; + + params.group = vs->xcu->group; + params.fd = vs->fd; + params.param_1 = &vs->id; + params.param_2 = &vs->channel_id; + params.param_3 = vsm->sqe; + params.param_4 = &vsm->sqe_num; + params.param_5 = &vsm->timeout; + params.param_6 = &vs->sqcq_type; + params.param_7 = vs->drv_ctx; + params.param_8 = &vs->logic_vcq_id; + + /* Send vstream on a device for processing. */ + if (xcu_run(¶ms) != 0) + XSCHED_ERR( + "Fail to send Vstream id %u tasks to a device for processing.\n", + vs->id); + + XSCHED_DEBUG("Vstream id %u submit vsm: sq_tail %u\n", vs->id, vsm->sq_tail); +} + +static void submit_wait(struct vstream_metadata *vsm) +{ + struct vstream_info *vs = vsm->parent; + struct xcu_op_handler_params params; + /* Wait timeout in ms. 
*/ + int32_t timeout = 500; + + params.group = vs->xcu->group; + params.param_1 = &vs->channel_id; + params.param_2 = &vs->logic_vcq_id; + params.param_3 = &vs->user_stream_id; + params.param_4 = &vsm->sqe; + params.param_5 = vsm->cqe; + params.param_6 = vs->drv_ctx; + params.param_7 = &timeout; + + /* Wait for a device to complete processing. */ + if (xcu_wait(¶ms)) { + XSCHED_ERR("Fail to wait Vstream id %u tasks, logic_cq_id %u.\n", + vs->id, vs->logic_vcq_id); + } + + XSCHED_DEBUG("Vstream id %u wait finish, logic_cq_id %u\n", + vs->id, vs->logic_vcq_id); +} + +static int __xsched_submit(struct xsched_cu *xcu, struct xsched_entity *xse) +{ + struct vstream_metadata *vsm, *tmp; + int submitted = 0; + long submit_exec_time = 0; + ktime_t t_start = 0; + struct xcu_op_handler_params params; + + XSCHED_DEBUG("%s called for xse %d on xcu %u\n", + __func__, xse->tgid, xcu->id); + list_for_each_entry_safe(vsm, tmp, &xcu->vsm_list, node) { + submit_kick(vsm); + XSCHED_DEBUG("Xse %d vsm %u sched_delay: %lld ns\n", + xse->tgid, vsm->sq_id, ktime_to_ns(ktime_sub(ktime_get(), vsm->add_time))); + + params.group = vsm->parent->xcu->group; + params.param_1 = &(int){SQE_IS_NOTIFY}; + params.param_2 = &vsm->sqe; + if (xcu_sqe_op(¶ms)) { + mutex_unlock(&xcu->xcu_lock); + t_start = ktime_get(); + submit_wait(vsm); + submit_exec_time += ktime_to_ns(ktime_sub(ktime_get(), t_start)); + mutex_lock(&xcu->xcu_lock); + } + submitted++; + list_del(&vsm->node); + kfree(vsm); + } + + xse->last_exec_runtime += submit_exec_time; + xse->total_submitted += submitted; + atomic_add(submitted, &xse->submitted_one_kick); + INIT_LIST_HEAD(&xcu->vsm_list); + XSCHED_DEBUG("Xse %d submitted=%d total=%zu, exec_time=%ld @ %s\n", + xse->tgid, submitted, xse->total_submitted, + submit_exec_time, __func__); + + return submitted; +} + +static inline bool should_preempt(struct xsched_entity *xse) +{ + return xse->class->check_preempt(xse); +} + +int xsched_vsm_add_tail(struct vstream_info *vs, vstream_args_t *arg) +{ + struct vstream_metadata *new_vsm; + + new_vsm = kmalloc(sizeof(struct vstream_metadata), GFP_KERNEL); + if (!new_vsm) { + XSCHED_ERR("Fail to alloc kick metadata for vs %u @ %s\n", + vs->id, __func__); + return -ENOMEM; + } + + if (vs->kicks_count > MAX_VSTREAM_SIZE) { + kfree(new_vsm); + return -EBUSY; + } + + xsched_init_vsm(new_vsm, vs, arg); + list_add_tail(&new_vsm->node, &vs->metadata_list); + new_vsm->add_time = ktime_get(); + vs->kicks_count += 1; + + return 0; +} + +/* Fetch the first vstream metadata from vstream metadata list + * and removes it from that list. Returned vstream metadata pointer + * to be freed after. 
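+ * The caller owns the returned pointer and is expected to kfree() it once
+ * the kick has been submitted (see __xsched_submit()); vs->kicks_count is
+ * decremented here.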
+ */ +struct vstream_metadata *xsched_vsm_fetch_first(struct vstream_info *vs) +{ + struct vstream_metadata *vsm; + + if (list_empty(&vs->metadata_list)) { + XSCHED_DEBUG("No metadata to fetch from vs %u @ %s\n", + vs->id, __func__); + return NULL; + } + + vsm = list_first_entry(&vs->metadata_list, struct vstream_metadata, node); + if (!vsm) { + XSCHED_ERR("Corrupted metadata list in vs %u @ %s\n", + vs->id, __func__); + return NULL; + } + + list_del(&vsm->node); + if (vs->kicks_count == 0) + XSCHED_WARN("kicks_count underflow in vs %u @ %s\n", + vs->id, __func__); + else + vs->kicks_count -= 1; + + return vsm; +} + +int xsched_schedule(void *input_xcu) +{ + struct xsched_cu *xcu = input_xcu; + struct xsched_entity *curr_xse = NULL; + struct xsched_entity *next_xse = NULL; + + while (!kthread_should_stop()) { + mutex_unlock(&xcu->xcu_lock); + wait_event_interruptible(xcu->wq_xcu_idle, + xcu->xrq.rt.nr_running || xcu->xrq.cfs.nr_running || kthread_should_stop()); + + mutex_lock(&xcu->xcu_lock); + if (kthread_should_stop()) { + mutex_unlock(&xcu->xcu_lock); + break; + } + + if (!xsched_check_pending_kicks_xcu(xcu)) { + XSCHED_WARN("%s: No pending kicks on xcu %u\n", __func__, xcu->id); + continue; + } + + next_xse = __raw_pick_next_ctx(xcu); + if (!next_xse) { + XSCHED_WARN("%s: Couldn't find next xse on xcu %u\n", __func__, xcu->id); + continue; + } + + xcu->xrq.curr_xse = next_xse; + if (__xsched_submit(xcu, next_xse) == 0) + continue; + + curr_xse = xcu->xrq.curr_xse; + if (!curr_xse) + continue; + + /* if not deleted yet */ + put_prev_ctx(curr_xse); + if (!atomic_read(&curr_xse->kicks_pending_ctx_cnt)) + dequeue_ctx(curr_xse, xcu); + +#ifdef CONFIG_CGROUP_XCU + if (xsched_quota_exceed(curr_xse->parent_grp)) + dequeue_ctx(&curr_xse->parent_grp->perxcu_priv[xcu->id].xse, xcu); +#endif + + xcu->xrq.curr_xse = NULL; + } + + return 0; +} + + +/* Initializes all xsched XCU objects. + * Should only be called from xsched_xcu_register function. + */ +int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id) +{ + struct xsched_class *sched; + int err; + + xcu->id = xcu_id; + xcu->state = XSCHED_XCU_NONE; + xcu->group = group; + + xcu->nr_ctx = 0; + xcu->xrq.curr_xse = NULL; + + atomic_set(&xcu->pending_kicks, 0); + INIT_LIST_HEAD(&xcu->vsm_list); + INIT_LIST_HEAD(&xcu->ctx_list); + init_waitqueue_head(&xcu->wq_xcu_idle); + mutex_init(&xcu->ctx_list_lock); + mutex_init(&xcu->vs_array_lock); + mutex_init(&xcu->xcu_lock); + + /* Initialize current XCU's runqueue. */ + for_each_xsched_class(sched) + sched->rq_init(xcu); + + /* This worker should set XCU to XSCHED_XCU_WAIT_IDLE. + * If after initialization XCU still has XSCHED_XCU_NONE + * status then we can assume that there was a problem + * with XCU kthread job. 
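+ * If kthread_run() itself fails, the error is propagated to the caller
+ * and xcu->worker is left NULL.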
+ */ + xcu->worker = kthread_run(xsched_schedule, xcu, "xcu_%u", xcu->id); + + if (IS_ERR(xcu->worker)) { + err = PTR_ERR(xcu->worker); + xcu->worker = NULL; + XSCHED_DEBUG("Fail to run the worker to schedule for xcu[%u].", xcu->id); + return err; + } + return 0; +} + +int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs) +{ + int err = 0; + struct xsched_entity *xse = &ctx->xse; + + atomic_set(&xse->kicks_pending_ctx_cnt, 0); + atomic_set(&xse->submitted_one_kick, 0); + + xse->total_scheduled = 0; + xse->total_submitted = 0; + xse->last_exec_runtime = 0; + + xse->fd = ctx->fd; + xse->tgid = ctx->tgid; + + err = ctx_bind_to_xcu(vs, ctx); + if (err) { + XSCHED_ERR( + "Couldn't find valid xcu for vstream %u dev_id %u @ %s\n", + vs->id, vs->dev_id, __func__); + return -EINVAL; + } + + xse->ctx = ctx; + + if (vs->xcu == NULL) { + WARN_ON(vs->xcu == NULL); + return -EINVAL; + } + + xse->xcu = vs->xcu; + + err = xsched_xse_set_class(xse); + if (err) { + XSCHED_ERR("Fail to set xse class @ %s\n", __func__); + return err; + } + xse->class->xse_init(xse); + + WRITE_ONCE(xse->on_rq, false); + + spin_lock_init(&xse->xse_lock); + return err; +} + +static void xsched_register_sched_class(struct xsched_class *sched) +{ + list_add_tail(&sched->node, &xsched_class_list); +} + +__init int xsched_sched_init(void) +{ + INIT_LIST_HEAD(&xsched_class_list); +#ifdef CONFIG_XCU_SCHED_RT + xsched_register_sched_class(&rt_xsched_class); +#endif + +#ifdef CONFIG_XCU_SCHED_CFS + xsched_register_sched_class(&fair_xsched_class); +#endif + +#ifdef CONFIG_CGROUP_XCU + xcu_cg_subsys_init(); +#endif + + return 0; +} +late_initcall(xsched_sched_init); + diff --git a/kernel/xsched/rt.c b/kernel/xsched/rt.c new file mode 100644 index 0000000000000000000000000000000000000000..41b60e341679b7bafdea84f44d5d0464440a5ce9 --- /dev/null +++ b/kernel/xsched/rt.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Real-Time Scheduling Class for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define XSCHED_RT_TIMESLICE (10 * NSEC_PER_MSEC) + +#define TGID_HASH_BITS 8 + +/* Mapping between tgid and context */ +struct tgid_prio { + pid_t tgid; + int32_t prio; + struct hlist_node hnode; +}; + +static DEFINE_HASHTABLE(tgid_prio_map, TGID_HASH_BITS); +static DEFINE_SPINLOCK(tgid_prio_lock); + +static int tgid_prio_insert(pid_t tgid, int32_t prio) +{ + struct tgid_prio *new_map; + unsigned int hash_key; + + if (prio >= NR_XSE_PRIO) + return -EINVAL; + + new_map = kzalloc(sizeof(struct tgid_prio), GFP_KERNEL); + if (!new_map) { + XSCHED_ERR("Fail to alloc mapping (tgid=%d) @ %s\n", + tgid, __func__); + return -ENOMEM; + } + + new_map->tgid = tgid; + new_map->prio = prio; + + hash_key = hash_32(tgid, TGID_HASH_BITS); + + spin_lock(&tgid_prio_lock); + hash_add_rcu(tgid_prio_map, &new_map->hnode, hash_key); + spin_unlock(&tgid_prio_lock); + + return 0; +} + +static struct tgid_prio *tgid_prio_find(pid_t tgid) +{ + struct tgid_prio *map = NULL; + unsigned int hash_key = hash_32(tgid, TGID_HASH_BITS); + + rcu_read_lock(); + hash_for_each_possible_rcu(tgid_prio_map, map, hnode, hash_key) { + if (map->tgid == tgid) + break; + } + rcu_read_unlock(); + return map; +} + +static void tgid_prio_delete(pid_t tgid) +{ + struct tgid_prio *map; + unsigned int hash_key = hash_32(tgid, TGID_HASH_BITS); + + spin_lock(&tgid_prio_lock); + hash_for_each_possible(tgid_prio_map, map, hnode, hash_key) { + if (map->tgid == tgid) { + hash_del_rcu(&map->hnode); + spin_unlock(&tgid_prio_lock); + kfree(map); + return; + } + } + spin_unlock(&tgid_prio_lock); +} + +static inline void +xse_rt_add(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + list_add_tail(&xse->rt.list_node, &xcu->xrq.rt.rq[xse->rt.prio]); +} + +static inline void xse_rt_del(struct xsched_entity *xse) +{ + list_del_init(&xse->rt.list_node); +} + +static inline void xse_rt_move_tail(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + + list_move_tail(&xse->rt.list_node, &xcu->xrq.rt.rq[xse->rt.prio]); +} + +/* Increase RT runqueue total and per prio nr_running stat. */ +static inline void xrq_inc_nr_running(struct xsched_entity *xse, + struct xsched_cu *xcu) +{ + xcu->xrq.rt.nr_running++; +} + +/* Decrease RT runqueue total and per prio nr_running stat + * and raise a bug if nr_running decrease beyond zero. + */ +static inline void xrq_dec_nr_running(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + + xcu->xrq.rt.nr_running--; +} + +static void dequeue_ctx_rt(struct xsched_entity *xse) +{ + xse_rt_del(xse); + xrq_dec_nr_running(xse); +} + +static void enqueue_ctx_rt(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + xse_rt_add(xse, xcu); + xrq_inc_nr_running(xse, xcu); +} + +static inline struct xsched_entity *xrq_next_xse(struct xsched_cu *xcu, + int prio) +{ + return list_first_entry(&xcu->xrq.rt.rq[prio], struct xsched_entity, + rt.list_node); +} + +/* Return the next priority for pick_next_ctx taking into + * account if there are pending kicks on certain priority. 
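+ * Returns NR_XSE_PRIO when every per-priority list is empty.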
+ */ +static inline uint32_t get_next_prio_rt(struct xsched_rq *xrq) +{ + unsigned int curr_prio; + + for_each_xse_prio(curr_prio) { + if (!list_empty(&xrq->rt.rq[curr_prio])) + return curr_prio; + } + return NR_XSE_PRIO; +} + +static struct xsched_entity *pick_next_ctx_rt(struct xsched_cu *xcu) +{ + struct xsched_entity *result; + int next_prio; + + next_prio = get_next_prio_rt(&xcu->xrq); + if (next_prio >= NR_XSE_PRIO) { + XSCHED_DEBUG("No pending kicks in RT class @ %s\n", __func__); + return NULL; + } + + result = xrq_next_xse(xcu, next_prio); + if (!result) + XSCHED_ERR("Next XSE not found @ %s\n", __func__); + else + XSCHED_DEBUG("Next XSE %u at prio %u @ %s\n", result->tgid, next_prio, __func__); + + return result; +} + +static void put_prev_ctx_rt(struct xsched_entity *xse) +{ + xse->rt.timeslice -= xse->last_exec_runtime; + XSCHED_DEBUG( + "Update XSE=%d timeslice=%lld, XSE submitted=%lld in RT class @ %s\n", + xse->tgid, xse->rt.timeslice, + xse->last_exec_runtime, __func__); + + if (xse->rt.timeslice <= 0) { + xse->rt.timeslice = XSCHED_RT_TIMESLICE; + XSCHED_DEBUG("Refill XSE=%d kick_slice=%lld in RT class @ %s\n", + xse->tgid, xse->rt.timeslice, __func__); + xse_rt_move_tail(xse); + } +} + +static bool check_preempt_ctx_rt(struct xsched_entity *xse) +{ + return true; +} + +void rq_init_rt(struct xsched_cu *xcu) +{ + int prio = 0; + + xcu->xrq.rt.nr_running = 0; + + for_each_xse_prio(prio) { + INIT_LIST_HEAD(&xcu->xrq.rt.rq[prio]); + } +} + +void xse_init_rt(struct xsched_entity *xse) +{ + struct tgid_prio *map = tgid_prio_find(xse->tgid); + + xse->rt.prio = (map) ? map->prio : XSE_PRIO_DFLT; + XSCHED_DEBUG("Xse init: set priority=%d.\n", xse->rt.prio); + xse->rt.timeslice = XSCHED_RT_TIMESLICE; + INIT_LIST_HEAD(&xse->rt.list_node); +} + +void xse_deinit_rt(struct xsched_entity *xse) +{ + struct tgid_prio *map = tgid_prio_find(xse->tgid); + + if (map) { + tgid_prio_delete(xse->tgid); + XSCHED_DEBUG("Map deleted: tgid=%d\n", xse->tgid); + } +} + +struct xsched_class rt_xsched_class = { + .class_id = XSCHED_TYPE_RT, + .kick_slice = XSCHED_RT_KICK_SLICE, + .rq_init = rq_init_rt, + .xse_init = xse_init_rt, + .xse_deinit = xse_deinit_rt, + .dequeue_ctx = dequeue_ctx_rt, + .enqueue_ctx = enqueue_ctx_rt, + .pick_next_ctx = pick_next_ctx_rt, + .put_prev_ctx = put_prev_ctx_rt, + .check_preempt = check_preempt_ctx_rt +}; + +int xsched_rt_prio_set(pid_t tgid, unsigned int prio) +{ + unsigned int id; + struct xsched_cu *xcu; + struct xsched_context *ctx; + struct xsched_entity *xse; + + tgid_prio_delete(tgid); + tgid_prio_insert(tgid, prio); + + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->ctx_list_lock); + mutex_lock(&xcu->xcu_lock); + + ctx = ctx_find_by_tgid_and_xcu(tgid, xcu); + if (ctx) { + xse = &ctx->xse; + xse->rt.prio = clamp_t(unsigned int, prio, XSE_PRIO_HIGH, XSE_PRIO_LOW); + if (xse->on_rq) { + xse_rt_del(xse); + xse_rt_add(xse, xcu); + } + } + + mutex_unlock(&xcu->xcu_lock); + mutex_unlock(&xcu->ctx_list_lock); + } + + return 0; +} + diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c new file mode 100644 index 0000000000000000000000000000000000000000..b3da8a3444c993a32028d6dbe35d1888da74167d --- /dev/null +++ b/kernel/xsched/vstream.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Vstream manage for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as 
published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XCU_VSTREAM +#define XCU_HASH_ORDER 6 + +static DEFINE_MUTEX(revmap_mutex); +static DEFINE_HASHTABLE(ctx_revmap, XCU_HASH_ORDER); + +/** + * @group: value for this entry. + * @hash_node: hash node list. + * @dev_id: device id to bind with ctx. + */ +struct ctx_devid_revmap_data { + unsigned int dev_id; + struct xcu_group *group; + struct hlist_node hash_node; +}; + +static int vstream_del(vstream_info_t *vstream, uint32_t vstream_id); +static int vstream_file_release(struct inode *inode, struct file *file); +static const struct file_operations vstreamfd_fops = { + .release = vstream_file_release, +}; + +static inline struct file *vstream_file_get(int vs_fd) +{ + return fget(vs_fd); +} + +static inline void vstream_file_put(struct file *vstream_file) +{ + fput(vstream_file); +} + +static int vstream_file_create(struct vstream_info *vs) +{ + return anon_inode_getfd("[vstreamfd]", &vstreamfd_fops, vs, + O_RDWR | O_CLOEXEC | O_NONBLOCK); +} + +/* Frees a given vstream and also frees and dequeues it's context + * if a given vstream is the last and only vstream attached to it's + * corresponding context object. + */ +static void xsched_task_free(struct kref *kref) +{ + struct xsched_context *ctx; + vstream_info_t *vs, *tmp; + struct xsched_cu *xcu; + + ctx = container_of(kref, struct xsched_context, kref); + xcu = ctx->xse.xcu; + + /* Wait utill xse dequeues */ + while (READ_ONCE(ctx->xse.on_rq)) + usleep_range(100, 200); + + mutex_lock(&xcu->ctx_list_lock); + list_for_each_entry_safe(vs, tmp, &ctx->vstream_list, ctx_node) { + list_del(&vs->ctx_node); + kfree(vs); + } + + delete_ctx(ctx); + list_del(&ctx->ctx_node); + mutex_unlock(&xcu->ctx_list_lock); + + kfree(ctx); +} + +struct xsched_cu *xcu_find(uint32_t type, + uint32_t dev_id, uint32_t channel_id) +{ + struct xcu_group *group = NULL; + + /* Find xcu by type. */ + group = xcu_group_find(xcu_group_root, type); + if (group == NULL) { + XSCHED_ERR("Fail to find type group.\n"); + return NULL; + } + + /* Find device id group. */ + group = xcu_group_find(group, dev_id); + if (group == NULL) { + XSCHED_ERR("Fail to find device group.\n"); + return NULL; + } + /* Find channel id group. 
*/ + group = xcu_group_find(group, channel_id); + if (group == NULL) { + XSCHED_ERR("Fail to find channel group.\n"); + return NULL; + } + + XSCHED_DEBUG("XCU found: type=%u, dev_id=%u, chan_id=%u.\n", + type, dev_id, channel_id); + + return group->xcu; +} + +static int vstream_destroy(vstream_info_t *vstream) +{ + int err; + struct xsched_context *ctx = NULL; + + err = vstream_del(vstream, vstream->id); + if (err) + return err; + + ctx = vstream->ctx; + kref_put(&ctx->kref, xsched_task_free); + + return 0; +} + +static int vstream_file_release(struct inode *inode, struct file *file) +{ + vstream_info_t *vstream; + (void) inode; + + if (!file->private_data) + return 0; + + vstream = file->private_data; + return vstream_destroy(vstream); +} + +static void init_xsched_ctx(struct xsched_context *ctx, + const struct vstream_info *vs) +{ + ctx->tgid = vs->tgid; + ctx->fd = vs->fd; + ctx->dev_id = vs->dev_id; + kref_init(&ctx->kref); + + INIT_LIST_HEAD(&ctx->vstream_list); + INIT_LIST_HEAD(&ctx->ctx_node); + + spin_lock_init(&ctx->ctx_lock); + mutex_init(&ctx->ctx_mutex); +} + +int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx) +{ + struct ctx_devid_revmap_data *revmap_data; + struct xsched_cu *xcu_found = NULL; + uint32_t type = XCU_TYPE_XPU; + + /* Find XCU history. */ + hash_for_each_possible(ctx_revmap, revmap_data, hash_node, + (unsigned long)ctx->dev_id) { + if (revmap_data && revmap_data->group) { + /* Bind ctx to group xcu.*/ + ctx->xse.xcu = revmap_data->group->xcu; + return 0; + } + } + + revmap_data = kzalloc(sizeof(struct ctx_devid_revmap_data), GFP_KERNEL); + if (revmap_data == NULL) { + XSCHED_ERR("Revmap_data is NULL @ %s\n", __func__); + return -ENOMEM; + } + + xcu_found = xcu_find(type, ctx->dev_id, vstream_info->channel_id); + if (!xcu_found) { + kfree(revmap_data); + return -EINVAL; + } + + /* Bind ctx to an XCU from channel group. */ + revmap_data->group = xcu_found->group; + ctx->xse.xcu = xcu_found; + vstream_info->xcu = xcu_found; + revmap_data->dev_id = vstream_info->dev_id; + XSCHED_DEBUG("Ctx bind to xcu %u @ %s\n", xcu_found->id, __func__); + + hash_add(ctx_revmap, &revmap_data->hash_node, + (unsigned long)ctx->dev_id); + + return 0; +} + +/* Allocates a new xsched_context if a new vstream_info is bound + * to a device that no other vstream that is currently present + * is bound to. + */ +static int alloc_ctx_from_vstream(struct vstream_info *vstream_info, + struct xsched_context **ctx) +{ + struct xsched_cu *xcu = vstream_info->xcu; + int ret; + + *ctx = ctx_find_by_tgid_and_xcu(vstream_info->tgid, xcu); + if (*ctx) + return 0; + + *ctx = kzalloc(sizeof(struct xsched_context), GFP_KERNEL); + if (!*ctx) { + XSCHED_ERR("Fail to alloc xsched context (tgid=%d) @ %s\n", + vstream_info->tgid, __func__); + return -ENOMEM; + } + + init_xsched_ctx(*ctx, vstream_info); + + ret = xsched_init_entity(*ctx, vstream_info); + if (ret) { + XSCHED_ERR("Fail to initialize XSE for context @ %s\n", + __func__); + kfree(*ctx); + return -EINVAL; + } + + list_add(&(*ctx)->ctx_node, &xcu->ctx_list); + ++xcu->nr_ctx; + + return 0; +} + +/* Bounds a new vstream_info object to a corresponding xsched context. 
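+ * If a context for the caller's tgid already exists on the bound XCU its
+ * refcount is raised, otherwise a new context is allocated through
+ * alloc_ctx_from_vstream().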
*/ +static int vstream_bind_to_ctx(struct vstream_info *vs) +{ + struct xsched_context *ctx = NULL; + struct xsched_cu *xcu = vs->xcu; + int err = 0; + + mutex_lock(&xcu->ctx_list_lock); + ctx = ctx_find_by_tgid_and_xcu(vs->tgid, xcu); + if (ctx) { + XSCHED_DEBUG("Ctx %d found @ %s\n", vs->tgid, __func__); + kref_get(&ctx->kref); + } else { + err = alloc_ctx_from_vstream(vs, &ctx); + if (err) + goto out_err; + } + + vs->ctx = ctx; + list_add(&vs->ctx_node, &vs->ctx->vstream_list); + +out_err: + mutex_unlock(&xcu->ctx_list_lock); + return err; +} + +static vstream_info_t *vstream_create(struct vstream_args *arg) +{ + struct vstream_info *vstream = NULL; + + vstream = kzalloc(sizeof(vstream_info_t), GFP_KERNEL); + if (!vstream) { + XSCHED_ERR("Failed to allocate vstream.\n"); + return NULL; + } + + vstream->dev_id = arg->dev_id; + vstream->channel_id = arg->channel_id; + vstream->kicks_count = 0; + vstream->xcu = NULL; + + INIT_LIST_HEAD(&vstream->ctx_node); + INIT_LIST_HEAD(&vstream->xcu_node); + INIT_LIST_HEAD(&vstream->metadata_list); + + spin_lock_init(&vstream->stream_lock); + + return vstream; +} + +static int vstream_add(vstream_info_t *vstream, uint32_t id) +{ + int err = 0; + struct xsched_cu *xcu = vstream->xcu; + + if (id >= MAX_VSTREAM_NUM) { + XSCHED_ERR("Vstream id=%u out of range @ %s.\n", + id, __func__); + return -EINVAL; + } + + mutex_lock(&xcu->vs_array_lock); + if (xcu->vs_array[id] != NULL) { + XSCHED_ERR("Vstream id=%u cell is busy.\n", id); + err = -EINVAL; + goto out_err; + } + xcu->vs_array[id] = vstream; + +out_err: + mutex_unlock(&xcu->vs_array_lock); + return err; +} + +static int vstream_del(vstream_info_t *vstream, uint32_t vstream_id) +{ + struct xsched_cu *xcu = vstream->xcu; + + if (vstream_id >= MAX_VSTREAM_NUM) { + XSCHED_ERR("Vstream id=%u out of range @ %s.\n", + vstream_id, __func__); + return -EINVAL; + } + + mutex_lock(&xcu->vs_array_lock); + xcu->vs_array[vstream_id] = NULL; + mutex_unlock(&xcu->vs_array_lock); + return 0; +} + +static vstream_info_t *vstream_get(struct xsched_cu *xcu, uint32_t vstream_id) +{ + vstream_info_t *vstream = NULL; + + if (vstream_id >= MAX_VSTREAM_NUM) { + XSCHED_ERR("Vstream id=%u out of range @ %s.\n", + vstream_id, __func__); + return NULL; + } + + mutex_lock(&xcu->vs_array_lock); + vstream = xcu->vs_array[vstream_id]; + mutex_unlock(&xcu->vs_array_lock); + + return vstream; +} + +static vstream_info_t * +vstream_get_by_user_stream_id(struct xsched_cu *xcu, uint32_t user_stream_id) +{ + int id; + static vstream_info_t *ret; + + mutex_lock(&xcu->vs_array_lock); + for (id = 0; id < MAX_VSTREAM_NUM; id++) { + if (xcu->vs_array[id] != NULL && + xcu->vs_array[id]->user_stream_id == user_stream_id) { + ret = xcu->vs_array[id]; + break; + } + } + mutex_unlock(&xcu->vs_array_lock); + return ret; +} + +static int vstream_bind_to_xcu(vstream_info_t *vstream_info) +{ + struct xsched_cu *xcu_found = NULL; + uint32_t type = XCU_TYPE_XPU; + + xcu_found = xcu_find(type, vstream_info->dev_id, vstream_info->channel_id); + if (!xcu_found) + return -EINVAL; + + /* Bind vstream to a xcu. 
*/ + vstream_info->xcu = xcu_found; + vstream_info->dev_id = xcu_found->id; + XSCHED_DEBUG("XCU bound to a vstream: type=%u, dev_id=%u, chan_id=%u.\n", + type, vstream_info->dev_id, vstream_info->channel_id); + + return 0; +} + +static int sqcq_alloc(struct vstream_args *arg) +{ + vstream_alloc_args_t *va_args = &arg->va_args; + struct xsched_context *ctx = NULL; + struct xcu_op_handler_params params; + struct file *vs_file; + uint32_t logic_cq_id = 0; + vstream_info_t *vstream; + int ret = 0; + uint32_t tgid = 0; + uint32_t cq_id = 0; + uint32_t sq_id = 0; + + vstream = vstream_create(arg); + if (!vstream) + return -ENOSPC; + + vstream->fd = arg->fd; + vstream->task_type = arg->task_type; + + ret = vstream_bind_to_xcu(vstream); + if (ret < 0) { + ret = -EINVAL; + goto out_err_vstream_free; + } + + /* Allocates vstream's SQ and CQ memory on a XCU for processing. */ + params.group = vstream->xcu->group; + params.fd = arg->fd; + params.payload = arg->payload; + params.param_1 = &tgid; + params.param_2 = &sq_id; + params.param_3 = &cq_id; + params.param_4 = &logic_cq_id; + ret = xcu_alloc(¶ms); + if (ret) { + XSCHED_ERR("Fail to allocate SQ/CQ memory to a vstream.\n"); + goto out_err_vstream_free; + } + + vstream->drv_ctx = params.param_5; + vstream->id = sq_id; + vstream->vcq_id = cq_id; + vstream->logic_vcq_id = logic_cq_id; + vstream->user_stream_id = va_args->user_stream_id; + vstream->tgid = tgid; + vstream->sqcq_type = va_args->type; + ret = vstream_bind_to_ctx(vstream); + if (ret) + goto out_err_xcu_finish; + + ctx = vstream->ctx; + ret = vstream_file_create(vstream); + if (ret < 0) { + XSCHED_ERR("Fail to alloc anon inode for vstream %u @ %s\n", + vstream->id, __func__); + goto out_err_ctx_free; + } + vstream->inode_fd = ret; + + /* Add new vstream to array after allocating inode */ + ret = vstream_add(vstream, vstream->id); + if (ret) + goto out_err_vstream_file_put; + + arg->sq_id = sq_id; + arg->cq_id = cq_id; + + return 0; + +out_err_vstream_file_put: + vs_file = vstream_file_get(vstream->inode_fd); + if (vs_file) { + vs_file->private_data = NULL; + vstream_file_put(vs_file); + } +out_err_ctx_free: + if (ctx) { + /* In the current code context, + * vstream should not be released inside xsched_task_free. + * Otherwise, vstream may become a wild pointer. + * If it is still being used by other objects, + * it may cause a UAF issue when it is released again in + * out_err_vstream_free. 
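+ * Hence the vstream is only unlinked from ctx->vstream_list here and the
+ * context reference dropped; the vstream itself is freed once, in
+ * out_err_vstream_free below.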
+ */ + mutex_lock(&vstream->xcu->ctx_list_lock); + list_del(&vstream->ctx_node); + mutex_unlock(&vstream->xcu->ctx_list_lock); + kref_put(&ctx->kref, xsched_task_free); + } +out_err_xcu_finish: + if (xcu_finish(¶ms)) + XSCHED_ERR("Fail to free vstream sqId=%u, cqId=%u.\n", sq_id, cq_id); +out_err_vstream_free: + kfree(vstream); + return ret; +} + +static int logic_cq_alloc(struct vstream_args *arg) +{ + int err = 0; + struct xcu_op_handler_params params; + vstream_info_t *vstream = NULL; + vstream_alloc_args_t *logic_cq_alloc_para = &arg->va_args; + struct xsched_cu *xcu_found = NULL; + uint32_t logic_cq_id = 0; + uint32_t type = XCU_TYPE_XPU; + + xcu_found = xcu_find(type, arg->dev_id, arg->channel_id); + if (!xcu_found) + return -EINVAL; + + vstream = vstream_get_by_user_stream_id(xcu_found, + logic_cq_alloc_para->user_stream_id); + if (vstream) + xcu_found = vstream->xcu; + params.group = xcu_found->group; + params.fd = arg->fd; + params.payload = arg->payload; + params.param_1 = &logic_cq_id; + err = xcu_logic_alloc(¶ms); + if (err) { + XSCHED_ERR("Fail to alloc logic CQ memory to a vstream.\n"); + return err; + } + if (vstream) + vstream->logic_vcq_id = logic_cq_id; + + return 0; +} + +int vstream_alloc(struct vstream_args *arg) +{ + vstream_alloc_args_t *va_args = &arg->va_args; + int ret; + + if (!va_args->type) + ret = sqcq_alloc(arg); + else + ret = logic_cq_alloc(arg); + + return ret; +} + +int vstream_free(struct vstream_args *arg) +{ + struct file *vs_file; + struct xcu_op_handler_params params; + struct xsched_cu *xcu_found; + uint32_t vstream_id = arg->sq_id; + uint32_t type = XCU_TYPE_XPU; + vstream_info_t *vstream = NULL; + int err = 0; + + xcu_found = xcu_find(type, arg->dev_id, arg->channel_id); + if (!xcu_found) + return -EINVAL; + + vstream = vstream_get(xcu_found, vstream_id); + if (!vstream) { + XSCHED_ERR("Fail to free NULL vstream, vstream id=%u\n", vstream_id); + return -EINVAL; + } + + params.group = vstream->xcu->group; + params.fd = arg->fd; + params.payload = arg->payload; + + vs_file = vstream_file_get(vstream->inode_fd); + if (vs_file) { + vs_file->private_data = NULL; + vstream_file_put(vs_file); + } + + /* After vstream_get(), destroying the vstream may not fail */ + vstream_destroy(vstream); + err = xcu_finish(¶ms); + if (err) + XSCHED_ERR("Fail to free vstream sqId=%u, cqId=%u.\n", + arg->sq_id, arg->cq_id); + + return err; +} + +int vstream_kick(struct vstream_args *arg) +{ + vstream_info_t *vstream; + struct xsched_cu *xcu = NULL; + struct xsched_entity *xse; + int err = 0; + uint32_t vstream_id = arg->sq_id; + uint32_t type = XCU_TYPE_XPU; + + xcu = xcu_find(type, arg->dev_id, arg->channel_id); + if (!xcu) + return -EINVAL; + + /* Get vstream. */ + vstream = vstream_get(xcu, vstream_id); + if (!vstream || !vstream->ctx) { + XSCHED_ERR("Vstream NULL or doesn't have a context. vstream_id=%u, dev_id=%u\n", + vstream_id, arg->dev_id); + return -EINVAL; + } + + xse = &vstream->ctx->xse; + XSCHED_DEBUG("New kick on xse %d @ %s\n", xse->tgid, __func__); + + do { + mutex_lock(&xcu->xcu_lock); + spin_lock(&vstream->stream_lock); + + /* Adding kick metadata. 
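+ * xsched_vsm_add_tail() returns -EBUSY once the vstream already holds
+ * MAX_VSTREAM_SIZE unsubmitted kicks; in that case the locks are dropped
+ * and the kick retried below.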
*/ + err = xsched_vsm_add_tail(vstream, arg); + if (err == -EBUSY) { + spin_unlock(&vstream->stream_lock); + mutex_unlock(&xcu->xcu_lock); + + /* Retry after a while */ + usleep_range(100, 200); + continue; + } + + /* Don't forget to unlock */ + if (err) { + XSCHED_ERR("Fail to add kick metadata to vs %u @ %s\n", + vstream->id, __func__); + break; + } + + enqueue_ctx(xse, xcu); + + /* Increasing a total amount of kicks on an CU to which this + * context is attached to based on sched_class. + */ + xsched_inc_pending_kicks_xse(&vstream->ctx->xse); + } while (err == -EBUSY); + + spin_unlock(&vstream->stream_lock); + mutex_unlock(&xcu->xcu_lock); + if (!err) + wake_up_interruptible(&xcu->wq_xcu_idle); + + return err; +} + +/* + * vstream_manage_cmd table + */ +static vstream_manage_t(*vstream_command_table[MAX_COMMAND + 1]) = { + vstream_alloc, // VSTREAM_ALLOC + vstream_free, // VSTREAM_FREE + vstream_kick, // VSTREAM_KICK + NULL // MAX_COMMAND +}; + +SYSCALL_DEFINE2(vstream_manage, struct vstream_args __user *, arg, int, cmd) +{ + int res = 0; + struct vstream_args vstream_arg; + + if (cmd < 0 || cmd >= MAX_COMMAND) { + XSCHED_ERR("Invalid cmd value: %d, valid range is 0 to %d\n", cmd, MAX_COMMAND - 1); + return -EINVAL; + } + + if (copy_from_user(&vstream_arg, arg, sizeof(struct vstream_args))) { + XSCHED_ERR("copy_from_user failed\n"); + return -EFAULT; + } + + res = vstream_command_table[cmd](&vstream_arg); + if (copy_to_user(arg, &vstream_arg, sizeof(struct vstream_args))) { + XSCHED_ERR("copy_to_user failed\n"); + return -EFAULT; + } + + XSCHED_DEBUG("vstream_manage: cmd %d\n", cmd); + return res; +} +#else +SYSCALL_DEFINE2(vstream_manage, struct vstream_args __user *, arg, int, cmd) +{ + return 0; +} +#endif \ No newline at end of file
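A minimal userspace sketch of driving the new vstream_manage() syscall, for illustration only. The uapi header path (<linux/vstream.h> here), the availability of __NR_vstream_manage, and the exact VSTREAM_* command values are assumptions; the struct vstream_args field names follow the kernel-side usage in this patch.

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <linux/vstream.h>	/* assumed uapi: struct vstream_args, VSTREAM_ALLOC/KICK/FREE */

/* Thin wrapper; __NR_vstream_manage is assumed to be exported via <sys/syscall.h>. */
static long vstream_manage(struct vstream_args *args, int cmd)
{
	return syscall(__NR_vstream_manage, args, cmd);
}

int main(void)
{
	struct vstream_args args;

	memset(&args, 0, sizeof(args));
	args.dev_id = 0;	/* first XPU device */
	args.channel_id = 0;	/* first channel on that device */

	/* VSTREAM_ALLOC: the kernel fills sq_id/cq_id on success. */
	if (vstream_manage(&args, VSTREAM_ALLOC) < 0) {
		perror("VSTREAM_ALLOC");
		return 1;
	}
	printf("allocated sq_id=%u cq_id=%u\n", args.sq_id, args.cq_id);

	/* VSTREAM_KICK: submit whatever payload the device driver expects. */
	if (vstream_manage(&args, VSTREAM_KICK) < 0)
		perror("VSTREAM_KICK");

	/* VSTREAM_FREE: tear the SQ/CQ pair down again. */
	if (vstream_manage(&args, VSTREAM_FREE) < 0)
		perror("VSTREAM_FREE");

	return 0;
}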