From e73f67ea9a7512361acb47f677372b26ca6ea335 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan
Date: Wed, 13 Sep 2023 13:11:51 +0800
Subject: [PATCH] ucc: add ucc support

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I80YXE
CVE: NA

----------------------------------------

Add UCC (Universal Converged Computing) support for XPU scheduling.

Signed-off-by: Chen Hui
Signed-off-by: Yang Yanchao
Signed-off-by: Hui Tang
Signed-off-by: Guan Jing
Signed-off-by: Jinjie Ruan
---
 Kconfig                             |   2 +
 drivers/Kconfig                     |   2 +
 drivers/Makefile                    |   1 +
 drivers/xpu/Kconfig                 |   9 +
 drivers/xpu/Makefile                |   1 +
 drivers/xpu/xpu_group.c             | 175 ++++++++
 fs/proc/base.c                      | 102 ++++-
 include/linux/sched.h               |   3 +
 include/linux/ucc_common.h          |  21 +
 include/linux/ucc_kfd.h             | 110 +++++
 include/linux/ucc_sched.h           |  36 ++
 include/linux/ucc_sched/ucc_sched.h |  71 +++
 include/linux/ucc_ts.h              | 254 +++++++++++
 include/linux/vstream.h             | 123 ++++++
 include/linux/xpu_group.h           |  66 +++
 include/trace/events/ucc_sched.h    | 120 +++++
 init/init_task.c                    |   4 +
 init/main.c                         |   9 +
 kernel/Makefile                     |   2 +
 kernel/sched/Makefile               |   1 +
 kernel/sched/core.c                 |   5 +
 kernel/sched/ucc_sched.c            | 148 +++++++
 kernel/sysctl.c                     |  17 +-
 kernel/ucc/Kconfig                  |  21 +
 kernel/ucc/Makefile                 |   1 +
 kernel/ucc/ascend_vstream.c         | 654 ++++++++++++++++++++++++++++
 kernel/ucc/ascend_vstream.h         |  13 +
 kernel/ucc/vstream.c                |  62 +++
 kernel/ucc_sched/Makefile           |   1 +
 kernel/ucc_sched/core.c             | 591 +++++++++++++++++++++++++
 kernel/ucc_sched/ucc_sched.h        |  43 ++
 31 files changed, 2666 insertions(+), 2 deletions(-)
 create mode 100644 drivers/xpu/Kconfig
 create mode 100644 drivers/xpu/Makefile
 create mode 100644 drivers/xpu/xpu_group.c
 create mode 100644 include/linux/ucc_common.h
 create mode 100644 include/linux/ucc_kfd.h
 create mode 100644 include/linux/ucc_sched.h
 create mode 100644 include/linux/ucc_sched/ucc_sched.h
 create mode 100644 include/linux/ucc_ts.h
 create mode 100644 include/linux/vstream.h
 create mode 100644 include/linux/xpu_group.h
 create mode 100644 include/trace/events/ucc_sched.h
 create mode 100644 kernel/sched/ucc_sched.c
 create mode 100644 kernel/ucc/Kconfig
 create mode 100644 kernel/ucc/Makefile
 create mode 100644 kernel/ucc/ascend_vstream.c
 create mode 100644 kernel/ucc/ascend_vstream.h
 create mode 100644 kernel/ucc/vstream.c
 create mode 100644 kernel/ucc_sched/Makefile
 create mode 100644 kernel/ucc_sched/core.c
 create mode 100644 kernel/ucc_sched/ucc_sched.h

diff --git a/Kconfig b/Kconfig
index 48a80beab685..8e558777fb54 100644
--- a/Kconfig
+++ b/Kconfig
@@ -30,3 +30,5 @@ source "crypto/Kconfig"
 source "lib/Kconfig"
 
 source "lib/Kconfig.debug"
+
+source "kernel/ucc/Kconfig"
diff --git a/drivers/Kconfig b/drivers/Kconfig
index ab4d43923c4d..bd59e9e525ba 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -219,4 +219,6 @@ source "drivers/siox/Kconfig"
 
 source "drivers/slimbus/Kconfig"
 
+source "drivers/xpu/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 578f469f72fb..1130b2d92df1 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -186,3 +186,4 @@ obj-$(CONFIG_MULTIPLEXER)	+= mux/
 obj-$(CONFIG_UNISYS_VISORBUS)	+= visorbus/
 obj-$(CONFIG_SIOX)		+= siox/
 obj-$(CONFIG_GNSS)		+= gnss/
+obj-$(CONFIG_XPU_SCHEDULE)	+= xpu/
diff --git a/drivers/xpu/Kconfig b/drivers/xpu/Kconfig
new file mode 100644
index 000000000000..c4a391d0039d
--- /dev/null
+++ b/drivers/xpu/Kconfig
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menuconfig XPU_SCHEDULE
+	bool "xpu schedule"
+	default n
+	help
+	  Support XPU scheduling. Say Y here if you want support for
+	  XPU scheduling.
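For orientation, XPU_SCHEDULE enables the xpu_group hierarchy implemented in drivers/xpu/xpu_group.c further down. The sketch below is illustrative only and not part of the patch: it shows how an accelerator driver might publish a type/device/ts/queue topology under xpu_root, which is the layout that select_sq() in kernel/sched/ucc_sched.c later walks. The devdrv_* names are invented for the example and error handling is simplified.

#include <linux/errno.h>
#include <linux/xpu_group.h>

/* Assumed driver callback: forward one vstream's SQEs to the hardware. */
static int devdrv_run(struct xpu_group *group, void *para1, void *para2)
{
	/* para1 is the vstream_info, para2 the tsdrv_ctx (see xpu_run()). */
	return 0;
}

static struct xpu_operation devdrv_ops = {
	.run = devdrv_run,
};

static struct xpu_group *devdrv_add_group(struct xpu_group *parent, int id,
					  struct xpu_operation *opt)
{
	struct xpu_group *grp = xpu_group_alloc();

	if (!grp)
		return NULL;
	grp->id = id;
	grp->opt = opt;
	if (xpu_group_attach(grp, parent))
		return NULL;	/* id already taken; a real driver would free grp */
	return grp;
}

static int devdrv_register_topology(int dev_id, int ts_id, int nr_queues)
{
	struct xpu_group *type_grp, *dev_grp, *ts_grp;
	int i;

	/* Layer 1: accelerator type, keyed by the xpu_type value. */
	type_grp = devdrv_add_group(xpu_root, XPU_TYPE_NPU_310, NULL);
	if (!type_grp)
		return -ENOMEM;
	/* Layer 2: one group per device. */
	dev_grp = devdrv_add_group(type_grp, dev_id, NULL);
	if (!dev_grp)
		return -ENOMEM;
	/* Layer 3: one group per task scheduler (ts) on the device. */
	ts_grp = devdrv_add_group(dev_grp, ts_id, NULL);
	if (!ts_grp)
		return -ENOMEM;
	/* Layer 4: schedulable queue groups; xpu_idle_group_find() in
	 * select_sq() picks an unused one of these for a new vstream.
	 */
	for (i = 0; i < nr_queues; i++)
		if (!devdrv_add_group(ts_grp, i, &devdrv_ops))
			return -ENOMEM;
	return 0;
}

With groups registered this way, xpu_run(), xpu_wait() and xpu_finish() dispatch through the opt callbacks of the queue group that select_sq() picked.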
+ diff --git a/drivers/xpu/Makefile b/drivers/xpu/Makefile new file mode 100644 index 000000000000..9edc6dcdd4d0 --- /dev/null +++ b/drivers/xpu/Makefile @@ -0,0 +1 @@ +obj-y += xpu_group.o diff --git a/drivers/xpu/xpu_group.c b/drivers/xpu/xpu_group.c new file mode 100644 index 000000000000..53a598db0615 --- /dev/null +++ b/drivers/xpu/xpu_group.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +extern int ucc_rt_nr_running(struct xcu *cu); +static DECLARE_RWSEM(xpu_group_rwsem); + +static struct xpu_capability xpu_capability_root; + +struct xpu_group __xpu_root = { + .type = XPU_TYPE_ROOT, + .capability = &xpu_capability_root, + + .next_layer = IDR_INIT(next_layer), +}; + +struct xpu_group *xpu_root = &__xpu_root; +EXPORT_SYMBOL(xpu_root); + +int __xpu_group_attach(struct xpu_group *new_group, + struct xpu_group *previous_group) +{ + int id = new_group->id; + + if (id == -1) + id = idr_alloc(&previous_group->next_layer, new_group, + 0, INT_MAX, GFP_KERNEL); + else + id = idr_alloc(&previous_group->next_layer, new_group, + id, id + 1, GFP_KERNEL); + if (id < 0) + return -EEXIST; + + new_group->id = id; + new_group->previous_layer = previous_group; + + return 0; +} + +int xpu_group_attach(struct xpu_group *new_group, + struct xpu_group *previous_group) +{ + int ret; + + down_write(&xpu_group_rwsem); + ret = __xpu_group_attach(new_group, previous_group); + up_write(&xpu_group_rwsem); + return ret; +} +EXPORT_SYMBOL(xpu_group_attach); + +struct xpu_group *xpu_group_alloc_and_attach(struct xpu_group *previous_group, + int id) +{ + struct xpu_group *new = xpu_group_alloc(); + + if (!new) { + pr_err("alloc xpu_group failed\n"); + return NULL; + } + + new->id = id; + + if (!xpu_group_attach(new, previous_group)) + return NULL; + + return new; +} +EXPORT_SYMBOL(xpu_group_alloc_and_attach); + +int __xpu_group_detach(struct xpu_group *group) +{ + idr_remove(&group->previous_layer->next_layer, group->id); + return 0; +} + +int xpu_group_detach(struct xpu_group *group) +{ + int ret; + + down_write(&xpu_group_rwsem); + ret = __xpu_group_detach(group); + up_write(&xpu_group_rwsem); + return ret; +} +EXPORT_SYMBOL(xpu_group_detach); + +struct xpu_group *__xpu_group_find(struct xpu_group *group, int id) +{ + return idr_find(&group->next_layer, id); +} + +struct xpu_group *xpu_group_find(struct xpu_group *group, int id) +{ + struct xpu_group *p; + + p = xpu_group_alloc(); + + down_read(&xpu_group_rwsem); + p = __xpu_group_find(group, id); + up_read(&xpu_group_rwsem); + + return p; +} +EXPORT_SYMBOL(xpu_group_find); + + +struct xpu_group *xpu_idle_group_find(struct xpu_group *group) +{ + struct xpu_group *entry_group; + int id; + + down_read(&xpu_group_rwsem); + idr_for_each_entry(&group->next_layer, entry_group, id) { + if (!entry_group->used) { + up_read(&xpu_group_rwsem); + return entry_group; + } + } + up_read(&xpu_group_rwsem); + + return NULL; +} + +int xpu_run(struct xpu_group *group, void *para1, void *para2) +{ + int ret = 0; + + if (group->opt && group->opt->run) + ret = group->opt->run(group, para1, para2); + + return ret; +} + +int xpu_finish(struct xpu_group *group, void *para1, void *para2) +{ + if (group->opt && group->opt->finish) + return group->opt->finish(group, para1, para2); + + return 0; +} + +int xpu_wait(struct xpu_group *group, void *para1, void *para2, void *para3) +{ + if (group->opt && group->opt->wait) + return group->opt->wait(group, para1, para2, para3); + + return 0; +} + +int 
xpu_complete(struct xpu_group *group, void *para1, void *para2, void *para3) +{ + if (group->opt && group->opt->complete) + return group->opt->complete(group, para1, para2, para3); + + return 0; +} + +struct xpu_group *xpu_group_alloc(void) +{ + struct xpu_group *node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!node) + return NULL; + + node->type = XPU_TYPE_CUSTOM; + idr_init(&node->next_layer); + + return node; +} +EXPORT_SYMBOL(xpu_group_alloc); diff --git a/fs/proc/base.c b/fs/proc/base.c index dc9841826264..516eee1ae952 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -770,7 +770,6 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; - struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); @@ -1546,6 +1545,99 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_XPU_SCHEDULE +static ssize_t ucc_step_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char numbuf[PROC_NUMBUF]; + ssize_t len; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->ucc_step); + + put_task_struct(task); + + return simple_read_from_buffer(buf, count, ppos, numbuf, len); +} + +static ssize_t ucc_step_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + int err; + unsigned int ucc_step; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = kstrtouint_from_user(buf, count, 0, &ucc_step); + if (err) + return err; + + p->ucc_step = ucc_step; + put_task_struct(p); + + return count; +} + +static const struct file_operations ucc_step_operations = { + .write = ucc_step_write, + .read = ucc_step_read, +}; + +static ssize_t ucc_priority_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char numbuf[PROC_NUMBUF]; + ssize_t len; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->ucc_priority); + + put_task_struct(task); + + return simple_read_from_buffer(buf, count, ppos, numbuf, len); +} + +static ssize_t ucc_priority_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + int err; + unsigned int ucc_priority; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = kstrtouint_from_user(buf, count, 0, &ucc_priority); + if (err) + return err; + + p->ucc_priority = ucc_priority; + put_task_struct(p); + + return count; +} + +static const struct file_operations ucc_priority_operations = { + .write = ucc_priority_write, + .read = ucc_priority_read, +}; + +#endif + #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -3151,6 +3243,10 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", S_IRUGO, proc_sp_group_state), #endif +#ifdef CONFIG_XPU_SCHEDULE + REG("ucc_priority", 0644, ucc_priority_operations), + REG("ucc_step", 0644, ucc_step_operations), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3537,6 +3633,10 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", S_IRUGO, proc_sp_group_state), #endif +#ifdef CONFIG_XPU_SCHEDULE + 
REG("ucc_priority", 0644, ucc_priority_operations), + REG("ucc_step", 0644, ucc_step_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fd8c5b7cdc6..175659be95f3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1281,6 +1281,9 @@ struct task_struct { #if !defined(__GENKSYMS__) #if defined(CONFIG_QOS_SCHED_SMART_GRID) struct sched_grid_qos *grid_qos; +#elif defined(CONFIG_XPU_SCHEDULE) + u32 ucc_priority; + u32 ucc_step; #else KABI_RESERVE(8) #endif diff --git a/include/linux/ucc_common.h b/include/linux/ucc_common.h new file mode 100644 index 000000000000..3875c2226d24 --- /dev/null +++ b/include/linux/ucc_common.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _UCC_COMMON_H +#define _UCC_COMMON_H + +/* + * UCC Print Function + */ +#ifndef pr_fmt +#define pr_fmt(fmt) fmt +#endif + +#define ucc_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) + +#define ucc_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) + +#define ucc_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + +#define ucc_dbg(fmt, ...) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) + +#endif diff --git a/include/linux/ucc_kfd.h b/include/linux/ucc_kfd.h new file mode 100644 index 000000000000..07eedc2fd5f2 --- /dev/null +++ b/include/linux/ucc_kfd.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef KFD_PRIV_H_INCLUDED +#define KFD_PRIV_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct process_queue_manager; +struct kfd_process; +struct kfd_signal_page; + +struct process_queue_manager { + struct kfd_process *process; + struct list_head queues; + unsigned long *queue_slot_bitmap; +}; + +struct kfd_signal_page { + uint64_t *kernel_address; + uint64_t __user *user_address; + bool need_to_free_pages; +}; + +/* Process data */ +struct kfd_process { + struct hlist_node kfd_processes; + void *mm; + struct kref ref; + struct work_struct release_work; + struct mutex mutex; + struct task_struct *lead_thread; + struct mmu_notifier mmu_notifier; +/* TODO: check if use right branch */ + struct rcu_head rcu; + uint16_t pasid; + struct list_head per_device_data; + struct process_queue_manager pqm; + bool is_32bit_user_mode; + struct mutex event_mutex; + struct idr event_idr; + struct kfd_signal_page *signal_page; + size_t signal_mapped_size; + size_t signal_event_count; + bool signal_event_limit_reached; +/* TODO: check if use right branch */ + struct rb_root bo_interval_tree; + void *kgd_process_info; + struct dma_fence *ef; + struct delayed_work eviction_work; + struct delayed_work restore_work; + unsigned int last_eviction_seqno; + unsigned long last_restore_timestamp; + unsigned long last_evict_timestamp; + bool debug_trap_enabled; + uint32_t trap_debug_wave_launch_mode; + struct file *dbg_ev_file; + uint32_t allocated_debug_watch_point_bitmask; + struct kobject *kobj; + struct kobject *kobj_queues; + struct attribute attr_pasid; + bool has_cwsr; + uint64_t exception_enable_mask; + uint64_t exception_status; +}; + +struct kfd_ioctl_create_queue_args { + __u64 ring_base_address; /* to KFD */ + __u64 write_pointer_address; /* from KFD */ + __u64 read_pointer_address; /* from KFD */ + __u64 doorbell_offset; /* from KFD */ + + __u32 ring_size; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 queue_type; /* to KFD */ + __u32 queue_percentage; /* to KFD 
*/ + __u32 queue_priority; /* to KFD */ + __u32 queue_id; /* from KFD */ + + __u64 eop_buffer_address; /* to KFD */ + __u64 eop_buffer_size; /* to KFD */ + __u64 ctx_save_restore_address; /* to KFD */ + __u32 ctx_save_restore_size; /* to KFD */ + __u32 ctl_stack_size; /* to KFD */ +}; + +struct kfd_ioctl_destroy_queue_args { + __u32 queue_id; /* to KFD */ + __u32 pad; +}; + +struct kfd_ioctl_update_queue_args { + __u64 ring_base_address; /* to KFD */ + + __u32 queue_id; /* to KFD */ + __u32 ring_size; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ +}; +#endif diff --git a/include/linux/ucc_sched.h b/include/linux/ucc_sched.h new file mode 100644 index 000000000000..5b170545f7c2 --- /dev/null +++ b/include/linux/ucc_sched.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_UCC_SCHED_H__ +#define __LINUX_UCC_SCHED_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VRTSQ_RTSQ_HASH_ORDER 6 + +#ifdef CONFIG_XPU_SCHEDULE +int ucc_process_task(struct vstream_info *vsqcq_info, struct tsdrv_ctx *ctx, + int *sqenum); +int ucc_free_task(struct vstream_info *vsqcq_info, struct tsdrv_ctx *ctx); +int ucc_wait_cq(struct vstream_info *vsqcq_info, struct tsdrv_ctx *ctx, + struct devdrv_report_para *arg, int *sqenum); +struct xpu_group *select_sq(struct vstream_info *vstream_info); +int ucc_sched_register_xcu(int dev_id, int ts_id, int cu_num); +void ucc_set_vstream_state(struct vstream_info *vinfo, int state); +void ucc_dequeue_task(struct vstream_info *vInfo); +int ucc_rt_nr_running(struct xcu *cu); +struct xcu *ucc_get_xcu_by_id(int cu_id); +int ucc_xcu_is_sched(int cu_id); +void ucc_dump_statistics_info(struct ucc_se *se); +#endif + +#endif diff --git a/include/linux/ucc_sched/ucc_sched.h b/include/linux/ucc_sched/ucc_sched.h new file mode 100644 index 000000000000..6edd8930e09e --- /dev/null +++ b/include/linux/ucc_sched/ucc_sched.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019. All rights reserved. 
+ * Author: Huawei OS Kernel Lab + * Create: Mon Jan 30 14:29:19 2023 + */ + +#ifndef __LINUX_UCC_SCHED_USCHED_H__ +#define __LINUX_UCC_SCHED_USCHED_H__ + +enum ucc_se_state { + SE_PREPARE, + SE_READY, + SE_RUNNING, + SE_BLOCK, + SE_DEAD, +}; + +enum ucc_se_flag { + UCC_TIF_NONE, + UCC_TIF_PREEMPT, + UCC_TIF_BALANCE, +}; + +enum ucc_se_prio { + UCC_PRIO_HIGH, + UCC_PRIO_LOW, +}; + +enum ucc_se_step { + UCC_STEP_SLOW = 1, + UCC_STEP_FAST = 10, +}; + +struct ucc_statistics { + u64 wait_start; + u64 wait_max; + u64 wait_count; + u64 wait_sum; + + u64 preempt_start; + u64 preempt_max; + u64 preempt_count; + u64 preempt_sum; + + u64 kernel_sum; + u64 timeout_count; + + u64 run_start; + u64 run_max; + u64 run_count; + u64 run_sum; +}; + +struct ucc_se { + int on_cu; + struct list_head run_list; + enum ucc_se_state state; + enum ucc_se_flag flag; + enum ucc_se_prio prio; + enum ucc_se_step step; + raw_spinlock_t se_lock; + struct ucc_statistics statistics; + int is_timeout; +}; + +int ucc_sched_init(void); +int ucc_schedule(int cu_id); +int ucc_wake_up(struct ucc_se *se); + +#endif diff --git a/include/linux/ucc_ts.h b/include/linux/ucc_ts.h new file mode 100644 index 000000000000..7280ccca1059 --- /dev/null +++ b/include/linux/ucc_ts.h @@ -0,0 +1,254 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef TS_H +#define TS_H + +#include +#include +#include +#include + +#define DEVDRV_MAX_SQ_DEPTH (1024) +#define DEVDRV_SQ_SLOT_SIZE (64) + +#define DEVDRV_MAX_SQ_NUM (512 - 1) +#define DEVDRV_MAX_CQ_NUM (352 - 1) + +#define DEVDRV_MAX_TS_NUM (1) + +#define REMAP_ALIGN_SIZE (64 * 1024) +#define REMAP_ALIGN_MASK (~(REMAP_ALIGN_SIZE - 1)) +#define REMAP_ALIGN(x) (((x) + REMAP_ALIGN_SIZE - 1) & \ + REMAP_ALIGN_MASK) + +#define DEVDRV_DB_SPACE_SIZE (1024 * 4096) + +#define SQCQ_RTS_INFO_LENGTH 5 +#define SQCQ_RESV_LENGTH 8 + +#define DEVDRV_CBCQ_MAX_GID 128 + +enum phy_sqcq_type { + NORMAL_SQCQ_TYPE = 0, + CALLBACK_SQCQ_TYPE, + LOGIC_SQCQ_TYPE, + SHM_SQCQ_TYPE, + DFX_SQCQ_TYPE, + TS_SQCQ_TYPE, + KERNEL_SQCQ_TYPE, +}; + +struct notifier_operations { + int (*notifier_call)(struct file *file_op, unsigned long mode); +}; + +#define MAX_DEVICE_COUNT 64 + +struct davinci_intf_stru { + atomic_t count; + struct mutex dmutex; + struct cdev cdev; + struct device *device; + struct list_head process_list; + struct list_head module_list; + unsigned int device_status[MAX_DEVICE_COUNT]; + cpumask_var_t cpumask; +}; + +#define DAVINIC_MODULE_NAME_MAX 256 +struct davinci_intf_private_stru { + char module_name[DAVINIC_MODULE_NAME_MAX]; + unsigned int device_id; + pid_t owner_pid; + int close_flag; + atomic_t work_count; + int release_status; + struct mutex fmutex; + const struct file_operations fops; + struct notifier_operations notifier; + struct davinci_intf_stru *device_cb; + struct file priv_filep; + unsigned int free_type; +}; + +enum sqcq_alloc_status { + SQCQ_INACTIVE = 0, + SQCQ_ACTIVE +}; + +struct devdrv_ts_sq_info { + enum phy_sqcq_type type; + pid_t tgid; + u32 head; + u32 tail; + u32 credit; + u32 index; + int uio_fd; + + u8 *uio_addr; + int uio_size; + + enum sqcq_alloc_status alloc_status; + u64 send_count; + + void *sq_sub; +}; + +struct devdrv_ts_cq_info { + enum phy_sqcq_type type; + pid_t tgid; + u32 vfid; + + u32 head; + u32 tail; + u32 release_head; /* runtime read cq head value */ + u32 index; + u32 phase; + u32 int_flag; + + int uio_fd; + + u8 *uio_addr; + int uio_size; + + enum sqcq_alloc_status alloc_status; + u64 receive_count; + + void *cq_sub; + + void (*complete_handle)(struct devdrv_ts_cq_info 
*cq_info); + + u8 slot_size; +}; + +#define DEVDRV_SQ_INFO_OCCUPY_SIZE \ + (sizeof(struct devdrv_ts_sq_info) * DEVDRV_MAX_SQ_NUM) +#define DEVDRV_CQ_INFO_OCCUPY_SIZE \ + (sizeof(struct devdrv_ts_cq_info) * DEVDRV_MAX_CQ_NUM) + +#define DEVDRV_MAX_INFO_SIZE \ + (DEVDRV_SQ_INFO_OCCUPY_SIZE + DEVDRV_CQ_INFO_OCCUPY_SIZE) +#define DEVDRV_VM_SQ_MEM_OFFSET 0 +#define DEVDRV_VM_SQ_SLOT_SIZE \ + REMAP_ALIGN(DEVDRV_MAX_SQ_DEPTH * DEVDRV_SQ_SLOT_SIZE) +#define DEVDRV_VM_SQ_MEM_SIZE \ + (DEVDRV_VM_SQ_SLOT_SIZE * DEVDRV_MAX_SQ_NUM) + +#define DEVDRV_VM_INFO_MEM_OFFSET \ + (DEVDRV_VM_SQ_MEM_OFFSET + DEVDRV_VM_SQ_MEM_SIZE) +#define DEVDRV_VM_INFO_MEM_SIZE REMAP_ALIGN(DEVDRV_MAX_INFO_SIZE) + +#define DEVDRV_VM_DB_MEM_OFFSET \ + (DEVDRV_VM_INFO_MEM_OFFSET + DEVDRV_VM_INFO_MEM_SIZE) +#define DEVDRV_VM_DB_MEM_SIZE REMAP_ALIGN(DEVDRV_DB_SPACE_SIZE) + +#define DEVDRV_VM_CQ_MEM_OFFSET \ + (DEVDRV_VM_DB_MEM_OFFSET + DEVDRV_VM_DB_MEM_SIZE) + +enum tsdrv_id_type { + TSDRV_STREAM_ID, + TSDRV_NOTIFY_ID, + TSDRV_MODEL_ID, + TSDRV_EVENT_SW_ID, /* should use for event alloc/free/inquiry res_num*/ + TSDRV_EVENT_HW_ID, + TSDRV_IPC_EVENT_ID, + TSDRV_SQ_ID, + TSDRV_CQ_ID, + TSDRV_PCQ_ID, + TSDRV_MAX_ID, +}; + +#define TSDRV_CQ_REUSE 0x00000001 +#define TSDRV_SQ_REUSE 0x00000002 + +struct normal_alloc_sqcq_para { + uint32_t fd; + uint32_t tsId; + uint32_t devId; + uint32_t sqeSize; + uint32_t cqeSize; + uint32_t sqeDepth; + uint32_t cqeDepth; + uint32_t grpId; + uint32_t flag; + uint32_t sqId; + uint32_t cqId; + uint32_t priority; + uint32_t info[SQCQ_RTS_INFO_LENGTH]; + uint32_t res[SQCQ_RESV_LENGTH]; +}; + +struct normal_free_sqcq_para { + uint32_t tsId; + uint32_t flag; + uint32_t sqId; + uint32_t cqId; + uint32_t res[SQCQ_RESV_LENGTH]; +}; + +struct tsdrv_sqcq_data_para { + uint32_t id; + uint32_t val; +}; + +struct devdrv_report_para { + int timeout; + u32 cq_tail; + u32 cq_id; +}; + +struct tsdrv_ts_id_ctx { + u32 id_num; + struct list_head id_list; + spinlock_t id_lock; +}; +struct tsdrv_ts_ctx { + u32 tsid; + atomic_t status; + u32 send_count; + u64 receive_count; + + int32_t cq_tail_updated; + wait_queue_head_t report_wait; + + struct work_struct recycle_work; + + wait_queue_head_t cbcq_wait[DEVDRV_CBCQ_MAX_GID]; + + void *shm_sqcq_ctx; + void *logic_sqcq_ctx; + void *sync_cb_sqcq_ctx; // mini callback + + struct tsdrv_ts_id_ctx id_ctx[TSDRV_MAX_ID]; + + /* only used by vm */ + u32 vcqid; + u32 wait_queue_inited; + u32 cq_report_status; + int32_t cq_tail; + spinlock_t ctx_lock; + + u32 recycle_cbsqcq_num; // min callback +}; + +//Context Delivers +struct tsdrv_ctx { + u32 ctx_index; + atomic_t status; + atomic_t type; + pid_t tgid; + pid_t pid; + int32_t ssid; + u32 thread_bind_irq_num; + u32 mirror_ctx_status; + struct rb_node node; + struct list_head list; + struct vm_area_struct *vma[DEVDRV_MAX_TS_NUM]; + spinlock_t ctx_lock; + struct mutex mutex_lock; + struct tsdrv_ts_ctx ts_ctx[DEVDRV_MAX_TS_NUM]; + + u64 unique_id; /* mark unique processes for vm */ +}; + +#endif diff --git a/include/linux/vstream.h b/include/linux/vstream.h new file mode 100644 index 000000000000..14d799296053 --- /dev/null +++ b/include/linux/vstream.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VSTREAM_H +#define _LINUX_VSTREAM_H + +#include +#include +#include + +#define MAX_VSTREAM_SIZE 1024 +#define MAX_VSTREAM_SLOT_SIZE 64 +#define MAX_CQ_SLOT_SIZE 12 + +/* + * XXX_VSTREAM_ALLOC: alloc a vstream, buffer for tasks + * XXX_VSTREAM_FREE: free a vstream + * XXX_VSTREAM_KICK: there are tasks to be executed in 
the vstream + * XXX_VSTREAM_UPDATE: update information for an existing vstream + * XXX_CALLBACK_VSTREAM_WAIT: waiting for callback tasks + * XXX_CALLBACK_VSTREAM_KICK: callback tasks have been executed + * + * NOTE: Callback vstream is only for Ascend now. We do not need + * CALLBACK_VSTREAM_ALLOC because the callback vstream will be + * alloced with vstream on Ascend. + */ +enum VSTREAM_COMMAND { + /* vstream command for Ascend */ + ASCEND_VSTREAM_ALLOC = 0, + ASCEND_VSTREAM_FREE, + ASCEND_VSTREAM_KICK, + ASCEND_CALLBACK_VSTREAM_WAIT, + ASCEND_CALLBACK_VSTREAM_KICK, + ASCEND_VSTREAM_GET_HEAD, + ASCEND_MAX_COMMAND, + + /* vstream command for amdgpu */ + AMDGPU_VSTREAM_ALLOC = ASCEND_MAX_COMMAND + 1, + AMDGPU_VSTREAM_FREE, + AMDGPU_VSTREAM_KICK, + AMDGPU_VSTREAM_UPDATE, + AMDGPU_MAX_COMMAND, +}; + +struct vstream_alloc_args { + union { + /* For Ascend */ + struct normal_alloc_sqcq_para ascend; + /* For amdgpu */ + struct kfd_ioctl_create_queue_args amdgpu; + }; +}; + +struct vstream_free_args { + union { + /* For Ascend */ + struct normal_free_sqcq_para ascend; + /* For amdgpu */ + struct kfd_ioctl_destroy_queue_args amdgpu; + }; +}; + +struct vstream_kick_args { + union { + /* For Ascend */ + struct tsdrv_sqcq_data_para ascend; + /* For amdgpu */ + }; +}; + +struct vstream_args { + union { + struct vstream_alloc_args va_args; + struct vstream_free_args vf_args; + struct vstream_kick_args vk_args; + struct kfd_ioctl_update_queue_args vu_args; + struct tsdrv_sqcq_data_para vh_args; + struct devdrv_report_para cvw_args; + struct tsdrv_sqcq_data_para cvk_args; + }; +}; + +struct vstream_node { + uint32_t id; + uint32_t head; + uint32_t tail; + uint32_t credit; + void *vstreamData; + raw_spinlock_t spin_lock; +}; + +struct vstream_id { + uint32_t vstreamId; + struct list_head list; +}; + +struct vcq_map_table { + uint32_t vcqId; + struct vstream_node *vcqNode; + struct list_head vstreamId_list; +}; + +struct vstream_info { + uint32_t vstreamId; //key + uint32_t vcqId; + uint32_t devId; + uint32_t tsId; + struct ucc_se se; + //TODO::check name + struct vstream_node *vsqNode; + struct vstream_node *vcqNode; + void *privdata; + uint32_t info[SQCQ_RTS_INFO_LENGTH]; + int cu_id; + struct xpu_group *group; + int send_cnt; + struct task_struct *p; +}; + +typedef int vstream_manage_t(struct vstream_args *arg); +int update_vstream_head(struct vstream_info *vstream_info, int num); +struct vstream_info *vstream_get_info(uint32_t id); +bool vstream_have_kernel(struct ucc_se *se); + +#endif /* _LINUX_VSTREAM_H */ diff --git a/include/linux/xpu_group.h b/include/linux/xpu_group.h new file mode 100644 index 000000000000..5e3a96b15f9c --- /dev/null +++ b/include/linux/xpu_group.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __XPU_GROUP_H__ +#define __XPU_GROUP_H__ +#include + +struct xpu_group; +struct xcu; + +enum xpu_type { + XPU_TYPE_ROOT, + XPU_TYPE_TASK_QUEUE, + XPU_TYPE_NPU_310, + XPU_TYPE_CUSTOM, +}; + +enum xpu_capability_type { + TYPE_1, + XPU_CAPABILITY_TYPE_NR, +}; + +struct xpu_capability { + unsigned long capacities[XPU_CAPABILITY_TYPE_NR]; +}; + +struct xpu_operation { + int (*run)(struct xpu_group *group, void *para1, void *para2); + int (*finish)(struct xpu_group *group, void *para1, void *para2); + int (*wait)(struct xpu_group *group, void *para1, void *para2, + void *para3); + int (*complete)(struct xpu_group *group, void *para1, void *para2, + void *para3); +}; + +struct xpu_group { + int id; + enum xpu_type type; + struct xpu_capability *capability; + + struct 
xpu_group *previous_layer; + struct idr next_layer; + + struct xpu_operation *opt; + + int used; + + void *data; +}; + +extern struct xpu_group *xpu_root; + +#ifdef CONFIG_XPU_SCHEDULE +int xpu_group_attach(struct xpu_group *new_group, + struct xpu_group *previous_group); +int xpu_group_detach(struct xpu_group *group); +struct xpu_group *xpu_group_find(struct xpu_group *group, int id); +struct xpu_group *xpu_idle_group_find(struct xpu_group *group); +struct xpu_group *xpu_group_alloc(void); +struct xpu_group *xpu_group_alloc_and_attach(struct xpu_group *previous_group, + int id); +int xpu_run(struct xpu_group *group, void *para1, void *para2); +int xpu_finish(struct xpu_group *group, void *para1, void *para2); +int xpu_wait(struct xpu_group *group, void *para1, void *para2, void *para3); +#endif + +#endif diff --git a/include/trace/events/ucc_sched.h b/include/trace/events/ucc_sched.h new file mode 100644 index 000000000000..104a39b2f41c --- /dev/null +++ b/include/trace/events/ucc_sched.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ucc_sched + +#if !defined(_TRACE_UCC_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_UCC_SCHED_H + +#include +#include + +/* + * XXX the below ucc_sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE + * adding ucc_sched_stat support to SCHED_FIFO/RR would be welcome. + */ +DECLARE_EVENT_CLASS(ucc_sched_stat_template, + + TP_PROTO(struct vstream_info *vinfo, u64 delay), + + TP_ARGS(vinfo, delay), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, cu_id) + __field(u32, vstreamId) + __field(u32, prio) + __field(u64, delay) + ), + + TP_fast_assign( + memcpy(__entry->comm, vinfo->p->comm, TASK_COMM_LEN); + __entry->pid = vinfo->p->pid; + __entry->cu_id = vinfo->cu_id; + __entry->vstreamId = vinfo->vstreamId; + __entry->prio = vinfo->p->ucc_priority; + __entry->delay = delay; + ), + + TP_printk("comm=%s pid=%d cu_id=%d vstreamId %u prio %u, delay=%llu [ns]", + __entry->comm, __entry->pid, + __entry->cu_id, __entry->vstreamId, __entry->prio, + (unsigned long long)__entry->delay) +); + +DECLARE_EVENT_CLASS(ucc_sched_stat_template_1, + + TP_PROTO(struct vstream_info *vinfo, u64 delay, int is_timeout), + + TP_ARGS(vinfo, delay, is_timeout), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, cu_id) + __field(u32, vstreamId) + __field(u64, delay) + __field(int, is_timeout) + ), + + TP_fast_assign( + memcpy(__entry->comm, vinfo->p->comm, TASK_COMM_LEN); + __entry->pid = vinfo->p->pid; + __entry->cu_id = vinfo->cu_id; + __entry->vstreamId = vinfo->vstreamId; + __entry->delay = delay; + __entry->is_timeout = is_timeout; + ), + + TP_printk("comm=%s pid=%d cu_id=%d vstreamId %u, delay=%llu [ns]:%d", + __entry->comm, __entry->pid, + __entry->cu_id, __entry->vstreamId, + (unsigned long long)__entry->delay, + __entry->is_timeout) +); +/* + * Tracepoint for accounting wait time (time the task is runnable + * but not actually running due to scheduler contention). 
+ */ +DEFINE_EVENT(ucc_sched_stat_template, ucc_sched_stat_wait, + TP_PROTO(struct vstream_info *vinfo, u64 delay), + TP_ARGS(vinfo, delay)); + +DEFINE_EVENT(ucc_sched_stat_template, ucc_sched_stat_preempt, + TP_PROTO(struct vstream_info *vinfo, u64 delay), + TP_ARGS(vinfo, delay)); + +DEFINE_EVENT(ucc_sched_stat_template_1, ucc_sched_stat_run, + TP_PROTO(struct vstream_info *vinfo, u64 delay, int is_timeout), + TP_ARGS(vinfo, delay, is_timeout)); + +TRACE_EVENT(ucc_sched_switch, + + TP_PROTO(int preempt, + struct vstream_info *next), + + TP_ARGS(preempt, next), + + TP_STRUCT__entry( + __field(int, cu_id) + __field(u32, next_vstreamId) + __field(u32, next_prio) + __field(int, preempt) + ), + + TP_fast_assign( + __entry->cu_id = next->cu_id; + __entry->next_vstreamId = next->vstreamId; + __entry->next_prio = next->p->ucc_priority; + __entry->preempt = preempt; + ), + + TP_printk("cu_id=%d next_vstreamId %u next_prio %u preempt[%d]", + __entry->cu_id, + __entry->next_vstreamId, __entry->next_prio, + __entry->preempt) +); +#endif /* _TRACE_UCC_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/init/init_task.c b/init/init_task.c index b312a045f4b9..c1a78b4da368 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -188,6 +188,10 @@ struct task_struct init_task .fork_pid = 0, }, #endif +#ifdef CONFIG_XPU_SCHEDULE + .ucc_priority = 1, + .ucc_step = 1, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/init/main.c b/init/main.c index 50af60ff0ef6..7ed2e67d7011 100644 --- a/init/main.c +++ b/init/main.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -599,6 +600,14 @@ asmlinkage __visible void __init start_kernel(void) * time - but meanwhile we still have a functioning scheduler. */ sched_init(); + +#ifdef CONFIG_XPU_SCHEDULE + /* + * Set up the ucc scheduler, to enable heterogeneous scheduling. + */ + ucc_sched_init(); +#endif + /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. diff --git a/kernel/Makefile b/kernel/Makefile index d0482bd27ba4..273fe481d303 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,6 +43,8 @@ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ +obj-(CONFIG_XPU_SCHEDULE) += ucc_sched/ +obj-(CONFIG_XPU_UCC) += ucc/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 0612af002ae5..0f659b2ad251 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,6 +19,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o +obj-(CONFIG_XPU_SCHEDULE) += ucc_sched.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7825ceaae0c4..459774328f56 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2316,6 +2316,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; +#ifdef CONFIG_XPU_SCHEDULE + p->ucc_priority = current->ucc_priority; + p->ucc_step = current->ucc_step; +#endif + /* * Revert to default priority/policy on fork if requested. 
*/ diff --git a/kernel/sched/ucc_sched.c b/kernel/sched/ucc_sched.c new file mode 100644 index 000000000000..646f120c3c34 --- /dev/null +++ b/kernel/sched/ucc_sched.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +static DEFINE_MUTEX(revmap_mutex); + +static DEFINE_HASHTABLE(vrtsq_rtsq_revmap, VRTSQ_RTSQ_HASH_ORDER); + +/** + * @group: value for this entry. + * @hash_node : hash node list. + * @ + */ +struct vsqce_idx_revmap_data { + unsigned int vrtsdId; + struct xpu_group *group; + struct hlist_node hash_node; +}; + +struct xpu_group *select_sq(struct vstream_info *vstream_info) +{ + struct vsqce_idx_revmap_data *revmap_data; + + /* find history */ + mutex_lock(&revmap_mutex); + hash_for_each_possible(vrtsq_rtsq_revmap, revmap_data, hash_node, + (unsigned long)vstream_info->vstreamId) { + if (revmap_data && revmap_data->group) { + mutex_unlock(&revmap_mutex); + return revmap_data->group; + } + } + mutex_unlock(&revmap_mutex); + + revmap_data = kzalloc(sizeof(struct vsqce_idx_revmap_data), GFP_KERNEL); + if (revmap_data == NULL) + return NULL; + /* find XPU group */ + revmap_data->group = xpu_group_find(xpu_root, XPU_TYPE_NPU_310); + if (revmap_data->group == NULL) { + ucc_err("find XPU group is failed.\n"); + return NULL; + } + /* find device group */ + revmap_data->group = xpu_group_find(revmap_data->group, + vstream_info->devId); + if (revmap_data->group == NULL) { + ucc_err("find device group is failed.\n"); + return NULL; + } + /* find tsgroup */ + revmap_data->group = xpu_group_find(revmap_data->group, + vstream_info->tsId); + if (revmap_data->group == NULL) { + ucc_err("find ts group is failed.\n"); + return NULL; + } + + /* select idle xcu */ + revmap_data->group = xpu_idle_group_find(revmap_data->group); + if (revmap_data->group == NULL) { + ucc_err("find rtsq group is failed.\n"); + return NULL; + } + + revmap_data->vrtsdId = vstream_info->vstreamId; + /* set group used : 1 */ + revmap_data->group->used = 1; + + mutex_lock(&revmap_mutex); + hash_add(vrtsq_rtsq_revmap, &revmap_data->hash_node, + (unsigned long)vstream_info->vstreamId); + mutex_unlock(&revmap_mutex); + return revmap_data->group; +} + +int ucc_process_task(struct vstream_info *vstream_info, struct tsdrv_ctx *ctx, + int *sqenum) +{ + struct xpu_group *group = NULL; + + if (vstream_info == NULL) { + ucc_err("vsqcq_info is NULL\n"); + return -1; + } + + group = select_sq(vstream_info); + if (group == NULL) { + ucc_err("find group is failed.\n"); + return -1; + } + /* send sqe */ + *sqenum = xpu_run(group, vstream_info, ctx); + + return 0; +} +EXPORT_SYMBOL(ucc_process_task); + +int ucc_free_task(struct vstream_info *vstream_info, struct tsdrv_ctx *ctx) +{ + struct vsqce_idx_revmap_data *revmap_data; + + ucc_dequeue_task(vstream_info); + + while (!ucc_xcu_is_sched(vstream_info->cu_id)) + schedule_timeout_interruptible(10); + + ucc_dump_statistics_info(&vstream_info->se); + + mutex_lock(&revmap_mutex); + hash_for_each_possible(vrtsq_rtsq_revmap, revmap_data, hash_node, + (unsigned long)vstream_info->vstreamId) { + if (revmap_data && + revmap_data->vrtsdId == vstream_info->vstreamId && + revmap_data->group) { + xpu_finish(revmap_data->group, vstream_info, ctx); + /* set group unused : 0 */ + revmap_data->group->used = 0; + hash_del(&revmap_data->hash_node); + kfree(revmap_data); + revmap_data = NULL; + break; + } + } + mutex_unlock(&revmap_mutex); + + return 0; +} +EXPORT_SYMBOL(ucc_free_task); + +int ucc_wait_cq(struct vstream_info *vstream_info, struct tsdrv_ctx *ctx, + struct 
devdrv_report_para *arg, int *cqenum)
+{
+	struct vsqce_idx_revmap_data *revmap_data;
+
+	hash_for_each_possible(vrtsq_rtsq_revmap, revmap_data, hash_node,
+			       (unsigned long)vstream_info->vstreamId) {
+		if (revmap_data &&
+		    revmap_data->vrtsdId == vstream_info->vstreamId &&
+		    revmap_data->group)
+			*cqenum = xpu_wait(revmap_data->group, vstream_info,
+					   ctx, arg);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ucc_wait_cq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c7064f67f4a5..aeceb9e9c927 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,10 @@ extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
 extern int sysctl_nr_trim_pages;
 #endif
 
+#ifdef CONFIG_XPU_SCHEDULE
+extern int sysctl_ucc_sched_rcv_timeout_ms;
+#endif
+
 /* Constants used for minimum and maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
@@ -139,7 +143,7 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
-#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_QOS_SCHED_SMART_GRID)
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_QOS_SCHED_SMART_GRID) || defined(CONFIG_XPU_SCHEDULE)
 static int hundred_thousand = 100000;
 #endif
 #ifdef CONFIG_PERF_EVENTS
@@ -352,6 +356,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_XPU_SCHEDULE
+	{
+		.procname	= "ucc_sched_rcv_timeout",
+		.data		= &sysctl_ucc_sched_rcv_timeout_ms,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &hundred_thousand,
+	},
+#endif
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.procname	= "sched_min_granularity_ns",
diff --git a/kernel/ucc/Kconfig b/kernel/ucc/Kconfig
new file mode 100644
index 000000000000..279c11f702b1
--- /dev/null
+++ b/kernel/ucc/Kconfig
@@ -0,0 +1,21 @@
+#
+# TODO: add description
+#
+
+config XPU_UCC
+	bool "ucc"
+	default n
+	depends on ARM64 || X86
+	help
+	  Say Y here if you want support for using XPU UCC. XPU UCC
+	  is a helper for XPU scheduling. The full name of UCC is
+	  Universal Converged Computing.
+
+
+config XPU_VSTREAM
+	bool "virtual submit queue and complete queue"
+	default n
+	depends on XPU_UCC
+	help
+	  Virtual submit queue and complete queue support for XPU.
+	  It is used to help XPU scheduling.
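XPU_VSTREAM is exercised from user space through the vstream_manage() syscall added in kernel/ucc/vstream.c below. The fragment that follows is an illustrative sketch of that flow only, not part of the patch: the vstream_manage() wrapper and a user-visible definition of struct vstream_args are assumed to be provided elsewhere in the series; only the command values and field names come from include/linux/vstream.h and include/linux/ucc_ts.h.

#include <stdint.h>
#include <string.h>

/* Assumed wrapper around the vstream_manage syscall (number not shown here). */
extern long vstream_manage(struct vstream_args *arg, int cmd);

static long example_vstream_flow(int dev_fd, uint32_t dev_id, uint32_t ts_id,
				 uint32_t new_tail)
{
	struct vstream_args args;
	uint32_t sq_id, cq_id;
	long err;

	/* 1. Allocate a vsq/vcq pair; the kernel fills in sqId/cqId. */
	memset(&args, 0, sizeof(args));
	args.va_args.ascend.fd = dev_fd;
	args.va_args.ascend.devId = dev_id;
	args.va_args.ascend.tsId = ts_id;
	err = vstream_manage(&args, ASCEND_VSTREAM_ALLOC);
	if (err)
		return err;
	sq_id = args.va_args.ascend.sqId;
	cq_id = args.va_args.ascend.cqId;

	/* 2. Fill SQEs in the vsq mapped into the process (see
	 *    vstream_map_pfnaddr()), then kick: id selects the vstream,
	 *    val is the new tail index consumed by queue_push_by_tail().
	 */
	memset(&args, 0, sizeof(args));
	args.vk_args.ascend.id = sq_id;
	args.vk_args.ascend.val = new_tail;
	err = vstream_manage(&args, ASCEND_VSTREAM_KICK);
	if (err)
		return err;

	/* 3. Wait for completions on the vcq; cq_tail comes back updated. */
	memset(&args, 0, sizeof(args));
	args.cvw_args.cq_id = cq_id;
	args.cvw_args.timeout = 1000;	/* arbitrary timeout for the example */
	err = vstream_manage(&args, ASCEND_CALLBACK_VSTREAM_WAIT);
	if (err)
		return err;

	/* 4. Release the queues. */
	memset(&args, 0, sizeof(args));
	args.vf_args.ascend.tsId = ts_id;
	args.vf_args.ascend.sqId = sq_id;
	args.vf_args.ascend.cqId = cq_id;
	return vstream_manage(&args, ASCEND_VSTREAM_FREE);
}

A kick only publishes a new tail; ascend_vstream_kick() then calls ucc_wake_up(), and the ucc scheduler picks an idle XCU via select_sq() and forwards the SQEs with xpu_run().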
diff --git a/kernel/ucc/Makefile b/kernel/ucc/Makefile new file mode 100644 index 000000000000..0e2735d2aef4 --- /dev/null +++ b/kernel/ucc/Makefile @@ -0,0 +1 @@ +obj-y += ascend_vstream.o vstream.o diff --git a/kernel/ucc/ascend_vstream.c b/kernel/ucc/ascend_vstream.c new file mode 100644 index 000000000000..d248aaff7639 --- /dev/null +++ b/kernel/ucc/ascend_vstream.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_MUTEX(vstreamId_Bitmap_mutex); +static DECLARE_BITMAP(vstreamIdBitmap, DEVDRV_MAX_SQ_NUM); + +static DEFINE_MUTEX(vcqId_Bitmap_mutex); +static DECLARE_BITMAP(vcqIdBitmap, DEVDRV_MAX_CQ_NUM); + +static DEFINE_MUTEX(revmap_mutex); + +static struct vstream_info *vstreamContainer[DEVDRV_MAX_SQ_NUM]; +static struct vcq_map_table *vsqcqMapTable[DEVDRV_MAX_CQ_NUM]; + +#define MAX_SQ_SIZE (MAX_VSTREAM_SIZE * MAX_VSTREAM_SLOT_SIZE) +#define MAX_CQ_SIZE (MAX_VSTREAM_SIZE * MAX_CQ_SLOT_SIZE) + +#define SQ_USER_ADDR_OFFSET(id) ((unsigned long)REMAP_ALIGN(MAX_SQ_SIZE) * id) +#define CQ_USER_ADDR_OFFSET(id) ((unsigned long)REMAP_ALIGN(MAX_CQ_SIZE) * id) + +#define SQ_VSTREAM_DATA(id) vstreamContainer[id]->vsqNode->vstreamData +#define CQ_VSTREAM_DATA(id) vstreamContainer[id]->vcqNode->vstreamData + +static struct tsdrv_ctx *get_ctx(int fd) +{ + struct fd f; + struct davinci_intf_private_stru *file_private_data; + struct tsdrv_ctx *ctx = NULL; + + f = fdget(fd); + if (!f.file) + goto out; + + file_private_data = f.file->private_data; + if (!file_private_data) + goto out; + + ctx = file_private_data->priv_filep.private_data; + +out: + fdput(f); + return ctx; +} + +static struct vcq_map_table *vstream_get_map_table(uint32_t id) +{ + return vsqcqMapTable[id]; +} + +static void free_vstreamId(uint32_t vstreamId) +{ + mutex_lock(&vstreamId_Bitmap_mutex); + clear_bit(vstreamId, vstreamIdBitmap); + mutex_unlock(&vstreamId_Bitmap_mutex); +} + +static void free_vcqId(uint32_t vcqId, uint32_t flag) +{ + mutex_lock(&vcqId_Bitmap_mutex); + if (!(flag & TSDRV_CQ_REUSE)) + clear_bit(vcqId, vcqIdBitmap); + mutex_unlock(&vcqId_Bitmap_mutex); +} + +static void vstream_free_map_table(uint32_t vcqId, uint32_t vstreamId, + uint32_t flag) +{ + struct vcq_map_table *freeTable = NULL; + struct vstream_id *vstreamIdNode = NULL; + + freeTable = vstream_get_map_table(vcqId); + if (!freeTable) { + ucc_err("No map found for vcq:%d.\n", vcqId); + return; + } + + list_for_each_entry(vstreamIdNode, &freeTable->vstreamId_list, list) { + if (vstreamIdNode->vstreamId == vstreamId) { + list_del(&vstreamIdNode->list); + kfree(vstreamIdNode); + break; + } + } + if (!(flag & TSDRV_CQ_REUSE)) { + kfree(freeTable->vcqNode->vstreamData); + kfree(freeTable->vcqNode); + kfree(freeTable); + } +} + +static void vstream_alloc_ucc_se(struct ucc_se *se) +{ + memset(&se->statistics, 0, sizeof(se->statistics)); + se->on_cu = 0; + se->state = SE_PREPARE; + se->flag = UCC_TIF_NONE; + se->prio = UCC_PRIO_HIGH; + se->step = UCC_STEP_SLOW; + raw_spin_lock_init(&se->se_lock); +} + +static struct vstream_info *vstream_create_info(struct tsdrv_ctx *ctx, + struct normal_alloc_sqcq_para *para) +{ + struct vcq_map_table *mapTable = NULL; + + struct vstream_info *vstream = kzalloc(sizeof(struct vstream_info), + GFP_KERNEL); + if (!vstream) + return NULL; + + (void)memcpy(vstream->info, para->info, + sizeof(uint32_t) * SQCQ_RTS_INFO_LENGTH); + + vstream->privdata = ctx; + vstream->tsId = para->tsId; + vstream->vstreamId = 
para->sqId; + vstream->vcqId = para->cqId; + + mapTable = vstream_get_map_table(vstream->vcqId); + if (!mapTable || !mapTable->vcqNode) { + ucc_err("No map found for vcqId:%d.\n", vstream->vcqId); + goto free_vstream; + } + vstream->vcqNode = mapTable->vcqNode; + vstream->vsqNode = kmalloc(sizeof(struct vstream_node), GFP_KERNEL); + if (!vstream->vsqNode) { + ucc_err("Failed to alloc memory for vsqNode:%d.\n", + vstream->vstreamId); + goto free_vstream; + } + vstream->vsqNode->vstreamData = kmalloc(MAX_SQ_SIZE, GFP_KERNEL); + if (!vstream->vsqNode->vstreamData) + goto free_vsqNode; + vstream->vsqNode->id = vstream->vstreamId; + vstream->vsqNode->head = 0; + vstream->vsqNode->tail = 0; + vstream->vsqNode->credit = MAX_VSTREAM_SIZE; + raw_spin_lock_init(&vstream->vsqNode->spin_lock); + vstream->send_cnt = 0; + vstream->p = current; + vstream_alloc_ucc_se(&vstream->se); + + return vstream; + +free_vsqNode: + kfree(vstream->vsqNode); + +free_vstream: + kfree(vstream); + return NULL; +} + +struct vstream_info *vstream_get_info(uint32_t id) +{ + return vstreamContainer[id]; +} + +static void vstream_free_info(uint32_t id) +{ + struct vstream_info *freeInfo = vstream_get_info(id); + + ucc_set_vstream_state(freeInfo, SE_DEAD); + + if (freeInfo) { + if (freeInfo->vsqNode) + kfree(freeInfo->vsqNode->vstreamData); + + kfree(freeInfo->vsqNode); + } + + kfree(freeInfo); +} + +static int queue_pop_by_num(struct vstream_node *node, uint32_t pop_num) +{ + if (node->credit + pop_num > MAX_VSTREAM_SIZE) { + ucc_err("Queue usage out-of-bounds"); + return -EACCES; + } + + node->credit += pop_num; + node->head = (node->head + pop_num) % MAX_VSTREAM_SIZE; + return 0; +} + +static int queue_pop_by_head(struct vstream_node *node, uint32_t head) +{ + int pop_num = (head - node->head + MAX_VSTREAM_SIZE) % + MAX_VSTREAM_SIZE; + return queue_pop_by_num(node, pop_num); +} + +int update_vstream_head(struct vstream_info *vstream_info, int num) +{ + struct vstream_node *node = vstream_info->vsqNode; + + raw_spin_lock(&node->spin_lock); + if (node->credit + num > MAX_VSTREAM_SIZE) { + raw_spin_unlock(&node->spin_lock); + return -1; + } + + node->credit += num; + node->head = (node->head + num) % MAX_VSTREAM_SIZE; + raw_spin_unlock(&node->spin_lock); + + return 0; +} + +bool vstream_have_kernel(struct ucc_se *se) +{ + struct vstream_info *vinfo; + + vinfo = container_of(se, struct vstream_info, se); + return vinfo->vsqNode->credit != MAX_VSTREAM_SIZE; +} + +static int queue_push_by_num(struct vstream_node *node, uint32_t push_num) +{ + if (node->credit - push_num < 0) + return -EACCES; + + node->credit -= push_num; + node->tail = (node->tail + push_num) % MAX_VSTREAM_SIZE; + return 0; +} + +static int queue_push_by_tail(struct vstream_node *node, uint32_t tail) +{ + int push_num = (tail - node->tail + MAX_VSTREAM_SIZE) % + MAX_VSTREAM_SIZE; + return queue_push_by_num(node, push_num); +} + +static uint32_t vstream_alloc_vstreamId(void) +{ + uint32_t vstreamId = DEVDRV_MAX_SQ_NUM; + + /* alloc vstreamId */ + mutex_lock(&vstreamId_Bitmap_mutex); + vstreamId = find_first_zero_bit(vstreamIdBitmap, DEVDRV_MAX_SQ_NUM); + if (vstreamId == DEVDRV_MAX_SQ_NUM) { + ucc_err("vstreamId exhausted.\n"); + mutex_unlock(&vstreamId_Bitmap_mutex); + return DEVDRV_MAX_SQ_NUM; + } + set_bit(vstreamId, vstreamIdBitmap); + mutex_unlock(&vstreamId_Bitmap_mutex); + + return vstreamId; +} + +static uint32_t vstream_alloc_vcqid(void) +{ + uint32_t vcqId = DEVDRV_MAX_CQ_NUM; + + /* alloc vcqid */ + mutex_lock(&vcqId_Bitmap_mutex); + vcqId = 
find_first_zero_bit(vcqIdBitmap, DEVDRV_MAX_CQ_NUM); + if (vcqId == DEVDRV_MAX_CQ_NUM) { + ucc_err("vcqId has been used up.\n"); + mutex_unlock(&vcqId_Bitmap_mutex); + return DEVDRV_MAX_CQ_NUM; + } + set_bit(vcqId, vcqIdBitmap); + mutex_unlock(&vcqId_Bitmap_mutex); + + ucc_info("vcqId = %d\n", vcqId); + return vcqId; +} + +int vstream_map_pfnaddr(struct tsdrv_ctx *ctx, + struct normal_alloc_sqcq_para *para) +{ + int err = 0; + unsigned long vsqAddr; + unsigned long vcqAddr; + pgprot_t vm_page_prot; + struct vm_area_struct *vma = ctx->vma[para->tsId]; + + vsqAddr = vma->vm_start + SQ_USER_ADDR_OFFSET(para->sqId); + vm_page_prot = pgprot_device(vma->vm_page_prot); + err = remap_pfn_range(vma, vsqAddr, + virt_to_pfn(SQ_VSTREAM_DATA(para->sqId)), + MAX_SQ_SIZE, vm_page_prot); + if (err) { + ucc_err("remap_pfn_range failed,ret=%d.\n", err); + return -EFAULT; + } + if (!(para->flag & TSDRV_CQ_REUSE)) { + vcqAddr = vma->vm_start + DEVDRV_VM_CQ_MEM_OFFSET + + CQ_USER_ADDR_OFFSET(para->cqId); + err = remap_pfn_range(vma, vcqAddr, + virt_to_pfn(CQ_VSTREAM_DATA(para->sqId)), + MAX_CQ_SIZE, vm_page_prot); + if (err) { + ucc_err("remap_pfn_range failed,ret=%d.\n", err); + return -EFAULT; + } + } + + return err; +} + +void vstream_unmap_pfnaddr(struct tsdrv_ctx *ctx, + struct normal_free_sqcq_para *para) +{ + unsigned long vsqAddr; + unsigned long vcqAddr; + size_t cqSize = PAGE_ALIGN(MAX_CQ_SIZE); + struct vm_area_struct *vma = ctx->vma[para->tsId]; + + vsqAddr = vma->vm_start + SQ_USER_ADDR_OFFSET(para->sqId); + zap_vma_ptes(vma, vsqAddr, MAX_SQ_SIZE); + + if (!(para->flag & TSDRV_CQ_REUSE)) { + vcqAddr = vma->vm_start + DEVDRV_VM_CQ_MEM_OFFSET + + CQ_USER_ADDR_OFFSET(para->cqId); + zap_vma_ptes(vma, vcqAddr, cqSize); + } +} + +static int vstream_update_vcqtable(uint32_t vcqId, uint32_t vstreamId, + uint32_t flag) +{ + int err = -ENOSPC; + struct vcq_map_table *vcqTable = NULL; + struct vstream_id *vstreamIdNode = NULL; + + if (!(flag & TSDRV_CQ_REUSE)) { + vcqTable = kmalloc(sizeof(struct vcq_map_table), GFP_KERNEL); + if (!vcqTable) + return -ENOMEM; + + vcqTable->vcqId = vcqId; + vcqTable->vcqNode = kmalloc(sizeof(struct vstream_node), + GFP_KERNEL); + if (!vcqTable->vcqNode) { + err = -ENOMEM; + goto free_vcqTable; + } + + vcqTable->vcqNode->vstreamData = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcqTable->vcqNode->vstreamData) { + err = -ENOMEM; + goto free_vcqNode; + } + vcqTable->vcqNode->id = vcqId; + vcqTable->vcqNode->head = 0; + vcqTable->vcqNode->tail = 0; + vcqTable->vcqNode->credit = MAX_VSTREAM_SIZE; + INIT_LIST_HEAD(&vcqTable->vstreamId_list); + vsqcqMapTable[vcqId] = vcqTable; + } else { + vcqTable = vsqcqMapTable[vcqId]; + } + vstreamIdNode = kmalloc(sizeof(struct vstream_id), GFP_KERNEL); + if (!vstreamIdNode) { + err = -ENOMEM; + + if (!(flag & TSDRV_CQ_REUSE)) + goto free_vstreamData; + return err; + } + vstreamIdNode->vstreamId = vstreamId; + list_add(&vstreamIdNode->list, &vcqTable->vstreamId_list); + + return 0; + +free_vstreamData: + kfree(vcqTable->vcqNode->vstreamData); + +free_vcqNode: + kfree(vcqTable->vcqNode); + +free_vcqTable: + kfree(vcqTable); + + return err; +} + +int ascend_vstream_alloc(struct vstream_args *arg) +{ + uint32_t vstreamId; + uint32_t vcqId = DEVDRV_MAX_CQ_NUM; + int err = -EINVAL; + struct vstream_info *vstream = NULL; + struct tsdrv_ctx *ctx = NULL; + struct normal_alloc_sqcq_para *sqcq_alloc_para = &arg->va_args.ascend; + + ctx = get_ctx(sqcq_alloc_para->fd); + if (!ctx) + return err; + + vstreamId = vstream_alloc_vstreamId(); + if (vstreamId == 
DEVDRV_MAX_SQ_NUM) { + ucc_err("vstreamId alloc failed.\n"); + return err; + } + if (!(sqcq_alloc_para->flag & TSDRV_CQ_REUSE)) + vcqId = vstream_alloc_vcqid(); + else + vcqId = sqcq_alloc_para->cqId; + + if (vcqId >= DEVDRV_MAX_CQ_NUM) { + ucc_err("vcqId alloc failed.\n"); + goto free_vstreamIds; + } + err = vstream_update_vcqtable(vcqId, vstreamId, sqcq_alloc_para->flag); + if (err) { + ucc_err("vcqtable update failed, vcqId:%d, vstreamId:%d, flag:%d.\n", + vcqId, vstreamId, sqcq_alloc_para->flag); + goto free_vcqid; + } + + sqcq_alloc_para->sqId = vstreamId; + sqcq_alloc_para->cqId = vcqId; + vstream = vstream_create_info(ctx, sqcq_alloc_para); + if (!vstream) { + ucc_err("vstream create failed: vcqId:%d, vstreamId:%d.\n", + vcqId, vstreamId); + err = -ENOSPC; + goto free_vcqtable; + } + + vstream->devId = sqcq_alloc_para->devId; + vstreamContainer[vstreamId] = vstream; + + vstream->group = select_sq(vstream); + if (!vstream->group) { + ucc_err("Failed to select sq\n"); + err = -EINVAL; + goto free_vstream_info; + } + + err = vstream_map_pfnaddr(ctx, sqcq_alloc_para); + if (err) { + ucc_err("vstream map failed, ret=%d.\n", err); + goto free_vstream_info; + } + return 0; + +free_vstream_info: + vstream_free_info(vstreamId); + +free_vcqtable: + vstream_free_map_table(vcqId, vstreamId, sqcq_alloc_para->flag); + +free_vcqid: + free_vcqId(vcqId, sqcq_alloc_para->flag); + +free_vstreamIds: + free_vstreamId(vstreamId); + + return err; +} + +int ascend_vstream_free(struct vstream_args *arg) +{ + int err = 0; + struct vstream_info *vstreamInfo = NULL; + struct normal_free_sqcq_para *sqcq_free_para = &arg->vf_args.ascend; + uint32_t vstreamId = sqcq_free_para->sqId; + uint32_t vcqId = sqcq_free_para->cqId; + + if (vstreamId >= DEVDRV_MAX_SQ_NUM || vcqId >= DEVDRV_MAX_CQ_NUM) { + ucc_err("vstream index out-of-range, vstreamId=%d, vcqId=%d.\n", + vstreamId, vcqId); + return -EPERM; + } + + vstreamInfo = vstream_get_info(vstreamId); + if (!vstreamInfo) { + ucc_err("vstreamInfo get failed, vstreamId=%d.\n", vstreamId); + return -EPERM; + } + err = ucc_free_task(vstreamInfo, vstreamInfo->privdata); + + free_vcqId(vcqId, sqcq_free_para->flag); + vstream_free_map_table(vcqId, vstreamId, sqcq_free_para->flag); + + vstream_unmap_pfnaddr(vstreamInfo->privdata, sqcq_free_para); + + vstream_free_info(vstreamId); + free_vstreamId(vstreamId); + return err; +} + +int ascend_vstream_kick(struct vstream_args *arg) +{ + int err = 0; + struct tsdrv_sqcq_data_para *sqcq_data_para = &arg->vk_args.ascend; + int vstreamId = sqcq_data_para->id; + int tail = sqcq_data_para->val; + struct vstream_info *vstreamInfo = NULL; + int push_num; + + vstreamInfo = vstream_get_info(vstreamId); + vstreamInfo->p = current; + + if (!vstreamInfo) { + ucc_err("vstreamInfo get failed, vstreamId=%d.\n", vstreamId); + return -ENOMEM; + } + + push_num = (tail - vstreamInfo->vsqNode->tail + MAX_VSTREAM_SIZE) % + MAX_VSTREAM_SIZE; + + raw_spin_lock(&vstreamInfo->vsqNode->spin_lock); + err = queue_push_by_tail(vstreamInfo->vsqNode, tail); + if (err) { + raw_spin_unlock(&vstreamInfo->vsqNode->spin_lock); + ucc_err("queue_push_by_tail error, ret = %d\n", err); + return err; + } + raw_spin_unlock(&vstreamInfo->vsqNode->spin_lock); + + err = ucc_wake_up(&vstreamInfo->se); + return err; +} + +int ascend_callback_vstream_wait(struct vstream_args *arg) +{ + int err = 0; + int cqeNum = 0; + int cqeSum = 0; + struct vstream_info *vstreamInfo = NULL; + struct vcq_map_table *vcqTable = NULL; + struct vcq_map_table *waitTable = NULL; + struct vstream_id 
*vstreamIdNode = NULL; + struct devdrv_report_para *report_para = &arg->cvw_args; + uint32_t *sqlist; + uint32_t sqlist_num = 0; + uint32_t vstreamId, vcqId; + + sqlist = kmalloc_array(DEVDRV_MAX_SQ_NUM, sizeof(uint32_t), GFP_KERNEL); + if (!sqlist) + return -ENOMEM; + + vcqId = report_para->cq_id; + if (vcqId >= DEVDRV_MAX_CQ_NUM) { + ucc_err("vcqId out-of-range, vcqId=%d.\n", vcqId); + err = -EPERM; + goto out; + } + + mutex_lock(&vcqId_Bitmap_mutex); + waitTable = vstream_get_map_table(vcqId); + if (!waitTable) { + ucc_err("No map found for vcq:%d.\n", vcqId); + mutex_unlock(&vcqId_Bitmap_mutex); + err = -EPERM; + goto out; + } + + list_for_each_entry(vstreamIdNode, &waitTable->vstreamId_list, list) + sqlist[sqlist_num++] = vstreamIdNode->vstreamId; + mutex_unlock(&vcqId_Bitmap_mutex); + + //get sqInfo from hardware + for (vstreamId = 0; vstreamId < sqlist_num; vstreamId++) { + vstreamInfo = vstream_get_info(sqlist[vstreamId]); + if (!vstreamInfo) + continue; + err |= ucc_wait_cq(vstreamInfo, vstreamInfo->privdata, + report_para, &cqeNum); + cqeSum += cqeNum; + if (cqeNum) + break; + } + + //update cqInfo + mutex_lock(&vcqId_Bitmap_mutex); + vcqTable = vstream_get_map_table(vcqId); + if (!vcqTable) { + ucc_err("No map found for vcq:%d.\n", vcqId); + mutex_unlock(&vcqId_Bitmap_mutex); + err = -EPERM; + goto out; + } + + err = queue_push_by_num(vcqTable->vcqNode, cqeSum); + if (err) { + mutex_unlock(&vcqId_Bitmap_mutex); + ucc_err("failed to queue_push_by_num, ret = %d.\n", err); + goto out; + } + report_para->cq_tail = vcqTable->vcqNode->tail; + mutex_unlock(&vcqId_Bitmap_mutex); + +out: + kfree(sqlist); + return err; +} + +int ascend_callback_vstream_kick(struct vstream_args *arg) +{ + u32 vcqId, release_head; + struct vstream_info *vstreamInfo = NULL; + int err = 0; + + vcqId = arg->cvk_args.id; + release_head = arg->cvk_args.val; + if (vcqId >= DEVDRV_MAX_CQ_NUM || release_head >= MAX_VSTREAM_SIZE) { + ucc_err("vstream index out-of-range, vcqId=%d, release_head=%d.\n", + vcqId, release_head); + return -EPERM; + } + + mutex_lock(&vcqId_Bitmap_mutex); + vstreamInfo = vstream_get_info(vcqId); + if (!vstreamInfo) { + err = -EPERM; + goto out; + } + + err = queue_pop_by_head(vstreamInfo->vcqNode, release_head); + +out: + mutex_unlock(&vcqId_Bitmap_mutex); + return err; +} + +int ascend_vstream_get_head(struct vstream_args *arg) +{ + u32 vstreamId = arg->vh_args.id; + struct vstream_info *vstreamInfo = NULL; + + if (vstreamId >= DEVDRV_MAX_SQ_NUM) { + ucc_err("vstreamId out-of-range, vstreamId=%d.\n", vstreamId); + return -EINVAL; + } + + vstreamInfo = vstream_get_info(vstreamId); + if (!vstreamInfo) { + ucc_err("vstreamInfo get failed, vstreamId=%d.\n", vstreamId); + return -EINVAL; + } + arg->vh_args.val = vstreamInfo->vsqNode->head; + + return 0; +} + diff --git a/kernel/ucc/ascend_vstream.h b/kernel/ucc/ascend_vstream.h new file mode 100644 index 000000000000..0cd200168495 --- /dev/null +++ b/kernel/ucc/ascend_vstream.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _ASCEND_VSTREAM_H +#define _ASCEND_VSTREAM_H + +int ascend_vstream_alloc(struct vstream_args *arg); +int ascend_vstream_free(struct vstream_args *arg); +int ascend_vstream_kick(struct vstream_args *arg); +int ascend_callback_vstream_wait(struct vstream_args *arg); +int ascend_callback_vstream_kick(struct vstream_args *arg); +int ascend_vstream_get_head(struct vstream_args *arg); + +#endif /* _ASCEND_VSTREAM_H */ diff --git a/kernel/ucc/vstream.c b/kernel/ucc/vstream.c new file mode 100644 index 000000000000..d4705f285b89 --- /dev/null +++
b/kernel/ucc/vstream.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "ascend_vstream.h" + +static int amdgpu_vstream_alloc(struct vstream_args *arg) +{ + return 0; +} +static int amdgpu_vstream_free(struct vstream_args *arg) +{ + return 0; +} +static int amdgpu_vstream_kick(struct vstream_args *arg) +{ + return 0; +} +static int amdgpu_vstream_update(struct vstream_args *arg) +{ + return 0; +} + +/* + * vstream_manage_cmd table + */ +static vstream_manage_t (*vstream_command_table[AMDGPU_MAX_COMMAND + 1]) = { + ascend_vstream_alloc, // ASCEND_VSTREAM_ALLOC + ascend_vstream_free, // ASCEND_VSTREAM_FREE + ascend_vstream_kick, // ASCEND_VSTREAM_KICK + ascend_callback_vstream_wait, // ASCEND_CALLBACK_VSTREAM_WAIT + ascend_callback_vstream_kick, // ASCEND_CALLBACK_VSTREAM_KICK + ascend_vstream_get_head, // ASCEND_VSTREAM_GET_HEAD + NULL, // ASCEND_MAX_COMMAND + amdgpu_vstream_alloc, // AMDGPU_VSTREAM_ALLOC + amdgpu_vstream_free, // AMDGPU_VSTREAM_FREE + amdgpu_vstream_kick, // AMDGPU_VSTREAM_KICK + amdgpu_vstream_update, // AMDGPU_VSTREAM_UPDATE + NULL // AMDGPU_MAX_COMMAND +}; + +SYSCALL_DEFINE2(vstream_manage, struct vstream_args __user *, arg, int, cmd) +{ + int res = 0; + struct vstream_args vstream_arg; + + /* reject out-of-range commands and the NULL *_MAX_COMMAND slots */ + if (cmd < 0 || cmd > AMDGPU_MAX_COMMAND || !vstream_command_table[cmd]) + return -EINVAL; + + if (copy_from_user(&vstream_arg, arg, sizeof(struct vstream_args))) { + pr_err("copy_from_user failed\n"); + return -EFAULT; + } + res = vstream_command_table[cmd](&vstream_arg); + if (copy_to_user(arg, &vstream_arg, sizeof(struct vstream_args))) { + pr_err("copy_to_user failed\n"); + return -EFAULT; + } + + return res; +} diff --git a/kernel/ucc_sched/Makefile b/kernel/ucc_sched/Makefile new file mode 100644 index 000000000000..4a41f07d091c --- /dev/null +++ b/kernel/ucc_sched/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_XPU_SCHEDULE) += core.o diff --git a/kernel/ucc_sched/core.c b/kernel/ucc_sched/core.c new file mode 100644 index 000000000000..4c7f1f59aeb9 --- /dev/null +++ b/kernel/ucc_sched/core.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+ * Author: Huawei OS Kernel Lab + * Create: Tue Jan 17 22:19:17 2023 + */ + +#include +#include +#include +#include + +#include "ucc_sched.h" +#include "../sched/sched.h" +#define CREATE_TRACE_POINTS +#include + +#define MAX_XCU_NUM (100) +#define TS_SQ_TRANS_TASK_THRESHOLD (20) + +static struct xcu xcu_manager[MAX_XCU_NUM]; +static int num_active_xcu; +raw_spinlock_t xcu_mgr_lock; +int sysctl_ucc_sched_rcv_timeout_ms = 10; + +static struct task_struct vstream_idle_task; +static struct vstream_info vstream_idle = { + .vstreamId = UINT_MAX, + .p = &vstream_idle_task, +}; + +struct sched_args { + int cu_id; +}; + +static inline int is_xcu_offline(struct xcu *cu) +{ + return cu->state == XCU_INACTIVE; +} + +void ucc_set_vstream_state(struct vstream_info *vinfo, int state) +{ + vinfo->se.state = state; +} + +static inline int should_se_run(struct ucc_se *se) +{ + return se->state != SE_BLOCK && se->state != SE_DEAD; +} + +static inline void update_stats_run_start(struct xcu *cu, + struct ucc_se *se) +{ + u64 start; + + if (!schedstat_enabled()) + return; + + start = ktime_get_boot_ns(); + __schedstat_set(se->statistics.run_start, start); +} + +static inline void update_stats_run_end(struct xcu *cu, + struct ucc_se *se) +{ + + struct vstream_info *vinfo; + u64 delta; + + if (!schedstat_enabled()) + return; + + delta = ktime_get_boot_ns() - schedstat_val(se->statistics.run_start); + vinfo = container_of(se, struct vstream_info, se); + trace_ucc_sched_stat_run(vinfo, delta, se->is_timeout); + + __schedstat_set(se->statistics.run_max, + max(schedstat_val(se->statistics.run_max), delta)); + __schedstat_inc(se->statistics.run_count); + __schedstat_add(se->statistics.run_sum, delta); + __schedstat_set(se->statistics.run_start, 0); +} + +static inline void update_stats_preempt_start(struct xcu *cu, + struct ucc_se *se) +{ + u64 wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = ktime_get_boot_ns(); + __schedstat_set(se->statistics.preempt_start, wait_start); +} + +static inline void update_stats_wait_start(struct xcu *cu, struct ucc_se *se) +{ + u64 wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = ktime_get_boot_ns(); + __schedstat_set(se->statistics.wait_start, wait_start); +} + + +static inline void update_stats_wait_end(struct xcu *cu, struct ucc_se *se) +{ + struct vstream_info *vinfo; + u64 delta, preempt_delta; + + if (!schedstat_enabled()) + return; + + delta = ktime_get_boot_ns() - schedstat_val(se->statistics.wait_start); + vinfo = container_of(se, struct vstream_info, se); + trace_ucc_sched_stat_wait(vinfo, delta); + + __schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); + + if (se->statistics.preempt_start) { + preempt_delta = ktime_get_boot_ns() - + schedstat_val(se->statistics.preempt_start); + trace_ucc_sched_stat_preempt(vinfo, preempt_delta); + + __schedstat_set(se->statistics.preempt_max, + max(schedstat_val(se->statistics.preempt_max), + preempt_delta)); + __schedstat_inc(se->statistics.preempt_count); + __schedstat_add(se->statistics.preempt_sum, preempt_delta); + __schedstat_set(se->statistics.preempt_start, 0); + } +} + +void ucc_dump_statistics_info(struct ucc_se *se) +{ + struct vstream_info *vinfo = container_of(se, struct vstream_info, se); + + pr_info("comm %s pid %d vstreamId %d kernel_sum %llu wait_count %llu wait_max %llu[ns] wait_sum %llu[ns] 
preempt_count %llu preempt_max %llu[ns] preempt_sum %llu[ns]\n", + vinfo->p->comm, + vinfo->p->pid, + vinfo->vstreamId, + vinfo->se.statistics.kernel_sum, + vinfo->se.statistics.wait_count, + vinfo->se.statistics.wait_max, + vinfo->se.statistics.wait_sum, + vinfo->se.statistics.preempt_count, + vinfo->se.statistics.preempt_max, + vinfo->se.statistics.preempt_sum); +} + +static void put_prev_entity(struct xcu *cu, struct ucc_se *prev) +{ + if (!prev) + return; + + if (prev->on_cu) + update_stats_wait_start(cu, prev); + + prev->state = SE_READY; + cu->curr_se->state = SE_RUNNING; +} + +static void set_next_entity(struct xcu *cu, struct ucc_se *se) +{ + if (se->on_cu && se != cu->curr_se) + update_stats_wait_end(cu, se); + + cu->curr_se = se; +} + +static void dequeue_ucc_se(struct ucc_se *se, struct xcu *cu) +{ + raw_spin_lock(&cu->xcu_lock); + if (!se->on_cu) { + raw_spin_unlock(&cu->xcu_lock); + return; + } + + se->on_cu = 0; + + list_del_init(&se->run_list); + + if (list_empty(cu->queue + se->prio)) + __clear_bit(se->prio, cu->bitmap); + cu->rt_nr_running--; + + if (se != cu->curr_se) + update_stats_wait_end(cu, se); + + if (cu->curr_se == se) + cu->curr_se = NULL; + + raw_spin_unlock(&cu->xcu_lock); +} + +static void enqueue_ucc_se(struct ucc_se *se, struct xcu *cu) +{ + struct list_head *queue = cu->queue + se->prio; + + raw_spin_lock(&cu->xcu_lock); + if (se->on_cu) { + raw_spin_unlock(&cu->xcu_lock); + return; + } + se->on_cu = 1; + se->is_timeout = 0; + list_add_tail(&se->run_list, queue); + __set_bit(se->prio, cu->bitmap); + cu->rt_nr_running++; + + update_stats_wait_start(cu, se); + + raw_spin_unlock(&cu->xcu_lock); +} + +static struct xcu *ucc_select_cu(struct ucc_se *se) +{ + struct vstream_info *vstream_info; + int min_nr_running = INT_MAX; + struct xcu *cu; + int select_cu = 0; + int cu_id; + + vstream_info = container_of(se, struct vstream_info, se); + for (cu_id = 0; cu_id < num_active_xcu; cu_id++) { + cu = &xcu_manager[cu_id]; + + if (vstream_info->devId != cu->dev_id || + vstream_info->tsId != cu->ts_id) + continue; + + if (cu->rt_nr_running < min_nr_running) { + min_nr_running = cu->rt_nr_running; + select_cu = cu_id; + } + } + + vstream_info->cu_id = select_cu; + return &xcu_manager[select_cu]; +} + +static int ucc_check_preempt(struct ucc_se *se, struct xcu *cu) +{ + struct vstream_info *vinfo_curr, *vinfo; + struct ucc_se *curr_se; + + curr_se = cu->curr_se; + if (!curr_se) + return 1; + + vinfo = container_of(se, struct vstream_info, se); + vinfo_curr = container_of(curr_se, struct vstream_info, se); + if (vinfo_curr->p->ucc_priority > vinfo->p->ucc_priority) { + update_stats_preempt_start(cu, se); + curr_se->flag = UCC_TIF_PREEMPT; + return 1; + } + + return 0; +} + +static inline void ucc_wakeup_idle_worker(struct xcu *cu) +{ + wake_up_state(cu->worker, TASK_INTERRUPTIBLE); +} + +static inline void ucc_wakeup_running_worker(struct xcu *cu) +{ + wake_up_state(cu->worker, TASK_UNINTERRUPTIBLE); +} + +int ucc_schedule(int cu_id) +{ + struct xcu *cu; + + cu = &xcu_manager[cu_id]; + cu->is_wake = 1; + ucc_wakeup_running_worker(cu); + + return 0; +} +EXPORT_SYMBOL(ucc_schedule); + +int ucc_wake_up(struct ucc_se *se) +{ + struct xcu *cu; + + raw_spin_lock(&se->se_lock); + if (se->on_cu) { + raw_spin_unlock(&se->se_lock); + return 0; + } + + if (se->state == SE_BLOCK) + se->state = SE_READY; + + cu = ucc_select_cu(se); + if (!cu) { + raw_spin_unlock(&se->se_lock); + return -1; + } + + enqueue_ucc_se(se, cu); + if (ucc_check_preempt(se, cu)) + ucc_wakeup_idle_worker(cu); + + 
raw_spin_unlock(&se->se_lock); + + return 0; +} + +static struct ucc_se *pick_next_ucc_se(struct xcu *cu) +{ + struct ucc_se *se; + struct list_head *queue; + int idx; + + if (!cu->rt_nr_running) + return NULL; + + idx = sched_find_first_bit(cu->bitmap); + BUG_ON(idx >= MAX_UCC_PRIO); + + queue = cu->queue + idx; + se = list_entry(queue->next, struct ucc_se, run_list); + + return se; +} + +static int ucc_submit_kernel(struct xcu *cu, struct ucc_se *se) +{ + struct vstream_info *vstream_info; + struct xpu_group *group; + struct tsdrv_ctx *ctx; + int kernel_num, left; + + vstream_info = container_of(se, struct vstream_info, se); + ctx = vstream_info->privdata; + left = (vstream_info->vsqNode->tail - vstream_info->vsqNode->head + + MAX_VSTREAM_SIZE) % MAX_VSTREAM_SIZE; + + group = vstream_info->group; + + kernel_num = xpu_run(group, vstream_info, ctx); + if (kernel_num <= 0) + return kernel_num; + + //update vstream info head and tail; + update_vstream_head(vstream_info, kernel_num); + + left -= kernel_num; + + return kernel_num; +} + +static inline void ucc_wait_idle(struct xcu *cu) +{ + cu->state = XCU_IDLE; + + do { + schedule_timeout_interruptible(1); + } while (cu->rt_nr_running == 0); + + cu->state = XCU_BUSY; +} + +static inline void ucc_wait_running(struct xcu *cu, struct ucc_se *se) +{ + int cnt = 1; + + do { + schedule_timeout_uninterruptible( + msecs_to_jiffies(sysctl_ucc_sched_rcv_timeout_ms)); + } while (cu->is_wake == 0 && --cnt > 0); + + if (cnt == 0) { + __schedstat_inc(se->statistics.timeout_count); + se->is_timeout = 1; + } +} + +static inline void clear_se_flag(struct ucc_se *se) +{ + if (se) + se->flag = UCC_TIF_NONE; +} + +void ucc_dequeue_task(struct vstream_info *vInfo) +{ + struct xcu *cu = &xcu_manager[vInfo->cu_id]; + struct ucc_se *se = &vInfo->se; + + raw_spin_lock(&se->se_lock); + dequeue_ucc_se(se, cu); + raw_spin_unlock(&se->se_lock); +} + +/* + * dynamic padding: select kernels with no QoS confilcts to current ucc_se + * to fill cu; + */ +static void dynamic_padding(struct xcu *cu, struct ucc_se *se) +{ +} + +static int __ucc_schedule(void *args) +{ + struct sched_args *sargs = (struct sched_args *)args; + int cu_id = sargs->cu_id; + struct xcu *cu = &xcu_manager[cu_id]; + struct ucc_se *se = NULL, *curr_se = NULL; + struct ucc_se *prev_se = NULL; + struct vstream_info *vinfo; + int send_cnt = 0; + int kernel_num, preempt; + + while (!is_xcu_offline(cu)) { + raw_spin_lock(&cu->xcu_lock); + cu->is_sched = 0; + prev_se = cu->curr_se; + + preempt = 0; + if (prev_se) { + if (prev_se->flag != UCC_TIF_PREEMPT) + goto submit_kernel; + + vinfo = container_of(prev_se, struct vstream_info, se); + if (send_cnt < vinfo->p->ucc_step) + goto submit_kernel; + + preempt = 1; + } + + clear_se_flag(prev_se); + se = pick_next_ucc_se(cu); + if (!se) { + cu->is_sched = 1; + raw_spin_unlock(&cu->xcu_lock); + trace_ucc_sched_switch(0, &vstream_idle); + ucc_wait_idle(cu); + continue; + } + + set_next_entity(cu, se); + if (se != prev_se) { + put_prev_entity(cu, prev_se); + vinfo = container_of(se, struct vstream_info, se); + trace_ucc_sched_switch(preempt, vinfo); + } + send_cnt = 0; +submit_kernel: + curr_se = cu->curr_se; + dynamic_padding(cu, curr_se); + raw_spin_unlock(&cu->xcu_lock); + + curr_se->is_timeout = 0; + kernel_num = ucc_submit_kernel(cu, curr_se); + //has no more kernels to submit. 
+ if (kernel_num <= 0 && !vstream_have_kernel(curr_se)) { + raw_spin_lock(&curr_se->se_lock); + curr_se->state = SE_BLOCK; + dequeue_ucc_se(curr_se, cu); + raw_spin_unlock(&curr_se->se_lock); + cu->is_sched = 1; + continue; + } + cu->is_sched = 1; + + vinfo = container_of(curr_se, struct vstream_info, se); + if (vinfo->send_cnt > TS_SQ_TRANS_TASK_THRESHOLD) { + update_stats_run_start(cu, curr_se); + /* kernel has not finish */ + if (!cu->is_wake) + ucc_wait_running(cu, curr_se); + + update_stats_run_end(cu, curr_se); + cu->is_wake = 0; + vinfo->send_cnt = 0; + } + + send_cnt += kernel_num; + schedstat_add(se->statistics.kernel_sum, kernel_num); + } + + return 0; +} + +static void init_xcu_rq(struct xcu *cu) +{ + int i; + + for (i = 0; i < MAX_UCC_PRIO; i++) { + INIT_LIST_HEAD(cu->queue + i); + __clear_bit(i, cu->bitmap); + } + + /* delimiter for bitsearch: */ + __set_bit(MAX_UCC_PRIO, cu->bitmap); + cu->rt_nr_running = 0; + raw_spin_lock_init(&cu->xcu_lock); +} + +static int alloc_cu_id(void) +{ + int cu_id = -1; + + raw_spin_lock(&xcu_mgr_lock); + if (num_active_xcu >= MAX_XCU_NUM) { + raw_spin_unlock(&xcu_mgr_lock); + return cu_id; + } + + cu_id = num_active_xcu; + num_active_xcu++; + raw_spin_unlock(&xcu_mgr_lock); + + return cu_id; +} + +int ucc_sched_register_xcu(int dev_id, int ts_id, int cu_num) +{ + int cu_id; + struct xcu *cu; + struct sched_args *args; + struct sched_param param = { .sched_priority = 1 }; + char id_buf[16]; + int i; + + for (i = 0; i < cu_num; i++) { + cu_id = alloc_cu_id(); + if (cu_id < 0) { + pr_err("alloc cu id failed\n"); + return -1; + } + + cu = &xcu_manager[cu_id]; + cu->cu_id = cu_id; + cu->state = XCU_IDLE; + cu->curr_se = NULL; + cu->dev_id = dev_id; + cu->ts_id = ts_id; + cu->is_wake = 0; + init_xcu_rq(cu); + + args = kzalloc(sizeof(struct sched_args), GFP_KERNEL); + if (!args) + return -1; + + args->cu_id = cu->cu_id; + snprintf(id_buf, sizeof(id_buf), "%d:%d:%d", + cu->cu_id, cu->dev_id, cu->ts_id); + cu->worker = kthread_create_on_node(__ucc_schedule, + (void *)args, NUMA_NO_NODE, + "u_sched/%s", id_buf); + sched_setscheduler_nocheck(cu->worker, SCHED_FIFO, ¶m); + wake_up_process(cu->worker); + } + + return 0; +} +EXPORT_SYMBOL(ucc_sched_register_xcu); + +int ucc_sched_init(void) +{ + raw_spin_lock_init(&xcu_mgr_lock); + return 0; +} + +int ucc_rt_nr_running(struct xcu *cu) +{ + return cu->rt_nr_running; +} +EXPORT_SYMBOL(ucc_rt_nr_running); + +struct xcu *ucc_get_xcu_by_id(int cu_id) +{ + return &xcu_manager[cu_id]; +} +EXPORT_SYMBOL(ucc_get_xcu_by_id); + +int ucc_xcu_is_sched(int cu_id) +{ + return xcu_manager[cu_id].is_sched; +} +EXPORT_SYMBOL(ucc_xcu_is_sched); diff --git a/kernel/ucc_sched/ucc_sched.h b/kernel/ucc_sched/ucc_sched.h new file mode 100644 index 000000000000..30e2aa10cf2f --- /dev/null +++ b/kernel/ucc_sched/ucc_sched.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * Author: Huawei OS Kernel Lab + * Create: Tue Jan 17 22:27:22 2023 + */ +#ifndef __UCC_SCHED_USCHED_H__ +#define __UCC_SCHED_USCHED_H__ + +#include +#include +#include +#include + +//For simplicity, we set this parameter to 2. +#define MAX_UCC_PRIO (2) + +enum xcu_state { + XCU_INACTIVE, + XCU_IDLE, + XCU_BUSY, + XCU_SUBMIT, +}; + +/* + * This is the abstraction object of the xpu computing unit. 
+ */ +struct xcu { + int is_sched; + int cu_id; + int dev_id; + int ts_id; + int rt_nr_running; + int is_wake; + struct task_struct *worker; + /* +1 bit for the sched_find_first_bit() delimiter set in init_xcu_rq() */ + DECLARE_BITMAP(bitmap, MAX_UCC_PRIO + 1); + struct list_head queue[MAX_UCC_PRIO]; + enum xcu_state state; + struct ucc_se *curr_se; + raw_spinlock_t xcu_lock; +}; + +#endif -- Gitee
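
For reference, the ordering of vstream_command_table in kernel/ucc/vstream.c implies the command numbering sketched below. This is a reconstruction for illustration only; the enum name is hypothetical and the authoritative definition is expected to live in include/linux/vstream.h, which is not shown in this hunk.

enum vstream_command {			/* hypothetical name */
	ASCEND_VSTREAM_ALLOC = 0,
	ASCEND_VSTREAM_FREE,
	ASCEND_VSTREAM_KICK,
	ASCEND_CALLBACK_VSTREAM_WAIT,
	ASCEND_CALLBACK_VSTREAM_KICK,
	ASCEND_VSTREAM_GET_HEAD,
	ASCEND_MAX_COMMAND,		/* NULL slot in the dispatch table */
	AMDGPU_VSTREAM_ALLOC,
	AMDGPU_VSTREAM_FREE,
	AMDGPU_VSTREAM_KICK,
	AMDGPU_VSTREAM_UPDATE,
	AMDGPU_MAX_COMMAND,		/* NULL slot; table upper bound */
};

Because the two *_MAX_COMMAND slots hold NULL handlers, sys_vstream_manage() has to reject a cmd whose table entry is NULL in addition to range-checking it.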
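
A minimal sketch of how the submission side is expected to hand work to the ucc scheduler, assuming the ascend kick path (earlier in this patch) ends up calling ucc_wake_up() once new kernels sit in the vsq; the helper name below is illustrative and not part of the patch.

static int example_vstream_submit(struct vstream_info *vinfo)
{
	/*
	 * ucc_wake_up() picks the least-loaded XCU matching
	 * vinfo->devId/tsId, enqueues the entity on that XCU's priority
	 * queue and wakes the XCU worker, preempting the current entity
	 * when the new task's ucc_priority value is lower.
	 */
	return ucc_wake_up(&vinfo->se);
}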
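
A hedged sketch of the driver-side hookup: the probe/irq function names and the dev_id/ts_id values are illustrative, the contents of linux/xpu_group.h are assumed, and the header that declares ucc_sched_register_xcu()/ucc_schedule() is assumed to be one of the ucc_sched headers added by this patch.

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/xpu_group.h>

static struct xpu_group *example_group;

static int example_xpu_probe(void)
{
	int dev_id = 0, ts_id = 0;	/* illustrative ids */

	/* expose the device below xpu_root in the xpu topology */
	example_group = xpu_group_alloc_and_attach(xpu_root, dev_id);
	if (!example_group)
		return -ENOMEM;

	/* start one XCU worker thread for this <dev, ts> pair */
	return ucc_sched_register_xcu(dev_id, ts_id, 1);
}

static irqreturn_t example_cq_irq(int irq, void *data)
{
	/*
	 * A completion report arrived: mark XCU 0 awake so its worker
	 * leaves ucc_wait_running() and keeps submitting kernels.
	 */
	ucc_schedule(0);
	return IRQ_HANDLED;
}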