diff --git a/docs/en/dpu_offload/_toc.yaml b/docs/en/dpu_offload/_toc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4b81c30d8f99346b7d7928f4a0b78853ade6b52 --- /dev/null +++ b/docs/en/dpu_offload/_toc.yaml @@ -0,0 +1,14 @@ +label: libvirt Direct Connection Aggregation Environment Establishment +isManual: true +href: ./libvirt_direct_connection_aggregation_environment_establishment.md +description: DPU offloading feature for container management and its installation and deployment method on openEuler +sections: + - label: qtfs Shared File System + href: ./qtfs_architecture_and_usage.md + - label: Imperceptible DPU Offload User Guide + href: ./overview.md + sections: + - label: Imperceptible Container Management Plane Offload + href: ./imperceptible_container_management_plane_offload.md + - label: Imperceptible Container Management Plane Offload Deployment Guide + href: ./offload_deployment_guide.md diff --git a/docs/en/dpu_offload/config/client.json b/docs/en/dpu_offload/config/client.json new file mode 100644 index 0000000000000000000000000000000000000000..4aedf4c846914a6bc34dff1988c7794ddb1fa521 --- /dev/null +++ b/docs/en/dpu_offload/config/client.json @@ -0,0 +1,5 @@ +{ + "Protocol": "tcp", + "Ipaddr" : "192.168.10.11", + "Port" : "7777" +} diff --git a/docs/en/dpu_offload/config/prepare.sh b/docs/en/dpu_offload/config/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..ccfe9402051a02451644345b39ef6aa2657bfe89 --- /dev/null +++ b/docs/en/dpu_offload/config/prepare.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +mkdir -p /another_rootfs/var/run/docker/containerd +iptables -t nat -N DOCKER + +echo "---------insmod qtfs ko----------" +# TEST_MODE: IP +insmod ${YOUR_PATH}/qtfs.ko qtfs_server_ip=${YOUR_SERVER_IP} qtfs_log_level=INFO # Enter the .ko file path and IP address. +nohup ${YOUR_PATH}/udsproxyd 1 ${YOUR_CLIENT_IP} 12121 ${YOUR_SERVER_IP} 12121 2>&1 & + +# TEST_MODE: vsock +# insmod ${YOUR_PATH}/qtfs.ko qtfs_server_vsock_cid=${YOUR_SERVER_VSOCK_CID} qtfs_log_level=INFO # Enter the .ko file path and IP address. 
+# nohup ${YOUR_PATH}/udsproxyd 1 ${YOUR_CLIENT_VSOCK_CID} 12121 ${YOUR_SERVER_VSOCK_CID} 12121 2>&1 & + +qtcfg -w udsconnect -x /var/run/rexec +qtcfg -w udsconnect -x /run/rexec + +mkdir /another_rootfs/local_proc/ +mount -t proc proc /another_rootfs/local_proc/ +mount --bind /var/run/ /another_rootfs/var/run/ +mount --bind /var/lib/ /another_rootfs/var/lib/ +mount --bind /etc /another_rootfs/etc +mount -t devtmpfs devtmpfs /another_rootfs/dev/ +mount -t sysfs sysfs /another_rootfs/sys +mkdir -p /another_rootfs/sys/fs/cgroup +mount -t tmpfs tmpfs /another_rootfs/sys/fs/cgroup +list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset" +for i in $list +do + echo $i + mkdir -p /another_rootfs/sys/fs/cgroup/$i + mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /another_rootfs/sys/fs/cgroup/$i +done + +mount -t qtfs -o proc /proc /another_rootfs/proc +echo "proc" +mount -t qtfs /sys /another_rootfs/sys +echo "cgroup" + +mkdir -p /another_rootfs/var/lib/docker/containers +mkdir -p /another_rootfs/var/lib/docker/containerd +mkdir -p /another_rootfs/var/lib/docker/overlay2 +mkdir -p /another_rootfs/var/lib/docker/image +mkdir -p /another_rootfs/var/lib/docker/tmp +mount -t qtfs /var/lib/docker/containers /another_rootfs/var/lib/docker/containers +mount -t qtfs /var/lib/docker/containerd /another_rootfs/var/lib/docker/containerd +mount -t qtfs /var/lib/docker/overlay2 /another_rootfs/var/lib/docker/overlay2 +mount -t qtfs /var/lib/docker/image /another_rootfs/var/lib/docker/image +mount -t qtfs /var/lib/docker/tmp /another_rootfs/var/lib/docker/tmp +mkdir -p /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/ +mount -t qtfs /run/containerd/io.containerd.runtime.v1.linux/ /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/ +mkdir -p /another_rootfs/var/run/docker/containerd +mount -t qtfs /run/docker/containerd /another_rootfs/run/docker/containerd +mkdir -p /another_rootfs/var/lib/containerd/io.containerd.runtime.v1.linux +mount -t qtfs /var/lib/containerd/io.containerd.runtime.v1.linux /another_rootfs/var/lib/containerd/io.containerd.runtime.v1.linux + +qtcfg -w udsconnect -x /another_rootfs/var/run/rexec +qtcfg -w udsconnect -x /another_rootfs/run/rexec diff --git a/docs/en/dpu_offload/config/rexec.service b/docs/en/dpu_offload/config/rexec.service new file mode 100644 index 0000000000000000000000000000000000000000..ee9e5e4895adb5c010e3f8d4db6652cfaed3d355 --- /dev/null +++ b/docs/en/dpu_offload/config/rexec.service @@ -0,0 +1,13 @@ +[Unit] +Description=Rexec_server Service +After=network.target + +[Service] +Type=simple +Environment=CMD_NET_ADDR=tcp://0.0.0.0:7777 +ExecStart=/usr/bin/rexec_server +ExecReload=/bin/kill -s HUP $MAINPID +KillMode=process + +[Install] +WantedBy=multi-user.target diff --git a/docs/en/dpu_offload/config/server.json b/docs/en/dpu_offload/config/server.json new file mode 100644 index 0000000000000000000000000000000000000000..1d4a7bbbc1cbf086e18b147f3f27e6a15c2e322e --- /dev/null +++ b/docs/en/dpu_offload/config/server.json @@ -0,0 +1,5 @@ +{ + "Protocol": "tcp", + "Ipaddr" : "0.0.0.0", + "Port" : "7777" +} diff --git a/docs/en/dpu_offload/config/server_start.sh b/docs/en/dpu_offload/config/server_start.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd3655159ddb0fc6069dfa3ab802f4c9f8520c13 --- /dev/null +++ b/docs/en/dpu_offload/config/server_start.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +modprobe overlay +mkdir /var/lib/docker/containers +mkdir -p 
/var/lib/docker/containers +mkdir -p /var/lib/docker/containerd +mkdir -p /var/lib/docker/overlay2 +mkdir -p /var/lib/docker/tmp +mkdir -p /var/lib/docker/image +mkdir -p /var/run/docker/containerd +mkdir -p /run/containerd/io.containerd.runtime.v1.linux/ +mkdir -p /var/run/docker/netns +mkdir -p /var/lib/containerd/io.containerd.runtime.v1.linux/ +mkdir -p /run/user/0 +touch /var/run/docker/netns/default +# this should be done once +mount --bind /proc/1/ns/net /var/run/docker/netns/default + +function TaskClean() +{ + echo "Now do task clean..." + pkill engine + rmmod qtfs_server + echo "TaskClean done" +} + +trap "TaskClean exit" SIGINT + +mkdir -p /var/run/docker/containerd +mkdir -p /run/containerd/io.containerd.runtime.v1.linux/ + +# TEST_MODE: IP +insmod ${YOUR_PATH}/qtfs_server.ko qtfs_server_ip=${YOUR_SERVER_IP} qtfs_log_level=ERROR +nohup ${YOUR_PATH}/engine 16 1 ${YOUR_SERVER_IP} 12121 ${YOUR_CLIENT_IP} 12121 2>&1 & + +# TEST_MODE: vsock +# insmod ${YOUR_PATH}/qtfs_server.ko qtfs_server_vsock_cid=${YOUR_SERVER_VSOCK_CID} qtfs_log_level=ERROR +# nohup ${YOUR_PATH}/engine 16 1 ${YOUR_SERVER_VSOCK_CID} 12121 ${YOUR_CLIENT_VSOCK_CID} 12121 2>&1 & + +sleep 2 + +qtcfg -w udsconnect -x /var/run/rexec +qtcfg -w udsconnect -x /run/rexec +qtcfg -w udsconnect -x /var/run/containerd diff --git a/docs/en/dpu_offload/config/whitelist b/docs/en/dpu_offload/config/whitelist new file mode 100644 index 0000000000000000000000000000000000000000..b0be45f86276e89fa9fd0827ba06bbb27d158f62 --- /dev/null +++ b/docs/en/dpu_offload/config/whitelist @@ -0,0 +1,8 @@ +kill +taskset +qemu-kvm +rexec_shim +/usr/bin/taskset +/usr/bin/kill +/usr/bin/qemu-kvm +/usr/bin/rexec_shim diff --git a/docs/en/dpu_offload/figures/arch.png b/docs/en/dpu_offload/figures/arch.png new file mode 100644 index 0000000000000000000000000000000000000000..b6a7836fd6fab75009e781ac1ed96c73c352f75b Binary files /dev/null and b/docs/en/dpu_offload/figures/arch.png differ diff --git a/docs/en/dpu_offload/figures/offload-arch.png b/docs/en/dpu_offload/figures/offload-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..b0f7b8587c47838880bcca5d6694f66a16ec0aaf Binary files /dev/null and b/docs/en/dpu_offload/figures/offload-arch.png differ diff --git a/docs/en/dpu_offload/figures/qtfs-arch.png b/docs/en/dpu_offload/figures/qtfs-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..749b007287d8503badcea52036b7a71b06092bc2 Binary files /dev/null and b/docs/en/dpu_offload/figures/qtfs-arch.png differ diff --git a/docs/en/dpu_offload/imperceptible_container_management_plane_offload.md b/docs/en/dpu_offload/imperceptible_container_management_plane_offload.md new file mode 100644 index 0000000000000000000000000000000000000000..b6b5f362f2c6a72f71443b1fa70262b8b5d290ec --- /dev/null +++ b/docs/en/dpu_offload/imperceptible_container_management_plane_offload.md @@ -0,0 +1,35 @@ +# Imperceptible Container Management Plane Offload + +## Overview + +Moore's law ceases to apply in data center and cloud scenarios. The CPU computing power growth rate of general processing units is slowing down, while the network I/O speed and performance keep increasing. As a result, the processing capability of current general-purpose processors cannot meet the I/O processing requirements of the network and drives. In traditional data centers, more and more general-purpose CPU computing power is occupied by I/O and management planes. This part of resource loss is called data center tax. 
According to AWS statistics, the data center tax may account for more than 30% of the computing power of a data center.
+
+The data processing unit (DPU) is designed to release these computing resources from the host CPU. The management plane, network, storage, and security capabilities are offloaded to DPUs for acceleration, reducing costs and improving efficiency. Mainstream cloud vendors, such as AWS, Alibaba Cloud, and Huawei Cloud, use self-developed processors to offload the management plane and related data planes, making 100% of data center computing resources available to customers.
+
+Management plane processes can be offloaded to the DPU by splitting the component source code into two parts, based on the function logic, that run independently on the host and the DPU. However, this method has the following problems:
+
+1. The software compatibility of the component is affected. You need to maintain the component and related patches in subsequent version upgrades, which increases the maintenance workload.
+2. The offload cannot be inherited by other components. Each component must be analyzed and split based on its own code logic.
+
+To solve these problems, openEuler introduces imperceptible DPU offload. An abstraction layer provided by the OS shields the cross-host access differences between the host and DPU, and enables service processes to be offloaded to the DPU with virtually zero modification. This work is done at a common layer of the OS, is independent of upper-layer services, and can be inherited by any other service offloaded to the DPU.
+
+## Architecture
+
+### Imperceptible Container Management Plane DPU Offload Architecture
+
+**Figure 1** Imperceptible Container Management Plane DPU Offload Architecture
+
+![offload-arch](./figures/offload-arch.png)
+
+As shown in Figure 1, after the container management plane is offloaded, management processes such as dockerd and kubelet run on the DPU side, while container processes run on the host. The interaction between these processes is ensured by the system layer.
+
+* Communication layer: DPUs and hosts can communicate through PCIe interfaces or networks. A communication interface layer built on the underlying physical connection provides communication interfaces for upper-layer services.
+
+* qtfs kernel shared file system: The container management plane components kubelet and dockerd interact with container processes mainly through file systems. Management plane tools need to prepare data plane paths such as rootfs and volumes for container processes. In addition, the proc and cgroup file systems are used to control and monitor the resources and status of container processes. For details about qtfs, see [qtfs Shared File System Architecture and Usage](./qtfs_architecture_and_usage.md).
+
+* User-mode offload environment: qtfs is used to prepare the runtime environment for the offloaded management plane by remotely mounting the container management and runtime directories of the host to the DPU. System management file systems such as proc, sys, and cgroup also need to be mounted; to avoid damaging the native system functions of the DPU, all of these mounts are performed inside a chroot environment. In addition, the management plane (running on the DPU) and container processes (running on the host) still call each other, which is supported by the rexec remote binary execution tool.
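+
+In concrete terms, the user-mode offload environment boils down to a set of remote mounts. A minimal sketch, using paths taken from the sample [prepare.sh](./config/prepare.sh) in this repository:
+
+```bash
+# On the DPU, inside the dedicated rootfs (/another_rootfs):
+# the host /proc is presented to the offloaded tools as their /proc
+mount -t qtfs -o proc /proc /another_rootfs/proc
+# container runtime state on the host is shared into the DPU rootfs
+mount -t qtfs /var/lib/docker/overlay2 /another_rootfs/var/lib/docker/overlay2
+```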
+
+For details about how to offload the container management plane, see the [Deployment Guide](./offload_deployment_guide.md).
+
+> [!NOTE]NOTE
+> In this user guide, modifications are performed to the container management plane components and the rexec tool of a specific version. You can adapt the modifications to other versions based on the actual execution environment. The patch provided in this document is for verification only and is not for commercial use.
diff --git a/docs/en/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md b/docs/en/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md
new file mode 100644
index 0000000000000000000000000000000000000000..90280c4bda996c18b21401f8717b14b865ccfeaa
--- /dev/null
+++ b/docs/en/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md
@@ -0,0 +1,360 @@
+# Libvirt Direct Connection Aggregation Environment Establishment
+
+## 1 Hardware Preparation
+
+### Test Mode
+
+Prepare two physical machines (VMs have not been tested) that can communicate with each other.
+
+One physical machine functions as the DPU, and the other functions as the host. In this document, DPU and HOST refer to these two machines.
+
+> [!NOTE]NOTE
+> In the test mode, network ports are exposed without connection authentication, which is risky. Use this mode only for internal tests and verification, never in a production environment.
+
+### vsock Mode
+
+Both the DPU and the HOST are required, and the DPU must be able to provide vsock communication through virtio.
+
+This document describes only the test mode usage.
+
+## 2 libvirt Offload Architecture
+
+![arch](./figures/arch.png)
+
+## 3 Environment Setup
+
+### 3.1 qtfs File System Deployment
+
+For details, see the qtfs project home page.
+
+To establish a qtfs connection, you need to disable the firewall.
+
+### 3.2 Deploying the udsproxyd Service
+
+#### 3.2.1 Introduction
+
+udsproxyd is a cross-host Unix domain socket (UDS) proxy service that must be deployed on both the host and the DPU. The udsproxyd components on the host and DPU are peers and provide seamless UDS communication between the two machines: if two processes can communicate through a UDS on the same host, they can do the same when split between the host and DPU, without any code changes. The only requirement is that the client process runs with the **LD_PRELOAD=libudsproxy.so** environment variable. With the support of qtfs, udsproxyd can also be used fully transparently; this requires configuring the allowlist in advance, as described later.
+
+#### 3.2.2 Deploying udsproxyd
+
+Build udsproxyd in the dpu-utilities project:
+
+```bash
+cd qtfs/ipc
+make -j UDS_TEST_MODE=1 && make install
+```
+
+The engine service on the qtfs server has incorporated the udsproxyd feature, so you do not need to start udsproxyd manually if the qtfs server is deployed. However, you need to start udsproxyd on the client by running the following command:
+
+```bash
+nohup /usr/bin/udsproxyd <thread num> <addr> <port> <peer addr> <peer port> 2>&1 &
+```
+
+Parameters:
+
+```bash
+thread num: number of threads. Currently, only one thread is supported.
+addr: IP address of the local host.
+port: port used on the local host.
+peer addr: IP address of the udsproxyd peer.
+peer port: port used on the udsproxyd peer.
+```
+
+Example:
+
+```bash
+nohup /usr/bin/udsproxyd 1 192.168.10.10 12121 192.168.10.11 12121 2>&1 &
+```
+
+If the qtfs engine service is not started, you can start udsproxyd on the server as well to test udsproxyd separately:
+
+```bash
+nohup /usr/bin/udsproxyd 1 192.168.10.11 12121 192.168.10.10 12121 2>&1 &
+```
+
+#### 3.2.3 Using udsproxyd
+
+##### 3.2.3.1 Using udsproxyd Independently
+
+When starting the client process of the Unix socket application that uses the UDS service, add the **LD_PRELOAD=libudsproxy.so** environment variable to intercept the **connect** API of glibc for UDS interconnection. In the libvirt offload scenario, you can copy **libudsproxy.so**, which will be used by the libvirtd service, to the **/usr/lib64** directory in the chroot directory of libvirt.
+
+##### 3.2.3.2 Using the udsproxyd Service Transparently
+
+Configure the UDS service allowlist for qtfs. An allowlist entry is the path of the socket file bound by the Unix socket server. For example, the socket files created by the libvirt VM server are in the **/var/lib/libvirt** directory, so that directory path is added to the allowlist in either of the following ways:
+
+* Load the allowlist by using the `qtcfg` utility. First compile the utility in the **qtfs/qtinfo** directory.
+
+Run the following commands on the qtfs client:
+
+```bash
+make role=client
+make install
+```
+
+Run the following commands on the qtfs server:
+
+```bash
+make role=server
+make install
+```
+
+`qtcfg` is installed automatically by `make install`. Then run `qtcfg` to configure the allowlist. Assume that **/var/lib/libvirt** needs to be added to the allowlist:
+
+```bash
+qtcfg -x /var/lib/libvirt/
+```
+
+Query the allowlist:
+
+```bash
+qtcfg -z
+```
+
+Delete an allowlist entry:
+
+```bash
+qtcfg -y 0
+```
+
+The parameter is the index number listed when you query the allowlist.
+
+* Add an allowlist entry through the configuration file. The configuration file must be in place before the qtfs or qtfs_server kernel module is loaded, because the allowlist is loaded when the kernel modules are initialized.
+
+> [!NOTE]NOTE
+> The allowlist prevents irrelevant Unix sockets from establishing remote connections, which could cause errors or waste resources. You are advised to set the allowlist as precisely as possible. For example, this document sets **/var/lib/libvirt** in the libvirt scenario. It would be risky to add **/var/lib**, **/var**, or the root directory directly.
+
+### 3.3 rexec Service Deployment
+
+#### 3.3.1 Introduction
+
+rexec is a remote execution component written in the C language. It consists of the rexec client and the rexec server. The server is a daemon process, and the client is a binary file. After being started, the client establishes a UDS connection with the server through the udsproxyd service, and the server daemon process starts a specified program on the server machine. During libvirt virtualization offload, libvirtd is offloaded to the DPU; when libvirtd needs to start a QEMU process on the HOST, the rexec client is invoked to start the process remotely.
+
+#### 3.3.2 Deploying rexec
+
+##### 3.3.2.1 Configuring the Environment Variables and Allowlist
+
+Configure the rexec server allowlist on the host. Put the **whitelist** file in the **/etc/rexec** directory, and change the file permission to read-only:
+
+```bash
+chmod 400 /etc/rexec/whitelist
+```
+
+In the test environment, the allowlist is not mandatory. You can disable it by deleting the **whitelist** file and restarting the rexec_server process.
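+
+The sample [whitelist](./config/whitelist) file shipped with this guide illustrates the expected format, one permitted command per line, by name or by absolute path:
+
+```text
+kill
+taskset
+qemu-kvm
+rexec_shim
+/usr/bin/taskset
+/usr/bin/kill
+/usr/bin/qemu-kvm
+/usr/bin/rexec_shim
+```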
+
+After downloading the dpu-utilities code, go to the **qtfs/rexec** directory and run `make && make install` to install all binary files required by rexec (**rexec** and **rexec_server**) to the **/usr/bin** directory.
+
+Before starting the rexec_server service on the server, check whether the **/var/run/rexec** directory exists. If not, create it:
+
+```bash
+mkdir /var/run/rexec
+```
+
+##### 3.3.2.2 Starting the Service
+
+You can start the rexec_server service on the server in either of the following ways.
+
+* Method 1:
+
+  Configure rexec as a systemd service.
+
+  Add the [rexec.service](./config/rexec.service) file to **/usr/lib/systemd/system**.
+
+  Then, use `systemctl` to manage the rexec service.
+
+  Start the service for the first time:
+
+  ```bash
+  systemctl daemon-reload
+  systemctl enable --now rexec
+  ```
+
+  Restart the service:
+
+  ```bash
+  systemctl stop rexec
+  systemctl start rexec
+  ```
+
+* Method 2:
+
+  Manually start the service in the background:
+
+  ```bash
+  nohup /usr/bin/rexec_server 2>&1 &
+  ```
+
+### 3.4 libvirt Service Deployment
+
+#### 3.4.1 Deploying on the HOST
+
+Install the VM runtime and libvirt. (libvirt is installed mainly to create the related directories.)
+
+```bash
+yum install -y qemu libvirt edk2-aarch64 # edk2-aarch64 is required only for starting VMs in the Arm environment
+```
+
+Put the VM image on the HOST. The image will be mounted to the client through qtfs and shared with libvirt.
+
+#### 3.4.2 Deploying on the DPU
+
+##### 3.4.2.1 Creating the Chroot Environment
+
+(a) Download a QCOW2 image, for example, the openEuler 23.09 QCOW2 image, from the openEuler official website.
+
+(b) Mount the QCOW2 image:
+
+```bash
+cd /root/
+mkdir p2 new_root_origin new_root
+modprobe nbd max_part=8
+qemu-nbd -c /dev/nbd0 xxx.qcow2
+mount /dev/nbd0p2 /root/p2
+cp -rf /root/p2/* /root/new_root_origin/
+umount /root/p2
+qemu-nbd -d /dev/nbd0
+```
+
+(c) The root directory of the image is now extracted in **new_root_origin**. Bind mount **new_root_origin** to **new_root**, which serves as the mount point for chroot:
+
+```bash
+mount --bind /root/new_root_origin /root/new_root
+```
+
+##### 3.4.2.2 Installing libvirt
+
+Compile the source code with a patch.
+
+(a) Enter the chroot environment and install the compilation environment and common tools:
+
+```bash
+yum groupinstall "Development tools" -y
+yum install -y vim meson qemu qemu-img strace edk2-aarch64 tar
+```
+
+**edk2-aarch64** is required only for starting VMs in the Arm environment.
+
+(b) Install the dependency packages required for libvirt compilation:
+
+```bash
+yum install -y rpcgen python3-docutils glib2-devel gnutls-devel libxml2-devel libpciaccess-devel libtirpc-devel yajl-devel systemd-devel dmidecode glusterfs-api numactl
+```
+
+(c) Download the libvirt-x.x.x source code package.
+
+(d) Obtain the libvirt patch.
+
+(e) Decompress the source code package to a directory in the chroot environment, for example, **/home**, and apply the patch.
+
+(f) Go to the **libvirt-x.x.x** directory and run the following command:
+
+```bash
+meson build --prefix=/usr -Ddriver_remote=enabled -Ddriver_network=enabled -Ddriver_qemu=enabled -Dtests=disabled -Ddocs=enabled -Ddriver_libxl=disabled -Ddriver_esx=disabled -Dsecdriver_selinux=disabled -Dselinux=disabled
+```
+
+(g) Complete the installation:
+
+```bash
+ninja -C build install
+```
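+
+The build can then be sanity-checked from outside the chroot environment, for example (illustrative):
+
+```bash
+chroot /root/new_root /usr/bin/libvirtd --version
+```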
+ +```bash +ninja -C build install +``` + +##### 3.4.2.3 Starting the libvirtd Service + +To use libvirt direct connection aggregation, you need to start the libvirtd service in the chroot environment, which requires the libvirtd service outside the chroot environment to be stopped. + +(a) Put the [VM jumper script](./scripts/qemu-kvm) in **/usr/bin** and **/usr/libexec** in the chroot environment to replace the **qemu-kvm** binary file. The jumper script will call rexec to start a remote VM. +> [!NOTE]NOTE +> In the XML file of virsh, set **\** under **\** to **qemu-kvm**. If you set **\** to another value, change it to **qemu-kvm** or replace the binary file specified by **\** with the jumper script. The content of the jumper script also needs to be modified accordingly. + +(b) Copy the **libudsproxy.so** file generated during udsproxyd compilation to the **/usr/lib64** directory in the chroot directory. If the udsproxyd service is used by configuring the UDS allowlist of qtfs, you do not need to copy the **libudsproxy.so** file. + +(c) Save the **rexec** binary file generated during rexec compilation to the **/usr/bin** directory of the chroot environment. + +(d) To configure the chroot mounting environment, you need to mount some directories. Use the following scripts: + +* [virt_start.sh](./scripts/virt_start.sh) is the configuration script. In the script, you need to manually change the **qtfs.ko** path to the path of the compiled **.ko** file and set the correct HOST IP address. +* [virt_umount.sh](./scripts/virt_umount.sh) is the configuration revert script. + +(e) The mount directories in the script are based on the examples in this document. You can modify the paths in the script as required. + +(f) After the chroot environment is configured, enter the chroot environment and manually start libvirtd. + +If qtfs is not configured to use the udsproxyd allowlist, run the following commands: + +```bash +LD_PRELOAD=/usr/lib64/libudsproxy.so virtlogd -d +LD_PRELOAD=/usr/lib64/libudsproxy.so libvirtd -d +``` + +If qtfs is configured to use the udsproxyd allowlist, the LD_PRELOAD prefix is not required: + +```bash +virtlogd -d +libvirtd -d +``` + +To check whether the allowlist is configured, run the following command in another terminal that is not in the chroot environment: + +```bash +qtcfg -z +``` + +Check whether the allowlist contains **/var/lib/libvirt**. + +### 3.5 VM Startup + +After the service is deployed, you can manage the VM life cycle from the DPU. + +#### 3.5.1 Defining the VM + +(a) Place the VM boot image in a directory on the HOST, for example: + +```bash +/home/VMs/Domain_name +``` + +(b) Use qtfs to mount the directory to the DPU. + +```bash +mount -t qtfs /home/VMs /home/VMs +``` + +(c) In the XML file, **/home/VMs/Domain_name** is used as the boot image. In this way, the same image file is presented to the DPU and HOST (**Domain_name** is the VM **domain**). + +(d) Check whether **\** in the XML file points to the jumper script. + +(e) Define the VM. + +```bash +virsh define xxx.xml +``` + +#### 3.5.2 Starting the VM + +```bash +virsh start domain +``` + +# 4 Environment Reset + +Some libvirt directories are shared between the DPU and the HOST. Therefore, you need to unmount these directories before uninstalling the environment. Generally, stop the libvirtd and virtlogd processes and run the **virt_umount.sh** script. If a VM is running on the HOST, stop the VM before unmounting the directories. + +## 5 Common Errors + +1. 
+
+## 5 Common Errors
+
+1. libvirt compilation failure: Check whether the dependency packages are installed. If an external directory or a HOST directory is mounted into the chroot environment, the compilation may fail; unmount the directory first.
+
+2. qtfs mounting failure: The engine process on the server is not started, or the firewall is not disabled; either causes the qtfs connection to fail.
+
+3. VM definition failure: Check whether the emulator in the XML file points to the jumper script, whether the VM image has been mounted to the DPU through qtfs, and whether the path is the same as that on the HOST.
+
+4. VM startup failure: Check whether the libvirtd and virtlogd services are started, whether the rexec service is started, whether the jumper process is started, and whether an error is reported when qemu-kvm is started.
diff --git a/docs/en/dpu_offload/offload_deployment_guide.md b/docs/en/dpu_offload/offload_deployment_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..f566958ad1b0a1554e7da991a69974df1c5534ec
--- /dev/null
+++ b/docs/en/dpu_offload/offload_deployment_guide.md
@@ -0,0 +1,167 @@
+# Imperceptible Container Management Plane Offload Deployment Guide
+
+> [!NOTE]NOTE
+>
+> In this user guide, modifications are performed to the container management plane components and the rexec tool of a specific version. You can adapt the modifications to other versions based on the actual execution environment. The patch provided in this document is for verification only and is not for commercial use.
+
+> [!NOTE]NOTE
+>
+> The communication between the shared file systems is implemented through the network. You can perform a simulated offload using two physical machines or VMs connected through the network.
+>
+> Before the verification, you are advised to set up a working Kubernetes cluster and container running environment, and offload the management plane process of a single node. You can use a network-connected physical machine or VM as an emulated DPU.
+
+## Introduction
+
+The container management plane refers to the management tools of containers, such as Kubernetes, dockerd, containerd, and iSulad. Container management plane offload moves the container management plane from the host where the containers run to another host, namely the DPU, a piece of hardware with an independent running environment.
+
+By mounting the directories related to container running on the host to the DPU through qtfs, the container management plane tools running on the DPU can access these directories and prepare the running environment for the containers running on the host. Because special file systems such as proc and sys need to be mounted remotely, a dedicated rootfs is created as the running environment of Kubernetes and dockerd (referred to as **/another_rootfs**).
+
+In addition, rexec is used to start and delete containers, so that the container management plane and the containers can run on two different hosts for remote container management.
+
+## Related Component Patches
+
+### rexec
+
+rexec is a remote execution tool written in the Go language based on the [rexec](https://github.com/docker/libchan/tree/master/examples/rexec) example tool of Docker/libchan. It is used to invoke binary files remotely. For ease of use, capabilities such as transferring environment variables and monitoring the exit of the original process have been added to rexec.
+
+To use the rexec tool, run `CMD_NET_ADDR=tcp://0.0.0.0:<port> rexec_server` on the server to start the rexec service process, and then run `CMD_NET_ADDR=tcp://<server ip>:<port> rexec [command]` on the client. This instructs rexec_server to execute the command.
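+
+For example, assuming the server listens on port 7777 (the port used by the sample [rexec.service](./config/rexec.service)) and is reachable at 192.168.10.10; both values are illustrative:
+
+``` shell
+# on the server
+CMD_NET_ADDR=tcp://0.0.0.0:7777 rexec_server
+# on the client: run a command remotely on the server
+CMD_NET_ADDR=tcp://192.168.10.10:7777 rexec /usr/bin/whoami
+```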
+
+### dockerd
+
+The changes to dockerd are based on version 18.09.
+
+In dockerd, the part that invokes **libnetwork-setkey** through a hook is commented out. This does not affect container startup. In addition, to ensure the normal use of `docker load`, an error in the `mount` function in **mounter_linux.go** is commented out.
+
+In the running environment of the container management plane, **/proc** is mounted to the proc file system of the server, and the local proc file system is mounted to **/local_proc**. Therefore, in dockerd and containerd, accesses to **/proc/self/xxx**, **/proc/getpid()/xxx**, and related paths are changed from **/proc** to **/local_proc**.
+
+### containerd
+
+The changes to containerd are based on containerd-1.2-rc.1.
+
+When mounting information is obtained, **/proc/self/mountinfo** can obtain only the local mounting information of dockerd, not the mounting information on the server. Therefore, **/proc/self/mountinfo** is changed to **/proc/1/mountinfo**, which obtains the server mounting information by reading the mounting information of process 1 on the server.
+
+In containerd-shim, the Unix socket used to communicate with containerd is changed to TCP. containerd obtains the IP address of the containerd-shim running environment, that is, the IP address of the server, through the **SHIM_HOST** environment variable. The hash value of the shim is used to generate the port number used as the communication port for starting containerd-shim.
+
+In addition, the original method of sending signals to containerd-shim is changed to remotely invoking the `kill` command to send signals to the shim, ensuring that Docker can correctly kill containers.
+
+### Kubernetes
+
+kubelet is not modified. The container QoS manager may fail to be configured the first time; this error does not affect the subsequent pod startup process.
+
+## Container Management Plane Offload Operation Guide
+
+Start rexec_server on both the server and the client. rexec_server on the server executes the rexec calls from the client that start containerd-shim. rexec_server on the client executes the calls from containerd-shim back to dockerd and containerd.
+
+### Server
+
+Create the folders required by the container management plane, insert **qtfs_server.ko**, and start the engine process.
+
+In addition, you need to create the rexec script **/usr/bin/dockerd** on the server:
+
+``` shell
+#!/bin/bash
+CMD_NET_ADDR=tcp://<client ip>:<port> rexec /usr/bin/dockerd $*
+```
+
+### Client
+
+Prepare a rootfs as the running environment of dockerd and containerd. Use the following script to mount the server directories required by dockerd and containerd to the client. Ensure that the remote directories mounted in the script exist on both the server and the client.
+
+``` shell
+#!/bin/bash
+mkdir -p /another_rootfs/var/run/docker/containerd
+iptables -t nat -N DOCKER
+echo "---------insmod qtfs ko----------"
+insmod /YOUR/QTFS/PATH/qtfs.ko qtfs_server_ip=<server ip> qtfs_log_level=INFO
+
+# The proc file system in the chroot environment is replaced by the shared proc file system of the server.
+# The actual proc file system of the local host is mounted to /local_proc.
+mkdir -p /another_rootfs/local_proc/
+mount -t proc proc /another_rootfs/local_proc/
+
+# Bind the chroot internal environment to the external environment to facilitate configuration and running.
+mount --bind /var/run/ /another_rootfs/var/run/
+mount --bind /var/lib/ /another_rootfs/var/lib/
+mount --bind /etc /another_rootfs/etc
+
+mkdir -p /another_rootfs/var/lib/isulad
+
+# Create and mount the dev, sys, and cgroup file systems in the chroot environment.
+mount -t devtmpfs devtmpfs /another_rootfs/dev/
+mount -t sysfs sysfs /another_rootfs/sys
+mkdir -p /another_rootfs/sys/fs/cgroup
+mount -t tmpfs tmpfs /another_rootfs/sys/fs/cgroup
+list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset"
+for i in $list
+do
+    echo $i
+    mkdir -p /another_rootfs/sys/fs/cgroup/$i
+    mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /another_rootfs/sys/fs/cgroup/$i
+done
+
+## common system dir
+mount -t qtfs -o proc /proc /another_rootfs/proc
+echo "proc"
+mount -t qtfs /sys /another_rootfs/sys
+echo "cgroup"
+
+# Mount the shared directories required by the container management plane.
+mount -t qtfs /var/lib/docker/containers /another_rootfs/var/lib/docker/containers
+mount -t qtfs /var/lib/docker/containerd /another_rootfs/var/lib/docker/containerd
+mount -t qtfs /var/lib/docker/overlay2 /another_rootfs/var/lib/docker/overlay2
+mount -t qtfs /var/lib/docker/image /another_rootfs/var/lib/docker/image
+mount -t qtfs /var/lib/docker/tmp /another_rootfs/var/lib/docker/tmp
+mkdir -p /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/
+mount -t qtfs /run/containerd/io.containerd.runtime.v1.linux/ /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/
+mkdir -p /another_rootfs/var/run/docker/containerd
+mount -t qtfs /var/run/docker/containerd /another_rootfs/var/run/docker/containerd
+mount -t qtfs /var/lib/kubelet/pods /another_rootfs/var/lib/kubelet/pods
+```
+
+In **/another_rootfs**, create the following scripts to support cross-host operations.
+
+* /another_rootfs/usr/local/bin/containerd-shim
+
+``` shell
+#!/bin/bash
+CMD_NET_ADDR=tcp://<server ip>:<port> /usr/bin/rexec /usr/bin/containerd-shim $*
+```
+
+* /another_rootfs/usr/local/bin/remote_kill
+
+``` shell
+#!/bin/bash
+CMD_NET_ADDR=tcp://<server ip>:<port> /usr/bin/rexec /usr/bin/kill $*
+```
+
+* /another_rootfs/usr/sbin/modprobe
+
+``` shell
+#!/bin/bash
+CMD_NET_ADDR=tcp://<server ip>:<port> /usr/bin/rexec /usr/sbin/modprobe $*
+```
+
+After changing the root directories of dockerd and containerd to the prepared rootfs, start dockerd and containerd as follows.
+
+* containerd
+
+``` shell
+#!/bin/bash
+SHIM_HOST=<server ip> containerd --config /var/run/docker/containerd/containerd.toml --address /var/run/containerd/containerd.sock
+```
+
+* dockerd
+
+``` shell
+#!/bin/bash
+SHIM_HOST=<server ip> CMD_NET_ADDR=tcp://<server ip>:<port> /usr/bin/dockerd --containerd /var/run/containerd/containerd.sock
+```
+
+* kubelet
+
+Use the original parameters to start kubelet in the chroot environment.
+
+Because **/var/run/** is bound to **/another_rootfs/var/run/**, you can use Docker to access the **docker.sock** interface for container management from the regular rootfs.
+
+The container management plane is now offloaded to the DPU. You can run `docker` commands to create and delete containers, or use `kubectl` on the current node to schedule and destroy pods; the actual container service processes run on the host.
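+
+For a quick smoke test from the DPU side (inside the chroot), assuming a **busybox** image has been loaded, for example with `docker load`; the container process itself appears on the host:
+
+``` shell
+docker run -d --network none busybox:latest sleep 3600
+docker ps
+```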
+
+> [!NOTE]NOTE
+>
+> This guide describes only the container management plane offload. The offload of container networks and data volumes requires additional offload capabilities, which are not covered here. By following this guide, you can perform cross-node startup of containers that are configured without networks and storage.
diff --git a/docs/en/dpu_offload/overview.md b/docs/en/dpu_offload/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..53ef828f387423867ffc8194bf5cf879c768276c
--- /dev/null
+++ b/docs/en/dpu_offload/overview.md
@@ -0,0 +1,11 @@
+# Imperceptible DPU Offload User Guide
+
+This document describes the container management plane DPU offload function of openEuler, as well as how to install and deploy it. Through the unified abstraction layer provided by the OS, this function masks the differences in how the container management plane accesses resources across hosts, making it possible to offload container management plane services to the DPU.
+
+This document is intended for community developers, open source enthusiasts, and partners who use the openEuler OS and want to learn and use the OS kernel and containers. Users must:
+
+- Know basic Linux operations.
+
+- Be familiar with the fundamental mechanisms of the Linux kernel file system.
+
+- Understand Kubernetes and Docker, as well as how to deploy and use them.
diff --git a/docs/en/dpu_offload/qtfs_architecture_and_usage.md b/docs/en/dpu_offload/qtfs_architecture_and_usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..7fe0544418ae42a58daab9eb2e091f945d944416
--- /dev/null
+++ b/docs/en/dpu_offload/qtfs_architecture_and_usage.md
@@ -0,0 +1,77 @@
+# qtfs Shared File System Architecture and Usage
+
+## Introduction
+
+qtfs is a shared file system project. It can be deployed on either a host-DPU hardware architecture or on two hosts. qtfs works in client-server mode, allowing the client to access specified file systems on the server in the same way local files are accessed.
+
+qtfs provides the following features:
+
++ Mount point propagation
+
++ Sharing of special file systems such as proc, sys, and cgroup
+
++ Shared read and write of remote files
+
++ Remote mounting of server file systems on the client
+
++ Customized processing of special files
+
++ Remote FIFO, Unix sockets, and epoll that allow the client and server to access the files as if they were local
+
++ Bottom-layer host-DPU communication over the PCIe protocol, outperforming the network
+
++ Delivery as kernel modules, with no intrusive modification to the kernel
+
+## Software Architecture
+
+![qtfs-arch](./figures/qtfs-arch.png)
+
+## Installation
+
+Perform operations in the following qtfs-related directories:
+
++ **qtfs**: code of the client kernel module. Compile the client **.ko** file in this directory.
+
++ **qtfs_server**: code of the server kernel module. Compile the server **.ko** files and related programs in this directory.
+
++ **qtinfo**: diagnosis tool that is used to check the status of the file systems and change the log level.
+
++ **demo**, **test**, and **doc**: demo programs, test programs, and project documents.
+
++ Root directory: code of common modules used by the client and server.
+
+Configure the kernel compilation environment on the two servers (or VMs):
+
+1. The kernel version must be 5.10 or later.
+2. Install the kernel development package by running `yum install kernel-devel`.
+3. This example assumes that the host IP address is 192.168.10.10 and the DPU IP address is 192.168.10.11.
+
+Install the qtfs server:
+
+```bash
+cd qtfs_server
+make clean && make
+insmod qtfs_server.ko qtfs_server_ip=192.168.10.10 qtfs_server_port=12345 qtfs_log_level=WARN
+nohup ./engine 16 1 192.168.10.10 12121 192.168.10.11 12121 2>&1 &
+```
+
+Install the qtfs client:
+
+```bash
+cd qtfs
+make clean && make
+insmod qtfs.ko qtfs_server_ip=192.168.10.10 qtfs_server_port=12345 qtfs_log_level=WARN
+cd ../ipc/
+make clean && make && make install
+nohup udsproxyd 1 192.168.10.11 12121 192.168.10.10 12121 2>&1 &
+```
+
+## Usage
+
+After the installation is complete, mount a server file system on the client. For example:
+
+```bash
+mount -t qtfs / /root/mnt/
+```
+
+The file system is then visible on the client. Access **/root/mnt** on the client to view and operate files on the server.
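+
+Special file systems are shared with a dedicated mount option. For example, the deployment scripts in this repository share the server proc file system as follows (a sketch; the target directory must already exist):
+
+```bash
+mount -t qtfs -o proc /proc /another_rootfs/proc
+```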
diff --git a/docs/en/dpu_offload/scripts/qemu-kvm b/docs/en/dpu_offload/scripts/qemu-kvm
new file mode 100644
index 0000000000000000000000000000000000000000..e869371be109b57f59709fc23bc5b1cb2002cfbf
--- /dev/null
+++ b/docs/en/dpu_offload/scripts/qemu-kvm
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+exec /usr/bin/rexec /usr/bin/qemu-kvm $*
diff --git a/docs/en/dpu_offload/scripts/virt_start.sh b/docs/en/dpu_offload/scripts/virt_start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..06ca194b7a639a947b6e395f116beeba7c897459
--- /dev/null
+++ b/docs/en/dpu_offload/scripts/virt_start.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+insmod ./qtfs.ko qtfs_server_ip=192.168.10.11 qtfs_log_level=NONE
+
+systemctl stop libvirtd
+
+if [ ! -d "/root/new_root/local_proc" ]; then
+    mkdir -p /root/new_root/local_proc
+fi
+if [ ! -d "/root/new_root/local" ]; then
+    mkdir -p /root/new_root/local
+fi
+# ensure the mount points for the local proc and sys copies exist
+mkdir -p /root/new_root/local/proc /root/new_root/local/sys
+mount -t proc proc /root/new_root/local_proc/
+mount -t proc proc /root/new_root/local/proc
+mount -t sysfs sysfs /root/new_root/local/sys
+mount --bind /var/run/ /root/new_root/var/run/
+mount --bind /var/lib/ /root/new_root/var/lib/
+mount --bind /var/cache/ /root/new_root/var/cache
+mount --bind /etc /root/new_root/etc
+
+mkdir -p /root/new_root/home/VMs/
+mount -t qtfs /home/VMs/ /root/new_root/home/VMs/
+
+mount -t qtfs /var/lib/libvirt /root/new_root/var/lib/libvirt
+
+mount -t devtmpfs devtmpfs /root/new_root/dev/
+mount -t hugetlbfs hugetlbfs /root/new_root/dev/hugepages/
+mount -t mqueue mqueue /root/new_root/dev/mqueue/
+mount -t tmpfs tmpfs /root/new_root/dev/shm
+
+mount -t sysfs sysfs /root/new_root/sys
+mkdir -p /root/new_root/sys/fs/cgroup
+mount -t tmpfs tmpfs /root/new_root/sys/fs/cgroup
+list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset"
+for i in $list
+do
+    echo $i
+    mkdir -p /root/new_root/sys/fs/cgroup/$i
+    mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /root/new_root/sys/fs/cgroup/$i
+done
+
+## common system dir
+mount -t qtfs -o proc /proc /root/new_root/proc
+echo "proc"
+
+mount -t qtfs /sys /root/new_root/sys
+echo "cgroup"
+mount -t qtfs /dev/pts /root/new_root/dev/pts
+mount -t qtfs /dev/vfio /root/new_root/dev/vfio
diff --git a/docs/en/dpu_offload/scripts/virt_umount.sh b/docs/en/dpu_offload/scripts/virt_umount.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4adddec913c23069c6bffddec0bf1770f8c5ce71
--- /dev/null
+++ b/docs/en/dpu_offload/scripts/virt_umount.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+umount /root/new_root/dev/hugepages
+umount /root/new_root/etc
+umount /root/new_root/home/VMs
+umount /root/new_root/local_proc
+umount /root/new_root/local/proc
+umount /root/new_root/var/lib/libvirt
+umount /root/new_root/var/lib
+umount /root/new_root/*
+umount /root/new_root/dev/pts
+umount /root/new_root/dev/mqueue
+umount /root/new_root/dev/shm
+umount /root/new_root/dev/vfio
+umount /root/new_root/dev
+rmmod qtfs
+
+umount /root/new_root/sys/fs/cgroup/*
+umount /root/new_root/sys/fs/cgroup
+umount /root/new_root/sys
diff --git a/docs/en/dpu_os/_toc.yaml b/docs/en/dpu_os/_toc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..040400fa3b6582c8702736f8eb89a2664352edc5
--- /dev/null
+++ b/docs/en/dpu_os/_toc.yaml
@@ -0,0 +1,11 @@
+label: DPU_OS
+isManual: true
+href: ./overview.md
+description: This guide outlines the process of creating a DPU_OS image through openEuler OS customization, including deployment and validation procedures.
+sections:
+  - label: DPU_OS Background and Requirements
+    href: ./dpu_os_background_and_requirements.md
+  - label: DPU_OS Tailoring Guide
+    href: ./dpu_os_tailoring_guide.md
+  - label: Verification and Deployment
+    href: ./verification_and_deployment.md
diff --git a/docs/en/dpu_os/dpu_os_background_and_requirements.md b/docs/en/dpu_os/dpu_os_background_and_requirements.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c2e1aa548b7ccc4fa18618ad157669e0cfbf369
--- /dev/null
+++ b/docs/en/dpu_os/dpu_os_background_and_requirements.md
@@ -0,0 +1,67 @@
+# DPU-OS Background and Requirements
+
+## Overview
+
+In data center and cloud environments, Moore's Law has reached its limits, leading to a slowdown in the growth of general-purpose CPU computing power. At the same time, network I/O speeds and performance continue to rise, creating a growing disparity between the two. This gap highlights the inability of current general-purpose processors to meet the demands of network, disk, and other I/O processing. In traditional data centers, a significant portion of general-purpose CPU resources is consumed by I/O and management tasks, a phenomenon known as the "Datacenter Tax." AWS estimates that this tax can consume over 30% of a data center's computing power, and in some cases, even more.
+
+The DPU was introduced to address this issue by offloading management, network, storage, and security tasks from the host CPU to dedicated processor chips. This offloading accelerates processing, reduces costs, and improves efficiency. Leading cloud providers like AWS, Alibaba Cloud, and Huawei Cloud have developed custom chips to handle these offloaded tasks, ensuring that 100% of data center computing resources are available for customer use.
+
+The DPU market is experiencing rapid growth, driven by strong demand from cloud providers and big data applications. Numerous Chinese DPU startups have also entered the market with innovative products. This growth presents challenges for cloud and big data providers, who must integrate diverse DPU products, and for DPU manufacturers, who must adapt device drivers to customer-specified operating systems. openEuler, a leading open-source operating system in China, addresses these challenges by offering DPU-OS, a solution built on openEuler that bridges the gap between DPU manufacturers and customers. Furthermore, since DPUs rely on their OS to support service acceleration, DPU-OS requires performance optimization. By leveraging openEuler, DPU-related acceleration capabilities can be embedded into DPU-OS, fostering a robust DPU software ecosystem.
+
+## DPU-OS Requirements Analysis and Design
+
+### Current State of DPUs and OS Requirements
+
+DPUs exhibit several key characteristics and challenges:
+
+- Limited general-purpose processing resources
+
+  DPUs are in the early stages of development, with hardware continuously evolving. Power constraints result in modest hardware specifications. Mainstream DPUs typically feature 8 to 24 CPU cores with limited single-core performance. Memory capacity ranges from 16 to 32GB, and local storage varies from tens to hundreds of gigabytes. The operating system running on DPUs must accommodate these constraints.
+
+- Varied DPU-OS installation methods
+
+  The diversity of DPU manufacturers and products has led to multiple installation and deployment methods. These include PXE network installation, USB installation, and custom methods such as host-delivered installation images.
+
+- High performance requirements
+
+  DPU application scenarios demand high performance. Compared to general-purpose server operating systems, DPU-OS may require specific kernel features or functional components. Examples include vDPA for device passthrough and live migration, vendor-specific driver support, seamless DPU process offloading, customized user-space data plane acceleration tools like DPDK/SPDK/OVS, and DPU management and monitoring tools.
+
+Based on these characteristics, the following requirements for DPU-OS are proposed:
+
+- Ultra-lightweight DPU-OS installation package
+
+  Trim the openEuler system image to eliminate unnecessary packages and optimize system services to reduce resource overhead.
+
+- Customization support and tools
+
+  Provide customization configurations and tools to enable customers or DPU manufacturers to tailor the system. openEuler offers an ISO reference implementation.
+
+- Customized kernel and system for peak performance
+
+  Customize the kernel and drivers to deliver competitive features for DPUs. Enable hardware acceleration through tailored components and optimize system configurations for superior performance. Include DPU-related management and control tools for unified administration.
+
+### DPU-OS Design
+
+**Figure 1** Overall Design of DPU-OS
+
+![dpuos-arch](./figures/dpuos-arch.png)
+
+As illustrated in Figure 1, DPU-OS is structured into five layers:
+
+- **Kernel layer**: Customize the kernel configuration to remove non-essential features and modules, creating a lightweight kernel. Enable specific kernel features to deliver high-performance DPU capabilities.
+
+- **Driver layer**: Trim and customize openEuler native drivers, selecting the minimal required set. Integrate DPU vendor-specific drivers to natively support certain DPU hardware products.
+
+- **System configuration layer**: Optimize system settings through sysctl and proc configurations to ensure peak performance for DPU-related services.
+
+- **Peripheral package layer**: Customize and trim openEuler peripheral packages, selecting the minimal set. Provide a suite of DPU-related custom tools.
+
+- **System service layer**: Streamline native system service startup items to eliminate unnecessary services, minimizing runtime overhead.
+
+This five-layer design achieves the goal of a lightweight, high-performance DPU-OS. While this is a long-term design heavily reliant on the DPU software and hardware ecosystem, the current phase focuses on trimming using openEuler's imageTailor tool.
+
+For detailed steps on DPU-OS trimming, refer to the [DPU-OS Tailoring Guide](./dpu_os_tailoring_guide.md).
For verification and deployment, consult the [DPU-OS Deployment and Verification Guide](./verification_and_deployment.md).
+
+> [!NOTE]NOTE
+>
+> Currently, DPU-OS leverages openEuler's existing kernel and peripheral packages, trimmed using the imageTailor tool to produce a lightweight OS installation image. Future development will integrate additional kernel and peripheral package features based on specific needs.
diff --git a/docs/en/dpu_os/dpu_os_tailoring_guide.md b/docs/en/dpu_os/dpu_os_tailoring_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..489ec5299b6dd504cb23733d10bb0ad1677ad82e
--- /dev/null
+++ b/docs/en/dpu_os/dpu_os_tailoring_guide.md
@@ -0,0 +1,65 @@
+# DPU-OS Tailoring Guide
+
+This document explains how to use imageTailor to trim the DPU-OS installation image using configuration files from the [dpu-utilities repository](https://gitee.com/openeuler/dpu-utilities/tree/master/dpuos). Follow these steps:
+
+## Prepare imageTailor and Required RPM Packages
+
+Install the imageTailor tool by referring to the [imageTailor User Guide](https://docs.openeuler.org/zh/docs/22.03_LTS/docs/TailorCustom/imageTailor%E4%BD%BF%E7%94%A8%E6%8C%87%E5%8D%97.html) and prepare the necessary RPM packages for tailoring.
+
+You can use the openEuler installation image as the RPM source. While **openEuler-22.03-LTS-everything-debug-aarch64-dvd.iso** contains a complete set of RPMs, it is large. Alternatively, use the RPMs from **openEuler-22.03-LTS-aarch64-dvd.iso** along with the install-scripts.noarch package.
+
+Obtain the `install-scripts.noarch` package from the everything repository or download it using yum:
+
+```bash
+yum install -y --downloadonly --downloaddir=./ install-scripts
+```
+
+## Copy DPU-OS Configuration Files
+
+The imageTailor tool is installed in **/opt/imageTailor** by default. Copy the DPU-OS configuration files to the appropriate paths, selecting the correct architecture directory. The DPU-OS tailoring configuration repository supports the x86_64 and aarch64 architectures.
+
+```bash
+cp -rf custom/cfg_dpuos /opt/imageTailor/custom
+cp -rf kiwi/minios/cfg_dpuos /opt/imageTailor/kiwi/minios/cfg_dpuos
+```
+
+## Modify Other Configuration Files
+
+- Add a line for `dpuos` configuration in **kiwi/eulerkiwi/product.conf**:
+
+```bash
+dpuos PANGEA EMBEDDED DISK GRUB2 install_mode=install install_media=CD install_repo=CD selinux=0
+```
+
+- Add a line for `dpuos` configuration in **kiwi/eulerkiwi/minios.conf**:
+
+```bash
+dpuos kiwi/minios/cfg_dpuos yes
+```
+
+- Add a line for `dpuos` configuration in **repos/RepositoryRule.conf**:
+
+```bash
+dpuos 1 rpm-dir euler_base
+```
+
+## Set Passwords
+
+Navigate to **/opt/imageTailor** and update the passwords in the following files:
+
+- **custom/cfg_dpuos/usr_file/etc/default/grub**
+
+- **custom/cfg_dpuos/rpm.conf**
+
+- **kiwi/minios/cfg_dpuos/rpm.conf**
+
+For password generation and modification, refer to the openEuler imageTailor manual section on [Configuring Initial Passwords](../../../tools/community_tools/image_custom/image_tailor/imagetailor_user_guide.md#configuring-initial-passwords).
+
+## Execute the Tailoring Command
+
+Run the following command to perform the tailoring.
The resulting ISO will be saved in **/opt/imageTailor/result**: + +```bash +cd /opt/imageTailor +./mkdliso -p dpuos -c custom/cfg_dpuos --sec --minios force +``` diff --git a/docs/en/dpu_os/figures/dpuos-arch.png b/docs/en/dpu_os/figures/dpuos-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..453370ab07858a13a6c40f8d22e3f608e9ec6b4c Binary files /dev/null and b/docs/en/dpu_os/figures/dpuos-arch.png differ diff --git a/docs/en/dpu_os/overview.md b/docs/en/dpu_os/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..89d83786b9a29940803a05f959d209dc6d9f1c4c --- /dev/null +++ b/docs/en/dpu_os/overview.md @@ -0,0 +1,11 @@ +# Overview + +This document outlines the background requirements and design principles of DPU-OS. It also details the process of creating a DPU-OS image by customizing the openEuler operating system, along with deployment and verification methods. The feature leverages the openEuler ecosystem to deliver a lightweight, high-performance DPU-OS, offering a reference implementation for data processing unit (DPU) scenarios and users. + +This document targets community developers, DPU vendors, and customers using openEuler who want to explore and adopt DPUs. Users should possess the following skills and knowledge: + +- Proficiency in basic Linux operations. + +- Understanding of Linux system construction and deployment fundamentals. + +- Familiarity with the openEuler imageTailor tool for image customization. diff --git a/docs/en/dpu_os/verification_and_deployment.md b/docs/en/dpu_os/verification_and_deployment.md new file mode 100644 index 0000000000000000000000000000000000000000..0a468aac1b889ad465589b008fc2acac7fc12c05 --- /dev/null +++ b/docs/en/dpu_os/verification_and_deployment.md @@ -0,0 +1,38 @@ +# Verification and Deployment + +Once DPU-OS is built, it can be installed and deployed for verification. Since DPU hardware is still in its early stages, you can also use VirtualBox to set up a virtual machine for deployment and testing. + +## Deploying DPU-OS on VirtualBox + +This section outlines the steps to install and deploy DPU-OS using the VirtualBox hypervisor. + +### Preparation for Verification + +Before deploying DPU-OS, ensure the following prerequisites are met: + +- Obtain the DPU-OS ISO file. +- Ensure the host machine has VirtualBox installed. + +### Initial Installation and Startup + +#### Creating a Virtual Machine + +Create a new virtual machine in VirtualBox: + +- Configure the virtual machine with at least 2 CPUs and 4GB of RAM. + +- Allocate a virtual disk with a recommended size of 60GB or larger. + +- Enable EFI boot in the system extension properties. + +- In the storage settings, select the local DPU-OS ISO file as the optical drive. + +- Customize other settings such as network or display as needed. + +#### Starting the Virtual Machine + +Start the newly created virtual machine and choose **Install from ISO** to begin the DPU-OS installation. The installation process is automated and requires no manual input. After installation, the system will reboot automatically. + +Select **Boot From Local Disk** to start DPU-OS. Use the password specified during the DPU-OS creation process. + +By following these steps, you can successfully deploy and verify DPU-OS locally. 
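+
+As a convenience, the VM creation steps in this section can also be scripted with the VBoxManage CLI. The following is an illustrative sketch; the VM name, file names, and the SATA controller label are arbitrary:
+
+```bash
+VBoxManage createvm --name dpuos --register
+VBoxManage modifyvm dpuos --cpus 2 --memory 4096 --firmware efi
+VBoxManage createmedium disk --filename dpuos.vdi --size 61440   # 60GB
+VBoxManage storagectl dpuos --name SATA --add sata
+VBoxManage storageattach dpuos --storagectl SATA --port 0 --device 0 --type hdd --medium dpuos.vdi
+VBoxManage storageattach dpuos --storagectl SATA --port 1 --device 0 --type dvddrive --medium DPU-OS.iso
+VBoxManage startvm dpuos
+```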
diff --git a/docs/zh/dpu_offload/_toc.yaml b/docs/zh/dpu_offload/_toc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..603498856d2981b195bc8d876a150a1017c65d15
--- /dev/null
+++ b/docs/zh/dpu_offload/_toc.yaml
@@ -0,0 +1,15 @@

label: Direct Connection Aggregation User Guide
isManual: true
description: Describes the imperceptible container management plane DPU offload feature on openEuler and how to install and deploy it
sections:
  - label: Direct Connection Aggregation Environment Establishment
    href: ./libvirt_direct_connection_aggregation_environment_establishment.md
  - label: qtfs Shared File System Architecture
    href: ./qtfs_architecture_and_usage.md
  - label: Imperceptible Container Management Plane DPU Offload
    href: ./overview.md
    sections:
      - label: Imperceptible Container Management Plane Offload
        href: ./imperceptible_container_management_plane_offload.md
      - label: Imperceptible Container Management Plane Offload Deployment Guide
        href: ./offload_deployment_guide.md

diff --git a/docs/zh/dpu_offload/config/rexec.service b/docs/zh/dpu_offload/config/rexec.service
new file mode 100644
index 0000000000000000000000000000000000000000..ee9e5e4895adb5c010e3f8d4db6652cfaed3d355
--- /dev/null
+++ b/docs/zh/dpu_offload/config/rexec.service
@@ -0,0 +1,13 @@

[Unit]
Description=Rexec_server Service
After=network.target

[Service]
Type=simple
Environment=CMD_NET_ADDR=tcp://0.0.0.0:7777
ExecStart=/usr/bin/rexec_server
ExecReload=/bin/kill -s HUP $MAINPID
KillMode=process

[Install]
WantedBy=multi-user.target

diff --git a/docs/zh/dpu_offload/figures/arch.png b/docs/zh/dpu_offload/figures/arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b6a7836fd6fab75009e781ac1ed96c73c352f75b
Binary files /dev/null and b/docs/zh/dpu_offload/figures/arch.png differ
diff --git a/docs/zh/dpu_offload/figures/offload-arch.png b/docs/zh/dpu_offload/figures/offload-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..944900b42c13091e4ec40c6d51dc3c95088aa1b8
Binary files /dev/null and b/docs/zh/dpu_offload/figures/offload-arch.png differ
diff --git a/docs/zh/dpu_offload/figures/qtfs-arch.png b/docs/zh/dpu_offload/figures/qtfs-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..40fd7e28707642801ec0b984690a25c08e092ac4
Binary files /dev/null and b/docs/zh/dpu_offload/figures/qtfs-arch.png differ
diff --git a/docs/zh/dpu_offload/imperceptible_container_management_plane_offload.md b/docs/zh/dpu_offload/imperceptible_container_management_plane_offload.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a62743713e799a8baf4baabc86918b4a83c1ad0
--- /dev/null
+++ b/docs/zh/dpu_offload/imperceptible_container_management_plane_offload.md
@@ -0,0 +1,31 @@

# Introduction to Imperceptible Container Management Plane Offload

## Overview

In data center and cloud scenarios, with the end of Moore's law, the growth of general-purpose CPU compute has slowed while network I/O rates and performance keep climbing. The widening gap between the two means that general-purpose processors can no longer keep up with the processing demands of network, disk, and other I/O. In traditional data centers, more and more general-purpose CPU cycles are consumed by I/O and management plane processing; this resource loss is known as the data center tax (Data-center Tax). According to AWS, the data center tax can account for more than 30% of a data center's compute, and in some scenarios even more.

DPUs emerged to free this compute from the host CPU: management plane, network, storage, and security capabilities are offloaded to a dedicated processor chip (the DPU) for accelerated processing, cutting costs and improving efficiency. Mainstream cloud vendors such as AWS, Alibaba Cloud, and Huawei Cloud already use in-house chips to offload the management plane and related data planes, allowing 100% of data center compute resources to be sold to customers.

Management plane processes can be offloaded to the DPU by splitting component source code into two independently running parts, one on the host and one on the DPU. This approach has two problems, however. First, it hurts the component's software compatibility: later version upgrades and maintenance require maintaining the related patches, adding maintenance effort. Second, the offload work cannot be inherited by other components; each newly offloaded component still needs its own code analysis and splitting. To solve these problems, this solution proposes imperceptible DPU offload: an abstraction layer provided by the OS hides the differences of cross-host access between the host and the DPU, so a service process can be offloaded to the DPU with close to zero changes. This work belongs to the generic operating system layer and is independent of upper-layer services, so other services can reuse it when offloading to a DPU.

## Architecture

### Imperceptible container management plane DPU offload architecture

**Figure 1** Imperceptible container management plane DPU offload architecture

![offload-arch](./figures/offload-arch.png)

As shown in Figure 1, after the container management plane is offloaded, management processes such as dockerd and kubelet run on the DPU side while the container processes themselves run on the HOST. The interactions between these processes are guaranteed by capabilities provided at the system layer:

* Communication layer: the DPU and the host may communicate over PCIe or the network. A communication interface layer built on the underlying physical connection provides communication interfaces for upper-layer services.
* Kernel shared file system qtfs: the container management plane components kubelet and dockerd interact with container processes mainly through the file system. The management plane tools must prepare data plane paths such as rootfs and volumes for container processes, and must also control and monitor container resources and state at run time through the proc and cgroup file systems. For details about the shared file system, see the [qtfs introduction](qtfs_architecture_and_usage.md).

* User-mode offload environment: user space uses qtfs to prepare the post-offload runtime environment for the container management plane, remotely mounting the host's container management and runtime directories onto the DPU. Because system management file systems such as proc, sys, and cgroup must be mounted, all of these mounts are performed inside a chroot environment to avoid breaking the DPU's native system functions. In addition, the management plane (running on the DPU) and the container processes (running on the host) still invoke each other, so a remote binary execution tool (rexec) provides that capability.

For the operation steps of imperceptible container management plane offload, see the [deployment guide](./offload_deployment_guide.md).

> [!NOTE]NOTE
>
> The procedure above involves small modifications to the container management plane components and to the rexec tool. These modifications are based on specific versions; for other versions, adapt them to the actual environment. The patches provided in the documents are for verification and guidance only and are not ready for commercial use.

diff --git a/docs/zh/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md b/docs/zh/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md
new file mode 100644
index 0000000000000000000000000000000000000000..2540e49f2a276e0a34ebaccedae3dba1f959d8e7
--- /dev/null
+++ b/docs/zh/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md
@@ -0,0 +1,369 @@

# Direct Connection Aggregation Environment Establishment

## **1** Hardware Preparation

### Test mode

Prepare two physical machines (virtual machines have not been tested) with network connectivity between them.

One acts as the simulated DPU and the other as the simulated HOST. In this document, DPU and HOST refer to these two servers.

> [!NOTE]NOTE
> Because test mode exposes network ports without connection authentication, it carries network security risks. Use it only for internal testing and verification, never in a production environment.

### vsock mode

Requires a DPU plus a HOST, where the DPU supports vsock communication via virtio.

This has not yet been debugged on a real DPU environment with vsock support, so this document currently only describes the test-mode method; the sections below assume test mode by default.

## **2** libvirt Offload Architecture

![arch](./figures/arch.png)

## **3** Environment Setup

### **3.1** QTFS File System Deployment

See the qtfs home page:

The firewall must be disabled for QTFS to establish its connection.

### **3.2** UDSPROXYD Service Deployment

#### 3.2.1 Introduction

udsproxyd is a cross-host unix domain socket proxy service deployed on both the host and the DPU, where the two udsproxyd instances are peers. It enables uds communication between two processes distributed across the host and the DPU, and the communicating processes are unaware of it: if two processes can communicate over uds within one host, they can also do so when stretched across the host and the DPU, with no code adaptation needed; the client end only needs the environment variable `LD_PRELOAD=libudsproxy.so`. As a cross-host unix socket service, udsproxyd can be used directly via `LD_PRELOAD=libudsproxy.so`, or imperceptibly with qtfs support, which requires configuring a whitelist in advance; the two ways to do that are detailed later.

#### 3.2.2 Deployment

First, build udsproxyd in the dpu-utilities project:

```bash
cd qtfs/ipc

make -j UDS_TEST_MODE=1 && make install
```

In the latest version, the engine service on the qtfs server side already integrates the udsproxyd capability, so once qtfs is deployed on the server side there is no need to start udsproxyd there separately. On the client side, start the udsproxyd service on its own:

```bash
nohup /usr/bin/udsproxyd <thread num> <addr> <port> <peer addr> <peer port> 2>&1 &
```

Parameter description:

```bash
thread num: number of threads; currently only a single thread is supported, so use 1.

addr: IP address used by this machine.

port: port used by this machine.

peer addr: IP address of the udsproxyd peer.

peer port: port of the peer.
```

Example:

```bash
nohup /usr/bin/udsproxyd 1 192.168.10.10 12121 192.168.10.11 12121 2>&1 &
```

If the qtfs engine service is not running and you want to test udsproxyd alone, start the peer udsproxyd on the server side as well:

```bash
nohup /usr/bin/udsproxyd 1 192.168.10.11 12121 192.168.10.10 12121 2>&1 &
```
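Before wiring services on top of the proxy, a quick sanity check of the pair can be useful. This is a sketch; in test mode the link uses the IP/port pair shown above, so the configured port should be visible on at least one side:

```bash
# Confirm the udsproxyd process is up on this machine.
pgrep -af udsproxyd

# In test mode, check whether the configured port (12121 in the example) is in use.
ss -anp | grep 12121
```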
#### 3.2.3 Usage

##### 3.2.3.1 Using the udsproxyd service standalone

Add the `LD_PRELOAD=libudsproxy.so` environment variable when starting the client process of a unix socket application so that the glibc connect API is taken over for uds interconnection. In the libvirt offload scenario, copy libudsproxy.so into /usr/lib64 under libvirt's chroot directory for the libvirtd service to use; this step is described later.

##### 3.2.3.2 Using the udsproxyd service imperceptibly

First configure the uds whitelist for qtfs. The whitelist entries are the sock file paths bound by the server end of the unix socket; for example, the server-side files of the unix sockets created by libvirt's virtual machines all live under /var/lib/libvirt, so a whitelist path of /var/lib/libvirt/ must be added. Two methods are available:

a) Load it with the qtcfg configuration tool. Build the tool in the qtfs/qtinfo directory.

On the qtfs client, run:

```bash
make role=client
make install
```

On the qtfs server, run:

```bash
make role=server
make install
```

The configuration tool is installed automatically. Then use the qtcfg command to configure the whitelist. Assuming the entry to add is "/var/lib/libvirt/", run:

```bash
qtcfg -x /var/lib/libvirt/
```

Query the whitelist:

```bash
qtcfg -z
```

Delete a whitelist entry:

```bash
qtcfg -y 0
```

When deleting, the parameter is the index listed by the whitelist query.

b) Add entries through a configuration file. This must be done before the qtfs or qtfs_server kernel module is loaded; the module reads the file during initialization to set up the whitelist.

> [!NOTE]NOTE
> The whitelist prevents unrelated unix socket connections from being remoted by mistake, which would cause errors or waste resources. Keep the whitelist as precise as possible: for the libvirt scenario in this document, /var/lib/libvirt/ is a good entry, while adding /var/lib/, /var/, or the root directory carries considerable risk.

### **3.3** REXEC Service Deployment

#### 3.3.1 Introduction

rexec is a remote execution component written in C, consisting of a rexec client and a rexec server. The server end is a resident service process and the client end is a binary. When executed, the client establishes a uds connection with the server via the udsproxyd service, and the resident server process launches the specified program on the server machine. In libvirt virtualization offload, libvirtd is offloaded to the DPU, and when it needs to launch a virtual machine qemu process on the HOST it invokes the rexec client to do so remotely.

#### 3.3.2 Deployment

##### 3.3.2.1 Configure the environment variables and whitelist

Configure the rexec server whitelist on the host side: place the whitelist file in the /etc/rexec/ directory and change its permissions to read-only:

```bash
chmod 400 /etc/rexec/whitelist
```

For testing only, the whitelist configuration can be skipped: delete this file and restart the rexec_server process, and no whitelist restrictions apply.

After downloading the dpu-utilities code, enter the qtfs/rexec directory and run `make && make install` to install all required rexec binaries, the two executables `rexec` and `rexec_server`, into /usr/bin.

Before starting the rexec_server service on the server side, check whether the /var/run/rexec directory exists and create it if not:

```bash
mkdir /var/run/rexec
```

##### 3.3.2.2 Launching the service

The server side can start the rexec_server service in either of two ways.

- Method 1: configure a systemd service.

Add a rexec.service file under /usr/lib/systemd/system/ with the following content:

[rexec.service](./config/rexec.service)

Then manage the rexec service with systemctl.

When configuring the service for the first time:

```bash
systemctl daemon-reload

systemctl enable --now rexec
```

To restart the service later:

```bash
systemctl stop rexec

systemctl start rexec
```

- Method 2: launch it manually in the background.

```bash
nohup /usr/bin/rexec_server 2>&1 &
```
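Before moving on to libvirt, a quick smoke test of the remote execution path can help. This is a sketch; it assumes the udsproxyd link from section 3.2 is up and that rexec relays the remote program's output back to the caller:

```bash
# Run a trivial binary on the HOST from the DPU side; if the path works,
# this should report the HOST's hostname rather than the DPU's.
rexec /usr/bin/hostname
```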
### **3.4** libvirt Service Deployment

#### 3.4.1 HOST-side deployment

No extra deployment is needed on the HOST: just install the virtual machine launch environment and libvirt (installing libvirt mainly creates the required directories):

```bash
yum install -y qemu libvirt edk2-aarch64 # edk2-aarch64 is needed for VM startup on ARM.
```

The HOST holds the virtual machine images, which are later shared with libvirt by mounting them to the client end via qtfs.

#### 3.4.2 DPU-side deployment

##### 3.4.2.1 Create the chroot environment

a) Download a qcow2 image from the openEuler website, for example version 23.09.

b) Mount the qcow2 image:

```bash
cd /root/

mkdir p2 new_root_origin new_root

modprobe nbd max_part=8

qemu-nbd -c /dev/nbd0 xxx.qcow2

mount /dev/nbd0p2 /root/p2

cp -rf /root/p2/* /root/new_root_origin/

umount /root/p2

qemu-nbd -d /dev/nbd0
```

c) new_root_origin now holds the unpacked image root directory. Bind-mount new_root onto it as the chroot root mount point:

```bash
mount --bind /root/new_root_origin /root/new_root
```

##### 3.4.2.2 Install libvirt

This section describes building from source with the patch applied; if an RPM package is provided, follow the corresponding installation method instead.

a) Enter the chroot environment and install the build environment and common tools:

```bash
yum groupinstall "Development tools" -y
yum install -y vim meson qemu qemu-img strace edk2-aarch64 tar
```

edk2-aarch64 is required for VM startup on ARM.

b) Install the dependency packages required to build libvirt:

```bash
yum install -y rpcgen python3-docutils glib2-devel gnutls-devel libxml2-devel libpciaccess-devel libtirpc-devel yajl-devel systemd-devel dmidecode glusterfs-api numactl
```

c) Download the libvirt-x.x.x source package:

d) Obtain the direct connection aggregation libvirt patch:

e) Extract the source package into a directory inside the chroot environment, such as /home, and apply the patch.

f) Enter the libvirt-x.x.x directory and configure the build:

```bash
meson build --prefix=/usr -Ddriver_remote=enabled -Ddriver_network=enabled -Ddriver_qemu=enabled -Dtests=disabled -Ddocs=enabled -Ddriver_libxl=disabled -Ddriver_esx=disabled -Dsecdriver_selinux=disabled -Dselinux=disabled
```

g) Build and install:

```bash
ninja -C build install
```

##### 3.4.2.3 Start the libvirtd service

In libvirt direct connection aggregation offload mode, the libvirtd service must be started from inside the chroot, and the libvirtd service outside the chroot must be stopped first.

a) Place the VM trampoline script in /usr/bin and /usr/libexec inside the chroot environment: [qemu-kvm](./scripts/qemu-kvm). It replaces the original binary of the same name; this trampoline script invokes rexec to launch the remote virtual machine.

> [!NOTE]NOTE
> In the XML used by virsh, the \<emulator\> under \<devices\> must be set to qemu-kvm. If it is set to something else, either change it to qemu-kvm, or replace the binary that \<emulator\> points to with the trampoline script and adjust the script content accordingly.

b) Copy the libudsproxy.so produced by the udsproxyd build into /usr/lib64 inside this chroot. If the qtfs uds whitelist is configured for the udsproxyd service, this is not needed.

c) Place the rexec binary built earlier into /usr/bin inside this chroot.

d) Configure the chroot mount environment; several directories need to be mounted, using the following scripts:

- [virt_start.sh](./scripts/virt_start.sh) is the setup script. In virt_start.sh, manually set the qtfs ko dir to the location of the built module and the host ip address to the correct HOST address.

- [virt_umount.sh](./scripts/virt_umount.sh) is the teardown script.

e) The mount paths in the scripts follow the directory locations and names created earlier in this document; if you changed them, adapt the scripts accordingly.

f) After the chroot environment is configured, enter it and start libvirtd manually.

If the qtfs udsproxyd whitelist is not configured:

```bash
LD_PRELOAD=/usr/lib64/libudsproxy.so virtlogd -d
LD_PRELOAD=/usr/lib64/libudsproxy.so libvirtd -d
```

If the qtfs udsproxyd whitelist is configured, the LD_PRELOAD prefix is not needed:

```bash
virtlogd -d
libvirtd -d
```

To check whether the whitelist is configured, run the following in a window outside the chroot:

```bash
qtcfg -z
```

and check whether the listed whitelist contains /var/lib/libvirt/.

### **3.5** Launching a Virtual Machine

Once the services are deployed, virtual machine lifecycle management can be performed from the DPU side.

#### 3.5.1 Defining the VM

a) Place the VM boot image in a directory on the HOST side, for example:

```bash
/home/VMs/Domain_name
```

b) Use qtfs to mount this directory on the DPU side:

```bash
mount -t qtfs /home/VMs /home/VMs
```

c) Use `/home/VMs/Domain_name` as the boot image in the XML, so that the DPU and HOST sides both see the same image file (Domain_name is the name of the VM domain).

d) Check that \<emulator\> in the XML points to the trampoline script.

e) Run:

```bash
virsh define xxx.xml
```

#### 3.5.2 Starting the VM

```bash
virsh start domain
```

## **4** Environment Reset

Because libvirt shares some directories between the DPU and the HOST, all of these directories must be unmounted before tearing down the environment. Normally, stop the libvirtd and virtlogd processes first, then call the virt_umount script. If a virtual machine is still running on the HOST, kill it first, or the directories cannot be unmounted.

## **5** Troubleshooting Hints

1. libvirt build failure: check whether the dependency packages are fully installed. If the chroot has external or host directories mounted, the build may also fail; unmount them first.

2. QTFS mount failure: the engine process on the server side may not be running, or the firewall may not be disabled, so the qtfs connection cannot be established.

3. VM define failure: check whether the emulator entry in the XML points to the trampoline script, and whether the VM image has been mounted to the DPU via qtfs and is visible there under the same path as on the HOST.

4. VM start failure: check whether the libvirtd and virtlogd services are running, whether the rexec service is running, whether the trampoline process was launched, and whether qemu-kvm reported an error when starting.
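When triaging the failures above, it can help to check the key processes on both sides in one pass. This is a sketch; run the first pair inside the chroot on the DPU and the second pair on the HOST:

```bash
# DPU side (inside the chroot): the offloaded management daemons.
pgrep -af libvirtd
pgrep -af virtlogd

# HOST side: the remote execution service and the qtfs engine.
pgrep -af rexec_server
pgrep -af engine
```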
diff --git a/docs/zh/dpu_offload/offload_deployment_guide.md b/docs/zh/dpu_offload/offload_deployment_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..c3a0b075d73e5ca72b8ce12faae47fea2d9e4083
--- /dev/null
+++ b/docs/zh/dpu_offload/offload_deployment_guide.md
@@ -0,0 +1,166 @@

# Imperceptible Container Management Plane Offload Deployment Guide

> [!NOTE]NOTE
>
> This guide involves small modifications to the container management plane components and to the rexec tool. These modifications are based on specific versions; for other versions, adapt them to the actual environment. The patches provided in this document are for verification and guidance only and are not ready for commercial use.
>
> Communication between the shared file systems currently goes over the network, so the setup can be simulated and verified with two network-connected physical machines or VMs.
>
> Before verifying, it is recommended to set up a working kubernetes cluster and container runtime environment, and to verify offload of the management plane processes of a single node in it. The offload environment (DPU) can be a network-connected physical machine or VM.

## Introduction

The container management plane refers to container management tools such as kubernetes, dockerd, containerd, and isulad. Container management plane offload means moving the management plane to a machine (in this scenario a DPU, a hardware unit with an independent runtime environment) other than the machine where the containers run (referred to as the HOST below).

We use the shared file system qtfs to mount the HOST directories related to container operation onto the DPU, so that the container management plane tools (running on the DPU) can access those directories and prepare the runtime environment for the containers (running on the HOST). Because special file systems such as remote proc and sys need to be mounted, we create a dedicated rootfs as the runtime environment for kubernetes and dockerd (referred to as `/another_rootfs` below).

Container launch, deletion, and similar operations are performed through rexec, so that the container management plane and the containers can be split across two machines and containers can be managed remotely.

## Component Patches

### rexec

rexec is a remote execution tool written in Go, adapted from the [rexec](https://github.com/docker/libchan/tree/master/examples/rexec) example under docker/libchan. It invokes a binary on a remote machine; for ease of use, rexec was extended with environment variable passing and monitoring of the original process's exit.

To use the rexec tool, start the service process on the server side with `CMD_NET_ADDR=tcp://0.0.0.0:<port> rexec_server`, then on the client side run `CMD_NET_ADDR=tcp://<server-ip>:<port> rexec [command to execute]`. This calls rexec_server to run the command and waits for the command result to return.

### dockerd changes

The dockerd changes are based on version 18.09.

The hook-based invocation of libnetwork-setkey is temporarily commented out; this does not affect container launch. Also, to keep docker load working, an incorrect return in the mount function in mounter_linux.go is commented out.

Finally, because in the container management plane's runtime environment /proc is mounted to the server-side proc file system while the local proc file system is mounted at /local_proc, all accesses in dockerd and containerd to `/proc/self/xxx`, `/proc/getpid()/xxx`, and related file system paths are changed from `/proc` to `/local_proc`.

### containerd changes

The containerd changes are based on containerd-1.2-rc.1.

When obtaining mountinfo, `/proc/self/mountinfo` only reaches dockerd's own local mountinfo and cannot reach the server side's, so it is changed to `/proc/1/mountinfo`, obtaining the server's mountinfo by reading that of server process 1.

In containerd-shim, the unix socket used to communicate with containerd is replaced with TCP. containerd obtains the IP of the environment where containerd-shim runs, that is, the server IP, through the `SHIM_HOST` environment variable, and derives a port number from the shim's hash to use for communication when launching containerd-shim.

Also, signaling containerd-shim is changed from a system call to remotely invoking the kill command, ensuring that docker's container-killing behavior executes correctly.

### kubernetes changes

kubelet needs no functional changes for now. You may hit an error where the container QoS manager fails its first setup; this error does not affect the subsequent pod launch flow and can be ignored for now.

## Container Management Plane Offload Procedure

Start rexec_server on both the server and the client. The server-side rexec_server is used by the client to launch containerd-shim via rexec when creating containers, while the client-side rexec_server serves containerd-shim's calls back to dockerd and containerd.

### Server side

Create the folders required by the container management plane, then insert qtfs_server.ko and start the engine process (see the sketch at the end of this section).

In addition, create the rexec script /usr/bin/dockerd on the server side:

```shell
#!/bin/bash
CMD_NET_ADDR=tcp://<client-ip>:<port> rexec /usr/bin/dockerd $*
```
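The module/engine step mentioned at the start of this section can look like the following. This is a sketch; the module path is an illustrative placeholder, and the engine arguments follow the example in the qtfs documentation later in this manual:

```shell
#!/bin/bash
# Load the qtfs server module (path and IP are placeholders for your build/network).
insmod /YOUR/QTFS/PATH/qtfs_server.ko qtfs_server_ip=<server-ip> qtfs_server_port=12345 qtfs_log_level=WARN

# Start the user-space engine that serves qtfs requests.
nohup /YOUR/QTFS/PATH/engine 4096 16 2>&1 &
```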
### Client side

Prepare a rootfs as the runtime environment for dockerd and containerd, and use the following script to mount the server directories needed by dockerd and containerd onto the client. Make sure every remote directory mounted in the script exists on both the server and the client.

```shell
#!/bin/bash
mkdir -p /another_rootfs/var/run/docker/containerd
iptables -t nat -N DOCKER
echo "---------insmod qtfs ko----------"
insmod /YOUR/QTFS/PATH/qtfs.ko qtfs_server_ip=<server-ip> qtfs_log_level=INFO

# proc inside the chroot is replaced by the DPU's shared proc file system;
# the real local proc file system is mounted at local_proc instead.
mount -t proc proc /another_rootfs/local_proc/

# Bind the chroot environment to the outside environment for easy configuration and operation.
mount --bind /var/run/ /another_rootfs/var/run/
mount --bind /var/lib/ /another_rootfs/var/lib/
mount --bind /etc /another_rootfs/etc

mkdir -p /another_rootfs/var/lib/isulad

# Create and mount the dev, sys, and cgroup file systems inside the chroot.
mount -t devtmpfs devtmpfs /another_rootfs/dev/
mount -t sysfs sysfs /another_rootfs/sys
mkdir -p /another_rootfs/sys/fs/cgroup
mount -t tmpfs tmpfs /another_rootfs/sys/fs/cgroup
list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset"
for i in $list
do
    echo $i
    mkdir -p /another_rootfs/sys/fs/cgroup/$i
    mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /another_rootfs/sys/fs/cgroup/$i
done

## common system dir
mount -t qtfs -o proc /proc /another_rootfs/proc
echo "proc"
mount -t qtfs /sys /another_rootfs/sys
echo "cgroup"

# Mount the shared directories required by the container management plane.
mount -t qtfs /var/lib/docker/containers /another_rootfs/var/lib/docker/containers
mount -t qtfs /var/lib/docker/containerd /another_rootfs/var/lib/docker/containerd
mount -t qtfs /var/lib/docker/overlay2 /another_rootfs/var/lib/docker/overlay2
mount -t qtfs /var/lib/docker/image /another_rootfs/var/lib/docker/image
mount -t qtfs /var/lib/docker/tmp /another_rootfs/var/lib/docker/tmp
mkdir -p /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/
mount -t qtfs /run/containerd/io.containerd.runtime.v1.linux/ /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/
mkdir -p /another_rootfs/var/run/docker/containerd
mount -t qtfs /var/run/docker/containerd /another_rootfs/var/run/docker/containerd
mount -t qtfs /var/lib/kubelet/pods /another_rootfs/var/lib/kubelet/pods
```

In /another_rootfs, create the following scripts to support cross-host operations.

* /another_rootfs/usr/local/bin/containerd-shim

```shell
#!/bin/bash
CMD_NET_ADDR=tcp://<server-ip>:<port> /usr/bin/rexec /usr/bin/containerd-shim $*
```

* /another_rootfs/usr/local/bin/remote_kill

```shell
#!/bin/bash
CMD_NET_ADDR=tcp://<server-ip>:<port> /usr/bin/rexec /usr/bin/kill $*
```

* /another_rootfs/usr/sbin/modprobe

```shell
#!/bin/bash
CMD_NET_ADDR=tcp://<server-ip>:<port> /usr/bin/rexec /usr/sbin/modprobe $*
```

After chrooting into the rootfs required by dockerd and containerd, launch dockerd and containerd with the following commands.

* containerd

```shell
#!/bin/bash
SHIM_HOST=<server-ip> containerd --config /var/run/docker/containerd/containerd.toml --address /var/run/containerd/containerd.sock
```

* dockerd

```shell
#!/bin/bash
SHIM_HOST=<server-ip> CMD_NET_ADDR=tcp://<server-ip>:<port> /usr/bin/dockerd --containerd /var/run/containerd/containerd.sock
```

* kubelet

Inside the chroot environment, launch kubelet with its original parameters.

Because /var/run/ and /another_rootfs/var/run/ are bound together, containers can be managed from the normal rootfs through the docker.sock interface with docker.

At this point the container management plane has been offloaded to the DPU. Containers can be created and deleted with the usual docker operations, pods can be scheduled and destroyed on this node with kubectl, and the actual container service processes run on the HOST side.

> [!NOTE]NOTE
>
> This guide only covers offloading the container management plane processes; it does not cover offloading container networking or data volumes. If those are needed, additional network or storage offload capabilities are required. This guide supports cross-node container launch without networking or storage.

diff --git a/docs/zh/dpu_offload/overview.md b/docs/zh/dpu_offload/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f4ee65a44030b0e956c71b4500e53cf8a095847
--- /dev/null
+++ b/docs/zh/dpu_offload/overview.md
@@ -0,0 +1,11 @@

# Imperceptible Container Management Plane DPU Offload Guide

This document describes the imperceptible container management plane DPU offload feature of the openEuler operating system and how to install and deploy it. Through a unified abstraction layer provided by the operating system, the feature hides the differences of cross-host resource access for the container management plane, so the management plane services can be offloaded to the DPU imperceptibly.

This document is intended for community developers, open source enthusiasts, and partners who use openEuler and want to understand and use the operating system kernel and containers. Users need the following experience and skills:

- Familiarity with basic Linux operations.

- Understanding of the basic mechanisms of the Linux kernel file system.

- Some knowledge of kubernetes and docker, and familiarity with deploying and using them.

diff --git a/docs/zh/dpu_offload/qtfs_architecture_and_usage.md b/docs/zh/dpu_offload/qtfs_architecture_and_usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc24e9ce06588570651ca3f131e73aea34f61922
--- /dev/null
+++ b/docs/zh/dpu_offload/qtfs_architecture_and_usage.md
@@ -0,0 +1,67 @@

# qtfs

## Introduction

qtfs is a shared file system project that can be deployed on a host-DPU hardware architecture or between two servers. It works in client-server mode, letting the client access specified file systems on the server through qtfs with the same experience as local file access.

Features of qtfs:

+ Mount point propagation.

+ Sharing of special file systems such as proc, sys, and cgroup.

+ Shared remote file reading and writing.

+ Remote mounting of the server's file systems on the client.

+ Customized handling of special files.

+ Remote fifo and unix sockets, with epoll support, so the client and server can use them as if communicating locally.

+ Underlying communication over the PCIe protocol on host-DPU architectures, with performance far better than the network.

+ Developed as kernel modules, with no intrusive kernel modifications.

## Architecture

Overall architecture:

![qtfs-arch](./figures/qtfs-arch.png)

## Installation

Directory layout:

+ **qtfs**: client kernel module code; build the client ko directly in this directory.

+ **qtfs_server**: server kernel module code; build the server ko and related programs directly in this directory.

+ **qtinfo**: diagnostic tool for querying the file system's working state and changing the log level, among other things.

+ **demo**, **test**, **doc**: test programs, demos, and project documentation.

+ Root directory: common module code shared by the client and server.

First, set up a kernel build environment on two servers (or VMs):

1. The kernel version must be 5.10 or later.
2. Install the kernel development package: yum install kernel-devel.

Server-side installation:

1. cd qtfs_server
2. make clean && make
3. insmod qtfs_server.ko qtfs_server_ip=x.x.x.x qtfs_server_port=12345 qtfs_log_level=WARN
4. ./engine 4096 16

Client-side installation:

1. cd qtfs
2. make clean && make
3. insmod qtfs.ko qtfs_server_ip=x.x.x.x qtfs_server_port=12345 qtfs_log_level=WARN
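Before mounting anything, it is worth confirming that both ends are actually up (a sketch):

```bash
# Client and server: the corresponding module should appear after insmod.
lsmod | grep qtfs

# Server only: the engine process must be running to serve requests.
pgrep -af engine
```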
## Usage

After installation, mount the server's file system on the client to make it visible there, for example: `mount -t qtfs / /root/mnt/`

After entering "/root/mnt", the client can see all of the server's files and operate on them.
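For the special file systems listed among the features, mounting works the same way apart from the proc option. The following is a sketch consistent with the deployment scripts elsewhere in this guide; the target paths are illustrative and must exist beforehand:

```bash
# Share an ordinary directory tree from the server.
mount -t qtfs /var/lib/docker /root/mnt/docker

# Share the server's proc file system (the -o proc option used by the offload scripts).
mount -t qtfs -o proc /proc /root/mnt/proc
```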
diff --git a/docs/zh/dpu_offload/scripts/qemu-kvm b/docs/zh/dpu_offload/scripts/qemu-kvm
new file mode 100644
index 0000000000000000000000000000000000000000..e869371be109b57f59709fc23bc5b1cb2002cfbf
--- /dev/null
+++ b/docs/zh/dpu_offload/scripts/qemu-kvm
@@ -0,0 +1,3 @@

#!/bin/bash

exec /usr/bin/rexec /usr/bin/qemu-kvm $*

diff --git a/docs/zh/dpu_offload/scripts/virt_start.sh b/docs/zh/dpu_offload/scripts/virt_start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..06ca194b7a639a947b6e395f116beeba7c897459
--- /dev/null
+++ b/docs/zh/dpu_offload/scripts/virt_start.sh
@@ -0,0 +1,48 @@

#!/bin/bash
insmod ./qtfs.ko qtfs_server_ip=192.168.10.11 qtfs_log_level=NONE

systemctl stop libvirtd

if [ ! -d "/root/new_root/local_proc" ]; then
    mkdir -p /root/new_root/local_proc
fi
if [ ! -d "/root/new_root/local" ]; then
    mkdir -p /root/new_root/local
fi
mount -t proc proc /root/new_root/local_proc/
mount -t proc proc /root/new_root/local/proc
mount -t sysfs sysfs /root/new_root/local/sys
mount --bind /var/run/ /root/new_root/var/run/
mount --bind /var/lib/ /root/new_root/var/lib/
mount --bind /var/cache/ /root/new_root/var/cache
mount --bind /etc /root/new_root/etc

mkdir -p /root/new_root/home/VMs/
mount -t qtfs /home/VMs/ /root/new_root/home/VMs/

mount -t qtfs /var/lib/libvirt /root/new_root/var/lib/libvirt

mount -t devtmpfs devtmpfs /root/new_root/dev/
mount -t hugetlbfs hugetlbfs /root/new_root/dev/hugepages/
mount -t mqueue mqueue /root/new_root/dev/mqueue/
mount -t tmpfs tmpfs /root/new_root/dev/shm

mount -t sysfs sysfs /root/new_root/sys
mkdir -p /root/new_root/sys/fs/cgroup
mount -t tmpfs tmpfs /root/new_root/sys/fs/cgroup
list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset"
for i in $list
do
    echo $i
    mkdir -p /root/new_root/sys/fs/cgroup/$i
    mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /root/new_root/sys/fs/cgroup/$i
done

## common system dir
mount -t qtfs -o proc /proc /root/new_root/proc
echo "proc"

mount -t qtfs /sys /root/new_root/sys
echo "cgroup"
mount -t qtfs /dev/pts /root/new_root/dev/pts
mount -t qtfs /dev/vfio /root/new_root/dev/vfio

diff --git a/docs/zh/dpu_offload/scripts/virt_umount.sh b/docs/zh/dpu_offload/scripts/virt_umount.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4adddec913c23069c6bffddec0bf1770f8c5ce71
--- /dev/null
+++ b/docs/zh/dpu_offload/scripts/virt_umount.sh
@@ -0,0 +1,20 @@

#!/bin/bash

umount /root/new_root/dev/hugepages
umount /root/new_root/etc
umount /root/new_root/home/VMs
umount /root/new_root/local_proc
umount /root/new_root/local/proc
umount /root/new_root/var/lib/libvirt
umount /root/new_root/var/lib
umount /root/new_root/*
umount /root/new_root/dev/pts
umount /root/new_root/dev/mqueue
umount /root/new_root/dev/shm
umount /root/new_root/dev/vfio
umount /root/new_root/dev
rmmod qtfs

umount /root/new_root/sys/fs/cgroup/*
umount /root/new_root/sys/fs/cgroup
umount /root/new_root/sys

diff --git a/docs/zh/dpu_os/_toc.yaml b/docs/zh/dpu_os/_toc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..46bee7c9218f7c6738954981bd8573ce51d90b7f
--- /dev/null
+++ b/docs/zh/dpu_os/_toc.yaml
@@ -0,0 +1,10 @@

label: DPU_OS
isManual: true
description: Describes how to build a trimmed DPU_OS image based on openEuler and how to deploy and verify it
sections:
  - label: DPU_OS Background and Requirements
    href: ./dpu_os_background_and_requirements.md
  - label: DPU_OS Tailoring Guide
    href: ./dpu_os_tailoring_guide.md
  - label: Verification and Deployment
    href: ./verification_and_deployment.md

diff --git a/docs/zh/dpu_os/dpu_os_background_and_requirements.md b/docs/zh/dpu_os/dpu_os_background_and_requirements.md
new file mode 100644
index 0000000000000000000000000000000000000000..e960864c9335c9fb2a917d7b86746e1b6f16e641
--- /dev/null
+++ b/docs/zh/dpu_os/dpu_os_background_and_requirements.md
@@ -0,0 +1,67 @@

# DPU-OS Background and Requirements

## Overview

In data center and cloud scenarios, Moore's law has broken down: the growth of general-purpose CPU compute has slowed while network I/O rates and performance keep climbing, and the widening gap between the two means that general-purpose processors can no longer keep up with the processing demands of network, disk, and other I/O. In traditional data centers, more and more general-purpose CPU cycles are taken by I/O and management plane processing; this resource loss is known as the data center tax (Datacenter Tax). According to AWS, the data center tax may account for more than 30% of data center compute, and in some scenarios even more.

DPUs (Data Processing Units) emerged to free this compute from the host CPU: management plane, network, storage, and security capabilities are offloaded to a dedicated processor chip for accelerated processing, cutting costs and improving efficiency. Mainstream cloud vendors such as AWS, Alibaba Cloud, and Huawei Cloud use in-house chips to offload the management plane and related data planes, selling 100% of data center compute resources to customers.

DPU development is booming: cloud and big data vendors have strong demand for DPUs in related scenarios, and many domestic DPU startups are launching products. Against this background, cloud and big data vendors need to integrate diverse DPU products, while DPU vendors face the problem of adapting device drivers to each customer's specified operating system. As a leading open source operating system, openEuler addresses this adaptation problem between DPU vendors and customers through DPU-OS built on openEuler. In addition, the OS on the DPU carries part of the service acceleration, so DPU-OS needs performance optimization; DPU-related acceleration capabilities can be built on openEuler and embedded into DPU-OS, building the DPU software ecosystem.

## DPU-OS Requirements Analysis and Design

### DPU status quo and OS requirements

DPUs generally share the following characteristics and problems:

* Limited general-purpose processing resources

  DPUs are still at an early stage and the hardware keeps evolving; power supply constraints keep hardware specifications low. Mainstream DPUs have few general-purpose CPU cores, roughly 8 to 24, with weak single-core performance. Memory is limited, typically 16-32 GB, and local storage ranges from tens to hundreds of GB. The operating system running on the DPU must respect these constraints.

* Diverse DPU-OS installation methods

  DPU vendors and products vary widely, and so do the OS installation and deployment methods, including PXE network installation, USB installation, and other custom methods (such as installation images delivered from the HOST).

* DPU performance requirements

  DPU application scenarios demand high performance. Compared with a general-purpose server OS, DPU-OS may have special requirements for kernel features or components, such as the vDPA feature for device passthrough live migration, vendor-specific driver support, imperceptible offload of DPU processes, customized and optimized user-mode data plane acceleration tools such as DPDK/SPDK/OVS, and DPU management and monitoring tools.

Given this status quo, the requirements for DPU-OS are:

* An extremely lightweight DPU-OS installation package

  Trim the openEuler system image to cut the space taken by unnecessary packages, and optimize system services to reduce baseline resource overhead.

* Tailoring configuration and tool support

  Provide tailoring configuration and tooling that customers or DPU vendors can customize to their needs; openEuler provides an ISO reference implementation.

* A customized kernel and system for peak performance

  Provide competitive DPU kernel features through a customized kernel and drivers; enable DPU hardware acceleration through customized acceleration components; optimize system configuration for better performance; and offer DPU management and control tools for unified management.

### DPU-OS design

**Figure 1** Overall DPU-OS design

![dpuos-arch](./figures/dpuos-arch.png)

As shown in Figure 1, DPU-OS has a five-layer design:

* Kernel layer: customize the kernel config to trim unnecessary kernel features and modules for a lightweight kernel; enable specific kernel features to provide high-performance DPU kernel capabilities.

* Driver layer: trim and customize native openEuler drivers to a minimal set; integrate DPU vendor drivers to natively support selected DPU hardware products.

* System configuration layer: configure sysctl and proc settings for optimal performance of DPU-related services.

* Peripheral package layer: trim and customize openEuler peripheral packages to a minimal set; provide a DPU-related tool collection.

* System service layer: optimize the startup items of native system services to reduce unnecessary running services and minimize runtime baseline overhead.

This five-layer design achieves the goal of a lightweight, high-performance DPU-OS. It is a relatively long-term design with strong dependence on the DPU software and hardware ecosystem; the current first phase is tailoring with openEuler imageTailor.

For tailoring steps, see the [DPU-OS Tailoring Guide](./dpu_os_tailoring_guide.md); for verification and deployment, see the [DPU-OS Deployment and Verification Guide](./verification_and_deployment.md).

> [!NOTE]NOTE
>
> At the current stage, DPU-OS is based on openEuler's existing kernel and peripheral packages, trimmed with the imageTailor tool to provide a lightweight OS installation image. Kernel and peripheral package features can be developed and integrated later based on actual needs.

diff --git a/docs/zh/dpu_os/dpu_os_tailoring_guide.md b/docs/zh/dpu_os/dpu_os_tailoring_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..e42110eb1445ebee9a88a82d3870377ecacdd99e
--- /dev/null
+++ b/docs/zh/dpu_os/dpu_os_tailoring_guide.md
@@ -0,0 +1,65 @@

# DPU-OS Tailoring Guide

This document explains how to use `imageTailor` with the `dpuos` configuration files from the [dpu-utilities repository](https://gitee.com/openeuler/dpu-utilities/tree/master/dpuos) to produce the `dpuos` installation image. Follow these steps:

## Prepare imageTailor and the required RPM packages

Install the `imageTailor` tool by referring to the [imageTailor User Guide](https://docs.openeuler.org/zh/docs/22.03_LTS/docs/TailorCustom/imageTailor%E4%BD%BF%E7%94%A8%E6%8C%87%E5%8D%97.html) and prepare the RPM packages needed for tailoring.

An openEuler installation image can serve as the RPM source. `openEuler-{version}-everything-debug-aarch64-dvd.iso` has the most complete set of RPMs but is very large; alternatively, use the RPMs from `openEuler-{version}-aarch64-dvd.iso` plus the `install-scripts.noarch` package.

The `install-scripts.noarch` package can be obtained from the everything repository, or downloaded with yum:

```bash
yum install -y --downloadonly --downloaddir=./ install-scripts
```
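One way to stage the RPM source from the ISO is to loop-mount it and copy the packages into a local directory for imageTailor to use. This is a sketch; the mount point and target directory are illustrative, and the ISO is assumed to carry its RPMs under Packages/:

```bash
# Extract the RPMs from the openEuler ISO into a local rpm-dir source.
mount -o loop openEuler-22.03-LTS-aarch64-dvd.iso /mnt
mkdir -p ./rpm-dir && cp -r /mnt/Packages/*.rpm ./rpm-dir/
umount /mnt
```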
## Copy the dpuos configuration files

The `imageTailor` tool is installed under `/opt/imageTailor` by default. Run the following commands to copy the `dpuos` configuration to the corresponding paths, choosing the directory that matches your architecture. The DPU-OS tailoring configuration repository currently supports the x86_64 and aarch64 architectures.

```bash
cp -rf custom/cfg_dpuos /opt/imageTailor/custom
cp -rf kiwi/minios/cfg_dpuos /opt/imageTailor/kiwi/minios/cfg_dpuos
```

## Modify other configuration files

* Add a `dpuos` line to `kiwi/eulerkiwi/product.conf`:

```bash
dpuos PANGEA EMBEDDED DISK GRUB2 install_mode=install install_media=CD install_repo=CD selinux=0
```

* Add a `dpuos` line to `kiwi/eulerkiwi/minios.conf`:

```bash
dpuos kiwi/minios/cfg_dpuos yes
```

* Add a `dpuos` line to `repos/RepositoryRule.conf`:

```bash
dpuos 1 rpm-dir euler_base
```

## Set passwords

Go to the `/opt/imageTailor` subdirectories and change the passwords in the following three files:

* `custom/cfg_dpuos/usr_file/etc/default/grub`

* `custom/cfg_dpuos/rpm.conf`

* `kiwi/minios/cfg_dpuos/rpm.conf`

For password generation and modification, see the [Configuring Initial Passwords](https://docs.openeuler.org/zh/docs/22.03_LTS/docs/TailorCustom/imageTailor%E4%BD%BF%E7%94%A8%E6%8C%87%E5%8D%97.html#%E9%85%8D%E7%BD%AE%E5%88%9D%E5%A7%8B%E5%AF%86%E7%A0%81) section of the openEuler imageTailor manual.

## Run the tailoring command

Run the following commands to perform the tailoring; the resulting ISO is saved under `/opt/imageTailor/result`:

```bash
cd /opt/imageTailor
./mkdliso -p dpuos -c custom/cfg_dpuos --sec --minios force
```

diff --git a/docs/zh/dpu_os/figures/dpuos-arch.png b/docs/zh/dpu_os/figures/dpuos-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..453370ab07858a13a6c40f8d22e3f608e9ec6b4c
Binary files /dev/null and b/docs/zh/dpu_os/figures/dpuos-arch.png differ
diff --git a/docs/zh/dpu_os/verification_and_deployment.md b/docs/zh/dpu_os/verification_and_deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..3284d19d7655bfd48cb29e9b57e908e1b8dd3e11
--- /dev/null
+++ b/docs/zh/dpu_os/verification_and_deployment.md
@@ -0,0 +1,38 @@

# Verification and Deployment

Once DPU-OS is built, it can be installed and deployed for verification. Because DPU hardware is still immature, you can also create a VirtualBox virtual machine for deployment and verification.

## Deploying DPU-OS on VirtualBox

This section shows how to install and deploy DPU-OS on the VirtualBox hypervisor.

### Preparation

Before deploying DPU-OS, prepare the following:

- The DPU-OS ISO.
- A host machine with VirtualBox installed.

### Initial installation and startup

#### Creating a virtual machine

Create a new virtual machine in VirtualBox:

- Choose the VM configuration: 2 CPUs and 4 GB of RAM or more are recommended.

- Create a virtual disk, 60 GB or larger recommended.

- In the system extension properties, enable EFI boot.

- In the storage settings, select the local DPU-OS ISO as the optical drive file.

- Other settings such as network or display can be customized.

#### Starting the virtual machine

Start the newly created virtual machine and choose `Install from ISO` to install DPU-OS. The installation proceeds automatically with no manual intervention, and the system reboots automatically when it finishes.

Select the `Boot From Local Disk` boot entry to enter DPU-OS. The password is the one specified when DPU-OS was created.

With these steps, local deployment and verification of DPU-OS is complete.
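After the first login, a minimal sanity check confirms that the trimmed image is the one you built (a sketch):

```bash
# Verify the kernel and OS release of the freshly installed DPU-OS.
uname -r
cat /etc/os-release

# Check the root file system footprint of the trimmed image.
df -h /
```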