From fa9d6cb5a9a6ec6b306b2609e386b9cc0b5282e9 Mon Sep 17 00:00:00 2001
From: Zhigang Wang
Date: Tue, 22 Sep 2020 17:31:03 +0800
Subject: [PATCH 1/2] doc: first doc commit and update README.md

update README.md and add initial docs for stratovirt.

Signed-off-by: Zhigang Wang
---
 README.md                                 |   75 +-
 docs/StratoVirt-Guidebook.md              |  517 ++++
 docs/default.json                         |   21 +
 docs/design.md                            |   43 +
 docs/images/StratoVirt-arch.png           |  Bin 0 -> 21334 bytes
 .../config_openeuler_4.19_aarch64         | 2193 ++++++++++++++
 .../config_openeuler_4.19_x86_64          | 2550 +++++++++++++++++
 docs/mk_initrd.md                         |   66 +
 docs/quickstart.md                        |  220 ++
 9 files changed, 5660 insertions(+), 25 deletions(-)
 create mode 100644 docs/StratoVirt-Guidebook.md
 create mode 100644 docs/default.json
 create mode 100644 docs/design.md
 create mode 100644 docs/images/StratoVirt-arch.png
 create mode 100644 docs/kernel_config/config_openeuler_4.19_aarch64
 create mode 100644 docs/kernel_config/config_openeuler_4.19_x86_64
 create mode 100644 docs/mk_initrd.md
 create mode 100644 docs/quickstart.md

diff --git a/README.md b/README.md
index ac1845f3..0810b3ff 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,62 @@
-# stratovirt
+# StratoVirt
 
-#### 介绍
-StratoVirt is an opensource VMM(Virtual Machine Manager) which aims to perform next generation virtualization.StratoVirt is based on Rust programming language.StratoVirt is lightweight, efficient and safe.It also has features like Full Sence Support and Modules Flexible Splitting.
+StratoVirt is an open-source VMM (Virtual Machine Manager) which aims to perform
+next-generation virtualization. StratoVirt is based on the Rust programming
+language. StratoVirt is lightweight, efficient and safe. It also has features
+like Full Scene Support and Flexible Module Splitting.
 
-#### 软件架构
-软件架构说明
+StratoVirt is written in the Rust language, which ensures both safety and
+efficiency.
+StratoVirt supports remote control at runtime through QMP commands.
 
-#### 安装教程
+In the future, StratoVirt will also be capable of virtualizing standard machines
+with specific hardware emulators.
 
-1. xxxx
-2. xxxx
-3. xxxx
+## How to start
 
-#### 使用说明
+### Preparation
+Before building StratoVirt, make sure that the Rust language and Cargo have already
+been installed. If not, you can install them by following this link:
 
-1. xxxx
-2. xxxx
-3. xxxx
+https://www.rust-lang.org/tools/install
 
-#### 参与贡献
+### Build StratoVirt
+To build StratoVirt, go to the project's directory and use Cargo:
+```sh
+$ git clone https://gitee.com/src-openeuler/stratovirt.git
+$ cd stratovirt
+$ cargo build --release
+```
+Now you can find the StratoVirt binary in `target/release/stratovirt`.
 
-1. Fork 本仓库
-2. 新建 Feat_xxx 分支
-3. 提交代码
-4. 新建 Pull Request
+### Run a VM with StratoVirt
+To run StratoVirt quickly, you need:
+* A PE format Linux kernel
+* An EXT4-format rootfs image
+```shell
+$ ./target/release/stratovirt \
+    -kernel /path/to/kernel \
+    -append console=ttyS0 root=/dev/vda reboot=k panic=1 \
+    -drive file=/path/to/rootfs,id=rootfs,readonly=off \
+    -api-channel unix:/path/to/socket \
+    -serial stdio
+```
 
-#### 特技
+Running a VM with a json configuration file is also supported;
+please refer to the [quickstart guide](./docs/quickstart.md) for more details.
 
-1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md
-2. Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com)
-3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目
-4. [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目,是综合评定出的优秀开源项目
-5. Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help)
-6. Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
+## How to contribute
+We welcome new contributors! If you want to join us, please
+take a glance at the Rust formatting guidance first:
+
+https://github.com/rust-dev-tools/fmt-rfcs/tree/master/guide
+
+Use `cargo clippy` to check and improve your code. The installation guide
+and usage are described here:
+
+https://github.com/rust-lang/rust-clippy
+
+## Licensing
+StratoVirt is licensed under the Mulan PSL v2.
diff --git a/docs/StratoVirt-Guidebook.md b/docs/StratoVirt-Guidebook.md
new file mode 100644
index 00000000..5e94eb66
--- /dev/null
+++ b/docs/StratoVirt-Guidebook.md
@@ -0,0 +1,517 @@
+# StratoVirt Guidebook
+
+## 1. General Setting
+
+StratoVirt supports a json configuration file and cmdline arguments. If you set the same item in both
+ the json configuration file and the cmdline arguments, the cmdline arguments will override the settings
+ in the json configuration file.
+
+### 1.1 CPU Number
+
+StratoVirt supports setting the number of VCPUs (**nr_vcpus**).
+
+This allows you to set the maximum number of VCPUs that the VM will support. The maximum supported
+ value is 254 and the minimum value that makes sense is 1.
+
+By default, the VM onlines all the CPUs you set after it has booted.
+
+```shell
+# cmdline
+-smp [cpus=]n
+
+# json
+{
+    "machine-config": {
+        "vcpu_count": 1,
+        ...
+    },
+    ...
+}
+```
+
+### 1.2 Memory Size
+
+StratoVirt supports setting the size of the VM's memory.
+
+This allows you to set the size of memory that the VM will support.
+If you use cmdline arguments to set it, you can give `M` or `G` as units (the default unit is `KiB`).
+For the json configuration file, memory units are not supported.
+
+```shell
+# cmdline
+-m [size=]megs
+-m 805306368
+-m 256M
+-m 1G
+
+# json
+{
+    "machine-config": {
+        "mem_size": 805306368,
+        ...
+    },
+    ...
+}
+```
+
+### 1.3 Kernel and Kernel Parameters
+
+StratoVirt supports launching a PE-format Linux 4.19 kernel and can also set kernel
+ parameters for the VM.
+
+This allows you to give a path to the Linux kernel; the path can be absolute or relative.
+
+The given kernel parameters will be interpreted by the boot loader.
+
+```shell
+# cmdline
+-kernel /path/to/kernel \
+-append console=ttyS0 reboot=k panic=1 pci=off tsc=reliable ipv6.disable=1
+
+# json
+{
+    "boot-source": {
+        "kernel_image_path": "/path/to/kernel",
+        "boot_args": "console=ttyS0 reboot=k panic=1 pci=off tsc=reliable ipv6.disable=1",
+        ...
+    },
+    ...
+}
+```
+
+### 1.4 Initrd Configuration
+
+StratoVirt supports launching a VM with an initrd (boot loader initialized RAM disk).
+
+This allows you to give a path to the initrd image. The image will be loaded into RAM by the boot loader.
+
+If you want to use the initrd as rootfs, you can add `root=` and `rdinit=` to the kernel parameters.
+
+```shell
+# cmdline
+-initrd /Images/initrd
+
+# json
+{
+    "boot-source": {
+        "boot_args": "console=ttyS0 reboot=k panic=1 pci=off tsc=reliable ipv6.disable=1 root=/dev/ram rdinit=/bin/sh",
+        "initrd_fs_path": "/path/to/initrd",
+        ...
+    },
+    ...
+}
+```
+
+## 2. Device Configuration
+
+StratoVirt supports one kind of legacy device and four kinds of virtio-mmio devices.
+
+The maximum number of devices is 16 on the x86_64 platform and 32 on the aarch64 platform.
+
+### 2.1 Virtio-blk
+
+The virtio block device is a virtual block device: the guest's read and write requests are handled
+ by StratoVirt on the host instead of being issued to a real physical device.
+
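+If you do not yet have a block image to attach, an ordinary raw file on the host can serve as the
+ backing file. The commands below are plain Linux tooling shown only as an illustration; the path
+ and size are made-up values, not StratoVirt defaults.
+
+```shell
+$ dd if=/dev/zero of=/path/to/block.img bs=1M count=128
+$ mkfs.ext4 -F /path/to/block.img
+```
+
+Five properties of the virtio block device can be set: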
+ +* drive_id: unique device-id presented for StratoVirt +* path_on_host: the path of block device in host +* serial_num: serial number for virtio block (Optional) +* read_only: whether virtio block device is read_only or not +* direct: open block device with `O_DIRECT` mode or not + +If you want to boot VM with a virtio block device as rootfs, you should add `root=DEVICE_NAME_IN_GUESTOS` + in Kernel Parameters. `DEVICE_NAME_IN_GUESTOS` will from `vda` to `vdz` in order. + +```shell +# cmdline +-drive id=drive_id,file=path_on_host,serial=serial_num,readonly=off,direct=off + +# json +{ + ... + "drive": [ + { + "drive_id": "rootfs", + "path_on_host": "/path/to/block", + "serial_num": "11111111", + "direct": false, + "read_only": false + } + ], + ... +} +``` + +### 2.2 Virtio-net + +Virtio-net is a virtual Ethernet card in VM. It can enable VM network capability. + +Three properties are supported for virtio net device. + +* iface_id: unique device-id presented for StratoVirt +* host_dev_name: name of tap device in host +* mac: set mac address in VM (Optional) + +```shell +# cmdline +-netdev id=iface_id,netdev=host_dev_name[,mac=12:34:56:78:9A:BC] + +# json +{ + ... + "net": [ + { + "iface_id": "tap0", + "host_dev_name": "tap0", + "mac": "12:34:56:78:9A:BC" + } + ] +} +``` + +StratoVirt also supports vhost-net to get a higher-performance in network. + +It can be set by given `vhost` property. + +```shell +# cmdline +-netdev id=iface_id,netdev=host_dev_name,vhost=on[,mac=12:34:56:78:9A:BC] + +# json +{ + ... + "net": [ + { + "iface_id": "tap0", + "host_dev_name": "tap0", + "mac": "12:34:56:78:9A:BC", + "vhost_type": "vhost-kernel" + } + ] +} +``` + +*How to set a tap device?* + +```shell +# In host +$ brctl addbr qbr0 +$ ip tuntap add tap0 mode tap +$ brctl addif qbr0 +$ ifconfig qbr0 up; ifconfig tap0 up +$ ifconfig qbr0 1.1.1.1 + +# Run StratoVirt +... -netdev id=iface_0,netdev=tap0 ... + +# In guest +$ ip link set eth0 up +$ ip addr add 1.1.1.2/24 dev eth0 + +# Now network is reachable +$ ping 1.1.1.1 +``` + +### 2.3 Virtio-console + +Virtio console is a general-purpose serial device for data transfer between the guest and host. + Character devices at /dev/hvc0 to /dev/hvc7 in guest will be created once setting it. In host, + it will be presented as a UnixSocket. + +Two properties can be set for virtio console device. + +* console_id: unique device-id presented for StratoVirt +* socket_path: the path of virtio console socket in the host + +```shell +# shell +-chardev id=console_id,path=socket_path + +# json +{ + "console": [ + { + "console_id": "charconsole0", + "socket_path": "/path/to/socket/path" + } + ], + ... +} +``` + +### 2.4 Virtio-vsock + +Virtio vsock is a host/guest communications device like virtio console, but higher performance. + +If you want use it, need: + +* Host kernel config: CONFIG_VHOST_VSOCK=m +* Guest kernel config: CONFIG_VIRTIO_VSOCKETS=y + +And `modprobe vhost_vsock` in the host. + + Two properties can be set for virtio vsock device. + +* vsock_id: unique device-id presented for StratoVirt +* guest_cid: a unique Context-ID in host to each guest, it should satisfy `3<=guest_cid {"return":{}} +-> {"event":"STOP","data":{},"timestamp":{"seconds":1583908726,"microseconds":162739}} +``` + +#### 3.3.2 Command `cont` + +Resume all guest VCPU execution. 
+ +```json +<- {"execute":"cont"} +-> {"return":{}} +-> {"event":"RESUME","data":{},"timestamp":{"seconds":1583908853,"microseconds":411394}} +``` + +#### 3.3.3 Command `quit` + +This command will cause StratoVirt process to exit gracefully. + +```json +<- {"execute":"quit"} +-> {"event":"SHUTDOWN","data":{"guest":false,"reason":"host-qmp-quit"},"timestamp":{"ds":1590563776,"microseconds":519808}} +-> {"return":{}} +``` + +#### 3.3.4 Command `query-status` + +Query the running status of all VCPUs. + +```json +<- { "execute": "query-status" } +-> { "return": { "running": true,"singlestep": false,"status": "running" } } +``` + +#### 3.3.5 Command `getfd` + +Receive a file descriptor via SCM rights and assign it a name. + +```json +<- { "execute": "getfd", "arguments": { "fdname": "fd1" } } +-> { "return": {} } +``` + +### 3.4 Device Hot-replace + +StratoVirt supports hot-replace some virtio-mmio device such as virtio-blk and virtio-net with QMP. + +#### 3.4.1 Hot-replace Virtio-blk + +```json +<- {"execute": "blockdev-add", "arguments": {"node-name": "drive-0", "file": {"driver": "file", "filename": "/path/to/block"}, "cache": {"direct": true}, "read-only": false}} +-> {"return": {}} +<- {"execute": "device_add", "arguments": {"id": "drive-0", "driver": "virtio-blk-mmio", "addr": "0x1"}} +-> {"return": {}} +``` + +**`node-name` in `blockdev-add` should be same as `id` in `device_add`.** + +For `addr`, it start at `0x0` mapping in guest with `vda` in x86_64 platform, and start at `0x1` + mapping in guest with `vdb` in aarch64 platform. + +You can also remove the replaced block device by: + +```json +<- {"execute": "device_del", "arguments": {"id": "drive-0"}} +-> {"event": "DEVICE_DELETED", "data":{"device": "drive-0", "path": "/path/to/block"}} +-> {"return": {}} +``` + +#### 3.4.2 Hot-replace Virtio-net + +```json +<- {"execute":"netdev_add", "arguments":{"id":"net-0", "ifname":"tap0"}} +-> {"execute":"device_add", "arguments":{"id":"net-0", "driver":"virtio-net-mmio", "addr":"0x0"}} +``` + +**`id` in `netdev_add` should be same as `id` in `device_add`.** + +For `addr`, it start at `0x0` mapping in guest with `eth0`. + +You can also remove the replaced net device by: + +```json +<- {"execute": "device_del", "arguments": {"id": "net-0"}} +-> {"return": {}} +``` + +### 3.5 Event Notification + +When some events happen, connected client will receive QMP events. + +Now StratoVirt supports `SHUTDOWN`, `STOP`, `RESUME`, `DEVICE_DELETED` four events. + +## 4. Other Features + +### 4.1 Daemonize + +StratoVirt supports to run as a daemon. + +```shell +# cmdline +-daemonize +``` + +**When run StratoVirt as a daemon, you are not allowed to bind serial with stdio or output log to stdio.** + +And you can also restore StratoVirt's **pid number** to a file by: + +```shell +# cmdline +-pidfile /path/to/pidfile +``` + +### 4.2 Seccomp + +StratoVirt use [prctl(2)](https://man7.org/linux/man-pages/man2/prctl.2.html) to limit the syscall +in StratoVirt process by default. StratoVirt use only 32 syscalls(33 in x86_64) after running. It +will make a slight influence on performance to StratoVirt. If you want to disable seccomp, you can +run StratoVirt with `-disable-seccomp`. + +### 4.3 Logging + +StratoVirt supports to output log to stderr and log file. + +You can enable StratoVirt's logging by: + +```shell +# Output log to stderr +-D +# Output log to log file +-D /path/to/log/file +``` + +StratoVirt's log output level is dependent on env `QUANTVISOR_LOG_LEVEL`. 
Logging levels are `trace`, `debug`, `info`, `warn` and `error`. The default level is `error`.
+
+### 4.4 Omit_vm_memory
+
+When StratoVirt aborts unexpectedly, you may get a core file in which the guest machine's
+whole memory is dumped. However, in most cases such a memory dump is worthless and may consume
+a lot of storage space.
+
+StratoVirt provides the `omit_vm_memory` feature to avoid dumping the VM's memory into the core file.
+
+This feature is disabled by default. There are two ways to enable it:
+
+```shell
+# cmdline
+-omit_vm_memory
+
+# json
+{
+    "machine-config": {
+        ...
+        "omit_vm_memory": true,
+        ...
+    },
+    ...
+}
+```
diff --git a/docs/default.json b/docs/default.json
new file mode 100644
index 00000000..d004d2aa
--- /dev/null
+++ b/docs/default.json
@@ -0,0 +1,21 @@
+{
+  "boot-source": {
+    "kernel_image_path": "/path/to/kernel",
+    "boot_args": "console=ttyS0 reboot=k panic=1 pci=off tsc=reliable ipv6.disable=1 root=/dev/vda quiet"
+  },
+  "machine-config": {
+    "vcpu_count": 5,
+    "mem_size": 268435456
+  },
+  "drive": [
+    {
+      "drive_id": "rootfs",
+      "path_on_host": "/path/to/rootfs/image",
+      "direct": false,
+      "read_only": false
+    }
+  ],
+  "serial": {
+    "stdio": true
+  }
+}
diff --git a/docs/design.md b/docs/design.md
new file mode 100644
index 00000000..6e648ba0
--- /dev/null
+++ b/docs/design.md
@@ -0,0 +1,43 @@
+# StratoVirt Design
+
+## Overview
+
+StratoVirt is an open-source lightweight virtualization technology based on the Linux Kernel-based Virtual Machine (KVM).
+StratoVirt reduces memory consumption and improves VM startup speed while retaining the isolation and security capabilities of traditional virtualization.
+StratoVirt applies to microservice and serverless scenarios such as function computing.
+StratoVirt reserves interfaces and design space for introducing more features, up to and including standard virtualization support.
+
+## Features
+
+- High isolation based on hardware
+- Fast cold boot: benefiting from the minimalist design, StratoVirt can boot a microVM in 50ms.
+- Low memory overhead: StratoVirt works with a memory footprint of 3MB.
+- IO enhancement: StratoVirt offers normal IO ability with minimalist IO device emulation.
+- OCI compatibility: StratoVirt offers an OCI-compatible interface, which connects to the Kubernetes ecosystem perfectly.
+- Multi-platform support: full support for the Intel and Arm platforms.
+- Expansibility: StratoVirt reserves interfaces and design space for introducing more features, even expanding to standard virtualization support.
+
+## Architecture
+
+The following figure shows the StratoVirt core architecture, which consists of three layers from top to bottom.
+
+- OCI compatibility API: the VM management interface. It uses the QMP protocol to communicate with external systems and is compatible with OCI.
+- BootLoader: StratoVirt uses a simple BootLoader to load the kernel image, instead of the traditional cumbersome BIOS and Grub boot modes, to achieve fast boot.
+- MicroVM: to improve performance and reduce the attack surface, StratoVirt minimizes user-mode device emulation. KVM-emulated devices and paravirtualization devices, such as GIC, serial, RTC and virtio devices, are used.
+
+![image](images/StratoVirt-arch.png)
+
+## Internal Implementation
+
+#### Running Architecture
+
+A StratoVirt VM is an independent process in Linux. The process has two types of threads: the main thread and the VCPU threads.
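+The sketch below is only an illustration of this threading model, written with plain `std::thread`
+and a channel; the names and the event type are invented for the example and are not taken from
+the StratoVirt sources.
+
+```rust
+use std::sync::mpsc::{channel, Sender};
+use std::thread;
+
+// One illustrative event kind sent from a VCPU thread to the main loop.
+enum Event {
+    VcpuStopped(usize),
+}
+
+// Each VCPU gets its own thread; a real VMM would run KVM_RUN in a loop here
+// and turn VM exits into events for the main thread.
+fn run_vcpu(id: usize, events: Sender<Event>) -> thread::JoinHandle<()> {
+    thread::spawn(move || {
+        events.send(Event::VcpuStopped(id)).unwrap();
+    })
+}
+
+fn main() {
+    let (tx, rx) = channel();
+    let vcpus: Vec<_> = (0..2).map(|id| run_vcpu(id, tx.clone())).collect();
+    drop(tx);
+
+    // The main thread is an event loop: it waits for events coming from the
+    // VCPU threads (and, in a real VMM, from epoll-registered file descriptors).
+    while let Ok(Event::VcpuStopped(id)) = rx.recv() {
+        println!("vcpu {} stopped", id);
+    }
+
+    for vcpu in vcpus {
+        vcpu.join().unwrap();
+    }
+}
+```
+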
The main thread is a cycle for processing asynchronous time and collects events from external systems. +For example, a VCPU thread. Each VCPU has a thread, processes trap events of the VCPU. The other is the main thread for processing events. + + + +## Restrictions + +- Only the Linux operating system is supported. The kernel version is 4.19. +- Only Linux is supported as the client operating system, and the kernel version is 4.19. +- Supports a maximum of 254 CPUs. diff --git a/docs/images/StratoVirt-arch.png b/docs/images/StratoVirt-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..25ecb5c99fcba9da64a0104766f6adf8ff0ef460 GIT binary patch literal 21334 zcmb5WXIxX;y7nC_OA%11N)r*K2uN3|QUpPI2|Y^hy%RuGL_nG}rPt6wIs`~8^xh*~ zLvO)QLw`qHXPvdqK6^jU^M2qbV3>2vnK8$G-T&(v2~ks#C%bt4A_M{^e z)#{jM27bxnCa3Er<6{2Y&Dzo7hL*LxIfVZn|2@%r4@CL-Z``{tDkLJxCsc-X$3q|* zs3)?Iw7iVhCJ!PRhEq-g<`2Z^#t&oV{urH!oNa!8<@+su3$~kqB(E-8>51P~msGex zky`XV{VYWpW%1=2uRrQ9bnM@bBFJ7~qSQ&J7WiIvR%2E7v{+k~vSg7Y>JM*q|8j{_ z$1nC!2$&_gkhWj`vSg}K?FpBx4J41hqQ$nn&W z7h7*jA&_i&_&LZ82`m=`vUCM@3Zf86xCnXZ(*%V;23S$2AyG`fU&?tXX`-sCsz6Oe zZ>{w)QE%vOrUi{qSl&mu?#$7Az4J%J*B4F%*z>+S1NoHqYZu%#_8bkVm<4d_{=+$_ zv9_wpZ4k(dmPPbmC(^iWD~+K3$7rVI&w}CY6Gtk-ykBbBi}IX*ssM8zF3>LY@%5Fl zI3?=4W2ay=Y$t^(nAcw?n?UV-zQ87Yi0xuVqhIVYL++XL%&d&=`tJ?6O68r4dueYP zbFhA4>D*jnU}`ssH1c_#_(A75^ytQ_XYqjc$J00OLb_B*XVw><27Txzduc0iu?5Xg z>-VEEs!>`L)+s?@N19()B8s24TJ5w*Ve}Kg?Y;?K?+zXiIw{m}N=aco1IaUsz^?e7 zfw^OBgzYNG=%pXMLB5s%2cph&rq?LWaeq-vY3z$W323QYFJGgz?C_khTmW~XASsEk zKP5I8sb)U@{bPH(6s|_dBGKY`pommFhIDNJKb> z6!JlS$SB#}(m&e!V#80`Wz>jB*3{GS78{`!RH&KeTJm0V^(Wa6rK!K%p0vCnLw{Hn zASuRNP5$WfntooEzS?~JB7ahxt!c{`e}#*fM$NqnPc(adH{^vG{3F>%o%!sOG2iuX zu%KW{DP8x2&%D!0eRt>?b>j|hY&0e;lV^xJV@Z;r+Vq#*cEB5f_0Q_WE`pXUZ0K>J((Y^des9Cv)z~(XlGOgm`OSs*k!y_fS+?RQ1@6 zbf2trmBSw-A1T}`5a-Fl^$;SyKk8sArNwjlQ}k+fWY+4hG-b-9Ra6+h!+q`SJ0w*7 zhQB0Vc_xr14!u|!)L7^Q9&OhfTc%2R`skqZk>)Gnj@{z7yQL1hB;Up@Eubv3g?)>D ztqNC+eN;uzmzuZ4k32S2?(DvhBeb1%M#S(5PTeWPNzs2j(Ov-Z%%Qt~*EM_hAfRY> zDKD+2XYo>emHleu^sa9H)UVDbyAuMYt3yM5=>p4pL3Ylo>P7-B2glfayD)+Mp90Kh zEz<&9AFS;d=ThyGL+%B18^PnId{O9=$mIy?7P1t=my;5v=PU9>73)VXKX%3)*FTmk z791ji%q97_kLhH0Vn(D+(5u3hXY4)+}1&>$_horD_Iqy4Xelpmizsn|Vbe*5#j?WdI27*Ssm$cNVXM{qJK^ApvbnXZqG zUxbs7=1zJJv)6SAT^o@CIs;|bU~}E==e|GH7R`e6tLLI?Z7zt*BCzf6r@s(zC&}T zu)uW(PG|}_lLS?^UtB|I?c6!ZJ-__MtolqoB=@tp8osyLDkBhv+MOCw$el=|ZnJ8L z|CsgKc6ul!)4r!t=OmZBqRdz@ZTXy(YKIu5-%os9yg#Txj z-G$~O%(yh7!FMNMJFHPiGvIJ7Qp$Skn&V)VAfei!xVy|EfLsFes(6>G?uh*V8J`+?BNZ4p{IP-Z(dKhuVXE!guZKlui z_geq2WdVP16#|KM$?=DQ3KA>(Z|enEvIC?@ATQqkV=1=@kj!0;e5^(A3z?rToa^TU z1L0z(G{`(5R+x~3(hICo5J<&sZ>p@{zxiErz+4!%oyYS+p7k&1U-+o=_woK$xzc`H z8rPzH3i9bgf52ZgEJtgf@sEF$kN;832pz{q2fjW&`}_OSsC@Iz`1}q0Osfpj!^c4c zed`G-HMir#J)?S$Vi+2=qrJ?bSSUM}@mL~($LKl}Q-sn|qU8PiAL0TwGin$WfBX`B?2Z&BUp!;xJbEge~z6 z{6em3nsJRoN>YdVUUL|&Od~Fb^~cdVzQP#8(U;Ht(3>AZd zq_TUcwY0S08h4G|-)qSWF>bnb56fvj@?4*4w3qs*6}i?Uwl40z96P2itDfZS>&6J( zvn_3;Dd2&N5|A?^g0s0s0Y}>?&$^#)=neU&86rCL4a$AfV95v1wIQCL?)ou-nR#Zt9uXCPhi7J(5Ok|ahV9!|71MfL$g%--R zH6rQ@!!>4RXqx+6N1RVMQ^28fT+?Rur4$yVsz&C3#zW~QsG%NGabL4ip4o4RO^!>a z$cbcb7k$#GH2XR5Ok%-iMjg+Sxp0xGKCP6bjTu9#_u8Z1%*~JXx4(V+2By-*irl)d zd)KAn+~@9Fa6%=8h37Zg&Nv+^XcjK6t+|`WttcQ>vfrO3NBwssu_p# z&=ec~u)Ad(>$ep7C2Q8wH?Abfq0Z|;3D*?!>O|e(ID$Tij>Em*Q^-}h-!S+lH}>=8 zSB7N`)mA?koz3kt92MOz8@?x_wvpGXu_#!bGIl6TWMGI>;zK3sZ}PMjQzv{-Di3R= z&szOFS$i~EY_AfQ=kF14Bt1)>pj=$~>!poS!@z28bzBeSU12b`=D32VDS6Vec2f?U zb-I-k!B*aKo7-COvIywjYE^y62By0<^F4kTvuIMoQ5_S0ji>)+xM-q#XT`xHV>`7X z11rq|!o68o28C$<;ykyC#76g?S5Vh=_ZBv+yH#I%299^C8#Vml_%&DA!^rtesQl+S 
z{JHKah;?Ek3wuE7vnvHD@*G)oX=rQ*629l6!2Pf_l6jY|kojm)o+m`uVPcWwLjd6$ zv*FdFc@cCwc~a=BPuJsUIZ9^d9i|#aPtEiO99Jlnu%G)H7#J9puUBRgWV(=cymcpG zXzq(_r1-|$ivBlb^nw!L8Lr^o+Z+pK_W!OI!aRUJz}54#U%P&NqqPytqREkBgYBunU&`$nBM=C`emMmMpm*`@chW8ih-xx? z;`Zn0O_f-&Q)jr7)BS5lwvKGP$`<;8PnqqxvAtnjmZ^hVjr^7M$@-f6<#*+I!;ux1 zJ<0iEk_;aho1k5!v^<7=Lq>vi$NMWO1Z)^@a-QN9i3RlW3R;Va)qIcnwA(W-mxhn# zqfTJ;_S)4xhq8%J3oyhOs@OqE=_t)udOZ*7p7Qk-TRc| zl+o>Vytg3HdDOE33p-rQiUc$Q;LnGBg10Ou_Vrx!tDu8OhA5$8tE zW*`ZZQ-}}Kaiyb~S@ztw&jp7c)cNxJcO|ogY)7mLNd53EpS+ULd)pWsr!3+r*1)6 zQ2U4E*U|+e&kYNe;UX_BN-w3Q>ejiK2h5W&AMd4|=xH`rJNm5{@U#zT6im!I76{_| zWGEiYfO+5W=vD}l@UR+n&sqfH(GiNxp?u-w235BW3cP3b7RgEkfMaXm1zmm-6bY|>t zsZXqJ8j2wGdo2_mRx4hAs;sI@d~r3-Akqw%{I%fg%O`s5zG`F`_Ur-@S)-BhT_AF;wENqkn+ z)|#eN*5z}fHuD?&&O@PdQ&)nbu+{murbratnQ>w?P(S;ILe%napD$)F;Y*-yg);6S zeN#w%dAio1@b-r1&g0qUf7orXUR7@p36=k*hvgETrQ|TX&aQa}y9w=jGgp;1gNfs# ze*izy{z+T{7JaFJXldY?zLr(E^TEbCQzLnl%O!hX1 zv3w47yxsHNJvtS&why#B9qZ=e1uT24M2x=F7rkFyC`uA>9C_4t-j^Xtk?NV-Eni<> zBbEV;&@g9_Duu)J%Gy)MHc^N%F`>n8B{uqsyASoPi*?jI502jFqXx$AIVMrqb)pl& z%EN3`km6olQjbG+u7uC@By(@tezQ-$^KjgU%2cLY!2ecCfzUJO<&_o9hZ@gjC zqNh}aR7=10qHz#GRLG{_u$BV?q2c=t4*=$vzw%}ZvN>v|+*Yaa9Im<9>z>pL#Id9j zQ}e%7nT;3m?m!wtAi8D<|8tpJ>ks?>TW9_+R-$xx@D>8OYuIgSvU&f5%n6aNyx#`$ z`l0Oa@BdX)|7IpCZ-$ew zx?U)-bpi6JFD^2K`?nfyY&yccj%J3u(2Efd)m}u~6u(x5e9p|FTfJv^zBl+kgPtag z;tT>@bJySZM1%8){OsRn`&VLADh)3dbhrq4!K1n4dE>7?d+0YaGlMdK-lRIk9!?0A zna`JoR*AZ|8#V?AhkQe1%x&< zWhgp2I*Oz(oU;itRv~!oPmb)^nCm1te;(-L{H>`*Un8%a6fsYgK~OyIlcsZl8&-l` zNcByAc7x)D*4xt0&=U0VepI5zApw7U>`x)~e*RU)>i{s#IJ{(IXu-tDbw9;W?Xw8^+&oL2NP&{zg zu>djg<=Lan1x1#@_qGZB4gQ;TpW&Z;cX7x&1!)wehIRb2x%Shj$!*to9wVRqttGYm zd2Dmo$_UV?mKt^Q$T4u=gzd*>U5Zt#)iV}CEv?05P^qs-D3JQm7D0+a&PHSBb7p8b zuh={-tj{ewE;DVFZcNl2?Y5Cm?k=7Ts*873KD8OF+8^~K5WF+l4)9@W7+xbND=RA_ zuVp4xwtsh`&XrUBgVrC;bB1p<-f3l6SaZm0a$kk!(inCd>WaJ-{%9V$Qj+zV#XWa&j5wkZ3h4u4YcsEZ{Qd_tqA+4^NDBxaj{jGvVaCTA4QUUKb~Sg@-E z+N<>Fx1jH^9hzs26u+u5h<(n1TM|LBgZyVDrKgNH%`SllnRJz*6FnM$8+n>*zU zEko)aBjQcgm=YtuL6>MroSd^ARQ)h8qEDwK@C=DK*8(ER;UdSg=={qtKfa6C(hb<} z5<8n->42Szw7Ncr`nBq)#HDxxhYR`dkU;}^(JDi0S7=yAfzSegwAx{E|0^kPEt&`S z;xVYeFqwkTKAkr+`&Ag@m5UFd1!Nx=))ye!1id^H)g9m4;Y`c^hxk^OIQ1Ge4T)<# z1qa*-=)*Z)7VWIml@!mZe2KA2Yt5ZAS^;X;jr_KUS+wP>3SThioW6J z;`GngI?OMYa*j1{Y}6hks5`{dcm_Csdqc*vBn!WL2z1~5=(%+IZwui6bF_JlkXtqA zqqUZBdLvz``mbqzJ0B^P37-_Q93f}kf@d+7(4J}f9fLl0x1`QVehXUE!@Giy%f}tj zMw-+X2dJGf+fwU-8wue~bxda7pyAuiZ74+_Bb9ERbJ=D5tsn@IXNwH$G?eO^LNm%7 zk_vWaV^rUCReI@0Fi9+*eeve2X-WiW$7`=*Ys{rlkDdAAEzuKL=9J6b+Kh3AMMr#) zNNC3~`|JC%)+1kDk<9GEOiA@ShP&|E(Y-bjh+yNST*%bEAowwb)vN*zi%B89geCr|8J#(2P!*ve3#?qrYxDAH=;)vAE;H z(%_IF>R>7S*rwIgJjgYN4d;tD1`l@7S4+F!YD0_Vrm$npp5ZgezN)D^Ht{B}eB7Iv zpWcgWb-Jl2Qjq^SE1g;>J&dl)!a%o;$6JW$@ugQT51U@4$F*_B!>WdDl~r z54|0-Ws1=ecxAwjGTZd|YyVNV{&OW6)THDu{y&skfQh)mlTJgP@%_z?6!W*lF8_u| zN{73{ry#nu9j#6Ae+q)~V_CISaLaSJeM)cr;;B6QfQ+AmRUjOHW5$ zKjr#YYvgyWZk>jbKyI)Dx>e8@kaArp&Y>BofxA^{f$ZZSpYH4~H7K6G2YJTAN!iRP zCo1`Gbp$nbWhjpW_4*9t;e`@z?5}^1brj$bfM$5m~RdS7q zX=lP}wA}pY=!lh-HKV2S=wR2^p^dh(sPCMcuA5VNu$H`+o!#B8RAEU;+(pILnIDH# zV6e0mg~>DIcmKfKQS6=TP>t#QO)|PdW_desll7Ap{opy2$!Tv_P7)3pcnL7mYek{5 zFHyW!dKvF;DYIEOMKFq0yUZzy=vF#RA_sF-gA1xUcz_sD))dm-nP7;6HF)t2750R` ze)_=v?)#i#BcxkCeMluOU`yhjIKw=_ZlT?^aoP)8=M+G7PP* z9VqJWS4Y3(uF1GmDq?n>ORxBXef3lb{>36$;oADFxV3EeGWWzG-3WE>PE=`0c=Nb! zcHKpLA(-c8-3Ll*7?shFiS(&(p8%vv17*aq^jt%;SWFqC}L5lAM-9C`cNUq&877|JO5TV;qqLj zoNM!E*0N9GO1jq^#LYUaO{0sD^&Dm52e(Gpluy4pun~21Z5^HCjtp2`o-6RyE5%U1 zKYK@nnm;_QNC%p=I}fLX;eNk)op&_=J|*0)^Z;UyNDT2qnyPi#5*R~QWd&{rycce? 
z!g6n8j>oQ--mxJgzhoQCcAh02qUe~x)lB=3vzdFn#sGPdAl7r{%$Y=$r%&O8pTRu& zE2smUD>g<|()lZuXK(U@3L=4oS)o9~{)VdL7M4q2%x5c)(sR;1fpTqcwW2qfFor?R zH(_I!y+UW?F(lAJA6}P3g0q|&8_)v639>+yxMPVVh|>M2_vA2=vJhJ8@l_PfmuNsj z&hSS*qNTMJJf#WT2J;yWetv$GI&xTyu`jpQG7UAhaQYX^ znaWJtnBHssjD@dOz`7;kIJJ7Q4Y|KuD8rfMy>1iTxAyky3*OyOGAQn_Kh%xZ3Q)L| z)9paztE5RXGT-(j2@(v*RbjEWvQqFxf_K<3)(_uIE2Q19MKW9sPGX#Z|EJs^zU$+r-^#qD&QYKZzt03f1*7(BAg1pt5<$2`?~Fx zGIdZIqnHOA00>2{9YUbZZofmsyLZGHV7)dE1nZp3AosJy9#z*~?Hb0CzVku9wpTF3 zc?tT#jDA*ms`+oLIO1sm2iXx@-p0T@pK#)BDVozNYNn5Ho{1iMtrIKyV8U&snR7yK^o9weNECU{+wj+-Pu~Pgg>Jdfa`E@u}dwdp|_HaE}yrXzr|l zRyO0Y`}F=u4OO=Lab!e93vA7bpULJCD=L#J(zcJy+RN)Zn;z1CDYX)(UG%^R&0u{z z2RqK8vv#vwJD9^RJS3guDs9Nj$C|)kusbX}8No~&arond_0sVO<5IbUkDd*_okyat zn_Rc)rwMN;!wh#+wVG#knrUJ8qeIE-9c|rmOX|D0_A3JjB5wx2dE?d? z(a09L*09+lM*YW1#0&fO=*MAP15L(u9&L+YI+tgW)xYB5@t@6OgjLxL12=Al8g{{4 zYJZW{GHH<725?}-r%#{eF-}G|sa`xvVQ(~^Ji;6$_I<%WF<@9<#+M;$Z21SY-8!+f zU)S8Y8dHv5kECImW_ED(>nhzU3!S`Zx0}1Zj7DNFK2L7)Ct@5WXT-}w`Rq)e9NihE3&Yiul@RxaP$+X zjg|PX^-EXR?8LnVJA-X1JaK!V?Z7l>(0p4UBDD^ zG3C%HyOt+-g|wpC4`|O7Juz=@WbD|)vJC3B+sNU+(<~Krra(m{Q*%;=_c-utAIXq$ z*uo}@;l%ijXsxeal$^#7DJ z`X5#?Bns$bq9+W27$FA!4f#W#jCFSesC>$E^NQ#Bnx((=)_KG|Z?1p-j|5Ch{y03E z-~HxqwG7QT854JahAu-OU0g&Zt_vY{46&K1w(UKjjrqTlxWzy9IXd!tRY%%cNZ+>p zn_PbNB8}{sy?fv?`vcbW8*V~wu<^e6`{>yK+84l9OZXrVs|aGd^P5-xU!DHH9Q8hP zBJulA9lZJNm-&cXdLqC3`*F4h9X|(Sr5Na)h;BOKJ z9>0Ny?}#{jxN2%@`uu!U{Qg%qq0(tMj|Fq70%p9@I>s0aE}ly2c&G?G=CqwPz^bPy zS!xIZ#i~#(P6qXu6bxf%+tnpZO=<3~OS!+mtZ6hXT z3_@#@)Vu`h-kZ_I$Nq6Lh_d&?9ff^5o9-fMVWX{F1KTEEmxC?`F@)g1Jx5reyTntqDzF<@`O(4Fp05u*)7 zc)N$zvrNa*t3!n_Dwj6Lm=aE8RE-FINp#4cYylt4rohYl_|Q9{a;r6_V5 zSc=jg)XY^e3Sn-7xB%$$wc4!xeNXDPro>tm}Hig@*WGmM^<#C zin=QgK0!E z_b(2I#@fsxS4!xwU)RXUTmt>J;g{pJ>0@Qh;f9$AF)dB}26)hy*~M^=!Iflvt(Y%? zp^fqWfCWU1ls0UHazVPiV;>Sl+Bup6qTiHl!3mr4CUr-D%R+FZphknSCX0Fo^jSZ6zhlx-A#f|xg$>U=KqNo2&IR1Ag_P^FS$X$!SfeJ^QKgr8~ zsT@oJCsnq5-{~Ql@jp5K3O9iRcN0m{T|@Is-5dq^e;LL<1GwvG4{r2L2*mnYDxK6N z$mi1!!SV2+kQcEHbhptK1OFBt{S}_NV=H|g@(a|Hh;}P89q24UXC)bK!S8oO6aBjh>MqYEBhdg_(e#wK5CrkPS zP={z{M&jXHxyIX#-M(DWovojKK>i~+DJUWRX_aEqmxX^9PX=bwTKv_Yg|)* z#9Q#X+qZ8^7sL_0scilZUUI=*z^b{ko#vb?S^qUzQ(#&XW(&W`N2VpAAW~>znnYh~+K?kePqbAA;i;v*HRm%lz*$n?ui4p z8RxRuXk`Oiz2`dl-ax~%HZ3jxQBWH55XR|Iy}uqffrwGgj8gtrKD~;S zW}a&EEvC)+j_?paxE(5QAXF8*Vtd^Jqh;iXyBi+nOO2P~MJ0yU`h!8ZjQ+ER{?}Ep z`>eEhrhf(g*?M6-zUtc#PT~dFnCiib-Rh{;tQ)+9E{2cg15a=td3#KGxC@ux={jWh zVKQ_j_^I4nLO4ck&7HkKJ=wKxw{78thcGU^WPNOoZdnT_lVSZXI&p=-U8aU@&zf1? 
zwsE!(i4V6l!iBjSG z89_TFcF(hFdsrDeSLF@Y2ErSJ6SU?=q!F3I@!L1~$}N_bDLIt~hg*jQhYew=x9ZWz zon{v!f+4r>BfB2bH&5l_;ozX!|=)O1(Tf|AGtAyR#KYjO1y=zv3z1H6hYpCj!H0+nIK}+8Mo^#ft1d|~6{8fhD zv%$T$uk_F^PN0&=5f&jbrPn>HpV&otHSc1TbtV<;*`Pa+{G2xrgR&F_|kTRu5GSj5#RWc_!8G?wcYS(g}p>Mwghgy;)cJttshwCs>ks!n?>Th zbl~+Peo1vr&mV^xeN>!KXww>MDL2mcV)SJ1Huhn#3zc}~^|U-eJfUlWa9uzpmkpLdi_>fHlP3x?I1NGY7>F(7}g33qQ0 z6F(y*M)dqbp3{Cch;{S{s$v7=#BGuGy|FFyj+gRV zmYH57g9~p75kPi>{fF|?r>m3oyMXZFR#vzo7I99Exxk#|k5SaZiFiQ^F2fk3WX>;s z2c97~T%+&q>S)&MsU1=W!IM8FT_{x5$Xrmu>Box_s|O{Yeh{9|DDgIC=YRchGw&*e z)Zwq82J4GuK6=B^+E5L=ZlkK)sNvhAKb2c^7%FZrzU@7mk}&{#M&0guCrhdU*A*8& z0!}BGeZA24@-kD&29=omU>2?PkqYC@lz7i6Z%bUBN3F5D8KbNV4-Ca%bNB4Ai#n() zC)5?({y}AQ{>vZ_;|a3_MtnF=DQQZHZ593gHSv|@!cq^D46oI)R>5JWrDhjiohy|N z7?)g*vCl8@L z=1vN?expwnr8~1V!6~0&+0M6;4CK!4yUxEe=HE7-_F3Vk_NbkzCD6j)Kbx?R;LUN{ zz|5RaZEnQ3&?hc4J&-(Y{{$lob3MT`pEyrRj;_eYL~5-(M(jPsSfO=^VNqsssqG=n zst06~E}h&g+B3O}$Is6u$buSIj)-W@bl-D!d)M9n((Kg1`6_pu*NW+N}e!SvYdaW}nq#wKz9`-v{s3-e1 zsn*omKxRRs@uq#v%sIJ?Ay!2Y+sJ!}{c0!6L0)?~p~+HlnPSiN`Xonv_SG61e(I{S zB3cf~%bPe$&o2q7w_!T%1y!P@ts;61m2>9XV?o!1#?05G8EUA5#hNoD@KrkA3`cGt zC-MDB2YDr)z|oI2jz*Awc77d8+r=bOs%{&S)ITwFvE=a7)|Ek>=EjjJ4z8YpiDGM# znMZ0Fv_pdRV$F@!6XVLkN~9L8@O<-Pb!wC6#A{d(eq5;Y#A8^wHt6@lXI>DOPgzx0 zCl@yosu%TV$)w{A&_g0Y?7|#S5*xNG^?2G#A4A7lBl zwFgKCAC7ihkvFe67sCCKr&}d5H=LTcb@Yp7bWfRA|E_*{t8xW{v`elDYt`*i!l`$G z+;{3b-92?GpY2r3g!_Ex~Pxz!q{EtHgA#D2fa)@unR{}`&sV}Q^H zZb?#-ffjH}&i%P&CFxvfJi?xlXtWM?=XCSPDPwif_cy6Gf#n#I6X<9?;6>l){^1YRr|V~Hut9WeC(;B zdQ4`sc1g1g$|*05q)!y)=;2(fGRjXf-Jtcmj`I(942Ae?^^#W3KGF9g6ijsq)5kRXsLK%mjE{vpOcA{lf7q@%U{-cBFy)MRLc z+Y}i5@PDIoC9Uq%hu}rDN3m9^f@ec{%BmbKL!cb$AY}&9Q>W*3;XPX8@2=qAxykKB zH~Crg(*Ms*4afc}e~NIKu$_pvx}Pqi>A|sd%bZDbn*E*OGw54YH}Xd z07H7j$ivamvCT4BDuCFHCKkRf2TL#%Sz6lbY7U2s^Ay&VGysHP9v7uwT=?eAn`G^C z30{w!g>B8uWD9>8sv`+r*A^w4py}gwpxYdDUFZo953jo=>a(>7ct8ixu0^^Ts|#rX zxQ#{7)DLNOl9Q85;+rT7idMm*Zc}SwD=Dk`$0O*{^{nrK$EYx zV=BU#*KO1#<+fI9oaq9D-EnQKim7}Hka6TKpY36&kao6PbqFZ6B?dJrfgUa{8z9-w zsh+zDutaJ-&;>a;-l|*m(?=yHo514&?zbl%D<_n59`xJAdYmVifFmgM3^JyNaI2$+ z#xYRN@rT;j!yW1L8gurPX6jH9cX(+n_jfFq3IX^s=M>YgM302$YN*s~{cBZVIj z_S6lcadZ>vZ=>&zRof>aLQzac8g5GiY*J1KKPdthucniUDjgknNZynLdIr%vjzWqC zST>#-Cu#-yf*!jnw zA$M&1J>-;O3$(HK=t~!v{j2YbFt7ll!Q51zj?rW!?EK}RZ>tHXs4~D)>I>rdtvuyu z$6uRiCx;PYxL<1x2nS_%MgN3{2Dm_IvwRb`?*$%c3fA)u)rNM7Z^Znom_UBFc!z0} zQ3#N1@|x2-62Cs5(cH9X2I2aGeDrus7QG>wQpE22kE`hi;KJj8tE(JFp&#cSRk(H6 z7I2gG(dX>KG5EPGz46s6^)3(ma0u2K?RTNSaG zxMH_+Kw&cxWUS;&Qn>8o>_&p-fa|x2XwABcp|0i3g)E3wCLS1W-$z(g_~W3o*6g3b)(;^^M}FX{oH0$@K{(FN$E| z;FT?gLfXV0@;rhQ;{9Y9n8|77$t|s^QOoA?4;~bUL7laz`h?SZCAniRQE&Z#2zgDq zW45=pCY)Otc}kVm$y1hnOaX2k-rlD1{TW0nsN~2jg*uJY4k9eSVBlZ6@6)VKcQxQ^ z@jykE)X83NqD}r|3X6)v?a?A(l+ur2$5HvFx!|b#!sx95rJhiEalY~k)*Ns4R)_11 z;4<7qPZm*iC|!5D+NaSci=#N+)Up!y5*a1|A00{#NNBK+Ln(do-n4+#VcD@8G73!ogy+h`7kKZ=pesqMbY$i@K%IYp^Y#1%>&5m2t)H~^@WckN6VFY3N_>1WULQpfnnBPZRp+6FfF zzQ!+8wvUBRJrn^URS?Q6Q0(uYE{p+r>Yk8-RKFd));3VwZ~A*b{G$Eb&OU^h%W=K_ zEK~k0ou0d!#wx;A?w#G~7Y_U+N0{Hc4k}Q)jqBD0e?Sj6^0+L~N@}~EkmV~(lJNWB z5JPngjEvp>ln>W}wE|6?=%?AkY3##bOw?`glyV!HM=_JAce`T94v6=%lj`^!NfUpGM%A9;!h73V8Y3aYM3 zo`QtMn|%7_bWI%WmHjt3xP?Llddvy~ug{(5>yIumn<(lFC?-p~`4$K*{ql99-kRiW zend?BrBlg`Y@|H>S7!Kkme5!p&D+1Y@~K%kUr)CCt__X4Kv<&OP*>u3qWhnLelv9B z9}vi&rsBT_C;$A=hC@H_=@u_vPv`#$e!&h~coGfd(`B|ulmCM@{Ri>=&%^ukQy~7w zOXussx(`-^)oHZUY;O3wcX@^_wkD!oCAmOKyL+sS&8`0W^(kS1Gm)3Q;0Jd!wq@2! 
zF7eIb(<%KbnhnxvfEnHHx$DOMA@?~`9^MQW;p59#B8sLW4p}mCa?Ubv_$nc#1Rm`j z;64;}m&zm@69uF5!CFoIEQnNlN*8*8@I;EkEC?oS#Va_Nh62rwZno}EfcfWJ+ZB^G zY>ya#U+y^{vN8u~&m62xNVqcwi2;jJf;?kC6VCpc+O8c>Q>?E3q&3j8QlBCec^<`nyJ) zn*`zigxTLXg(htj@Dt)b-jLhi^9G~_-GC$L?A|t3>pU|%J3H5z0D|B2e-7)M1t?8M z?I0!ftT~jLhiY4Kk`l;6PowwL%;aL17R2`#vl2lTMR1?+i@4ELlBf($KU8Xh-o3S) zL{3J=;S4tOnn0~Pa9i<6JLR`WvCf4(;6w}2cmCkLZ)CF zYvSC9xQUWD*MK3El5JU1Uyaco>{*@Ub~_;YuC}&~Vm5uQf!;R!(UrxgqE(B5Gs@M* zagnnT4B(z40@$YY8;grp?1}SzNd|q9K|=! zu&}tW=L7H}lPI1RXo4B$J*7d*xGNIy~$4yj3B@f@KLQhT#8 z;g4C`O(P``)(%u<^5kMug2bGob>h*RNS)c<)0WT8Vq}&8gqtn0TATWS{ZIro_GS~S zRw7o@J6I7liPXZ{IUr5@wm=ZC=dox*ujGMU+fxIuxtii2=_4-hX| z)V8GVb~d89-eK)~=Q>Q0Qy8)kGT!xiBTO3=axISEHyTt~@^YWRSW+Os1 zxOuxnX>csb3gQp75?IjOciRQbzhBKQx@$m6zvO~%0 z>AfD$WP3A^fC7%nNND9_xl!#x9T0j9_8xQqo9w_YkLqy;QjTX|z&-&M;GO@HPa-0U z6@P2l!#;k=n;x;8*H{?^s9q1%B43cp;hpE==EctJQgwRPyZ-jAg~BCEP+)UQUcVyz z){p7wKK0=;?V+Lr<=kZOoLLbYHJ{27!%v{_G3b;S#vF<`_^I zR?jmMF6seP8=s=kUiLk%V})i=vgdQt$1U+9NhYf5d`0k@aqO9&=!1InpxgHHaF0I- zDyq0)gQX5L5V-37FTPn)MrsKW%Rj#bAFTq-2oj#Zh5Pz57dO`|4D>>P)}^~@U{Sjl z>Tv@0mgc|a>8|ThqnZiKiv4QuAgD%3KJIH9`=NWHUx6OAmEY=cIGc!KiaQ+&S?(I2}3c5EEw= zYn(*&_+?~Pj3kUzM^5hNoQNjG$(Xrj6wBxwdyVa9RReKkqi z*ZNBErg6ut++#-TBX%yDZSU&hZrGAZYu(dYim5fQypNB3ApESH)zB{ds7@r=x!8Ut zsm{&(VQ$9?lRp{VvFWT;tNXlveVD`*X0zZ>I82^_fK!FYmQ9Ogr1( z+Q|L+*L4lXbMHHnx|5Y&;X9Ol)5SceN&ATrv&Ft-bqeLDh>~B%-v9RSHmu#gDB0~A z?i|OVvVD}GYgZP3N98(rFD1AeypL+2A7yWpds1WyetY4c8}k~BfOQQ>e=Y-e23X$dc^8M^NY%$?>hCQ3}bv; zC{WYb3OPSUi zmwU5J{Ntr_19$L`qNAf1W`(pYbwnr5V zl5!$O$L@lK8oq@}im4+G7EUkB2y$x`dW1TjP#E;dE*%}*EAYg+3Ewo*Nss(K3M=4? z;)6&W*wcK0(nlC>K-{LVU74dISO%`K9c;8*7Pxko!`X zz#Mro<&`0ZvIwAun0r8_BHjK~$@@*KuV;f>u{opN5=F`{3wR7DiKzPc?%lbyjtG@C zfQzbub__D!qiY?s`D^CYN)+&ZEq@MDud=OIvJWm_rZ#A`R~yfK_gFGa=fjK8R zJtbvS1Bpa#qu-F!Pv~NB6e97fMAj^u<%KMycl0yzva=6I+Q>s6QyNZoQLVduid;2W z?Xzroh5PU8_1puOcW#2i@+Xt;S4x*?4Z1PYmKgO_<}Cr22^+dD{`drV^Q6h%qSDn4 z(@G<4^g9giQg&HKevIUE{BUk6QmfWAMWn(}sFK`ICT|4hagS(Umq!_&sd8IVH{6}P zq{2K@z%4j<3By&K2x z`W-p9`ccjHZ(9pIYP`yQiZC-XGmVQQF_2=RMbtZLDq#Bo0s!dDvL_@gVZXjYq**JD zOSY)M#r?V&$>!l0xGUu~g*_#=N>C-r>J4f~GrbaXxP!wqS7ArvIo z3KSI(mp2j4#)D_(FdeTHE^+d@(gr3Foh{jfyr|BD0ko4d+WRV5RVxY{q2aV%!b6NP z<6Zu7UMV8G_tT@KvQv5ulYA>&FmS)LQIC{ADcrX-EdKL~`c^V{?WI{u3WK34RJsTV rarVIf?HYJ + [*] Build static binary (no shared libs) +``` + +## 3. Install BusyBox + +Install the compiled BusyBox to default path: `_install`. + +``` shell +make install +``` + +## 4. Config BusyBox + +```shell +cd _install +mkdir proc sys dev etc etc/init.d +touch etc/init.d/rcS + +cat >etc/init.d/rcS< /tmp/StratoVirt-initrd +``` + +## 6. Running StratoVirt with initrd + +```shell +$ ./stratovirt \ + -kernel /path/to/kernel \ + -append console=ttyS0 reboot=k panic=1 root=/dev/ram rdinit=/bin/sh \ + -initrd /tmp/StratoVirt-initrd \ + -api-channel unix:/path/to/socket \ + -serial stdio +``` + diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 00000000..bc8cefb5 --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,220 @@ +# Getting Started with StratoVirt + +## 1. Prepare + +* Host os + + StratoVirt supports host os Linux 4.19 in both x86_64 and aarch64 platform. + + KVM mod should be supported in your host Linux kernel. + +* Authority + + You should have read/write access to `/dev/kvm`. If not, you can get your access by: + + ```shell + $ sudo setfacl -m u:${USER}:rw /dev/kvm + ``` +## 2. Build StratoVirt from source + +### 2.1 Check rust environment + +To build StratoVirt, make sure that Rust language environment and Cargo have already installed. 
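+If the Rust toolchain is missing, it can be set up with the standard `rustup` installer; this is
+ ordinary upstream tooling rather than something provided by StratoVirt:
+
+```shell
+$ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+$ source $HOME/.cargo/env
+```
+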
+ The version of rustc is suggested up to 1.42. + +```shell +$ rustc -version +rustc 1.42.0 +``` + +If you want to deploy rust environment, the following link will help you: + + + +### 2.2 Build with musl-libc + +With musl-libc, StratoVirt is linked statically and having no library dependencies. It's the + default target to build StratoVirt. + +```shell +# Add musl rust tool-chain, if installed, skip +$ arch=`uname -m` +$ rustup target add ${arch}-unknown-linux-musl + +# Build StratoVirt +$ cargo build --release --target ${arch}-unknown-linux-musl +``` + +Now you can find StratoVirt binary in `target/${arch}-unknown-linux-musl/release/stratovirt`. + +### 2.3 Build with glibc + +StratoVirt can also build using glibc toolchains. By this way, StratoVirt is linked dynamically. + +```shell +# Add gnu rust tool-chain, if installed, skip +$ arch=`uname -m` +$ rustup target add ${arch}-unknown-linux-gnu + +# Build StratoVirt +$ cargo build --release --target ${arch}-unknown-linux-gnu +``` + +Now you can find StratoVirt binary in `target/${arch}-unknown-linux-gnu/release/stratovirt`. + +## 3. Get Kernel and rootfs Image + +### 3.1 Build kernel + +The current version StratoVirt supports only PE-format kernel images in both x86_64 and aarch64 +platforms, which can be built with: + +1. Firstly, get the openEuler kernel source code: + + ```shell + $ git clone https://gitee.com/openeuler/kernel + $ cd kernel + ``` + +2. Check out the kernel version to kernel-4.19: + + ```shell + $ git checkout kernel-4.19 + ``` + +3. Configure your linux kernel build. You can use [our recommended config](./kernel_config) and +copy it to `kernel` path as `.config`. You can interactive config by: + + ```shell + $ make menuconfig + ``` + +4. Build and transform kernel image to PE format. + + ```shell + $ make -j vmlinux && objcopy -O binary vmlinux vmlinux.bin + ``` + +### 3.2 Make rootfs + +Rootfs image is a file system image. An EXT4-format image with an `init` can be mounted at + boot time in StratoVirt. Below is a simple way to make a EXT4 rootfs image: + +1. Prepare a properly-sized file(e.g. 1G): + + ```shell + $ dd if=/dev/zero of=./rootfs.ext4 bs=1G count=20 + ``` + +2. Create an empty EXT4 file system on this file: + + ```shell + $ mkfs.ext4 ./rootfs.ext4 + ``` + +3. Mount the file image: + + ```shell + $ sudo mount ./rootfs.ext4 /mnt/rootfs && cd /mnt/rootfs + ``` + +4. Get the [latest alpine-minirootfs](http://dl-cdn.alpinelinux.org/alpine) for your platform(e.g. + aarch64 3.12.0): + + ```shell + $ wget http://dl-cdn.alpinelinux.org/alpine/latest-stable/releases/aarch64/alpine-minirootfs-3.12.0-aarch64.tar.gz + $ tar -zxvf alpine-minirootfs-3.12.0-aarch64.tar.gz + $ rm alpine-minirootfs-3.12.0-aarch64.tar.gz + ``` + +5. Make a simple `/sbin/init` for EXT4 file image. + + ```shell + $ cat > sbin/init < Date: Tue, 22 Sep 2020 18:36:02 +0800 Subject: [PATCH 2/2] stratovirt: first code commit add initial code for stratovirt. 
Signed-off-by: Bin Wu Signed-off-by: Zhigang Wang , Signed-off-by: Hailiang Zhang , Signed-off-by: Liang Zhang , Signed-off-by: Yandong Xu , Signed-off-by: Xiaohe Yang , Signed-off-by: Jie Yang , Signed-off-by: Fei Xu , Signed-off-by: Wei Gao , Signed-off-by: Ying Fang , Signed-off-by: Zeyu Jin , Signed-off-by: Ming Yang --- .cargo/config | 19 + Cargo.lock | 299 ++++ Cargo.toml | 46 + address_space/Cargo.toml | 16 + address_space/src/address.rs | 298 ++++ address_space/src/address_space.rs | 703 ++++++++ address_space/src/host_mmap.rs | 154 ++ address_space/src/lib.rs | 155 ++ address_space/src/listener.rs | 650 +++++++ address_space/src/region.rs | 942 ++++++++++ boot_loader/Cargo.toml | 20 + boot_loader/src/aarch64/mod.rs | 116 ++ boot_loader/src/lib.rs | 181 ++ boot_loader/src/x86_64/bootparam.rs | 199 +++ boot_loader/src/x86_64/gdt.rs | 112 ++ boot_loader/src/x86_64/mod.rs | 520 ++++++ boot_loader/src/x86_64/mptable.rs | 259 +++ device_model/Cargo.toml | 26 + device_model/src/cpu/aarch64/mod.rs | 260 +++ device_model/src/cpu/mod.rs | 633 +++++++ device_model/src/cpu/x86_64/cpuid.rs | 31 + device_model/src/cpu/x86_64/mod.rs | 481 ++++++ .../src/interrupt_controller/aarch64/gicv3.rs | 484 ++++++ .../src/interrupt_controller/aarch64/mod.rs | 153 ++ device_model/src/interrupt_controller/mod.rs | 33 + device_model/src/legacy/mod.rs | 33 + device_model/src/legacy/pl031.rs | 178 ++ device_model/src/legacy/serial.rs | 459 +++++ device_model/src/lib.rs | 68 + device_model/src/micro_vm/cmdline.rs | 448 +++++ device_model/src/micro_vm/main_loop.rs | 77 + device_model/src/micro_vm/micro_syscall.rs | 162 ++ device_model/src/micro_vm/mod.rs | 1262 ++++++++++++++ device_model/src/mmio/bus.rs | 391 +++++ device_model/src/mmio/mod.rs | 179 ++ device_model/src/mmio/virtio_mmio.rs | 1213 +++++++++++++ device_model/src/virtio/block.rs | 879 ++++++++++ device_model/src/virtio/console.rs | 398 +++++ device_model/src/virtio/mod.rs | 235 +++ device_model/src/virtio/net.rs | 706 ++++++++ device_model/src/virtio/queue.rs | 1525 +++++++++++++++++ device_model/src/virtio/vhost/kernel/mod.rs | 435 +++++ device_model/src/virtio/vhost/kernel/net.rs | 267 +++ device_model/src/virtio/vhost/kernel/vsock.rs | 226 +++ device_model/src/virtio/vhost/mod.rs | 82 + license/LICENSE | 127 ++ ...Third_Party_Open_Source_Software_Notice.md | 358 ++++ machine_manager/Cargo.toml | 22 + machine_manager/src/config/boot_source.rs | 269 +++ machine_manager/src/config/chardev.rs | 179 ++ machine_manager/src/config/fs.rs | 123 ++ machine_manager/src/config/machine_config.rs | 143 ++ machine_manager/src/config/mod.rs | 458 +++++ machine_manager/src/config/network.rs | 163 ++ machine_manager/src/lib.rs | 46 + machine_manager/src/machine.rs | 174 ++ machine_manager/src/qmp/mod.rs | 763 +++++++++ machine_manager/src/qmp/qmp_schema.rs | 819 +++++++++ machine_manager/src/socket.rs | 850 +++++++++ src/main.rs | 150 ++ util/Cargo.toml | 16 + util/src/aio/libaio.rs | 122 ++ util/src/aio/mod.rs | 183 ++ util/src/aio/raw.rs | 42 + util/src/arg_parser.rs | 631 +++++++ util/src/byte_code.rs | 69 + util/src/checksum.rs | 35 + util/src/daemonize.rs | 176 ++ util/src/device_tree.rs | 219 +++ util/src/epoll_context.rs | 576 +++++++ util/src/kvm_ioctls_ext.rs | 56 + util/src/lib.rs | 130 ++ util/src/link_list.rs | 133 ++ util/src/logger.rs | 120 ++ util/src/num_ops.rs | 140 ++ util/src/offsetof.rs | 245 +++ util/src/seccomp.rs | 583 +++++++ util/src/tap.rs | 115 ++ util/src/unix.rs | 33 + 79 files changed, 24281 insertions(+) create mode 100644 
.cargo/config create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 address_space/Cargo.toml create mode 100644 address_space/src/address.rs create mode 100644 address_space/src/address_space.rs create mode 100644 address_space/src/host_mmap.rs create mode 100644 address_space/src/lib.rs create mode 100644 address_space/src/listener.rs create mode 100644 address_space/src/region.rs create mode 100644 boot_loader/Cargo.toml create mode 100644 boot_loader/src/aarch64/mod.rs create mode 100644 boot_loader/src/lib.rs create mode 100644 boot_loader/src/x86_64/bootparam.rs create mode 100644 boot_loader/src/x86_64/gdt.rs create mode 100644 boot_loader/src/x86_64/mod.rs create mode 100644 boot_loader/src/x86_64/mptable.rs create mode 100644 device_model/Cargo.toml create mode 100644 device_model/src/cpu/aarch64/mod.rs create mode 100644 device_model/src/cpu/mod.rs create mode 100644 device_model/src/cpu/x86_64/cpuid.rs create mode 100644 device_model/src/cpu/x86_64/mod.rs create mode 100644 device_model/src/interrupt_controller/aarch64/gicv3.rs create mode 100644 device_model/src/interrupt_controller/aarch64/mod.rs create mode 100644 device_model/src/interrupt_controller/mod.rs create mode 100644 device_model/src/legacy/mod.rs create mode 100644 device_model/src/legacy/pl031.rs create mode 100644 device_model/src/legacy/serial.rs create mode 100644 device_model/src/lib.rs create mode 100644 device_model/src/micro_vm/cmdline.rs create mode 100644 device_model/src/micro_vm/main_loop.rs create mode 100644 device_model/src/micro_vm/micro_syscall.rs create mode 100644 device_model/src/micro_vm/mod.rs create mode 100644 device_model/src/mmio/bus.rs create mode 100644 device_model/src/mmio/mod.rs create mode 100644 device_model/src/mmio/virtio_mmio.rs create mode 100644 device_model/src/virtio/block.rs create mode 100644 device_model/src/virtio/console.rs create mode 100644 device_model/src/virtio/mod.rs create mode 100644 device_model/src/virtio/net.rs create mode 100644 device_model/src/virtio/queue.rs create mode 100644 device_model/src/virtio/vhost/kernel/mod.rs create mode 100644 device_model/src/virtio/vhost/kernel/net.rs create mode 100644 device_model/src/virtio/vhost/kernel/vsock.rs create mode 100644 device_model/src/virtio/vhost/mod.rs create mode 100644 license/LICENSE create mode 100644 license/Third_Party_Open_Source_Software_Notice.md create mode 100644 machine_manager/Cargo.toml create mode 100644 machine_manager/src/config/boot_source.rs create mode 100644 machine_manager/src/config/chardev.rs create mode 100644 machine_manager/src/config/fs.rs create mode 100644 machine_manager/src/config/machine_config.rs create mode 100644 machine_manager/src/config/mod.rs create mode 100644 machine_manager/src/config/network.rs create mode 100644 machine_manager/src/lib.rs create mode 100644 machine_manager/src/machine.rs create mode 100644 machine_manager/src/qmp/mod.rs create mode 100644 machine_manager/src/qmp/qmp_schema.rs create mode 100644 machine_manager/src/socket.rs create mode 100644 src/main.rs create mode 100644 util/Cargo.toml create mode 100644 util/src/aio/libaio.rs create mode 100644 util/src/aio/mod.rs create mode 100644 util/src/aio/raw.rs create mode 100644 util/src/arg_parser.rs create mode 100644 util/src/byte_code.rs create mode 100644 util/src/checksum.rs create mode 100644 util/src/daemonize.rs create mode 100644 util/src/device_tree.rs create mode 100644 util/src/epoll_context.rs create mode 100644 util/src/kvm_ioctls_ext.rs create mode 100644 
util/src/lib.rs create mode 100644 util/src/link_list.rs create mode 100644 util/src/logger.rs create mode 100644 util/src/num_ops.rs create mode 100644 util/src/offsetof.rs create mode 100644 util/src/seccomp.rs create mode 100644 util/src/tap.rs create mode 100644 util/src/unix.rs diff --git a/.cargo/config b/.cargo/config new file mode 100644 index 00000000..0b1372f0 --- /dev/null +++ b/.cargo/config @@ -0,0 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +# +# StratoVirt is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan +# PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +[build] + +[target.'cfg(any(target_arch="aarch64"))'] +rustflags = [ + "-C", "link-arg=-lgcc", + "-C", "link-arg=-lfdt", +] diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..00197ee8 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,299 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "StratoVirt" +version = "0.1.0" +dependencies = [ + "device_model 0.1.0", + "error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "machine_manager 0.1.0", + "util 0.1.0", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "addr2line" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "gimli 0.21.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "address_space" +version = "0.1.0" +dependencies = [ + "error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-bindings 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-ioctls 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "util 0.1.0", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "adler32" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "backtrace" +version = "0.3.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "addr2line 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "miniz_oxide 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "object 0.20.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "boot_loader" +version = "0.1.0" +dependencies = [ + "address_space 0.1.0", + "error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-bindings 0.2.0 
(registry+https://github.com/rust-lang/crates.io-index)", + "kvm-ioctls 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "util 0.1.0", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "device_model" +version = "0.1.0" +dependencies = [ + "address_space 0.1.0", + "boot_loader 0.1.0", + "byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)", + "error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-bindings 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-ioctls 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "machine_manager 0.1.0", + "serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.55 (registry+https://github.com/rust-lang/crates.io-index)", + "util 0.1.0", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace 0.3.49 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "gimli" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "itoa" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "kvm-bindings" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "kvm-ioctls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kvm-bindings 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libc" +version = "0.2.71" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "log" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "machine_manager" +version = "0.1.0" +dependencies = [ + "error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.55 (registry+https://github.com/rust-lang/crates.io-index)", + "util 0.1.0", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "miniz_oxide" +version = "0.3.7" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "adler32 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "object" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde_derive 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_derive" +version = "1.0.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "itoa 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "syn" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "unicode-xid" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "util" +version = "0.1.0" +dependencies = [ + "error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-bindings 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "kvm-ioctls 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vmm-sys-util" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[metadata] +"checksum addr2line 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a49806b9dadc843c61e7c97e72490ad7f7220ae249012fbda9ad0609457c0543" +"checksum adler32 1.1.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "567b077b825e468cc974f0020d4082ee6e03132512f207ef1a02fd5d00d1f32d" +"checksum backtrace 0.3.49 (registry+https://github.com/rust-lang/crates.io-index)" = "05100821de9e028f12ae3d189176b41ee198341eb8f369956407fea2f5cc666c" +"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +"checksum byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +"checksum error-chain 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)" = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +"checksum gimli 0.21.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bcc8e0c9bce37868955864dbecd2b1ab2bdf967e6f28066d65aaac620444b65c" +"checksum itoa 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e" +"checksum kvm-bindings 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d381156ad52005b4655a9421401f02b80f9f049e653496e3ea6639a83fc12453" +"checksum kvm-ioctls 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d99720f5df3814a7188f095ad6774775b7635dfdd62b7c091ce7e00a51c3c109" +"checksum libc 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)" = "9457b06509d27052635f90d6466700c65095fdf75409b3fbdd903e988b886f49" +"checksum log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +"checksum miniz_oxide 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "791daaae1ed6889560f8c4359194f56648355540573244a5448a83ba1ecc7435" +"checksum object 0.20.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1ab52be62400ca80aa00285d25253d7f7c437b7375c4de678f5405d3afe82ca5" +"checksum proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)" = "beae6331a816b1f65d04c45b078fd8e6c93e8071771f41b8163255bbd8d7c8fa" +"checksum quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" +"checksum rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" +"checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +"checksum serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3" +"checksum serde_derive 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "2a0be94b04690fbaed37cddffc5c134bf537c8e3329d53e982fe04c374978f8e" +"checksum serde_json 1.0.55 (registry+https://github.com/rust-lang/crates.io-index)" = "ec2c5d7e739bc07a3e73381a39d61fdb5f671c60c1df26a130690665803d8226" +"checksum syn 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "239f255b9e3429350f188c27b807fc9920a15eb9145230ff1a7d054c08fec319" +"checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +"checksum version_check 0.9.2 
(registry+https://github.com/rust-lang/crates.io-index)" = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +"checksum vmm-sys-util 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "183d25b56a61a6f518ef464ac578e790f04added34dfaab59a453d8a03cb7bd0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..7a96d2ee --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "StratoVirt" +version = "0.1.0" +authors = ["Huawei StratoVirt Team"] +edition = "2018" +description = "a lightweight hypervisor with low memory overhead and fast booting speed" +license = "Mulan PSL v2" + +[dependencies] +util = { path = "util" } +machine_manager = { path = "machine_manager" } +device_model = { path = "device_model" } + +log = "0.4.8" +error-chain = "0.12.4" +vmm-sys-util = "0.6.1" + +[workspace] +members = [ + "address_space", + "machine_manager", + "boot_loader", + "util", + "device_model", +] + +[[bin]] +name = "stratovirt" +path = "src/main.rs" + +[features] +default = ["qmp"] +qmp = [] + +[package.metadata.rpm.cargo] +buildflags = ["--release"] + +[package.metadata.rpm.targets] +stratovirt = { path = "/usr/bin/stratovirt" } + +[profile.dev] +panic = "abort" + +[profile.release] +panic = "abort" +lto = true diff --git a/address_space/Cargo.toml b/address_space/Cargo.toml new file mode 100644 index 00000000..d4003b08 --- /dev/null +++ b/address_space/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "address_space" +version = "0.1.0" +authors = ["Huawei StratoVirt Team"] +license = "Mulan PSL v2" +description = "provide memory management for VM" + +[dependencies] +util = {path = "../util"} + +kvm-ioctls = "0.5.0" +libc = "0.2.71" +kvm-bindings = "0.2.0" +vmm-sys-util = "0.6.1" +error-chain = "0.12.4" +log = "0.4.8" diff --git a/address_space/src/address.rs b/address_space/src/address.rs new file mode 100644 index 00000000..b5c877c1 --- /dev/null +++ b/address_space/src/address.rs @@ -0,0 +1,298 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; +use std::ops::{BitAnd, BitOr}; + +use util::num_ops::{round_down, round_up}; + +/// Represent the address in given address space. +#[derive(Copy, Clone, Default, Debug, Eq, PartialEq, Ord, PartialOrd)] +pub struct GuestAddress(pub u64); + +impl GuestAddress { + /// Get the raw value of `GuestAddress`. + pub fn raw_value(self) -> u64 { + self.0 + } + + /// Get the offset of this address from the given address. + /// The caller has to guarantee no underflow occurs. + /// + /// # Arguments + /// + /// * `other` -Other `GuestAddress`. + pub fn offset_from(self, other: Self) -> u64 { + self.raw_value() - other.raw_value() + } + + /// Return address of this address plus the given offset, return None if overflows. + /// + /// # Arguments + /// + /// * `offset` - Offset address. 
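+    ///
+    /// # Example
+    ///
+    /// A minimal illustrative sketch of the overflow-checked arithmetic (not compiled as a doc-test):
+    ///
+    /// ```rust,ignore
+    /// let base = GuestAddress(0x1000);
+    /// assert_eq!(base.checked_add(0x10), Some(GuestAddress(0x1010)));
+    /// assert_eq!(GuestAddress(u64::max_value()).checked_add(1), None);
+    /// ```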
+    pub fn checked_add(self, offset: u64) -> Option<Self> {
+        self.0.checked_add(offset).map(Self)
+    }
+
+    /// Return address of this address minus the given offset, return None if underflows.
+    ///
+    /// # Arguments
+    ///
+    /// * `offset` - Offset address.
+    pub fn checked_sub(self, offset: u64) -> Option<Self> {
+        self.0.checked_sub(offset).map(Self)
+    }
+
+    /// Return address of this address plus the given offset.
+    /// The caller has to guarantee no overflow occurs.
+    ///
+    /// # Arguments
+    ///
+    /// * `offset` - Offset address.
+    pub fn unchecked_add(self, offset: u64) -> Self {
+        Self(self.0 + offset)
+    }
+
+    /// Return address of this address minus the given offset.
+    /// The caller has to guarantee no underflow occurs.
+    ///
+    /// # Arguments
+    ///
+    /// * `offset` - Offset address.
+    pub fn unchecked_sub(self, offset: u64) -> Self {
+        Self(self.0 - offset)
+    }
+
+    /// Return aligned-up address of Self, according to the given alignment.
+    /// Return None if overflow occurs.
+    ///
+    /// # Arguments
+    ///
+    /// * `alignment` - Alignment base.
+    pub fn align_up(self, alignment: u64) -> Option<Self> {
+        round_up(self.0, alignment).map(Self)
+    }
+
+    /// Return aligned-down address of Self, according to the given alignment.
+    /// Return None if underflow occurs.
+    ///
+    /// # Arguments
+    ///
+    /// * `alignment` - Alignment base.
+    pub fn align_down(self, alignment: u64) -> Option<Self> {
+        round_down(self.0, alignment).map(Self)
+    }
+}
+
+/// Implement BitAnd trait for GuestAddress.
+impl BitAnd<u64> for GuestAddress {
+    type Output = GuestAddress;
+    fn bitand(self, other: u64) -> GuestAddress {
+        GuestAddress(self.0 & other)
+    }
+}
+
+/// Implement BitOr trait for GuestAddress.
+impl BitOr<u64> for GuestAddress {
+    type Output = GuestAddress;
+    fn bitor(self, other: u64) -> GuestAddress {
+        GuestAddress(self.0 | other)
+    }
+}
+
+/// Represent an address range.
+#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
+pub struct AddressRange {
+    /// Base address.
+    pub base: GuestAddress,
+    /// Size of memory segment.
+    pub size: u64,
+}
+
+/// Implement From trait for AddressRange.
+impl From<(u64, u64)> for AddressRange {
+    fn from(range: (u64, u64)) -> AddressRange {
+        AddressRange {
+            base: GuestAddress(range.0),
+            size: range.1,
+        }
+    }
+}
+
+/// Implement PartialOrd trait for AddressRange.
+impl PartialOrd for AddressRange {
+    fn partial_cmp(&self, other: &AddressRange) -> Option<Ordering> {
+        if self.base != other.base {
+            self.base.partial_cmp(&other.base)
+        } else {
+            self.size.partial_cmp(&other.size)
+        }
+    }
+}
+
+/// Implement Ord trait for AddressRange.
+impl Ord for AddressRange {
+    fn cmp(&self, other: &AddressRange) -> Ordering {
+        self.partial_cmp(&other).unwrap()
+    }
+}
+
+impl AddressRange {
+    /// Create a new `AddressRange`.
+    ///
+    /// # Arguments
+    ///
+    /// * `base` - The base address of an AddressRange.
+    /// * `size` - The size of an AddressRange.
+    pub fn new(base: GuestAddress, size: u64) -> AddressRange {
+        AddressRange { base, size }
+    }
+
+    /// Find the intersection with another `AddressRange`.
+    /// Return the intersection of Self and the given address range.
+    /// Return None if they do not overlap.
+    ///
+    /// # Arguments
+    ///
+    /// * `other` - Other AddressRange.
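+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch, mirroring the unit test below in this file (not compiled as a doc-test):
+    ///
+    /// ```rust,ignore
+    /// let a = AddressRange::from((0, 8));
+    /// let b = AddressRange::from((5, 9));
+    /// assert_eq!(a.find_intersection(b), Some(AddressRange::from((5, 3))));
+    /// ```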
+ pub fn find_intersection(&self, other: AddressRange) -> Option { + let end = self.base.checked_add(self.size)?; + let other_end = other.base.checked_add(other.size)?; + + if end <= other.base || other_end <= self.base { + return None; + } + + let start = std::cmp::max(self.base, other.base); + Some(AddressRange { + base: start, + size: std::cmp::min(end, other_end).offset_from(start), + }) + } + + /// Return the end address of this address range. + #[inline] + pub fn end_addr(&self) -> GuestAddress { + self.base.unchecked_add(self.size) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_address_add() { + let addr1 = GuestAddress(0xAE); + let offset: u64 = 0x01; + + let max_addr = GuestAddress(u64::max_value()); + let min_addr = GuestAddress(u64::min_value()); + + assert_eq!(Some(GuestAddress(0xAF)), addr1.checked_add(offset)); + assert_eq!(None, max_addr.checked_add(offset)); + assert_eq!(None, min_addr.checked_sub(offset)); + } + + #[test] + fn test_offset() { + let addr1 = GuestAddress(0xAE); + let addr2 = GuestAddress(0xA0); + let addr3 = GuestAddress(0xE); + + assert_eq!(addr3.raw_value(), addr1.offset_from(addr2)); + } + + #[test] + fn test_address_cmp() { + let addr1 = GuestAddress(0xAE); + let addr2 = GuestAddress(0x63); + let addr3 = GuestAddress(0xAE); + + assert!(addr1 == addr3); + assert!(addr1 > addr2); + assert!(addr2 < addr3); + assert!(addr1 >= addr3); + assert!(addr1 <= addr3); + assert!(addr1 >= addr2); + assert!(addr2 <= addr3); + } + #[test] + fn test_address_equal() { + let addr1 = GuestAddress(0x111); + let addr2 = GuestAddress(0x123); + let addr3 = GuestAddress(0x123); + + assert_eq!(addr2, addr3); + assert_ne!(addr1, addr2); + } + + #[test] + fn test_address_mask() { + let addr = GuestAddress(0xAEAE); + + assert_eq!(GuestAddress(0xAE00), addr & 0xffff00); + assert_eq!(GuestAddress(0xAEAE), addr & 0xFFFFFF); + assert_eq!(GuestAddress(0xFFFF), addr | 0xFFFF); + assert_eq!(GuestAddress(0xFFAE), addr | 0xFF00); + } + + #[test] + fn test_address_align() { + let addr1 = GuestAddress(0x1001); + let addr2 = GuestAddress(0x1009); + + assert_eq!(Some(GuestAddress(0x1010)), addr1.align_up(0x10)); + assert_eq!(Some(GuestAddress(0x1000)), addr2.align_down(0x10)); + } + + #[test] + fn test_address_range_intersects() { + let range1 = AddressRange { + base: GuestAddress(0_u64), + size: 8_u64, + }; + let range2 = AddressRange { + base: GuestAddress(0_u64), + size: 0_u64, + }; + let range3 = AddressRange { + base: GuestAddress(5_u64), + size: 9_u64, + }; + let range4 = AddressRange { + base: GuestAddress(8_u64), + size: 1u64, + }; + + assert!(range1.find_intersection(range2).is_none()); + assert_eq!( + range1.find_intersection(range3), + Some(AddressRange { + base: GuestAddress(5_u64), + size: 3_u64 + }) + ); + assert!(range1.find_intersection(range4).is_none()); + } + + #[test] + fn test_address_range_end_addr() { + let range = AddressRange { + base: GuestAddress(55_u64), + size: 10_u64, + }; + + assert_eq!(range.end_addr(), GuestAddress(65_u64)); + } +} diff --git a/address_space/src/address_space.rs b/address_space/src/address_space.rs new file mode 100644 index 00000000..ed00b9b6 --- /dev/null +++ b/address_space/src/address_space.rs @@ -0,0 +1,703 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. 
+// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::sync::{Arc, Mutex, RwLock}; + +use util::byte_code::ByteCode; + +use crate::errors::{ErrorKind, Result, ResultExt}; +use crate::region::FlatView; +use crate::{ + AddressRange, FlatRange, GuestAddress, Listener, ListenerReqType, Region, RegionIoEventFd, + RegionType, +}; + +/// Address Space of memory. +#[derive(Clone)] +pub struct AddressSpace { + /// Root Region of this AddressSpace. + root: Region, + /// Flat_view is the output of rendering all regions in parent address-space, + /// every time the topology changed (add/delete region), flat_view would be updated. + flat_view: Arc>, + /// The triggered call-backs when flat_view changed. + listeners: Arc>>>, + /// The vector buffer would help in comparison stage of topology update. + ioeventfds: Arc>>, +} + +impl AddressSpace { + /// Create a new `AddressSpace` according to the given root region. + /// + /// # Arguments + /// + /// * `root` - Root region of address space. + pub fn new(root: Region) -> Result> { + let space = Arc::new(AddressSpace { + root: root.clone(), + flat_view: Arc::new(RwLock::new(FlatView::default())), + listeners: Arc::new(Mutex::new(Vec::new())), + ioeventfds: Arc::new(Mutex::new(Vec::new())), + }); + + root.set_belonged_address_space(&space); + if !space.root.subregions().is_empty() { + space.update_topology()?; + } + + Ok(space) + } + + /// Get the copy of the root of AddressSpace. + pub fn root(&self) -> &Region { + &self.root + } + + /// Register the listener to the `AddressSpace`. + /// + /// # Arguments + /// + /// * `listener` - Provided methods for Listener. + /// + /// # Errors + /// + /// Return Error if fail to call `listener`. + pub fn register_listener(&self, listener: Box) -> Result<()> { + for fr in self.flat_view.read().unwrap().0.iter() { + listener + .handle_request(Some(&fr), None, ListenerReqType::AddRegion) + .chain_err(|| "Failed to call listener")?; + } + + let mut idx = 0; + let mut mls = self.listeners.lock().unwrap(); + while idx < mls.len() { + let ml = mls.get(idx).unwrap(); + if ml.priority() >= listener.priority() { + break; + } + idx += 1; + } + mls.insert(idx, listener); + Ok(()) + } + + /// Call listener to deal with the request. + /// + /// # Arguments + /// + /// * `flat_range` - Available when operating `Region`. + /// * `evtfd` - Available when operating `Ioeventfd`. + /// * `req_type` - One selection of how to operate the `Region` or `Ioeventfd`. + /// + /// # Errors + /// + /// Return Error if fail to call listener. + fn call_listeners( + &self, + flat_range: Option<&FlatRange>, + evtfd: Option<&RegionIoEventFd>, + req_type: ListenerReqType, + ) -> Result<()> { + let listeners = self.listeners.lock().unwrap(); + match req_type { + ListenerReqType::DeleteRegion | ListenerReqType::AddIoeventfd => { + listeners.iter().rev().try_for_each(|ml| { + ml.handle_request(flat_range, evtfd, req_type) + .chain_err(|| "Failed to call listener") + }) + } + _ => listeners.iter().try_for_each(|ml| { + ml.handle_request(flat_range, evtfd, req_type) + .chain_err(|| "Failed to call listener") + }), + } + } + + /// Update the topology pass. + /// + /// # Arguments + /// + /// * `old_view` - Old flatview. + /// * `new_view` - New flatview. 
+ /// * `is_add` - Add `new_view` if `true` otherwise replace the `old_view` with `new_view`. + fn update_topology_pass( + &self, + old_view: &FlatView, + new_view: &FlatView, + is_add: bool, + ) -> Result<()> { + let old_ranges = &old_view.0; + let new_ranges = &new_view.0; + let mut old_idx = 0_usize; + let mut new_idx = 0_usize; + + while old_idx < old_ranges.len() || new_idx < new_ranges.len() { + let old_range = old_ranges.get(old_idx); + let new_range = new_ranges.get(new_idx); + + if let Some(old_r) = old_range { + if let Some(new_r) = new_range { + if old_r.addr_range == new_r.addr_range { + old_idx += 1; + new_idx += 1; + continue; + } else if old_r.addr_range.base < new_r.addr_range.base + || (old_r.addr_range.base == new_r.addr_range.base + && old_r.addr_range.size != new_r.addr_range.size) + { + if !is_add { + self.call_listeners(Some(old_r), None, ListenerReqType::DeleteRegion)?; + } + old_idx += 1; + continue; + } + } else { + if !is_add { + self.call_listeners(Some(old_r), None, ListenerReqType::DeleteRegion)?; + } + old_idx += 1; + continue; + } + } + + // current old_range is None, or current new_range is before old_range + if is_add && new_range.is_some() { + self.call_listeners(new_range, None, ListenerReqType::AddRegion)?; + } + new_idx += 1; + } + + Ok(()) + } + + /// Updates ioeventfds pass according to New `RegionIoEventFd` array. + /// + /// # Arguments + /// + /// * `new_evtfds` - New `RegionIoEventFd` array. + fn update_ioeventfds_pass(&self, new_evtfds: &[RegionIoEventFd]) -> Result<()> { + let old_evtfds = self.ioeventfds.lock().unwrap(); + let mut old_idx = 0; + let mut new_idx = 0; + + while old_idx < old_evtfds.len() || new_idx < new_evtfds.len() { + let old_fd = old_evtfds.get(old_idx); + let new_fd = new_evtfds.get(new_idx); + if old_fd.is_some() && (new_fd.is_none() || old_fd.unwrap().before(new_fd.unwrap())) { + self.call_listeners(None, old_fd, ListenerReqType::DeleteIoeventfd)?; + old_idx += 1; + } else if new_fd.is_some() + && (old_fd.is_none() || new_fd.unwrap().before(old_fd.unwrap())) + { + self.call_listeners(None, new_fd, ListenerReqType::AddIoeventfd)?; + new_idx += 1; + } else { + old_idx += 1; + new_idx += 1; + } + } + + Ok(()) + } + + /// Update IoEvents. + fn update_ioeventfds(&self) -> Result<()> { + let flatview = self.flat_view.read().unwrap(); + let mut ioeventfds = Vec::::new(); + + for fr in flatview.0.iter() { + for evtfd in fr.owner.ioeventfds().iter() { + if fr.addr_range.find_intersection(evtfd.addr_range).is_some() { + ioeventfds.push(evtfd.try_clone()?); + } + } + } + + self.update_ioeventfds_pass(&ioeventfds)?; + *self.ioeventfds.lock().unwrap() = ioeventfds; + Ok(()) + } + + /// Return the start host address of Region where the `GuestAddress` belongs to. + /// + /// # Arguments + /// + /// * `addr` - Guest address. + pub fn get_host_address(&self, addr: GuestAddress) -> Option { + let view = &self.flat_view.read().unwrap().0; + + match view.binary_search_by_key(&addr, |x| x.addr_range.base) { + Ok(x) => view[x] + .owner + .get_host_address() + .map(|hva| hva + view[x].offset_in_region), + Err(x) if (x > 0 && addr < view[x - 1].addr_range.end_addr()) => { + let offset = addr.offset_from(view[x - 1].addr_range.base); + let offset_in_region = view[x - 1].offset_in_region; + view[x - 1] + .owner + .get_host_address() + .map(|hva| hva + offset_in_region + offset) + } + _ => None, + } + } + + /// Check if the GuestAddress is in one of Ram region. + /// + /// # Arguments + /// + /// * `addr` - Guest address. 
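+    /// * `size` - Size of the range to check; the whole `[addr, addr + size)` span
+    ///            must lie within a single Ram-backed flat range.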
+ pub fn address_in_memory(&self, addr: GuestAddress, size: u64) -> bool { + let view = &self.flat_view.read().unwrap().0; + + match view.binary_search_by_key(&addr, |x| x.addr_range.base) { + Ok(x) => { + view[x].owner.region_type() == RegionType::Ram && size <= view[x].addr_range.size + } + Err(x) if (x > 0 && addr < view[x - 1].addr_range.end_addr()) => { + view[x - 1].owner.region_type() == RegionType::Ram + && size <= view[x - 1].addr_range.end_addr().offset_from(addr) + } + _ => false, + } + } + + /// Return the biggest end address in all Ram regions in AddressSpace. + pub fn memory_end_address(&self) -> GuestAddress { + let view = &self.flat_view.read().unwrap().0; + view.iter() + .filter(|fr| fr.owner.region_type() == RegionType::Ram) + .max_by_key(|fr| fr.addr_range.end_addr()) + .map_or(GuestAddress(0), |fr| fr.addr_range.end_addr()) + } + + /// Read memory segment to `dst`. + /// + /// # Arguments + /// + /// * `dst` - Destination the data would be written to. + /// * `addr` - Start address. + /// * `count` - Size of data. + /// + /// # Errors + /// + /// Return Error if the `addr` is a invalid GuestAddress. + pub fn read(&self, dst: &mut dyn std::io::Write, addr: GuestAddress, count: u64) -> Result<()> { + let view = &self.flat_view.read().unwrap().0; + + let (fr, offset) = match view.binary_search_by_key(&addr, |x| x.addr_range.base) { + Ok(x) => (&view[x], 0), + Err(x) if (x > 0 && addr < view[x - 1].addr_range.end_addr()) => { + let fr = &view[x - 1]; + (fr, addr.offset_from(fr.addr_range.base)) + } + _ => return Err(ErrorKind::AddrInvalid(addr.raw_value()).into()), + }; + + fr.owner.read( + dst, + fr.addr_range.base.unchecked_sub(fr.offset_in_region), + fr.offset_in_region + offset, + count, + ) + } + + /// Write memory segment to `dst`. + /// + /// # Arguments + /// + /// * `dst` - Destination the data would be written to. + /// * `addr` - Start address. + /// * `count` - Size of data. + /// + /// # Errors + /// + /// Return Error if the `addr` is a invalid GuestAddress. + pub fn write(&self, src: &mut dyn std::io::Read, addr: GuestAddress, count: u64) -> Result<()> { + let view = &self.flat_view.read().unwrap().0; + + let (fr, offset) = match view.binary_search_by_key(&addr, |x| x.addr_range.base) { + Ok(x) => (&view[x], 0), + Err(x) if (x > 0 && addr < view[x - 1].addr_range.end_addr()) => { + let fr = &view[x - 1]; + (fr, addr.offset_from(fr.addr_range.base)) + } + _ => return Err(ErrorKind::AddrInvalid(addr.raw_value()).into()), + }; + + fr.owner.write( + src, + fr.addr_range.base.unchecked_sub(fr.offset_in_region), + fr.offset_in_region + offset, + count, + ) + } + + /// Write an object to memory. + /// + /// # Arguments + /// + /// * `data` - The object that will be written to the memory. + /// * `addr` - The start address of memory where the object will be written to. + /// + /// # Note + /// To use this method, it is necessary to implement `ByteCode` trait for your object. + pub fn write_object(&self, data: &T, addr: GuestAddress) -> Result<()> { + self.write(&mut data.as_bytes(), addr, std::mem::size_of::() as u64) + } + + /// Read some data from memory to form an object. + /// + /// # Arguments + /// + /// * `addr` - The start address of memory where the data will be read from. + /// + /// # Note + /// To use this method, it is necessary to implement `ByteCode` trait for your object. 
+ pub fn read_object(&self, addr: GuestAddress) -> Result { + let mut obj = T::default(); + self.read( + &mut obj.as_mut_bytes(), + addr, + std::mem::size_of::() as u64, + )?; + Ok(obj) + } + + /// Update the topology of memory. + pub fn update_topology(&self) -> Result<()> { + let old_fv = self.flat_view.read().unwrap(); + + let addr_range = AddressRange::new(GuestAddress(0), self.root.size()); + let new_fv = self.root.generate_flatview(GuestAddress(0), addr_range)?; + + self.update_topology_pass(&old_fv, &new_fv, false)?; + self.update_topology_pass(&old_fv, &new_fv, true)?; + + drop(old_fv); + *self.flat_view.write().unwrap() = new_fv; + self.update_ioeventfds()?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use vmm_sys_util::eventfd::EventFd; + + use super::*; + use crate::{HostMemMapping, RegionOps}; + + #[derive(Default, Clone)] + struct TestListener { + reqs: Arc>>, + } + + impl Listener for TestListener { + fn priority(&self) -> i32 { + 2 + } + + fn handle_request( + &self, + range: Option<&FlatRange>, + eventfd: Option<&RegionIoEventFd>, + req_type: ListenerReqType, + ) -> Result<()> { + match req_type { + ListenerReqType::AddRegion | ListenerReqType::DeleteRegion => { + self.reqs + .lock() + .unwrap() + .push((req_type, range.unwrap().addr_range)); + } + ListenerReqType::AddIoeventfd | ListenerReqType::DeleteIoeventfd => { + self.reqs + .lock() + .unwrap() + .push((req_type, eventfd.unwrap().addr_range)); + } + } + Ok(()) + } + } + + struct TestDevice; + impl RegionOps for TestDevice { + fn read(&mut self, _data: &mut [u8], _base: GuestAddress, _offset: u64) -> bool { + true + } + + fn write(&mut self, _data: &[u8], _base: GuestAddress, _offset: u64) -> bool { + true + } + } + + // the listeners in AddressSpace is settled in ascending order by priority + #[test] + fn test_listeners() { + // define an array of listeners in order to check the priority order + struct ListenerPrior0; + impl Listener for ListenerPrior0 { + fn priority(&self) -> i32 { + 0 + } + } + struct ListenerPrior3; + impl Listener for ListenerPrior3 { + fn priority(&self) -> i32 { + 3 + } + } + struct ListenerPrior4; + impl Listener for ListenerPrior4 { + fn priority(&self) -> i32 { + 4 + } + } + struct ListenerNeg; + impl Listener for ListenerNeg { + fn priority(&self) -> i32 { + -1 + } + } + + let root = Region::init_container_region(8000); + let space = AddressSpace::new(root).unwrap(); + let listener1 = Box::new(ListenerPrior0); + let listener2 = Box::new(ListenerPrior0); + let listener3 = Box::new(ListenerPrior3); + let listener4 = Box::new(ListenerPrior4); + let listener5 = Box::new(ListenerNeg); + space.register_listener(listener1).unwrap(); + space.register_listener(listener3).unwrap(); + space.register_listener(listener5).unwrap(); + space.register_listener(listener2).unwrap(); + space.register_listener(listener4).unwrap(); + + let mut pre_prior = std::i32::MIN; + for listener in space.listeners.lock().unwrap().iter() { + let curr = listener.priority(); + assert!(pre_prior <= curr); + pre_prior = curr; + } + } + + #[test] + fn test_update_topology() { + let root = Region::init_container_region(8000); + let space = AddressSpace::new(root.clone()).unwrap(); + let listener = TestListener::default(); + space.register_listener(Box::new(listener.clone())).unwrap(); + + // memory region layout + // 0 1000 2000 3000 4000 5000 6000 7000 8000 + // |------|------|------|------|------|------|------|------| + // C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + // B: [ ] + // + // the flat_view is as follows, 
region-b is container which will not appear in the flat-view + // [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + let region_b = Region::init_container_region(4000); + let region_c = Region::init_io_region(6000, Arc::new(Mutex::new(TestDevice))); + region_b.set_priority(2); + region_c.set_priority(1); + root.add_subregion(region_b.clone(), 2000).unwrap(); + root.add_subregion(region_c.clone(), 0).unwrap(); + + assert_eq!(space.flat_view.read().unwrap().0.len(), 1); + assert_eq!(listener.reqs.lock().unwrap().len(), 1); + assert_eq!( + listener.reqs.lock().unwrap().get(0).unwrap().1, + AddressRange::new(region_c.offset(), region_c.size()) + ); + listener.reqs.lock().unwrap().clear(); + + // region layout + // 0 1000 2000 3000 4000 5000 6000 7000 8000 + // |------|------|------|------|------|------|------|------| + // C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + // B: [ ] + // D: [DDDDDD] + // the flat_view is as follows, + // [CCCCCCCCCCCC][DDDDDD][CCCCCCCCCCCCCCCCCCC] + let region_d = Region::init_io_region(1000, Arc::new(Mutex::new(TestDevice))); + region_b.add_subregion(region_d.clone(), 0).unwrap(); + + assert_eq!(space.flat_view.read().unwrap().0.len(), 3); + assert_eq!(listener.reqs.lock().unwrap().len(), 4); + // delete flat-range 0~6000 first, belonging to region_c + assert_eq!( + listener.reqs.lock().unwrap().get(0).unwrap().1, + AddressRange::new(region_c.offset(), region_c.size()) + ); + // add range 0~2000, belonging to region_c + assert_eq!( + listener.reqs.lock().unwrap().get(1).unwrap().1, + AddressRange::new(region_c.offset(), 2000) + ); + // add range 2000~3000, belonging to region_d + let region_d_range = AddressRange::new(GuestAddress(2000), region_d.size()); + assert_eq!( + listener.reqs.lock().unwrap().get(2).unwrap().1, + region_d_range + ); + // add range 3000~6000, belonging to region_c + assert_eq!( + listener.reqs.lock().unwrap().get(3).unwrap().1, + AddressRange::from((3000, 3000)) + ); + } + + #[test] + fn test_update_ioeventfd() { + struct TestIoEventFd; + impl RegionOps for TestIoEventFd { + fn read(&mut self, _data: &mut [u8], _base: GuestAddress, _offset: u64) -> bool { + true + } + + fn write(&mut self, _data: &[u8], _base: GuestAddress, _offset: u64) -> bool { + true + } + + fn ioeventfds(&self) -> Vec { + vec![RegionIoEventFd { + fd: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + addr_range: AddressRange::from((0, 4)), + data_match: true, + data: 0_64, + }] + } + } + + // region layout + // 0 1000 2000 3000 4000 5000 6000 7000 8000 + // |------|------|------|------|------|------|------|------| + // b: [BBBBBBBBBBBBB] + // c: [CCCCCCCCCCCCC] + // the flat_view is as follows, + // [BBBBBBBBBBBBB][CCCCC] + let root = Region::init_container_region(8000); + let space = AddressSpace::new(root.clone()).unwrap(); + let listener = TestListener::default(); + space.register_listener(Box::new(listener.clone())).unwrap(); + + let region_b = Region::init_io_region(2000, Arc::new(Mutex::new(TestIoEventFd))); + region_b.set_priority(1); + let region_c = Region::init_io_region(2000, Arc::new(Mutex::new(TestIoEventFd))); + + root.add_subregion(region_c, 2000).unwrap(); + assert_eq!(listener.reqs.lock().unwrap().len(), 2); + assert_eq!( + listener.reqs.lock().unwrap().get(1).unwrap().1, + AddressRange::new(GuestAddress(2000), 4) + ); + listener.reqs.lock().unwrap().clear(); + + root.add_subregion(region_b, 1000).unwrap(); + assert_eq!(listener.reqs.lock().unwrap().len(), 5); + // add ioeventfd of region_b + assert_eq!( + listener.reqs.lock().unwrap().get(3).unwrap().1, + 
AddressRange::new(GuestAddress(1000), 4) + ); + // ioeventfd in region_c is shawdowed, delete it + assert_eq!( + listener.reqs.lock().unwrap().get(4).unwrap().1, + AddressRange::new(GuestAddress(2000), 4) + ); + } + + #[test] + fn test_get_ram_info() { + let root = Region::init_container_region(8000); + let space = AddressSpace::new(root.clone()).unwrap(); + + let ram1 = Arc::new(HostMemMapping::new(GuestAddress(0), 1000, false).unwrap()); + let ram2 = Arc::new(HostMemMapping::new(GuestAddress(2000), 1000, false).unwrap()); + let region_a = Region::init_ram_region(ram1.clone()); + let region_b = Region::init_ram_region(ram2.clone()); + root.add_subregion(region_a, ram1.start_address().raw_value()) + .unwrap(); + root.add_subregion(region_b, ram2.start_address().raw_value()) + .unwrap(); + + assert_eq!( + space.memory_end_address(), + ram2.start_address().unchecked_add(ram2.size()) + ); + assert!(space.address_in_memory(GuestAddress(0), 0)); + assert_eq!(space.address_in_memory(GuestAddress(1000), 0), false); + assert_eq!(space.address_in_memory(GuestAddress(1500), 0), false); + assert!(space.address_in_memory(GuestAddress(2900), 0)); + + assert_eq!( + space.get_host_address(GuestAddress(500)), + Some(ram1.host_address() + 500) + ); + assert_eq!( + space.get_host_address(GuestAddress(2500)), + Some(ram2.host_address() + 500) + ); + + // region layout + // 0 1000 2000 3000 4000 5000 6000 7000 8000 + // |------|------|------|------|------|------|------|------| + // a: [AAAAAA] + // b: [BBBBBB] + // c: [CCCCCCCCC] + // the flat_view is as follows, + // [AAAAAA][CCCCCCCCC][BB] + let region_c = Region::init_io_region(1500, Arc::new(Mutex::new(TestDevice))); + region_c.set_priority(1); + root.add_subregion(region_c, 1000).unwrap(); + + assert_eq!( + space.memory_end_address(), + ram2.start_address().unchecked_add(ram2.size()) + ); + assert!(space.address_in_memory(GuestAddress(0), 0)); + assert_eq!(space.address_in_memory(GuestAddress(1000), 0), false); + assert_eq!(space.address_in_memory(GuestAddress(1500), 0), false); + assert_eq!(space.address_in_memory(GuestAddress(2400), 0), false); + assert!(space.address_in_memory(GuestAddress(2900), 0)); + + assert_eq!( + space.get_host_address(GuestAddress(500)), + Some(ram1.host_address() + 500) + ); + assert!(space.get_host_address(GuestAddress(2400)).is_none()); + assert_eq!( + space.get_host_address(GuestAddress(2500)), + Some(ram2.host_address() + 500) + ); + } + + #[test] + fn test_write_and_read_object() { + let root = Region::init_container_region(8000); + let space = AddressSpace::new(root.clone()).unwrap(); + let ram1 = Arc::new(HostMemMapping::new(GuestAddress(0), 1000, false).unwrap()); + let region_a = Region::init_ram_region(ram1.clone()); + root.add_subregion(region_a, ram1.start_address().raw_value()) + .unwrap(); + + let data: u64 = 10000; + assert!(space.write_object(&data, GuestAddress(992)).is_ok()); + let data1: u64 = space.read_object(GuestAddress(992)).unwrap(); + assert_eq!(data1, 10000); + assert!(space.write_object(&data, GuestAddress(993)).is_err()); + } +} diff --git a/address_space/src/host_mmap.rs b/address_space/src/host_mmap.rs new file mode 100644 index 00000000..0a781b77 --- /dev/null +++ b/address_space/src/host_mmap.rs @@ -0,0 +1,154 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. 
+// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::sync::Arc; + +use crate::errors::{ErrorKind, Result}; +use crate::{AddressRange, GuestAddress}; + +/// Create a new HostMemMapping. +/// +/// # Arguments +/// +/// * `ranges` - The guest address range that will be mapped. +/// * `omit_vm_memory` - Dump guest memory in core file or not. +pub fn create_host_mmaps( + ranges: &[(u64, u64)], + omit_vm_memory: bool, +) -> Result>> { + let mut mappings = Vec::new(); + + for range in ranges.iter() { + mappings.push(Arc::new(HostMemMapping::new( + GuestAddress(range.0), + range.1, + omit_vm_memory, + )?)); + } + + Ok(mappings) +} + +/// Record information of memory mapping. +pub struct HostMemMapping { + /// Record the range of one memory segment. + address_range: AddressRange, + /// The start address of mapped memory. + host_addr: *mut u8, +} + +// Send and Sync is not auto-implemented for raw pointer type +// implementing them is safe because field of HostMemMapping won't change once initialized, +// only access(r/w) is permitted +unsafe impl Send for HostMemMapping {} +unsafe impl Sync for HostMemMapping {} + +impl HostMemMapping { + /// Construct a new HostMemMapping. + /// + /// # Arguments + /// + /// * `guest_addr` - The start address im memory. + /// * `size` - Size of memory that will be mapped. + /// * `omit_vm_memory` - Dump guest memory in core file or not. + /// + /// # Errors + /// + /// Return Error if fail to map memory. + pub fn new( + guest_addr: GuestAddress, + size: u64, + omit_vm_memory: bool, + ) -> Result { + let host_addr = unsafe { + let hva = libc::mmap( + std::ptr::null_mut() as *mut libc::c_void, + size as libc::size_t, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE, + -1, + 0, + ); + if hva == libc::MAP_FAILED { + return Err(ErrorKind::Mmap.into()); + } + hva + }; + + if omit_vm_memory { + unsafe { + let madvise_res = libc::madvise( + host_addr as *mut libc::c_void, + size as libc::size_t, + libc::MADV_DONTDUMP, + ); + if madvise_res < 0 { + error!("madvise with MADV_DONTDUMP failed"); + } + } + } + + Ok(HostMemMapping { + address_range: AddressRange { + base: guest_addr, + size, + }, + host_addr: host_addr as *mut u8, + }) + } + + /// Get size of mapped memory. + pub fn size(&self) -> u64 { + self.address_range.size + } + + /// Get start address of mapped memory. + pub fn start_address(&self) -> GuestAddress { + self.address_range.base + } + + /// Get start `HVA` (host virtual address) of mapped memory. + #[inline] + pub fn host_address(&self) -> u64 { + self.host_addr as u64 + } +} + +impl Drop for HostMemMapping { + /// Release the memory mapping. 
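+    /// The mapped range is released with `munmap`; its return value is not checked here.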
+ fn drop(&mut self) { + unsafe { + libc::munmap( + self.host_addr as *mut libc::c_void, + self.size() as libc::size_t, + ); + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + fn identify(ram: HostMemMapping, st: u64, end: u64) { + assert_eq!(ram.start_address(), GuestAddress(st)); + assert_eq!(ram.size(), end - st); + } + + #[test] + fn test_ramblock_creation() { + let ram1 = HostMemMapping::new(GuestAddress(0), 100u64, false).unwrap(); + let ram2 = HostMemMapping::new(GuestAddress(0), 100u64, false).unwrap(); + identify(ram1, 0, 100); + identify(ram2, 0, 100); + } +} diff --git a/address_space/src/lib.rs b/address_space/src/lib.rs new file mode 100644 index 00000000..ab9014f2 --- /dev/null +++ b/address_space/src/lib.rs @@ -0,0 +1,155 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +#![allow(missing_docs)] +//! Manages address resources that used by Vm and their devices. +//! +//! # Examples +//! +//! ```rust +//! use std::sync::{Arc, Mutex}; +//! extern crate address_space; +//! use address_space::{AddressSpace, Region, GuestAddress, HostMemMapping, RegionOps}; +//! +//! struct DummyDevice; +//! impl RegionOps for DummyDevice { +//! fn read(&mut self, data: &mut [u8], base: GuestAddress, offset: u64) -> bool { +//! true +//! } +//! fn write(&mut self, data: &[u8], base: GuestAddress, offset: u64) -> bool { +//! true +//! } +//! } +//! +//! fn main() { +//! // 1. create address_space +//! let space = AddressSpace::new(Region::init_container_region(u64::max_value())).unwrap(); +//! +//! // 2. create an Ram-type Region, and set it's priority +//! let mem_mapping = Arc::new(HostMemMapping::new(GuestAddress(0), 0x1000, false).unwrap()); +//! let ram_region = Region::init_ram_region(mem_mapping.clone()); +//! ram_region.set_priority(10); +//! +//! // 3. create a IO-type Region +//! let dev = Arc::new(Mutex::new(DummyDevice)); +//! let io_region = Region::init_io_region(0x1000, dev); +//! +//! // 4. add sub_region to address_space's root region +//! space.root().add_subregion(ram_region, mem_mapping.start_address().raw_value()); +//! space.root().add_subregion(io_region, 0x2000); +//! +//! // 5. access address_space +//! space.write_object(&0x11u64, GuestAddress(0)); +//! } +//! ``` + +extern crate kvm_bindings; +extern crate kvm_ioctls; +extern crate libc; +extern crate util; +extern crate vmm_sys_util; +#[macro_use] +extern crate error_chain; +#[macro_use] +extern crate log; + +mod address; +mod address_space; +mod host_mmap; +mod listener; +mod region; + +pub use address::{AddressRange, GuestAddress}; +pub use address_space::AddressSpace; +pub use host_mmap::{create_host_mmaps, HostMemMapping}; +#[cfg(target_arch = "x86_64")] +pub use listener::KvmIoListener; +pub use listener::KvmMemoryListener; +pub use listener::{Listener, ListenerReqType}; +pub use region::{FlatRange, Region, RegionIoEventFd, RegionType}; + +pub mod errors { + error_chain! 
{ + foreign_links { + Io(std::io::Error); + } + links { + KvmListener(crate::listener::errors::Error, crate::listener::errors::ErrorKind); + } + errors { + RegionOverlap(addr: u64) { + display("Region overlap with others, addr {}", addr) + } + IoEventFd { + display("Failed to clone EventFd") + } + AddrResource { + display("No available address resource in space") + } + AddrNotAligned(addr: u64) { + display("Specified address is not aligned, {}", addr) + } + AddrInvalid(addr: u64) { + display("Failed to find matched region, addr {}", addr) + } + Overflow(addr: u64) { + display("Address overflows, addr is {}", addr) + } + FileBackend { + display("Exceed file-backend length") + } + Mmap { + display("Failed to mmap") + } + IoAccess(offset: u64) { + display("Access io region failed, offset is {}", offset) + } + RegionType(t: crate::RegionType) { + display("Wrong region type, {:#?}", t) + } + } + } +} + +/// Provide Some operations of `Region`, mainly used by Vm's devices. +pub trait RegionOps: Send { + /// Read data from Region to argument `data`, + /// return `true` if read successfully, or return `false`. + /// + /// # Arguments + /// + /// * `data` - A u8-type array. + /// * `base` - Base address. + /// * `offset` - Offset from base address. + fn read(&mut self, data: &mut [u8], base: GuestAddress, offset: u64) -> bool; + + /// Write `data` to memory, + /// return `true` if write successfully, or return `false`. + /// + /// # Arguments + /// + /// * `data` - A u8-type array. + /// * `base` - Base address. + /// * `offset` - Offset from base address. + fn write(&mut self, data: &[u8], base: GuestAddress, offset: u64) -> bool; + + /// Create a group of IoEvents for `region`. + fn ioeventfds(&self) -> Vec { + Vec::new() + } +} + +/// Gets the page size of system. +#[inline] +pub fn page_size() -> u64 { + unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 } +} diff --git a/address_space/src/listener.rs b/address_space/src/listener.rs new file mode 100644 index 00000000..3810d1db --- /dev/null +++ b/address_space/src/listener.rs @@ -0,0 +1,650 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; + +use kvm_bindings::kvm_userspace_memory_region; +use kvm_ioctls::{IoEventAddress, NoDatamatch, VmFd}; +use util::num_ops::round_down; + +use crate::{page_size, AddressRange, FlatRange, RegionIoEventFd, RegionType}; + +pub mod errors { + error_chain! { + errors { + NoAvailKvmSlot { + display("No available kvm_mem_slot, used up") + } + NoMatchedKvmSlot(addr: u64, sz: u64) { + display("Failed to find matched kvm_mem_slot, addr {}, size {}", addr, sz) + } + Overlap { + display("Address range overlaps with others") + } + } + } +} +use self::errors::{ErrorKind, Result, ResultExt}; + +/// Different operations of listener requests. +#[derive(Debug, Copy, Clone)] +pub enum ListenerReqType { + /// Add a region. + AddRegion, + /// Delete a region. + DeleteRegion, + /// Add a io event file descriptor. 
+ AddIoeventfd, + /// Delete a io event file descriptor. + DeleteIoeventfd, +} + +pub trait Listener: Send + Sync { + /// Get priority. + fn priority(&self) -> i32; + + /// Deal with the request. + /// + /// # Arguments + /// + /// * `_range` - FlatRange would be used to find the region. + /// * `_evtfd` - RegionIoEventFd of Region. + /// * `_type` - Request type. + fn handle_request( + &self, + _range: Option<&FlatRange>, + _evtfd: Option<&RegionIoEventFd>, + _type: ListenerReqType, + ) -> std::result::Result<(), crate::errors::Error> { + Ok(()) + } +} + +/// Memory slot constructing a link between guest address and host address. +#[derive(Default, Copy, Clone)] +struct MemSlot { + /// Index of a memory slot. + pub index: u32, + /// Guest address. + pub guest_addr: u64, + /// Size of memory. + pub size: u64, + /// Host address. + pub host_addr: u64, + /// Flag. + pub flag: u32, +} + +/// Kvm memory listener. +#[derive(Clone)] +pub struct KvmMemoryListener { + /// Id of AddressSpace. + as_id: Arc, + /// File descriptor of VM. + fd: Arc, + /// Record all MemSlots. + slots: Arc>>, +} + +impl KvmMemoryListener { + /// Create a new KvmMemoryListener for a VM. + /// + /// # Arguments + /// + /// * `nr_slots` - Number of slots. + /// * `vmfd` - The file descriptor of VM. + pub fn new(nr_slots: u32, vmfd: Arc) -> KvmMemoryListener { + KvmMemoryListener { + as_id: Arc::new(AtomicU32::new(0)), + fd: vmfd, + slots: Arc::new(Mutex::new(vec![MemSlot::default(); nr_slots as usize])), + } + } + + /// Find a free slot and fills it with given arguments. + /// + /// # Arguments + /// + /// * `guest_addr` - Guest address. + /// * `size` - Size of slots. + /// * `host_addr` - Host address. + /// + /// # Errors + /// + /// Return Error if + /// * no valid Kvm slot. + /// * memory overflows. + fn get_free_slot(&self, guest_addr: u64, size: u64, host_addr: u64) -> Result { + let mut slots = self.slots.lock().unwrap(); + + // check if the given address range overlaps with exist ones + let range = AddressRange::from((guest_addr, size)); + slots.iter().try_for_each::<_, Result<()>>(|s| { + if AddressRange::from((s.guest_addr, s.size)) + .find_intersection(range) + .is_some() + { + return Err(ErrorKind::Overlap.into()); + } + Ok(()) + })?; + + for (index, slot) in slots.iter_mut().enumerate() { + if slot.size == 0 { + slot.index = index as u32; + slot.guest_addr = guest_addr; + slot.size = size; + slot.host_addr = host_addr; + return Ok(slot.index); + } + } + + Err(ErrorKind::NoAvailKvmSlot.into()) + } + + /// Delete a slot after finding it according to the given arguments. + /// Return the deleted one if succeed. + /// + /// # Arguments + /// + /// * `addr` - Guest address of slot. + /// * `size` - Size of slots. + /// + /// # Errors + /// + /// Return Error if no Kem slot matched. + fn delete_slot(&self, addr: u64, size: u64) -> Result { + let mut slots = self.slots.lock().unwrap(); + for slot in slots.iter_mut() { + if slot.guest_addr == addr && slot.size == size { + // set slot size to zero, so it can be reused later + slot.size = 0; + return Ok(*slot); + } + } + Err(ErrorKind::NoMatchedKvmSlot(addr, size).into()) + } + + /// Align a piece of memory segment according to `alignment`, + /// return AddressRange after aligned. + /// + /// # Arguments + /// + /// * `range` - One piece of memory segment. + /// * `alignment` - Alignment base. + /// + /// # Errors + /// + /// Return Error if Memslot size is zero after aligned. 
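+    ///
+    /// # Example
+    ///
+    /// A sketch with hypothetical values and a 0x1000-byte alignment, showing how both
+    /// the base and the size get rounded to page boundaries (not compiled as a doc-test):
+    ///
+    /// ```rust,ignore
+    /// let range = AddressRange::from((0x1234, 0x3000));
+    /// let aligned = KvmMemoryListener::align_mem_slot(range, 0x1000).unwrap();
+    /// assert_eq!(aligned, AddressRange::from((0x2000, 0x2000)));
+    /// ```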
+ fn align_mem_slot(range: AddressRange, alignment: u64) -> Result { + let aligned_addr = range.base.align_up(alignment).chain_err(|| { + format!( + "Address Overflows after aligned, addr: {}", + range.base.raw_value() + ) + })?; + + let aligned_size = range + .size + .checked_sub(aligned_addr.offset_from(range.base)) + .and_then(|sz| round_down(sz, alignment)) + .filter(|&sz| sz > 0_u64) + .ok_or_else(|| ErrorKind::Msg("Mem slot size is zero after aligned".to_string()))?; + + Ok(AddressRange::new(aligned_addr, aligned_size)) + } + + /// Add a region to KvmMemoryListener, + /// the argument `flat_range` is used to find the region. + /// + /// # Arguments + /// + /// * `flat_range` - FlatRange would be used to find the region. + /// + /// # Errors + /// + /// Return Error if fail to delete kvm_mem_slot. + fn add_region(&self, flat_range: &FlatRange) -> Result<()> { + if flat_range.owner.region_type() != RegionType::Ram { + return Ok(()); + } + + let (aligned_addr, aligned_size) = + Self::align_mem_slot(flat_range.addr_range, page_size()).map(|r| (r.base, r.size))?; + let align_adjust = aligned_addr.raw_value() - flat_range.addr_range.base.raw_value(); + + // `unwrap()` won't fail because Ram-type Region definitely has hva + let aligned_hva = flat_range.owner.get_host_address().unwrap() + + flat_range.offset_in_region + + align_adjust; + + let slot_idx = self.get_free_slot(aligned_addr.raw_value(), aligned_size, aligned_hva)?; + + let kvm_region = kvm_userspace_memory_region { + slot: slot_idx | (self.as_id.load(Ordering::SeqCst) << 16), + guest_phys_addr: aligned_addr.raw_value(), + memory_size: aligned_size, + userspace_addr: aligned_hva, + flags: 0, + }; + unsafe { + self.fd.set_user_memory_region(kvm_region).or_else(|e| { + self.delete_slot(aligned_addr.raw_value(), aligned_size) + .chain_err(|| "Failed to delete kvm_mem_slot")?; + Err(e).chain_err(|| { + format!( + "KVM register memory region failed: addr {}, size {}", + aligned_addr.raw_value(), + aligned_size + ) + }) + })?; + } + Ok(()) + } + + /// Delete a region from KvmMemoryListener. + /// + /// # Arguments + /// + /// * `flat_range` - FlatRange would be used to find the region. + fn delete_region(&self, flat_range: &FlatRange) -> Result<()> { + if flat_range.owner.region_type() != RegionType::Ram { + return Ok(()); + } + + let (aligned_addr, aligned_size) = + Self::align_mem_slot(flat_range.addr_range, page_size()).map(|r| (r.base, r.size))?; + + let mem_slot = self.delete_slot(aligned_addr.raw_value(), aligned_size)?; + + let kvm_region = kvm_userspace_memory_region { + slot: mem_slot.index | (self.as_id.load(Ordering::SeqCst) << 16), + guest_phys_addr: mem_slot.guest_addr, + memory_size: 0_u64, + userspace_addr: mem_slot.host_addr, + flags: 0, + }; + unsafe { + self.fd.set_user_memory_region(kvm_region).chain_err(|| { + format!( + "KVM unregister memory region failed: addr {}", + aligned_addr.raw_value(), + ) + })?; + } + + Ok(()) + } + + /// Register a IoEvent to `/dev/kvm`. + /// + /// # Arguments + /// + /// * `ioevtfd` - IoEvent would be added. + /// + /// # Errors + /// + /// Return Error if the length of ioeventfd data is unexpected. 
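+    ///
+    /// # Example
+    ///
+    /// A sketch of a 4-byte, data-matched ioeventfd as this listener expects it;
+    /// `listener` stands for a constructed `KvmMemoryListener` and the field values
+    /// are illustrative (not compiled as a doc-test):
+    ///
+    /// ```rust,ignore
+    /// let evtfd = RegionIoEventFd {
+    ///     fd: EventFd::new(libc::EFD_NONBLOCK).unwrap(),
+    ///     addr_range: AddressRange::from((0x2000, 4)),
+    ///     data_match: true,
+    ///     data: 0x1,
+    /// };
+    /// listener.add_ioeventfd(&evtfd).unwrap();
+    /// ```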
+ fn add_ioeventfd(&self, ioevtfd: &RegionIoEventFd) -> Result<()> { + let io_addr = IoEventAddress::Mmio(ioevtfd.addr_range.base.raw_value()); + + let ioctl_ret = if ioevtfd.data_match { + let length = ioevtfd.addr_range.size; + match length { + 2 => self + .fd + .register_ioevent(&ioevtfd.fd, &io_addr, ioevtfd.data as u16), + 4 => self + .fd + .register_ioevent(&ioevtfd.fd, &io_addr, ioevtfd.data as u32), + 8 => self + .fd + .register_ioevent(&ioevtfd.fd, &io_addr, ioevtfd.data as u64), + _ => bail!("Unexpected ioeventfd data length"), + } + } else { + self.fd.register_ioevent(&ioevtfd.fd, &io_addr, NoDatamatch) + }; + + ioctl_ret.chain_err(|| { + format!( + "KVM register ioeventfd failed: mmio-addr {}", + ioevtfd.addr_range.base.raw_value() + ) + })?; + + Ok(()) + } + + /// Deletes `ioevtfd` from `/dev/kvm` + /// + /// # Arguments + /// + /// * `ioevtfd` - IoEvent would be deleted. + fn delete_ioeventfd(&self, ioevtfd: &RegionIoEventFd) -> Result<()> { + let io_addr = IoEventAddress::Mmio(ioevtfd.addr_range.base.raw_value()); + self.fd + .unregister_ioevent(&ioevtfd.fd, &io_addr) + .chain_err(|| { + format!( + "KVM unregister ioeventfd failed: mmio-addr {}", + ioevtfd.addr_range.base.raw_value() + ) + })?; + + Ok(()) + } +} + +impl Listener for KvmMemoryListener { + /// Get default priority. + fn priority(&self) -> i32 { + 10_i32 + } + + /// Deal with the request. + /// + /// # Arguments + /// + /// * `flat_range` - FlatRange would be used to find the region. + /// * `evtfd` - IoEvent of Region. + /// * `req_type` - Request type. + /// + /// # Errors + /// + /// Returns Error if + /// * No FlatRange in argument `flat_range`. + /// * No IoEventFd in argument `evtfd'. + fn handle_request( + &self, + flat_range: Option<&FlatRange>, + evtfd: Option<&RegionIoEventFd>, + req_type: ListenerReqType, + ) -> std::result::Result<(), crate::errors::Error> { + match req_type { + ListenerReqType::AddRegion => { + self.add_region(flat_range.chain_err(|| "No FlatRange")?)? + } + ListenerReqType::DeleteRegion => { + self.delete_region(flat_range.chain_err(|| "No FlatRange")?)? + } + ListenerReqType::AddIoeventfd => { + self.add_ioeventfd(evtfd.chain_err(|| "No IoEventFd")?)? + } + ListenerReqType::DeleteIoeventfd => { + self.delete_ioeventfd(evtfd.chain_err(|| "No IoEventFd")?)? + } + } + Ok(()) + } +} + +#[cfg(target_arch = "x86_64")] +pub struct KvmIoListener { + fd: Arc, +} + +#[cfg(target_arch = "x86_64")] +impl KvmIoListener { + /// Create a new KvmIoListener. + /// + /// # Arguments + /// + /// * `fd` - File descriptor of VM. + pub fn new(fd: Arc) -> KvmIoListener { + KvmIoListener { fd } + } + + /// Register a IoEvent to `/dev/kvm`. + /// + /// # Arguments + /// + /// * `ioevtfd` - IoEvent of Region. + /// + /// # Errors + /// + /// Return Error if the length of ioeventfd data is unexpected. 
+ fn add_ioeventfd(&self, ioevtfd: &RegionIoEventFd) -> Result<()> { + let io_addr = IoEventAddress::Pio(ioevtfd.addr_range.base.raw_value()); + + let ioctl_ret = if ioevtfd.data_match { + let length = ioevtfd.addr_range.size; + match length { + 2 => self + .fd + .register_ioevent(&ioevtfd.fd, &io_addr, ioevtfd.data as u16), + 4 => self + .fd + .register_ioevent(&ioevtfd.fd, &io_addr, ioevtfd.data as u32), + 8 => self + .fd + .register_ioevent(&ioevtfd.fd, &io_addr, ioevtfd.data as u64), + _ => bail!("unexpected ioeventfd data length"), + } + } else { + self.fd.register_ioevent(&ioevtfd.fd, &io_addr, NoDatamatch) + }; + + ioctl_ret.chain_err(|| { + format!( + "KVM register ioeventfd failed: mmio-addr {}", + ioevtfd.addr_range.base.raw_value() + ) + })?; + + Ok(()) + } + + /// Delete an IoEvent from `/dev/kvm`. + /// + /// # Arguments + /// + /// * `ioevtfd` - IoEvent of Region. + fn delete_ioeventfd(&self, ioevtfd: &RegionIoEventFd) -> Result<()> { + let io_addr = IoEventAddress::Pio(ioevtfd.addr_range.base.raw_value()); + self.fd + .unregister_ioevent(&ioevtfd.fd, &io_addr) + .chain_err(|| { + format!( + "KVM unregister ioeventfd failed: io-addr is {}", + ioevtfd.addr_range.base.raw_value() + ) + })?; + + Ok(()) + } +} + +/// Kvm io listener. +#[cfg(target_arch = "x86_64")] +impl Listener for KvmIoListener { + /// Get the default priority. + fn priority(&self) -> i32 { + 10_i32 + } + + /// Deal with the request. + /// + /// # Arguments + /// + /// * `_range` - FlatRange would be used to find the region. + /// * `evtfd` - IoEvent of Region. + /// * `req_type` - Request type. + fn handle_request( + &self, + _range: Option<&FlatRange>, + evtfd: Option<&RegionIoEventFd>, + req_type: ListenerReqType, + ) -> std::result::Result<(), crate::errors::Error> { + match req_type { + ListenerReqType::AddIoeventfd => { + self.add_ioeventfd(evtfd.chain_err(|| "No IoEventFd")?)? + } + ListenerReqType::DeleteIoeventfd => { + self.delete_ioeventfd(evtfd.chain_err(|| "No IoEventFd")?)? 
+ } + _ => {} + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use kvm_ioctls::Kvm; + use libc::EFD_NONBLOCK; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + use crate::{GuestAddress, HostMemMapping, Region, RegionIoEventFd}; + + fn generate_region_ioeventfd(addr: u64, datamatch: Option) -> RegionIoEventFd { + RegionIoEventFd { + fd: EventFd::new(EFD_NONBLOCK).unwrap(), + addr_range: AddressRange::from((addr, 4)), + data_match: datamatch.is_some(), + data: datamatch.unwrap_or(064), + } + } + + fn create_vm() -> VmFd { + let kvm = Kvm::new().expect("create kvm failed"); + kvm.create_vm().expect("create vm failed") + } + + fn create_ram_range(addr: u64, size: u64, offset_in_region: u64) -> FlatRange { + let mem_mapping = Arc::new(HostMemMapping::new(GuestAddress(addr), size, false).unwrap()); + FlatRange { + addr_range: AddressRange::new( + mem_mapping.start_address().unchecked_add(offset_in_region), + mem_mapping.size() - offset_in_region, + ), + owner: Region::init_ram_region(mem_mapping.clone()), + offset_in_region, + } + } + + #[test] + fn test_alloc_slot() { + let kml = KvmMemoryListener::new(34, Arc::new(create_vm())); + let host_addr = 0u64; + assert_eq!(kml.get_free_slot(0, 100, host_addr).unwrap(), 0); + assert_eq!(kml.get_free_slot(200, 100, host_addr).unwrap(), 1); + assert_eq!(kml.get_free_slot(300, 100, host_addr).unwrap(), 2); + assert_eq!(kml.get_free_slot(500, 100, host_addr).unwrap(), 3); + assert!(kml.get_free_slot(200, 100, host_addr).is_err()); + + kml.delete_slot(200, 100).unwrap(); + assert!(kml.delete_slot(150, 100).is_err()); + assert!(kml.delete_slot(700, 100).is_err()); + assert_eq!(kml.get_free_slot(200, 100, host_addr).unwrap(), 1); + } + + #[test] + fn test_add_del_ram_region() { + let vm = Arc::new(create_vm()); + let kml = KvmMemoryListener::new(34, vm.clone()); + + let ram_size = page_size(); + let ram_fr1 = create_ram_range(0, ram_size, 0); + kml.handle_request(Some(&ram_fr1), None, ListenerReqType::AddRegion) + .unwrap(); + //flat-range already added, adding again should make an error + assert!(kml + .handle_request(Some(&ram_fr1), None, ListenerReqType::AddRegion) + .is_err()); + assert!(kml + .handle_request(Some(&ram_fr1), None, ListenerReqType::DeleteRegion) + .is_ok()); + //flat-range already deleted, deleting again should make an error + assert!(kml + .handle_request(Some(&ram_fr1), None, ListenerReqType::DeleteRegion) + .is_err()); + } + + #[test] + fn test_add_region_align() { + let vm = Arc::new(create_vm()); + let kml = KvmMemoryListener::new(34, vm.clone()); + + // flat-range not aligned + let page_size = page_size(); + let ram_fr2 = create_ram_range(page_size, 2 * page_size, 1000); + assert!(kml + .handle_request(Some(&ram_fr2), None, ListenerReqType::AddRegion) + .is_ok()); + + // flat-range size is zero after aligned, this step should make an error + let ram_fr3 = create_ram_range(page_size, page_size, 1000); + assert!(kml + .handle_request(Some(&ram_fr3), None, ListenerReqType::AddRegion) + .is_err()); + } + + #[test] + fn test_add_del_ioeventfd() { + let vm = Arc::new(create_vm()); + let kml = KvmMemoryListener::new(34, vm.clone()); + + let evtfd = generate_region_ioeventfd(4, None); + assert!(kml + .handle_request(None, Some(&evtfd), ListenerReqType::AddIoeventfd) + .is_ok()); + // evtfd already added, adding again should make an error + assert!(kml + .handle_request(None, Some(&evtfd), ListenerReqType::AddIoeventfd) + .is_err()); + assert!(kml + .handle_request(None, Some(&evtfd), ListenerReqType::DeleteIoeventfd) + .is_ok()); + 
// evtfd already deleted, deleting again should make an error + assert!(kml + .handle_request(None, Some(&evtfd), ListenerReqType::DeleteIoeventfd) + .is_err()); + + // register an ioeventfd with data-match + let evtfd = generate_region_ioeventfd(64, Some(4u64)); + assert!(kml + .handle_request(None, Some(&evtfd), ListenerReqType::AddIoeventfd) + .is_ok()); + // deleting this ioeventfd returns an error, for the reason that + // function `unregister_ioevent` in kvm-ioctls package don't have an `data_match` argument + assert!(kml + .handle_request(None, Some(&evtfd), ListenerReqType::DeleteIoeventfd) + .is_err()); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_kvm_io_listener() { + let vm = Arc::new(create_vm()); + let iol = KvmIoListener::new(vm.clone()); + + let evtfd = generate_region_ioeventfd(4, None); + assert!(iol + .handle_request(None, Some(&evtfd), ListenerReqType::AddIoeventfd) + .is_ok()); + // evtfd already added, adding again should make an error + assert!(iol + .handle_request(None, Some(&evtfd), ListenerReqType::AddIoeventfd) + .is_err()); + assert!(iol + .handle_request(None, Some(&evtfd), ListenerReqType::DeleteIoeventfd) + .is_ok()); + // evtfd already deleted, deleting again should make an error + assert!(iol + .handle_request(None, Some(&evtfd), ListenerReqType::DeleteIoeventfd) + .is_err()); + } +} diff --git a/address_space/src/region.rs b/address_space/src/region.rs new file mode 100644 index 00000000..f18bf30d --- /dev/null +++ b/address_space/src/region.rs @@ -0,0 +1,942 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, RwLock, Weak}; + +use crate::errors::{ErrorKind, Result}; +use crate::{AddressRange, AddressSpace, GuestAddress, HostMemMapping, RegionOps}; + +/// Types of Region. +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum RegionType { + /// Ram type. + Ram, + /// IO type. + IO, + /// Container type. + Container, +} + +/// Represents a memory region, used by mem-mapped IO or Ram. +#[derive(Clone)] +pub struct Region { + /// Type of Region, won't be changed once initialized. + region_type: RegionType, + /// The priority of Region, only valid in parent Container-type Region. + priority: Arc, + /// Size of Region. + size: Arc, + /// Offset in parent Container-type region.It won't be changed once initialized. + offset: Arc>, + /// If not Ram-type Region, `mem_mapping` is None. It won't be changed once initialized. + mem_mapping: Option>, + /// `ops` provides read/write function. + ops: Option>>, + /// Weak pointer pointing to the father address-spaces. + space: Arc>>, + /// Sub-regions array, keep sorted + subregions: Arc>>, +} + +/// Used to trigger events. +/// If `data_match` is enabled, the `EventFd` is triggered iff `data` is written +/// to the specified address. +pub struct RegionIoEventFd { + /// EventFd to be triggered when guest writes to the address. 
+    pub fd: vmm_sys_util::eventfd::EventFd,
+    /// The address range this eventfd is registered on:
+    /// `base` is the guest address of the EventFd,
+    /// `size` can be 2, 4 or 8 bytes.
+    pub addr_range: AddressRange,
+    /// If data_match is enabled.
+    pub data_match: bool,
+    /// The specified value to trigger events.
+    pub data: u64,
+}
+
+impl RegionIoEventFd {
+    /// Calculate if this `RegionIoEventFd` is located before the given one.
+    ///
+    /// # Arguments
+    ///
+    /// * `other` - Other `RegionIoEventFd`.
+    pub fn before(&self, other: &RegionIoEventFd) -> bool {
+        if self.addr_range.base != other.addr_range.base {
+            return self.addr_range.base < other.addr_range.base;
+        }
+        if self.addr_range.size != other.addr_range.size {
+            return self.addr_range.size < other.addr_range.size;
+        }
+        if self.data_match != other.data_match {
+            return self.data_match && (!other.data_match);
+        }
+        if self.data != other.data {
+            return self.data < other.data;
+        }
+        false
+    }
+
+    /// Return a cloned IoEvent,
+    /// or an error if the EventFd could not be cloned.
+    pub fn try_clone(&self) -> Result<RegionIoEventFd> {
+        let fd = self.fd.try_clone().or(Err(ErrorKind::IoEventFd))?;
+        Ok(RegionIoEventFd {
+            fd,
+            addr_range: self.addr_range,
+            data_match: self.data_match,
+            data: self.data,
+        })
+    }
+}
+
+/// FlatRange is a contiguous range of guest memory addresses.
+#[derive(Clone, PartialEq, Eq)]
+pub struct FlatRange {
+    /// The address range.
+    pub addr_range: AddressRange,
+    /// The owner of this flat-range.
+    pub owner: Region,
+    /// The offset within Region.
+    pub offset_in_region: u64,
+}
+
+/// Contains a set of `FlatRange`.
+/// Note that the flat ranges are kept sorted by address.
+#[derive(Default, Clone)]
+pub struct FlatView(pub Vec<FlatRange>);
+
+/// Implement PartialEq/Eq for comparison of Region.
+impl PartialEq for Region {
+    fn eq(&self, other: &Region) -> bool {
+        self.priority() == other.priority()
+            && self.region_type() == other.region_type()
+            && self.offset() == other.offset()
+            && self.size() == other.size()
+    }
+}
+
+impl Eq for Region {}
+
+impl Region {
+    /// The core function of initialization.
+    ///
+    /// # Arguments
+    ///
+    /// * `size` - Size of `Region`.
+    /// * `region_type` - Type of `Region`.
+    /// * `mem_mapping` - Mapped memory.
+    /// * `ops` - Region operations.
+    fn init_region_internal(
+        size: u64,
+        region_type: RegionType,
+        mem_mapping: Option<Arc<HostMemMapping>>,
+        ops: Option<Arc<Mutex<dyn RegionOps>>>,
+    ) -> Region {
+        Region {
+            region_type,
+            priority: Arc::new(AtomicI32::new(0)),
+            offset: Arc::new(Mutex::new(GuestAddress(0))),
+            size: Arc::new(AtomicU64::new(size)),
+            mem_mapping,
+            ops,
+            space: Arc::new(RwLock::new(Weak::new())),
+            subregions: Arc::new(RwLock::new(Vec::new())),
+        }
+    }
+
+    /// Initialize Ram-type region.
+    ///
+    /// # Arguments
+    ///
+    /// * `mem_mapping` - Mapped memory of this Ram region.
+    pub fn init_ram_region(mem_mapping: Arc<HostMemMapping>) -> Region {
+        Region::init_region_internal(mem_mapping.size(), RegionType::Ram, Some(mem_mapping), None)
+    }
+
+    /// Initialize IO-type region.
+    ///
+    /// # Arguments
+    ///
+    /// * `size` - Size of IO region.
+    /// * `dev` - Operations of the backing device.
+    pub fn init_io_region(size: u64, dev: Arc<Mutex<dyn RegionOps>>) -> Region {
+        Region::init_region_internal(size, RegionType::IO, None, Some(dev))
+    }
+
+    /// Initialize Container-type region.
+    ///
+    /// # Arguments
+    ///
+    /// * `size` - Size of container region.
+    pub fn init_container_region(size: u64) -> Region {
+        Region::init_region_internal(size, RegionType::Container, None, None)
+    }
+
+    /// Get the type of this region.
+    pub fn region_type(&self) -> RegionType {
+        self.region_type
+    }
+
+    /// Get the priority of this region.
+    pub fn priority(&self) -> i32 {
+        self.priority.load(Ordering::SeqCst)
+    }
+
+    /// Set the priority of this region.
+    ///
+    /// # Arguments
+    ///
+    /// * `prior` - Priority of region.
+    pub fn set_priority(&self, prior: i32) {
+        self.priority.store(prior, Ordering::SeqCst);
+    }
+
+    /// Get size of this region.
+    pub fn size(&self) -> u64 {
+        self.size.load(Ordering::SeqCst)
+    }
+
+    /// Get the offset of this region.
+    /// The offset is within its parent region or the address space it belongs to.
+    pub fn offset(&self) -> GuestAddress {
+        *self.offset.lock().unwrap()
+    }
+
+    /// Set the offset of region,
+    /// this function is only used when this region is added to its parent region.
+    ///
+    /// # Arguments
+    ///
+    /// * `offset` - Offset in parent region.
+    pub fn set_offset(&self, offset: GuestAddress) {
+        self.offset.lock().unwrap().0 = offset.raw_value();
+    }
+
+    /// Get the host address if this region is backed by host memory,
+    /// return `None` if it is not a Ram-type region.
+    pub fn get_host_address(&self) -> Option<u64> {
+        if self.region_type != RegionType::Ram {
+            return None;
+        }
+        self.mem_mapping.as_ref().map(|r| r.host_address())
+    }
+
+    /// Return all sub-regions of this Region. The returned vector is non-empty
+    /// only if this region is a container.
+    pub(crate) fn subregions(&self) -> Vec<Region> {
+        self.subregions.read().unwrap().clone()
+    }
+
+    /// Set `AddressSpace` for `region`,
+    /// this function is called when this region is added to its parent region or
+    /// to the address space it belongs to.
+    ///
+    /// # Arguments
+    ///
+    /// * `space` - The AddressSpace that the region belongs to.
+    pub(crate) fn set_belonged_address_space(&self, space: &Arc<AddressSpace>) {
+        *self.space.write().unwrap() = Arc::downgrade(&space);
+    }
+
+    /// Release the address space this region belongs to,
+    /// this function is called when this region is removed from its parent region or
+    /// from the address space it belongs to.
+    pub(crate) fn del_belonged_address_space(&self) {
+        *self.space.write().unwrap() = Weak::new();
+    }
+
+    /// Check whether the end address (`addr + size`) overflows or exceeds the end of this region.
+    ///
+    /// # Arguments
+    ///
+    /// * `addr` - Start address.
+    /// * `size` - Size of memory segment.
+    ///
+    /// # Errors
+    ///
+    /// Return Error if the address overflows.
+    fn check_valid_offset(&self, addr: u64, size: u64) -> Result<()> {
+        if addr
+            .checked_add(size)
+            .filter(|end| *end <= self.size())
+            .is_none()
+        {
+            return Err(ErrorKind::Overflow(addr).into());
+        }
+        Ok(())
+    }
+
+    /// Read memory segment to `dst`.
+    ///
+    /// # Arguments
+    ///
+    /// * `dst` - Destination the data would be written to.
+    /// * `base` - Base address.
+    /// * `offset` - Offset from base address.
+    /// * `count` - Size of data.
+    ///
+    /// # Errors
+    ///
+    /// Return Error if
+    /// * fail to access io region.
+    /// * the region is a container.
+    /// * the address overflows.
+ pub fn read( + &self, + dst: &mut dyn std::io::Write, + base: GuestAddress, + offset: u64, + count: u64, + ) -> Result<()> { + self.check_valid_offset(offset, count)?; + + match self.region_type { + RegionType::Ram => { + let host_addr = self.mem_mapping.as_ref().unwrap().host_address(); + let slice = unsafe { + std::slice::from_raw_parts((host_addr + offset) as *const u8, count as usize) + }; + dst.write_all(slice)?; + } + RegionType::IO => { + if count >= std::usize::MAX as u64 { + return Err(ErrorKind::Overflow(count).into()); + } + let mut slice = vec![0_u8; count as usize]; + if !self + .ops + .as_ref() + .unwrap() + .lock() + .unwrap() + .read(&mut slice, base, offset) + { + return Err(ErrorKind::IoAccess(offset).into()); + } + dst.write_all(&slice)?; + } + _ => { + return Err(ErrorKind::RegionType(self.region_type()).into()); + } + } + Ok(()) + } + + /// Write data segment from `src` to memory. + /// + /// # Arguments + /// + /// * `src` - Source data. + /// * `base` - Base address. + /// * `offset` - Offset from base address. + /// * `count` - Size of data. + /// + /// # Errors + /// + /// Return Error if + /// * fail to access io region. + /// * the region is a container. + /// * the address overflows. + pub fn write( + &self, + src: &mut dyn std::io::Read, + base: GuestAddress, + offset: u64, + count: u64, + ) -> Result<()> { + self.check_valid_offset(offset, count)?; + + match self.region_type { + RegionType::Ram => { + let host_addr = self.mem_mapping.as_ref().unwrap().host_address(); + let slice = unsafe { + std::slice::from_raw_parts_mut((host_addr + offset) as *mut u8, count as usize) + }; + src.read_exact(slice)?; + } + RegionType::IO => { + if count >= std::usize::MAX as u64 { + return Err(ErrorKind::Overflow(count).into()); + } + let mut slice = vec![0_u8; count as usize]; + src.read_exact(&mut slice)?; + + if !self + .ops + .as_ref() + .unwrap() + .lock() + .unwrap() + .write(&slice, base, offset) + { + return Err(ErrorKind::IoAccess(offset).into()); + } + } + _ => { + return Err(ErrorKind::RegionType(self.region_type()).into()); + } + } + Ok(()) + } + + /// Return the IoEvent of a `Region`. + pub(crate) fn ioeventfds(&self) -> Vec { + match self.region_type { + RegionType::IO => { + let ioeventfds = self.ops.as_ref().unwrap().lock().unwrap().ioeventfds(); + ioeventfds + .iter() + .map(|e| { + let mut evt_cloned = e.try_clone().unwrap(); + evt_cloned.addr_range.base.0 += self.offset().raw_value(); + evt_cloned + }) + .collect() + } + _ => Vec::new(), + } + } + + /// Add sub-region to this region. + /// + /// # Arguments + /// + /// * `child` - Subregion of this region. + /// * `offset` - Offset of subregion. + /// + /// # Errors + /// + /// Return Error if + /// * This region is not a Container. + /// * The argument `offset` plus child region's size overflows or exceed this region's size. + /// * The child-region already exists in sub-regions array. + /// * Failed to generate flat view (topology changed after adding sub-region). 
+ pub fn add_subregion(&self, child: Region, offset: u64) -> Result<()> { + // check parent Region's property, and check if child Region's offset is valid or not + if self.region_type() != RegionType::Container { + return Err(ErrorKind::RegionType(self.region_type()).into()); + } + self.check_valid_offset(offset, child.size())?; + + // set child region's offset and father address-space + child.set_offset(GuestAddress(offset)); + if let Some(space) = self.space.read().unwrap().upgrade() { + child.set_belonged_address_space(&space) + } + + // insert to `subregion` array and update topology of father address-space + let mut sub_regions = self.subregions.write().unwrap(); + let mut index = 0_usize; + while index < sub_regions.len() { + if child.priority() >= sub_regions.get(index).unwrap().priority() { + break; + } + index += 1; + } + sub_regions.insert(index, child); + drop(sub_regions); + + if let Some(space) = self.space.read().unwrap().upgrade() { + space.update_topology()?; + } else { + debug!("add subregion to container region, which has no belonged address-space"); + } + + Ok(()) + } + + /// Delete sub-region of this region. + /// + /// # Arguments + /// + /// * `child` - Subregion of this region. + /// + /// # Errors + /// + /// Return Error if + /// * The child-region does not exist in sub-regions array. + /// * Failed to generate flat view (topology changed after removing sub-region). + pub fn delete_subregion(&self, child: &Region) -> Result<()> { + let mut sub_regions = self.subregions.write().unwrap(); + let mut removed = false; + for (index, sub_r) in sub_regions.iter().enumerate() { + if child == sub_r { + sub_regions.remove(index); + removed = true; + break; + } + } + drop(sub_regions); + + if !removed { + bail!("Delete subregion failed: no matched region"); + } + + // get father address-space and update topology + if let Some(space) = self.space.read().unwrap().upgrade() { + space.update_topology()?; + } else { + debug!("add subregion to container region, which has no belonged address-space"); + } + child.del_belonged_address_space(); + + Ok(()) + } + + /// Recursive function to render region, terminate if this region is not a container. + /// + /// # Arguments + /// + /// * `base` - Base address of a Region. + /// * `addr_range` - Address Range. + /// * `flat_view` - FlatView of a Region. + /// + /// # Errors + /// + /// Return Error if the input address range `addr_range` has no intersection with this region. + fn render_region_pass( + &self, + base: GuestAddress, + addr_range: AddressRange, + flat_view: &mut FlatView, + ) -> Result<()> { + match self.region_type { + RegionType::Container => { + let region_base = base.unchecked_add(self.offset().raw_value()); + let region_range = AddressRange::new(region_base, self.size()); + let intersect = match region_range.find_intersection(addr_range) { + Some(r) => r, + None => bail!( + "Generate flat view failed: region_addr {} exceeds", + region_base.raw_value() + ), + }; + + for sub_r in self.subregions.read().unwrap().iter() { + sub_r.render_region_pass(region_base, intersect, flat_view)?; + } + } + RegionType::Ram | RegionType::IO => { + self.render_terminate_region(base, addr_range, flat_view)?; + } + } + Ok(()) + } + + /// Render terminate region. + /// + /// # Arguments + /// + /// * `base` - Base address of a Region. + /// * `addr_range` - Address Range. + /// * `flat_view` - FlatView of a Region. + /// + /// # Errors + /// + /// Return Error if the input address range `addr_range` has no intersection with this region. 
+ fn render_terminate_region( + &self, + base: GuestAddress, + addr_range: AddressRange, + flat_view: &mut FlatView, + ) -> Result<()> { + let region_range = + AddressRange::new(base.unchecked_add(self.offset().raw_value()), self.size()); + let intersect = match region_range.find_intersection(addr_range) { + Some(r) => r, + None => bail!( + "Gen flatview failed: region_addr {} exceeds", + region_range.base.raw_value() + ), + }; + + let mut offset_in_region = intersect.base.offset_from(region_range.base); + let mut start = intersect.base; + let mut remain = intersect.size; + + let mut index = 0_usize; + while index < flat_view.0.len() { + let fr = &flat_view.0[index]; + let fr_end = fr.addr_range.end_addr(); + if start >= fr.addr_range.end_addr() { + index += 1; + continue; + } + + if start < fr.addr_range.base { + let range_size = std::cmp::min(remain, fr.addr_range.base.offset_from(start)); + + flat_view.0.insert( + index, + FlatRange { + addr_range: AddressRange { + base: start, + size: range_size, + }, + owner: self.clone(), + offset_in_region, + }, + ); + index += 1; + } + let step = std::cmp::min(fr_end.offset_from(start), remain); + start = start.unchecked_add(step); + offset_in_region += step; + remain -= step; + if remain == 0 { + break; + } + index += 1; + } + + if remain > 0 { + flat_view.0.insert( + index, + FlatRange { + addr_range: AddressRange::new(start, remain), + owner: self.clone(), + offset_in_region, + }, + ); + } + + Ok(()) + } + + /// Create corresponding `FlatView` for the `Region`. + /// Return the `FlatView`. + /// + /// # Arguments + /// + /// * `base` - Base address. + /// * `addr_range` - Address range. + pub fn generate_flatview( + &self, + base: GuestAddress, + addr_range: AddressRange, + ) -> Result { + let mut flat_view = FlatView::default(); + match self.region_type { + RegionType::Container => self.render_region_pass(base, addr_range, &mut flat_view)?, + RegionType::Ram | RegionType::IO => { + self.render_terminate_region(base, addr_range, &mut flat_view)? 
+ } + } + Ok(flat_view) + } +} + +#[cfg(test)] +mod test { + use std::io::{Read, Seek, SeekFrom}; + + use libc::EFD_NONBLOCK; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + + #[derive(Default)] + struct TestDevice { + head: u64, + } + + impl RegionOps for TestDevice { + fn read(&mut self, data: &mut [u8], _base: GuestAddress, _offset: u64) -> bool { + if data.len() != std::mem::size_of::() { + return false; + } + + for i in 0..std::mem::size_of::() { + data[i] = (self.head >> (8 * i)) as u8; + } + true + } + + fn write(&mut self, data: &[u8], _addr: GuestAddress, _offset: u64) -> bool { + if data.len() != std::mem::size_of::() { + return false; + } + self.head = match unsafe { data.align_to::() } { + (_, [m], _) => *m, + _ => return false, + }; + true + } + } + + #[test] + fn test_ram_region() { + let mem_mapping = Arc::new(HostMemMapping::new(GuestAddress(0), 1024u64, false).unwrap()); + let ram_region = Region::init_ram_region(mem_mapping.clone()); + let data: [u8; 10] = [10; 10]; + let mut res_data: [u8; 10] = [0; 10]; + let count = data.len() as u64; + + assert_eq!(ram_region.region_type(), RegionType::Ram); + + assert_eq!(ram_region.offset(), GuestAddress(0u64)); + ram_region.set_offset(GuestAddress(0x11u64)); + assert_eq!(ram_region.offset(), GuestAddress(0x11u64)); + + //test read/write + assert!(ram_region + .write(&mut data.as_ref(), GuestAddress(0), 0, count) + .is_ok()); + assert!(ram_region + .read(&mut res_data.as_mut(), GuestAddress(0), 0, count) + .is_ok()); + assert_eq!(&data, &mut res_data); + + assert_eq!( + ram_region.get_host_address().unwrap(), + mem_mapping.host_address() + ); + + assert!(ram_region.check_valid_offset(0, 1000).is_ok()); + assert!(ram_region.check_valid_offset(100, 1000).is_err()); + } + + #[test] + fn test_ram_region_access() { + // the target guest address is 0~1024 (1024 not included) + let rgn_start = GuestAddress(0); + let host_mmap = HostMemMapping::new(GuestAddress(0), 1024u64, false).unwrap(); + let ram_region = Region::init_ram_region(Arc::new(host_mmap)); + let mut file = std::fs::File::create("/tmp/test_read_write_buffer.tmp").unwrap(); + let mut file_read = std::fs::File::open("/tmp/test_read_write_buffer.tmp").unwrap(); + let slice: [u8; 24] = [91; 24]; + let mut res_slice: [u8; 24] = [0; 24]; + let mut res_slice2: [u8; 24] = [0; 24]; + + // write 91 to 1000~1024 (1024 not included) + ram_region + .write(&mut slice.as_ref(), rgn_start, 1000, slice.len() as u64) + .unwrap(); + + // read the ram to the file, then check the file's content + assert!(ram_region.read(&mut file, rgn_start, 1000, 24).is_ok()); + assert!(file_read.read(&mut res_slice).is_ok()); + assert_eq!(&slice, &mut res_slice); + + // write the file content to 0~24 (24 not included) + // then ckeck the ram's content + file_read.seek(SeekFrom::Start(0)).unwrap(); + assert!(ram_region.write(&mut file_read, rgn_start, 0, 24).is_ok()); + ram_region + .read(&mut res_slice2.as_mut(), rgn_start, 0, 24) + .unwrap(); + assert_eq!(&slice, &mut res_slice2); + + std::fs::remove_file("/tmp/test_read_write_buffer.tmp").unwrap(); + } + + #[test] + fn test_io_region() { + let config_space = Arc::new(Mutex::new(TestDevice::default())); + let io_region = Region::init_io_region(16, config_space.clone()); + let data = [0x01u8; 8]; + let mut data_res = [0x0u8; 8]; + let count = data.len() as u64; + + assert_eq!(io_region.region_type(), RegionType::IO); + + // test read/write + assert!(io_region + .write(&mut data.as_ref(), GuestAddress(0), 0, count) + .is_ok()); + assert!(io_region + 
.read(&mut data_res.as_mut(), GuestAddress(0), 0, count) + .is_ok()); + assert_eq!(data.to_vec(), data_res.to_vec()); + + assert!(io_region.get_host_address().is_none()); + } + + #[test] + fn test_region_ioeventfd() { + let mut fd1 = RegionIoEventFd { + fd: EventFd::new(EFD_NONBLOCK).unwrap(), + addr_range: AddressRange::from((1000, 4u64)), + data_match: false, + data: 0, + }; + // compare length + let mut fd2 = fd1.try_clone().unwrap(); + fd2.addr_range.size = 8; + assert!(fd1.before(&fd2)); + + // compare address + fd2.addr_range.base.0 = 1024; + fd2.addr_range.size = 4; + assert!(fd1.before(&fd2)); + + // compare datamatch + fd2.addr_range = fd1.addr_range; + fd2.data_match = true; + assert_eq!(fd1.before(&fd2), false); + + // if datamatch, compare data + fd1.data_match = true; + fd2.data = 10u64; + assert!(fd1.before(&fd2)); + } + + // test add/del sub-region to container-region, and check priority + #[test] + fn test_add_del_subregion() { + let container = Region::init_container_region(1 << 10); + assert_eq!(container.region_type(), RegionType::Container); + assert_eq!(container.priority(), 0); + + // create two io region as container's sub regions + let dev = Arc::new(Mutex::new(TestDevice::default())); + let dev2 = Arc::new(Mutex::new(TestDevice::default())); + let io_region = Region::init_io_region(1 << 4, dev.clone()); + let io_region2 = Region::init_io_region(1 << 4, dev2.clone()); + io_region2.set_priority(10); + + // add duplicate io-region or ram-region will fail + assert!(container.add_subregion(io_region.clone(), 0u64).is_ok()); + assert!(container.add_subregion(io_region2.clone(), 20u64).is_ok()); + + // sub_regions are stored in descending order of priority + assert_eq!(container.subregions.read().unwrap().len(), 2); + assert_eq!( + container + .subregions + .read() + .unwrap() + .get(1) + .unwrap() + .priority(), + 0 + ); + assert_eq!( + container + .subregions + .read() + .unwrap() + .get(0) + .unwrap() + .priority(), + 10 + ); + + assert!(container.delete_subregion(&io_region).is_ok()); + assert!(container.delete_subregion(&io_region2).is_ok()); + assert!(container.delete_subregion(&io_region2).is_err()); + + assert_eq!(container.subregions.read().unwrap().len(), 0); + } + + #[test] + fn test_generate_flatview() { + let config_c = Arc::new(Mutex::new(TestDevice::default())); + let config_d = Arc::new(Mutex::new(TestDevice::default())); + let config_e = Arc::new(Mutex::new(TestDevice::default())); + + // memory region layout + // 0 1000 2000 3000 4000 5000 6000 7000 8000 + // |------|------|------|------|------|------|------|------| + // A: [ ] + // C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + // B: [ ] + // D: [DDDDD] + // E: [EEEEE] + // + // the flat_view is as follows + // [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC] + { + let region_a = Region::init_container_region(8000); + let region_b = Region::init_container_region(4000); + let region_c = Region::init_io_region(6000, config_c.clone()); + let region_d = Region::init_io_region(1000, config_d.clone()); + let region_e = Region::init_io_region(1000, config_e.clone()); + + region_b.set_priority(2); + region_c.set_priority(1); + region_a.add_subregion(region_b.clone(), 2000).unwrap(); + region_a.add_subregion(region_c.clone(), 0).unwrap(); + region_b.add_subregion(region_d.clone(), 0).unwrap(); + region_b.add_subregion(region_e.clone(), 2000).unwrap(); + + let addr_range = AddressRange::from((0u64, region_a.size())); + let view = region_a + .generate_flatview(GuestAddress(0), addr_range) + .unwrap(); + + for fr in 
view.0.iter() { + println!( + "\nrange: addr is {:#x}, size is {:#x}", + fr.addr_range.base.raw_value(), + fr.addr_range.size + ); + println!("offset is {:#x}", fr.offset_in_region); + println!("region type is {:#?}", fr.owner.region_type()); + println!( + "size is {:#x}, priority = {:#?}", + fr.owner.size(), + fr.owner.priority(), + ); + } + assert_eq!(view.0.len(), 5); + } + + // memory region layout + // 0 1000 2000 3000 4000 5000 6000 7000 8000 + // |------|------|------|------|------|------|------|------| + // A: [ ] + // C: [CCCCCC] 1 + // B: [ ] 1 + // D: [DDDDDDDDDDDDDDDDDDDD] 2 + // E: [EEEEEEEEEEEEE] 3 + // + // the flat_view is as follows + // [CCCCCC] [DDDDDDDDDDDD][EEEEEEEEEEEEE] + { + let region_a = Region::init_container_region(8000); + let region_b = Region::init_container_region(5000); + let region_c = Region::init_io_region(1000, config_c.clone()); + let region_d = Region::init_io_region(3000, config_d.clone()); + let region_e = Region::init_io_region(2000, config_e.clone()); + + region_a.add_subregion(region_b.clone(), 2000).unwrap(); + region_a.add_subregion(region_c.clone(), 0).unwrap(); + region_d.set_priority(2); + region_e.set_priority(3); + region_b.add_subregion(region_d.clone(), 0).unwrap(); + region_b.add_subregion(region_e.clone(), 2000).unwrap(); + + let addr_range = AddressRange::from((0u64, region_a.size())); + let view = region_a + .generate_flatview(GuestAddress(0), addr_range) + .unwrap(); + + for fr in view.0.iter() { + println!( + "\nrange: addr is {}, size is {}", + fr.addr_range.base.raw_value(), + fr.addr_range.size + ); + println!("offset is {}", fr.offset_in_region); + println!("region type is {:#?}", fr.owner.region_type()); + println!( + "size is {}, priority = {}", + fr.owner.size(), + fr.owner.priority(), + ); + } + assert_eq!(view.0.len(), 3); + } + } +} diff --git a/boot_loader/Cargo.toml b/boot_loader/Cargo.toml new file mode 100644 index 00000000..cb515610 --- /dev/null +++ b/boot_loader/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "boot_loader" +version = "0.1.0" +authors = ["Huawei StratoVirt Team"] +edition = "2018" +license = "Mulan PSL v2" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +address_space = { path = "../address_space" } +util = { path = "../util" } + +kvm-bindings = "0.2.0" +kvm-ioctls = "0.5.0" +vmm-sys-util = "0.6.1" + +libc = "0.2.71" +log = "0.4.8" +error-chain = "0.12.4" diff --git a/boot_loader/src/aarch64/mod.rs b/boot_loader/src/aarch64/mod.rs new file mode 100644 index 00000000..1c6a94ec --- /dev/null +++ b/boot_loader/src/aarch64/mod.rs @@ -0,0 +1,116 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::path::PathBuf; +use std::sync::Arc; + +use self::errors::{ErrorKind, Result}; +use address_space::{AddressSpace, GuestAddress}; +use util::device_tree; + +pub mod errors { + use util::device_tree; + error_chain! 
+    {
+        links {
+            AddressSpace(address_space::errors::Error, address_space::errors::ErrorKind);
+        }
+        errors {
+            DTBOverflow(size: u64) {
+                display(
+                    "guest memory size {} should be bigger than {}",
+                    size,
+                    device_tree::FDT_MAX_SIZE
+                )
+            }
+            InitrdOverflow(addr: u64, size: u32) {
+                display(
+                    "Failed to allocate initrd image of size {} at guest address {}.",
+                    size,
+                    addr
+                )
+            }
+        }
+    }
+}
+
+const DRAM_MEM_START: u64 = 0x8000_0000;
+const AARCH64_KERNEL_OFFSET: u64 = 0x8_0000;
+
+/// Boot loader config used for aarch64.
+#[derive(Default, Debug)]
+pub struct AArch64BootLoaderConfig {
+    /// Path of kernel image.
+    pub kernel: PathBuf,
+    /// Path of initrd image.
+    pub initrd: Option<PathBuf>,
+    /// Initrd file size, 0 means no initrd file.
+    pub initrd_size: u32,
+}
+
+/// The start address for `kernel image`, `initrd image` and `dtb` in guest memory.
+pub struct AArch64BootLoader {
+    /// Start address for `kernel image` in guest memory.
+    pub kernel_start: u64,
+    /// Start address for `initrd image` in guest memory.
+    pub initrd_start: u64,
+    /// Start address for `dtb` in guest memory.
+    pub dtb_start: u64,
+}
+
+pub fn linux_bootloader(
+    config: &AArch64BootLoaderConfig,
+    sys_mem: &Arc<AddressSpace>,
+) -> Result<AArch64BootLoader> {
+    let dtb_addr =
+        if sys_mem.memory_end_address().raw_value() > u64::from(device_tree::FDT_MAX_SIZE) {
+            if let Some(addr) = sys_mem
+                .memory_end_address()
+                .raw_value()
+                .checked_sub(u64::from(device_tree::FDT_MAX_SIZE))
+            {
+                if sys_mem.address_in_memory(GuestAddress(addr), 0) {
+                    addr
+                } else {
+                    DRAM_MEM_START
+                }
+            } else {
+                0
+            }
+        } else {
+            0
+        };
+
+    if dtb_addr == 0 {
+        return Err(ErrorKind::DTBOverflow(sys_mem.memory_end_address().raw_value()).into());
+    }
+
+    let mut initrd_addr = 0;
+    if config.initrd_size > 0 {
+        initrd_addr = if let Some(addr) = dtb_addr.checked_sub(u64::from(config.initrd_size)) {
+            addr
+        } else {
+            return Err(ErrorKind::InitrdOverflow(dtb_addr, config.initrd_size).into());
+        };
+
+        if !sys_mem.address_in_memory(GuestAddress(initrd_addr), 0) {
+            initrd_addr = DRAM_MEM_START + u64::from(device_tree::FDT_MAX_SIZE);
+        }
+    } else {
+        info!("No initrd image file.");
+    }
+
+    Ok(AArch64BootLoader {
+        kernel_start: DRAM_MEM_START + AARCH64_KERNEL_OFFSET,
+        initrd_start: initrd_addr,
+        dtb_start: dtb_addr,
+    })
+}
diff --git a/boot_loader/src/lib.rs b/boot_loader/src/lib.rs
new file mode 100644
index 00000000..32075864
--- /dev/null
+++ b/boot_loader/src/lib.rs
@@ -0,0 +1,181 @@
+// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved.
+//
+// StratoVirt is licensed under Mulan PSL v2.
+// You can use this software according to the terms and conditions of the Mulan
+// PSL v2.
+// You may obtain a copy of Mulan PSL v2 at:
+// http://license.coscl.org.cn/MulanPSL2
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+// See the Mulan PSL v2 for more details.
+
+//! # Boot Loader
+//!
+//! The crate to initialize memory state during VM boot.
+//!
+//! ## Design
+//!
+//! This crate offers support for:
+//! 1. Loading PE (vmlinux.bin) kernel images.
+//! 2. Loading initrd image.
+//! 3. Initializing architecture-related information.
+//!
+//! ## Platform Support
+//!
+//! - `x86_64`
+//! - `aarch64`
+//!
+//! ## Examples
+//!
+//! This example shows how to load a PE linux kernel with the linux boot
+//! protocol before VM start, on both x86_64 and aarch64.
+//!
+//! ```no_run
+//!
# extern crate address_space; +//! # extern crate boot_loader; +//! +//! use address_space::{AddressSpace, Region}; +//! use boot_loader::{BootLoaderConfig, load_kernel}; +//! +//! #[cfg(target_arch="x86_64")] +//! fn main() { +//! let guest_mem = AddressSpace::new(Region::init_container_region(std::u64::MAX)).unwrap(); +//! let kernel_file = std::path::PathBuf::from("/path/to/my/kernel"); +//! let bootloader_config = BootLoaderConfig { +//! kernel: kernel_file, +//! initrd: None, +//! initrd_size: 0, +//! kernel_cmdline: String::new(), +//! cpu_count: 0, +//! }; +//! +//! let layout = load_kernel(&bootloader_config, &guest_mem).unwrap(); +//! // Now PE linux kernel and kernel cmdline are loaded to guest memory... +//! } +//! +//! #[cfg(target_arch="aarch64")] +//! fn main() { +//! let guest_mem = AddressSpace::new(Region::init_container_region(u64::MAX)).unwrap(); +//! let kernel_file = std::path::PathBuf::from("/path/to/my/kernel"); +//! let bootloader_config = BootLoaderConfig { +//! kernel: kernel_file, +//! initrd: None, +//! initrd_size: 0, +//! }; +//! +//! let layout = load_kernel(&bootloader_config, &guest_mem).unwrap(); +//! // Now PE linux kernel is loaded to guest memory... +//! } +//! ``` + +#[macro_use] +extern crate log; +#[macro_use] +extern crate error_chain; + +#[cfg(target_arch = "aarch64")] +mod aarch64; +#[cfg(target_arch = "x86_64")] +mod x86_64; + +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; + +use address_space::{AddressSpace, GuestAddress}; + +#[cfg(target_arch = "aarch64")] +use aarch64::linux_bootloader; +#[cfg(target_arch = "aarch64")] +pub use aarch64::AArch64BootLoader as BootLoader; +#[cfg(target_arch = "aarch64")] +pub use aarch64::AArch64BootLoaderConfig as BootLoaderConfig; + +#[cfg(target_arch = "x86_64")] +use x86_64::linux_bootloader; +#[cfg(target_arch = "x86_64")] +pub use x86_64::X86BootLoader as BootLoader; +#[cfg(target_arch = "x86_64")] +pub use x86_64::X86BootLoaderConfig as BootLoaderConfig; + +pub mod errors { + #[cfg(target_arch = "aarch64")] + use super::aarch64 as arch; + #[cfg(target_arch = "x86_64")] + use super::x86_64 as arch; + + error_chain! { + links { + ArchErrors(arch::errors::Error, arch::errors::ErrorKind); + AddressSpace(address_space::errors::Error, address_space::errors::ErrorKind); + } + errors { + BootLoaderOpenKernel { + description("Boot loader open kernel error") + display("Failed to open kernel image or initrd") + } + } + } +} + +use self::errors::{ErrorKind, Result}; + +/// Load PE(vmlinux.bin) linux kernel to Guest Memory. +/// +/// # Arguments +/// * `kernel_file` - host path for kernel. +/// * `kernel_start` - kernel start address in guest memory. +/// * `sys_mem` - guest memory. +/// +/// # Errors +/// * `BootLoaderOpenKernel`: Open PE linux kernel failed. +/// * `AddressSpace`: Write PE linux kernel to guest memory failed. +fn load_image(kernel_file: &PathBuf, kernel_start: u64, sys_mem: &Arc) -> Result<()> { + debug!("Loading image {:?}", kernel_file); + let len = std::fs::metadata(kernel_file).unwrap().len(); + let mut kernel_image = match fs::File::open(kernel_file) { + Ok(file) => file, + _ => return Err(ErrorKind::BootLoaderOpenKernel.into()), + }; + + sys_mem.write(&mut kernel_image, GuestAddress(kernel_start), len)?; + + Ok(()) +} + +/// Load PE(vmlinux.bin) linux kernel and other boot source to Guest Memory. +/// +/// # Steps +/// +/// 1. Prepare for linux kernel boot env, return guest memory layout. +/// 2. According guest memory layout, load PE linux kernel to guest memory. +/// 3. 
According guest memory layout, load initrd image to guest memory. +/// 4. For `x86_64` arch, inject cmdline to guest memory. +/// +/// # Arguments +/// +/// * `config` - boot source config, contains kernel, initrd and kernel +/// cmdline(only `x86_64`). +/// * `sys_mem` - guest memory. +/// +/// # Errors +/// +/// Load kernel, initrd or kernel cmdline to guest memory failed. Boot source +/// is broken or guest memory is unnormal. +pub fn load_kernel(config: &BootLoaderConfig, sys_mem: &Arc) -> Result { + let boot_loader = linux_bootloader(config, sys_mem)?; + + load_image(&config.kernel, boot_loader.kernel_start, &sys_mem)?; + match &config.initrd { + Some(initrd) => { + load_image(&initrd, boot_loader.initrd_start, &sys_mem)?; + } + None => {} + }; + + #[cfg(target_arch = "x86_64")] + x86_64::setup_kernel_cmdline(&config, sys_mem)?; + + Ok(boot_loader) +} diff --git a/boot_loader/src/x86_64/bootparam.rs b/boot_loader/src/x86_64/bootparam.rs new file mode 100644 index 00000000..16914e9d --- /dev/null +++ b/boot_loader/src/x86_64/bootparam.rs @@ -0,0 +1,199 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use util::byte_code::ByteCode; + +pub const E820_RAM: u32 = 1; +pub const E820_RESERVED: u32 = 2; + +// Structures below sourced from: +// https://www.kernel.org/doc/html/latest/x86/boot.html +// https://www.kernel.org/doc/html/latest/x86/zero-page.html +#[repr(C, packed)] +#[derive(Debug, Default, Copy, Clone)] +pub struct RealModeKernelHeader { + setup_sects: u8, + root_flags: u16, + syssize: u32, + ram_size: u16, + vid_mode: u16, + root_dev: u16, + boot_flag: u16, + jump: u16, + header: u32, + version: u16, + realmode_swtch: u32, + start_sys_seg: u16, + kernel_version: u16, + type_of_loader: u8, + loadflags: u8, + setup_move_size: u16, + code32_start: u32, + ramdisk_image: u32, + ramdisk_size: u32, + bootsect_kludge: u32, + heap_end_ptr: u16, + ext_loader_ver: u8, + ext_loader_type: u8, + cmdline_ptr: u32, + initrd_addr_max: u32, + kernel_alignment: u32, + relocatable_kernel: u8, + min_alignment: u8, + xloadflags: u16, + cmdline_size: u32, + hardware_subarch: u32, + hardware_subarch_data: u64, + payload_offset: u32, + payload_length: u32, + setup_data: u64, + pref_address: u64, + init_size: u32, + handover_offset: u32, + kernel_info_offset: u32, +} + +impl RealModeKernelHeader { + pub fn new(cmdline_ptr: u32, cmdline_size: u32, ramdisk_image: u32, ramdisk_size: u32) -> Self { + RealModeKernelHeader { + boot_flag: 0xaa55, + header: 0x5372_6448, // "HdrS" + type_of_loader: 0xff, // undefined identifier and version + cmdline_ptr, + cmdline_size, + ramdisk_image, + ramdisk_size, + ..Default::default() + } + } +} + +#[repr(C, packed)] +#[derive(Debug, Default, Copy, Clone)] +pub struct E820Entry { + addr: u64, + size: u64, + type_: u32, +} + +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct BootParams { + screen_info: [u8; 0x40], + apm_bios_info: [u8; 0x14], + pad1: u32, + tboot_addr: [u8; 0x8], + ist_info: [u8; 0x10], + pad2: [u8; 0x10], + hd0_info: [u8; 0x10], + hd1_info: 
[u8; 0x10], + sys_desc_table: [u8; 0x10], + olpc_ofw_header: [u8; 0x10], + ext_ramdisk_image: u32, + ext_ramdisk_size: u32, + ext_cmd_line_ptr: u32, + pad3: [u8; 0x74], + edid_info: [u8; 0x80], + efi_info: [u8; 0x20], + alt_mem_k: u32, + scratch: u32, + e820_entries: u8, + eddbuf_entries: u8, + edd_mbr_sig_buf_entries: u8, + kbd_status: u8, + secure_boot: u8, + pad4: u16, + sentinel: u8, + pad5: u8, + kernel_header: RealModeKernelHeader, // offset: 0x1f1 + pad6: [u8; 0x24], + edd_mbr_sig_buffer: [u8; 0x40], + e820_table: [E820Entry; 0x80], + pad8: [u8; 0x30], + eddbuf: [u8; 0x1ec], +} + +impl ByteCode for BootParams {} + +impl Default for BootParams { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} + +impl BootParams { + pub fn new(kernel_header: RealModeKernelHeader) -> Self { + BootParams { + kernel_header, + ..Default::default() + } + } + + pub fn add_e820_entry(&mut self, addr: u64, size: u64, type_: u32) { + self.e820_table[self.e820_entries as usize] = E820Entry { addr, size, type_ }; + self.e820_entries += 1; + } +} + +#[cfg(test)] +mod test { + use std::path::PathBuf; + use std::sync::Arc; + + use address_space::{AddressSpace, GuestAddress, HostMemMapping, Region}; + + use super::super::{setup_boot_params, X86BootLoaderConfig}; + use super::*; + + #[test] + fn test_boot_param() { + // test setup_boot_params function + let root = Region::init_container_region(0x2000_0000); + let space = AddressSpace::new(root.clone()).unwrap(); + let ram1 = Arc::new(HostMemMapping::new(GuestAddress(0), 0x1000_0000, false).unwrap()); + let region_a = Region::init_ram_region(ram1.clone()); + root.add_subregion(region_a, ram1.start_address().raw_value()) + .unwrap(); + + let config = X86BootLoaderConfig { + kernel: PathBuf::new(), + initrd: Some(PathBuf::new()), + initrd_size: 0x1_0000, + kernel_cmdline: String::from("this_is_a_piece_of_test_string"), + cpu_count: 2, + }; + let (_, initrd_addr_tmp) = setup_boot_params(&config, &space).unwrap(); + assert_eq!(initrd_addr_tmp, 0xfff_0000); + let test_zero_page = space + .read_object::(GuestAddress(0x0000_7000)) + .unwrap(); + assert_eq!(test_zero_page.e820_entries, 4); + + unsafe { + assert_eq!(test_zero_page.e820_table[0].addr, 0); + assert_eq!(test_zero_page.e820_table[0].size, 0x0009_FC00); + assert_eq!(test_zero_page.e820_table[0].type_, 1); + + assert_eq!(test_zero_page.e820_table[1].addr, 0x0009_FC00); + assert_eq!(test_zero_page.e820_table[1].size, 0x400); + assert_eq!(test_zero_page.e820_table[1].type_, 2); + + assert_eq!(test_zero_page.e820_table[2].addr, 0x000F_0000); + assert_eq!(test_zero_page.e820_table[2].size, 0); + assert_eq!(test_zero_page.e820_table[2].type_, 2); + + assert_eq!(test_zero_page.e820_table[3].addr, 0x0010_0000); + assert_eq!(test_zero_page.e820_table[3].size, 0x0ff0_0000); + assert_eq!(test_zero_page.e820_table[3].type_, 1); + } + } +} diff --git a/boot_loader/src/x86_64/gdt.rs b/boot_loader/src/x86_64/gdt.rs new file mode 100644 index 00000000..557e4cf9 --- /dev/null +++ b/boot_loader/src/x86_64/gdt.rs @@ -0,0 +1,112 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. 
+// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::convert::Into; + +// /* +// * Constructor for a conventional segment GDT (or LDT) entry. +// * This is a macro so it can be used in initializers. +// */ +// #define GDT_ENTRY(flags, base, limit) \ +// ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ +// (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ +// (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ +// (((base) & _AC(0x00ffffff,ULL)) << 16) | \ +// (((limit) & _AC(0x0000ffff,ULL)))) +// +pub struct GdtEntry(pub u64); + +impl GdtEntry { + pub fn new(flags: u64, base: u64, limit: u64) -> Self { + let base = (base & 0xff00_0000) << (56 - 24) | (base & 0x00ff_ffff) << 16; + let limit = (limit & 0x000f_0000) << (48 - 16) | (limit & 0x0000_ffff); + let flags = (flags & 0x0000_f0ff) << 40; + + GdtEntry(base | limit | flags) + } +} + +// Intel SDM 3A 3.4.5, segment descriptor has two +// words(8 byte): +// Word 1st: +// Bits(0 - 15): Segment Limit +// Bits(16 - 31): Base Address 0:15 +// +// Word 2nd: +// Bits(0 - 7): Base Address 23:16 +// Bits(8 - 11): Type, Segment type +// Bits(12): S, Descriptor type +// Bits(13 - 14): DPL, Descriptor privilege level +// Bits(15): P, Segment present +// Bits(16 - 19): Segment Limit +// Bits(20): AVL, Available for use by system software +// Bits(21): L, 64-bit code segment +// Bits(22): D/B, Default Operation Size +// Bits(23): G, Granularity +// Bits(24 - 31): Base Address 24, 31 +impl Into for GdtEntry { + fn into(self) -> kvm_bindings::kvm_segment { + let base = (self.0 >> 16 & 0x00ff_ffff) | (self.0 >> (56 - 24) & 0xff00_0000); + let limit = (self.0 >> (48 - 16) & 0x000f_0000) | (self.0 & 0x0000_ffff); + let flags = (self.0 >> 40) & 0x0000_f0ff; + + kvm_bindings::kvm_segment { + base, + limit: limit as u32, + type_: (flags & 0xf) as u8, + present: ((flags >> (15 - 8)) & 0x1) as u8, + dpl: ((flags >> (13 - 8)) & 0x3) as u8, + db: ((flags >> (22 - 8)) & 0x1) as u8, + s: ((flags >> (12 - 8)) & 0x1) as u8, + l: ((flags >> (21 - 8)) & 0x1) as u8, + g: ((flags >> (23 - 8)) & 0x1) as u8, + avl: ((flags >> (20 - 8)) & 0x1) as u8, + ..Default::default() + } + } +} + +impl Into for GdtEntry { + fn into(self) -> u64 { + self.0 + } +} + +#[cfg(test)] +mod test { + use super::*; + use kvm_bindings::kvm_segment; + + #[test] + fn test_gdt_entry() { + assert_eq!(GdtEntry::new(0xa09b, 0x0, 0xfffff).0, 0xaf9b000000ffff); + assert_eq!(GdtEntry::new(0xc093, 0x0, 0xfffff).0, 0xcf93000000ffff); + } + + #[test] + fn test_segment() { + let gdt_entry = GdtEntry(0xaf9b000000ffff); + let seg: kvm_segment = gdt_entry.into(); + + assert_eq!(1, seg.g); + assert_eq!(0, seg.db); + assert_eq!(1, seg.l); + assert_eq!(0, seg.avl); + assert_eq!(1, seg.present); + assert_eq!(0, seg.dpl); + assert_eq!(1, seg.s); + assert_eq!(11, seg.type_); + assert_eq!(0, seg.base); + assert_eq!(1048575, seg.limit); + assert_eq!(0, seg.unusable); + } +} diff --git a/boot_loader/src/x86_64/mod.rs b/boot_loader/src/x86_64/mod.rs new file mode 100644 index 00000000..a4bcb3f1 --- /dev/null +++ b/boot_loader/src/x86_64/mod.rs @@ -0,0 +1,520 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. 
+// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! Boot Loader load PE linux kernel image to guest memory according +//! [`x86 boot protocol`](https://www.kernel.org/doc/Documentation/x86/boot.txt). +//! +//! Below is x86_64 bootloader memory layout: +//! +//! ``` text +//! +------------------------+ +//! 0x0000_0000 | Real Mode IVT | +//! | | +//! +------------------------+ +//! 0x0000_7000 | | +//! | Zero Page | +//! | | +//! 0x0000_9000 +------------------------+ +//! | Page Map Level4 | +//! | | +//! 0x0000_a000 +------------------------+ +//! | Page Directory Pointer| +//! | | +//! 0x0000_b000 +------------------------+ +//! | Page Directory Entry | +//! | | +//! 0x0002_0000 +------------------------+ +//! | Kernel Cmdline | +//! | | +//! 0x0009_fc00 +------------------------+ +//! | EBDA - MPtable | +//! | | +//! 0x000a_0000 +------------------------+ +//! | VGA_RAM | +//! | | +//! 0x000f_0000 +------------------------+ +//! | MB_BIOS | +//! | | +//! 0x0010_0000 +------------------------+ +//! | Kernel _setup | +//! | | +//! ~------------------------~ +//! | Initrd Ram | +//! 0x****_**** +------------------------+ +//! ``` + +const REAL_MODE_IVT_BEGIN: u64 = 0x0000_0000; + +extern crate address_space; + +mod bootparam; +mod gdt; +mod mptable; + +use std::path::PathBuf; +use std::string::String; +use std::sync::Arc; + +use kvm_bindings::kvm_segment; + +use self::errors::{ErrorKind, Result, ResultExt}; +use address_space::{AddressSpace, GuestAddress}; +use bootparam::{BootParams, RealModeKernelHeader, E820_RAM, E820_RESERVED}; +use gdt::GdtEntry; +use mptable::{ + BusEntry, ConfigTableHeader, FloatingPointer, IOApicEntry, IOInterruptEntry, + LocalInterruptEntry, ProcessEntry, DEST_ALL_LAPIC_MASK, INTERRUPT_TYPE_EXTINT, + INTERRUPT_TYPE_INT, INTERRUPT_TYPE_NMI, IOAPIC_BASE_ADDR, LAPIC_BASE_ADDR, +}; +use util::checksum::obj_checksum; + +pub mod errors { + error_chain! { + links { + AddressSpace(address_space::errors::Error, address_space::errors::ErrorKind); + } + errors { + MaxCpus(cpus: u8) { + display("Configure cpu number({}) above supported max cpu numbers(254)", cpus) + } + } + } +} + +const ZERO_PAGE_START: u64 = 0x0000_7000; +const PML4_START: u64 = 0x0000_9000; +const PDPTE_START: u64 = 0x0000_a000; +const PDE_START: u64 = 0x0000_b000; +const CMDLINE_START: u64 = 0x0002_0000; + +const EBDA_START: u64 = 0x0009_fc00; +const VGA_RAM_BEGIN: u64 = 0x000a_0000; +const MB_BIOS_BEGIN: u64 = 0x000f_0000; +const VMLINUX_RAM_START: u64 = 0x0010_0000; +const KVM_32BIT_GAP_SIZE: u64 = 0x0300 << 20; /* 3GB */ +const KVM_32BIT_MAX_MEM_SIZE: u64 = 1 << 32; /* 4GB */ +const KVM_32BIT_GAP_START: u64 = KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE; +const INITRD_ADDR_MAX: u64 = 0x37ff_ffff; + +const VMLINUX_STARTUP: u64 = 0x0100_0000; +const BOOT_LOADER_SP: u64 = 0x0000_8ff0; + +const GDT_ENTRY_BOOT_CS: u8 = 2; +const GDT_ENTRY_BOOT_DS: u8 = 3; +const BOOT_GDT_OFFSET: u64 = 0x500; +const BOOT_IDT_OFFSET: u64 = 0x520; + +const BOOT_GDT_MAX: usize = 4; + +/// Boot loader config used for x86_64. +pub struct X86BootLoaderConfig { + /// Path of the kernel image. + pub kernel: PathBuf, + /// Path of the initrd image. 
+ pub initrd: Option, + /// Initrd image size. + pub initrd_size: u32, + /// Kernel cmdline parameters. + pub kernel_cmdline: String, + /// VM's CPU count. + pub cpu_count: u8, +} + +/// The start address for some boot source in guest memory for `x86_64`. +pub struct X86BootLoader { + pub kernel_start: u64, + pub kernel_sp: u64, + pub initrd_start: u64, + pub boot_pml4_addr: u64, + pub zero_page_addr: u64, + pub segments: BootGdtSegment, +} + +#[derive(Debug, Default, Copy, Clone)] +pub struct BootGdtSegment { + pub code_segment: kvm_segment, + pub data_segment: kvm_segment, + pub gdt_base: u64, + pub gdt_limit: u16, + pub idt_base: u64, + pub idt_limit: u16, +} + +fn setup_page_table(sys_mem: &Arc) -> Result { + // Initial pagetables. + + // Puts PML4 right after zero page but aligned to 4k. + let boot_pml4_addr = PML4_START; + let boot_pdpte_addr = PDPTE_START; + let boot_pde_addr = PDE_START; + + // Entry covering VA [0..512GB) + let pdpte = boot_pdpte_addr | 0x03; + sys_mem + .write_object(&pdpte, GuestAddress(boot_pml4_addr)) + .chain_err(|| format!("Failed to load PD PTE to 0x{:x}", boot_pml4_addr))?; + + // Entry covering VA [0..1GB) + let pde = boot_pde_addr | 0x03; + sys_mem + .write_object(&pde, GuestAddress(boot_pdpte_addr)) + .unwrap(); + + // 512 2MB entries together covering VA [0..1GB). Note we are assuming + // CPU supports 2MB pages (/proc/cpuinfo has 'pse'). All modern CPUs do. + for i in 0..512u64 { + let pde = (i << 21) + 0x83u64; + sys_mem + .write_object(&pde, GuestAddress(boot_pde_addr + i * 8)) + .chain_err(|| format!("Failed to load PDE to 0x{:x}", boot_pde_addr + i * 8))?; + } + + Ok(boot_pml4_addr) +} + +macro_rules! write_entry { + ( $d:expr, $t:ty, $m:expr, $o:expr, $s:expr ) => { + let entry = $d; + $m.write_object(&entry, GuestAddress($o))?; + $o += std::mem::size_of::<$t>() as u64; + $s = $s.wrapping_add(obj_checksum(&entry)); + }; +} + +fn setup_isa_mptable(sys_mem: &Arc, start_addr: u64, num_cpus: u8) -> Result<()> { + const BUS_ID: u8 = 0; + const MPTABLE_MAX_CPUS: u32 = 254; // mptable max support 255 cpus, reserve one for ioapic id + const MPTABLE_IOAPIC_NR: u8 = 16; + + if u32::from(num_cpus) > MPTABLE_MAX_CPUS { + return Err(ErrorKind::MaxCpus(num_cpus).into()); + } + + let ioapic_id: u8 = num_cpus + 1; + let header = start_addr + std::mem::size_of::() as u64; + sys_mem.write_object( + &FloatingPointer::new(header as u32), + GuestAddress(start_addr), + )?; + + let mut offset = header + std::mem::size_of::() as u64; + let mut sum = 0u8; + + for cpu_id in 0..num_cpus { + write_entry!( + ProcessEntry::new(cpu_id as u8, true, cpu_id == 0), + ProcessEntry, + sys_mem, + offset, + sum + ); + } + + write_entry!(BusEntry::new(BUS_ID), BusEntry, sys_mem, offset, sum); + + write_entry!( + IOApicEntry::new(ioapic_id, true, IOAPIC_BASE_ADDR), + IOApicEntry, + sys_mem, + offset, + sum + ); + + for i in 0..MPTABLE_IOAPIC_NR { + write_entry!( + IOInterruptEntry::new(INTERRUPT_TYPE_INT, BUS_ID, i, ioapic_id, i), + IOInterruptEntry, + sys_mem, + offset, + sum + ); + } + + write_entry!( + LocalInterruptEntry::new(INTERRUPT_TYPE_EXTINT, BUS_ID, 0, ioapic_id, 0), + LocalInterruptEntry, + sys_mem, + offset, + sum + ); + + write_entry!( + LocalInterruptEntry::new(INTERRUPT_TYPE_NMI, BUS_ID, 0, DEST_ALL_LAPIC_MASK, 1), + LocalInterruptEntry, + sys_mem, + offset, + sum + ); + + sys_mem.write_object( + &ConfigTableHeader::new((offset - header) as u16, sum, LAPIC_BASE_ADDR), + GuestAddress(header), + )?; + + Ok(()) +} + +fn setup_boot_params( + config: &X86BootLoaderConfig, + 
sys_mem: &Arc, +) -> Result<(u64, u64)> { + let (ramdisk_size, ramdisk_image, initrd_addr) = if config.initrd_size > 0 { + let mut initrd_addr_max = INITRD_ADDR_MAX as u32; + if initrd_addr_max as u64 > sys_mem.memory_end_address().raw_value() as u64 { + initrd_addr_max = sys_mem.memory_end_address().raw_value() as u32; + }; + + let img = (initrd_addr_max - config.initrd_size as u32) & !0xfffu32; + (config.initrd_size as u32, img, img as u64) + } else { + info!("No initrd image file."); + (0u32, 0u32, 0u64) + }; + + let mut boot_params = BootParams::new(RealModeKernelHeader::new( + CMDLINE_START as u32, + config.kernel_cmdline.len() as u32, + ramdisk_image, + ramdisk_size, + )); + + boot_params.add_e820_entry( + REAL_MODE_IVT_BEGIN, + EBDA_START - REAL_MODE_IVT_BEGIN, + E820_RAM, + ); + boot_params.add_e820_entry(EBDA_START, VGA_RAM_BEGIN - EBDA_START, E820_RESERVED); + boot_params.add_e820_entry(MB_BIOS_BEGIN, 0, E820_RESERVED); + + let high_memory_start = GuestAddress(VMLINUX_RAM_START); + let end_32bit_gap_start = GuestAddress(KVM_32BIT_GAP_START); + let first_addr_past_32bits = GuestAddress(KVM_32BIT_MAX_MEM_SIZE); + let mem_end = sys_mem.memory_end_address(); + if mem_end < end_32bit_gap_start { + boot_params.add_e820_entry( + high_memory_start.raw_value() as u64, + mem_end.offset_from(high_memory_start) as u64, + E820_RAM, + ); + } else { + boot_params.add_e820_entry( + high_memory_start.raw_value() as u64, + end_32bit_gap_start.offset_from(high_memory_start) as u64, + E820_RAM, + ); + if mem_end > first_addr_past_32bits { + boot_params.add_e820_entry( + first_addr_past_32bits.raw_value() as u64, + mem_end.offset_from(first_addr_past_32bits) as u64, + E820_RAM, + ); + } + } + + sys_mem + .write_object(&boot_params, GuestAddress(ZERO_PAGE_START)) + .chain_err(|| format!("Failed to load zero page to 0x{:x}", ZERO_PAGE_START))?; + + Ok((ZERO_PAGE_START, initrd_addr)) +} + +fn write_gdt_table(table: &[u64], guest_mem: &Arc) -> Result<()> { + let mut boot_gdt_addr = BOOT_GDT_OFFSET as u64; + for (_, entry) in table.iter().enumerate() { + guest_mem + .write_object(entry, GuestAddress(boot_gdt_addr)) + .chain_err(|| format!("Failed to load gdt to 0x{:x}", boot_gdt_addr))?; + boot_gdt_addr += 8; + } + Ok(()) +} + +fn write_idt_value(val: u64, guest_mem: &Arc) -> Result<()> { + let boot_idt_addr = BOOT_IDT_OFFSET; + guest_mem + .write_object(&val, GuestAddress(boot_idt_addr)) + .chain_err(|| format!("Failed to load gdt to 0x{:x}", boot_idt_addr))?; + + Ok(()) +} + +pub fn setup_gdt(guest_mem: &Arc) -> Result { + let gdt_table: [u64; BOOT_GDT_MAX as usize] = [ + GdtEntry::new(0, 0, 0).into(), // NULL + GdtEntry::new(0, 0, 0).into(), // NULL + GdtEntry::new(0xa09b, 0, 0xfffff).into(), // CODE + GdtEntry::new(0xc093, 0, 0xfffff).into(), // DATA + ]; + + let mut code_seg: kvm_segment = GdtEntry(gdt_table[GDT_ENTRY_BOOT_CS as usize]).into(); + code_seg.selector = GDT_ENTRY_BOOT_CS as u16 * 8; + let mut data_seg: kvm_segment = GdtEntry(gdt_table[GDT_ENTRY_BOOT_DS as usize]).into(); + data_seg.selector = GDT_ENTRY_BOOT_DS as u16 * 8; + + write_gdt_table(&gdt_table[..], guest_mem)?; + write_idt_value(0, guest_mem)?; + + Ok(BootGdtSegment { + code_segment: code_seg, + data_segment: data_seg, + gdt_base: BOOT_GDT_OFFSET, + gdt_limit: std::mem::size_of_val(&gdt_table) as u16 - 1, + idt_base: BOOT_IDT_OFFSET, + idt_limit: std::mem::size_of::() as u16 - 1, + }) +} + +pub fn linux_bootloader( + config: &X86BootLoaderConfig, + sys_mem: &Arc, +) -> Result { + let boot_pml4 = setup_page_table(sys_mem)?; + + 
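+    // Write the MP (MultiProcessor) tables at the EBDA, so the guest kernel can
+    // discover the vCPUs, the ISA bus and the IOAPIC described there.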
setup_isa_mptable(sys_mem, EBDA_START, config.cpu_count)?; + + let (zero_page, initrd_addr) = setup_boot_params(&config, sys_mem)?; + + let gdt_seg = setup_gdt(sys_mem)?; + + Ok(X86BootLoader { + kernel_start: VMLINUX_STARTUP, + kernel_sp: BOOT_LOADER_SP, + initrd_start: initrd_addr, + boot_pml4_addr: boot_pml4, + zero_page_addr: zero_page, + segments: gdt_seg, + }) +} + +pub fn setup_kernel_cmdline( + config: &X86BootLoaderConfig, + sys_mem: &Arc, +) -> Result<()> { + let mut cmdline = config.kernel_cmdline.as_bytes(); + sys_mem.write( + &mut cmdline, + GuestAddress(CMDLINE_START), + config.kernel_cmdline.len() as u64, + )?; + + Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + use address_space::*; + use std::sync::Arc; + use std::vec::Vec; + #[test] + fn test_x86_bootloader_and_kernel_cmdline() { + let root = Region::init_container_region(0x2000_0000); + let space = AddressSpace::new(root.clone()).unwrap(); + let ram1 = Arc::new(HostMemMapping::new(GuestAddress(0), 0x1000_0000, false).unwrap()); + let region_a = Region::init_ram_region(ram1.clone()); + root.add_subregion(region_a, ram1.start_address().raw_value()) + .unwrap(); + assert_eq!(setup_page_table(&space).unwrap(), 0x0000_9000); + assert_eq!( + space.read_object::(GuestAddress(0x0000_9000)).unwrap(), + 0x0000_a003 + ); + assert_eq!( + space.read_object::(GuestAddress(0x0000_a000)).unwrap(), + 0x0000_b003 + ); + let mut page_addr: u64 = 0x0000_b000; + let mut tmp_value: u64 = 0x83; + for _ in 0..512u64 { + assert_eq!( + space.read_object::(GuestAddress(page_addr)).unwrap(), + tmp_value + ); + page_addr += 8; + tmp_value += 0x20_0000; + } + + let config = X86BootLoaderConfig { + kernel: PathBuf::new(), + initrd: Some(PathBuf::new()), + initrd_size: 0x1_0000, + kernel_cmdline: String::from("this_is_a_piece_of_test_string"), + cpu_count: 2, + }; + let (_, initrd_addr_tmp) = setup_boot_params(&config, &space).unwrap(); + assert_eq!(initrd_addr_tmp, 0xfff_0000); + + //test setup_gdt function + let c_seg = kvm_segment { + base: 0, + limit: 1048575, + selector: 16, + type_: 11, + present: 1, + dpl: 0, + db: 0, + s: 1, + l: 1, + g: 1, + avl: 0, + unusable: 0, + padding: 0, + }; + let d_seg = kvm_segment { + base: 0, + limit: 1048575, + selector: 24, + type_: 3, + present: 1, + dpl: 0, + db: 1, + s: 1, + l: 0, + g: 1, + avl: 0, + unusable: 0, + padding: 0, + }; + + let boot_gdt_seg = setup_gdt(&space).unwrap(); + + assert_eq!(boot_gdt_seg.code_segment, c_seg); + assert_eq!(boot_gdt_seg.data_segment, d_seg); + assert_eq!(boot_gdt_seg.gdt_limit, 31); + assert_eq!(boot_gdt_seg.idt_limit, 7); + let mut arr: Vec = Vec::new(); + let mut boot_addr: u64 = 0x500; + for _ in 0..BOOT_GDT_MAX { + arr.push(space.read_object(GuestAddress(boot_addr)).unwrap()); + boot_addr += 8; + } + assert_eq!(arr[0], 0); + assert_eq!(arr[1], 0); + assert_eq!(arr[2], 0xaf9b000000ffff); + assert_eq!(arr[3], 0xcf93000000ffff); + + //test setup_kernel_cmdline function + let cmd_len: u64 = config.kernel_cmdline.len() as u64; + let mut read_buffer: [u8; 30] = [0; 30]; + //let mut read_buffer:Vec = Vec::with_capacity(); + assert!(setup_kernel_cmdline(&config, &space).is_ok()); + space + .read( + &mut read_buffer.as_mut(), + GuestAddress(0x0002_0000), + cmd_len, + ) + .unwrap(); + let s = String::from_utf8(read_buffer.to_vec()).unwrap(); + assert_eq!(s, "this_is_a_piece_of_test_string".to_string()); + } +} diff --git a/boot_loader/src/x86_64/mptable.rs b/boot_loader/src/x86_64/mptable.rs new file mode 100644 index 00000000..2651e3fd --- /dev/null +++ 
b/boot_loader/src/x86_64/mptable.rs @@ -0,0 +1,259 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use util::byte_code::ByteCode; +use util::checksum::obj_checksum; + +const SPEC_VERSION: u8 = 4; // version 1.4 +const APIC_VERSION: u8 = 0x14; + +// Variables and Structures below sourced from: +// Intel MultiProcessor Specification 1.4 +const CPU_FLAGS_ENABLE: u8 = 0x1; +const CPU_FLAGS_BSP: u8 = 0x2; +const APIC_FLAGS_ENABLE: u8 = 0x1; + +pub const INTERRUPT_TYPE_INT: u8 = 0; +pub const INTERRUPT_TYPE_NMI: u8 = 1; +pub const INTERRUPT_TYPE_EXTINT: u8 = 3; +pub const IOAPIC_BASE_ADDR: u32 = 0xfec0_0000; +pub const LAPIC_BASE_ADDR: u32 = 0xfee0_0000; +pub const DEST_ALL_LAPIC_MASK: u8 = 0xff; + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct FloatingPointer { + signature: [u8; 4], + pointer: u32, + length: u8, + spec: u8, + checksum: u8, + feature1: u8, + feature2: u32, +} + +impl ByteCode for FloatingPointer {} + +impl FloatingPointer { + pub fn new(pointer: u32) -> Self { + let mut fp = FloatingPointer { + signature: [b'_', b'M', b'P', b'_'], + pointer, + length: 1, // spec: 01h + spec: SPEC_VERSION, + checksum: 0, + feature1: 0, + feature2: 0, + }; + + let sum = obj_checksum(&fp); + fp.checksum = (-(sum as i8)) as u8; + + fp + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct ConfigTableHeader { + signature: [u8; 4], + length: u16, + spec: u8, + checksum: u8, + oem_id: [u8; 8], + product_id: [u8; 12], + oem_table_pointer: u32, + oem_table_size: u16, + entry_count: u16, + lapic_addr: u32, + ext_table_length: u16, + ext_table_checksum: u8, + reserved: u8, +} + +impl ByteCode for ConfigTableHeader {} + +impl ConfigTableHeader { + pub fn new(length: u16, sum: u8, lapic_addr: u32) -> Self { + let mut ct = ConfigTableHeader { + signature: [b'P', b'C', b'M', b'P'], + length, + spec: SPEC_VERSION, + checksum: 0, + oem_id: [b'q', b'v', 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], + product_id: [ + b'1', b'.', b'0', 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + ], + oem_table_pointer: 0, + oem_table_size: 0, + entry_count: 0, + lapic_addr, + ext_table_length: 0, + ext_table_checksum: 0, + reserved: 0, + }; + + let sum = sum.wrapping_add(obj_checksum(&ct)); + ct.checksum = (-(sum as i8)) as u8; + + ct + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct ProcessEntry { + type_: u8, + lapic_id: u8, + lapic_version: u8, + cpu_flags: u8, + cpu_signature: u32, + feature_flags: u32, + reserved: u32, + reserved1: u32, +} + +impl ByteCode for ProcessEntry {} + +impl ProcessEntry { + pub fn new(lapic_id: u8, enable: bool, bsp: bool) -> Self { + let mut cpu_flags = if enable { CPU_FLAGS_ENABLE } else { 0 }; + if bsp { + cpu_flags |= CPU_FLAGS_BSP; + } + + ProcessEntry { + type_: 0, + lapic_id, + lapic_version: APIC_VERSION, + cpu_flags, + cpu_signature: 0x600, // Intel CPU Family Number: 0x6 + feature_flags: 0x201, // APIC & FPU + reserved: 0, + reserved1: 0, + } + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct BusEntry { + type_: 
u8, + bus_id: u8, + bus_type: [u8; 6], +} + +impl ByteCode for BusEntry {} + +impl BusEntry { + pub fn new(bus_id: u8) -> Self { + BusEntry { + type_: 1, + bus_id, + bus_type: [b'I', b'S', b'A', 0x0, 0x0, 0x0], + } + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct IOApicEntry { + type_: u8, + ioapic_id: u8, + ioapic_version: u8, + ioapic_flags: u8, + ioapic_addr: u32, +} + +impl ByteCode for IOApicEntry {} + +impl IOApicEntry { + pub fn new(ioapic_id: u8, enable: bool, ioapic_addr: u32) -> Self { + let ioapic_flags = if enable { APIC_FLAGS_ENABLE } else { 0 }; + + IOApicEntry { + type_: 2, + ioapic_id, + ioapic_version: APIC_VERSION, + ioapic_flags, + ioapic_addr, + } + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct IOInterruptEntry { + type_: u8, + interrupt_type: u8, + interrupt_flags: u16, + source_bus_id: u8, + source_bus_irq: u8, + dest_ioapic_id: u8, + dest_ioapic_int: u8, +} + +impl ByteCode for IOInterruptEntry {} + +impl IOInterruptEntry { + pub fn new( + interrupt_type: u8, + source_bus_id: u8, + source_bus_irq: u8, + dest_ioapic_id: u8, + dest_ioapic_int: u8, + ) -> Self { + IOInterruptEntry { + type_: 3, + interrupt_type, + interrupt_flags: 0, // conforms to spec of bus + source_bus_id, + source_bus_irq, + dest_ioapic_id, + dest_ioapic_int, + } + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct LocalInterruptEntry { + type_: u8, + interrupt_type: u8, + interrupt_flags: u16, + source_bus_id: u8, + source_bus_irq: u8, + dest_lapic_id: u8, + dest_lapic_lint: u8, +} + +impl ByteCode for LocalInterruptEntry {} + +impl LocalInterruptEntry { + pub fn new( + interrupt_type: u8, + source_bus_id: u8, + source_bus_irq: u8, + dest_lapic_id: u8, + dest_lapic_lint: u8, + ) -> Self { + LocalInterruptEntry { + type_: 4, + interrupt_type, + interrupt_flags: 0, // conforms to spec of bus + source_bus_id, + source_bus_irq, + dest_lapic_id, + dest_lapic_lint, + } + } +} diff --git a/device_model/Cargo.toml b/device_model/Cargo.toml new file mode 100644 index 00000000..a799fb54 --- /dev/null +++ b/device_model/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "device_model" +version = "0.1.0" +authors = ["Huawei StratoVirt Team"] +edition = "2018" +license = "Mulan PSL v2" + +[dependencies] +log = "0.4.8" +libc = "0.2.71" +kvm-bindings = "0.2.0" +kvm-ioctls = "0.5.0" +vmm-sys-util = "0.6.1" +byteorder = "1.3.4" +error-chain = "0.12.4" +serde = { version = "1.0.114", features = ["derive"] } +serde_json = "1.0.55" +address_space = { path = "../address_space" } +util = { path = "../util" } +machine_manager = { path = "../machine_manager" } +boot_loader = { path = "../boot_loader" } + +[features] +default = ["qmp"] +mmio = [] +qmp = [] diff --git a/device_model/src/cpu/aarch64/mod.rs b/device_model/src/cpu/aarch64/mod.rs new file mode 100644 index 00000000..2510f48b --- /dev/null +++ b/device_model/src/cpu/aarch64/mod.rs @@ -0,0 +1,260 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
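+ // Module overview (descriptive note): this file maps the AArch64 core
+ // registers onto KVM register ids. The `Into<u64>` impl below composes an id
+ // as KVM_REG_ARM64 | <register size> | KVM_REG_ARM_CORE | (byte offset into
+ // `kvm_regs` divided by the size of u32), and `CPUAArch64::reset_vcpu` uses
+ // those ids to seed PSTATE, x0..x3 and PC before the first KVM_RUN.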
+ +use std::convert::Into; +use std::mem; +use std::sync::Arc; + +use kvm_bindings::{ + kvm_regs, kvm_vcpu_init, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, + KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, +}; +use kvm_ioctls::{VcpuFd, VmFd}; + +use self::errors::{ErrorKind, Result}; + +pub mod errors { + error_chain! { + errors { + GetSysRegister(err_info: String) { + description("Get sys Register error") + display("Failed to get system register: {}!", err_info) + } + SetSysRegister(err_info: String) { + description("Set sys Register error") + display("Failed to Set system register: {}!", err_info) + } + } + } +} + +// PSR (Processor State Register) bits. +// See: https://elixir.bootlin.com/linux/v5.6/source/arch/arm64/include/uapi/asm/ptrace.h#L34 +#[allow(non_upper_case_globals)] +const PSR_MODE_EL1h: u64 = 0x0000_0005; +const PSR_F_BIT: u64 = 0x0000_0040; +const PSR_I_BIT: u64 = 0x0000_0080; +const PSR_A_BIT: u64 = 0x0000_0100; +const PSR_D_BIT: u64 = 0x0000_0200; + +// MPIDR - Multiprocessor Affinity Register. +// See: https://elixir.bootlin.com/linux/v5.6/source/arch/arm64/include/asm/sysreg.h#L130 +pub const SYS_MPIDR_EL1: u64 = 0x6030_0000_0013_c005; + +// MPIDR is Multiprocessor Affinity Register +// [40:63] bit reserved on AArch64 Architecture, +const UNINIT_MPIDR: u64 = 0xFFFF_FF00_0000_0000; + +// AArch64 cpu core register +// See: https://elixir.bootlin.com/linux/v5.6/source/arch/arm64/include/uapi/asm/kvm.h#L50 + +// User structures for general purpose, floating point and debug registers. +// See: https://elixir.bootlin.com/linux/v5.6/source/arch/arm64/include/uapi/asm/ptrace.h#L75 +#[allow(non_camel_case_types)] +#[allow(dead_code)] +pub enum Arm64CoreRegs { + KVM_USER_PT_REGS, + KVM_SP_EL1, + KVM_ELR_EL1, + KVM_SPSR(usize), + KVM_USER_FPSIMD_STATE, + USER_PT_REG_REGS(usize), + USER_PT_REG_SP, + USER_PT_REG_PC, + USER_PT_REG_PSTATE, + USER_FPSIMD_STATE_VREGS(usize), + USER_FPSIMD_STATE_FPSR, + USER_FPSIMD_STATE_FPCR, + USER_FPSIMD_STATE_RES(usize), +} + +#[allow(clippy::zero_ptr)] +impl Into for Arm64CoreRegs { + fn into(self) -> u64 { + let register_size; + let regid = match self { + Arm64CoreRegs::KVM_USER_PT_REGS => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, regs) + } + Arm64CoreRegs::KVM_SP_EL1 => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, sp_el1) + } + Arm64CoreRegs::KVM_ELR_EL1 => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, elr_el1) + } + Arm64CoreRegs::KVM_SPSR(idx) if idx < KVM_NR_SPSR as usize => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, spsr) + idx * 8 + } + Arm64CoreRegs::KVM_USER_FPSIMD_STATE => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, fp_regs) + } + Arm64CoreRegs::USER_PT_REG_REGS(idx) if idx < 31 => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, regs, user_pt_regs, regs) + idx * 8 + } + Arm64CoreRegs::USER_PT_REG_SP => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, regs, user_pt_regs, sp) + } + Arm64CoreRegs::USER_PT_REG_PC => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, regs, user_pt_regs, pc) + } + Arm64CoreRegs::USER_PT_REG_PSTATE => { + register_size = KVM_REG_SIZE_U64; + offset_of!(kvm_regs, regs, user_pt_regs, pstate) + } + Arm64CoreRegs::USER_FPSIMD_STATE_VREGS(idx) if idx < 32 => { + register_size = KVM_REG_SIZE_U128; + offset_of!(kvm_regs, fp_regs, user_fpsimd_state, vregs) + idx * 16 + } + Arm64CoreRegs::USER_FPSIMD_STATE_FPSR => { + register_size = KVM_REG_SIZE_U32; + 
offset_of!(kvm_regs, fp_regs, user_fpsimd_state, fpsr) + } + Arm64CoreRegs::USER_FPSIMD_STATE_FPCR => { + register_size = KVM_REG_SIZE_U32; + offset_of!(kvm_regs, fp_regs, user_fpsimd_state, fpcr) + } + Arm64CoreRegs::USER_FPSIMD_STATE_RES(idx) if idx < 2 => { + register_size = 128; + offset_of!(kvm_regs, fp_regs, user_fpsimd_state, __reserved) + idx * 8 + } + _ => panic!("No such Register"), + }; + + KVM_REG_ARM64 as u64 + | register_size as u64 + | u64::from(KVM_REG_ARM_CORE) + | (regid / mem::size_of::()) as u64 + } +} + +pub fn set_one_core_reg(vcpu: &Arc, reg: Arm64CoreRegs, data: u64) -> Result<()> { + match vcpu.set_one_reg(reg.into(), data) { + Ok(_) => Ok(()), + Err(e) => Err(ErrorKind::SetSysRegister(format!("{:?}", e)).into()), + } +} + +/// AArch64 CPU booting configure information +/// +/// Before jumping into the kernel, primary CPU general-purpose +/// register `x0` need to setting to physical address of device +/// tree blob (dtb) in system RAM. +/// +/// See: https://elixir.bootlin.com/linux/v5.6/source/Documentation/arm64/booting.rst +#[derive(Default, Copy, Clone)] +pub struct AArch64CPUBootConfig { + pub fdt_addr: u64, + pub kernel_addr: u64, +} + +/// AArch64 CPU architect information +#[derive(Default, Copy, Clone)] +pub struct CPUAArch64 { + /// The vcpu id, `0` means primary CPU. + vcpu_id: u32, + /// MPIDR register value of this vcpu, + /// The MPIDR provides an additional processor identification mechanism + /// for scheduling purposes. + mpidr: u64, + /// The guest physical address of kernel start point. + boot_ip: u64, + /// The guest physical address of device tree blob (dtb). + fdt_addr: u64, + /// Used to pass vcpu target and supported features to kvm. + kvi: kvm_vcpu_init, +} + +impl CPUAArch64 { + pub fn new(vm_fd: &Arc, vcpu_id: u32) -> Self { + let mut kvi = kvm_bindings::kvm_vcpu_init::default(); + vm_fd.get_preferred_target(&mut kvi).unwrap(); + + CPUAArch64 { + vcpu_id, + mpidr: UNINIT_MPIDR, + boot_ip: 0, + fdt_addr: 0, + kvi, + } + } + + pub fn realize( + &mut self, + vcpu_fd: &Arc, + boot_config: &AArch64CPUBootConfig, + ) -> Result<()> { + self.boot_ip = boot_config.kernel_addr; + self.fdt_addr = boot_config.fdt_addr; + + // support PSCI 0.2 + // We already checked that the capability is supported. + self.kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; + + // Non-boot cpus are powered off initially. + if self.vcpu_id != 0 { + self.kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; + } + + vcpu_fd.vcpu_init(&self.kvi).unwrap(); + + self.get_mpidr(vcpu_fd); + + Ok(()) + } + + pub fn get_mpidr(&mut self, vcpu_fd: &Arc) -> u64 { + if self.mpidr == UNINIT_MPIDR { + self.mpidr = match vcpu_fd.get_one_reg(SYS_MPIDR_EL1) { + Ok(mpidr) => mpidr as u64, + Err(e) => panic!("update vcpu mpidr failed {:?}", e), + }; + } + debug!("self.mpidr is {}", self.mpidr); + self.mpidr + } + + pub fn reset_vcpu(&self, vcpu: &Arc) -> Result<()> { + // Configure PSTATE(Processor State), mask all interrupts. + let data: u64 = PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h; + set_one_core_reg(&vcpu, Arm64CoreRegs::USER_PT_REG_PSTATE, data) + .expect("Failed to set core reg pstate register"); + + // Reset x1, x2, x3 register to zero. 
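+ // The arm64 boot protocol referenced above reserves x1, x2 and x3 (they are
+ // expected to be zero on entry); x0 and pc are only programmed for the
+ // primary vcpu further down.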
+ set_one_core_reg(&vcpu, Arm64CoreRegs::USER_PT_REG_REGS(1), 0) + .expect("Failed to init x1 to zero"); + + set_one_core_reg(&vcpu, Arm64CoreRegs::USER_PT_REG_REGS(2), 0) + .expect("Failed to init x2 to zero"); + + set_one_core_reg(&vcpu, Arm64CoreRegs::USER_PT_REG_REGS(3), 0) + .expect("Failed to init x3 to zero"); + + // Configure boot ip and device tree address, prepare for kernel setup + if self.vcpu_id == 0 { + set_one_core_reg(&vcpu, Arm64CoreRegs::USER_PT_REG_REGS(0), self.fdt_addr) + .expect("Failed to set device tree address"); + + set_one_core_reg(&vcpu, Arm64CoreRegs::USER_PT_REG_PC, self.boot_ip) + .expect("Failed to set boot ip"); + } + + Ok(()) + } +} diff --git a/device_model/src/cpu/mod.rs b/device_model/src/cpu/mod.rs new file mode 100644 index 00000000..09143450 --- /dev/null +++ b/device_model/src/cpu/mod.rs @@ -0,0 +1,633 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! # Cpu +//! +//! This mod is to initialize vcpus to assigned state and drive them to run. +//! +//! ## Design +//! +//! This module offers support for: +//! 1. Create vcpu. +//! 2. According configuration, initialize vcpu registers and run. +//! 3. Handle vcpu VmIn/VmOut events. +//! 4. Handle vcpu lifecycle. +//! +//! ## Platform Support +//! +//! - `x86_64` +//! - `aarch64` +#[cfg(target_arch = "aarch64")] +mod aarch64; +#[cfg(target_arch = "x86_64")] +mod x86_64; + +use std::cell::RefCell; +use std::sync::{Arc, Barrier, Condvar, Mutex}; +use std::thread; +use std::time::Duration; + +use kvm_ioctls::{VcpuExit, VcpuFd}; +use libc::{c_int, c_void, siginfo_t}; +use vmm_sys_util::signal::{register_signal_handler, Killable}; + +#[cfg(feature = "qmp")] +use machine_manager::{qmp::qmp_schema as schema, qmp::QmpChannel}; + +use self::errors::{ErrorKind, Result}; +#[cfg(target_arch = "aarch64")] +pub use aarch64::errors as ArchCPUError; +#[cfg(target_arch = "aarch64")] +pub use aarch64::AArch64CPUBootConfig as CPUBootConfig; +#[cfg(target_arch = "aarch64")] +pub use aarch64::CPUAArch64 as ArchCPU; +use machine_manager::machine::MachineInterface; +#[cfg(target_arch = "x86_64")] +pub use x86_64::errors as ArchCPUError; +#[cfg(target_arch = "x86_64")] +pub use x86_64::X86CPUBootConfig as CPUBootConfig; +#[cfg(target_arch = "x86_64")] +pub use x86_64::X86CPU as ArchCPU; + +pub mod errors { + error_chain! 
{ + links { + ArchCpu(super::ArchCPUError::Error, super::ArchCPUError::ErrorKind); + } + foreign_links { + Signal(vmm_sys_util::errno::Error); + } + errors { + CreateVcpu(err_info: String) { + description("Create kvm vcpu error!") + display("Failed to create kvm vcpu: {}!", err_info) + } + RealizeVcpu(err_info: String) { + description("Configure vcpu error!") + display("Failed to configure kvm vcpu: {}!", err_info) + } + StartVcpu(err_info: String) { + description("Start vcpu error!") + display("Failed to starting kvm vcpu: {}!", err_info) + } + StopVcpu(err_info: String) { + description("Stop vcpu error!") + display("Failed to stopping kvm vcpu: {}!", err_info) + } + DestroyVcpu(err_info: String) { + description("Destroy vcpu error!") + display("Failed to destroy kvm vcpu: {}!", err_info) + } + } + } +} + +// SIGRTMIN = 34 (GNU, in MUSL is 35) and SIGRTMAX = 64 in linux, VCPU signal +// number should be assigned to SIGRTMIN + n, (n = 0...30). +#[cfg(not(target_env = "musl"))] +const VCPU_EXIT_SIGNAL: i32 = 34; +#[cfg(target_env = "musl")] +const VCPU_EXIT_SIGNAL: i32 = 35; +#[cfg(not(target_env = "musl"))] +const VCPU_PAUSE_SIGNAL: i32 = 35; +#[cfg(target_env = "musl")] +const VCPU_PAUSE_SIGNAL: i32 = 36; +#[cfg(not(target_env = "musl"))] +const VCPU_TASK_SIGNAL: i32 = 36; +#[cfg(target_env = "musl")] +const VCPU_TASK_SIGNAL: i32 = 37; + +const UNINITIALIZED_VCPU_ID: u32 = 9999; + +/// State for `CPU` lifecycle. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum CpuLifecycleState { + /// `CPU` structure is only be initialized, but nothing set. + Nothing = 0, + /// `CPU` structure's property is set with configuration. + Created = 1, + /// `CPU` start to be running. + Running = 2, + /// `CPU` thread is sleeping. + Paused = 3, + /// `CPU` structure is going to destroy. + Stopping = 4, + /// `CPU` structure destroyed, will be dropped soon. + Stopped = 5, +} + +// Record vcpu information +struct ThreadVcpu { + dirty_stamps: u64, + vcpu_id: u32, +} + +thread_local! { + static LOCAL_THREAD_VCPU: RefCell = RefCell::new( + ThreadVcpu { + dirty_stamps: 0, + vcpu_id: UNINITIALIZED_VCPU_ID, + } + ) +} + +fn init_local_thread_vcpu(vcpu_id: u8) { + LOCAL_THREAD_VCPU.with(|thread_vcpu| { + let mut vcpu_signal = thread_vcpu.borrow_mut(); + vcpu_signal.vcpu_id = u32::from(vcpu_id); + vcpu_signal.dirty_stamps = 0; + }) +} + +/// Trait to handle `CPU` lifetime. +pub trait CPUInterface { + /// Realize `CPU` structure, set registers value for `CPU`. + fn realize(&self, boot: &CPUBootConfig) -> Result<()>; + + /// + /// # Arguments + /// + /// * `cpu` - The cpu instance shared in thread. + /// * `thread_barrier` - The cpu thread barrier. + /// * `paused` - After started, paused vcpu or not. + /// * `use seccomp` - Use seccomp in vcpu thread. + fn start( + cpu: Arc, + thread_barrier: Arc, + paused: bool, + use_seccomp: bool, + ) -> Result<()> + where + Self: std::marker::Sized; + + /// Make `CPU` lifecycle from `Running` to `Paused`. + fn pause(&self) -> Result<()>; + + /// Make `CPU` lifecycle from `Paused` to `Running`. + fn resume(&self) -> Result<()>; + + /// Make `CPU` lifecycle to `Stopping`, then `Stopped`. + fn destroy(&self) -> Result<()>; + + /// Reset registers value for `CPU`. + fn reset(&self) -> Result<()>; + + /// Handle vcpu event from `kvm`. + fn kvm_vcpu_exec(&self) -> Result; +} + +/// Trait to handle `CPU` running statement. +pub trait CPUWorker { + const SYNC_READ_CPU_STATE: u64 = 1; + const SYNC_WRITE_CPU_STATE: u64 = 2; + + /// Handle `notify` change in vcpu thread. 
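+ /// If the thread-local dirty stamp set by a signal handler is non-zero,
+ /// clear it and acknowledge any pending `SYNC_READ_CPU_STATE` /
+ /// `SYNC_WRITE_CPU_STATE` requests on the work queue, waking the waiters.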
+ fn handle_workqueue(&self); + + /// Check vcpu thread is `paused` or `running`. + fn ready_for_running(&self) -> bool; +} + +/// `CPU` is a wrapper around creating and using a kvm-based VCPU. +pub struct CPU { + /// ID of this virtual CPU, `0` means this cpu is primary `CPU`. + id: u8, + /// The file descriptor of this kvm-based VCPU. + fd: Arc, + /// Architecture special CPU property. + arch_cpu: Arc>, + /// LifeCycle state of kvm-based VCPU. + state: Arc<(Mutex, Condvar)>, + /// Works need to handled by this VCPU. + work_queue: Arc<(Mutex, Condvar)>, + /// The thread handler of this virtual CPU. + task: Arc>>>, + /// The thread tid of this VCPU. + tid: Arc>>, + /// The VM combined by this VCPU. + vm: Arc>>, +} + +impl CPU { + /// Allocates a new `CPU` for `vm` + /// + /// # Arguments + /// + /// * `vcpu_fd` - The file descriptor of this `CPU`. + /// * `id` - ID of this `CPU`. + /// * `arch_cpu` - Architecture special `CPU` property. + /// * `vm` - The virtual machine this `CPU` gets attached to. + pub fn new( + vcpu_fd: Arc, + id: u8, + arch_cpu: Arc>, + vm: Arc>>, + ) -> Result { + Ok(CPU { + id, + fd: vcpu_fd, + arch_cpu, + state: Arc::new((Mutex::new(CpuLifecycleState::Created), Condvar::new())), + work_queue: Arc::new((Mutex::new(0), Condvar::new())), + task: Arc::new(Mutex::new(None)), + tid: Arc::new(Mutex::new(None)), + vm, + }) + } + + /// Get this `CPU`'s ID. + pub fn id(&self) -> u8 { + self.id + } + + /// Get this `CPU`'s file descriptor. + #[cfg(target_arch = "aarch64")] + pub fn fd(&self) -> &Arc { + &self.fd + } + + /// Get this `CPU`'s architecture-special property. + #[cfg(target_arch = "aarch64")] + pub fn arch(&self) -> &Arc> { + &self.arch_cpu + } + + /// Set task the `CPU` to handle. + pub fn set_task(&self, task: Option>) { + let mut data = self.task.lock().unwrap(); + (*data).take().map(thread::JoinHandle::join); + *data = task; + } + + /// Get this `CPU`'s thread id. + pub fn tid(&self) -> u64 { + match *self.tid.lock().unwrap() { + Some(tid) => tid, + None => 0, + } + } + + /// Set thread id for `CPU`. + pub fn set_tid(&self) { + *self.tid.lock().unwrap() = Some(util::unix::gettid()); + } + + /// Init signal for `CPU` event. 
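+ /// Registers handlers for `VCPU_EXIT_SIGNAL`, `VCPU_PAUSE_SIGNAL` and
+ /// `VCPU_TASK_SIGNAL`; the handler only records the exit/pause signal in
+ /// the thread-local dirty stamp, so its main effect is to interrupt a
+ /// blocking KVM_RUN and let the vcpu loop re-check its lifecycle state.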
+ fn init_signals() -> Result<()> { + extern "C" fn handle_signal(signum: c_int, _: *mut siginfo_t, _: *mut c_void) { + match signum { + VCPU_EXIT_SIGNAL => LOCAL_THREAD_VCPU.with(|thread_vcpu| { + let mut vcpu_signal = thread_vcpu.borrow_mut(); + vcpu_signal.dirty_stamps = VCPU_EXIT_SIGNAL as u64; + }), + VCPU_PAUSE_SIGNAL => LOCAL_THREAD_VCPU.with(|thread_vcpu| { + let mut vcpu_signal = thread_vcpu.borrow_mut(); + vcpu_signal.dirty_stamps = VCPU_PAUSE_SIGNAL as u64; + }), + _ => {} + } + } + + register_signal_handler(VCPU_EXIT_SIGNAL, handle_signal)?; + register_signal_handler(VCPU_PAUSE_SIGNAL, handle_signal)?; + register_signal_handler(VCPU_TASK_SIGNAL, handle_signal)?; + + Ok(()) + } +} + +impl CPUInterface for CPU { + fn realize(&self, boot: &CPUBootConfig) -> Result<()> { + let (cpu_state, _) = &*self.state; + if *cpu_state.lock().unwrap() != CpuLifecycleState::Created { + return Err( + ErrorKind::RealizeVcpu(format!("VCPU{} may has realized.", self.id())).into(), + ); + } + + self.arch_cpu.lock().unwrap().realize(&self.fd, boot)?; + + Ok(()) + } + + fn resume(&self) -> Result<()> { + let (cpu_state_locked, cvar) = &*self.state; + let mut cpu_state = cpu_state_locked.lock().unwrap(); + if *cpu_state == CpuLifecycleState::Running { + warn!("vcpu{} in running state, no need to resume", self.id()); + return Ok(()); + } + + *cpu_state = CpuLifecycleState::Running; + drop(cpu_state); + cvar.notify_one(); + Ok(()) + } + + fn start( + cpu: Arc, + thread_barrier: Arc, + paused: bool, + use_seccomp: bool, + ) -> Result<()> { + let (cpu_state, _) = &*cpu.state; + if *cpu_state.lock().unwrap() == CpuLifecycleState::Running { + return Err(ErrorKind::StartVcpu("Cpu is already running".to_string()).into()); + } + if paused { + *cpu_state.lock().unwrap() = CpuLifecycleState::Paused; + } else { + *cpu_state.lock().unwrap() = CpuLifecycleState::Running; + } + + let local_cpu = cpu.clone(); + let handle = thread::Builder::new() + .name(format!("CPU {}/KVM", cpu.id)) + .spawn(move || { + init_local_thread_vcpu(cpu.id); + if let Err(e) = CPU::init_signals() { + error!("Failed to init cpu{} signal:{}", cpu.id, e); + } + + cpu.set_tid(); + + // The vcpu thread is going to run, + // reset its running environment. + cpu.reset().unwrap(); + + // Wait for all vcpu to complete the running + // environment initialization. + thread_barrier.wait(); + + info!("vcpu{} start running", cpu.id); + if use_seccomp { + if let Err(e) = crate::micro_vm::micro_syscall::register_seccomp() { + error!("Failed to register seccomp in cpu{} thread:{}", cpu.id, e); + } + } + + loop { + if !cpu.ready_for_running() { + break; + } + + if !cpu.kvm_vcpu_exec().unwrap() { + break; + } + } + + // The vcpu thread is about to exit, marking the state + // of the CPU state as Stopped. 
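+ // destroy() waits on this condvar (with a 16 ms timeout) for exactly this
+ // Stopped transition before it reports the vcpu as destroyed.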
+ let (cpu_state, cvar) = &*cpu.state; + *cpu_state.lock().unwrap() = CpuLifecycleState::Stopped; + cvar.notify_one(); + }) + .unwrap(); + local_cpu.set_task(Some(handle)); + Ok(()) + } + + fn reset(&self) -> Result<()> { + self.arch_cpu.lock().unwrap().reset_vcpu(&self.fd)?; + Ok(()) + } + + fn pause(&self) -> Result<()> { + let task = self.task.lock().unwrap(); + let (cpu_state, cvar) = &*self.state; + + if *cpu_state.lock().unwrap() == CpuLifecycleState::Running { + *cpu_state.lock().unwrap() = CpuLifecycleState::Paused; + cvar.notify_one() + } + + match &(*task) { + Some(thread) => match thread.kill(VCPU_PAUSE_SIGNAL) { + Ok(_) => Ok(()), + Err(e) => Err(ErrorKind::StopVcpu(format!("{}", e)).into()), + }, + None => { + warn!("VCPU thread not started, no need to stop"); + Ok(()) + } + } + } + + fn destroy(&self) -> Result<()> { + let task = self.task.lock().unwrap(); + let (cpu_state, cvar) = &*self.state; + if *cpu_state.lock().unwrap() == CpuLifecycleState::Running { + *cpu_state.lock().unwrap() = CpuLifecycleState::Stopping; + } else { + *cpu_state.lock().unwrap() = CpuLifecycleState::Stopped; + } + + self.fd.set_kvm_immediate_exit(0); + match &(*task) { + Some(thread) => match thread.kill(VCPU_EXIT_SIGNAL) { + Ok(_) => {} + Err(e) => { + error!( + "killing VCPU{} thread({}) failed: {}", + self.id(), + self.tid(), + e + ); + } + }, + None => {} + } + let mut cpu_state = cpu_state.lock().unwrap(); + cvar.notify_all(); + + cpu_state = cvar + .wait_timeout(cpu_state, Duration::from_millis(16)) + .unwrap() + .0; + + if *cpu_state == CpuLifecycleState::Stopped { + *cpu_state = CpuLifecycleState::Nothing; + Ok(()) + } else { + Err(ErrorKind::DestroyVcpu(format!("VCPU still in {:?} state", *cpu_state)).into()) + } + } + + fn kvm_vcpu_exec(&self) -> Result { + match self.fd.run() { + Ok(run) => match run { + #[cfg(target_arch = "x86_64")] + VcpuExit::IoIn(addr, data) => { + self.vm.pio_in(u64::from(addr), data); + } + #[cfg(target_arch = "x86_64")] + VcpuExit::IoOut(addr, data) => { + self.vm.pio_out(u64::from(addr), data); + } + VcpuExit::MmioRead(addr, data) => { + self.vm.mmio_read(addr, data); + } + VcpuExit::MmioWrite(addr, data) => { + self.vm.mmio_write(addr, data); + } + #[cfg(target_arch = "x86_64")] + VcpuExit::Hlt => { + info!("Vcpu{} Received KVM_EXIT_HLT signal", self.id()); + panic!("Hlt vpu {}", self.id()); + } + VcpuExit::Shutdown | VcpuExit::SystemEvent => { + info!("Vcpu{} Received an KVM_EXIT_SHUTDOWN signal", self.id()); + let (cpu_state, _) = &*self.state; + *cpu_state.lock().unwrap() = CpuLifecycleState::Stopped; + self.vm.destroy(); + + #[cfg(feature = "qmp")] + { + let shutdown_msg = schema::SHUTDOWN { + guest: true, + reason: "guest-shutdown".to_string(), + }; + event!(SHUTDOWN; shutdown_msg); + } + + return Ok(false); + } + VcpuExit::FailEntry => { + info!("Vcpu{} Received KVM_EXIT_FAIL_ENTRY signal", self.id()); + return Ok(false); + } + VcpuExit::InternalError => { + info!("Vcpu{} Received KVM_EXIT_INTERNAL_ERROR signal", self.id()); + return Ok(false); + } + r => panic!("Unexpected exit reason: {:?}", r), + }, + Err(ref e) => { + match e.errno() { + libc::EAGAIN => {} + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + } + _ => { + error!("Failure during vcpu run: {}", e); + panic!("VcpuUnhandledKvmExit"); + } + }; + } + } + Ok(true) + } +} + +impl CPUWorker for CPU { + fn handle_workqueue(&self) { + LOCAL_THREAD_VCPU.with(|thread_vcpu| { + let mut vcpu_signal = thread_vcpu.borrow_mut(); + if vcpu_signal.dirty_stamps != 0 { + vcpu_signal.dirty_stamps = 0; + 
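+ // Release the thread-local RefCell borrow before taking the shared
+ // work_queue lock below.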
drop(vcpu_signal); + + let (work_queue_locked, cvar) = &*self.work_queue; + let mut work_queue = work_queue_locked.lock().unwrap(); + if *work_queue & Self::SYNC_READ_CPU_STATE == Self::SYNC_READ_CPU_STATE { + *work_queue &= !Self::SYNC_READ_CPU_STATE; + cvar.notify_all(); + } + + if *work_queue & Self::SYNC_WRITE_CPU_STATE == Self::SYNC_WRITE_CPU_STATE { + *work_queue &= !Self::SYNC_WRITE_CPU_STATE; + cvar.notify_all(); + } + } + }); + } + + fn ready_for_running(&self) -> bool { + let mut flag = 0_u32; + let (cpu_state_locked, cvar) = &*self.state; + let mut cpu_state = cpu_state_locked.lock().unwrap(); + loop { + self.handle_workqueue(); + + match *cpu_state { + CpuLifecycleState::Paused => { + if flag == 0 { + info!("Vcpu{} paused", self.id); + flag = 1; + } + cpu_state = cvar.wait(cpu_state).unwrap(); + } + CpuLifecycleState::Running => { + return true; + } + CpuLifecycleState::Stopping => { + info!("Vcpu{} shutdown", self.id); + cvar.notify_all(); + return false; + } + _ => { + warn!("Unknown Vmstate"); + return true; + } + } + } + } +} + +/// The wrapper for topology for VCPU. +#[derive(Clone)] +pub struct CpuTopology { + /// Number of sockets in VM. + pub sockets: u8, + /// Number of cores in VM. + pub cores: u8, + /// Number of threads in VM. + pub threads: u8, + /// Number of vcpus in VM. + pub nrcpus: u8, + /// Number of online vcpus in VM. + pub max_cpus: u8, + /// Online mask number of all vcpus. + pub online_mask: Arc>>, +} + +impl CpuTopology { + /// Get online mask for a cpu. + /// + /// # Notes + /// + /// When `online_mask` is `0`, vcpu is offline. When `online_mask` is `1`, + /// vcpu is online. + /// + /// # Arguments + /// + /// * `vcpu_id` - ID of vcpu. + pub fn get_mask(&self, vcpu_id: usize) -> u8 { + let mask = self.online_mask.lock().unwrap(); + mask[vcpu_id] + } + + /// Get single cpu topology for vcpu, return this vcpu's `socket-id`, + /// `core-id` and `thread-id`. + /// + /// # Arguments + /// + /// * `vcpu_id` - ID of vcpu. + pub fn get_topo(&self, vcpu_id: usize) -> (u8, u8, u8) { + let cpu_per_socket = self.cores * self.threads; + let cpu_per_core = self.threads; + let socketid: u8 = vcpu_id as u8 / cpu_per_socket; + let coreid: u8 = (vcpu_id as u8 % cpu_per_socket) / cpu_per_core; + let threadid: u8 = (vcpu_id as u8 % cpu_per_socket) % cpu_per_core; + (socketid, coreid, threadid) + } +} diff --git a/device_model/src/cpu/x86_64/cpuid.rs b/device_model/src/cpu/x86_64/cpuid.rs new file mode 100644 index 00000000..1084b271 --- /dev/null +++ b/device_model/src/cpu/x86_64/cpuid.rs @@ -0,0 +1,31 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
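+ // Descriptive note: `host_cpuid` below is a thin wrapper around
+ // `__cpuid_count` that writes the four output registers through raw
+ // pointers. A minimal usage sketch (illustrative only, mirroring the call
+ // pattern used in `setup_cpuid`):
+ //
+ //     let (mut eax, mut ebx, mut ecx, mut edx) = (0u32, 0u32, 0u32, 0u32);
+ //     host_cpuid(0, 0, &mut eax, &mut ebx, &mut ecx, &mut edx);
+ //     // eax now holds the highest supported standard CPUID leaf.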
+ +use core::arch::x86_64::__cpuid_count; + +pub fn host_cpuid( + leaf: u32, + subleaf: u32, + eax: *mut u32, + ebx: *mut u32, + ecx: *mut u32, + edx: *mut u32, +) { + unsafe { + let cpuid = __cpuid_count(leaf, subleaf); + + *eax = cpuid.eax; + *ebx = cpuid.ebx; + *ecx = cpuid.ecx; + *edx = cpuid.edx; + } +} diff --git a/device_model/src/cpu/x86_64/mod.rs b/device_model/src/cpu/x86_64/mod.rs new file mode 100644 index 00000000..94ea3671 --- /dev/null +++ b/device_model/src/cpu/x86_64/mod.rs @@ -0,0 +1,481 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +mod cpuid; + +use std::sync::Arc; + +use kvm_bindings::{ + kvm_fpu, kvm_msr_entry, kvm_regs, kvm_segment, kvm_sregs, Msrs, KVM_MAX_CPUID_ENTRIES, +}; +use kvm_ioctls::{Kvm, VcpuFd, VmFd}; + +use self::errors::Result; +use cpuid::host_cpuid; + +pub mod errors { + error_chain! { + foreign_links { + Io(std::io::Error); + Kvm(kvm_ioctls::Error); + } + } +} + +const ECX_EPB_SHIFT: u32 = 3; +const X86_FEATURE_HYPERVISOR: u32 = 31; +const X86_FEATURE_TSC_DEADLINE_TIMER: u32 = 24; + +const MSR_LIST: &[u32] = &[ + 0x0174, // MSR_IA32_SYSENTER_CS + 0x0175, // MSR_IA32_SYSENTER_ESP + 0x0176, // MSR_IA32_SYSENTER_EIP + 0xc000_0081, // MSR_STAR, legacy mode SYSCALL target + 0xc000_0082, // MSR_LSTAR, long mode SYSCALL target + 0xc000_0083, // MSR_CSTAR, compat mode SYSCALL target + 0xc000_0084, // MSR_SYSCALL_MASK, EFLAGS mask for syscall + 0xc000_0102, // MSR_KERNEL_GS_BASE, SwapGS GS shadow + 0x0010, // MSR_IA32_TSC, + 0x01a0, // MSR_IA32_MISC_ENABLE, +]; + +const MSR_IA32_MISC_ENABLE: u32 = 0x01a0; +const MSR_IA32_MISC_ENABLE_FAST_STRING: u64 = 0x1; + +/// AArch64 CPU booting configure information +pub struct X86CPUBootConfig { + /// Register %rip value + pub boot_ip: u64, + /// Register %rsp value + pub boot_sp: u64, + /// zero page address, as the second parameter of __startup_64 + /// arch/x86/kernel/head_64.S:86 + pub zero_page: u64, + pub code_segment: kvm_segment, + pub data_segment: kvm_segment, + pub gdt_base: u64, + pub gdt_size: u16, + pub idt_base: u64, + pub idt_size: u16, + pub pml4_start: u64, +} + +#[derive(Default, Copy, Clone)] +pub struct X86CPU { + id: u32, + nr_vcpus: u32, + boot_ip: u64, + boot_sp: u64, + zero_page: u64, + code_segment: kvm_segment, + data_segment: kvm_segment, + gdt_base: u64, + gdt_size: u16, + idt_base: u64, + idt_size: u16, + pml4_start: u64, +} + +impl X86CPU { + pub fn new(_vm_fd: &Arc, vcpuid: u32, nr_vcpus: u32) -> Self { + X86CPU { + id: vcpuid, + nr_vcpus, + ..Default::default() + } + } + + pub fn realize(&mut self, vcpu_fd: &Arc, boot_config: &X86CPUBootConfig) -> Result<()> { + self.boot_ip = boot_config.boot_ip; + self.boot_sp = boot_config.boot_sp; + self.zero_page = boot_config.zero_page; + self.code_segment = boot_config.code_segment; + self.data_segment = boot_config.data_segment; + self.gdt_base = boot_config.gdt_base; + self.gdt_size = boot_config.gdt_size; + self.idt_base = boot_config.idt_base; + self.idt_size = boot_config.idt_size; + self.pml4_start = boot_config.pml4_start; + + // 
Only setting vcpu lapic state, other registers should + // reset when the vcpu start running. + self.setup_lapic(vcpu_fd)?; + + Ok(()) + } + + fn setup_cpuid(&self, vcpu_fd: &Arc) -> Result<()> { + let sys_fd = match Kvm::new() { + Ok(fd) => fd, + _ => panic!("setup_cpuid:Open /dev/kvm failed"), + }; + let mut cpuid = sys_fd.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)?; + let entries = cpuid.as_mut_slice(); + + for entry in entries.iter_mut() { + match entry.function { + 1 => { + if entry.index == 0 { + entry.ecx |= 1u32 << X86_FEATURE_HYPERVISOR; + entry.ecx |= 1u32 << X86_FEATURE_TSC_DEADLINE_TIMER + } + } + 2 => { + host_cpuid( + 2, + 0, + &mut entry.eax, + &mut entry.ebx, + &mut entry.ecx, + &mut entry.edx, + ); + } + 4 => { + // cache info: needed for Pentium Pro compatibility + // Passthrough host cache info directly to guest + host_cpuid( + 4, + entry.index, + &mut entry.eax, + &mut entry.ebx, + &mut entry.ecx, + &mut entry.edx, + ); + entry.eax &= !0xfc00_0000; + if entry.eax & 0x0001_ffff != 0 && self.nr_vcpus > 1 { + entry.eax |= (self.nr_vcpus - 1) << 26; + } + } + 6 => { + entry.ecx &= !(1u32 << ECX_EPB_SHIFT); + } + 10 => { + if entry.eax != 0 { + let version_id = entry.eax & 0xff; + let num_counters = entry.eax & 0xff00; + if version_id != 2 || num_counters == 0 { + entry.eax = 0; + } + } + } + 0xb => { + // Extended Topology Enumeration Leaf + entry.edx = self.id as u32; + entry.ecx = entry.index & 0xff; + match entry.index { + 0 => { + entry.eax = 0u32; + entry.ebx = 1u32; + entry.ecx |= 1u32 << 8; + } + 1 => { + entry.eax = 32u32 - self.nr_vcpus.leading_zeros(); + entry.ebx = self.nr_vcpus; + entry.ecx |= 2u32 << 8; + } + _ => { + entry.ebx = 0xff; + } + } + entry.ebx &= 0xffff; + } + 0x8000_0002..=0x8000_0004 => { + // Passthrough host cpu model name directly to guest + host_cpuid( + entry.function, + entry.index, + &mut entry.eax, + &mut entry.ebx, + &mut entry.ecx, + &mut entry.edx, + ); + } + _ => (), + } + } + + vcpu_fd.set_cpuid2(&cpuid)?; + Ok(()) + } + + fn setup_sregs(&self, vcpu_fd: &Arc) -> Result<()> { + // X86_CR0_PE: Protection Enable + // EFER_LME: Long mode enable + // EFER_LMA: Long mode active + // arch/x86/include/uapi/asm/processor-flags.h + const X86_CR0_PE: u64 = 0x1; + const EFER_LME: u64 = 0x100; + const EFER_LMA: u64 = 0x400; + + // X86_CR0_PG: enable Paging + // X86_CR4_PAE: enable physical address extensions + // arch/x86/include/uapi/asm/processor-flags.h + const X86_CR0_PG: u64 = 0x8000_0000; + const X86_CR4_PAE: u64 = 0x20; + + let mut sregs: kvm_sregs = vcpu_fd.get_sregs()?; + + // Init gdt table, gdt table has loaded to Guest Memory Space + sregs.cs = self.code_segment; + sregs.ds = self.data_segment; + sregs.es = self.data_segment; + sregs.fs = self.data_segment; + sregs.gs = self.data_segment; + sregs.ss = self.data_segment; + + sregs.gdt.base = self.gdt_base; + sregs.gdt.limit = self.gdt_size; + + // Init idt table, idt table has loaded to Guest Memory Space + sregs.idt.base = self.idt_base; + sregs.idt.limit = self.idt_size; + + // Open 64-bit protected mode, include + // Protection enable, Long mode enable, Long mode active + sregs.cr0 |= X86_CR0_PE; + sregs.efer |= EFER_LME | EFER_LMA; + + // Setup page table + sregs.cr3 = self.pml4_start; + sregs.cr4 |= X86_CR4_PAE; + sregs.cr0 |= X86_CR0_PG; + + vcpu_fd.set_sregs(&sregs)?; + + Ok(()) + } + + #[allow(clippy::cast_ptr_alignment)] + fn setup_lapic(&self, vcpu_fd: &Arc) -> Result<()> { + // Disable nmi and external interrupt before enter protected mode + // arch/x86/include/asm/apicdef.h 
+ // local_apic struct like: + // struct local_apic { + // /*350*/ struct { /* LVT - LINT0 */ + // u32 vector : 8, + // delivery_mode : 3, + // __reserved_1 : 1, + // delivery_status : 1, + // polarity : 1, + // remote_irr : 1, + // trigger : 1, + // mask : 1, + // __reserved_2 : 15; + // u32 __reserved_3[3]; + // } lvt_lint0; + // + // /*360*/ struct { /* LVT - LINT1 */ + // u32 vector : 8, + // delivery_mode : 3, + // __reserved_1 : 1, + // delivery_status : 1, + // polarity : 1, + // remote_irr : 1, + // trigger : 1, + // mask : 1, + // __reserved_2 : 15; + // u32 __reserved_3[3]; + // } lvt_lint1; + // } + // + // #define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7) + // #define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8)) + const APIC_LVT0: usize = 0x350; + const APIC_LVT1: usize = 0x360; + const APIC_MODE_NMI: u32 = 0x4; + const APIC_MODE_EXTINT: u32 = 0x7; + + let mut lapic = vcpu_fd.get_lapic()?; + + // The member regs in struct kvm_lapic_state is a u8 array with 1024 entries, + // so it's saft to cast u8 pointer to u32 at position APIC_LVT0 and APIC_LVT1. + unsafe { + let apic_lvt_lint0 = &mut lapic.regs[APIC_LVT0..] as *mut [i8] as *mut u32; + *apic_lvt_lint0 &= !0x700; + *apic_lvt_lint0 |= APIC_MODE_EXTINT << 8; + + let apic_lvt_lint1 = &mut lapic.regs[APIC_LVT1..] as *mut [i8] as *mut u32; + *apic_lvt_lint1 &= !0x700; + *apic_lvt_lint1 |= APIC_MODE_NMI << 8; + } + + vcpu_fd.set_lapic(&lapic)?; + + Ok(()) + } + + fn setup_regs(&self, vcpu_fd: &Arc) -> Result<()> { + let regs: kvm_regs = kvm_regs { + rflags: 0x0002, /* Means processor has been initialized */ + rip: self.boot_ip, + rsp: self.boot_sp, + rbp: self.boot_sp, + rsi: self.zero_page, + ..Default::default() + }; + vcpu_fd.set_regs(®s)?; + + Ok(()) + } + + fn setup_fpu(&self, vcpu_fd: &Arc) -> Result<()> { + // Default value for fxregs_state.mxcsr + // arch/x86/include/asm/fpu/types.h + const MXCSR_DEFAULT: u32 = 0x1f80; + + let fpu: kvm_fpu = kvm_fpu { + fcw: 0x37f, + mxcsr: MXCSR_DEFAULT, + ..Default::default() + }; + + vcpu_fd.set_fpu(&fpu)?; + + Ok(()) + } + + fn setup_msrs(&self, vcpu_fd: &Arc) -> Result<()> { + let mut entries = Vec::::new(); + + // Enable fast-string operation to improve string + // store operations. 
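+ // Every MSR in MSR_LIST is written exactly once; all of them default to
+ // zero except MSR_IA32_MISC_ENABLE, which gets the fast-string bit.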
+ for msr in MSR_LIST { + let data = match *msr { + MSR_IA32_MISC_ENABLE => MSR_IA32_MISC_ENABLE_FAST_STRING, + _ => 0u64, + }; + + entries.push(kvm_msr_entry { + index: *msr, + data, + ..Default::default() + }); + } + + debug!("pushed msr entries[{:?}] {:?}", entries.len(), entries); + + vcpu_fd.set_msrs(&Msrs::from_entries(&entries))?; + + Ok(()) + } + + pub fn reset_vcpu(&self, vcpu_fd: &Arc) -> Result<()> { + self.setup_cpuid(vcpu_fd)?; + self.setup_sregs(vcpu_fd)?; + self.setup_regs(vcpu_fd)?; + self.setup_fpu(vcpu_fd)?; + self.setup_msrs(vcpu_fd)?; + + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use kvm_bindings::kvm_segment; + use std::sync::Arc; + #[test] + + fn test_x86_64_cpu() { + let code_seg = kvm_segment { + base: 0, + limit: 1048575, + selector: 16, + type_: 11, + present: 1, + dpl: 0, + db: 0, + s: 1, + l: 1, + g: 1, + avl: 0, + unusable: 0, + padding: 0, + }; + let data_seg = kvm_segment { + base: 0, + limit: 1048575, + selector: 24, + type_: 3, + present: 1, + dpl: 0, + db: 1, + s: 1, + l: 0, + g: 1, + avl: 0, + unusable: 0, + padding: 0, + }; + let cpu_config = X86CPUBootConfig { + boot_ip: 0, + boot_sp: 0, + zero_page: 0x0000_7000, + code_segment: code_seg, + data_segment: data_seg, + gdt_base: 0x500u64, + gdt_size: 16, + idt_base: 0x520u64, + idt_size: 8, + pml4_start: 0x0000_9000, + }; + let kvm = Kvm::new().unwrap(); + let vm = Arc::new(kvm.create_vm().unwrap()); + /* For `get_lapic` in realize function to work, + you need to create a irq_chip for VM before creating the VCPU. */ + vm.create_irq_chip().unwrap(); + let vcpu = Arc::new(vm.create_vcpu(0).unwrap()); + let mut x86_cpu = X86CPU::new(&vm, 0, 1); + //test realize function + assert!(x86_cpu.realize(&vcpu, &cpu_config).is_ok()); + + //test setup special registers + assert!(x86_cpu.setup_sregs(&vcpu).is_ok()); + let x86_sregs = vcpu.get_sregs().unwrap(); + assert_eq!(x86_sregs.cs, code_seg); + assert_eq!(x86_sregs.ds, data_seg); + assert_eq!(x86_sregs.es, data_seg); + assert_eq!(x86_sregs.fs, data_seg); + assert_eq!(x86_sregs.gs, data_seg); + assert_eq!(x86_sregs.ss, data_seg); + assert_eq!(x86_sregs.gdt.base, cpu_config.gdt_base); + assert_eq!(x86_sregs.gdt.limit, cpu_config.gdt_size); + assert_eq!(x86_sregs.idt.base, cpu_config.idt_base); + assert_eq!(x86_sregs.idt.limit, cpu_config.idt_size); + assert_eq!(x86_sregs.cr0 & 0x1, 1); + assert_eq!((x86_sregs.cr0 & 0x8000_0000) >> 31, 1); + assert_eq!(x86_sregs.cr3, cpu_config.pml4_start); + assert_eq!((x86_sregs.cr4 & 0x20) >> 5, 1); + assert_eq!((x86_sregs.efer & 0x700) >> 8, 5); + + //test setup_regs function + assert!(x86_cpu.setup_regs(&vcpu).is_ok()); + let x86_regs = vcpu.get_regs().unwrap(); + assert_eq!(x86_regs.rflags, 0x0002); + assert_eq!(x86_regs.rip, 0); + assert_eq!(x86_regs.rsp, 0); + assert_eq!(x86_regs.rbp, 0); + assert_eq!(x86_regs.rsi, 0x0000_7000); + + //test setup_fpu function + assert!(x86_cpu.setup_fpu(&vcpu).is_ok()); + let x86_fpu = vcpu.get_fpu().unwrap(); + assert_eq!(x86_fpu.fcw, 0x37f); + + //test setup_msrs function + assert!(x86_cpu.setup_msrs(&vcpu).is_ok()); + + //test setup_cpuid function + assert!(x86_cpu.setup_cpuid(&vcpu).is_ok()); + } +} diff --git a/device_model/src/interrupt_controller/aarch64/gicv3.rs b/device_model/src/interrupt_controller/aarch64/gicv3.rs new file mode 100644 index 00000000..0982ba30 --- /dev/null +++ b/device_model/src/interrupt_controller/aarch64/gicv3.rs @@ -0,0 +1,484 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. 
+// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::sync::{Arc, Mutex}; + +use kvm_ioctls::{DeviceFd, VmFd}; + +use machine_manager::machine::{KvmVmState, MachineLifecycle}; +use util::kvm_ioctls_ext::{check_device_attr, get_device_attr}; +use util::{device_tree, errors}; + +use super::GICConfig; +use super::GICDevice; + +// See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. +const SZ_64K: u64 = 0x0001_0000; +const KVM_VGIC_V3_DIST_SIZE: u64 = SZ_64K; +const KVM_VGIC_V3_REDIST_SIZE: u64 = 2 * SZ_64K; +const KVM_VGIC_V3_ITS_SIZE: u64 = 2 * SZ_64K; + +#[derive(Debug)] +pub enum Error { + /// Error while calling KVM ioctl for setting up the global interrupt controller. + CreateGIC(kvm_ioctls::Error), + /// Error while setting device attributes for the GIC. + SetDeviceAttribute(kvm_ioctls::Error), + /// Error while getting device attributes for the GIC. + GetDeviceAttribute(kvm_ioctls::Error), + /// Error while check device attributes for the GIC. + CheckDeviceAttribute(kvm_ioctls::Error), +} +type Result = std::result::Result; + +/// A wrapper for kvm_based device check and access. +pub struct KvmDevice; + +impl KvmDevice { + fn kvm_device_check(fd: &DeviceFd, group: u32, attr: u64) -> Result { + let attr = kvm_bindings::kvm_device_attr { + group, + attr, + addr: 0, + flags: 0, + }; + + let support = check_device_attr(fd, &attr).map_err(Error::CheckDeviceAttribute)?; + + if support == 0 { + Ok(false) + } else { + Ok(true) + } + } + + fn kvm_device_access( + fd: &DeviceFd, + group: u32, + attr: u64, + addr: u64, + write: bool, + ) -> Result<()> { + let attr = kvm_bindings::kvm_device_attr { + group, + attr, + addr, + flags: 0, + }; + + if write { + fd.set_device_attr(&attr) + .map_err(Error::SetDeviceAttribute)?; + } else { + let mut attr = attr; + get_device_attr(fd, &mut attr).map_err(Error::GetDeviceAttribute)?; + }; + + Ok(()) + } +} + +trait GICv3Access { + /// Returns `gicr_attr` of `vCPU`. + fn vcpu_gicr_attr(&self, cpu: usize) -> u64; + + fn access_gic_distributor(&self, offset: u64, gicd_value: &mut u32, write: bool) -> Result<()>; + + fn access_gic_redistributor( + &self, + offset: u64, + cpu: usize, + gicr_value: &mut u32, + write: bool, + ) -> Result<()>; + + fn access_gic_cpu( + &self, + offset: u64, + cpu: usize, + gicc_value: &mut u64, + write: bool, + ) -> Result<()>; + + fn access_gic_line_level(&self, offset: u64, gicll_value: &mut u32, write: bool) -> Result<()>; +} + +/// A wrapper around creating and managing a `GICv3`. +pub struct GICv3 { + /// The fd for the GICv3 device. + fd: DeviceFd, + /// Number of vCPUs, determines the number of redistributor and CPU interface. + vcpu_count: u64, + /// GICv3 ITS, support MSI. + its: bool, + /// GICv3 ITS device. + its_dev: Option, + /// Maximum irq number. + nr_irqs: u32, + /// Base address in the guest physical address space of the GICv3 + /// redistributor register mappings. + redists_base: u64, + /// GICv3 redistributor region size. + redists_size: u64, + /// Base address in the guest physical address space of the GICv3 distributor + /// register mappings. 
+ dist_base: u64, + /// GICv3 distributor region size. + dist_size: u64, + /// Lifecycle state for GICv3. + state: Arc>, +} + +impl GICv3 { + pub fn new(vm: &Arc, config: &GICConfig) -> Result { + config.check_sanity().unwrap(); + + let mut gic_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3, + fd: 0, + flags: 0, + }; + + let gic_fd = match vm.create_device(&mut gic_device) { + Ok(fd) => fd, + Err(e) => return Err(Error::CreateGIC(e)), + }; + + let mut gicv3 = GICv3 { + fd: gic_fd, + vcpu_count: config.vcpu_count, + nr_irqs: config.max_irq, + its: config.msi, + its_dev: None, + redists_size: config.vcpu_count * KVM_VGIC_V3_REDIST_SIZE, + redists_base: config.map_region + - KVM_VGIC_V3_DIST_SIZE + - config.vcpu_count * KVM_VGIC_V3_REDIST_SIZE, + dist_size: KVM_VGIC_V3_DIST_SIZE, + dist_base: config.map_region - KVM_VGIC_V3_DIST_SIZE, + state: Arc::new(Mutex::new(KvmVmState::Created)), + }; + + if gicv3.its { + gicv3.its_dev = + Some(GICv3Its::new(&vm, gicv3.redists_base - KVM_VGIC_V3_ITS_SIZE).unwrap()); + } + + Ok(gicv3) + } + + fn realize(&self) -> Result<()> { + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_V3_ADDR_TYPE_REDIST), + &self.redists_base as *const u64 as u64, + true, + )?; + + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_V3_ADDR_TYPE_DIST), + &self.dist_base as *const u64 as u64, + true, + )?; + + KvmDevice::kvm_device_check(&self.fd, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0)?; + + // Init the interrupt number support by the GIC. + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, + 0, + &self.nr_irqs as *const u32 as u64, + true, + )?; + + // Finalize the GIC. 
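+ // The distributor/redistributor base addresses and the supported irq
+ // count were programmed above; KVM_DEV_ARM_VGIC_CTRL_INIT completes the
+ // in-kernel GICv3 setup.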
+ KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + true, + )?; + + let mut state = self.state.lock().unwrap(); + *state = KvmVmState::Running; + + Ok(()) + } + + fn device_fd(&self) -> &DeviceFd { + &self.fd + } +} + +impl MachineLifecycle for GICv3 { + fn pause(&self) -> bool { + let attr = kvm_bindings::kvm_device_attr { + group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), + addr: 0, + flags: 0, + }; + + if self.device_fd().set_device_attr(&attr).is_ok() { + let mut state = self.state.lock().unwrap(); + *state = KvmVmState::Running; + + true + } else { + false + } + } + + fn notify_lifecycle(&self, old: KvmVmState, new: KvmVmState) -> bool { + let state = self.state.lock().unwrap(); + if *state != old { + error!("GICv3 lifecycle error: state check failed."); + return false; + } + drop(state); + + match (old, new) { + (KvmVmState::Running, KvmVmState::Paused) => self.pause(), + _ => true, + } + } +} + +impl GICv3Access for GICv3 { + fn vcpu_gicr_attr(&self, cpu: usize) -> u64 { + let clustersz = 16; + + let aff1 = (cpu / clustersz) as u64; + let aff0 = (cpu % clustersz) as u64; + + let affid = (aff1 << 8) | aff0; + let cpu_affid: u64 = ((affid & 0xFF_0000_0000) >> 8) | (affid & 0xFF_FFFF); + + let last = if (self.vcpu_count - 1) == cpu as u64 { + 1 + } else { + 0 + }; + + ((cpu_affid << 32) | (1 << 24) | (1 << 8) | (last << 4)) + & kvm_bindings::KVM_DEV_ARM_VGIC_V3_MPIDR_MASK as u64 + } + + fn access_gic_distributor(&self, offset: u64, gicd_value: &mut u32, write: bool) -> Result<()> { + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_DIST_REGS, + offset, + gicd_value as *mut u32 as u64, + write, + ) + } + + fn access_gic_redistributor( + &self, + offset: u64, + cpu: usize, + gicr_value: &mut u32, + write: bool, + ) -> Result<()> { + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + self.vcpu_gicr_attr(cpu) | offset, + gicr_value as *mut u32 as u64, + write, + ) + } + + fn access_gic_cpu( + &self, + offset: u64, + cpu: usize, + gicc_value: &mut u64, + write: bool, + ) -> Result<()> { + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS, + self.vcpu_gicr_attr(cpu) | offset, + gicc_value as *mut u64 as u64, + write, + ) + } + + fn access_gic_line_level(&self, offset: u64, gicll_value: &mut u32, write: bool) -> Result<()> { + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + self.vcpu_gicr_attr(0) | offset, + gicll_value as *mut u32 as u64, + write, + ) + } +} + +impl GICDevice for GICv3 { + fn create_device( + vm: &Arc, + gic_conf: &GICConfig, + ) -> Result> { + let gic = GICv3::new(vm, gic_conf)?; + + gic.realize()?; + + if let Some(its) = &gic.its_dev { + its.realize()?; + } + + Ok(Arc::new(gic)) + } + + fn generate_fdt(&self, fdt: &mut Vec) -> errors::Result<()> { + let gic_reg = [ + self.dist_base, + self.dist_size, + self.redists_base, + self.redists_size, + ]; + let node = "/intc"; + device_tree::add_sub_node(fdt, node)?; + device_tree::set_property_string(fdt, node, "compatible", "arm,gic-v3")?; + device_tree::set_property(fdt, node, "interrupt-controller", None)?; + device_tree::set_property_u32(fdt, node, "#interrupt-cells", 0x3)?; + device_tree::set_property_u32(fdt, node, "phandle", device_tree::GIC_PHANDLE)?; + device_tree::set_property_u32(fdt, node, 
"#address-cells", 0x2)?; + device_tree::set_property_u32(fdt, node, "#size-cells", 0x2)?; + device_tree::set_property_u32(fdt, node, "#redistributor-regions", 0x1)?; + device_tree::set_property_array_u64(fdt, node, "reg", &gic_reg)?; + + let gic_intr = [ + device_tree::GIC_FDT_IRQ_TYPE_PPI, + 0x9, + device_tree::IRQ_TYPE_LEVEL_HIGH, + ]; + device_tree::set_property_array_u32(fdt, node, "interrupts", &gic_intr)?; + + if let Some(its) = &self.its_dev { + device_tree::set_property(fdt, node, "ranges", None)?; + let its_reg = [its.msi_base, its.msi_size]; + let node = "/intc/its"; + device_tree::add_sub_node(fdt, node)?; + device_tree::set_property_string(fdt, node, "compatible", "arm,gic-v3-its")?; + device_tree::set_property(fdt, node, "msi-controller", None)?; + device_tree::set_property_u32(fdt, node, "phandle", device_tree::GIC_ITS_PHANDLE)?; + device_tree::set_property_array_u64(fdt, node, "reg", &its_reg)?; + } + + Ok(()) + } +} + +pub struct GICv3Its { + /// The fd for the GICv3Its device + fd: DeviceFd, + + /// Base address in the guest physical address space of the GICv3 ITS + /// control register frame. + msi_base: u64, + + /// GICv3 ITS needs to be 64K aligned and the region covers 128K. + msi_size: u64, +} + +impl GICv3Its { + fn new(vm: &Arc, msi_base: u64) -> Result { + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, + fd: 0, + flags: 0, + }; + + let its_fd = match vm.create_device(&mut its_device) { + Ok(fd) => fd, + Err(e) => return Err(Error::CreateGIC(e)), + }; + + Ok(GICv3Its { + fd: its_fd, + msi_base, + msi_size: KVM_VGIC_V3_ITS_SIZE, + }) + } + + fn realize(&self) -> Result<()> { + KvmDevice::kvm_device_check( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + )?; + + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + &self.msi_base as *const u64 as u64, + true, + )?; + + // Finalize the GIC Its. + KvmDevice::kvm_device_access( + &self.fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + &self.msi_base as *const u64 as u64, + true, + )?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_gic_config() { + let mut gic_conf = GICConfig { + version: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3.into(), + map_region: 0x0002_0000_0000, + vcpu_count: 4, + max_irq: 192, + msi: false, + }; + + assert!(gic_conf.check_sanity().is_ok()); + gic_conf.version = 3; + assert!(gic_conf.check_sanity().is_err()); + gic_conf.version = kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3.into(); + assert!(gic_conf.check_sanity().is_ok()); + + gic_conf.vcpu_count = 257; + assert!(gic_conf.check_sanity().is_err()); + gic_conf.vcpu_count = 0; + assert!(gic_conf.check_sanity().is_err()); + gic_conf.vcpu_count = 24; + assert!(gic_conf.check_sanity().is_ok()); + + gic_conf.max_irq = 32; + assert!(gic_conf.check_sanity().is_err()); + + gic_conf.max_irq = 32; + assert!(gic_conf.check_sanity().is_err()); + } +} diff --git a/device_model/src/interrupt_controller/aarch64/mod.rs b/device_model/src/interrupt_controller/aarch64/mod.rs new file mode 100644 index 00000000..5ae43fbf --- /dev/null +++ b/device_model/src/interrupt_controller/aarch64/mod.rs @@ -0,0 +1,153 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. 
+// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +mod gicv3; + +use std::sync::Arc; + +use kvm_ioctls::VmFd; + +pub use gicv3::Error as GICError; +pub use gicv3::GICv3; +use machine_manager::machine::{KvmVmState, MachineLifecycle}; +#[cfg(target_arch = "aarch64")] +use util::{device_tree, errors}; + +// First 32 are private to each CPU (SGIs and PPIs). +const GIC_IRQ_INTERNAL: u32 = 32; + +#[derive(Debug)] +pub enum Error { + /// Invalid argument + EINVAL(std::string::String), +} + +/// Configure a Interrupt controller. +pub struct GICConfig { + /// Config GIC version + pub version: u32, + /// GIC region mappings base address, aligned 64K + pub map_region: u64, + /// Config number of CPUs handled by the device + pub vcpu_count: u64, + /// Config maximum number of irqs handled by the device + pub max_irq: u32, + /// Config msi support + pub msi: bool, +} + +impl GICConfig { + fn check_sanity(&self) -> Result<(), Error> { + if self.version != kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3 { + return Err(Error::EINVAL("GIC only support GICv3".to_string())); + }; + + if self.vcpu_count > 256 || self.vcpu_count == 0 { + return Err(Error::EINVAL( + "GIC only support maximum 256 vcpus".to_string(), + )); + } + + if self.map_region < 0x1000_0000 { + return Err(Error::EINVAL( + "GIC mapping Guest Physical Address need above 0x1000_0000".to_string(), + )); + }; + + if self.max_irq <= GIC_IRQ_INTERNAL { + return Err(Error::EINVAL("GIC irq numbers need above 32".to_string())); + } + + Ok(()) + } +} + +/// A wrapper for `GIC` must perform the function. +pub trait GICDevice: MachineLifecycle { + /// Constructs a kvm_based `GIC` device. + /// + /// # Arguments + /// + /// * `vm` - File descriptor for vmfd. + /// * `gic_conf` - Configuration for `GIC`. + fn create_device( + vm: &Arc, + gic_conf: &GICConfig, + ) -> Result, GICError> + where + Self: Sized; + + /// Constructs `fdt` node for `GIC`. + /// + /// # Arguments + /// + /// * `fdt` - Device tree presented by bytes. + fn generate_fdt(&self, fdt: &mut Vec) -> errors::Result<()>; +} + +/// A wrapper around creating and using a kvm-based interrupt controller. +pub struct InterruptController { + #[cfg(target_arch = "aarch64")] + gic: Arc, +} + +impl InterruptController { + /// Constructs a new kvm_based `InterruptController`. + /// + /// # Arguments + /// + /// * `vm` - File descriptor for vmfd. + /// * `gic_conf` - Configuration for `GIC`. + pub fn new(vm: Arc, gic_conf: &GICConfig) -> Result { + Ok(InterruptController { + gic: GICv3::create_device(&vm, gic_conf).unwrap(), + }) + } + + /// Change `InterruptController` lifecycle state to `Stopped`. 
+ pub fn stop(&self) { + self.gic + .notify_lifecycle(KvmVmState::Running, KvmVmState::Paused); + debug!("Device gic stopped!"); + } +} + +#[cfg(target_arch = "aarch64")] +impl device_tree::CompileFDT for InterruptController { + fn generate_fdt_node(&self, fdt: &mut Vec) -> errors::Result<()> { + self.gic.generate_fdt(fdt)?; + debug!("Interrupt Controller device tree generated!"); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_create_gicv3() { + use super::*; + use kvm_ioctls::Kvm; + + let kvm = Kvm::new().unwrap(); + let vm = Arc::new(kvm.create_vm().unwrap()); + + let gic_conf = GICConfig { + version: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3.into(), + map_region: 0x0002_0000_0000, + vcpu_count: 4, + max_irq: 192, + msi: false, + }; + + assert!(gicv3::GICv3::new(&vm, &gic_conf).is_ok()); + } +} diff --git a/device_model/src/interrupt_controller/mod.rs b/device_model/src/interrupt_controller/mod.rs new file mode 100644 index 00000000..c94cc192 --- /dev/null +++ b/device_model/src/interrupt_controller/mod.rs @@ -0,0 +1,33 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! # Interrupt Controller +//! +//! This module is to create and manager interrupt controller. +//! +//! ## Design +//! +//! This module offers support for: +//! 1. Create kvm-based interrupt controller. +//! 2. Manager lifecycle for `GIC`. +//! +//! ## Platform Support +//! +//! - `aarch64` +#[cfg(target_arch = "aarch64")] +mod aarch64; + +#[cfg(target_arch = "aarch64")] +pub use aarch64::InterruptController; + +#[cfg(target_arch = "aarch64")] +pub use aarch64::GICConfig as InterruptControllerConfig; diff --git a/device_model/src/legacy/mod.rs b/device_model/src/legacy/mod.rs new file mode 100644 index 00000000..f8406569 --- /dev/null +++ b/device_model/src/legacy/mod.rs @@ -0,0 +1,33 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! # Legacy +//! +//! This mod emulate legacy devices include RTC and Serial. +//! +//! ## Design +//! +//! This module offers support for: +//! 1. Pl031 device, Arm PrimeCell Real Time Clock. +//! 2. Serial device, Serial UART. +//! +//! ## Platform Support +//! +//! - `x86_64` +//! 
- `aarch64` +mod serial; +pub use self::serial::Serial; + +#[cfg(target_arch = "aarch64")] +mod pl031; +#[cfg(target_arch = "aarch64")] +pub use self::pl031::PL031; diff --git a/device_model/src/legacy/pl031.rs b/device_model/src/legacy/pl031.rs new file mode 100644 index 00000000..a662e083 --- /dev/null +++ b/device_model/src/legacy/pl031.rs @@ -0,0 +1,178 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + +use address_space::{GuestAddress, RegionOps}; +use byteorder::{ByteOrder, LittleEndian}; +use kvm_ioctls::VmFd; +use vmm_sys_util::eventfd::EventFd; + +use super::super::mmio::errors::{Result, ResultExt}; +use super::super::mmio::{DeviceResource, DeviceType, MmioDeviceOps}; + +/// Registers for pl032 from ARM PrimeCell Real Time Clock Technical Reference Manual. +/// Data Register. +const RTC_DR: u64 = 0x00; +/// Match Register. +const RTC_MR: u64 = 0x04; +/// Load Register. +const RTC_LR: u64 = 0x08; +/// Control Register. +const RTC_CR: u64 = 0x0c; +/// Interrupt Mask Set or Clear Register. +const RTC_IMSC: u64 = 0x10; +/// Raw Interrupt Status Register. +const RTC_RIS: u64 = 0x14; +/// Masked Interrupt Status Register. +const RTC_MIS: u64 = 0x18; +/// Interrupt Clear Register. +const RTC_ICR: u64 = 0x1c; +/// Peripheral ID registers, default value. +const RTC_PERIPHERAL_ID: [u8; 8] = [0x31, 0x10, 0x14, 0x00, 0x0d, 0xf0, 0x05, 0xb1]; + +/// Pl032 structure. +pub struct PL031 { + /// Match register value. + mr: u32, + /// Load register value. + lr: u32, + /// Interrupt Mask Set or Clear register value. + imsr: u32, + /// Raw Interrupt Status register value. + risr: u32, + /// The duplicate of Load register value. + tick_offset: u32, + /// Record the real time. + base_time: Instant, + /// Interrupt eventfd. + interrupt_evt: Option, +} + +impl PL031 { + pub fn new() -> Self { + PL031 { + mr: 0, + lr: 0, + imsr: 0, + risr: 0, + tick_offset: SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time wrong") + .as_secs() as u32, // since 1970-01-01 00:00:00,it never cause overflow. + base_time: Instant::now(), + interrupt_evt: None, + } + } + + /// Send interrupt to guest. + fn interrupt(&self) { + if let Some(evt) = &self.interrupt_evt { + let _ = evt.write(1); + } + } + + /// Get current clock value. + fn get_current_value(&self) -> u32 { + self.base_time.elapsed().as_secs() as u32 + self.tick_offset + } +} + +impl RegionOps for PL031 { + /// Read data from registers by guest. 
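The split between `tick_offset` and `base_time` above is the whole timekeeping model: a write to `RTC_LR` (handled further below) captures a new wall-clock origin, and every later `RTC_DR` read returns that origin plus the host's elapsed seconds. A minimal std-only sketch of the same arithmetic, with a hypothetical `lr_value` standing in for a guest write:

```rust
use std::time::Instant;

/// Same arithmetic as PL031::get_current_value(): the seconds loaded into
/// RTC_LR (or captured at device creation) plus host time elapsed since then.
fn rtc_read(tick_offset: u32, base_time: Instant) -> u32 {
    base_time.elapsed().as_secs() as u32 + tick_offset
}

fn main() {
    // A guest write to RTC_LR would record both of these values; a later
    // RTC_DR read then advances at host wall-clock speed from that origin.
    let (lr_value, written_at) = (1_600_000_000_u32, Instant::now());
    println!("RTC_DR would read {}", rtc_read(lr_value, written_at));
}
```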
+ fn read(&mut self, data: &mut [u8], _base: GuestAddress, offset: u64) -> bool { + if offset >= 0xFE0 && offset < 0x1000 { + let value = u32::from(RTC_PERIPHERAL_ID[((offset - 0xFE0) >> 2) as usize]); + LittleEndian::write_u32(data, value); + return true; + } + + let mut value: u32 = 0; + match offset { + RTC_DR => { + value = self.get_current_value(); + } + RTC_MR => { + value = self.mr; + } + RTC_LR => { + value = self.lr; + } + RTC_CR => { + value = 1; + } + RTC_IMSC => { + value = self.imsr; + } + RTC_RIS => { + value = self.risr; + } + RTC_MIS => { + value = self.risr & self.imsr; + } + _ => {} + } + + LittleEndian::write_u32(data, value); + + true + } + + /// Write data to registers by guest. + fn write(&mut self, data: &[u8], _base: GuestAddress, offset: u64) -> bool { + let value = LittleEndian::read_u32(data); + + match offset { + RTC_MR => { + self.mr = value; + } + RTC_LR => { + self.lr = value; + self.tick_offset = value; + self.base_time = Instant::now(); + } + RTC_IMSC => { + self.imsr = value & 1; + self.interrupt(); + } + RTC_ICR => { + self.risr = 0; + self.interrupt(); + } + _ => {} + } + + true + } +} + +impl MmioDeviceOps for PL031 { + /// Realize RTC device when VM starting. + fn realize(&mut self, vm_fd: &VmFd, resource: DeviceResource) -> Result<()> { + match EventFd::new(libc::EFD_NONBLOCK) { + Ok(evt) => { + vm_fd + .register_irqfd(&evt, resource.irq) + .chain_err(|| "Failed to register irqfd")?; + self.interrupt_evt = Some(evt); + + Ok(()) + } + Err(_) => Err("Failed to create new EventFd".into()), + } + } + + /// Get device type. + fn get_type(&self) -> DeviceType { + DeviceType::RTC + } +} diff --git a/device_model/src/legacy/serial.rs b/device_model/src/legacy/serial.rs new file mode 100644 index 00000000..62902aed --- /dev/null +++ b/device_model/src/legacy/serial.rs @@ -0,0 +1,459 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::collections::VecDeque; +use std::io; +use std::os::unix::io::RawFd; +use std::sync::{Arc, Mutex}; + +use address_space::{GuestAddress, RegionOps}; +use kvm_ioctls::VmFd; +use util::epoll_context::{EventNotifier, EventNotifierHelper, NotifierOperation}; +use vmm_sys_util::{epoll::EventSet, eventfd::EventFd, terminal::Terminal}; + +use super::super::mmio::errors::{Result, ResultExt}; +use super::super::mmio::{DeviceResource, DeviceType, MmioDeviceOps}; + +const UART_IER_RDI: u8 = 0x01; +const UART_IER_THRI: u8 = 0x02; +const UART_IIR_NO_INT: u8 = 0x01; +const UART_IIR_THRI: u8 = 0x02; +const UART_IIR_RDI: u8 = 0x04; +const _UART_IIR_ID: u8 = 0x06; + +const UART_LCR_DLAB: u8 = 0x80; +const UART_LSR_DR: u8 = 0x01; +const _UART_LSR_OE: u8 = 0x02; +const _UART_LSR_BI: u8 = 0x10; +const UART_LSR_THRE: u8 = 0x20; +const UART_LSR_TEMT: u8 = 0x40; + +const UART_MCR_OUT2: u8 = 0x08; +const UART_MCR_LOOP: u8 = 0x10; +const UART_MSR_CTS: u8 = 0x10; +const UART_MSR_DSR: u8 = 0x20; +const UART_MSR_DCD: u8 = 0x80; + +const RECEIVER_BUFF_SIZE: usize = 1024; + +/// Contain registers and operation methods of serial. 
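The `offset` values decoded by `read_internal` and `write_internal` further down follow the conventional 16550 register layout; this model implements only a subset of it (for example, guest writes to offset 2 are ignored). A compact reference, for orientation only:

```rust
/// Conventional 16550 byte offsets as decoded by this device model; entries
/// with two names differ between guest reads and writes, and DLAB is bit 7 of
/// the Line Control Register at offset 3.
const UART_REG_MAP: [(u64, &str); 8] = [
    (0, "RBR (read) / THR (write); divisor low byte when DLAB=1"),
    (1, "IER; divisor high byte when DLAB=1"),
    (2, "IIR (read) / FCR (write)"),
    (3, "LCR"),
    (4, "MCR"),
    (5, "LSR"),
    (6, "MSR"),
    (7, "SCR"),
];
```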
+pub struct Serial { + /// Receiver buffer register. + rbr: VecDeque, + /// Interrupt enable register. + ier: u8, + /// interrupt identification register. + iir: u8, + /// Line control register. + lcr: u8, + /// Modem control register. + mcr: u8, + /// Line status register. + lsr: u8, + /// Modem status register. + msr: u8, + /// Scratch register. + scr: u8, + /// Used to set baud rate. + div: u16, + /// Transmitter holding register. + thr_pending: u32, + /// Interrupt event file descriptor. + interrupt_evt: Option, + /// Operation methods. + output: Option>, +} + +impl Serial { + /// Create a new `Serial` instance with default parameters. + pub fn new() -> Self { + Serial { + rbr: VecDeque::new(), + ier: 0, + iir: UART_IIR_NO_INT, + lcr: 0x03, // 8 bits + mcr: UART_MCR_OUT2, + lsr: UART_LSR_TEMT | UART_LSR_THRE, + msr: UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, + scr: 0, + div: 0x0c, + thr_pending: 0, + interrupt_evt: None, + output: None, + } + } + + /// Set EventFd for serial. + /// + /// # Errors + /// + /// Return Error if + /// * fail to write EventFd. + /// * fail to get an interrupt event fd. + fn interrupt(&self) -> Result<()> { + match &self.interrupt_evt { + Some(evt) => evt.write(1).chain_err(|| "Failed to write fd")?, + None => bail!("Failed to get an interrupt event fd"), + }; + + Ok(()) + } + + /// Update interrupt identification register, + /// this method would be called when the interrupt identification changes. + fn update_iir(&mut self) -> Result<()> { + let mut iir = UART_IIR_NO_INT; + + if self.ier & UART_IER_RDI != 0 && self.lsr & UART_LSR_DR != 0 { + iir &= !UART_IIR_NO_INT; + iir |= UART_IIR_RDI; + } else if self.ier & UART_IER_THRI != 0 && self.thr_pending > 0 { + iir &= !UART_IIR_NO_INT; + iir |= UART_IIR_THRI; + } + + self.iir = iir; + + if iir != UART_IIR_NO_INT { + self.interrupt()?; + } + + Ok(()) + } + + /// Append `data` to receiver buffer register, and update IIR. + /// + /// # Arguments + /// + /// * `data` - A u8-type array. + pub fn receive(&mut self, data: &[u8]) -> Result<()> { + if self.mcr & UART_MCR_LOOP == 0 { + if self.rbr.len() >= RECEIVER_BUFF_SIZE { + bail!("Serial receive buffer extend the Max size."); + } + + self.rbr.extend(data); + self.lsr |= UART_LSR_DR; + + self.update_iir()?; + } + + Ok(()) + } + + /// Read one byte data from a certain register selected by `offset`. + /// + /// # Arguments + /// + /// * `offset` - Used to select a register. + /// + /// # Errors + /// + /// Return Error if fail to update iir. + fn read_internal(&mut self, offset: u64) -> u8 { + let mut ret: u8 = 0; + + match offset { + 0 => { + if self.lcr & UART_LCR_DLAB != 0 { + ret = self.div as u8; + } else { + if !self.rbr.is_empty() { + ret = self.rbr.pop_front().unwrap_or_default(); + } + if self.rbr.is_empty() { + self.lsr &= !UART_LSR_DR; + } + + if self.update_iir().is_err() { + error!("Failed to update iir."); + } + } + } + 1 => { + if self.lcr & UART_LCR_DLAB != 0 { + ret = (self.div >> 8) as u8; + } else { + ret = self.ier + } + } + 2 => { + ret = self.iir | 0xc0; + self.thr_pending = 0; + self.iir = UART_IIR_NO_INT + } + 3 => { + ret = self.lcr; + } + 4 => { + ret = self.mcr; + } + 5 => { + ret = self.lsr; + } + 6 => { + if self.mcr & UART_MCR_LOOP != 0 { + ret = (self.mcr & 0x0c) << 4; + ret |= (self.mcr & 0x02) << 3; + ret |= (self.mcr & 0x01) << 5; + } else { + ret = self.msr; + } + } + 7 => { + ret = self.scr; + } + _ => {} + } + + ret + } + + /// Write one byte data to a certain register selected by `offset`. 
+ /// + /// # Arguments + /// + /// * `offset` - Used to select a register. + /// * `data` - A u8-type data, which will be written to the register. + /// + /// # Errors + /// + /// Return Error if + /// * fail to get output file descriptor. + /// * fail to write serial. + /// * fail to flush serial. + fn write_internal(&mut self, offset: u64, data: u8) -> Result<()> { + match offset { + 0 => { + if self.lcr & UART_LCR_DLAB != 0 { + self.div = (self.div & 0xff00) | u16::from(data); + } else { + self.thr_pending = 1; + + if self.mcr & UART_MCR_LOOP != 0 { + // loopback mode + if self.rbr.len() >= RECEIVER_BUFF_SIZE { + bail!("Serial receive buffer extend the Max size."); + } + + self.rbr.push_back(data); + self.lsr |= UART_LSR_DR; + } else { + let output = match &mut self.output { + Some(output_) => output_, + None => bail!("Failed to get output fd."), + }; + + output + .write_all(&[data]) + .chain_err(|| "Failed to write for serial.")?; + output.flush().chain_err(|| "Failed to flush for serial.")?; + } + + self.update_iir()?; + } + } + 1 => { + if self.lcr & UART_LCR_DLAB != 0 { + self.div = (self.div & 0x00ff) | (u16::from(data) << 8); + } else { + let changed = (self.ier ^ data) & 0x0f; + self.ier = data & 0x0f; + + if changed != 0 { + self.update_iir()?; + } + } + } + 3 => { + self.lcr = data; + } + 4 => { + self.mcr = data; + } + 7 => { + self.scr = data; + } + _ => {} + } + + Ok(()) + } +} + +impl RegionOps for Serial { + /// Read one byte data to `data` from a certain register selected by `offset`. + /// + /// # Arguments + /// + /// * `data` - The destination that the data would be read to. + /// * `offset` - Used to select a register. + /// + /// # Errors + /// + /// Return Error if fail to update iir. + fn read(&mut self, data: &mut [u8], _base: GuestAddress, offset: u64) -> bool { + data[0] = self.read_internal(offset); + + true + } + + /// Write one byte data to a certain register selected by `offset`. + /// + /// # Arguments + /// + /// * `offset` - Used to select a register. + /// * `data` - A u8-type array, but only the first data would be written to the register. + /// + /// # Errors + /// + /// Return Error if + /// * fail to get output file descriptor. + /// * fail to write serial. + /// * fail to flush serial. + fn write(&mut self, data: &[u8], _base: GuestAddress, offset: u64) -> bool { + self.write_internal(offset, data[0]).is_ok() + } +} + +impl MmioDeviceOps for Serial { + /// Realize a serial for VM. + /// * Create a new output component. + /// * Register DeviceResource IRQ to VM. + /// * Set interrupt_evt component. + /// + /// # Arguments + /// + /// * `vm_fd` - File descriptor of VM. + /// * `resource` - Device resource. + /// + /// # Errors + /// + /// Return Error if + /// * fail to register. + /// * fail to create a new EventFd. + fn realize(&mut self, vm_fd: &VmFd, resource: DeviceResource) -> Result<()> { + self.output = Some(Box::new(std::io::stdout())); + + match EventFd::new(libc::EFD_NONBLOCK) { + Ok(evt) => { + vm_fd + .register_irqfd(&evt, resource.irq) + .chain_err(|| "Failed to register irqfd")?; + self.interrupt_evt = Some(evt); + + Ok(()) + } + Err(_) => Err("Failed to create new EventFd".into()), + } + } + + /// Get type of Device. + fn get_type(&self) -> DeviceType { + DeviceType::SERIAL + } +} + +impl EventNotifierHelper for Serial { + /// Add serial to `EventNotifier`. + /// + /// # Arguments + /// + /// * `serial` - Serial instance. 
+ fn internal_notifiers(serial: Arc>) -> Vec { + let mut notifiers = Vec::new(); + + let mut handlers = Vec::new(); + let handler: Box Option>> = + Box::new(move |_, _| { + let mut out = [0_u8; 64]; + if let Ok(count) = std::io::stdin().lock().read_raw(&mut out) { + let _ = serial.lock().unwrap().receive(&out[..count]); + } + None + }); + + handlers.push(Arc::new(Mutex::new(handler))); + + let notifier = EventNotifier::new( + NotifierOperation::AddShared, + libc::STDIN_FILENO, + None, + EventSet::IN, + handlers, + ); + + notifiers.push(notifier); + notifiers + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_methods_of_serial() { + // test new method + let mut usart = Serial::new(); + assert_eq!(usart.ier, 0); + assert_eq!(usart.iir, 1); + assert_eq!(usart.lcr, 3); + assert_eq!(usart.mcr, 8); + assert_eq!(usart.lsr, 0x60); + assert_eq!(usart.msr, 0xb0); + assert_eq!(usart.scr, 0); + assert_eq!(usart.div, 0x0c); + assert_eq!(usart.thr_pending, 0); + + // test interrupt method + // for interrupt method to work, + // you need to set interrupt_evt at first + assert!(usart.interrupt().is_err()); + + let evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + usart.interrupt_evt = Some(evt); + assert!(usart.interrupt().is_ok()); + + // test receive method + let data = [0x01, 0x02]; + assert!(usart.receive(&data).is_ok()); + assert_eq!(usart.rbr.is_empty(), false); + assert_eq!(usart.rbr.len(), 2); + assert_eq!(usart.rbr.front(), Some(&0x01)); + assert_eq!((usart.lsr & 0x01), 1); + + // test write_and_read_internal method + assert_eq!(usart.read_internal(0), 0x01); + assert_eq!(usart.read_internal(0), 0x02); + assert_eq!((usart.lsr & 0x01), 0); + + // for write_internal with first argument to work, + // you need to set output at first + assert!(usart.write_internal(0, 0x03).is_err()); + usart.output = Some(Box::new(std::io::stdout())); + assert!(usart.write_internal(0, 0x03).is_ok()); + usart.write_internal(3, 0xff).unwrap(); + assert_eq!(usart.read_internal(3), 0xff); + usart.write_internal(4, 0xff).unwrap(); + assert_eq!(usart.read_internal(4), 0xff); + usart.write_internal(7, 0xff).unwrap(); + assert_eq!(usart.read_internal(7), 0xff); + usart.write_internal(0, 0x0d).unwrap(); + assert_eq!(usart.read_internal(0), 0x0d); + usart.write_internal(1, 0x0c).unwrap(); + assert_eq!(usart.read_internal(1), 0x0c); + assert_eq!(usart.read_internal(2), 0xc1); + assert_eq!(usart.read_internal(5), 0x60); + assert_eq!(usart.read_internal(6), 0xf0); + } +} diff --git a/device_model/src/lib.rs b/device_model/src/lib.rs new file mode 100644 index 00000000..52e86a7a --- /dev/null +++ b/device_model/src/lib.rs @@ -0,0 +1,68 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! Interfaces for simulating real hardware. +//! +//! This crate simulates: +//! - cpu +//! - interrupt controller (aarch64) +//! - legacy devices, such as serial devices +//! - MMIO bus +//! - devices with virtio support, such as virtio-blk and virtio-net +//! - mainboard for micro VM +//! +//! # Platform support +//! 
+//! - x86_64 +//! - aarch64 + +#[macro_use] +extern crate log; +#[macro_use] +extern crate vmm_sys_util; +#[macro_use] +extern crate error_chain; +extern crate serde; +#[cfg(target_arch = "aarch64")] +#[macro_use] +extern crate util; +#[macro_use] +extern crate machine_manager; + +mod cpu; +mod interrupt_controller; +mod legacy; +mod micro_vm; +mod mmio; +mod virtio; + +pub use error_chain::*; +pub use micro_vm::{cmdline, main_loop::MainLoop, micro_syscall::register_seccomp, LightMachine}; + +pub mod errors { + error_chain! { + links { + AddressSpace(address_space::errors::Error, address_space::errors::ErrorKind); + Util(util::errors::Error, util::errors::ErrorKind); + BootLoader(boot_loader::errors::Error, boot_loader::errors::ErrorKind); + Manager(machine_manager::errors::Error, machine_manager::errors::ErrorKind); + Cpu(crate::cpu::errors::Error, crate::cpu::errors::ErrorKind); + Mmio(crate::mmio::errors::Error, crate::mmio::errors::ErrorKind); + } + foreign_links { + Io(std::io::Error); + Kvm(kvm_ioctls::Error); + Json(serde_json::Error); + Nul(std::ffi::NulError); + } + } +} diff --git a/device_model/src/micro_vm/cmdline.rs b/device_model/src/micro_vm/cmdline.rs new file mode 100644 index 00000000..ebf10b55 --- /dev/null +++ b/device_model/src/micro_vm/cmdline.rs @@ -0,0 +1,448 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::fs::File; +use std::io::Read; + +use error_chain::bail; +use machine_manager::config::VmConfig; +use machine_manager::socket::SocketType; +use util::arg_parser::{Arg, ArgMatches, ArgParser}; + +use crate::errors::{Result, ResultExt}; + +// Read the programe version in `Cargo.toml`. +const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); + +/// This macro is to run struct $z 's function $s whose arg is $x 's inner member. +/// There is a multi-macro-cast in cases of vec and bool. +/// +/// # Examples +/// +/// ```text +/// update_args_to_config!(name, vm_cfg, update_name); +/// update_args_to_config!(name, vm_cfg, update_name, vec); +/// update_args_to_config!(name, vm_cfg, update_name, bool); +/// ``` +macro_rules! update_args_to_config { + ( $x:tt, $z:expr, $s:tt ) => { + if let Some(temp) = &$x { + $z.$s(temp.to_string()) + } + }; + ( $x:tt, $z:expr, $s:tt, vec ) => { + if let Some(temp) = &$x { + $z.$s(&temp.to_vec()) + } + }; + ( $x:tt, $z:expr, $s:tt, bool ) => { + if $x { + $z.$s() + } + }; +} + +/// This macro is to run struct $z 's function $s whose arg is $x 's every inner +/// member. +/// +/// # Examples +/// +/// ```text +/// update_args_to_config_multi!(drive, vm_cfg, update_drive); +/// ``` +macro_rules! update_args_to_config_multi { + ( $x:tt, $z:expr, $s:tt ) => { + if let Some(temps) = &$x { + for temp in temps { + $z.$s(temp.to_string()) + } + } + }; +} + +/// This function is to define all commandline arguments. 
+pub fn create_args_parser<'a>() -> ArgParser<'a> { + ArgParser::new("StratoVirt") + .version(VERSION.unwrap_or("unknown")) + .author("Huawei Technologies Co., Ltd") + .about("A light kvm-based hypervisor.") + .arg( + Arg::with_name("name") + .long("name") + .value_name("vm_name") + .help("set the name of the guest.") + .takes_value(true), + ) + .arg( + Arg::with_name("smp") + .long("smp") + .value_name("[cpus=]n") + .help("set the number of CPUs to 'n' (default: 1)") + .takes_value(true), + ) + .arg( + Arg::with_name("memory") + .long("m") + .value_name("[size=]megs") + .help("configure guest RAM") + .takes_value(true), + ) + .arg( + Arg::with_name("config-file") + .long("config") + .value_name("json file path") + .help("Sets a config file for vmm.") + .takes_value(true), + ) + .arg( + Arg::with_name("kernel") + .long("kernel") + .value_name("kernel_path") + .help("use uncompressed kernel image") + .takes_value(true), + ) + .arg( + Arg::with_name("kernel-cmdline") + .multiple(true) + .long("append") + .value_name("command-line parameters") + .help("use 'cmdline' as kernel command line") + .takes_values(true), + ) + .arg( + Arg::with_name("initrd-file") + .long("initrd") + .value_name("initrd_path") + .help("use 'initrd-file' as initial ram disk") + .takes_value(true), + ) + .arg( + Arg::with_name("api-channel") + .long("api-channel") + .value_name("unix:PATH") + .help("set api-channel's unixsocket path") + .takes_value(true) + .required(true), + ) + .arg( + Arg::with_name("drive") + .multiple(true) + .long("drive") + .value_name("[file=path][,id=str][,readonly=][,direct=]") + .help("use 'file' as a drive image") + .takes_values(true), + ) + .arg( + Arg::with_name("netdev") + .multiple(true) + .long("netdev") + .value_name("tap[,id=str][,netdev=hostname][,mac=addr]") + .help("configure a host TAP network with ID 'str'") + .takes_values(true), + ) + .arg( + Arg::with_name("chardev") + .multiple(true) + .long("chardev") + .value_name("chartype[,id=str][,path=socket_path]") + .help("set char device for vm") + .takes_values(true), + ) + .arg( + Arg::with_name("device") + .multiple(true) + .long("device") + .value_name("device_type[,prop1=value1,...]") + .help("add device (based on driver) and sets driver properties") + .takes_values(true), + ) + .arg( + Arg::with_name("serial") + .long("serial") + .value_name("[stdio]") + .help("add serial and set stdio or not") + .can_no_value(true) + .takes_value(true), + ) + .arg( + Arg::with_name("display log") + .long("D") + .value_name("log_path") + .help("output log to logfile (default stderr)") + .takes_value(true) + .can_no_value(true), + ) + .arg( + Arg::with_name("pidfile") + .long("pidfile") + .help("write PID to 'file'") + .takes_value(true), + ) + .arg( + Arg::with_name("daemonize") + .long("daemonize") + .help("daemonize StratoVirt after initializing") + .takes_value(false) + .required(false), + ) + .arg( + Arg::with_name("disable-seccomp") + .long("disable-seccomp") + .help("not use seccomp sandbox for StratoVirt") + .takes_value(false) + .required(false), + ) + .arg( + Arg::with_name("freeze_cpu") + .short("S") + .long("freeze") + .help("Freeze CPU at startup") + .takes_value(false) + .required(false), + ) + .arg( + Arg::with_name("omit_vm_memory") + .long("omit_vm_memory") + .help("not dump guest memory in core file") + .takes_value(false) + .required(false), + ) + // Below cmdline is adapted for Kata/Qemu, no use. 
+ .arg( + Arg::with_name("uuid") + .long("uuid") + .value_name("uuid") + .help("specify machine UUID") + .takes_value(true) + .hidden(true), + ) + .arg( + Arg::with_name("cpu") + .long("cpu") + .help("select CPU architecture") + .takes_value(true) + .hidden(true), + ) + .arg( + Arg::with_name("machine") + .long("machine") + .help("selects emulated machine") + .takes_value(true) + .hidden(true), + ) + .arg( + Arg::with_name("global_property") + .long("global") + .multiple(true) + .help("set a global default for a item property") + .takes_values(true) + .hidden(true), + ) + .arg( + Arg::with_name("object") + .multiple(true) + .long("object") + .value_name(" TYPENAME[,PROP1=VALUE1,...]") + .help("create a new object of type TYPENAME settingproperties") + .takes_values(true) + .hidden(true), + ) + .arg( + Arg::with_name("fsdriver") + .multiple(true) + .long("fsdev") + .help("set fs device for vm") + .takes_values(true) + .hidden(true), + ) + .arg( + Arg::with_name("vga [std|cirrus|vmware|qxl|xenfb|tcx|cg3|virtio|none]") + .long("vga") + .help("select video card type") + .takes_value(true) + .hidden(true), + ) + .arg( + Arg::with_name("numa node") + .long("numa") + .value_name("[,memdev=id][,cpus=cpu[-cpu]][,nodeid=node]") + .help("set numa config") + .takes_value(true) + .hidden(true), + ) + .arg( + Arg::with_name("no-user-config") + .long("no-user-config") + .help("do not load user-provided config files at startup") + .takes_value(false) + .required(false) + .hidden(true), + ) + .arg( + Arg::with_name("nodefaults") + .long("nodefaults") + .help("don't create default devices") + .takes_value(false) + .required(false) + .hidden(true), + ) + .arg( + Arg::with_name("nographic") + .long("nographic") + .help("disable graphical output and redirect serial I/Os to console") + .takes_value(false) + .required(false) + .hidden(true), + ) +} + +/// Create `VmConfig` from `ArgMatches`'s arg. +/// +/// When accepted cmdline arguments, `StratoVirt` will parse useful arguments and +/// transform them to VM's configuration structure -- `VmConfig`. +/// +/// # Arguments +/// +/// - * `args` - The structure accepted input cmdline arguments. +/// +/// # Errors +/// +/// Input arguments is illegal for `VmConfig` or `VmConfig`'s health check +/// failed -- with this unhealthy `VmConfig`, VM will not boot successfully. +#[allow(unused_parens)] +pub fn create_vmconfig(args: &ArgMatches) -> Result { + // Parse config-file json. + // VmConfig can be transformed by json file which described VmConfig + // directly. + let mut vm_cfg = VmConfig::default(); + if let Some(config_file) = args.value_of("config-file") { + let config_value = match File::open(&config_file) { + Ok(mut f) => { + let mut data = String::new(); + f.read_to_string(&mut data) + .chain_err(|| format!("Failed to read from file:{}", &config_file))?; + if config_file.contains("json") { + serde_json::from_str(&data)? 
+ } else { + bail!("Only support \'json\' format config-file"); + } + } + Err(e) => { + bail!("Failed to open config file by: {}", e); + } + }; + vm_cfg = VmConfig::create_from_value(config_value) + .chain_err(|| "Failed to parse config file to VmConfig")?; + } + + // Parse cmdline args which need to set in VmConfig + update_args_to_config!((args.value_of("name")), vm_cfg, update_name); + update_args_to_config!((args.value_of("memory")), vm_cfg, update_memory); + update_args_to_config!((args.value_of("smp")), vm_cfg, update_cpu); + update_args_to_config!((args.value_of("kernel")), vm_cfg, update_kernel); + update_args_to_config!((args.value_of("initrd-file")), vm_cfg, update_initrd); + update_args_to_config!((args.value_of("serial")), vm_cfg, update_serial); + update_args_to_config!( + (args.values_of("kernel-cmdline")), + vm_cfg, + update_kernel_cmdline, + vec + ); + update_args_to_config_multi!((args.values_of("drive")), vm_cfg, update_drive); + update_args_to_config_multi!((args.values_of("device")), vm_cfg, update_vsock); + update_args_to_config_multi!((args.values_of("netdev")), vm_cfg, update_net); + update_args_to_config_multi!((args.values_of("chardev")), vm_cfg, update_console); + update_args_to_config!( + (args.is_present("omit_vm_memory")), + vm_cfg, + update_omit_vm_memory, + bool + ); + + // Check the mini-set for Vm to start is ok + vm_cfg + .check_vmconfig(args.is_present("daemonize")) + .chain_err(|| "Precheck failed, VmConfig is unhealthy, stop running")?; + + Ok(vm_cfg) +} + +/// This function is to parse api-channel socket path and type. +/// +/// # Arguments +/// +/// * `args` - The structure accepted input cmdline arguments. +/// +/// # Errors +/// +/// The value of `api-channel` is illegel. +pub fn check_api_channel(args: &ArgMatches) -> Result<(String, SocketType)> { + if let Some(api) = args.value_of("api-channel") { + let (api_path, api_type) = parse_path(&api) + .map(|(path, type_)| (path, type_)) + .chain_err(|| "Failed to parse api-channel socket path")?; + Ok((api_path, api_type)) + } else { + bail!("Please use \'-api-channel\' to give a api-channel path for Unix socket"); + } +} + +/// This function is to parse a `String` to socket path string and socket type. +/// +/// # Arguments +/// +/// * `args_str` - The arguments `String` would be parsed. +/// +/// # Errors +/// +/// The arguments `String` is illegal. 
+fn parse_path(args_str: &str) -> Result<(String, SocketType)> { + let arg: Vec<&str> = args_str.split(',').collect(); + let item = arg[0].to_string(); + let path_vec: Vec<&str> = item.split(':').collect(); + if path_vec.len() > 1 { + if path_vec[0] == "unix" { + let unix_path = String::from(path_vec[1]); + Ok((unix_path, SocketType::Unix)) + } else { + bail!("{} type is not support yet!", path_vec[0]); + } + } else { + bail!("Failed to parse path: {}", args_str); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_path() { + let test_path = "unix:/tmp/stratovirt.sock"; + assert_eq!( + parse_path(test_path).unwrap(), + ("/tmp/stratovirt.sock".to_string(), SocketType::Unix) + ); + + let test_path = "unix:/tmp/stratovirt.sock,nowait,server"; + assert_eq!( + parse_path(test_path).unwrap(), + ("/tmp/stratovirt.sock".to_string(), SocketType::Unix) + ); + + let test_path = "tcp:127.0.0.1:8080,nowait,server"; + assert!(parse_path(test_path).is_err()); + + let test_path = "file:/tmp/stratovirt-file"; + assert!(parse_path(test_path).is_err()); + } +} diff --git a/device_model/src/micro_vm/main_loop.rs b/device_model/src/micro_vm/main_loop.rs new file mode 100644 index 00000000..460a8c56 --- /dev/null +++ b/device_model/src/micro_vm/main_loop.rs @@ -0,0 +1,77 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate util; + +use std::sync::Arc; + +use util::epoll_context::{EventNotifier, MainLoopContext, MainLoopManager}; + +static mut CURRENT_MAINLOOP: Option = None; + +/// The struct `MainLoop` is the only struct can handle Global variable +/// `CURRENT_MAINLOOP`. It can manage events add and adjust or start to +/// run `main_loop`. +pub struct MainLoop {} + +impl MainLoop { + /// Constructs a `MainLoopContext` in global `QMP_CHANNEL`. + pub fn object_init() { + unsafe { + if CURRENT_MAINLOOP.is_none() { + CURRENT_MAINLOOP = Some(MainLoopContext::new()); + } + } + } + + /// Set a `manager` to `CURRENT_MAINLOOP`. + /// + /// # Arguments + /// + /// * `manager` - The main part to manager `CURRENT_MAINLOOP`. + pub fn set_manager(manager: Arc) { + Self::locked_inner().set_manager(manager); + } + + /// Update event notifiers to `CURRENT_MAINLOOP`. + /// + /// * `notifiers` - The wrapper of events will be handled in + /// `CURRENT_MAINLOOP`. + /// + /// # Errors + /// + /// Update event failed. + pub fn update_event(notifiers: Vec) -> util::errors::Result<()> { + Self::locked_inner().update_events(notifiers) + } + + /// Start to run `CURRENT_MAINLOOP` according `epoll`. + /// + /// # Notes + /// + /// Once run `CURRENT_MAINLOOP`, `epoll` in `MainLoopContext` will execute + /// `epoll_wait()` function to wait for events. 
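Taken together with `object_init`, `set_manager` and `update_event` above, a plausible caller of `run()` below could look like the following sketch. The `Arc<dyn MainLoopManager>` parameter and the `Ok(false)`-means-stop return contract are assumptions, since the generic and return types are elided in this listing:

```rust
use std::sync::Arc;
use util::epoll_context::MainLoopManager;

/// Sketch only: drive the global main loop on behalf of a given manager.
fn drive_main_loop(manager: Arc<dyn MainLoopManager>) -> util::errors::Result<()> {
    MainLoop::object_init();        // create the global MainLoopContext once
    MainLoop::set_manager(manager); // the manager decides when the loop exits
    // Devices register their EventNotifiers later via MainLoop::update_event().
    while MainLoop::run()? {}       // each iteration blocks in epoll_wait()
    Ok(())
}
```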
+ pub fn run() -> util::errors::Result { + Self::locked_inner().run() + } + + fn locked_inner() -> &'static mut MainLoopContext { + unsafe { + match &mut CURRENT_MAINLOOP { + Some(main_loop) => main_loop, + None => { + panic!("Main loop not initialized"); + } + } + } + } +} diff --git a/device_model/src/micro_vm/micro_syscall.rs b/device_model/src/micro_vm/micro_syscall.rs new file mode 100644 index 00000000..2ea20a5b --- /dev/null +++ b/device_model/src/micro_vm/micro_syscall.rs @@ -0,0 +1,162 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate libc; + +use crate::errors::Result; +use crate::virtio::vhost::kernel::*; +use util::kvm_ioctls_ext::{KVM_GET_DEVICE_ATTR, KVM_HAS_DEVICE_ATTR}; +use util::seccomp::{BpfRule, SeccompCmpOpt, SeccompOpt, SyscallFilter}; +use util::tap::{TUNSETIFF, TUNSETOFFLOAD, TUNSETVNETHDRSZ}; + +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/futex.h +const FUTEX_WAIT: u32 = 0; +const FUTEX_WAKE: u32 = 1; +const FUTEX_CMP_REQUEUE: u32 = 4; +const FUTEX_WAKE_OP: u32 = 5; +const FUTEX_WAIT_BITSET: u32 = 9; +const FUTEX_PRIVATE_FLAG: u32 = 128; +const FUTEX_WAIT_PRIVATE: u32 = FUTEX_WAIT | FUTEX_PRIVATE_FLAG; +const FUTEX_WAKE_PRIVATE: u32 = FUTEX_WAKE | FUTEX_PRIVATE_FLAG; +const FUTEX_CMP_REQUEUE_PRIVATE: u32 = FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG; +const FUTEX_WAKE_OP_PRIVATE: u32 = FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG; +const FUTEX_WAIT_BITSET_PRIVATE: u32 = FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG; + +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/fcntl.h +const F_GETFD: u32 = 1; +const F_SETFD: u32 = 2; +const F_LINUX_SPECIFIC_BASE: u32 = 1024; +const F_DUPFD_CLOEXEC: u32 = F_LINUX_SPECIFIC_BASE + 6; + +// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/asm-generic/ioctls.h +const TCGETS: u32 = 0x5401; +const TCSETS: u32 = 0x5402; +const TIOCGWINSZ: u32 = 0x5413; +const FIOCLEX: u32 = 0x5451; +const FIONBIO: u32 = 0x5421; +const KVM_RUN: u32 = 0xae80; + +// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/asm-generic/kvm.h +const KVM_SET_DEVICE_ATTR: u32 = 0x4018_aee1; + +/// Create a syscall allowlist for seccomp. +/// +/// # Notes +/// This allowlist limit syscall with: +/// * x86_64-unknown-gnu: 34 syscalls +/// * x86_64-unknown-musl: 33 syscalls +/// * aarch64-unknown-gnu: 33 syscalls +/// * aarch64-unknown-musl: 32 syscalls +/// To reduce performance losses, the syscall rules is ordered by frequency. 
+fn syscall_allow_list() -> Vec { + vec![ + BpfRule::new(libc::SYS_read), + BpfRule::new(libc::SYS_write), + ioctl_allow_list(), + #[cfg(not(all(target_env = "gnu", target_arch = "x86_64")))] + BpfRule::new(libc::SYS_epoll_pwait), + #[cfg(all(target_env = "gnu", target_arch = "x86_64"))] + BpfRule::new(libc::SYS_epoll_wait), + BpfRule::new(libc::SYS_io_getevents), + BpfRule::new(libc::SYS_io_submit), + BpfRule::new(libc::SYS_dup), + BpfRule::new(libc::SYS_close), + BpfRule::new(libc::SYS_eventfd2), + BpfRule::new(libc::SYS_epoll_ctl), + BpfRule::new(libc::SYS_fdatasync), + BpfRule::new(libc::SYS_recvmsg), + BpfRule::new(libc::SYS_sendmsg), + BpfRule::new(libc::SYS_recvfrom), + BpfRule::new(libc::SYS_io_setup), + BpfRule::new(libc::SYS_brk), + BpfRule::new(libc::SYS_fcntl) + .add_constraint(SeccompCmpOpt::Eq, 1, F_DUPFD_CLOEXEC) + .add_constraint(SeccompCmpOpt::Eq, 1, F_SETFD) + .add_constraint(SeccompCmpOpt::Eq, 1, F_GETFD), + BpfRule::new(libc::SYS_rt_sigprocmask), + #[cfg(target_arch = "x86_64")] + BpfRule::new(libc::SYS_open), + BpfRule::new(libc::SYS_openat), + BpfRule::new(libc::SYS_sigaltstack), + BpfRule::new(libc::SYS_mmap), + BpfRule::new(libc::SYS_munmap), + BpfRule::new(libc::SYS_accept4), + BpfRule::new(libc::SYS_lseek), + BpfRule::new(libc::SYS_futex) + .add_constraint(SeccompCmpOpt::Eq, 1, FUTEX_WAKE_PRIVATE) + .add_constraint(SeccompCmpOpt::Eq, 1, FUTEX_WAIT_PRIVATE) + .add_constraint(SeccompCmpOpt::Eq, 1, FUTEX_CMP_REQUEUE_PRIVATE) + .add_constraint(SeccompCmpOpt::Eq, 1, FUTEX_WAKE_OP_PRIVATE) + .add_constraint(SeccompCmpOpt::Eq, 1, FUTEX_WAIT_BITSET_PRIVATE), + BpfRule::new(libc::SYS_exit), + BpfRule::new(libc::SYS_exit_group), + BpfRule::new(libc::SYS_rt_sigreturn), + #[cfg(target_env = "musl")] + BpfRule::new(libc::SYS_tkill), + #[cfg(target_env = "gnu")] + BpfRule::new(libc::SYS_tgkill), + BpfRule::new(libc::SYS_gettid), + BpfRule::new(libc::SYS_getpid), + BpfRule::new(libc::SYS_fstat), + BpfRule::new(libc::SYS_pread64), + BpfRule::new(libc::SYS_pwrite64), + #[cfg(target_env = "gnu")] + BpfRule::new(libc::SYS_madvise).add_constraint( + SeccompCmpOpt::Eq, + 2, + libc::MADV_DONTNEED as u32, + ), + ] +} + +/// Create a syscall bpf rule for syscall `ioctl`. 
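The raw request numbers whitelisted in the `ioctl` rule below, such as the `KVM_SET_DEVICE_ATTR = 0x4018_aee1` constant defined above, come from the kernel headers and can be sanity-checked against the usual asm-generic `_IOW` encoding (on x86_64 and aarch64: direction at bit 30, argument size at bit 16, ioctl type at bit 8, command number at bit 0). A standalone recomputation:

```rust
/// Recompute _IOW(KVMIO, 0xe1, struct kvm_device_attr) by hand: KVMIO is 0xAE
/// and struct kvm_device_attr is 24 bytes (u32 flags, u32 group, u64 attr, u64 addr).
const fn iow(ty: u32, nr: u32, size: u32) -> u32 {
    (1u32 << 30) | (size << 16) | (ty << 8) | nr
}

fn main() {
    assert_eq!(iow(0xAE, 0xE1, 24), 0x4018_aee1);
    println!("KVM_SET_DEVICE_ATTR = {:#010x}", iow(0xAE, 0xE1, 24));
}
```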
+fn ioctl_allow_list() -> BpfRule { + BpfRule::new(libc::SYS_ioctl) + .add_constraint(SeccompCmpOpt::Eq, 1, TCGETS) + .add_constraint(SeccompCmpOpt::Eq, 1, TCSETS) + .add_constraint(SeccompCmpOpt::Eq, 1, TIOCGWINSZ) + .add_constraint(SeccompCmpOpt::Eq, 1, FIOCLEX) + .add_constraint(SeccompCmpOpt::Eq, 1, FIONBIO) + .add_constraint(SeccompCmpOpt::Eq, 1, KVM_RUN) + .add_constraint(SeccompCmpOpt::Eq, 1, KVM_SET_DEVICE_ATTR) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_VSOCK_SET_GUEST_CID() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_VSOCK_SET_RUNNING() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_VRING_CALL() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_VRING_NUM() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_VRING_ADDR() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_VRING_BASE() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_VRING_KICK() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_VRING_CALL() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_OWNER() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_FEATURES() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_SET_MEM_TABLE() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, VHOST_NET_SET_BACKEND() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, TUNSETIFF() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, TUNSETOFFLOAD() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, TUNSETVNETHDRSZ() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, KVM_GET_DEVICE_ATTR() as u32) + .add_constraint(SeccompCmpOpt::Eq, 1, KVM_HAS_DEVICE_ATTR() as u32) +} + +/// Register seccomp rules in syscall allowlist to seccomp. +pub fn register_seccomp() -> Result<()> { + let mut seccomp_filter = SyscallFilter::new(SeccompOpt::Trap); + + let mut bpf_rules = syscall_allow_list(); + for bpf_rule in &mut bpf_rules { + seccomp_filter.push(bpf_rule); + } + + seccomp_filter.realize()?; + + Ok(()) +} diff --git a/device_model/src/micro_vm/mod.rs b/device_model/src/micro_vm/mod.rs new file mode 100644 index 00000000..8b5ddf5b --- /dev/null +++ b/device_model/src/micro_vm/mod.rs @@ -0,0 +1,1262 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! # Micro VM +//! +//! Micro VM is a extremely light machine type. +//! It has a very simple machine model, which benefits to a very short +//! boot-time and tiny memory usage. +//! +//! ## Design +//! +//! This module offers support for: +//! 1. Create and manage lifecycle for `Micro VM`. +//! 2. Set cmdline arguments parameters for `Micro VM`. +//! 3. Manage mainloop to handle events for `Micro VM` and its devices. +//! +//! ## Platform Support +//! +//! - `x86_64` +//! 
- `aarch64` +extern crate address_space; +extern crate boot_loader; +extern crate machine_manager; +extern crate util; + +pub mod cmdline; +pub mod main_loop; +pub mod micro_syscall; + +use std::marker::{Send, Sync}; +use std::ops::Deref; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::{Arc, Barrier, Condvar, Mutex}; +use std::vec::Vec; + +#[cfg(target_arch = "x86_64")] +use kvm_bindings::{kvm_pit_config, KVM_PIT_SPEAKER_DUMMY}; +use kvm_ioctls::{Kvm, VmFd}; +use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::terminal::Terminal; + +#[cfg(target_arch = "x86_64")] +use address_space::KvmIoListener; +use address_space::{create_host_mmaps, AddressSpace, GuestAddress, KvmMemoryListener, Region}; +use boot_loader::{load_kernel, BootLoaderConfig}; +use machine_manager::config::{ + BootSource, ConsoleConfig, DriveConfig, NetworkInterfaceConfig, SerialConfig, VmConfig, + VsockConfig, +}; +use machine_manager::machine::{ + DeviceInterface, KvmVmState, MachineAddressInterface, MachineExternalInterface, + MachineInterface, MachineLifecycle, +}; +#[cfg(feature = "qmp")] +use machine_manager::{qmp, qmp::qmp_schema as schema, qmp::QmpChannel}; +#[cfg(target_arch = "aarch64")] +use util::device_tree; +#[cfg(target_arch = "aarch64")] +use util::device_tree::CompileFDT; +use util::epoll_context::{ + EventNotifier, EventNotifierHelper, MainLoopManager, NotifierCallback, NotifierOperation, +}; + +use crate::cpu::{ArchCPU, CPUBootConfig, CPUInterface, CpuTopology, CPU}; +use crate::errors::{Result, ResultExt}; +#[cfg(target_arch = "aarch64")] +use crate::interrupt_controller::{InterruptController, InterruptControllerConfig}; +#[cfg(target_arch = "aarch64")] +use crate::legacy::PL031; +#[cfg(target_arch = "aarch64")] +use crate::mmio::DeviceResource; +use crate::MainLoop; +use crate::{ + legacy::Serial, + mmio::{Bus, DeviceType, VirtioMmioDevice}, + virtio::{vhost, Console}, +}; + +/// Layout of aarch64 +#[cfg(target_arch = "aarch64")] +pub const DRAM_BASE: u64 = 1 << 31; +#[cfg(target_arch = "aarch64")] +pub const MEM_MAPPED_IO_BASE: u64 = 1 << 30; + +/// Layout of x86_64 +#[cfg(target_arch = "x86_64")] +pub const MEM_MAPPED_IO_BASE: u64 = (1 << 32) - MEM_MAPPED_IO_SIZE; +#[cfg(target_arch = "x86_64")] +pub const MEM_MAPPED_IO_SIZE: u64 = 768 << 20; + +/// Every type of devices depends on this configure-related trait to perform +/// initialization. +pub trait ConfigDevBuilder { + /// Constructs device in `Bus` according configuration structure. + /// + /// # Arguments + /// + /// * `sys_mem` - The guest memory to device constructs over. + /// * `bus` - The `mmio` bus where the device initializing. 
+ fn build_dev(&self, sys_mem: Arc, bus: &mut Bus) -> Result<()>; +} + +impl ConfigDevBuilder for DriveConfig { + fn build_dev(&self, _sys_mem: Arc, bus: &mut Bus) -> Result<()> { + bus.fill_replaceable_device(&self.drive_id, Arc::new(self.clone()), DeviceType::BLK) + .chain_err(|| "build dev from config failed") + } +} + +impl ConfigDevBuilder for NetworkInterfaceConfig { + fn build_dev(&self, sys_mem: Arc, bus: &mut Bus) -> Result<()> { + if self.vhost_type.is_some() { + let net = Arc::new(Mutex::new(vhost::kernel::Net::new( + self.clone(), + sys_mem.clone(), + ))); + let device = Arc::new(Mutex::new(VirtioMmioDevice::new(sys_mem, net))); + bus.attach_device(device) + .chain_err(|| "build dev from config failed")?; + Ok(()) + } else { + bus.fill_replaceable_device(&self.iface_id, Arc::new(self.clone()), DeviceType::NET) + .chain_err(|| "build dev from config failed") + } + } +} + +impl ConfigDevBuilder for ConsoleConfig { + fn build_dev(&self, sys_mem: Arc, bus: &mut Bus) -> Result<()> { + let console = Arc::new(Mutex::new(Console::new(self.clone()))); + let device = Arc::new(Mutex::new(VirtioMmioDevice::new(sys_mem, console))); + bus.attach_device(device) + .chain_err(|| "build dev from config failed")?; + Ok(()) + } +} + +impl ConfigDevBuilder for VsockConfig { + fn build_dev(&self, sys_mem: Arc, bus: &mut Bus) -> Result<()> { + let vsock = Arc::new(Mutex::new(vhost::kernel::Vsock::new( + self.clone(), + sys_mem.clone(), + ))); + let device = Arc::new(Mutex::new(VirtioMmioDevice::new(sys_mem, vsock))); + bus.attach_device(device) + .chain_err(|| "build dev from config failed")?; + Ok(()) + } +} + +impl ConfigDevBuilder for SerialConfig { + fn build_dev(&self, _sys_mem: Arc, bus: &mut Bus) -> Result<()> { + let serial = Arc::new(Mutex::new(Serial::new())); + bus.attach_device(serial.clone()) + .chain_err(|| "build dev from config failed")?; + + if self.stdio { + MainLoop::update_event(EventNotifierHelper::internal_notifiers(serial))?; + } + Ok(()) + } +} + +/// A wrapper around creating and using a kvm-based micro VM. +pub struct LightMachine { + /// KVM VM file descriptor, represent VM entry in kvm module. + vm_fd: Arc, + /// `vCPU` topology, support sockets, cores, threads. + cpu_topo: CpuTopology, + /// `vCPU` devices. + cpus: Arc>>>, + /// Interrupt controller device. + #[cfg(target_arch = "aarch64")] + irq_chip: Arc, + /// Memory address space. + sys_mem: Arc, + /// IO address space. + #[cfg(target_arch = "x86_64")] + sys_io: Arc, + /// Mmio bus. + bus: Bus, + /// VM running state. + vm_state: Arc<(Mutex, Condvar)>, + /// Vm boot_source config. + boot_source: Arc>, + /// VM power button, handle VM `Shutdown` event. + power_button: EventFd, +} + +impl LightMachine { + /// Constructs a new `LightMachine`. + /// + /// # Arguments + /// + /// * `vm_config` - Represents the configuration for VM. 
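The constructor below is the first of three calls a front end makes on this type; a minimal sketch of the expected order, assuming the caller has already built a `VmConfig` (for example via `cmdline::create_vmconfig`) and drives events afterwards with `MainLoop::run`:

```rust
use machine_manager::config::VmConfig;

/// Sketch of the bring-up order for a micro VM, using the methods of this module.
fn boot_micro_vm(vm_config: VmConfig) -> crate::errors::Result<()> {
    // Create the KVM VM, guest memory, vCPU objects and MMIO devices.
    let vm = LightMachine::new(vm_config)?;
    // Load kernel/initrd, realize devices and set vCPU boot registers.
    vm.realize()?;
    // Spawn vCPU threads: not paused at startup, seccomp sandbox enabled.
    vm.vm_start(false, true)?;
    Ok(())
}
```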
+ pub fn new(vm_config: VmConfig) -> Result> { + let kvm = Kvm::new()?; + let vm_fd = Arc::new(kvm.create_vm()?); + + let sys_mem = AddressSpace::new(Region::init_container_region(u64::max_value()))?; + let nr_slots = kvm.get_nr_memslots(); + sys_mem.register_listener(Box::new(KvmMemoryListener::new( + nr_slots as u32, + vm_fd.clone(), + )))?; + + #[cfg(target_arch = "x86_64")] + let sys_io = AddressSpace::new(Region::init_container_region(1 << 16))?; + #[cfg(target_arch = "x86_64")] + sys_io.register_listener(Box::new(KvmIoListener::new(vm_fd.clone())))?; + + #[cfg(target_arch = "x86_64")] + Self::arch_init(&vm_fd)?; + + // Init guest-memory + // Define ram-region ranges according to architectures + let ram_ranges = Self::arch_ram_ranges(vm_config.machine_config.mem_size); + let mem_mappings = create_host_mmaps(&ram_ranges, vm_config.machine_config.omit_vm_memory)?; + for mmap in mem_mappings.iter() { + sys_mem.root().add_subregion( + Region::init_ram_region(mmap.clone()), + mmap.start_address().raw_value(), + )?; + } + + // Pre init vcpu and cpu topology + let mut mask: Vec = Vec::with_capacity(vm_config.machine_config.nr_cpus as usize); + for _i in 0..vm_config.machine_config.nr_cpus { + mask.push(1) + } + + let cpu_topo = CpuTopology { + sockets: vm_config.machine_config.nr_cpus, + cores: 1, + threads: 1, + nrcpus: vm_config.machine_config.nr_cpus, + max_cpus: vm_config.machine_config.nr_cpus, + online_mask: Arc::new(Mutex::new(mask)), + }; + + let nrcpus = vm_config.machine_config.nr_cpus; + let mut vcpu_fds = vec![]; + for cpu_id in 0..nrcpus { + vcpu_fds.push(Arc::new(vm_fd.create_vcpu(cpu_id)?)); + } + + // Interrupt Controller Chip init + #[cfg(target_arch = "aarch64")] + let intc_conf = InterruptControllerConfig { + version: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3, + map_region: 1 << 30, + vcpu_count: u64::from(vm_config.machine_config.nr_cpus), + max_irq: 192, + msi: true, + }; + #[cfg(target_arch = "aarch64")] + let irq_chip = InterruptController::new(vm_fd.clone(), &intc_conf)?; + + // Machine state init + let vm_state = Arc::new((Mutex::new(KvmVmState::Created), Condvar::new())); + + // Create vm object + let mut vm = LightMachine { + cpu_topo, + cpus: Arc::new(Mutex::new(Vec::new())), + #[cfg(target_arch = "aarch64")] + irq_chip: Arc::new(irq_chip), + sys_mem: sys_mem.clone(), + #[cfg(target_arch = "x86_64")] + sys_io, + bus: Bus::new(sys_mem), + boot_source: Arc::new(Mutex::new(vm_config.clone().boot_source)), + vm_fd: vm_fd.clone(), + vm_state, + power_button: EventFd::new(libc::EFD_NONBLOCK) + .chain_err(|| "Create EventFd for power-button failed.")?, + }; + + // Add mmio devices + vm.add_devices(vm_config)?; + + let vm = Arc::new(vm); + + // Add vcpu object to vm + let cpu_vm: Arc>> = + Arc::new(Box::new(vm.clone())); + for vcpu_id in 0..nrcpus { + #[cfg(target_arch = "aarch64")] + let arch_cpu = ArchCPU::new(&vm_fd, u32::from(vcpu_id)); + + #[cfg(target_arch = "x86_64")] + let arch_cpu = ArchCPU::new(&vm_fd, u32::from(vcpu_id), u32::from(nrcpus)); + + let cpu = CPU::new( + vcpu_fds[vcpu_id as usize].clone(), + vcpu_id, + Arc::new(Mutex::new(arch_cpu)), + cpu_vm.clone(), + )?; + + let mut vcpus = vm.cpus.lock().unwrap(); + let newcpu = Arc::new(cpu); + vcpus.push(newcpu.clone()); + } + + Ok(vm) + } + + /// Calculate the ranges of memory according to architecture. + /// + /// # Arguments + /// + /// * `mem_size` - memory size of VM. + /// + /// # Returns + /// + /// A array of ranges, it's element represents (start_addr, size). 
+ /// On x86_64, there is a gap ranged from (4G - 768M) to 4G, which will be skipped. + fn arch_ram_ranges(mem_size: u64) -> Vec<(u64, u64)> { + // ranges is the vector of (start_addr, size) + let mut ranges = Vec::<(u64, u64)>::new(); + + #[cfg(target_arch = "aarch64")] + ranges.push((DRAM_BASE, mem_size)); + + #[cfg(target_arch = "x86_64")] + { + let gap_start = MEM_MAPPED_IO_BASE; + ranges.push((0, std::cmp::min(gap_start, mem_size))); + if mem_size > gap_start { + let gap_end = MEM_MAPPED_IO_BASE + MEM_MAPPED_IO_SIZE; + ranges.push((gap_end, mem_size - gap_start)); + } + } + + ranges + } + + #[cfg(target_arch = "x86_64")] + fn arch_init(vm_fd: &VmFd) -> Result<()> { + vm_fd.create_irq_chip()?; + vm_fd.set_tss_address(0xfffb_d000 as usize)?; + + let mut pit_config = kvm_pit_config::default(); + pit_config.flags = KVM_PIT_SPEAKER_DUMMY; + vm_fd.create_pit2(pit_config)?; + + Ok(()) + } + + /// Realize `LightMachine` means let all members of `LightMachine` enabled. + #[cfg(target_arch = "aarch64")] + pub fn realize(&self) -> Result<()> { + self.bus + .realize_devices(&self.vm_fd, &self.boot_source, &self.sys_mem)?; + + let boot_source = self.boot_source.lock().unwrap(); + + let (initrd, initrd_size) = match &boot_source.initrd { + Some(rd) => (Some(rd.initrd_file.clone()), rd.initrd_size), + None => (None, 0), + }; + + let bootloader_config = BootLoaderConfig { + kernel: boot_source.kernel_file.clone(), + initrd, + initrd_size: initrd_size as u32, + }; + + let layout = load_kernel(&bootloader_config, &self.sys_mem)?; + if let Some(rd) = &boot_source.initrd { + *rd.initrd_addr.lock().unwrap() = layout.initrd_start; + } + + // need to release lock here, as generate_fdt_node will acquire it later + drop(boot_source); + + let boot_config = CPUBootConfig { + fdt_addr: layout.dtb_start, + kernel_addr: layout.kernel_start, + }; + + for cpu_index in 0..self.cpu_topo.max_cpus { + self.cpus.lock().unwrap()[cpu_index as usize].realize(&boot_config)?; + } + + let mut fdt = vec![0; device_tree::FDT_MAX_SIZE as usize]; + self.generate_fdt_node(&mut fdt)?; + + self.sys_mem.write( + &mut fdt.as_slice(), + GuestAddress(boot_config.fdt_addr as u64), + fdt.len() as u64, + )?; + + self.register_power_event()?; + + Ok(()) + } + + /// Realize `LightMachine` means let all members of `LightMachine` enabled. 
+ #[cfg(target_arch = "x86_64")] + pub fn realize(&self) -> Result<()> { + self.bus.realize_devices( + &self.vm_fd, + &self.boot_source, + &self.sys_mem, + self.sys_io.clone(), + )?; + + let boot_source = self.boot_source.lock().unwrap(); + + // Load kernel image + let (initrd, initrd_size) = match &boot_source.initrd { + Some(rd) => (Some(rd.initrd_file.clone()), rd.initrd_size), + None => (None, 0), + }; + let bootloader_config = BootLoaderConfig { + kernel: boot_source.kernel_file.clone(), + initrd, + initrd_size: initrd_size as u32, + kernel_cmdline: boot_source.kernel_cmdline.to_string(), + cpu_count: self.cpu_topo.nrcpus, + }; + + let layout = load_kernel(&bootloader_config, &self.sys_mem)?; + let boot_config = CPUBootConfig { + boot_ip: layout.kernel_start, + boot_sp: layout.kernel_sp, + zero_page: layout.zero_page_addr, + code_segment: layout.segments.code_segment, + data_segment: layout.segments.data_segment, + gdt_base: layout.segments.gdt_base, + gdt_size: layout.segments.gdt_limit, + idt_base: layout.segments.idt_base, + idt_size: layout.segments.idt_limit, + pml4_start: layout.boot_pml4_addr, + }; + + for cpu_index in 0..self.cpu_topo.max_cpus { + self.cpus.lock().unwrap()[cpu_index as usize].realize(&boot_config)?; + } + + self.register_power_event()?; + + Ok(()) + } + + /// Start VM, changed `LightMachine`'s `vmstate` to `Paused` or + /// `Running`. + /// + /// # Arguments + /// + /// * `paused` - After started, paused all vcpu or not. + /// * `use_seccomp` - If use seccomp sandbox or not. + pub fn vm_start(&self, paused: bool, use_seccomp: bool) -> Result<()> { + let cpus_thread_barrier = Arc::new(Barrier::new((self.cpu_topo.max_cpus + 1) as usize)); + + for cpu_index in 0..self.cpu_topo.max_cpus { + let cpu_thread_barrier = cpus_thread_barrier.clone(); + let cpu = self.cpus.lock().unwrap()[cpu_index as usize].clone(); + CPU::start(cpu, cpu_thread_barrier, paused, use_seccomp)?; + } + + let mut vmstate = self.vm_state.deref().0.lock().unwrap(); + if paused { + *vmstate = KvmVmState::Paused; + } else { + *vmstate = KvmVmState::Running; + } + cpus_thread_barrier.wait(); + + Ok(()) + } + + /// Pause VM, sleepy all vcpu thread. Changed `LightMachine`'s `vmstate` + /// from `Running` to `Paused`. + fn vm_pause(&self) -> Result<()> { + for cpu_index in 0..self.cpu_topo.max_cpus { + self.cpus.lock().unwrap()[cpu_index as usize].pause()?; + } + + #[cfg(target_arch = "aarch64")] + self.irq_chip.stop(); + + let mut vmstate = self.vm_state.deref().0.lock().unwrap(); + *vmstate = KvmVmState::Paused; + + Ok(()) + } + + /// Resume VM, awaken all vcpu thread. Changed `LightMachine`'s `vmstate` + /// from `Paused` to `Running`. + fn vm_resume(&self) -> Result<()> { + for cpu_index in 0..self.cpu_topo.max_cpus { + self.cpus.lock().unwrap()[cpu_index as usize].resume()?; + } + + let mut vmstate = self.vm_state.deref().0.lock().unwrap(); + *vmstate = KvmVmState::Running; + + Ok(()) + } + + /// Destroy VM, kill all vcpu thread. Changed `LightMachine`'s `vmstate` + /// to `KVM_VMSTATE_DESTROY`. 
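+    /// The state is represented here by `KvmVmState::Shutdown`; every vcpu is
+    /// destroyed and the vcpu list is cleared.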
+ fn vm_destroy(&self) -> Result<()> { + let mut vmstate = self.vm_state.deref().0.lock().unwrap(); + *vmstate = KvmVmState::Shutdown; + + let mut cpus = self.cpus.lock().unwrap(); + for cpu_index in 0..self.cpu_topo.max_cpus { + cpus[cpu_index as usize].destroy()?; + } + cpus.clear(); + + Ok(()) + } + + fn register_device(&mut self, dev_builder_ops: &T) -> Result<()> { + dev_builder_ops.build_dev(self.sys_mem.clone(), &mut self.bus) + } + + fn add_devices(&mut self, vm_config: VmConfig) -> Result<()> { + #[cfg(target_arch = "aarch64")] + { + let rtc = Arc::new(Mutex::new(PL031::new())); + self.bus + .attach_device(rtc) + .chain_err(|| "add rtc to bus failed")?; + } + + if let Some(serial) = vm_config.serial { + self.register_device(&serial)?; + } + + if let Some(vsock) = vm_config.vsock { + self.register_device(&vsock)?; + } + + if let Some(drives) = vm_config.drives { + for drive in drives { + self.register_device(&drive)?; + } + } + + if let Some(nets) = vm_config.nets { + for net in nets { + self.register_device(&net)?; + } + } + + if let Some(consoles) = vm_config.consoles { + for console in consoles { + self.register_device(&console)?; + } + } + + Ok(()) + } + + fn register_power_event(&self) -> Result<()> { + let power_button = self.power_button.try_clone().unwrap(); + let button_fd = power_button.as_raw_fd(); + let power_button_handler: Arc>> = + Arc::new(Mutex::new(Box::new(move |_, _| { + let _ret = power_button.read().unwrap(); + None + }))); + + let notifier = EventNotifier::new( + NotifierOperation::AddShared, + button_fd, + None, + EventSet::IN, + vec![power_button_handler], + ); + + MainLoop::update_event(vec![notifier])?; + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + fn generate_serial_device_node( + &self, + dev_info: &DeviceResource, + fdt: &mut Vec, + ) -> util::errors::Result<()> { + let node = format!("/uart@{:x}", dev_info.addr); + device_tree::add_sub_node(fdt, &node)?; + device_tree::set_property_string(fdt, &node, "compatible", "ns16550a")?; + device_tree::set_property_string(fdt, &node, "clock-names", "apb_pclk")?; + device_tree::set_property_u32(fdt, &node, "clocks", device_tree::CLK_PHANDLE)?; + device_tree::set_property_array_u64(fdt, &node, "reg", &[dev_info.addr, dev_info.size])?; + device_tree::set_property_array_u32( + fdt, + &node, + "interrupts", + &[ + device_tree::GIC_FDT_IRQ_TYPE_SPI, + dev_info.irq, + device_tree::IRQ_TYPE_EDGE_RISING, + ], + )?; + + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + fn generate_rtc_device_node( + &self, + dev_info: &DeviceResource, + fdt: &mut Vec, + ) -> util::errors::Result<()> { + let node = format!("/pl031@{:x}", dev_info.addr); + device_tree::add_sub_node(fdt, &node)?; + device_tree::set_property_string(fdt, &node, "compatible", "arm,pl031\0arm,primecell\0")?; + device_tree::set_property_string(fdt, &node, "clock-names", "apb_pclk")?; + device_tree::set_property_u32(fdt, &node, "clocks", device_tree::CLK_PHANDLE)?; + device_tree::set_property_array_u64(fdt, &node, "reg", &[dev_info.addr, dev_info.size])?; + device_tree::set_property_array_u32( + fdt, + &node, + "interrupts", + &[ + device_tree::GIC_FDT_IRQ_TYPE_SPI, + dev_info.irq, + device_tree::IRQ_TYPE_LEVEL_HIGH, + ], + )?; + + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + fn generate_virtio_devices_node( + &self, + dev_info: &DeviceResource, + fdt: &mut Vec, + ) -> util::errors::Result<()> { + let node = format!("/virtio_mmio@{:x}", dev_info.addr); + device_tree::add_sub_node(fdt, &node)?; + device_tree::set_property_string(fdt, &node, "compatible", 
"virtio,mmio")?; + device_tree::set_property_u32(fdt, &node, "interrupt-parent", device_tree::GIC_PHANDLE)?; + device_tree::set_property_array_u64(fdt, &node, "reg", &[dev_info.addr, dev_info.size])?; + device_tree::set_property_array_u32( + fdt, + &node, + "interrupts", + &[ + device_tree::GIC_FDT_IRQ_TYPE_SPI, + dev_info.irq, + device_tree::IRQ_TYPE_EDGE_RISING, + ], + )?; + + Ok(()) + } +} + +impl MachineLifecycle for LightMachine { + fn pause(&self) -> bool { + if self.notify_lifecycle(KvmVmState::Running, KvmVmState::Paused) { + #[cfg(feature = "qmp")] + event!(STOP); + + true + } else { + false + } + } + + fn resume(&self) -> bool { + if !self.notify_lifecycle(KvmVmState::Paused, KvmVmState::Running) { + return false; + } + + #[cfg(feature = "qmp")] + event!(RESUME); + + true + } + + fn destroy(&self) -> bool { + let vmstate = { + let state = self.vm_state.deref().0.lock().unwrap(); + *state + }; + + if !self.notify_lifecycle(vmstate, KvmVmState::Shutdown) { + return false; + } + + true + } + + fn notify_lifecycle(&self, old: KvmVmState, new: KvmVmState) -> bool { + use KvmVmState::*; + + let vmstate = self.vm_state.deref().0.lock().unwrap(); + if *vmstate != old { + error!("Vm lifecycle error: state check failed."); + return false; + } + drop(vmstate); + + match (old, new) { + (Created, Running) => { + if let Err(e) = self.vm_start(false, false) { + error!("Vm lifecycle error:{}", e); + }; + } + (Running, Paused) => { + if let Err(e) = self.vm_pause() { + error!("Vm lifecycle error:{}", e); + }; + } + (Paused, Running) => { + if let Err(e) = self.vm_resume() { + error!("Vm lifecycle error:{}", e); + }; + } + (_, Shutdown) => { + if let Err(e) = self.vm_destroy() { + error!("Vm lifecycle error:{}", e); + }; + self.power_button.write(1).unwrap(); + } + (_, _) => { + error!("Vm lifecycle error: this transform is illegal."); + return false; + } + } + + let vmstate = self.vm_state.deref().0.lock().unwrap(); + if *vmstate != new { + error!("Vm lifecycle error: state transform failed."); + return false; + } + + true + } +} + +impl MachineAddressInterface for LightMachine { + #[cfg(target_arch = "x86_64")] + fn pio_in(&self, addr: u64, mut data: &mut [u8]) -> bool { + // The function pit_calibrate_tsc() in kernel gets stuck if data read from + // io-port 0x61 is not 0x20. 
+ // This problem only happens before Linux version 4.18 (fixed by 368a540e0) + if addr == 0x61 { + data[0] = 0x20; + return true; + } + let length = data.len() as u64; + self.sys_io + .read(&mut data, GuestAddress(addr), length) + .is_ok() + } + + #[cfg(target_arch = "x86_64")] + fn pio_out(&self, addr: u64, mut data: &[u8]) -> bool { + let count = data.len() as u64; + self.sys_io + .write(&mut data, GuestAddress(addr), count) + .is_ok() + } + + fn mmio_read(&self, addr: u64, mut data: &mut [u8]) -> bool { + let length = data.len() as u64; + self.sys_mem + .read(&mut data, GuestAddress(addr), length) + .is_ok() + } + + fn mmio_write(&self, addr: u64, mut data: &[u8]) -> bool { + let count = data.len() as u64; + self.sys_mem + .write(&mut data, GuestAddress(addr), count) + .is_ok() + } +} + +impl DeviceInterface for LightMachine { + #[cfg(feature = "qmp")] + fn query_status(&self) -> qmp::Response { + let vmstate = self.vm_state.deref().0.lock().unwrap(); + let qmp_state = match *vmstate { + KvmVmState::Running => schema::StatusInfo { + singlestep: false, + running: true, + status: schema::RunState::running, + }, + KvmVmState::Paused => schema::StatusInfo { + singlestep: false, + running: true, + status: schema::RunState::paused, + }, + _ => Default::default(), + }; + + qmp::Response::create_response(serde_json::to_value(&qmp_state).unwrap(), None) + } + + #[cfg(feature = "qmp")] + fn query_cpus(&self) -> qmp::Response { + let mut cpu_vec: Vec = Vec::new(); + for cpu_index in 0..self.cpu_topo.max_cpus { + if self.cpu_topo.get_mask(cpu_index as usize) == 1 { + let thread_id = self.cpus.lock().unwrap()[cpu_index as usize].tid(); + let (socketid, coreid, threadid) = self.cpu_topo.get_topo(cpu_index as usize); + let cpu_instance = schema::CpuInstanceProperties { + node_id: None, + socket_id: Some(socketid as isize), + core_id: Some(coreid as isize), + thread_id: Some(threadid as isize), + }; + #[cfg(target_arch = "x86_64")] + { + let cpu_info = schema::CpuInfo::x86 { + current: true, + qom_path: String::from("/machine/unattached/device[") + + &cpu_index.to_string() + + &"]".to_string(), + halted: false, + props: Some(cpu_instance), + CPU: cpu_index as isize, + thread_id: thread_id as isize, + x86: schema::CpuInfoX86 {}, + }; + cpu_vec.push(serde_json::to_value(cpu_info).unwrap()); + } + #[cfg(target_arch = "aarch64")] + { + let cpu_info = schema::CpuInfo::Arm { + current: true, + qom_path: String::from("/machine/unattached/device[") + + &cpu_index.to_string() + + &"]".to_string(), + halted: false, + props: Some(cpu_instance), + CPU: cpu_index as isize, + thread_id: thread_id as isize, + arm: schema::CpuInfoArm {}, + }; + cpu_vec.push(serde_json::to_value(cpu_info).unwrap()); + } + } + } + qmp::Response::create_response(cpu_vec.into(), None) + } + + #[cfg(feature = "qmp")] + fn query_hotpluggable_cpus(&self) -> qmp::Response { + let mut hotplug_vec: Vec = Vec::new(); + #[cfg(target_arch = "x86_64")] + let cpu_type = String::from("host-x86-cpu"); + #[cfg(target_arch = "aarch64")] + let cpu_type = String::from("host-aarch64-cpu"); + + for cpu_index in 0..self.cpu_topo.max_cpus { + if self.cpu_topo.get_mask(cpu_index as usize) == 0 { + let (socketid, coreid, threadid) = self.cpu_topo.get_topo(cpu_index as usize); + let cpu_instance = schema::CpuInstanceProperties { + node_id: None, + socket_id: Some(socketid as isize), + core_id: Some(coreid as isize), + thread_id: Some(threadid as isize), + }; + let hotpluggable_cpu = schema::HotpluggableCPU { + type_: cpu_type.clone(), + vcpus_count: 1, + props: 
cpu_instance, + qom_path: None, + }; + hotplug_vec.push(serde_json::to_value(hotpluggable_cpu).unwrap()); + } else { + let (socketid, coreid, threadid) = self.cpu_topo.get_topo(cpu_index as usize); + let cpu_instance = schema::CpuInstanceProperties { + node_id: None, + socket_id: Some(socketid as isize), + core_id: Some(coreid as isize), + thread_id: Some(threadid as isize), + }; + let hotpluggable_cpu = schema::HotpluggableCPU { + type_: cpu_type.clone(), + vcpus_count: 1, + props: cpu_instance, + qom_path: Some( + String::from("/machine/unattached/device[") + + &cpu_index.to_string() + + &"]".to_string(), + ), + }; + hotplug_vec.push(serde_json::to_value(hotpluggable_cpu).unwrap()); + } + } + qmp::Response::create_response(hotplug_vec.into(), None) + } + + fn device_add( + &self, + id: String, + driver: String, + addr: Option, + lun: Option, + ) -> bool { + // get slot of bus by addr or lun + let mut slot = 0; + if let Some(addr) = addr { + let slot_str = addr.as_str().trim_start_matches("0x"); + + if let Ok(n) = usize::from_str_radix(slot_str, 16) { + slot = n; + } + } else if let Some(lun) = lun { + slot = lun + 1; + } + + self.bus.add_replaceable_device(&id, &driver, slot).is_ok() + } + + fn device_del(&self, device_id: String) -> bool { + match self.bus.del_replaceable_device(&device_id) { + Ok(path) => { + #[cfg(feature = "qmp")] + { + let block_del_event = schema::DEVICE_DELETED { + device: Some(device_id), + path, + }; + event!(DEVICE_DELETED; block_del_event); + } + + true + } + _ => false, + } + } + + fn blockdev_add( + &self, + node_name: String, + file: schema::FileOptions, + cache: Option, + read_only: Option, + ) -> bool { + let read_only = if let Some(ro) = read_only { ro } else { false }; + + let direct = if let Some(cache) = cache { + match cache.direct { + Some(direct) => direct, + _ => true, + } + } else { + true + }; + + let config = DriveConfig { + drive_id: node_name.clone(), + path_on_host: file.filename, + read_only, + direct, + serial_num: None, + }; + + self.bus + .add_replaceable_config(node_name, Arc::new(config)) + .is_ok() + } + + fn netdev_add(&self, id: String, if_name: Option, fds: Option) -> bool { + let mut config = NetworkInterfaceConfig { + iface_id: id.clone(), + host_dev_name: "".to_string(), + mac: None, + tap_fd: None, + vhost_type: None, + vhost_fd: None, + }; + + if let Some(fds) = fds { + let netdev_fd = if fds.contains(':') { + let col: Vec<_> = fds.split(':').collect(); + String::from(col[col.len() - 1]) + } else { + String::from(&fds) + }; + + #[cfg(feature = "qmp")] + { + if let Some(fd_num) = QmpChannel::get_fd(&netdev_fd) { + config.tap_fd = Some(fd_num); + } else { + // try to convert string to RawFd + let fd_num = match netdev_fd.parse::() { + Ok(fd) => fd, + _ => { + error!( + "Add netdev error: failed to convert {} to RawFd.", + netdev_fd + ); + return false; + } + }; + + config.tap_fd = Some(fd_num); + } + } + } else if let Some(if_name) = if_name { + config.host_dev_name = if_name; + } + + self.bus + .add_replaceable_config(id, Arc::new(config)) + .is_ok() + } + + #[cfg(feature = "qmp")] + fn getfd(&self, fd_name: String, if_fd: Option) -> qmp::Response { + if let Some(fd) = if_fd { + QmpChannel::set_fd(fd_name, fd); + qmp::Response::create_empty_response() + } else { + let err_resp = schema::QmpErrorClass::GenericError("Invalid SCM message".to_string()); + qmp::Response::create_error_response(err_resp, None).unwrap() + } + } +} + +impl MachineInterface for LightMachine {} +impl MachineExternalInterface for LightMachine {} + +impl 
MainLoopManager for LightMachine { + fn main_loop_should_exit(&self) -> bool { + let vmstate = self.vm_state.deref().0.lock().unwrap(); + *vmstate == KvmVmState::Shutdown + } + + fn main_loop_cleanup(&self) -> util::errors::Result<()> { + if let Err(e) = std::io::stdin().lock().set_canon_mode() { + error!( + "destroy virtual machine: reset stdin to canonical mode failed, {}", + e + ); + } + + Ok(()) + } +} + +#[cfg(target_arch = "aarch64")] +trait CompileFDTHelper { + fn generate_cpu_nodes(&self, fdt: &mut Vec) -> util::errors::Result<()>; + fn generate_memory_node(&self, fdt: &mut Vec) -> util::errors::Result<()>; + fn generate_devices_node(&self, fdt: &mut Vec) -> util::errors::Result<()>; + fn generate_chosen_node(&self, fdt: &mut Vec) -> util::errors::Result<()>; +} + +#[cfg(target_arch = "aarch64")] +impl CompileFDTHelper for LightMachine { + fn generate_cpu_nodes(&self, fdt: &mut Vec) -> util::errors::Result<()> { + let node = "/cpus"; + + device_tree::add_sub_node(fdt, node)?; + device_tree::set_property_u32(fdt, node, "#address-cells", 0x02)?; + device_tree::set_property_u32(fdt, node, "#size-cells", 0x0)?; + + // Generate CPU topology + if self.cpu_topo.max_cpus > 0 && self.cpu_topo.max_cpus % 8 == 0 { + device_tree::add_sub_node(fdt, "/cpus/cpu-map")?; + + let sockets = self.cpu_topo.max_cpus / 8; + for cluster in 0..u32::from(sockets) { + let clster = format!("/cpus/cpu-map/cluster{}", cluster); + device_tree::add_sub_node(fdt, &clster)?; + + for i in 0..2 as u32 { + let sub_cluster = format!("{}/cluster{}", clster, i); + device_tree::add_sub_node(fdt, &sub_cluster)?; + + let core0 = format!("{}/core0", sub_cluster); + device_tree::add_sub_node(fdt, &core0)?; + let thread0 = format!("{}/thread0", core0); + device_tree::add_sub_node(fdt, &thread0)?; + device_tree::set_property_u32(fdt, &thread0, "cpu", cluster * 8 + i * 4 + 10)?; + + let thread1 = format!("{}/thread1", core0); + device_tree::add_sub_node(fdt, &thread1)?; + device_tree::set_property_u32( + fdt, + &thread1, + "cpu", + cluster * 8 + i * 4 + 10 + 1, + )?; + + let core1 = format!("{}/core1", sub_cluster); + device_tree::add_sub_node(fdt, &core1)?; + let thread0 = format!("{}/thread0", core1); + device_tree::add_sub_node(fdt, &thread0)?; + device_tree::set_property_u32( + fdt, + &thread0, + "cpu", + cluster * 8 + i * 4 + 10 + 2, + )?; + + let thread1 = format!("{}/thread1", core1); + device_tree::add_sub_node(fdt, &thread1)?; + device_tree::set_property_u32( + fdt, + &thread1, + "cpu", + cluster * 8 + i * 4 + 10 + 3, + )?; + } + } + } + + let cpu_list = self.cpus.lock().unwrap(); + for cpu_index in 0..self.cpu_topo.max_cpus { + let mpidr = cpu_list[cpu_index as usize] + .arch() + .lock() + .unwrap() + .get_mpidr(cpu_list[cpu_index as usize].fd()); + + let node = format!("/cpus/cpu@{:x}", mpidr); + device_tree::add_sub_node(fdt, &node)?; + device_tree::set_property_u32( + fdt, + &node, + "phandle", + u32::from(cpu_index) + device_tree::CPU_PHANDLE_START, + )?; + device_tree::set_property_string(fdt, &node, "device_type", "cpu")?; + device_tree::set_property_string(fdt, &node, "compatible", "arm,arm-v8")?; + if self.cpu_topo.max_cpus > 1 { + device_tree::set_property_string(fdt, &node, "enable-method", "psci")?; + } + device_tree::set_property_u64(fdt, &node, "reg", mpidr & 0x007F_FFFF)?; + } + + Ok(()) + } + + fn generate_memory_node(&self, fdt: &mut Vec) -> util::errors::Result<()> { + let mem_size = self.sys_mem.memory_end_address().raw_value() - 0x8000_0000; + let node = "/memory"; + device_tree::add_sub_node(fdt, 
node)?; + device_tree::set_property_string(fdt, node, "device_type", "memory")?; + device_tree::set_property_array_u64(fdt, node, "reg", &[0x8000_0000, mem_size as u64])?; + + Ok(()) + } + + fn generate_devices_node(&self, fdt: &mut Vec) -> util::errors::Result<()> { + // timer + let mut cells: Vec = Vec::new(); + for &irq in [13, 14, 11, 10].iter() { + cells.push(device_tree::GIC_FDT_IRQ_TYPE_PPI); + cells.push(irq); + cells.push(device_tree::IRQ_TYPE_LEVEL_HIGH); + } + let node = "/timer"; + device_tree::add_sub_node(fdt, node)?; + device_tree::set_property_string(fdt, node, "compatible", "arm,armv8-timer")?; + device_tree::set_property(fdt, node, "always-on", None)?; + device_tree::set_property_array_u32(fdt, node, "interrupts", &cells)?; + + // clock + let node = "/apb-pclk"; + device_tree::add_sub_node(fdt, node)?; + device_tree::set_property_string(fdt, node, "compatible", "fixed-clock")?; + device_tree::set_property_string(fdt, node, "clock-output-names", "clk24mhz")?; + device_tree::set_property_u32(fdt, node, "#clock-cells", 0x0)?; + device_tree::set_property_u32(fdt, node, "clock-frequency", 24_000_000)?; + device_tree::set_property_u32(fdt, node, "phandle", device_tree::CLK_PHANDLE)?; + + // psci + let node = "/psci"; + device_tree::add_sub_node(fdt, node)?; + device_tree::set_property_string(fdt, node, "compatible", "arm,psci-0.2")?; + device_tree::set_property_string(fdt, node, "method", "hvc")?; + + for dev_info in self.bus.get_devices_info().iter().rev() { + match dev_info.dev_type { + DeviceType::SERIAL => { + self.generate_serial_device_node(dev_info, fdt)?; + } + DeviceType::RTC => { + self.generate_rtc_device_node(dev_info, fdt)?; + } + _ => { + self.generate_virtio_devices_node(dev_info, fdt)?; + } + } + } + + Ok(()) + } + + fn generate_chosen_node(&self, fdt: &mut Vec) -> util::errors::Result<()> { + let node = "/chosen"; + + let boot_source = self.boot_source.lock().unwrap(); + + device_tree::add_sub_node(fdt, node)?; + let cmdline = &boot_source.kernel_cmdline.to_string(); + device_tree::set_property_string(fdt, node, "bootargs", cmdline.as_str())?; + + match &boot_source.initrd { + Some(initrd) => { + device_tree::set_property_u64( + fdt, + node, + "linux,initrd-start", + *initrd.initrd_addr.lock().unwrap(), + )?; + device_tree::set_property_u64( + fdt, + node, + "linux,initrd-end", + *initrd.initrd_addr.lock().unwrap() + initrd.initrd_size, + )?; + } + None => {} + } + + Ok(()) + } +} + +#[cfg(target_arch = "aarch64")] +impl device_tree::CompileFDT for LightMachine { + fn generate_fdt_node(&self, fdt: &mut Vec) -> util::errors::Result<()> { + device_tree::create_device_tree(fdt)?; + + device_tree::set_property_string(fdt, "/", "compatible", "linux,dummy-virt")?; + device_tree::set_property_u32(fdt, "/", "#address-cells", 0x2)?; + device_tree::set_property_u32(fdt, "/", "#size-cells", 0x2)?; + device_tree::set_property_u32(fdt, "/", "interrupt-parent", device_tree::GIC_PHANDLE)?; + + self.generate_cpu_nodes(fdt)?; + self.generate_memory_node(fdt)?; + self.generate_devices_node(fdt)?; + self.generate_chosen_node(fdt)?; + self.irq_chip.generate_fdt_node(fdt)?; + + Ok(()) + } +} diff --git a/device_model/src/mmio/bus.rs b/device_model/src/mmio/bus.rs new file mode 100644 index 00000000..46905635 --- /dev/null +++ b/device_model/src/mmio/bus.rs @@ -0,0 +1,391 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. 
+// You can use this software according to the terms and conditions of the Mulan
+// PSL v2.
+// You may obtain a copy of Mulan PSL v2 at:
+// http://license.coscl.org.cn/MulanPSL2
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+// See the Mulan PSL v2 for more details.
+
+use std::sync::{Arc, Mutex};
+
+use address_space::{AddressSpace, RegionOps};
+use kvm_ioctls::VmFd;
+use machine_manager::config::{BootSource, ConfigCheck};
+
+use super::super::virtio::{Block, Net};
+use super::{
+    errors::Result, DeviceResource, DeviceType, MmioDevice, MmioDeviceOps, VirtioMmioDevice,
+};
+use crate::micro_vm::MEM_MAPPED_IO_BASE;
+
+#[cfg(target_arch = "aarch64")]
+const IRQ_RANGE: (u32, u32) = (32, 191);
+#[cfg(target_arch = "x86_64")]
+const IRQ_RANGE: (u32, u32) = (5, 15);
+
+const MMIO_SERIAL_IRQ: u32 = 4;
+const MMIO_SERIAL_ADDR: u64 = 0x3f8;
+const MMIO_LEN: u64 = 0x1000;
+
+/// The replaceable block device maximum count.
+pub const MMIO_REPLACEABLE_BLK_NR: usize = 6;
+/// The replaceable network device maximum count.
+pub const MMIO_REPLACEABLE_NET_NR: usize = 2;
+
+/// The config of a replaceable device.
+struct MmioReplaceableConfig {
+    /// Device id.
+    id: String,
+    /// The dev_config of the related backend device.
+    dev_config: Arc<dyn ConfigCheck>,
+}
+
+/// The device information of a replaceable device.
+struct MmioReplaceableDevInfo {
+    /// The related MMIO device.
+    device: MmioDevice,
+    /// Device id.
+    id: String,
+    /// Identifies whether this device is currently in use.
+    used: bool,
+}
+
+/// The collection of configs, device information and counters for all replaceable devices.
+struct MmioReplaceableInfo {
+    /// The array of all replaceable configs.
+    configs: Arc<Mutex<Vec<MmioReplaceableConfig>>>,
+    /// The array of all replaceable device information.
+    devices: Arc<Mutex<Vec<MmioReplaceableDevInfo>>>,
+    /// The count of block devices that have been plugged in.
+    block_count: usize,
+    /// The count of network devices that have been plugged in.
+    net_count: usize,
+}
+
+impl MmioReplaceableInfo {
+    pub fn new() -> Self {
+        MmioReplaceableInfo {
+            configs: Arc::new(Mutex::new(Vec::new())),
+            devices: Arc::new(Mutex::new(Vec::new())),
+            block_count: 0_usize,
+            net_count: 0_usize,
+        }
+    }
+}
+
+/// MMIO Bus.
+pub struct Bus {
+    /// The devices inserted into the bus.
+    devices: Vec<MmioDevice>,
+    /// All replaceable device information.
+    replaceable_info: MmioReplaceableInfo,
+}
+
+impl Bus {
+    /// Initialize the MMIO Bus structure.
+    ///
+    /// # Steps
+    ///
+    /// 1. Initialize the MMIO Bus.
+    /// 2. Prepare the replaceable information of block and network devices.
+    ///
+    /// # Arguments
+    ///
+    /// * `sys_mem` - Guest memory.
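+    ///
+    /// The constructor pre-attaches `MMIO_REPLACEABLE_BLK_NR` placeholder block
+    /// devices and `MMIO_REPLACEABLE_NET_NR` placeholder network devices, so that
+    /// a later hot-plug request only has to fill in an existing slot.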
+ pub fn new(sys_mem: Arc) -> Self { + let mut bus = Bus { + devices: Vec::new(), + replaceable_info: MmioReplaceableInfo::new(), + }; + + for _ in 0..MMIO_REPLACEABLE_BLK_NR { + let block = Arc::new(Mutex::new(Block::new())); + let device = Arc::new(Mutex::new(VirtioMmioDevice::new(sys_mem.clone(), block))); + if let Ok(dev) = bus.attach_device(device.clone()) { + bus.replaceable_info + .devices + .lock() + .unwrap() + .push(MmioReplaceableDevInfo { + device: dev, + id: "".to_string(), + used: false, + }); + } + } + + for _ in 0..MMIO_REPLACEABLE_NET_NR { + let net = Arc::new(Mutex::new(Net::new())); + let device = Arc::new(Mutex::new(VirtioMmioDevice::new(sys_mem.clone(), net))); + if let Ok(dev) = bus.attach_device(device.clone()) { + bus.replaceable_info + .devices + .lock() + .unwrap() + .push(MmioReplaceableDevInfo { + device: dev, + id: "".to_string(), + used: false, + }); + } + } + + bus + } + + /// Attach a MMIO device to Bus. + /// + /// # Arguments + /// + /// * `device` - MMIO device. + /// + /// # Errors + /// + /// Return Error if irq number exceed the limit as Arch spec defined. + pub fn attach_device( + &mut self, + device: Arc>, + ) -> Result { + let device_type = device.lock().unwrap().get_type(); + let index = self.devices.len(); + + let resource = match device_type { + DeviceType::SERIAL if cfg!(target_arch = "x86_64") => DeviceResource { + addr: MMIO_SERIAL_ADDR, + size: 8, + irq: MMIO_SERIAL_IRQ, + dev_type: device_type, + }, + _ => DeviceResource { + addr: MEM_MAPPED_IO_BASE + index as u64 * MMIO_LEN, + size: MMIO_LEN, + irq: IRQ_RANGE.0 + index as u32, + dev_type: device_type, + }, + }; + + if resource.irq > IRQ_RANGE.1 { + bail!("irq {} exceed max value {}", resource.irq, IRQ_RANGE.1); + } + + let device = MmioDevice { + device: device.clone(), + dev_region: device, + resource: Arc::new(resource), + }; + + self.devices.push(device.clone()); + + Ok(device) + } + + /// Get the information of all devices inserted in bus. + #[cfg(target_arch = "aarch64")] + pub fn get_devices_info(&self) -> Vec { + let mut infos = Vec::new(); + + for dev in self.devices.iter() { + infos.push(dev.get_resource()) + } + + infos + } + + /// Get an unused entry of replaceable_info, then fill the fields and mark it as `used`. + /// + /// # Arguments + /// + /// * `id` - Device id. + /// * `path` - Related backend device path. + /// * `dev_type` - MMIO device type. + /// + /// # Errors + /// + /// Returns Error if the device number exceed the Max count. 
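+    ///
+    /// Block devices occupy the first `MMIO_REPLACEABLE_BLK_NR` entries of the
+    /// replaceable table; network devices use the entries after them.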
+ pub fn fill_replaceable_device( + &mut self, + id: &str, + dev_config: Arc, + dev_type: DeviceType, + ) -> Result<()> { + let index = match dev_type { + DeviceType::BLK => { + let index = self.replaceable_info.block_count; + if index >= MMIO_REPLACEABLE_BLK_NR { + return Err("Index is out of bounds".into()); + } + self.replaceable_info.block_count += 1; + index + } + DeviceType::NET => { + let index = self.replaceable_info.net_count + MMIO_REPLACEABLE_BLK_NR; + if index >= MMIO_REPLACEABLE_BLK_NR + MMIO_REPLACEABLE_NET_NR { + return Err("Index is out of bounds".into()); + } + self.replaceable_info.net_count += 1; + index + } + _ => { + return Err("Device Type is unsupported".into()); + } + }; + + let mut replaceable_devices = self.replaceable_info.devices.lock().unwrap(); + if let Some(device_info) = replaceable_devices.get_mut(index) { + if device_info.used { + return Err(format!("The index{} is used, {}", index, id).into()); + } else { + device_info.id = id.to_string(); + device_info.used = true; + device_info.device.update_config(Some(dev_config.clone()))?; + } + } + + self.add_replaceable_config(id.to_string(), dev_config)?; + + Ok(()) + } + + /// Add new config into replaceable_info configs arrays. + /// + /// # Arguments + /// + /// * `id` - Device id. + /// * `path` - Related backend device path. + pub fn add_replaceable_config( + &self, + id: String, + dev_config: Arc, + ) -> Result<()> { + let mut configs_lock = self.replaceable_info.configs.lock().unwrap(); + if configs_lock.len() >= MMIO_REPLACEABLE_BLK_NR + MMIO_REPLACEABLE_NET_NR { + bail!("Replaceable configs size extend the max size."); + } + + for config in configs_lock.iter() { + if config.id == id { + bail!("Add the id {} repeatedly", id); + } + } + + let config = MmioReplaceableConfig { id, dev_config }; + configs_lock.push(config); + + Ok(()) + } + + /// Get an unused entry of replaceable_info which is indexed by `slot`, + /// then update the fields and mark it as `used`. + /// + /// # Arguments + /// + /// * `id` - Device id. + /// * `driver` - Driver type passed in by HotPlug. + /// * `slot` - The index of replaceable_info entries. + /// + /// # Errors + /// + /// Returns Error if the entry is already used. + pub fn add_replaceable_device(&self, id: &str, driver: &str, slot: usize) -> Result<()> { + let index = if driver.contains("net") { + if slot >= MMIO_REPLACEABLE_NET_NR { + bail!("Index is out of bounds"); + } + slot + MMIO_REPLACEABLE_BLK_NR + } else if driver.contains("blk") { + if slot >= MMIO_REPLACEABLE_BLK_NR { + bail!("Index is out of bounds"); + } + slot + } else { + bail!("Unsupported replaceable device type, type: {}", driver); + }; + + let configs_lock = self.replaceable_info.configs.lock().unwrap(); + // find the configuration by id + let mut dev_config = None; + for config in configs_lock.iter() { + if config.id == id { + dev_config = Some(config.dev_config.clone()); + } + } + + if dev_config.is_none() { + bail!("Failed to find the configuration {} ", id); + } + + // find the replaceable device and replace it + let mut replaceable_devices = self.replaceable_info.devices.lock().unwrap(); + if let Some(device_info) = replaceable_devices.get_mut(index) { + if device_info.used { + bail!("The slot{} is used, {}", slot, id); + } else { + device_info.id = id.to_string(); + device_info.used = true; + device_info.device.update_config(dev_config)?; + } + } + + Ok(()) + } + + /// Find the entry of replaceable_info which is specified by `id`, + /// then update the fields and mark it as `unused`. 
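+    /// The corresponding entry is also removed from the replaceable configs.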
+    ///
+    /// # Arguments
+    ///
+    /// * `id` - Device id.
+    pub fn del_replaceable_device(&self, id: &str) -> Result<String> {
+        // find the index of configuration by name and remove it
+        let mut configs_lock = self.replaceable_info.configs.lock().unwrap();
+        for (index, config) in configs_lock.iter().enumerate() {
+            if config.id == id {
+                configs_lock.remove(index);
+                break;
+            }
+        }
+
+        // set the status of the device to 'unused'
+        let mut replaceable_devices = self.replaceable_info.devices.lock().unwrap();
+        for device_info in replaceable_devices.iter_mut() {
+            if device_info.id == id {
+                device_info.id = "".to_string();
+                device_info.used = false;
+                device_info.device.update_config(None)?;
+            }
+        }
+
+        Ok(id.to_string())
+    }
+
+    /// Realize all the devices inserted in this Bus.
+    ///
+    /// # Arguments
+    ///
+    /// * `vm_fd` - The file descriptor of VM.
+    /// * `bs` - The boot source of VM.
+    /// * `sys_mem` - The guest memory that devices are constructed over.
+    pub fn realize_devices(
+        &self,
+        vm_fd: &VmFd,
+        bs: &Arc<Mutex<BootSource>>,
+        sys_mem: &Arc<AddressSpace>,
+        #[cfg(target_arch = "x86_64")] sys_io: Arc<AddressSpace>,
+    ) -> Result<()> {
+        for device in &self.devices {
+            device.realize(
+                vm_fd,
+                &bs,
+                &sys_mem,
+                #[cfg(target_arch = "x86_64")]
+                sys_io.clone(),
+            )?;
+        }
+
+        Ok(())
+    }
+}
diff --git a/device_model/src/mmio/mod.rs b/device_model/src/mmio/mod.rs
new file mode 100644
index 00000000..b9787bef
--- /dev/null
+++ b/device_model/src/mmio/mod.rs
@@ -0,0 +1,179 @@
+// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved.
+//
+// StratoVirt is licensed under Mulan PSL v2.
+// You can use this software according to the terms and conditions of the Mulan
+// PSL v2.
+// You may obtain a copy of Mulan PSL v2 at:
+// http://license.coscl.org.cn/MulanPSL2
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+// See the Mulan PSL v2 for more details.
+
+//! # MMIO
+//!
+//! This module is used to create MMIO devices.
+//!
+//! ## Design
+//!
+//! This module offers support for:
+//! 1. `DeviceType` to identify NET, BLK, SERIAL...
+//! 2. MMIO device structure.
+//! 3. MMIO device trait.
+//!
+//! ## Platform Support
+//!
+//! - `x86_64`
+//! - `aarch64`
+use kvm_ioctls::VmFd;
+use std::sync::{Arc, Mutex};
+
+mod bus;
+mod virtio_mmio;
+
+pub use self::bus::Bus;
+pub use self::virtio_mmio::VirtioMmioDevice;
+
+use address_space::{AddressSpace, Region, RegionOps};
+use error_chain::bail;
+use machine_manager::config::{BootSource, ConfigCheck, Param};
+
+pub mod errors {
+    error_chain! {
+        links {
+            AddressSpace(address_space::errors::Error, address_space::errors::ErrorKind);
+            Virtio(crate::virtio::errors::Error, crate::virtio::errors::ErrorKind);
+        }
+        errors {
+            MmioRegister(offset: u64) {
+                display("Unsupported mmio register, 0x{:x}", offset)
+            }
+            DeviceStatus(status: u32) {
+                display("Invalid device status 0x{:x}", status)
+            }
+        }
+    }
+}
+use self::errors::Result;
+
+/// The different types of MMIO device.
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub enum DeviceType {
+    NET,
+    BLK,
+    SERIAL,
+    #[cfg(target_arch = "aarch64")]
+    RTC,
+    OTHER,
+}
+
+/// The address space and irq number required by an MMIO device.
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub struct DeviceResource {
+    /// Address space start address.
+    pub addr: u64,
+    /// Address space size.
+    pub size: u64,
+    /// Interrupt irq number.
+    pub irq: u32,
+    /// MMIO device type.
+ pub dev_type: DeviceType, +} + +/// MmioDevice structure which used to register into system address space. +#[derive(Clone)] +pub struct MmioDevice { + /// MmioDeviceOps used to be invoked in function realize(). + device: Arc>, + /// RegionOps used to be registered into system address space. + dev_region: Arc>, + /// The DeviceResource required by this MMIO device. + resource: Arc, +} + +impl MmioDevice { + /// Realize this MMIO device for VM. + /// + /// # Arguments + /// + /// * `vm_fd` - The file descriptor of VM. + /// * `bs` - The boot source of VM. + /// * `sys_mem` - The guest memory to device constructs over. + pub fn realize( + &self, + vm_fd: &VmFd, + bs: &Arc>, + sys_mem: &Arc, + #[cfg(target_arch = "x86_64")] sys_io: Arc, + ) -> Result<()> { + self.device.lock().unwrap().realize(vm_fd, *self.resource)?; + + match self.resource.dev_type { + DeviceType::SERIAL if cfg!(target_arch = "x86_64") => { + #[cfg(target_arch = "x86_64")] + sys_io.root().add_subregion( + Region::init_io_region(self.resource.size, self.dev_region.clone()), + self.resource.addr, + )?; + } + _ => { + sys_mem.root().add_subregion( + Region::init_io_region(self.resource.size, self.dev_region.clone()), + self.resource.addr, + )?; + } + } + + // add to kernel cmdline + let cmdline = &mut bs.lock().unwrap().kernel_cmdline; + if let DeviceType::SERIAL = self.resource.dev_type { + #[cfg(target_arch = "aarch64")] + cmdline.push(Param { + param_type: "earlycon".to_string(), + value: format!("uart,mmio,0x{:08x}", self.resource.addr), + }); + } else { + #[cfg(target_arch = "x86_64")] + cmdline.push(Param { + param_type: "virtio_mmio.device".to_string(), + value: format!( + "{}K@0x{:08x}:{}", + self.resource.size / 1024, + self.resource.addr, + self.resource.irq + ), + }); + } + + Ok(()) + } + + /// Get the resource requirement of MMIO device. + #[cfg(target_arch = "aarch64")] + pub fn get_resource(&self) -> DeviceResource { + *self.resource + } + + /// Update the low level config of MMIO device. + /// + /// # Arguments + /// + /// * `file_path` - For Block device is image path; For Net device is tap path. + pub fn update_config(&self, dev_config: Option>) -> Result<()> { + self.device.lock().unwrap().update_config(dev_config) + } +} + +/// Trait for MMIO device. +pub trait MmioDeviceOps: Send { + /// Realize this MMIO device for VM. + fn realize(&mut self, vm_fd: &VmFd, resource: DeviceResource) -> Result<()>; + + /// Get the resource requirement of MMIO device. + fn get_type(&self) -> DeviceType; + + /// Update the low level config of MMIO device. + fn update_config(&mut self, _dev_config: Option>) -> Result<()> { + bail!("Unsupported to update configuration"); + } +} diff --git a/device_model/src/mmio/virtio_mmio.rs b/device_model/src/mmio/virtio_mmio.rs new file mode 100644 index 00000000..d642fc05 --- /dev/null +++ b/device_model/src/mmio/virtio_mmio.rs @@ -0,0 +1,1213 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
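+
+//! The virtio-mmio transport layer: the register interface that the guest
+//! driver reads and writes (magic value, feature, queue and status registers,
+//! interrupt acknowledgement and the device configuration space), as laid out
+//! by the register constants below.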
+ +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; + +use address_space::{AddressRange, AddressSpace, GuestAddress, RegionIoEventFd, RegionOps}; +use byteorder::{ByteOrder, LittleEndian}; +use kvm_ioctls::VmFd; +use machine_manager::config::ConfigCheck; +use vmm_sys_util::eventfd::EventFd; + +use super::super::virtio::{ + virtio_has_feature, Queue, QueueConfig, VirtioDevice, NOTIFY_REG_OFFSET, + QUEUE_TYPE_PACKED_VRING, QUEUE_TYPE_SPLIT_VRING, VIRTIO_F_RING_PACKED, VIRTIO_TYPE_BLOCK, + VIRTIO_TYPE_NET, +}; + +use super::errors::{ErrorKind, Result, ResultExt}; +use super::{DeviceResource, DeviceType, MmioDeviceOps}; + +/// Registers of virtio-mmio device refer to Virtio Spec. +/// Magic value - Read Only. +const MAGIC_VALUE_REG: u64 = 0x00; +/// Virtio device version - Read Only. +const VERSION_REG: u64 = 0x04; +/// Virtio device ID - Read Only. +const DEVICE_ID_REG: u64 = 0x08; +/// Virtio vendor ID - Read Only. +const VENDOR_ID_REG: u64 = 0x0c; +/// Bitmask of the features supported by the device(host) (32 bits per set) - Read Only. +const DEVICE_FEATURES_REG: u64 = 0x10; +/// Device (host) features set selector - Write Only. +const DEVICE_FEATURES_SEL_REG: u64 = 0x14; +/// Bitmask of features activated by the driver (guest) (32 bits per set) - Write Only. +const DRIVER_FEATURES_REG: u64 = 0x20; +/// Activated features set selector - Write Only. +const DRIVER_FEATURES_SEL_REG: u64 = 0x24; +/// Queue selector - Write Only. +const QUEUE_SEL_REG: u64 = 0x30; +/// Maximum size of the currently selected queue - Read Only. +const QUEUE_NUM_MAX_REG: u64 = 0x34; +/// Queue size for the currently selected queue - Write Only. +const QUEUE_NUM_REG: u64 = 0x38; +/// Ready bit for the currently selected queue - Read Write. +const QUEUE_READY_REG: u64 = 0x44; +/// Interrupt status - Read Only. +const INTERRUPT_STATUS_REG: u64 = 0x60; +/// Interrupt acknowledge - Write Only. +const INTERRUPT_ACK_REG: u64 = 0x64; +/// Device status register - Read Write. +const STATUS_REG: u64 = 0x70; +/// The low 32bit of queue's Descriptor Table address. +const QUEUE_DESC_LOW_REG: u64 = 0x80; +/// The high 32bit of queue's Descriptor Table address. +const QUEUE_DESC_HIGH_REG: u64 = 0x84; +/// The low 32 bit of queue's Available Ring address. +const QUEUE_AVAIL_LOW_REG: u64 = 0x90; +/// The high 32 bit of queue's Available Ring address. +const QUEUE_AVAIL_HIGH_REG: u64 = 0x94; +/// The low 32bit of queue's Used Ring address. +const QUEUE_USED_LOW_REG: u64 = 0xa0; +/// The high 32bit of queue's Used Ring address. +const QUEUE_USED_HIGH_REG: u64 = 0xa4; +/// Configuration atomicity value. +const CONFIG_GENERATION_REG: u64 = 0xfc; + +const VENDOR_ID: u32 = 0; +const MMIO_MAGIC_VALUE: u32 = 0x7472_6976; +const MMIO_VERSION: u32 = 2; + +const CONFIG_STATUS_ACKNOWLEDGE: u32 = 0x01; +const CONFIG_STATUS_DRIVER: u32 = 0x02; +const CONFIG_STATUS_DRIVER_OK: u32 = 0x04; +const CONFIG_STATUS_FEATURES_OK: u32 = 0x08; +const CONFIG_STATUS_FAILED: u32 = 0x80; + +/// HostNotifyInfo includes the info needed for notifying backend from guest. +pub struct HostNotifyInfo { + /// Eventfds which notify backend to use the avail ring. + events: Vec, +} + +impl HostNotifyInfo { + pub fn new(queue_num: usize) -> Self { + let mut events = Vec::new(); + for _i in 0..queue_num { + events.push(EventFd::new(libc::EFD_NONBLOCK).unwrap()); + } + + HostNotifyInfo { events } + } +} + +/// The configuration of virtio-mmio device, the fields refer to Virtio Spec. 
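+/// It tracks feature negotiation (the device/driver feature selectors), the
+/// interrupt status shared with the backend, the overall device status, and the
+/// per-queue configuration selected through `QUEUE_SEL_REG`.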
+pub struct VirtioMmioCommonConfig { + /// Bitmask of the features supported by the device (host)(32 bits per set). + features_select: u32, + /// Device (host) feature-setting selector. + acked_features_select: u32, + /// Interrupt status. + interrupt_status: Arc, + /// Device status. + device_status: u32, + /// Configuration atomicity value. + config_generation: u32, + /// Queue selector. + queue_select: u32, + /// The configuration of queues. + queues_config: Vec, + /// The type of queue, either be split ring or packed ring. + queue_type: u16, +} + +impl VirtioMmioCommonConfig { + pub fn new(device: &Arc>) -> Self { + let locked_device = device.lock().unwrap(); + let mut queues_config = Vec::new(); + let queue_size = locked_device.queue_size(); + for _ in 0..locked_device.queue_num() { + queues_config.push(QueueConfig::new(queue_size)) + } + + VirtioMmioCommonConfig { + features_select: 0, + acked_features_select: 0, + interrupt_status: Arc::new(AtomicU32::new(0)), + device_status: 0, + config_generation: 0, + queue_select: 0, + queues_config, + queue_type: QUEUE_TYPE_SPLIT_VRING, + } + } + + /// Check whether virtio device status is as expected. + fn check_device_status(&self, set: u32, clr: u32) -> bool { + self.device_status & (set | clr) == set + } + + /// Get mutable QueueConfig structure of virtio device. + fn get_mut_queue_config(&mut self) -> Result<&mut QueueConfig> { + if self.check_device_status( + CONFIG_STATUS_FEATURES_OK, + CONFIG_STATUS_DRIVER_OK | CONFIG_STATUS_FAILED, + ) { + self.queues_config + .get_mut(self.queue_select as usize) + .ok_or_else(|| "Mmio-reg queue_select overflows".into()) + } else { + Err(ErrorKind::DeviceStatus(self.device_status).into()) + } + } + + /// Get immutable QueueConfig structure of virtio device. + fn get_queue_config(&self) -> Result<&QueueConfig> { + self.queues_config + .get(self.queue_select as usize) + .ok_or_else(|| "Mmio-reg queue_select overflows".into()) + } + + /// Read data from the common config of virtio device. + /// Return the config value in u32. + /// # Arguments + /// + /// * `device` - Virtio device entity. + /// * `offset` - The offset of common config. + fn read_common_config( + &self, + device: &Arc>, + offset: u64, + ) -> Result { + let value = match offset { + MAGIC_VALUE_REG => MMIO_MAGIC_VALUE, + VERSION_REG => MMIO_VERSION, + DEVICE_ID_REG => device.lock().unwrap().device_type() as u32, + VENDOR_ID_REG => VENDOR_ID, + DEVICE_FEATURES_REG => { + let mut features = device + .lock() + .unwrap() + .get_device_features(self.features_select); + if self.features_select == 1 { + features |= 0x1; // enable support of VirtIO Version 1 + } + features + } + QUEUE_NUM_MAX_REG => self + .get_queue_config() + .map(|config| u32::from(config.max_size))?, + QUEUE_READY_REG => self.get_queue_config().map(|config| config.ready as u32)?, + INTERRUPT_STATUS_REG => self.interrupt_status.load(Ordering::SeqCst), + STATUS_REG => self.device_status, + CONFIG_GENERATION_REG => self.config_generation, + _ => { + return Err(ErrorKind::MmioRegister(offset).into()); + } + }; + + Ok(value) + } + + /// Write data to the common config of virtio device. + /// + /// # Arguments + /// + /// * `device` - Virtio device entity. + /// * `offset` - The offset of common config. + /// * `value` - The value to write. + /// + /// # Errors + /// + /// Returns Error if the offset is out of bound. 
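+    ///
+    /// The 64-bit queue addresses (descriptor table, available ring and used
+    /// ring) are assembled from separate low and high 32-bit register writes.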
+ fn write_common_config( + &mut self, + device: &Arc>, + offset: u64, + value: u32, + ) -> Result<()> { + match offset { + DEVICE_FEATURES_SEL_REG => self.features_select = value, + DRIVER_FEATURES_REG => { + if self.check_device_status( + CONFIG_STATUS_DRIVER, + CONFIG_STATUS_FEATURES_OK | CONFIG_STATUS_FAILED, + ) { + device + .lock() + .unwrap() + .set_driver_features(self.acked_features_select, value); + if self.acked_features_select == 1 + && virtio_has_feature(u64::from(value) << 32, VIRTIO_F_RING_PACKED) + { + self.queue_type = QUEUE_TYPE_PACKED_VRING; + } + } else { + return Err(ErrorKind::DeviceStatus(self.device_status).into()); + } + } + DRIVER_FEATURES_SEL_REG => self.acked_features_select = value, + QUEUE_SEL_REG => self.queue_select = value, + QUEUE_NUM_REG => self + .get_mut_queue_config() + .map(|config| config.size = value as u16)?, + QUEUE_READY_REG => self + .get_mut_queue_config() + .map(|config| config.ready = value == 1)?, + INTERRUPT_ACK_REG => { + if self.check_device_status(CONFIG_STATUS_DRIVER_OK, 0) { + self.interrupt_status.fetch_and(!value, Ordering::SeqCst); + } + } + STATUS_REG => self.device_status = value, + QUEUE_DESC_LOW_REG => self.get_mut_queue_config().map(|config| { + config.desc_table = GuestAddress(config.desc_table.0 | u64::from(value)); + })?, + QUEUE_DESC_HIGH_REG => self.get_mut_queue_config().map(|config| { + config.desc_table = GuestAddress(config.desc_table.0 | (u64::from(value) << 32)); + })?, + QUEUE_AVAIL_LOW_REG => self.get_mut_queue_config().map(|config| { + config.avail_ring = GuestAddress(config.avail_ring.0 | u64::from(value)); + })?, + QUEUE_AVAIL_HIGH_REG => self.get_mut_queue_config().map(|config| { + config.avail_ring = GuestAddress(config.avail_ring.0 | (u64::from(value) << 32)); + })?, + QUEUE_USED_LOW_REG => self.get_mut_queue_config().map(|config| { + config.used_ring = GuestAddress(config.used_ring.0 | u64::from(value)); + })?, + QUEUE_USED_HIGH_REG => self.get_mut_queue_config().map(|config| { + config.used_ring = GuestAddress(config.used_ring.0 | (u64::from(value) << 32)); + })?, + _ => { + return Err(ErrorKind::MmioRegister(offset).into()); + } + }; + Ok(()) + } +} + +/// virtio-mmio device structure. +pub struct VirtioMmioDevice { + /// The entity of low level device. + device: Arc>, + /// Identify if this device is activated by frontend driver. + device_activated: bool, + /// EventFd used to send interrupt to VM + interrupt_evt: EventFd, + /// HostNotifyInfo used for guest notifier + host_notify_info: HostNotifyInfo, + /// Virtio common config refer to Virtio Spec. + common_config: VirtioMmioCommonConfig, + /// System address space. + mem_space: Arc, +} + +impl VirtioMmioDevice { + pub fn new(mem_space: Arc, device: Arc>) -> Self { + let device_clone = device.clone(); + let queue_num = device_clone.lock().unwrap().queue_num(); + + VirtioMmioDevice { + device, + device_activated: false, + interrupt_evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + host_notify_info: HostNotifyInfo::new(queue_num), + common_config: VirtioMmioCommonConfig::new(&device_clone), + mem_space, + } + } + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. 
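+    ///
+    /// Activation turns every negotiated `QueueConfig` into a `Queue`, checks it
+    /// against guest memory, clones the per-queue notify eventfds and hands them,
+    /// together with the interrupt eventfd, to the backend device's `activate`.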
+ fn activate(&mut self) -> Result<()> { + let queues_config = &self.common_config.queues_config; + let mut queues: Vec>> = Vec::with_capacity(queues_config.len()); + for q_config in queues_config.iter() { + let queue = Queue::new(*q_config, self.common_config.queue_type)?; + if !queue.is_valid(&self.mem_space) { + bail!("Invalid queue"); + } + queues.push(Arc::new(Mutex::new(queue))) + } + + let mut queue_evts = Vec::::new(); + for fd in self.host_notify_info.events.iter() { + let evt_fd_clone = match fd.try_clone() { + Ok(fd) => fd, + Err(e) => { + error!("Failed to clone IoEventFd, {}", e); + continue; + } + }; + queue_evts.push(evt_fd_clone); + } + self.device.lock().unwrap().activate( + self.mem_space.clone(), + self.interrupt_evt.try_clone().unwrap(), + self.common_config.interrupt_status.clone(), + queues, + queue_evts, + )?; + + Ok(()) + } +} + +impl RegionOps for VirtioMmioDevice { + /// Read data by virtio driver from VM. + fn read(&mut self, data: &mut [u8], _base: GuestAddress, offset: u64) -> bool { + match offset { + 0x00..=0xff if data.len() == 4 => { + let value = + if let Ok(v) = self.common_config.read_common_config(&self.device, offset) { + v + } else { + error!("Failed to read mmio register"); + return false; + }; + LittleEndian::write_u32(data, value); + } + 0x100..=0xfff => { + if self + .device + .lock() + .unwrap() + .read_config(offset as u64 - 0x100, data) + .is_err() + { + error!("Failed to read virtio-dev config space"); + return false; + } + } + _ => { + warn!( + "Failed to read mmio register: overflows, offset is 0x{:x}", + offset, + ); + } + }; + true + } + + /// Write data by virtio driver from VM. + fn write(&mut self, data: &[u8], _base: GuestAddress, offset: u64) -> bool { + match offset { + 0x00..=0xff if data.len() == 4 => { + let value = LittleEndian::read_u32(data); + match self + .common_config + .write_common_config(&self.device, offset, value) + { + Ok(_) => {} + Err(err) => { + error!("Failed to write mmio register, err: {}", err); + return false; + } + }; + + if self.common_config.check_device_status( + CONFIG_STATUS_ACKNOWLEDGE + | CONFIG_STATUS_DRIVER + | CONFIG_STATUS_DRIVER_OK + | CONFIG_STATUS_FEATURES_OK, + CONFIG_STATUS_FAILED, + ) && !self.device_activated + { + let res = self.activate().map(|_| self.device_activated = true); + if let Err(e) = res { + error!( + "Failed to activate dev, type: {:#?}, err: {:#?}", + self.device.lock().unwrap().device_type(), + e + ); + } + } + } + 0x100..=0xfff => { + if self + .common_config + .check_device_status(CONFIG_STATUS_DRIVER, CONFIG_STATUS_FAILED) + { + if self + .device + .lock() + .unwrap() + .write_config(offset as u64 - 0x100, data) + .is_err() + { + error!("Failed to write virtio-dev config space"); + return false; + } + } else { + error!("Failed to write virtio-dev config space: driver is not ready"); + return false; + } + } + _ => { + warn!( + "Failed to write mmio register: overflows, offset is 0x{:x}", + offset, + ); + return false; + } + } + true + } + + /// Return the ioeventfds of device, + /// these fds will be register to `KVM` and used for guest notifier. 
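+    ///
+    /// Each returned `RegionIoEventFd` matches a guest write of the queue index
+    /// to the `NOTIFY_REG_OFFSET` register, so the backend can be notified
+    /// through an ioeventfd.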
+ fn ioeventfds(&self) -> Vec { + let mut ret = Vec::new(); + for (index, eventfd) in self.host_notify_info.events.iter().enumerate() { + let addr = u64::from(NOTIFY_REG_OFFSET); + let eventfd_clone = match eventfd.try_clone() { + Err(e) => { + error!("Failed to clone ioeventfd, error is {}", e); + continue; + } + Ok(fd) => fd, + }; + ret.push(RegionIoEventFd { + fd: eventfd_clone, + addr_range: AddressRange::from((addr, std::mem::size_of::() as u64)), + data_match: true, + data: index as u64, + }) + } + + ret + } +} + +impl MmioDeviceOps for VirtioMmioDevice { + /// Realize this MMIO device for VM. + fn realize(&mut self, vm_fd: &VmFd, resource: DeviceResource) -> Result<()> { + vm_fd + .register_irqfd(&self.interrupt_evt, resource.irq) + .chain_err(|| "Failed to register irqfd")?; + + self.device + .lock() + .unwrap() + .realize() + .chain_err(|| "Failed to realize device for virtio mmio device")?; + + Ok(()) + } + + /// Get the resource requirement of MMIO device. + fn get_type(&self) -> DeviceType { + match self.device.lock().unwrap().device_type() { + VIRTIO_TYPE_NET => DeviceType::NET, + VIRTIO_TYPE_BLOCK => DeviceType::BLK, + _ => DeviceType::OTHER, + } + } + + /// Update the low level config of MMIO device. + fn update_config(&mut self, dev_config: Option>) -> Result<()> { + self.device + .lock() + .unwrap() + .update_config(dev_config) + .chain_err(|| "Failed to update configuration")?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::io::Write; + + use address_space::{AddressSpace, GuestAddress, HostMemMapping, Region}; + use util::num_ops::{read_u32, write_u32}; + + use super::*; + type VirtioResult = std::result::Result; + + fn address_space_init() -> Arc { + let root = Region::init_container_region(1 << 36); + let sys_space = AddressSpace::new(root).unwrap(); + let host_mmap = + Arc::new(HostMemMapping::new(GuestAddress(0), SYSTEM_SPACE_SIZE, false).unwrap()); + sys_space + .root() + .add_subregion( + Region::init_ram_region(host_mmap.clone()), + host_mmap.start_address().raw_value(), + ) + .unwrap(); + sys_space + } + + const SYSTEM_SPACE_SIZE: u64 = (1024 * 1024) as u64; + const CONFIG_SPACE_SIZE: usize = 16; + const QUEUE_NUM: usize = 2; + const QUEUE_SIZE: u16 = 256; + + pub struct VirtioDeviceTest { + pub device_features: u64, + pub driver_features: u64, + pub config_space: Vec, + pub b_active: bool, + pub b_realized: bool, + } + + impl VirtioDeviceTest { + pub fn new() -> Self { + let mut config_space = Vec::new(); + for i in 0..CONFIG_SPACE_SIZE { + config_space.push(i as u8); + } + + VirtioDeviceTest { + device_features: 0, + driver_features: 0, + b_active: false, + b_realized: false, + config_space, + } + } + } + + impl VirtioDevice for VirtioDeviceTest { + fn realize(&mut self) -> VirtioResult<()> { + self.b_realized = true; + Ok(()) + } + + fn device_type(&self) -> u32 { + DeviceType::BLK as u32 + } + + fn queue_num(&self) -> usize { + QUEUE_NUM + } + + fn queue_size(&self) -> u16 { + QUEUE_SIZE + } + + fn get_device_features(&self, features_select: u32) -> u32 { + read_u32(self.device_features, features_select) + } + + fn set_driver_features(&mut self, page: u32, value: u32) { + let mut v = write_u32(value, page); + let unrequested_features = v & !self.device_features; + if unrequested_features != 0 { + v &= !unrequested_features; + } + self.driver_features |= v; + } + + fn read_config(&self, offset: u64, mut data: &mut [u8]) -> VirtioResult<()> { + let config_len = self.config_space.len() as u64; + if offset >= config_len { + bail!( + "The offset{} for 
reading is more than the length{} of configuration", + offset, + config_len + ); + } + if let Some(end) = offset.checked_add(data.len() as u64) { + data.write_all( + &self.config_space[offset as usize..std::cmp::min(end, config_len) as usize], + )?; + } + + Ok(()) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> VirtioResult<()> { + let data_len = data.len(); + let config_len = self.config_space.len(); + if offset as usize + data_len > config_len { + bail!( + "The offset{} {}for writing is more than the length{} of configuration", + offset, + data_len, + config_len + ); + } + + self.config_space[(offset as usize)..(offset as usize + data_len)] + .copy_from_slice(&data[..]); + + Ok(()) + } + + fn activate( + &mut self, + _mem_space: Arc, + _interrupt_evt: EventFd, + _interrupt_status: Arc, + mut _queues: Vec>>, + mut _queue_evts: Vec, + ) -> VirtioResult<()> { + self.b_active = true; + Ok(()) + } + } + + #[test] + fn test_virtio_mmio_device_new() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let virtio_device_clone = virtio_device.clone(); + let sys_space = address_space_init(); + + let virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + assert_eq!(virtio_mmio_device.device_activated, false); + assert_eq!( + virtio_mmio_device.host_notify_info.events.len(), + virtio_device_clone.lock().unwrap().queue_num() + ); + assert_eq!(virtio_mmio_device.common_config.features_select, 0); + assert_eq!(virtio_mmio_device.common_config.acked_features_select, 0); + assert_eq!(virtio_mmio_device.common_config.device_status, 0); + assert_eq!(virtio_mmio_device.common_config.config_generation, 0); + assert_eq!(virtio_mmio_device.common_config.queue_select, 0); + assert_eq!( + virtio_mmio_device.common_config.queues_config.len(), + virtio_device_clone.lock().unwrap().queue_num() + ); + assert_eq!( + virtio_mmio_device.common_config.queue_type, + QUEUE_TYPE_SPLIT_VRING + ); + } + + #[test] + fn test_virtio_mmio_device_read_01() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let virtio_device_clone = virtio_device.clone(); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + // read the register of magic value + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, MAGIC_VALUE_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), MMIO_MAGIC_VALUE); + + // read the register of version + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, VERSION_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), MMIO_VERSION); + + // read the register of device id + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, DEVICE_ID_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), DeviceType::BLK as u32); + + // read the register of vendor id + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, VENDOR_ID_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), VENDOR_ID); + + // read the register of the features + // get low 32bit of the features + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.features_select = 0; + virtio_device_clone.lock().unwrap().device_features = 0x0000_00f8_0000_00fe; + assert_eq!( + virtio_mmio_device.read(&mut 
buf[..], addr, DEVICE_FEATURES_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0x0000_00fe); + // get high 32bit of the features for device which supports VirtIO Version 1 + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.features_select = 1; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, DEVICE_FEATURES_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0x0000_00f9); + } + + #[test] + fn test_virtio_mmio_device_read_02() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + // read the register representing max size of the queue + // for queue_select as 0 + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 0; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, QUEUE_NUM_MAX_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), QUEUE_SIZE as u32); + // for queue_select as 1 + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 1; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, QUEUE_NUM_MAX_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), QUEUE_SIZE as u32); + + // read the register representing the status of queue + // for queue_select as 0 + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + LittleEndian::write_u32(&mut buf[..], 1); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_READY_REG), + true + ); + let mut data: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut data[..], addr, QUEUE_READY_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&data[..]), 1); + // for queue_select as 1 + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 1; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, QUEUE_READY_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0); + + // read the register representing the status of interrupt + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, INTERRUPT_STATUS_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0); + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device + .common_config + .interrupt_status + .store(0b10_1111, Ordering::Relaxed); + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, INTERRUPT_STATUS_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0b10_1111); + + // read the register representing the status of device + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.device_status = 0; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, STATUS_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0); + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.device_status = 5; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, STATUS_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 5); + } + + #[test] + fn test_virtio_mmio_device_read_03() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let 
virtio_device_clone = virtio_device.clone(); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + // read the configuration atomic value + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, CONFIG_GENERATION_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0); + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.config_generation = 10; + assert_eq!( + virtio_mmio_device.read(&mut buf[..], addr, CONFIG_GENERATION_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 10); + + // read the unknown register + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!(virtio_mmio_device.read(&mut buf[..], addr, 0xf1), false); + assert_eq!(virtio_mmio_device.read(&mut buf[..], addr, 0xfff + 1), true); + assert_eq!(buf, [0xff, 0xff, 0xff, 0xff]); + + // read the configuration space of virtio device + // write something + let result: Vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + virtio_device_clone + .lock() + .unwrap() + .config_space + .as_mut_slice() + .copy_from_slice(&result[..]); + + let mut data: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert_eq!(virtio_mmio_device.read(&mut data[..], addr, 0x100), true); + assert_eq!(data, result); + + let mut data: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0]; + let result: Vec = vec![9, 10, 11, 12, 13, 14, 15, 16]; + assert_eq!(virtio_mmio_device.read(&mut data[..], addr, 0x108), true); + assert_eq!(data, result); + } + + #[test] + fn test_virtio_mmio_device_write_01() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let virtio_device_clone = virtio_device.clone(); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + // write the selector for device features + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 2); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DEVICE_FEATURES_SEL_REG), + true + ); + assert_eq!(virtio_mmio_device.common_config.features_select, 2); + + // write the device features + // false when the device status is CONFIG_STATUS_FEATURES_OK or CONFIG_STATUS_FAILED isn't CONFIG_STATUS_DRIVER + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DRIVER_FEATURES_REG), + false + ); + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FAILED; + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DRIVER_FEATURES_REG), + false + ); + virtio_mmio_device.common_config.device_status = + CONFIG_STATUS_FEATURES_OK | CONFIG_STATUS_FAILED | CONFIG_STATUS_DRIVER; + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DRIVER_FEATURES_REG), + false + ); + // it is ok to write the low 32bit of device features + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_DRIVER; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.acked_features_select = 0; + LittleEndian::write_u32(&mut buf[..], 0x0000_00fe); + virtio_device_clone.lock().unwrap().device_features = 0x0000_00fe; + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DRIVER_FEATURES_REG), + true + ); + assert_eq!( + virtio_device_clone.lock().unwrap().driver_features as u32, + 0x0000_00fe + ); + // it is ok to write the high 32bit of 
device features + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.acked_features_select = 1; + LittleEndian::write_u32(&mut buf[..], 0x0000_00ff); + virtio_device_clone.lock().unwrap().device_features = 0x0000_00ff_0000_0000; + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DRIVER_FEATURES_REG), + true + ); + assert_eq!( + virtio_mmio_device.common_config.queue_type, + QUEUE_TYPE_PACKED_VRING + ); + assert_eq!( + virtio_device_clone.lock().unwrap().driver_features >> 32 as u32, + 0x0000_00ff + ); + + // write the selector of driver features + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0x00ff_0000); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, DRIVER_FEATURES_SEL_REG), + true + ); + assert_eq!( + virtio_mmio_device.common_config.acked_features_select, + 0x00ff_0000 + ); + + // write the selector of queue + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0x0000_ff00); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_SEL_REG), + true + ); + assert_eq!(virtio_mmio_device.common_config.queue_select, 0x0000_ff00); + + // write the size of queue + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + LittleEndian::write_u32(&mut buf[..], 128); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_NUM_REG), + true + ); + if let Ok(config) = virtio_mmio_device.common_config.get_queue_config() { + assert_eq!(config.size, 128); + } else { + assert!(false); + } + } + + #[test] + fn test_virtio_mmio_device_write_02() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + // write the ready status of queue + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + LittleEndian::write_u32(&mut buf[..], 1); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_READY_REG), + true + ); + let mut data: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut data[..], addr, QUEUE_READY_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&data[..]), 1); + + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + LittleEndian::write_u32(&mut buf[..], 2); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_READY_REG), + true + ); + let mut data: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut data[..], addr, QUEUE_READY_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&data[..]), 0); + + // write the interrupt status + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_DRIVER_OK; + virtio_mmio_device + .common_config + .interrupt_status + .store(0b10_1111, Ordering::Relaxed); + LittleEndian::write_u32(&mut buf[..], 0b111); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, INTERRUPT_ACK_REG), + true + ); + let mut data: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut data[..], addr, INTERRUPT_STATUS_REG), + true + ); + 
assert_eq!(LittleEndian::read_u32(&data[..]), 0b10_1000); + } + + #[test] + fn test_virtio_mmio_device_write_03() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + // write the low 32bit of queue's descriptor table address + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0xffff_fefe); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_DESC_LOW_REG), + true + ); + if let Ok(config) = virtio_mmio_device.common_config.get_queue_config() { + assert_eq!(config.desc_table.0 as u32, 0xffff_fefe) + } else { + assert!(false); + } + + // write the high 32bit of queue's descriptor table address + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0xfcfc_ffff); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_DESC_HIGH_REG), + true + ); + if let Ok(config) = virtio_mmio_device.common_config.get_queue_config() { + assert_eq!((config.desc_table.0 >> 32) as u32, 0xfcfc_ffff) + } else { + assert!(false); + } + + // write the low 32bit of queue's available ring address + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0xfcfc_fafa); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_AVAIL_LOW_REG), + true + ); + if let Ok(config) = virtio_mmio_device.common_config.get_queue_config() { + assert_eq!(config.avail_ring.0 as u32, 0xfcfc_fafa) + } else { + assert!(false); + } + + // write the high 32bit of queue's available ring address + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0xecec_fafa); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_AVAIL_HIGH_REG), + true + ); + if let Ok(config) = virtio_mmio_device.common_config.get_queue_config() { + assert_eq!((config.avail_ring.0 >> 32) as u32, 0xecec_fafa) + } else { + assert!(false); + } + + // write the low 32bit of queue's used ring address + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0xacac_fafa); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_USED_LOW_REG), + true + ); + if let Ok(config) = virtio_mmio_device.common_config.get_queue_config() { + assert_eq!(config.used_ring.0 as u32, 0xacac_fafa) + } else { + assert!(false); + } + + // write the high 32bit of queue's used ring address + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], 0xcccc_fafa); + assert_eq!( + virtio_mmio_device.write(&buf[..], addr, QUEUE_USED_HIGH_REG), + true + ); + if let Ok(config) = 
virtio_mmio_device.common_config.get_queue_config() { + assert_eq!((config.used_ring.0 >> 32) as u32, 0xcccc_fafa) + } else { + assert!(false); + } + } + + fn align(size: u64, alignment: u64) -> u64 { + let align_adjust = if size % alignment != 0 { + alignment - (size % alignment) + } else { + 0 + }; + (size + align_adjust) as u64 + } + + #[test] + fn test_virtio_mmio_device_write_04() { + let virtio_device = Arc::new(Mutex::new(VirtioDeviceTest::new())); + let virtio_device_clone = virtio_device.clone(); + let sys_space = address_space_init(); + let mut virtio_mmio_device = VirtioMmioDevice::new(sys_space, virtio_device); + let addr = GuestAddress(0); + + virtio_mmio_device.common_config.queue_select = 0; + virtio_mmio_device.common_config.device_status = CONFIG_STATUS_FEATURES_OK; + if let Ok(config) = virtio_mmio_device.common_config.get_mut_queue_config() { + config.desc_table = GuestAddress(0); + config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * 16); + config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * 16 + 8 + 2 * (QUEUE_SIZE as u64), + 4096, + )); + config.size = QUEUE_SIZE; + config.ready = true; + } + virtio_mmio_device.common_config.queue_select = 1; + if let Ok(config) = virtio_mmio_device.common_config.get_mut_queue_config() { + config.desc_table = GuestAddress(0); + config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * 16); + config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * 16 + 8 + 2 * (QUEUE_SIZE as u64), + 4096, + )); + config.size = QUEUE_SIZE / 2; + config.ready = true; + } + + // write the device status + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32(&mut buf[..], CONFIG_STATUS_ACKNOWLEDGE); + assert_eq!(virtio_mmio_device.write(&buf[..], addr, STATUS_REG), true); + assert_eq!(virtio_mmio_device.device_activated, false); + let mut data: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut data[..], addr, STATUS_REG), + true + ); + assert_eq!(LittleEndian::read_u32(&data[..]), CONFIG_STATUS_ACKNOWLEDGE); + + let mut buf: Vec = vec![0xff, 0xff, 0xff, 0xff]; + LittleEndian::write_u32( + &mut buf[..], + CONFIG_STATUS_ACKNOWLEDGE + | CONFIG_STATUS_DRIVER + | CONFIG_STATUS_DRIVER_OK + | CONFIG_STATUS_FEATURES_OK, + ); + assert_eq!(virtio_device_clone.lock().unwrap().b_active, false); + assert_eq!(virtio_mmio_device.write(&buf[..], addr, STATUS_REG), true); + assert_eq!(virtio_mmio_device.device_activated, true); + assert_eq!(virtio_device_clone.lock().unwrap().b_active, true); + let mut data: Vec = vec![0xff, 0xff, 0xff, 0xff]; + assert_eq!( + virtio_mmio_device.read(&mut data[..], addr, STATUS_REG), + true + ); + assert_eq!( + LittleEndian::read_u32(&data[..]), + CONFIG_STATUS_ACKNOWLEDGE + | CONFIG_STATUS_DRIVER + | CONFIG_STATUS_DRIVER_OK + | CONFIG_STATUS_FEATURES_OK + ); + } +} diff --git a/device_model/src/virtio/block.rs b/device_model/src/virtio/block.rs new file mode 100644 index 00000000..73e34a36 --- /dev/null +++ b/device_model/src/virtio/block.rs @@ -0,0 +1,879 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 
+// See the Mulan PSL v2 for more details. + +use std::cmp; +use std::convert::TryFrom; +use std::fs::{File, OpenOptions}; +use std::io::{Seek, SeekFrom, Write}; +use std::mem::size_of; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::mpsc::{channel, Receiver, Sender}; +use std::sync::{Arc, Mutex}; + +use address_space::{AddressSpace, GuestAddress}; +use machine_manager::config::{ConfigCheck, DriveConfig}; +use util::aio::{Aio, AioCb, AioCompleteFunc, IoCmd, Iovec}; +use util::byte_code::ByteCode; +use util::epoll_context::{ + read_fd, EventNotifier, EventNotifierHelper, NotifierCallback, NotifierOperation, +}; +use util::num_ops::{read_u32, write_u32}; +use vmm_sys_util::{epoll::EventSet, eventfd::EventFd}; + +use super::super::micro_vm::main_loop::MainLoop; +use super::errors::{ErrorKind, Result, ResultExt}; +use super::{ + Element, Queue, VirtioDevice, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_SEG_MAX, + VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_OK, VIRTIO_BLK_T_FLUSH, + VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, VIRTIO_F_RING_EVENT_IDX, + VIRTIO_F_RING_INDIRECT_DESC, VIRTIO_F_VERSION_1, VIRTIO_MMIO_INT_CONFIG, VIRTIO_MMIO_INT_VRING, + VIRTIO_TYPE_BLOCK, +}; + +/// Number of virtqueues. +const QUEUE_NUM_BLK: usize = 1; +/// Size of each virtqueue. +const QUEUE_SIZE_BLK: u16 = 256; +/// Size of configuration space of the virtio block device. +const CONFIG_SPACE_SIZE: usize = 16; +/// Used to compute the number of sectors. +const SECTOR_SHIFT: u8 = 9; +/// Size of a sector of the block device. +const SECTOR_SIZE: u64 = (0x01 as u64) << SECTOR_SHIFT; +/// Size of the dummy block device. +const DUMMY_IMG_SIZE: u64 = 0; + +type SenderConfig = (Option, u64, Option, bool); +type VirtioBlockInterrupt = Box Result<()> + Send + Sync>; + +fn get_serial_num_config(serial_num: &str) -> Vec { + let mut id_bytes = vec![0; VIRTIO_BLK_ID_BYTES as usize]; + let bytes_to_copy = cmp::min(serial_num.len(), VIRTIO_BLK_ID_BYTES as usize); + + let serial_bytes = serial_num.as_bytes(); + id_bytes[..bytes_to_copy].clone_from_slice(&serial_bytes[..bytes_to_copy]); + id_bytes +} + +/// Write data to memory at specified address. +/// +/// # Arguments +/// +/// * `buf` - The data buffer. +/// * `hva` - The destination address in the memory. +/// +/// # Safety +/// +/// hva is non-null which is guaranteed by the caller, and the entire memory range +/// of this slice is contained within a single allocated object. +pub fn write_buf_mem(buf: &[u8], hva: u64) -> Result<()> { + let mut slice = unsafe { std::slice::from_raw_parts_mut(hva as *mut u8, buf.len()) }; + (&mut slice) + .write(buf) + .chain_err(|| format!("Failed to write buf(hva:{})", hva))?; + + Ok(()) +} + +/// The unwritable header of virtio block's request. +#[repr(C)] +#[derive(Default, Clone, Copy)] +struct RequestOutHeader { + /// Request type. + request_type: u32, + /// The priority of request. + io_prio: u32, + /// The offset sector of request. + sector: u64, +} + +impl RequestOutHeader { + /// Return true if the request type is valid. + pub fn is_valid(&self) -> bool { + match self.request_type { + VIRTIO_BLK_T_IN | VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_FLUSH | VIRTIO_BLK_T_GET_ID => true, + _ => { + error!("request type {} is not supported \n", self.request_type); + false + } + } + } +} + +impl ByteCode for RequestOutHeader {} + +/// The aio control block. 
+#[derive(Clone)]
+pub struct AioCompleteCb {
+    /// The virtqueue to which this aiocb belongs.
+    pub queue: Arc<Mutex<Queue>>,
+    /// The address space to which this aiocb belongs.
+    pub mem_space: Arc<AddressSpace>,
+    /// Index of the descriptor.
+    pub desc_index: u16,
+    /// Total length of the descriptor chain.
+    pub rw_len: u32,
+    /// The memory address where the result of handling the request is stored.
+    pub req_status_addr: GuestAddress,
+    /// Callback for triggering an interrupt.
+    pub interrupt_cb: Option<Arc<VirtioBlockInterrupt>>,
+    /// Bit mask of features negotiated by the backend and the frontend.
+    pub driver_features: u64,
+}
+
+impl AioCompleteCb {
+    /// Create an aio control block.
+    ///
+    /// # Arguments
+    ///
+    /// * `queue` - Virtqueue.
+    /// * `mem_space` - Address space to which the aio belongs.
+    /// * `desc_index` - Index of the descriptor.
+    /// * `rw_len` - Total length of the descriptor chain.
+    /// * `req_status_addr` - The memory address where the result of handling the request is stored.
+    /// * `interrupt_cb` - Callback for triggering an interrupt.
+    /// * `driver_features` - Bit mask of features negotiated by the backend and the frontend.
+    pub fn new(
+        queue: Arc<Mutex<Queue>>,
+        mem_space: Arc<AddressSpace>,
+        desc_index: u16,
+        rw_len: u32,
+        req_status_addr: GuestAddress,
+        interrupt_cb: Option<Arc<VirtioBlockInterrupt>>,
+        driver_features: u64,
+    ) -> Self {
+        AioCompleteCb {
+            queue,
+            mem_space,
+            desc_index,
+            rw_len,
+            req_status_addr,
+            interrupt_cb,
+            driver_features,
+        }
+    }
+}
+
+/// Virtio block IO request.
+struct Request {
+    /// The index of the descriptor for the request.
+    desc_index: u16,
+    /// The header (out_header) which is read-only.
+    out_header: RequestOutHeader,
+    /// The IO vector which is both readable and writable.
+    iovec: Vec<Iovec>,
+    /// The total length of data.
+    data_len: u64,
+    /// The address of the header (in_header) which is writable; the result of
+    /// handling the request is written to it.
+    in_header: GuestAddress,
+}
+
+impl Request {
+    /// Create a block IO request.
+    ///
+    /// # Arguments
+    ///
+    /// * `mem_space`: Address space to which the request belongs.
+    /// * `elem`: IO request element.
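+    ///
+    /// A minimal sketch (plain arithmetic, no project types) of the layout this
+    /// constructor expects: a 16-byte read-only out header (`u32` type, `u32`
+    /// priority, `u64` sector), then the data buffers, then a one-byte status
+    /// that the device fills in.
+    ///
+    /// ```
+    /// let out_header_len = 4 + 4 + 8; // request_type + io_prio + sector
+    /// assert_eq!(out_header_len, 16);
+    /// ```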
+ fn new(mem_space: &Arc, elem: &Element) -> Result { + if elem.out_iovec.is_empty() || elem.in_iovec.is_empty() || elem.desc_num < 2 { + bail!( + "Missed header: out {} in {}", + elem.out_iovec.len(), + elem.in_iovec.len() + ); + } + + let out_iov_elem = elem.out_iovec.get(0).unwrap(); + if out_iov_elem.len < size_of::() as u32 { + bail!("Invalid out header: length {}", out_iov_elem.len); + } + + let out_header = mem_space + .read_object::(out_iov_elem.addr) + .chain_err(|| format!("Failed to read from memory, addr {}", out_iov_elem.addr.0))?; + + if !out_header.is_valid() { + bail!("Unsupported request type"); + } + + let pos = elem.in_iovec.len() - 1; + let in_iov_elem = elem.in_iovec.get(pos).unwrap(); + if in_iov_elem.len < 1 { + bail!("Invalid out header: length {}", in_iov_elem.len); + } + + let mut request = Request { + desc_index: elem.index, + out_header, + iovec: Vec::with_capacity(elem.desc_num as usize), + data_len: 0, + in_header: in_iov_elem.addr, + }; + + match out_header.request_type { + VIRTIO_BLK_T_IN | VIRTIO_BLK_T_GET_ID => { + for (index, elem_iov) in elem.in_iovec.iter().enumerate() { + if index == elem.in_iovec.len() - 1 { + break; + } + if let Some(hva) = mem_space.get_host_address(elem_iov.addr) { + let iov = Iovec { + iov_base: hva, + iov_len: u64::from(elem_iov.len), + }; + request.iovec.push(iov); + request.data_len += u64::from(elem_iov.len); + } + } + } + VIRTIO_BLK_T_OUT => { + for (index, elem_iov) in elem.out_iovec.iter().enumerate() { + if index == 0 { + continue; + } + if let Some(hva) = mem_space.get_host_address(elem_iov.addr) { + let iov = Iovec { + iov_base: hva, + iov_len: u64::from(elem_iov.len), + }; + request.iovec.push(iov); + request.data_len += u64::from(elem_iov.len); + } + } + } + _ => (), + } + + Ok(request) + } + + #[allow(clippy::too_many_arguments)] + #[allow(clippy::borrowed_box)] + fn execute( + &self, + aio: &mut Box>, + disk: &mut File, + disk_sectors: u64, + serial_num: &Option, + direct: bool, + last_aio: bool, + iocompletecb: AioCompleteCb, + ) -> Result { + let mut top: u64 = self.data_len / SECTOR_SIZE; + if self.data_len % SECTOR_SIZE != 0 { + top += 1; + } + top.checked_add(self.out_header.sector) + .filter(|off| off <= &disk_sectors) + .chain_err(|| { + format!( + "offset {} invalid, disk sector {}", + self.out_header.sector, disk_sectors + ) + })?; + + let mut aiocb = AioCb { + last_aio, + file_fd: disk.as_raw_fd(), + opcode: IoCmd::NOOP, + iovec: Vec::new(), + offset: (self.out_header.sector << SECTOR_SHIFT) as usize, + process: true, + iocb: None, + iocompletecb, + }; + + for iov in self.iovec.iter() { + let iovec = Iovec { + iov_base: iov.iov_base, + iov_len: iov.iov_len, + }; + aiocb.iovec.push(iovec); + } + + match self.out_header.request_type { + VIRTIO_BLK_T_IN => { + aiocb.opcode = IoCmd::PREADV; + if direct { + (*aio).as_mut().rw_aio(aiocb)?; + } else { + (*aio).as_mut().rw_sync(aiocb)?; + } + } + VIRTIO_BLK_T_OUT => { + aiocb.opcode = IoCmd::PWRITEV; + if direct { + (*aio).as_mut().rw_aio(aiocb)?; + } else { + (*aio).as_mut().rw_sync(aiocb)?; + } + } + VIRTIO_BLK_T_FLUSH => { + aiocb.opcode = IoCmd::FDSYNC; + (*aio).as_mut().rw_sync(aiocb)?; + } + VIRTIO_BLK_T_GET_ID => { + if let Some(serial) = serial_num { + let serial_vec = get_serial_num_config(&serial); + + for iov in self.iovec.iter() { + if (iov.iov_len as usize) < serial_vec.len() { + bail!( + "The buffer length {} is less than the length {} of serial num", + iov.iov_len, + serial_vec.len() + ); + } + write_buf_mem(&serial_vec, iov.iov_base) + .chain_err(|| 
"Failed to write buf for virtio block id")?; + } + } + + return Ok(1); + } + _ => bail!("The type of request is not supported"), + }; + Ok(0) + } +} + +/// Control block of Block IO. +pub struct BlockIoHandler { + /// The virtqueue. + pub queue: Arc>, + /// Eventfd of the virtqueue for IO event. + pub queue_evt: EventFd, + /// The address space to which the block device belongs. + pub mem_space: Arc, + /// The image file opened by the block device. + pub disk_image: Option, + /// The number of sectors of the disk image. + pub disk_sectors: u64, + /// Serial number of the block device. + pub serial_num: Option, + /// if use direct access io. + pub direct: bool, + /// Aio context. + pub aio: Option>>, + /// Bit mask of features negotiated by the backend and the frontend. + pub driver_features: u64, + /// The receiving half of Rust's channel to receive the image file. + receiver: Receiver, + /// Eventfd for config space update. + update_evt: RawFd, + /// Callback to trigger an interrupt. + pub interrupt_cb: Arc, +} + +impl BlockIoHandler { + /// Build IO requests if there are elements in virtqueue needed to be finished, + /// and execute them. If required, an interrupt is sent to the guest. + pub fn process_queue(&mut self) -> Result<()> { + let mut req_queue = Vec::new(); + let mut req_index = 0; + let mut last_aio_req_index = 0; + let mut need_interrupt = false; + + while let Ok(elem) = self + .queue + .lock() + .unwrap() + .vring + .pop_avail(&self.mem_space, self.driver_features) + { + match Request::new(&self.mem_space, &elem) { + Ok(req) => { + match req.out_header.request_type { + VIRTIO_BLK_T_IN | VIRTIO_BLK_T_OUT => { + last_aio_req_index = req_index; + } + _ => {} + } + req_queue.push(req); + req_index += 1; + } + Err(e) => { + error!("failed to create request, err {:#?}", e); + break; + } + }; + } + + if let Some(disk_img) = self.disk_image.as_mut() { + req_index = 0; + for req in req_queue.iter() { + if let Some(ref mut aio) = self.aio { + let rw_len = match req.out_header.request_type { + VIRTIO_BLK_T_IN => u32::try_from(req.data_len) + .chain_err(|| "Convert block request len to u32 with overflow.")?, + _ => 0u32, + }; + + let aiocompletecb = AioCompleteCb::new( + self.queue.clone(), + self.mem_space.clone(), + req.desc_index, + rw_len, + req.in_header, + Some(self.interrupt_cb.clone()), + self.driver_features, + ); + + match req.execute( + aio, + disk_img, + self.disk_sectors, + &self.serial_num, + self.direct, + last_aio_req_index == req_index, + aiocompletecb, + ) { + Ok(v) => { + if v == 1 { + // get device id + self.mem_space + .write_object(&VIRTIO_BLK_S_OK, req.in_header)?; + self.queue.lock().unwrap().vring.add_used( + &self.mem_space, + req.desc_index, + 1, + )?; + + if self + .queue + .lock() + .unwrap() + .vring + .should_notify(&self.mem_space, self.driver_features) + { + need_interrupt = true; + } + } + } + Err(e) => { + error!("Failed to parse available descriptor chain: {:?}", e); + } + } + req_index += 1; + } + } + } else if !req_queue.is_empty() { + for req in req_queue.iter() { + self.queue + .lock() + .unwrap() + .vring + .add_used(&self.mem_space, req.desc_index, 1)?; + } + need_interrupt = true + } + + if !req_queue.is_empty() || need_interrupt { + (self.interrupt_cb)(VIRTIO_MMIO_INT_VRING)?; + } + + Ok(()) + } + + /// Build an aio context. 
+ pub fn build_aio(&self) -> Result>> { + let complete_func = Arc::new(Box::new(move |aiocb: &AioCb, ret: i64| { + let status = if ret < 0 { + ret + } else { + i64::from(VIRTIO_BLK_S_OK) + }; + let complete_cb = &aiocb.iocompletecb; + + if complete_cb + .mem_space + .write_object(&status, complete_cb.req_status_addr) + .is_err() + { + error!("Failed to write object(aio completion)"); + return; + } + + let mut queue_lock = complete_cb.queue.lock().unwrap(); + if queue_lock + .vring + .add_used( + &complete_cb.mem_space, + complete_cb.desc_index, + complete_cb.rw_len, + ) + .is_err() + { + error!( + "Failed to add used ring(aio completion), index {}, len {}", + complete_cb.desc_index, complete_cb.rw_len + ); + return; + } + + let trigger_interrupt_status = queue_lock + .vring + .should_notify(&complete_cb.mem_space, complete_cb.driver_features); + if trigger_interrupt_status + && (*complete_cb.interrupt_cb.as_ref().unwrap())(VIRTIO_MMIO_INT_VRING).is_err() + { + error!("Failed to trigger interrupt(aio completion)"); + } + }) as AioCompleteFunc); + + Ok(Box::new(Aio::new(complete_func)?)) + } + + fn add_event_notifiers(mut self) -> Result<()> { + self.aio = Some(self.build_aio()?); + MainLoop::update_event(EventNotifierHelper::internal_notifiers(Arc::new( + Mutex::new(self), + )))?; + + Ok(()) + } + + fn update_evt_handler(&mut self) { + match self.receiver.recv() { + Ok((image, disk_sectors, serial_num, direct)) => { + self.disk_sectors = disk_sectors; + self.disk_image = image; + self.serial_num = serial_num; + self.direct = direct; + } + Err(_) => { + self.disk_sectors = 0; + self.disk_image = None; + self.serial_num = None; + self.direct = true; + } + }; + + self.process_queue() + .unwrap_or_else(|_| error!("Failed to handle block IO.")); + } +} + +fn build_event_notifier(fd: RawFd, handler: Box) -> EventNotifier { + let mut handlers = Vec::new(); + handlers.push(Arc::new(Mutex::new(handler))); + EventNotifier::new( + NotifierOperation::AddShared, + fd, + None, + EventSet::IN, + handlers, + ) +} + +impl EventNotifierHelper for BlockIoHandler { + fn internal_notifiers(block_io: Arc>) -> Vec { + let mut notifiers = Vec::new(); + let locked_block_io = block_io.lock().unwrap(); + + // Register event notifier for update_evt. + let cloned_block_io = block_io.clone(); + let handler: Box = Box::new(move |_, fd: RawFd| { + read_fd(fd); + cloned_block_io.lock().unwrap().update_evt_handler(); + None + }); + notifiers.push(build_event_notifier(locked_block_io.update_evt, handler)); + + // Register event notifier for queue_evt. + let cloned_block_io = block_io.clone(); + let handler: Box = Box::new(move |_, fd: RawFd| { + read_fd(fd); + + let mut locked_block_io = cloned_block_io.lock().unwrap(); + locked_block_io + .process_queue() + .unwrap_or_else(|_| error!("Failed to handle block IO.")); + None + }); + notifiers.push(build_event_notifier( + locked_block_io.queue_evt.as_raw_fd(), + handler, + )); + + // Register event notifier for aio. + let cloned_block_io = block_io.clone(); + if let Some(ref aio) = locked_block_io.aio { + let handler: Box = Box::new(move |_, fd: RawFd| { + read_fd(fd); + + if let Some(aio) = &mut cloned_block_io.lock().unwrap().aio { + aio.handle() + .map_err(|e| error!("Failed to handle aio, {}", e)) + .ok(); + } + None + }); + notifiers.push(build_event_notifier(aio.fd.as_raw_fd(), handler)); + } + + notifiers + } +} + +/// Block device structure. +pub struct Block { + /// Configuration of the block device. + blk_cfg: DriveConfig, + /// Image file opened. 
+    disk_image: Option<File>,
+    /// Number of sectors of the image file.
+    disk_sectors: u64,
+    /// Bit mask of features supported by the backend.
+    device_features: u64,
+    /// Bit mask of features negotiated by the backend and the frontend.
+    driver_features: u64,
+    /// Config space of the block device.
+    config_space: Vec<u8>,
+    /// Callback to trigger interrupt.
+    interrupt_cb: Option<Arc<VirtioBlockInterrupt>>,
+    /// The sending half of Rust's channel to send the image file.
+    sender: Option<Sender<SenderConfig>>,
+    /// Eventfd for config space update.
+    update_evt: EventFd,
+}
+
+impl Block {
+    /// Create a block device.
+    ///
+    /// The configuration of the block device (`blk_cfg`) is filled in later
+    /// via `update_config`.
+    pub fn new() -> Block {
+        Block {
+            blk_cfg: Default::default(),
+            disk_image: None,
+            disk_sectors: 0,
+            device_features: 0,
+            driver_features: 0,
+            config_space: Vec::with_capacity(CONFIG_SPACE_SIZE),
+            interrupt_cb: None,
+            sender: None,
+            update_evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(),
+        }
+    }
+
+    fn build_device_config_space(&mut self) -> Result<()> {
+        // capacity: 64 bits
+        let num_sectors = DUMMY_IMG_SIZE >> SECTOR_SHIFT;
+        for i in 0..8 {
+            self.config_space.push((num_sectors >> (8 * i)) as u8);
+        }
+
+        // size_max = 0: 32 bits
+        for _ in 0..4 {
+            self.config_space.push(0_u8);
+        }
+
+        // seg_max = 128 - 2: 32 bits
+        for i in 0..4 {
+            self.config_space.push((126 >> (8 * i)) as u8);
+        }
+
+        Ok(())
+    }
+}
+
+impl VirtioDevice for Block {
+    /// Realize the virtio block device.
+    fn realize(&mut self) -> Result<()> {
+        self.device_features = (1_u64 << VIRTIO_F_VERSION_1) | (1_u64 << VIRTIO_BLK_F_FLUSH);
+        if self.blk_cfg.read_only {
+            self.device_features |= 1_u64 << VIRTIO_BLK_F_RO;
+        };
+        self.device_features |= 1_u64 << VIRTIO_F_RING_INDIRECT_DESC;
+        self.device_features |= 1_u64 << VIRTIO_BLK_F_SIZE_MAX;
+        self.device_features |= 1_u64 << VIRTIO_BLK_F_SEG_MAX;
+        self.device_features |= 1_u64 << VIRTIO_F_RING_EVENT_IDX;
+
+        self.build_device_config_space()
+            .chain_err(|| "Failed to build config space")?;
+
+        let mut disk_size = DUMMY_IMG_SIZE;
+
+        if self.blk_cfg.path_on_host != "" {
+            self.disk_image = None;
+
+            let mut file = if self.blk_cfg.direct {
+                OpenOptions::new()
+                    .read(true)
+                    .write(!self.blk_cfg.read_only)
+                    .custom_flags(libc::O_DIRECT)
+                    .open(&self.blk_cfg.path_on_host)
+                    .chain_err(|| {
+                        format!("Failed to open the file {}", self.blk_cfg.path_on_host)
+                    })?
+            } else {
+                OpenOptions::new()
+                    .read(true)
+                    .write(!self.blk_cfg.read_only)
+                    .open(&self.blk_cfg.path_on_host)
+                    .chain_err(|| {
+                        format!("Failed to open the file {}", self.blk_cfg.path_on_host)
+                    })?
+            };
+
+            disk_size = file
+                .seek(SeekFrom::End(0))
+                .chain_err(|| "Failed to seek to the end of the image file")? as u64;
+
+            self.disk_image = Some(file);
+        } else {
+            self.disk_image = None;
+        }
+
+        self.disk_sectors = disk_size >> SECTOR_SHIFT;
+        for i in 0..8 {
+            self.config_space[i] = (self.disk_sectors >> (8 * i)) as u8;
+        }
+
+        Ok(())
+    }
+
+    /// Get the virtio device type, refer to Virtio Spec.
+    fn device_type(&self) -> u32 {
+        VIRTIO_TYPE_BLOCK
+    }
+
+    /// Get the count of virtio device queues.
+    fn queue_num(&self) -> usize {
+        QUEUE_NUM_BLK
+    }
+
+    /// Get the queue size of virtio device.
+    fn queue_size(&self) -> u16 {
+        QUEUE_SIZE_BLK
+    }
+
+    /// Get device features from host.
+    fn get_device_features(&self, features_select: u32) -> u32 {
+        read_u32(self.device_features, features_select)
+    }
+
+    /// Set driver features by guest.
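+    ///
+    /// A minimal sketch (assumed feature values) of the masking rule applied
+    /// here: bits that the device never offered are dropped before the page is
+    /// merged into `driver_features`.
+    ///
+    /// ```
+    /// let device_features: u64 = 0b1011;
+    /// let guest_ack: u64 = 0b1111;            // the guest acks an unknown bit
+    /// let accepted = guest_ack & device_features;
+    /// assert_eq!(accepted, 0b1011);           // the unknown bit is masked off
+    /// ```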
+ fn set_driver_features(&mut self, page: u32, value: u32) { + let mut v = write_u32(value, page); + let unrequested_features = v & !self.device_features; + if unrequested_features != 0 { + v &= !unrequested_features; + } + self.driver_features |= v; + } + + /// Read data of config from guest. + fn read_config(&self, offset: u64, mut data: &mut [u8]) -> Result<()> { + let config_len = self.config_space.len() as u64; + if offset >= config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len).into()); + } + if let Some(end) = offset.checked_add(data.len() as u64) { + data.write_all( + &self.config_space[offset as usize..cmp::min(end, config_len) as usize], + )?; + } + + Ok(()) + } + + /// Write data to config from guest. + fn write_config(&mut self, offset: u64, data: &[u8]) -> Result<()> { + let data_len = data.len(); + let config_len = self.config_space.len(); + if offset as usize + data_len > config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len as u64).into()); + } + + self.config_space[(offset as usize)..(offset as usize + data_len)] + .copy_from_slice(&data[..]); + + Ok(()) + } + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. + fn activate( + &mut self, + mem_space: Arc, + interrupt_evt: EventFd, + interrupt_status: Arc, + mut queues: Vec>>, + mut queue_evts: Vec, + ) -> Result<()> { + let interrupt_evt = interrupt_evt.try_clone()?; + let interrupt_status = interrupt_status; + let cb = Arc::new(Box::new(move |status: u32| { + interrupt_status.fetch_or(status, Ordering::SeqCst); + interrupt_evt.write(1).chain_err(|| ErrorKind::EventFdWrite) + }) as VirtioBlockInterrupt); + + self.interrupt_cb = Some(cb.clone()); + let (sender, receiver) = channel(); + self.sender = Some(sender); + + let handler = BlockIoHandler { + queue: queues.remove(0), + queue_evt: queue_evts.remove(0), + mem_space, + disk_image: self.disk_image.take(), + disk_sectors: self.disk_sectors, + direct: self.blk_cfg.direct, + serial_num: self.blk_cfg.serial_num.clone(), + aio: None, + driver_features: self.driver_features, + receiver, + update_evt: self.update_evt.as_raw_fd(), + interrupt_cb: cb, + }; + handler.add_event_notifiers()?; + + Ok(()) + } + + fn update_config(&mut self, dev_config: Option>) -> Result<()> { + if let Some(conf) = dev_config { + self.blk_cfg = conf.as_any().downcast_ref::().unwrap().clone(); + } else { + self.blk_cfg = Default::default(); + } + + self.realize()?; + + if let Some(sender) = &self.sender { + sender + .send(( + self.disk_image.take(), + self.disk_sectors, + self.blk_cfg.serial_num.clone(), + self.blk_cfg.direct, + )) + .chain_err(|| ErrorKind::ChannelSend("image fd".to_string()))?; + + self.update_evt + .write(1) + .chain_err(|| ErrorKind::EventFdWrite)?; + } + + if let Some(interrupt_cb) = &self.interrupt_cb { + interrupt_cb(VIRTIO_MMIO_INT_CONFIG).chain_err(|| ErrorKind::EventFdWrite)?; + } + + Ok(()) + } +} diff --git a/device_model/src/virtio/console.rs b/device_model/src/virtio/console.rs new file mode 100644 index 00000000..5231181b --- /dev/null +++ b/device_model/src/virtio/console.rs @@ -0,0 +1,398 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. 
+// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::cmp; +use std::io::{Read, Write}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; + +use address_space::AddressSpace; +use machine_manager::config::ConsoleConfig; +use util::byte_code::ByteCode; +use util::epoll_context::{read_fd, EventNotifier, EventNotifierHelper, NotifierOperation}; +use util::num_ops::{read_u32, write_u32}; +use util::unix::limit_permission; +use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::eventfd::EventFd; + +use super::super::micro_vm::main_loop::MainLoop; +use super::errors::{ErrorKind, Result, ResultExt}; +use super::{ + Queue, VirtioDevice, VIRTIO_CONSOLE_F_SIZE, VIRTIO_F_VERSION_1, VIRTIO_MMIO_INT_VRING, + VIRTIO_TYPE_CONSOLE, +}; + +/// Number of virtqueues. +const QUEUE_NUM_CONSOLE: usize = 2; +/// Size of virtqueue. +const QUEUE_SIZE_CONSOLE: u16 = 256; + +#[derive(Copy, Clone, Debug, Default)] +#[repr(C)] +pub struct VirtioConsoleConfig { + max_nr_ports: u32, + emerg_wr: u32, +} + +impl ByteCode for VirtioConsoleConfig {} + +impl VirtioConsoleConfig { + /// Create configuration of virtio-console devices. + pub fn new() -> Self { + VirtioConsoleConfig { + max_nr_ports: 1_u32, + emerg_wr: 0_u32, + } + } +} + +/// Console device's IO handle context. +struct ConsoleHandler { + /// Virtqueue for console input. + input_queue: Arc>, + /// Virtqueue for console output. + output_queue: Arc>, + /// Eventfd of output_queue. + output_queue_evt: EventFd, + /// The address space to which the console device belongs. + mem_space: Arc, + /// Eventfd for triggering interrupts. + interrupt_evt: EventFd, + /// State of the interrupt in the device/function. + interrupt_status: Arc, + /// Bit mask of features negotiated by the backend and the frontend. + driver_features: u64, + /// Unix domain socket server. + listener: UnixListener, + /// Unix stream socket got by the incoming connection. + client: Option, +} + +impl ConsoleHandler { + #[allow(clippy::useless_asref)] + /// Handler for console input. + /// + /// # Arguments + /// + /// * `buffer` - where to put the input data. 
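+    ///
+    /// A sketch (assumed sizes, plain `usize` arithmetic) of how the input is
+    /// spread across the guest's descriptor chain below: each element receives
+    /// at most its own length, and copying stops once the buffer is consumed.
+    ///
+    /// ```
+    /// let buffer_len = 10usize;
+    /// let elem_lens = [4usize, 4, 8];
+    /// let mut written = 0usize;
+    /// for &len in elem_lens.iter() {
+    ///     written = std::cmp::min(written + len, buffer_len);
+    ///     if written >= buffer_len { break; }
+    /// }
+    /// assert_eq!(written, 10);
+    /// ```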
+ pub fn input_handle(&mut self, buffer: &mut [u8]) -> Result<()> { + let mut queue_lock = self.input_queue.lock().unwrap(); + + let count = buffer.len(); + if count == 0 { + return Ok(()); + } + + while let Ok(elem) = queue_lock + .vring + .pop_avail(&self.mem_space, self.driver_features) + { + let mut write_count = 0_usize; + for elem_iov in elem.in_iovec.iter() { + let allow_write_count = cmp::min(write_count + elem_iov.len as usize, count); + let source_slice = &mut buffer[write_count..allow_write_count]; + + let write_result = self.mem_space.write( + &mut source_slice.as_ref(), + elem_iov.addr, + source_slice.len() as u64, + ); + match write_result { + Ok(_) => { + write_count = allow_write_count; + } + Err(e) => { + error!("Failed to write slice: {:?}", e); + break; + } + } + } + + match queue_lock + .vring + .add_used(&self.mem_space, elem.index, write_count as u32) + { + Ok(_) => (), + Err(e) => { + error!("Failed to add used ring {}: {:?}", elem.index, e); + break; + } + } + + if write_count >= count { + break; + } + } + + self.interrupt_status + .fetch_or(VIRTIO_MMIO_INT_VRING, Ordering::SeqCst); + self.interrupt_evt + .write(1) + .chain_err(|| ErrorKind::EventFdWrite)?; + Ok(()) + } + + /// Handler for console output. + pub fn output_handle(&mut self) -> Result<()> { + let mut queue_lock = self.output_queue.lock().unwrap(); + let mut buffer = [0_u8; 4096]; + + while let Ok(elem) = queue_lock + .vring + .pop_avail(&self.mem_space, self.driver_features) + { + let mut read_count = 0_usize; + for elem_iov in elem.out_iovec.iter() { + let allow_read_count = cmp::min(read_count + elem_iov.len as usize, buffer.len()); + let mut slice = &mut buffer[read_count..allow_read_count]; + + let read_result = self.mem_space.read( + &mut slice, + elem_iov.addr, + (allow_read_count - read_count) as u64, + ); + match read_result { + Ok(_) => { + read_count = allow_read_count; + } + Err(e) => { + error!("Failed to read buffer: {:?}", e); + break; + } + }; + } + + if let Some(mut client) = self.client.as_ref() { + if let Err(e) = client.write(&buffer[..read_count as usize]) { + error!("Failed to write console output: {}.", e); + }; + } + + if let Err(e) = queue_lock.vring.add_used(&self.mem_space, elem.index, 0) { + error!("Failed to add used ring {}: {:?}", elem.index, e); + break; + } + } + + Ok(()) + } +} + +impl EventNotifierHelper for ConsoleHandler { + fn internal_notifiers(console_handler: Arc>) -> Vec { + let mut notifiers = Vec::new(); + + let cls_outer = console_handler.clone(); + let handler = Box::new(move |_, _| { + let cls = cls_outer.clone(); + let (stream, _) = cls.lock().unwrap().listener.accept().unwrap(); + let listener_fd = cls.lock().unwrap().listener.as_raw_fd(); + let stream_fd = stream.as_raw_fd(); + cls.lock().unwrap().client = Some(stream); + let cls_inner = cls.clone(); + + let cls_mid = cls; + let handler = Box::new(move |event, _| { + if event == EventSet::IN { + let cls_inner = cls_mid.clone(); + let mut cls_inner_lk = cls_inner.lock().unwrap(); + + if let Some(client) = &cls_inner_lk.client { + let mut client_inner = client.try_clone().unwrap(); + + let mut buffer = [0_u8; 4096]; + if let Ok(nr) = client_inner.read(&mut buffer) { + let _ = cls_inner_lk.input_handle(&mut buffer[..nr]); + } + } + } + + if event & EventSet::HANG_UP == EventSet::HANG_UP { + cls_inner.lock().unwrap().client = None; + Some(vec![EventNotifier::new( + NotifierOperation::Delete, + stream_fd, + Some(listener_fd), + EventSet::IN | EventSet::HANG_UP, + Vec::new(), + )]) + } else { + None as Option> 
+ } + }); + + Some(vec![EventNotifier::new( + NotifierOperation::AddShared, + stream_fd, + Some(listener_fd), + EventSet::IN | EventSet::HANG_UP, + vec![Arc::new(Mutex::new(handler))], + )]) + }); + + notifiers.push(EventNotifier::new( + NotifierOperation::AddShared, + console_handler.lock().unwrap().listener.as_raw_fd(), + None, + EventSet::IN, + vec![Arc::new(Mutex::new(handler))], + )); + + let cls = console_handler.clone(); + let handler = Box::new(move |_, fd: RawFd| { + read_fd(fd); + + let _ = cls.clone().lock().unwrap().output_handle(); + + None as Option> + }); + + notifiers.push(EventNotifier::new( + NotifierOperation::AddShared, + console_handler.lock().unwrap().output_queue_evt.as_raw_fd(), + None, + EventSet::IN, + vec![Arc::new(Mutex::new(handler))], + )); + + notifiers + } +} + +/// Virtio console device structure. +pub struct Console { + /// Virtio configuration. + config: Arc>, + /// Bit mask of features supported by the backend. + device_features: u64, + /// Bit mask of features negotiated by the backend and the frontend. + driver_features: u64, + /// UnixListener for virtio-console to communicate in host. + listener: UnixListener, +} + +impl Console { + /// Create a virtio-console device. + /// + /// # Arguments + /// + /// * `console_cfg` - Device configuration set by user. + pub fn new(console_cfg: ConsoleConfig) -> Self { + let path = console_cfg.socket_path; + let listener = UnixListener::bind(path.as_str()) + .unwrap_or_else(|_| panic!("Failed to bind socket {}", path)); + + limit_permission(path.as_str()) + .unwrap_or_else(|_| panic!("Failed to change file permission for {}", path)); + + Console { + config: Arc::new(Mutex::new(VirtioConsoleConfig::new())), + device_features: 0_u64, + driver_features: 0_u64, + listener, + } + } +} + +impl VirtioDevice for Console { + /// Realize vhost virtio network device. + fn realize(&mut self) -> Result<()> { + self.device_features = 1_u64 << VIRTIO_F_VERSION_1 | 1_u64 << VIRTIO_CONSOLE_F_SIZE; + + Ok(()) + } + + /// Get the virtio device type, refer to Virtio Spec. + fn device_type(&self) -> u32 { + VIRTIO_TYPE_CONSOLE + } + + /// Get the count of virtio device queues. + fn queue_num(&self) -> usize { + QUEUE_NUM_CONSOLE + } + + /// Get the queue size of virtio device. + fn queue_size(&self) -> u16 { + QUEUE_SIZE_CONSOLE + } + + /// Get device features from host. + fn get_device_features(&self, features_select: u32) -> u32 { + read_u32(self.device_features, features_select) + } + + /// Set driver features by guest. + fn set_driver_features(&mut self, page: u32, value: u32) { + let mut v = write_u32(value, page); + let unrequested_features = v & !self.device_features; + if unrequested_features != 0 { + warn!("Received acknowledge request with unknown feature."); + v &= !unrequested_features; + } + self.driver_features |= v; + } + + /// Read data of config from guest. + fn read_config(&self, offset: u64, mut data: &mut [u8]) -> Result<()> { + let config = *self.config.lock().unwrap(); + let config_slice = config.as_bytes(); + let config_len = config_slice.len() as u64; + if offset >= config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len).into()); + } + + if let Some(end) = offset.checked_add(data.len() as u64) { + data.write_all(&config_slice[offset as usize..cmp::min(end, config_len) as usize])?; + } + + Ok(()) + } + + /// Write data to config from guest. 
+ fn write_config(&mut self, _offset: u64, _data: &[u8]) -> Result<()> { + bail!("No device config space") + } + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. + fn activate( + &mut self, + mem_space: Arc, + interrupt_evt: EventFd, + interrupt_status: Arc, + mut queues: Vec>>, + mut queue_evts: Vec, + ) -> Result<()> { + queue_evts.remove(0); // input_queue_evt never used + + let handler = ConsoleHandler { + input_queue: queues.remove(0), + output_queue: queues.remove(0), + output_queue_evt: queue_evts.remove(0), + mem_space, + interrupt_evt: interrupt_evt.try_clone()?, + interrupt_status, + driver_features: self.driver_features, + listener: self.listener.try_clone()?, + client: None, + }; + + MainLoop::update_event(EventNotifierHelper::internal_notifiers(Arc::new( + Mutex::new(handler), + )))?; + + Ok(()) + } +} diff --git a/device_model/src/virtio/mod.rs b/device_model/src/virtio/mod.rs new file mode 100644 index 00000000..aee8b05f --- /dev/null +++ b/device_model/src/virtio/mod.rs @@ -0,0 +1,235 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! # Virtio +//! +//! This mod is used for virtio device. +//! +//! ## Design +//! +//! This module offers support for: +//! 1. Some Spec specified const variable used by virtio device. +//! 2. Virtio Device trait +//! +//! ## Platform Support +//! +//! - `x86_64` +//! - `aarch64` +pub mod block; +pub mod console; +pub mod net; +mod queue; +pub mod vhost; + +pub use self::block::Block; +pub use self::console::Console; +pub use self::net::Net; +pub use self::queue::*; + +use std::sync::atomic::AtomicU32; +use std::sync::{Arc, Mutex}; + +use address_space::AddressSpace; +use machine_manager::config::ConfigCheck; +use vmm_sys_util::eventfd::EventFd; + +/// Check if the bit of features is configured. +pub fn virtio_has_feature(feature: u64, fbit: u32) -> bool { + feature & (1 << fbit) != 0 +} + +/// Identifier of different virtio device, refer to Virtio Spec. +pub const VIRTIO_TYPE_NET: u32 = 1; +pub const VIRTIO_TYPE_BLOCK: u32 = 2; +pub const VIRTIO_TYPE_CONSOLE: u32 = 3; +pub const _VIRTIO_TYPE_RNG: u32 = 4; +pub const _VIRTIO_TYPE_BALLOON: u32 = 5; +pub const VIRTIO_TYPE_VSOCK: u32 = 19; +pub const _VIRTIO_TYPE_FS: u32 = 26; + +/// Feature Bits, refer to Virtio Spec. +/// Negotiating this feature indicates that the driver can use descriptors +/// with the VIRTQ_DESC_F_INDIRECT flag set. +pub const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +/// This feature enables the used_event and the avail_event fields. +pub const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +/// Indicates compliance with Virtio Spec. +pub const VIRTIO_F_VERSION_1: u32 = 32; +/// This feature indicates that the device can be used on a platform +/// where device access to data in memory is limited and/or translated. +pub const VIRTIO_F_ACCESS_PLATFORM: u32 = 33; +/// This feature indicates support for the packed virtqueue layout. 
+pub const VIRTIO_F_RING_PACKED: u32 = 34; + +/// Device handles packets with partial checksum. +pub const VIRTIO_NET_F_CSUM: u32 = 0; +/// Driver handles packets with partial checksum. +pub const VIRTIO_NET_F_GUEST_CSUM: u32 = 1; +/// Device has given MAC address. +pub const VIRTIO_NET_F_MAC: u32 = 5; +/// Driver can receive TSOv4. +pub const VIRTIO_NET_F_GUEST_TSO4: u32 = 7; +/// Driver can receive UFO. +pub const VIRTIO_NET_F_GUEST_UFO: u32 = 10; +/// Device can receive TSOv4. +pub const VIRTIO_NET_F_HOST_TSO4: u32 = 11; +/// Device can receive UFO. +pub const VIRTIO_NET_F_HOST_UFO: u32 = 14; +/// Configuration cols and rows are valid. +pub const VIRTIO_CONSOLE_F_SIZE: u64 = 0; +/// Maximum size of any single segment is in size_max. +pub const VIRTIO_BLK_F_SIZE_MAX: u32 = 1; +/// Maximum number of segments in a request is in seg_max. +pub const VIRTIO_BLK_F_SEG_MAX: u32 = 2; +/// Device is read-only. +pub const VIRTIO_BLK_F_RO: u32 = 5; +/// Cache flush command support. +pub const VIRTIO_BLK_F_FLUSH: u32 = 9; + +/// The IO type of virtio block, refer to Virtio Spec. +/// Read. +pub const VIRTIO_BLK_T_IN: u32 = 0; +/// Write. +pub const VIRTIO_BLK_T_OUT: u32 = 1; +/// Flush. +pub const VIRTIO_BLK_T_FLUSH: u32 = 4; +/// Device id +pub const VIRTIO_BLK_T_GET_ID: u32 = 8; +/// Device id length +pub const VIRTIO_BLK_ID_BYTES: u32 = 20; +/// Success +pub const VIRTIO_BLK_S_OK: u32 = 0; + +/// Interrupt status: Used Buffer Notification +pub const VIRTIO_MMIO_INT_VRING: u32 = 0x01; +/// Interrupt status: Configuration Change Notification +pub const VIRTIO_MMIO_INT_CONFIG: u32 = 0x02; + +/// The offset between notify reg's address and base MMIO address +/// Guest OS uses notify reg to notify the VMM. +pub const NOTIFY_REG_OFFSET: u32 = 0x50; + +/// Packet header, refer to Virtio Spec. +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct VirtioNetHdr { + pub flags: u8, + pub gso_type: u8, + pub hdr_len: u16, + pub gso_size: u16, + pub csum_start: u16, + pub csum_offset: u16, + pub num_buffers: u16, +} + +pub mod errors { + error_chain! { + foreign_links { + Io(std::io::Error); + } + links { + Util(util::errors::Error, util::errors::ErrorKind); + AddressSpace(address_space::errors::Error, address_space::errors::ErrorKind); + } + errors { + EventFdCreate { + display("Failed to create eventfd.") + } + EventFdWrite { + display("Failed to write eventfd.") + } + ThreadCreate(name: String) { + display("Failed to create {} thread", name) + } + ChannelSend(value: String) { + display("Failed to send {} on the channel", value) + } + QueueIndex(index: u16, size: u16) { + display("Queue index {} invalid, queue size is {}", index, size) + } + QueueDescInvalid { + display("Vring descriptor is invalid") + } + DevConfigOverflow(offset: u64, size: u64) { + display("Failed to r/w dev config space: overflows, offset {}, space size {}", offset, size) + } + InterruptTrigger { + display("Failed to trigger interrupt") + } + VhostIoctl(ioctl: String) { + display("Vhost ioctl failed: {}", ioctl) + } + } + } +} +pub use self::errors::*; + +/// The trait for virtio device operations. +pub trait VirtioDevice: Send { + /// Realize low level device. + fn realize(&mut self) -> Result<()>; + + /// Get the virtio device type, refer to Virtio Spec. + fn device_type(&self) -> u32; + + /// Get the count of virtio device queues. + fn queue_num(&self) -> usize; + + /// Get the queue size of virtio device. + fn queue_size(&self) -> u16; + + /// Get device features from host. 
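+    ///
+    /// A sketch (assumed feature word) of the page selection implementations
+    /// perform: `features_select` 0 returns the low 32 bits, 1 the high 32 bits
+    /// of the 64-bit feature word.
+    ///
+    /// ```
+    /// let features: u64 = 0x0000_00ab_0000_00cd;
+    /// assert_eq!((features & 0xffff_ffff) as u32, 0x0000_00cd); // page 0
+    /// assert_eq!((features >> 32) as u32, 0x0000_00ab);         // page 1
+    /// ```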
+ fn get_device_features(&self, features_select: u32) -> u32; + + /// Set driver features by guest. + fn set_driver_features(&mut self, page: u32, value: u32); + + /// Read data of config from guest. + fn read_config(&self, offset: u64, data: &mut [u8]) -> Result<()>; + + /// Write data to config from guest. + fn write_config(&mut self, offset: u64, data: &[u8]) -> Result<()>; + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. + /// + /// # Arguments + /// + /// * `mem_space` - System mem. + /// * `interrupt_evt` - The eventfd used to send interrupt to guest. + /// * `interrupt_status` - The interrupt status present to guest. + /// * `queues` - The virtio queues. + /// * `queue_evts` - The notifier events from guest. + fn activate( + &mut self, + mem_space: Arc, + interrupt_evt: EventFd, + interrupt_status: Arc, + queues: Vec>>, + queue_evts: Vec, + ) -> Result<()>; + + /// Reset virtio device. + fn reset(&mut self) -> Option<()> { + None + } + + /// Update the low level config of MMIO device, + /// for example: update the images file fd of virtio block device. + /// + /// # Arguments + /// + /// * `_file_path` - The related backend file path. + fn update_config(&mut self, _dev_config: Option>) -> Result<()> { + bail!("Unsupported to update configuration") + } +} diff --git a/device_model/src/virtio/net.rs b/device_model/src/virtio/net.rs new file mode 100644 index 00000000..05b41b50 --- /dev/null +++ b/device_model/src/virtio/net.rs @@ -0,0 +1,706 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::io::Write; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::mpsc::{channel, Receiver, Sender}; +use std::sync::{Arc, Mutex}; +use std::{cmp, mem}; + +use address_space::AddressSpace; +use machine_manager::config::{ConfigCheck, NetworkInterfaceConfig}; +use util::byte_code::ByteCode; +use util::epoll_context::{ + read_fd, EventNotifier, EventNotifierHelper, NotifierCallback, NotifierOperation, +}; +use util::num_ops::{read_u32, write_u32}; +use util::tap::{Tap, TUN_F_VIRTIO}; +use vmm_sys_util::{epoll::EventSet, eventfd::EventFd}; + +use super::super::micro_vm::main_loop::MainLoop; +use super::errors::{ErrorKind, Result, ResultExt}; +use super::{ + Queue, VirtioDevice, VirtioNetHdr, VIRTIO_F_VERSION_1, VIRTIO_MMIO_INT_VRING, + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_UFO, + VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC, VIRTIO_TYPE_NET, +}; + +/// Number of virtqueues. +const QUEUE_NUM_NET: usize = 2; +/// Size of each virtqueue. +const QUEUE_SIZE_NET: u16 = 256; +/// The maximum buffer size when segmentation offload is enabled. +/// This includes a 12-byte virtio net header, refer to Virtio Spec. +const FRAME_BUF_SIZE: usize = 65562; + +type SenderConfig = Option; + +/// Configuration of virtio-net devices. 
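`get_device_features(features_select)` and `set_driver_features(page, value)` move the 64-bit feature word between backend and guest 32 bits at a time: page 0 is the low half, page 1 the high half. The stand-ins below only illustrate that arithmetic; they are not the crate's `util::num_ops::{read_u32, write_u32}` helpers, whose exact signatures are not shown here:

```rust
/// Stand-in for reading one 32-bit page out of a 64-bit feature word.
fn read_feature_page(features: u64, page: u32) -> u32 {
    match page {
        0 => features as u32,
        1 => (features >> 32) as u32,
        _ => 0,
    }
}

/// Stand-in for placing a 32-bit acknowledgement into the 64-bit word.
fn write_feature_page(value: u32, page: u32) -> u64 {
    match page {
        0 => u64::from(value),
        1 => u64::from(value) << 32,
        _ => 0,
    }
}

fn main() {
    // VIRTIO_NET_F_MAC (bit 5) lives in page 0, VIRTIO_F_VERSION_1 (bit 32) in page 1.
    let device_features: u64 = (1 << 32) | (1 << 5);

    assert_eq!(read_feature_page(device_features, 0), 1 << 5);
    assert_eq!(read_feature_page(device_features, 1), 1);

    // A driver acknowledging page 1 contributes the high half of the 64-bit word.
    assert_eq!(write_feature_page(1, 1), 1u64 << 32);
}
```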
+#[repr(C, packed)] +#[derive(Copy, Clone, Debug, Default)] +pub struct VirtioNetConfig { + /// Mac Address. + pub mac: [u8; 6], + /// Device status. + pub status: u16, + /// Maximum number of each of transmit and receive queues. + pub max_virtqueue_pairs: u16, + /// Maximum Transmission Unit. + pub mtu: u16, + /// Speed, in units of 1Mb. + pub speed: u32, + /// 0x00 - half duplex + /// 0x01 - full duplex + pub duplex: u8, +} + +impl ByteCode for VirtioNetConfig {} + +/// Transmit virtqueue. +struct TxVirtio { + /// Virtqueue. + queue: Arc>, + /// Eventfd of this virtqueue for notifing. + queue_evt: EventFd, + /// Buffer data to transmit. + frame_buf: [u8; FRAME_BUF_SIZE], +} + +impl TxVirtio { + /// Create a transmit virqueue. + /// + /// # Arguments + /// + /// * `queue` - The virtqueue. + /// * `queue_evt` - Eventfd of this virtqueue for notifing. + fn new(queue: Arc>, queue_evt: EventFd) -> Self { + TxVirtio { + queue, + queue_evt, + frame_buf: [0u8; FRAME_BUF_SIZE], + } + } +} + +/// Receive virtqueue. +struct RxVirtio { + /// True if some frame not received successfully. + unfinished_frame: bool, + /// True if interrupt is required to notify the guest. + need_irqs: bool, + /// Virtqueue. + queue: Arc>, + /// Eventfd of this virtqueue for notifing. + queue_evt: EventFd, + /// Size of data received. + bytes_read: usize, + /// Buffer data received. + frame_buf: [u8; FRAME_BUF_SIZE], +} + +impl RxVirtio { + /// Create a receive virqueue. + /// + /// # Arguments + /// + /// * `queue` - The virtqueue. + /// * `queue_evt` - Eventfd of this virtqueue for notifing. + fn new(queue: Arc>, queue_evt: EventFd) -> Self { + RxVirtio { + unfinished_frame: false, + need_irqs: false, + queue, + queue_evt, + bytes_read: 0, + frame_buf: [0u8; FRAME_BUF_SIZE], + } + } +} + +/// Control block of network IO. +pub struct NetIoHandler { + /// The receive virtqueue. + rx: RxVirtio, + /// The transmit virtqueue. + tx: TxVirtio, + /// Tap device opened. + tap: Option, + tap_fd: RawFd, + /// The address space to which the network device belongs. + mem_space: Arc, + /// Eventfd for interrupt. + interrupt_evt: EventFd, + /// State of the interrupt in the device/function. + interrupt_status: Arc, + /// Bit mask of features negotiated by the backend and the frontend. + driver_features: u64, + /// The receiving half of Rust's channel to receive tap information. + receiver: Receiver, + /// Eventfd for config space update. 
+ update_evt: RawFd, +} + +impl NetIoHandler { + #[allow(clippy::useless_asref)] + fn handle_frame_rx(&mut self) -> Result<()> { + let elem = self + .rx + .queue + .lock() + .unwrap() + .vring + .pop_avail(&self.mem_space, self.driver_features) + .chain_err(|| "Failed to pop avail ring")?; + + let mut write_count = 0; + for elem_iov in elem.in_iovec.iter() { + let allow_write_count = + cmp::min(write_count + elem_iov.len as usize, self.rx.bytes_read); + + let source_slice = &self.rx.frame_buf[write_count..allow_write_count]; + match self.mem_space.write( + &mut source_slice.as_ref(), + elem_iov.addr, + source_slice.len() as u64, + ) { + Ok(_) => { + write_count = allow_write_count; + } + Err(e) => { + error!("Failed to write slice: err {:?}", e); + break; + } + } + + if write_count >= self.rx.bytes_read { + break; + } + } + + self.rx + .queue + .lock() + .unwrap() + .vring + .add_used(&self.mem_space, elem.index, write_count as u32) + .chain_err(|| format!("Failed to add used ring {}", elem.index))?; + self.rx.need_irqs = true; + + if write_count < self.rx.bytes_read { + bail!( + "The length {} which is written is less than the length {} of buffer which is read", + write_count, + self.rx.bytes_read + ); + } + + Ok(()) + } + + fn handle_last_frame_rx(&mut self) -> Result<()> { + if self.handle_frame_rx().is_ok() { + self.rx.unfinished_frame = false; + self.handle_rx()?; + } else if self.rx.need_irqs { + self.rx.need_irqs = false; + self.interrupt_status + .fetch_or(VIRTIO_MMIO_INT_VRING, Ordering::SeqCst); + self.interrupt_evt + .write(1) + .chain_err(|| ErrorKind::EventFdWrite)?; + } + + Ok(()) + } + + fn handle_rx(&mut self) -> Result<()> { + while let Some(tap) = self.tap.as_mut() { + match tap.read(&mut self.rx.frame_buf) { + Ok(count) => { + self.rx.bytes_read = count; + if self.handle_frame_rx().is_err() { + self.rx.unfinished_frame = true; + break; + } + } + Err(e) => { + match e.raw_os_error() { + Some(err) if err == libc::EAGAIN => (), + _ => { + bail!("Failed to read tap"); + } + }; + break; + } + } + } + + if self.rx.need_irqs { + self.rx.need_irqs = false; + self.interrupt_status + .fetch_or(VIRTIO_MMIO_INT_VRING, Ordering::SeqCst); + self.interrupt_evt + .write(1) + .chain_err(|| ErrorKind::EventFdWrite)?; + } + + Ok(()) + } + + fn handle_tx(&mut self) -> Result<()> { + let mut queue = self.tx.queue.lock().unwrap(); + + while let Ok(elem) = queue.vring.pop_avail(&self.mem_space, self.driver_features) { + let mut read_count = 0; + for elem_iov in elem.out_iovec.iter() { + let alloc_read_count = + cmp::min(read_count + elem_iov.len as usize, self.tx.frame_buf.len()); + + let mut slice = &mut self.tx.frame_buf[read_count..alloc_read_count as usize]; + self.mem_space + .read( + &mut slice, + elem_iov.addr, + (alloc_read_count - read_count) as u64, + ) + .chain_err(|| "Failed to read buffer for transmit")?; + + read_count = alloc_read_count; + } + if let Some(tap) = self.tap.as_mut() { + tap.write(&self.tx.frame_buf[..read_count as usize]) + .chain_err(|| "Net: tx: failed to write to tap")?; + } + + queue + .vring + .add_used(&self.mem_space, elem.index, 0) + .chain_err(|| format!("Net tx:Failed to add used ring {}", elem.index))?; + } + + Ok(()) + } + + fn update_evt_handler(net_io: &Arc>) -> Option> { + let mut locked_net_io = net_io.lock().unwrap(); + locked_net_io.tap = match locked_net_io.receiver.recv() { + Ok(tap) => tap, + Err(e) => { + error!("Failed to receive the tap {}", e); + None + } + }; + let old_tap_fd = locked_net_io.tap_fd; + locked_net_io.tap_fd = -1; + if let 
Some(tap) = locked_net_io.tap.as_ref() { + locked_net_io.tap_fd = tap.as_raw_fd(); + } + + let mut notifiers = Vec::new(); + notifiers.push(build_event_notifier( + locked_net_io.update_evt, + None, + NotifierOperation::Delete, + EventSet::IN, + )); + notifiers.push(build_event_notifier( + locked_net_io.rx.queue_evt.as_raw_fd(), + None, + NotifierOperation::Delete, + EventSet::IN, + )); + notifiers.push(build_event_notifier( + locked_net_io.tx.queue_evt.as_raw_fd(), + None, + NotifierOperation::Delete, + EventSet::IN, + )); + if old_tap_fd != -1 { + notifiers.push(build_event_notifier( + old_tap_fd, + None, + NotifierOperation::Delete, + EventSet::IN, + )); + } + drop(locked_net_io); + + notifiers.append(&mut EventNotifierHelper::internal_notifiers(net_io.clone())); + Some(notifiers) + } +} + +fn build_event_notifier( + fd: RawFd, + handler: Option>, + op: NotifierOperation, + event: EventSet, +) -> EventNotifier { + let mut handlers = Vec::new(); + if let Some(h) = handler { + handlers.push(Arc::new(Mutex::new(h))); + } + EventNotifier::new(op, fd, None, event, handlers) +} + +impl EventNotifierHelper for NetIoHandler { + fn internal_notifiers(net_io: Arc>) -> Vec { + // Register event notifier for update_evt. + let locked_net_io = net_io.lock().unwrap(); + let cloned_net_io = net_io.clone(); + let handler: Box = Box::new(move |_, fd: RawFd| { + read_fd(fd); + NetIoHandler::update_evt_handler(&cloned_net_io) + }); + let mut notifiers = Vec::new(); + let update_fd = locked_net_io.update_evt; + notifiers.push(build_event_notifier( + update_fd, + Some(handler), + NotifierOperation::AddShared, + EventSet::IN, + )); + + // Register event notifier for rx. + let cloned_net_io = net_io.clone(); + let handler: Box = Box::new(move |_, fd: RawFd| { + let mut locked_net_io = cloned_net_io.lock().unwrap(); + read_fd(fd); + if locked_net_io.rx.unfinished_frame { + locked_net_io + .handle_last_frame_rx() + .map_err(|e| error!("Failed to handle last frame(rx), {}", e)) + .ok(); + } + None + }); + let rx_fd = locked_net_io.rx.queue_evt.as_raw_fd(); + notifiers.push(build_event_notifier( + rx_fd, + Some(handler), + NotifierOperation::AddShared, + EventSet::IN, + )); + + // Register event notifier for tx. + let cloned_net_io = net_io.clone(); + let handler: Box = Box::new(move |_, fd: RawFd| { + read_fd(fd); + cloned_net_io + .lock() + .unwrap() + .handle_tx() + .map_err(|e| error!("Failed to handle tx, {}", e)) + .ok(); + None + }); + let tx_fd = locked_net_io.tx.queue_evt.as_raw_fd(); + notifiers.push(build_event_notifier( + tx_fd, + Some(handler), + NotifierOperation::AddShared, + EventSet::IN, + )); + + // Register event notifier for tap. + let cloned_net_io = net_io.clone(); + if let Some(tap) = locked_net_io.tap.as_ref() { + let handler: Box = Box::new(move |_, _| { + let mut locked_net_io = cloned_net_io.lock().unwrap(); + if locked_net_io.rx.unfinished_frame { + locked_net_io + .handle_last_frame_rx() + .map_err(|e| error!("Failed to handle last frame(rx), {}", e)) + .ok(); + } else { + locked_net_io + .handle_rx() + .map_err(|e| error!("Failed to handle rx, {}", e)) + .ok(); + } + None + }); + let tap_fd = tap.as_raw_fd(); + notifiers.push(build_event_notifier( + tap_fd, + Some(handler), + NotifierOperation::AddShared, + EventSet::IN | EventSet::EDGE_TRIGGERED, + )); + } + + notifiers + } +} + +/// Network device structure. +pub struct Net { + /// Configuration of the network device. + net_cfg: NetworkInterfaceConfig, + /// Tap device opened. 
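`handle_rx` above keeps pulling frames from the nonblocking tap until the read would block, treating `EAGAIN` as "no more data for now". A standalone sketch of that loop shape using `std::io::ErrorKind::WouldBlock` (the portable spelling of `EAGAIN`); `drain_nonblocking` is a hypothetical helper, and any nonblocking `Read` source stands in for the tap:

```rust
use std::io::{ErrorKind, Read};

/// Hypothetical helper: read frames until the source would block, as handle_rx
/// does with the tap; each Ok(n) is where a frame would be copied into the
/// guest's receive queue.
fn drain_nonblocking<R: Read>(source: &mut R, buf: &mut [u8]) -> std::io::Result<usize> {
    let mut total = 0;
    loop {
        match source.read(buf) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == ErrorKind::WouldBlock => break, // EAGAIN: done for now
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

fn main() -> std::io::Result<()> {
    // An in-memory reader never returns WouldBlock, but it exercises the loop shape.
    let mut source = std::io::Cursor::new(vec![0u8; 4096]);
    let mut buf = [0u8; 1500];
    assert_eq!(drain_nonblocking(&mut source, &mut buf)?, 4096);
    Ok(())
}
```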
+ tap: Option, + /// Bit mask of features supported by the backend. + device_features: u64, + /// Bit mask of features negotiated by the backend and the frontend. + driver_features: u64, + /// Virtio net configurations. + device_config: VirtioNetConfig, + /// The send half of Rust's channel to send tap information. + sender: Option>, + /// Eventfd for config space update. + update_evt: EventFd, +} + +/// Set Mac address configured into the virtio configuration, and return features mask with +/// VIRTIO_NET_F_MAC set. +/// +/// # Arguments +/// +/// * `device_config` - Virtio net configurations. +/// * `mac` - Mac address configured by user. +pub fn build_device_config_space(device_config: &mut VirtioNetConfig, mac: &str) -> u64 { + let mut config_features = 0_u64; + let mut bytes = [0_u8; 6]; + for (i, s) in mac.split(':').collect::>().iter().enumerate() { + bytes[i] = if let Ok(v) = u8::from_str_radix(s, 16) { + v + } else { + return config_features; + }; + } + device_config.mac.copy_from_slice(&bytes); + config_features |= 1 << VIRTIO_NET_F_MAC; + + config_features +} + +/// Open tap device if no fd provided, configure and return it. +/// +/// # Arguments +/// +/// * `net_fd` - Fd of tap device opened. +/// * `host_dev_name` - Path of tap device on host. +pub fn create_tap(net_fd: Option, host_dev_name: Option<&str>) -> Result> { + if net_fd.is_none() && host_dev_name.is_none() { + return Ok(None); + } + if net_fd.is_some() && host_dev_name.is_some() { + error!("Create tap: fd and file_path exist meanwhile (use fd by default)"); + } + + let tap = if let Some(fd) = net_fd { + Tap::new(None, Some(fd)).chain_err(|| "Failed to create tap")? + } else { + // `unwrap()` won't fail because the arguments have been checked + let dev_name = host_dev_name.unwrap(); + Tap::new(Some(dev_name), None) + .chain_err(|| format!("Failed to create tap with name {}", dev_name))? + }; + + tap.set_offload(TUN_F_VIRTIO) + .chain_err(|| "Failed to set tap offload")?; + + let vnet_hdr_size = mem::size_of::() as u32; + tap.set_hdr_size(vnet_hdr_size) + .chain_err(|| "Failed to set tap hdr size")?; + + Ok(Some(tap)) +} + +impl Net { + /// Create a new virtio network device. + /// + /// # Arguments + /// + /// * `net_cfg` - Configuration of the network device. + pub fn new() -> Self { + Net { + net_cfg: Default::default(), + tap: None, + device_features: 0_u64, + driver_features: 0_u64, + device_config: VirtioNetConfig::default(), + sender: None, + update_evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + } + } +} + +impl VirtioDevice for Net { + /// Realize vhost virtio network device. 
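`build_device_config_space` above parses a colon-separated MAC string into the six config-space bytes and reports success through the `VIRTIO_NET_F_MAC` bit. Below is a standalone variant of just the parsing step; unlike the patch's function it also rejects inputs with the wrong number of segments and reports failure as `None` rather than a zero feature mask:

```rust
/// Parse an "aa:bb:cc:dd:ee:ff" style MAC into the six bytes that end up in
/// VirtioNetConfig::mac.
fn parse_mac(mac: &str) -> Option<[u8; 6]> {
    let parts: Vec<&str> = mac.split(':').collect();
    if parts.len() != 6 {
        return None;
    }
    let mut bytes = [0u8; 6];
    for (i, part) in parts.iter().enumerate() {
        bytes[i] = u8::from_str_radix(part, 16).ok()?;
    }
    Some(bytes)
}

fn main() {
    assert_eq!(
        parse_mac("52:54:00:12:34:56"),
        Some([0x52, 0x54, 0x00, 0x12, 0x34, 0x56])
    );
    assert_eq!(parse_mac("not-a-mac"), None);
}
```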
+ fn realize(&mut self) -> Result<()> { + self.device_features = 1 << VIRTIO_F_VERSION_1 + | 1 << VIRTIO_NET_F_CSUM + | 1 << VIRTIO_NET_F_GUEST_CSUM + | 1 << VIRTIO_NET_F_GUEST_TSO4 + | 1 << VIRTIO_NET_F_GUEST_UFO + | 1 << VIRTIO_NET_F_HOST_TSO4 + | 1 << VIRTIO_NET_F_HOST_UFO; + + if let Some(mac) = &self.net_cfg.mac { + self.device_features |= build_device_config_space(&mut self.device_config, mac); + } + + if self.net_cfg.host_dev_name != "" { + self.tap = None; + self.tap = create_tap(None, Some(&self.net_cfg.host_dev_name)) + .chain_err(|| "Failed to open tap with file path")?; + } else if let Some(fd) = self.net_cfg.tap_fd { + let mut need_create = true; + if let Some(tap) = &self.tap { + if fd == tap.as_raw_fd() { + need_create = false; + } + } + + if need_create { + self.tap = create_tap(Some(fd), None).chain_err(|| "Failed to open tap")?; + } + } else { + self.tap = None; + } + + if let Some(mac) = &self.net_cfg.mac { + self.device_features |= build_device_config_space(&mut self.device_config, mac); + } + + Ok(()) + } + + /// Get the virtio device type, refer to Virtio Spec. + fn device_type(&self) -> u32 { + VIRTIO_TYPE_NET + } + + /// Get the count of virtio device queues. + fn queue_num(&self) -> usize { + QUEUE_NUM_NET + } + + /// Get the queue size of virtio device. + fn queue_size(&self) -> u16 { + QUEUE_SIZE_NET + } + + /// Get device features from host. + fn get_device_features(&self, features_select: u32) -> u32 { + read_u32(self.device_features, features_select) + } + + /// Set driver features by guest. + fn set_driver_features(&mut self, page: u32, value: u32) { + let mut v = write_u32(value, page); + let unrequested_features = v & !self.device_features; + if unrequested_features != 0 { + warn!("Received acknowledge request with unknown feature: {:x}", v); + v &= !unrequested_features; + } + self.driver_features |= v; + } + + /// Read data of config from guest. + fn read_config(&self, offset: u64, mut data: &mut [u8]) -> Result<()> { + let config_slice = self.device_config.as_bytes(); + let config_len = config_slice.len() as u64; + if offset >= config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len).into()); + } + if let Some(end) = offset.checked_add(data.len() as u64) { + data.write_all(&config_slice[offset as usize..cmp::min(end, config_len) as usize])?; + } + Ok(()) + } + + /// Write data to config from guest. + fn write_config(&mut self, offset: u64, data: &[u8]) -> Result<()> { + let data_len = data.len(); + let config_slice = self.device_config.as_mut_bytes(); + let config_len = config_slice.len(); + if offset as usize + data_len > config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len as u64).into()); + } + + config_slice[(offset as usize)..(offset as usize + data_len)].copy_from_slice(&data[..]); + + Ok(()) + } + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. 
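`read_config` above serves guest reads of the virtio-net config space at arbitrary byte offsets: offsets at or past the end are rejected, and reads that would run past the end are clamped. The same logic on a plain byte slice, so it can be exercised without the `ByteCode` machinery; `read_config_bytes` is a hypothetical stand-in:

```rust
use std::cmp;
use std::io::Write;

/// Copy config bytes starting at `offset` into `data`, clamped to the end of
/// the config space, mirroring the bounds handling in Net::read_config.
fn read_config_bytes(config: &[u8], offset: u64, mut data: &mut [u8]) -> Result<(), String> {
    let config_len = config.len() as u64;
    if offset >= config_len {
        return Err(format!("config overflow: offset {}, size {}", offset, config_len));
    }
    if let Some(end) = offset.checked_add(data.len() as u64) {
        let end = cmp::min(end, config_len) as usize;
        data.write_all(&config[offset as usize..end])
            .map_err(|e| e.to_string())?;
    }
    Ok(())
}

fn main() {
    // A fake 12-byte config space; a 4-byte read at offset 6 stays in bounds.
    let config: Vec<u8> = (0u8..12).collect();
    let mut out = [0u8; 4];
    read_config_bytes(&config, 6, &mut out).unwrap();
    assert_eq!(out, [6, 7, 8, 9]);
    // Reads starting past the end are rejected, as in the device model.
    assert!(read_config_bytes(&config, 12, &mut out).is_err());
}
```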
+ fn activate( + &mut self, + mem_space: Arc, + interrupt_evt: EventFd, + interrupt_status: Arc, + mut queues: Vec>>, + mut queue_evts: Vec, + ) -> Result<()> { + let rx_queue = queues.remove(0); + let rx_queue_evt = queue_evts.remove(0); + let tx_queue = queues.remove(0); + let tx_queue_evt = queue_evts.remove(0); + + let (sender, receiver) = channel(); + self.sender = Some(sender); + + let tap_fd = if let Some(tap) = &self.tap { + tap.as_raw_fd() + } else { + -1 + }; + + let handler = NetIoHandler { + rx: RxVirtio::new(rx_queue, rx_queue_evt), + tx: TxVirtio::new(tx_queue, tx_queue_evt), + tap: self.tap.take(), + tap_fd, + mem_space, + interrupt_evt: interrupt_evt.try_clone()?, + interrupt_status, + driver_features: self.driver_features, + receiver, + update_evt: self.update_evt.as_raw_fd(), + }; + MainLoop::update_event(EventNotifierHelper::internal_notifiers(Arc::new( + Mutex::new(handler), + )))?; + + Ok(()) + } + + fn update_config(&mut self, dev_config: Option>) -> Result<()> { + if let Some(conf) = dev_config { + self.net_cfg = conf + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + } else { + self.net_cfg = Default::default(); + } + + self.realize()?; + + if let Some(sender) = &self.sender { + sender + .send(self.tap.take()) + .chain_err(|| ErrorKind::ChannelSend("tap fd".to_string()))?; + + self.update_evt + .write(1) + .chain_err(|| ErrorKind::EventFdWrite)?; + } + + Ok(()) + } +} diff --git a/device_model/src/virtio/queue.rs b/device_model/src/virtio/queue.rs new file mode 100644 index 00000000..0efa78a5 --- /dev/null +++ b/device_model/src/virtio/queue.rs @@ -0,0 +1,1525 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::cmp::min; +use std::mem::size_of; +use std::num::Wrapping; +use std::sync::atomic::{fence, Ordering}; +use std::sync::Arc; + +use address_space::{AddressSpace, GuestAddress}; +use util::byte_code::ByteCode; + +use super::errors::{ErrorKind, Result, ResultExt}; +use super::{virtio_has_feature, VIRTIO_F_RING_EVENT_IDX}; + +/// When host consumes a buffer, don't interrupt the guest. +const VRING_AVAIL_F_NO_INTERRUPT: u16 = 1; +/// Split Virtqueue. +pub const QUEUE_TYPE_SPLIT_VRING: u16 = 1; +/// Packed Virtqueue. +pub const QUEUE_TYPE_PACKED_VRING: u16 = 2; + +fn checked_offset_mem( + mmio_space: &Arc, + base: GuestAddress, + offset: u64, +) -> Result { + if !mmio_space.address_in_memory(base, offset) { + bail!( + "Invalid Address: base {}, size {}", + base.raw_value(), + offset + ); + } + base.checked_add(offset).ok_or_else(|| { + ErrorKind::Msg(format!( + "Address overflows: base {}, size {}", + base.raw_value(), + offset + )) + .into() + }) +} + +/// The configuration of virtqueue. +#[derive(Default, Clone, Copy)] +pub struct QueueConfig { + /// Guest physical address of the descriptor table. + pub desc_table: GuestAddress, + /// Guest physical address of the available ring. + pub avail_ring: GuestAddress, + /// Guest physical address of the used ring. 
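`checked_offset_mem` above guards every guest address computation twice: the range must sit inside guest memory, and `base + offset` must not wrap around `u64`. A simplified standalone version in which a bare `mem_size` stands in for the `AddressSpace::address_in_memory` check:

```rust
/// Simplified stand-in for checked_offset_mem: the access must stay inside a
/// guest of `mem_size` bytes and the address arithmetic must not overflow.
fn checked_offset(mem_size: u64, base: u64, offset: u64) -> Result<u64, String> {
    if base >= mem_size || offset > mem_size - base {
        return Err(format!("Invalid Address: base {}, size {}", base, offset));
    }
    base.checked_add(offset)
        .ok_or_else(|| format!("Address overflows: base {}, size {}", base, offset))
}

fn main() {
    let mem_size = 1 << 20; // a 1 MiB guest, as in the unit tests below
    assert_eq!(checked_offset(mem_size, 0x1000, 0x10), Ok(0x1010));
    assert!(checked_offset(mem_size, mem_size - 8, 16).is_err());
    assert!(checked_offset(mem_size, u64::MAX, 1).is_err());
}
```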
+ pub used_ring: GuestAddress, + /// The maximal size of elements offered by the device. + pub max_size: u16, + /// The queue size set by the guest. + pub size: u16, + /// Virtual queue ready bit. + pub ready: bool, +} + +impl QueueConfig { + /// Create configuration for a virtqueue. + /// + /// # Arguments + /// + /// * `max_size` - The maximum size of the virtqueue. + /// + pub fn new(max_size: u16) -> Self { + QueueConfig { + desc_table: GuestAddress(0), + avail_ring: GuestAddress(0), + used_ring: GuestAddress(0), + max_size, + size: 0, + ready: false, + } + } +} + +/// IO vector element which contains the information of a descriptor. +#[derive(Debug, Clone, Copy)] +pub struct ElemIovec { + /// Guest address of descriptor. + pub addr: GuestAddress, + /// Length of descriptor. + pub len: u32, +} + +/// IO request element. +pub struct Element { + /// Index of the descriptor in the table. + pub index: u16, + /// Number of descriptors. + pub desc_num: u16, + /// Vector to put host readable descriptors. + pub out_iovec: Vec, + /// Vector to put host writable descriptors. + pub in_iovec: Vec, +} + +impl Element { + /// Create an IO request element. + /// + /// # Arguments + /// + /// * `index` - The index of descriptor in the virqueue descriptor table. + pub fn new(index: u16) -> Self { + Element { + index, + desc_num: 0, + out_iovec: Vec::new(), + in_iovec: Vec::new(), + } + } +} + +/// Vring operations. +pub trait VringOps { + /// Return true if the configuration of vring is valid. + /// + /// # Arguments + /// + /// * `sys_mem` - Address space to which the vring belongs. + fn is_valid(&self, sys_mem: &Arc) -> bool; + + /// Assemble an IO request element with descriptors from the available vring. + /// + /// # Arguments + /// + /// * `sys_mem` - Address space to which the vring belongs. + /// * `features` - Bit mask of features negotiated by the backend and the frontend. + fn pop_avail(&mut self, sys_mem: &Arc, features: u64) -> Result; + + /// Fill the used vring after processing the IO request. + /// + /// # Arguments + /// + /// * `sys_mem` - Address space to which the vring belongs. + /// * `index` - Index of descriptor in the virqueue descriptor table. + /// * `len` - Total length of the descriptor chain which was used (written to). + fn add_used(&mut self, sys_mem: &Arc, index: u16, len: u32) -> Result<()>; + + /// Return true if guest needed to be notified. + /// + /// # Arguments + /// + /// * `sys_mem` - Address space to which the vring belongs. + /// * `features` - Bit mask of features negotiated by the backend and the frontend. + fn should_notify(&mut self, system_space: &Arc, features: u64) -> bool; + + /// Get the actual size of the vring. + fn actual_size(&self) -> u16; + + /// Get the configuration of the vring. + fn get_queue_config(&self) -> QueueConfig; +} + +/// Virtio used element. +#[repr(C)] +#[derive(Default, Clone, Copy)] +struct UsedElem { + /// Index of descriptor in the virqueue descriptor table. + id: u32, + /// Total length of the descriptor chain which was used (written to). + len: u32, +} + +impl ByteCode for UsedElem {} + +/// A struct including flags and idx for avail vring and used vring. +#[repr(C)] +#[derive(Default, Clone, Copy)] +struct SplitVringFlagsIdx { + flags: u16, + idx: u16, +} + +impl ByteCode for SplitVringFlagsIdx {} + +/// The length of used element. +const USEDELEM_LEN: u64 = size_of::() as u64; +/// The length of avail element. 
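An `Element` above is the device-side view of one descriptor chain: `out_iovec` holds the driver-written buffers the device may read, `in_iovec` the buffers the device may write, and `index` is what eventually goes back into the used ring. A small standalone illustration, with the two structs mirrored locally and guest addresses shown as plain `u64`:

```rust
// Local mirrors of ElemIovec/Element, re-declared so the snippet stands alone.
struct ElemIovec {
    addr: u64,
    len: u32,
}

struct Element {
    index: u16,
    out_iovec: Vec<ElemIovec>,
    in_iovec: Vec<ElemIovec>,
}

fn main() {
    // A typical request: one driver-filled header the device reads, and two
    // buffers the device is allowed to write.
    let elem = Element {
        index: 3,
        out_iovec: vec![ElemIovec { addr: 0x1000, len: 16 }],
        in_iovec: vec![
            ElemIovec { addr: 0x2000, len: 512 },
            ElemIovec { addr: 0x3000, len: 1 },
        ],
    };

    let readable: u32 = elem.out_iovec.iter().map(|iov| iov.len).sum();
    let writable: u32 = elem.in_iovec.iter().map(|iov| iov.len).sum();
    assert_eq!((readable, writable), (16, 513));

    // After filling the writable buffers, a handler reports the element back,
    // e.g. vring.add_used(&sys_mem, elem.index, bytes_written).
    println!("request header at {:#x}, element id {}", elem.out_iovec[0].addr, elem.index);
}
```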
+const AVAILELEM_LEN: u64 = size_of::() as u64; +/// The length of available ring except array of avail element(flags: u16 idx: u16 used_event: u16). +const VRING_AVAIL_LEN_EXCEPT_AVAILELEM: u64 = (size_of::() * 3) as u64; +/// The length of used ring except array of used element(flags: u16 idx: u16 avail_event: u16). +const VRING_USED_LEN_EXCEPT_USEDELEM: u64 = (size_of::() * 3) as u64; +/// The length of flags(u16) and idx(u16). +const VRING_FLAGS_AND_IDX_LEN: u64 = size_of::() as u64; +/// The position of idx in the available ring and the used ring. +const VRING_IDX_POSITION: u64 = size_of::() as u64; +/// This marks a buffer as continuing via the next field. +const VIRTQ_DESC_F_NEXT: u16 = 0x1; +/// This marks a buffer as write-only (otherwise read-only). +const VIRTQ_DESC_F_WRITE: u16 = 0x2; +/// This means the buffer contains a list of buffer descriptors. +const VIRTQ_DESC_F_INDIRECT: u16 = 0x4; + +/// Descriptor of split vring. +#[repr(C)] +#[derive(Default, Clone, Copy)] +pub struct SplitVringDesc { + /// Address (guest-physical). + pub addr: GuestAddress, + /// Length. + pub len: u32, + /// The flags as indicated above. + pub flags: u16, + /// We chain unused descriptors via this, too. + pub next: u16, +} + +/// The length of virtio descriptor. +const DESCRIPTOR_LEN: u64 = size_of::() as u64; + +impl SplitVringDesc { + /// Create a descriptor of split vring. + /// + /// # Arguments + /// + /// * `sys_mem` - Address space to which the vring belongs. + /// * `desc_table` - Guest address of virtqueue descriptor table. + /// * `queue_size` - Size of virtqueue. + /// * `index` - Index of descriptor in the virqueue descriptor table. + pub fn new( + sys_mem: &Arc, + desc_table: GuestAddress, + queue_size: u16, + index: u16, + ) -> Result { + if index >= queue_size { + return Err(ErrorKind::QueueIndex(index, queue_size).into()); + } + + let desc = + if let Some(desc_addr) = desc_table.checked_add(u64::from(index) * DESCRIPTOR_LEN) { + sys_mem.read_object::(desc_addr)? + } else { + bail!( + "Address overflows: addr {}, size {}", + desc_table.raw_value(), + u64::from(index) * DESCRIPTOR_LEN + ); + }; + if desc.is_valid(sys_mem, queue_size) { + Ok(desc) + } else { + Err(ErrorKind::QueueDescInvalid.into()) + } + } + + /// Return true if the descriptor is valid. + fn is_valid(&self, sys_mem: &Arc, queue_size: u16) -> bool { + !(checked_offset_mem(&sys_mem, self.addr, u64::from(self.len)).is_err() + || (self.has_next() && self.next >= queue_size)) + } + + /// Return true if this descriptor has next descriptor. + fn has_next(&self) -> bool { + self.flags & VIRTQ_DESC_F_NEXT != 0 + } + + /// Get the next descriptor in descriptor chain. + fn next_desc( + sys_mem: &Arc, + desc_table: GuestAddress, + queue_size: u16, + index: u16, + ) -> Result { + SplitVringDesc::new(sys_mem, desc_table, queue_size, index) + .chain_err(|| format!("Failed to find next descriptor {}", index)) + } + + /// Check whether this descriptor is write-only or read-only. + /// Write-only means that the emulated device can write and the driver can read. + fn write_only(&self) -> bool { + self.flags & VIRTQ_DESC_F_WRITE != 0 + } + + /// Return true if this descriptor is a indirect descriptor. + fn is_indirect_desc(&self) -> bool { + self.flags & VIRTQ_DESC_F_INDIRECT != 0 + } + + /// Return true if the indirect descriptor is valid. + /// The len can be divided evenly by the size of descriptor and can not be zero. 
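From the structures and constants above, each split-virtqueue region has a fixed per-entry cost: 16 bytes per descriptor, 2 bytes per available-ring slot, 8 bytes per used-ring slot, plus 6 bytes of flags/idx/event fields in front of each ring. A standalone sketch of that arithmetic, which is also how the unit tests at the end of this file lay out `desc_table`, `avail_ring` and `used_ring`; `split_ring_regions` is a hypothetical helper:

```rust
// Per-entry sizes, taken from the constants above.
const DESCRIPTOR_LEN: u64 = 16; // size of one split-ring descriptor
const AVAILELEM_LEN: u64 = 2;   // one u16 available-ring slot
const USEDELEM_LEN: u64 = 8;    // one used-ring element (id + len)
const RING_HDR_LEN: u64 = 6;    // flags + idx + used_event/avail_event

/// Byte footprint of (descriptor table, available ring, used ring) for a
/// split virtqueue with `size` entries.
fn split_ring_regions(size: u64) -> (u64, u64, u64) {
    (
        DESCRIPTOR_LEN * size,
        RING_HDR_LEN + AVAILELEM_LEN * size,
        RING_HDR_LEN + USEDELEM_LEN * size,
    )
}

fn main() {
    // For the 256-entry queues used by the net device and the unit tests in this patch:
    let (desc, avail, used) = split_ring_regions(256);
    assert_eq!((desc, avail, used), (4096, 518, 2054));
}
```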
+ fn is_valid_indirect_desc(&self) -> bool { + u64::from(self.len) % DESCRIPTOR_LEN == 0 && self.len != 0 + } + + /// Get the num of descriptor in the table of indirect descriptor. + fn get_desc_num(&self) -> u16 { + (u64::from(self.len) / DESCRIPTOR_LEN) as u16 + } + + /// Get element from descriptor chain. + fn get_element( + sys_mem: &Arc, + desc_table: GuestAddress, + queue_size: u16, + index: u16, + mut desc: SplitVringDesc, + ) -> Result { + let mut elem = Element::new(index); + + loop { + if elem.desc_num >= queue_size { + break; + } + + let iovec = ElemIovec { + addr: desc.addr, + len: desc.len, + }; + + if desc.write_only() { + elem.in_iovec.push(iovec); + } else { + elem.out_iovec.push(iovec); + } + elem.desc_num += 1; + + if desc.has_next() { + desc = Self::next_desc(sys_mem, desc_table, queue_size, desc.next)?; + } else { + break; + } + } + + Ok(elem) + } + + /// Get element from indirect descriptor chain. + fn get_indirect_desc(&self, sys_mem: &Arc, index: u16) -> Result { + if !self.is_valid_indirect_desc() { + return Err(ErrorKind::QueueDescInvalid.into()); + } + + let desc_num = self.get_desc_num(); + let desc_table = self.addr; + let desc = Self::next_desc(sys_mem, desc_table, desc_num, 0)?; + Self::get_element(sys_mem, desc_table, desc_num, index, desc) + } + + /// Get element from normal descriptor chain. + fn get_nonindirect_desc( + &self, + sys_mem: &Arc, + desc_table: GuestAddress, + queue_size: u16, + index: u16, + ) -> Result { + Self::get_element(sys_mem, desc_table, queue_size, index, *self) + } +} + +impl ByteCode for SplitVringDesc {} + +/// Split vring. +#[derive(Default, Clone, Copy)] +pub struct SplitVring { + /// Guest physical address of the descriptor table. + /// The table is composed of descriptors(SplitVringDesc). + pub desc_table: GuestAddress, + + /// Guest physical address of the available ring. + /// The ring is composed of flags(u16), idx(u16), ring[size](u16) and used_event(u16). + pub avail_ring: GuestAddress, + + /// Guest physical address of the used ring. + /// The ring is composed of flags(u16), idx(u16), used_ring[size](UsedElem) and avail_event(u16). + pub used_ring: GuestAddress, + + /// Indicate whether the queue configuration is finished. + pub ready: bool, + + /// The maximal size in elements offered by the device. + pub max_size: u16, + + /// The queue size set by frontend. + pub size: u16, + + /// The next index which can be popped in the available vring. + next_avail: Wrapping, + + /// The next index which can be pushed in the used vring. + next_used: Wrapping, + + /// The index of last descriptor used which has triggered interrupt. + last_signal_used: Wrapping, +} + +impl SplitVring { + /// Create a split vring. + /// + /// # Arguments + /// + /// * `queue_config` - Configuration of the vring. + pub fn new(queue_config: QueueConfig) -> Self { + SplitVring { + desc_table: queue_config.desc_table, + avail_ring: queue_config.avail_ring, + used_ring: queue_config.used_ring, + ready: queue_config.ready, + max_size: queue_config.max_size, + size: queue_config.size, + next_avail: Wrapping(0), + next_used: Wrapping(0), + last_signal_used: Wrapping(0), + } + } + + /// The actual size of the queue. + fn actual_size(&self) -> u16 { + min(self.size, self.max_size) + } + + /// Get the index of the available ring from guest memory. 
+    fn get_avail_idx(&self, sys_mem: &Arc<AddressSpace>) -> Result<u16> {
+        let avail_flags_idx: SplitVringFlagsIdx =
+            sys_mem.read_object::<SplitVringFlagsIdx>(self.avail_ring)?;
+
+        Ok(avail_flags_idx.idx)
+    }
+
+    /// Get the flags of the available ring from guest memory.
+    fn get_avail_flags(&self, sys_mem: &Arc<AddressSpace>) -> Result<u16> {
+        let avail_flags_idx: SplitVringFlagsIdx =
+            sys_mem.read_object::<SplitVringFlagsIdx>(self.avail_ring)?;
+        Ok(avail_flags_idx.flags)
+    }
+
+    /// Get the index of the used ring from guest memory.
+    fn get_used_idx(&self, sys_mem: &Arc<AddressSpace>) -> Result<u16> {
+        let used_flag_idx: SplitVringFlagsIdx =
+            sys_mem.read_object::<SplitVringFlagsIdx>(self.used_ring)?;
+        Ok(used_flag_idx.idx)
+    }
+
+    /// Set the avail idx to the field of the event index for the available ring.
+    fn set_avail_event(&self, sys_mem: &Arc<AddressSpace>) -> Result<()> {
+        let avail_event_offset =
+            VRING_FLAGS_AND_IDX_LEN + USEDELEM_LEN * u64::from(self.actual_size());
+        let event_idx = self
+            .get_avail_idx(sys_mem)
+            .chain_err(|| "failed to get avail idx")?;
+
+        fence(Ordering::Release);
+        sys_mem.write_object(
+            &event_idx,
+            GuestAddress(self.used_ring.0 + avail_event_offset),
+        )?;
+
+        Ok(())
+    }
+
+    /// Get the event index of the used ring from guest memory.
+    fn get_used_event(&self, sys_mem: &Arc<AddressSpace>) -> Result<u16> {
+        let used_event_offset =
+            VRING_FLAGS_AND_IDX_LEN + AVAILELEM_LEN * u64::from(self.actual_size());
+
+        let used_event: u16 =
+            if let Some(used_event_addr) = self.avail_ring.checked_add(used_event_offset) {
+                sys_mem.read_object::<u16>(used_event_addr)?
+            } else {
+                bail!(
+                    "Address overflows: addr {}, size {}",
+                    self.avail_ring.raw_value(),
+                    used_event_offset
+                );
+            };
+
+        Ok(used_event)
+    }
+
+    /// The number of descriptor chains in the available ring.
+    fn avail_ring_len(&mut self, sys_mem: &Arc<AddressSpace>) -> Result<u16> {
+        let avail_idx = self.get_avail_idx(sys_mem).map(Wrapping)?;
+
+        Ok((avail_idx - self.next_avail).0)
+    }
+
+    /// Return true if VRING_AVAIL_F_NO_INTERRUPT is set.
+    fn is_avail_ring_no_interrupt(&self, sys_mem: &Arc<AddressSpace>) -> bool {
+        if let Ok(avail_flags) = self.get_avail_flags(sys_mem) {
+            (avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0
+        } else {
+            warn!("Getting avail flags failed");
+            false
+        }
+    }
+
+    /// Return true if it's required to trigger interrupt for the used vring.
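`used_ring_need_event` below implements the event-index rule from the Virtio spec: after publishing used entries, notify only if the driver's `used_event` index falls inside the window written since the last notification, computed with wrapping 16-bit arithmetic. A standalone version of just that comparison; the sample values mirror `test_should_notify` at the end of this file:

```rust
use std::num::Wrapping;

/// The event-idx check used by used_ring_need_event: notify only if the
/// driver's used_event index lies in (old, new] in wrapping u16 arithmetic.
fn need_event(event_idx: Wrapping<u16>, new: Wrapping<u16>, old: Wrapping<u16>) -> bool {
    (new - event_idx - Wrapping(1)) < (new - old)
}

fn main() {
    // Matches the unit test later in this patch: old = 5, new = 10.
    assert!(need_event(Wrapping(6), Wrapping(10), Wrapping(5)));  // driver asked at 6 -> notify
    assert!(!need_event(Wrapping(1), Wrapping(10), Wrapping(5))); // already signalled past 1 -> skip
    // Wrapping arithmetic keeps the comparison correct across u16 overflow.
    assert!(need_event(Wrapping(65534), Wrapping(2), Wrapping(65533)));
}
```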
+ fn used_ring_need_event(&mut self, sys_mem: &Arc) -> bool { + let old = self.last_signal_used; + let new = if let Ok(used_idx) = self.get_used_idx(sys_mem) { + Wrapping(used_idx) + } else { + return false; + }; + + let used_event_idx = if let Ok(used_event_idx) = self.get_used_event(sys_mem) { + Wrapping(used_event_idx) + } else { + return false; + }; + + self.last_signal_used = new; + (new - used_event_idx - Wrapping(1)) < (new - old) + } + + fn is_invalid_memory(&self, sys_mem: &Arc, actual_size: u64) -> bool { + let desc_table_end = + match checked_offset_mem(&sys_mem, self.desc_table, DESCRIPTOR_LEN * actual_size) { + Ok(addr) => addr, + Err(_) => { + error!( + "descriptor table is out of bounds: start:{} size:{}", + self.desc_table.0, + DESCRIPTOR_LEN * actual_size + ); + return true; + } + }; + + let desc_avail_end = match checked_offset_mem( + &sys_mem, + self.avail_ring, + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + AVAILELEM_LEN * actual_size, + ) { + Ok(addr) => addr, + Err(_) => { + error!( + "avail ring is out of bounds: start:{} size:{}", + self.avail_ring.0, + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + AVAILELEM_LEN * actual_size + ); + return true; + } + }; + + if checked_offset_mem( + &sys_mem, + self.used_ring, + VRING_USED_LEN_EXCEPT_USEDELEM + USEDELEM_LEN * actual_size, + ) + .is_err() + { + error!( + "used ring is out of bounds: start:{} size:{}", + self.used_ring.0, + VRING_USED_LEN_EXCEPT_USEDELEM + USEDELEM_LEN * actual_size + ); + return true; + } + + if self.desc_table >= self.avail_ring + || self.avail_ring >= self.used_ring + || desc_table_end > self.avail_ring + || desc_avail_end > self.used_ring + { + error!("The memory of descriptor table:{}, avail ring:{} or used ring:{} is overlapped. queue size:{}", + self.desc_table.0, self.avail_ring.0, self.used_ring.0, actual_size); + return true; + } + + if self.desc_table.0 & 0xf != 0 { + error!("descriptor table:{} is not aligned", self.desc_table.0); + true + } else if self.avail_ring.0 & 0x1 != 0 { + error!("avail ring:{} is not aligned", self.avail_ring.0); + true + } else if self.used_ring.0 & 0x3 != 0 { + error!("used ring:{} is not aligned", self.used_ring.0); + true + } else { + false + } + } +} + +impl VringOps for SplitVring { + fn is_valid(&self, sys_mem: &Arc) -> bool { + let size = u64::from(self.actual_size()); + if !self.ready { + error!("The configuration of vring is not ready\n"); + false + } else if self.size > self.max_size || self.size == 0 || (self.size & (self.size - 1)) != 0 + { + error!( + "vring with invalid size:{} max size:{}", + self.size, self.max_size + ); + false + } else { + !self.is_invalid_memory(sys_mem, size) + } + } + + fn pop_avail(&mut self, sys_mem: &Arc, features: u64) -> Result { + let avail_len = self.avail_ring_len(sys_mem)?; + if avail_len == 0 { + bail!("failed to pop avail: empty!"); + } + + let index_offset = VRING_FLAGS_AND_IDX_LEN + + AVAILELEM_LEN * u64::from(self.next_avail.0 % self.actual_size()); + let desc_index: u16 = + if let Some(desc_index_addr) = self.avail_ring.checked_add(index_offset) { + sys_mem.read_object::(desc_index_addr)? 
+ } else { + bail!( + "Address overflows: addr {}, size {}", + self.avail_ring.raw_value(), + index_offset + ); + }; + + if virtio_has_feature(features, VIRTIO_F_RING_EVENT_IDX) { + self.set_avail_event(sys_mem)?; + } + + let desc = SplitVringDesc::new(sys_mem, self.desc_table, self.actual_size(), desc_index)?; + let elem = if desc.is_indirect_desc() { + if desc.write_only() { + bail!("Unexpected descriptor for writing only"); + } + + desc.get_indirect_desc(sys_mem, desc_index) + .map(|elem| { + self.next_avail += Wrapping(1); + elem + }) + .chain_err(|| "Failed to get indirect desc")? + } else { + desc.get_nonindirect_desc(sys_mem, self.desc_table, self.actual_size(), desc_index) + .map(|elem| { + self.next_avail += Wrapping(1); + elem + })? + }; + Ok(elem) + } + + fn add_used(&mut self, sys_mem: &Arc, index: u16, len: u32) -> Result<()> { + if index >= self.size { + return Err(ErrorKind::QueueIndex(index, self.size).into()); + } + + let used_ring = self.used_ring; + let next_used = u64::from(self.next_used.0 % self.actual_size()); + let used_elem_addr = + GuestAddress(used_ring.0 + VRING_FLAGS_AND_IDX_LEN + next_used * USEDELEM_LEN); + let used_elem = UsedElem { + id: u32::from(index), + len, + }; + sys_mem.write_object::(&used_elem, used_elem_addr)?; + + self.next_used += Wrapping(1); + + fence(Ordering::Release); + + sys_mem.write_object( + &(self.next_used.0 as u16), + GuestAddress(used_ring.0 + VRING_IDX_POSITION), + )?; + + Ok(()) + } + + fn should_notify(&mut self, sys_mem: &Arc, features: u64) -> bool { + if virtio_has_feature(features, VIRTIO_F_RING_EVENT_IDX) { + self.used_ring_need_event(sys_mem) + } else { + !self.is_avail_ring_no_interrupt(sys_mem) + } + } + + fn actual_size(&self) -> u16 { + self.actual_size() + } + + fn get_queue_config(&self) -> QueueConfig { + QueueConfig { + desc_table: self.desc_table, + avail_ring: self.avail_ring, + used_ring: self.used_ring, + ready: self.ready, + max_size: self.max_size, + size: self.size, + } + } +} + +/// Virtio queue. +pub struct Queue { + /// Vring structure. + pub vring: Box, +} + +impl Queue { + /// Create a virtqueue. + /// + /// # Arguments + /// + /// * `queue_config` - Configuration of the vring. + /// * `queue_type` - Type of virtqueue. + pub fn new(queue_config: QueueConfig, queue_type: u16) -> Result { + let vring: Box = match queue_type { + QUEUE_TYPE_SPLIT_VRING => Box::new(SplitVring::new(queue_config)), + _ => { + bail!("Unsupported queue type {}", queue_type); + } + }; + + Ok(Queue { vring }) + } + + /// Return true if the memory layout of the virqueue is valid. + /// + /// # Arguments + /// + /// * `sys_mem` - Address space to which the vring belongs. 
+ pub fn is_valid(&self, sys_mem: &Arc) -> bool { + self.vring.is_valid(sys_mem) + } +} + +#[cfg(test)] +mod tests { + pub use super::*; + use address_space::{AddressSpace, GuestAddress, HostMemMapping, Region}; + + fn address_space_init() -> Arc { + let root = Region::init_container_region(1 << 36); + let sys_space = AddressSpace::new(root).unwrap(); + let host_mmap = + Arc::new(HostMemMapping::new(GuestAddress(0), SYSTEM_SPACE_SIZE, false).unwrap()); + sys_space + .root() + .add_subregion( + Region::init_ram_region(host_mmap.clone()), + host_mmap.start_address().raw_value(), + ) + .unwrap(); + sys_space + } + + trait VringOpsTest { + fn set_desc( + &self, + sys_mem: &Arc, + index: u16, + addr: GuestAddress, + len: u32, + flags: u16, + next: u16, + ) -> Result<()>; + + fn set_avail_ring_idx(&self, sys_mem: &Arc, idx: u16) -> Result<()>; + + fn set_avail_ring_flags(&self, sys_mem: &Arc, flags: u16) -> Result<()>; + + fn set_avail_ring_elem( + &self, + sys_mem: &Arc, + avail_pos: u16, + index: u16, + ) -> Result<()>; + + fn get_avail_event(&self, sys_mem: &Arc) -> Result; + + fn get_used_elem(&self, sys_mem: &Arc, index: u16) -> Result; + + fn get_used_ring_idx(&self, sys_mem: &Arc) -> Result; + + fn set_used_ring_idx(&self, sys_mem: &Arc, idx: u16) -> Result<()>; + + fn set_used_event_idx(&self, sys_mem: &Arc, idx: u16) -> Result<()>; + } + + impl VringOpsTest for SplitVring { + fn set_desc( + &self, + sys_mem: &Arc, + index: u16, + addr: GuestAddress, + len: u32, + flags: u16, + next: u16, + ) -> Result<()> { + if index >= self.actual_size() { + return Err(ErrorKind::QueueIndex(index, self.size).into()); + } + + let desc_addr_offset = DESCRIPTOR_LEN * index as u64; + let desc = SplitVringDesc { + addr, + len, + flags, + next, + }; + sys_mem.write_object::( + &desc, + GuestAddress(self.desc_table.0 + desc_addr_offset), + )?; + + Ok(()) + } + + fn set_avail_ring_idx(&self, sys_mem: &Arc, idx: u16) -> Result<()> { + let avail_idx_offset = 2 as u64; + sys_mem + .write_object::(&idx, GuestAddress(self.avail_ring.0 + avail_idx_offset))?; + Ok(()) + } + + fn set_avail_ring_flags(&self, sys_mem: &Arc, flags: u16) -> Result<()> { + let avail_idx_offset = 0 as u64; + sys_mem + .write_object::(&flags, GuestAddress(self.avail_ring.0 + avail_idx_offset))?; + Ok(()) + } + + fn set_avail_ring_elem( + &self, + sys_mem: &Arc, + avail_pos: u16, + desc_index: u16, + ) -> Result<()> { + let avail_idx_offset = VRING_FLAGS_AND_IDX_LEN + AVAILELEM_LEN * (avail_pos as u64); + sys_mem.write_object::( + &desc_index, + GuestAddress(self.avail_ring.0 + avail_idx_offset), + )?; + Ok(()) + } + + fn get_avail_event(&self, sys_mem: &Arc) -> Result { + let avail_event_idx_offset = + VRING_FLAGS_AND_IDX_LEN + USEDELEM_LEN * (self.actual_size() as u64); + let event_idx = sys_mem + .read_object::(GuestAddress(self.used_ring.0 + avail_event_idx_offset))?; + Ok(event_idx) + } + + fn get_used_elem(&self, sys_mem: &Arc, index: u16) -> Result { + let used_elem_offset = VRING_FLAGS_AND_IDX_LEN + USEDELEM_LEN * (index as u64); + let used_elem = sys_mem + .read_object::(GuestAddress(self.used_ring.0 + used_elem_offset))?; + Ok(used_elem) + } + + fn get_used_ring_idx(&self, sys_mem: &Arc) -> Result { + let used_idx_offset = VRING_IDX_POSITION; + let idx = + sys_mem.read_object::(GuestAddress(self.used_ring.0 + used_idx_offset))?; + Ok(idx) + } + + fn set_used_ring_idx(&self, sys_mem: &Arc, idx: u16) -> Result<()> { + let used_idx_offset = VRING_IDX_POSITION; + sys_mem.write_object::(&idx, GuestAddress(self.used_ring.0 + 
used_idx_offset))?; + Ok(()) + } + + fn set_used_event_idx(&self, sys_mem: &Arc, idx: u16) -> Result<()> { + let event_idx_offset = + VRING_FLAGS_AND_IDX_LEN + AVAILELEM_LEN * (self.actual_size() as u64); + sys_mem + .write_object::(&idx, GuestAddress(self.avail_ring.0 + event_idx_offset))?; + Ok(()) + } + } + + fn set_indirect_desc( + sys_mem: &Arc, + desc_addr: GuestAddress, + addr: GuestAddress, + len: u32, + flags: u16, + next: u16, + ) -> Result<()> { + let desc = SplitVringDesc { + addr, + len, + flags, + next, + }; + sys_mem.write_object::(&desc, desc_addr)?; + Ok(()) + } + + const SYSTEM_SPACE_SIZE: u64 = (1024 * 1024) as u64; + const QUEUE_SIZE: u16 = 256 as u16; + + fn align(size: u64, alignment: u64) -> u64 { + let align_adjust = if size % alignment != 0 { + alignment - (size % alignment) + } else { + 0 + }; + (size + align_adjust) as u64 + } + + #[test] + fn test_valid_queue_01() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + + // failed when the type of queue is invalid + let queue = Queue::new(queue_config, 0); + assert!(queue.is_err()); + let queue = Queue::new(queue_config, QUEUE_TYPE_PACKED_VRING); + assert!(queue.is_err()); + + // it is valid + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the status is not ready + queue_config.ready = false; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + queue_config.ready = true; + + // it is invalid when the size of virtual ring is more than the max size + queue_config.size = QUEUE_SIZE + 1; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + + // it is invalid when the size of virtual ring is zero + queue_config.size = 0; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + + // it is invalid when the size of virtual ring isn't power of 2 + queue_config.size = 15; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + } + + #[test] + fn test_valid_queue_02() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of descriptor table is out of bound + queue_config.desc_table = + GuestAddress(SYSTEM_SPACE_SIZE - (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + 1 as u64); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // 
recover the address for valid queue + queue_config.desc_table = GuestAddress(0); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of avail ring is out of bound + queue_config.avail_ring = GuestAddress( + SYSTEM_SPACE_SIZE + - (VRING_AVAIL_LEN_EXCEPT_AVAILELEM + AVAILELEM_LEN * (QUEUE_SIZE as u64)) + + 1 as u64, + ); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of used ring is out of bound + queue_config.used_ring = GuestAddress( + SYSTEM_SPACE_SIZE + - (VRING_USED_LEN_EXCEPT_USEDELEM + USEDELEM_LEN * (QUEUE_SIZE as u64)) + + 1 as u64, + ); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + } + + #[test] + fn test_valid_queue_03() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of descriptor table is equal to the address of avail ring + queue_config.avail_ring = GuestAddress(0); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of descriptor table is overlapped to the address of avail ring + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN - 1); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of avail ring is equal to the address of used ring + queue_config.used_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as 
u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of avail ring is overlapped to the address of used ring + queue_config.used_ring = GuestAddress( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64) + - 1, + ); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + } + + #[test] + fn test_valid_queue_04() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of descriptor table is not aligned to 16 + queue_config.desc_table = GuestAddress(15 as u64); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.desc_table = GuestAddress(0); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of avail ring is not aligned to 2 + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN + 1); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + + // it is invalid when the address of used ring is not aligned to 4 + queue_config.used_ring = GuestAddress( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64) + + 3, + ); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), false); + // recover the address for valid queue + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + let queue = Queue::new(queue_config, QUEUE_TYPE_SPLIT_VRING).unwrap(); + assert_eq!(queue.is_valid(&sys_space), true); + } + + #[test] + fn test_pop_avail_01() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + 
queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let mut vring = SplitVring::new(queue_config); + assert_eq!(vring.is_valid(&sys_space), true); + + // it is ok when the descriptor chain is normal + // set the information of index 0 for descriptor + vring + .set_desc(&sys_space, 0, GuestAddress(0x111), 16, VIRTQ_DESC_F_NEXT, 1) + .unwrap(); + + // set the information of index 1 for descriptor + vring + .set_desc( + &sys_space, + 1, + GuestAddress(0x222), + 32, + VIRTQ_DESC_F_WRITE | VIRTQ_DESC_F_NEXT, + 2, + ) + .unwrap(); + + // set the information of index 2 for descriptor + vring + .set_desc( + &sys_space, + 2, + GuestAddress(0x333), + 48, + VIRTQ_DESC_F_WRITE, + 0, + ) + .unwrap(); + + // set the index 0 of descriptor to the position 0 for the element of avail ring + vring.set_avail_ring_elem(&sys_space, 0, 0).unwrap(); + // set 1 to the idx of avail ring + vring.set_avail_ring_idx(&sys_space, 1).unwrap(); + + let features = 1 << VIRTIO_F_RING_EVENT_IDX as u64; + let elem = match vring.pop_avail(&sys_space, features) { + Ok(ret) => ret, + Err(_) => Element { + index: 1, + desc_num: 0, + out_iovec: Vec::new(), + in_iovec: Vec::new(), + }, + }; + assert_eq!(elem.index, 0); + assert_eq!(elem.desc_num, 3); + assert_eq!(elem.out_iovec.len(), 1); + let elem_iov = elem.out_iovec.get(0).unwrap(); + assert_eq!(elem_iov.addr, GuestAddress(0x111)); + assert_eq!(elem_iov.len, 16); + assert_eq!(elem.in_iovec.len(), 2); + let elem_iov = elem.in_iovec.get(0).unwrap(); + assert_eq!(elem_iov.addr, GuestAddress(0x222)); + assert_eq!(elem_iov.len, 32); + let elem_iov = elem.in_iovec.get(1).unwrap(); + assert_eq!(elem_iov.addr, GuestAddress(0x333)); + assert_eq!(elem_iov.len, 48); + + // the event idx of avail ring is equal to get_avail_event + let event_idx = vring.get_avail_event(&sys_space).unwrap(); + assert_eq!(event_idx, 1); + let avail_idx = vring.get_avail_idx(&sys_space).unwrap(); + assert_eq!(avail_idx, 1); + } + + #[test] + fn test_pop_avail_02() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let mut vring = SplitVring::new(queue_config); + assert_eq!(vring.is_valid(&sys_space), true); + + // it is ok when the descriptor chain is indirect + // set the information for indirect descriptor + vring + .set_desc( + &sys_space, + 0, + GuestAddress(SYSTEM_SPACE_SIZE / 2), + 48, + VIRTQ_DESC_F_INDIRECT, + 0, + ) + .unwrap(); + + // set the information of index 0 for indirect descriptor chain + set_indirect_desc( + &sys_space, + GuestAddress(SYSTEM_SPACE_SIZE / 2), + GuestAddress(0x444), + 100, + VIRTQ_DESC_F_NEXT, + 1, + ) + .unwrap(); + + // set the information of index 1 for indirect descriptor chain + set_indirect_desc( + &sys_space, + GuestAddress(SYSTEM_SPACE_SIZE / 2 + DESCRIPTOR_LEN), + GuestAddress(0x555), + 200, + VIRTQ_DESC_F_NEXT, + 2, + ) + .unwrap(); + + // set the information of index 2 for indirect descriptor chain + set_indirect_desc( + &sys_space, + GuestAddress(SYSTEM_SPACE_SIZE / 2 + 
DESCRIPTOR_LEN * 2), + GuestAddress(0x666), + 300, + VIRTQ_DESC_F_WRITE, + 2, + ) + .unwrap(); + + // set the index 0 of descriptor to the position 0 for the element of avail ring + vring.set_avail_ring_elem(&sys_space, 0, 0).unwrap(); + // set 1 to the idx of avail ring + vring.set_avail_ring_idx(&sys_space, 1).unwrap(); + + let features = 1 << VIRTIO_F_RING_EVENT_IDX as u64; + let elem = match vring.pop_avail(&sys_space, features) { + Ok(ret) => ret, + Err(_) => Element { + index: 1, + desc_num: 0, + out_iovec: Vec::new(), + in_iovec: Vec::new(), + }, + }; + assert_eq!(elem.index, 0); + assert_eq!(elem.desc_num, 3); + assert_eq!(elem.out_iovec.len(), 2); + let elem_iov = elem.out_iovec.get(0).unwrap(); + assert_eq!(elem_iov.addr, GuestAddress(0x444)); + assert_eq!(elem_iov.len, 100); + let elem_iov = elem.out_iovec.get(1).unwrap(); + assert_eq!(elem_iov.addr, GuestAddress(0x555)); + assert_eq!(elem_iov.len, 200); + assert_eq!(elem.in_iovec.len(), 1); + let elem_iov = elem.in_iovec.get(0).unwrap(); + assert_eq!(elem_iov.addr, GuestAddress(0x666)); + assert_eq!(elem_iov.len, 300); + } + + #[test] + fn test_pop_avail_03() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let mut vring = SplitVring::new(queue_config); + assert_eq!(vring.is_valid(&sys_space), true); + + // it is error when the idx of avail ring which is equal to next_avail + // set 0 to the idx of avail ring which is equal to next_avail + vring.set_avail_ring_idx(&sys_space, 0).unwrap(); + let features = 1 << VIRTIO_F_RING_EVENT_IDX as u64; + if let Ok(_) = vring.pop_avail(&sys_space, features) { + assert!(false); + } + + // it is error when the indirect descriptor is written + // set the index 0 of descriptor to the position 0 for the element of avail ring + vring.set_avail_ring_elem(&sys_space, 0, 0).unwrap(); + // set 1 to the idx of avail ring + vring.set_avail_ring_idx(&sys_space, 1).unwrap(); + // it is false when it sets the indirect descriptor to be written + vring + .set_desc( + &sys_space, + 0, + GuestAddress(0x11), + 16, + VIRTQ_DESC_F_INDIRECT | VIRTQ_DESC_F_WRITE, + 0, + ) + .unwrap(); + if let Err(err) = vring.pop_avail(&sys_space, features) { + assert_eq!(err.to_string(), "Unexpected descriptor for writing only"); + } else { + assert!(false); + } + + // error comes when the length of indirect descriptor can not be divided by 16 + vring + .set_desc( + &sys_space, + 0, + GuestAddress(0x11), + 17, + VIRTQ_DESC_F_INDIRECT, + 0, + ) + .unwrap(); + if let Ok(_) = vring.pop_avail(&sys_space, features) { + assert!(false); + } + + // error comes when the length of indirect descriptor is more than the length of descriptor chain + // set the information of index 0 for descriptor + vring + .set_desc( + &sys_space, + 0, + GuestAddress(SYSTEM_SPACE_SIZE / 2), + 32, + VIRTQ_DESC_F_INDIRECT, + 0, + ) + .unwrap(); + + // set the information of index 0 for descriptor + set_indirect_desc( + &sys_space, + GuestAddress(SYSTEM_SPACE_SIZE / 2), + GuestAddress(0x444), + 100, + VIRTQ_DESC_F_NEXT, + 1, + ) + .unwrap(); + + // set the information of index 1 for descriptor + set_indirect_desc( + &sys_space, + 
GuestAddress(SYSTEM_SPACE_SIZE / 2 + DESCRIPTOR_LEN), + GuestAddress(0x555), + 200, + VIRTQ_DESC_F_NEXT, + 2, + ) + .unwrap(); + + // set the information of index 2 for descriptor + set_indirect_desc( + &sys_space, + GuestAddress(SYSTEM_SPACE_SIZE / 2 + DESCRIPTOR_LEN * 2), + GuestAddress(0x666), + 300, + VIRTQ_DESC_F_WRITE, + 2, + ) + .unwrap(); + if let Err(err) = vring.pop_avail(&sys_space, features) { + assert_eq!(err.to_string(), "Failed to get indirect desc"); + } else { + assert!(false); + } + } + + #[test] + fn test_add_used() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let mut vring = SplitVring::new(queue_config); + assert_eq!(vring.is_valid(&sys_space), true); + + // it is false when the index is more than the size of queue + let err = vring.add_used(&sys_space, QUEUE_SIZE, 100).unwrap_err(); + if let ErrorKind::QueueIndex(offset, size) = err.kind() { + assert_eq!(*offset, 256); + assert_eq!(*size, 256); + } + + assert!(vring.add_used(&sys_space, 10, 100).is_ok()); + let elem = vring.get_used_elem(&sys_space, 0).unwrap(); + assert_eq!(elem.id, 10); + assert_eq!(elem.len, 100); + assert_eq!(vring.get_used_ring_idx(&sys_space).unwrap(), 1); + } + + #[test] + fn test_should_notify() { + let sys_space = address_space_init(); + + let mut queue_config = QueueConfig::new(QUEUE_SIZE); + queue_config.desc_table = GuestAddress(0); + queue_config.avail_ring = GuestAddress((QUEUE_SIZE as u64) * DESCRIPTOR_LEN); + queue_config.used_ring = GuestAddress(align( + (QUEUE_SIZE as u64) * DESCRIPTOR_LEN + + VRING_AVAIL_LEN_EXCEPT_AVAILELEM + + AVAILELEM_LEN * (QUEUE_SIZE as u64), + 4096, + )); + queue_config.ready = true; + queue_config.size = QUEUE_SIZE; + let mut vring = SplitVring::new(queue_config); + assert_eq!(vring.is_valid(&sys_space), true); + + // it's true when the feature of event idx and no interrupt for the avail ring is closed + let features = 0 as u64; + assert!(vring.set_avail_ring_flags(&sys_space, 0).is_ok()); + assert_eq!(vring.should_notify(&sys_space, features), true); + + // it's false when the feature of event idx is closed and the feature of no interrupt for the avail ring is open + let features = 0 as u64; + assert!(vring + .set_avail_ring_flags(&sys_space, VRING_AVAIL_F_NO_INTERRUPT) + .is_ok()); + assert_eq!(vring.should_notify(&sys_space, features), false); + + // it's true when the feature of event idx is open and (new - event_idx - Wrapping(1) < new -old) + let features = 1 << VIRTIO_F_RING_EVENT_IDX as u64; + vring.last_signal_used = Wrapping(5); //old + assert!(vring.set_used_ring_idx(&sys_space, 10).is_ok()); //new + assert!(vring.set_used_event_idx(&sys_space, 6).is_ok()); //event_idx + assert_eq!(vring.should_notify(&sys_space, features), true); + + // it's false when the feature of event idx is open and (new - event_idx - Wrapping(1) > new -old) + vring.last_signal_used = Wrapping(5); //old + assert!(vring.set_used_ring_idx(&sys_space, 10).is_ok()); //new + assert!(vring.set_used_event_idx(&sys_space, 1).is_ok()); //event_idx + assert_eq!(vring.should_notify(&sys_space, features), false); + + // it's false when the feature of event idx is open and 
(new - event_idx - Wrapping(1) = new -old) + vring.last_signal_used = Wrapping(5); //old + assert!(vring.set_used_ring_idx(&sys_space, 10).is_ok()); //new + assert!(vring.set_used_event_idx(&sys_space, 4).is_ok()); //event_idx + assert_eq!(vring.should_notify(&sys_space, features), false); + } +} diff --git a/device_model/src/virtio/vhost/kernel/mod.rs b/device_model/src/virtio/vhost/kernel/mod.rs new file mode 100644 index 00000000..6b97e2fe --- /dev/null +++ b/device_model/src/virtio/vhost/kernel/mod.rs @@ -0,0 +1,435 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +mod net; +mod vsock; + +pub use net::Net; +pub use vsock::Vsock; + +use std::fs::{File, OpenOptions}; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; + +use address_space::{ + AddressSpace, FlatRange, GuestAddress, Listener, ListenerReqType, RegionIoEventFd, RegionType, +}; +use util::byte_code::ByteCode; +use util::epoll_context::{read_fd, EventNotifier, EventNotifierHelper, NotifierOperation}; +use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::ioctl::{ioctl, ioctl_with_mut_ref, ioctl_with_ptr, ioctl_with_ref}; + +use super::super::errors::{ErrorKind, Result, ResultExt}; +use super::super::{QueueConfig, VIRTIO_MMIO_INT_VRING}; +use super::{VhostNotify, VhostOps}; + +/// Refer to VHOST_VIRTIO in +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/vhost.h. +const VHOST: u32 = 0xaf; +ioctl_ior_nr!(VHOST_GET_FEATURES, VHOST, 0x00, u64); +ioctl_iow_nr!(VHOST_SET_FEATURES, VHOST, 0x00, u64); +ioctl_io_nr!(VHOST_SET_OWNER, VHOST, 0x01); +ioctl_iow_nr!(VHOST_SET_MEM_TABLE, VHOST, 0x03, VhostMemory); +ioctl_iow_nr!(VHOST_SET_VRING_NUM, VHOST, 0x10, VhostVringState); +ioctl_iow_nr!(VHOST_SET_VRING_ADDR, VHOST, 0x11, VhostVringAddr); +ioctl_iow_nr!(VHOST_SET_VRING_BASE, VHOST, 0x12, VhostVringState); +ioctl_iow_nr!(VHOST_SET_VRING_KICK, VHOST, 0x20, VhostVringFile); +ioctl_iow_nr!(VHOST_SET_VRING_CALL, VHOST, 0x21, VhostVringFile); +ioctl_iow_nr!(VHOST_NET_SET_BACKEND, VHOST, 0x30, VhostVringFile); +ioctl_iow_nr!(VHOST_VSOCK_SET_GUEST_CID, VHOST, 0x60, u64); +ioctl_iow_nr!(VHOST_VSOCK_SET_RUNNING, VHOST, 0x61, i32); + +/// Refer to vhost_vring_file in +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/vhost.h. +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct VhostVringFile { + /// Vring index. + pub index: u32, + /// File fd. + pub fd: RawFd, +} + +/// Refer to vhost_vring_state in +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/vhost.h. +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct VhostVringState { + /// Vring index. + pub index: u32, + /// Vring size. + pub num: u32, +} + +/// Refer to vhost_vring_addr in +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/vhost.h. +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct VhostVringAddr { + /// Vring index. 
+ pub index: u32, + /// Option flags. + pub flags: u32, + /// Base address of descriptor table. + pub desc_user_addr: u64, + /// Base address of used vring. + pub used_user_addr: u64, + /// Base address of available vring. + pub avail_user_addr: u64, + /// Address where to write logs. + pub log_guest_addr: u64, +} + +/// Refer to vhost_memory_region in +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/vhost.h. +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct VhostMemoryRegion { + /// GPA. + pub guest_phys_addr: u64, + /// Size of the memory region. + pub memory_size: u64, + /// HVA. + pub userspace_addr: u64, + /// No flags specified for now. + pub flags_padding: u64, +} + +impl ByteCode for VhostMemoryRegion {} + +/// Refer to vhost_memory in +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/vhost.h. +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct VhostMemory { + pub nregions: u32, + pub padding: u32, +} + +impl ByteCode for VhostMemory {} + +#[derive(Clone)] +pub struct VhostMemInfo { + regions: Arc>>, +} + +impl VhostMemInfo { + pub fn new() -> VhostMemInfo { + VhostMemInfo { + regions: Arc::new(Mutex::new(Vec::new())), + } + } + + pub fn addr_to_host(&self, addr: GuestAddress) -> Option { + let addr = addr.raw_value(); + for region in self.regions.lock().unwrap().iter() { + if addr >= region.guest_phys_addr && addr < region.guest_phys_addr + region.memory_size + { + let offset = addr - region.guest_phys_addr; + return Some(region.userspace_addr + offset); + } + } + None + } + + fn check_vhost_mem_range(fr: &FlatRange) -> bool { + fr.owner.region_type() == RegionType::Ram + } + + fn add_mem_range(&self, fr: &FlatRange) { + let guest_phys_addr = fr.addr_range.base.raw_value(); + let memory_size = fr.addr_range.size; + let userspace_addr = fr.owner.get_host_address().unwrap() + fr.offset_in_region; + + self.regions.lock().unwrap().push(VhostMemoryRegion { + guest_phys_addr, + memory_size, + userspace_addr, + flags_padding: 0_u64, + }); + } + + fn delete_mem_range(&self, fr: &FlatRange) { + let mut mem_regions = self.regions.lock().unwrap(); + let target = VhostMemoryRegion { + guest_phys_addr: fr.addr_range.base.raw_value(), + memory_size: fr.addr_range.size, + userspace_addr: fr.owner.get_host_address().unwrap() + fr.offset_in_region, + flags_padding: 0_u64, + }; + for (index, mr) in mem_regions.iter().enumerate() { + if mr.guest_phys_addr == target.guest_phys_addr + && mr.memory_size == target.memory_size + && mr.userspace_addr == target.userspace_addr + && mr.flags_padding == target.flags_padding + { + mem_regions.remove(index); + return; + } + } + debug!("Vhost: deleting mem region failed: not matched"); + } +} + +impl Listener for VhostMemInfo { + fn priority(&self) -> i32 { + 0 + } + + fn handle_request( + &self, + range: Option<&FlatRange>, + _evtfd: Option<&RegionIoEventFd>, + req_type: ListenerReqType, + ) -> std::result::Result<(), address_space::errors::Error> { + match req_type { + ListenerReqType::AddRegion => { + if Self::check_vhost_mem_range(&range.unwrap()) { + self.add_mem_range(range.unwrap()); + } + } + ListenerReqType::DeleteRegion => { + let fr = range.unwrap(); + if fr.owner.region_type() == RegionType::Ram { + self.delete_mem_range(&fr); + } + } + _ => {} + } + Ok(()) + } +} + +pub struct VhostBackend { + fd: File, + mem_info: VhostMemInfo, +} + +impl VhostBackend { + pub fn new( + mem_space: &Arc, + path: &str, + rawfd: Option, + ) -> Result { + let fd = match rawfd { + Some(rawfd) => unsafe { 
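+                // Safety: `rawfd` is assumed to be an open vhost device fd handed in by
+                // the caller; ownership of it is transferred to the returned `File`.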
File::from_raw_fd(rawfd) }, + None => OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(path) + .chain_err(|| format!("Failed to open {}.", path))?, + }; + let mem_info = VhostMemInfo::new(); + mem_space.register_listener(Box::new(mem_info.clone()))?; + + Ok(VhostBackend { fd, mem_info }) + } +} + +impl AsRawFd for VhostBackend { + fn as_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } +} + +impl VhostOps for VhostBackend { + fn set_owner(&self) -> Result<()> { + let ret = unsafe { ioctl(self, VHOST_SET_OWNER()) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_OWNER".to_string()).into()); + } + Ok(()) + } + + fn get_features(&self) -> Result { + let mut avail_features: u64 = 0; + let ret = unsafe { ioctl_with_mut_ref(self, VHOST_GET_FEATURES(), &mut avail_features) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_GET_FEATURES".to_string()).into()); + } + Ok(avail_features) + } + + fn set_features(&self, features: u64) -> Result<()> { + let ret = unsafe { ioctl_with_ref(self, VHOST_SET_FEATURES(), &features) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_FEATURES".to_string()).into()); + } + Ok(()) + } + + fn set_mem_table(&self) -> Result<()> { + let regions = self.mem_info.regions.lock().unwrap().len(); + let vm_size = std::mem::size_of::(); + let vmr_size = std::mem::size_of::(); + let mut bytes: Vec = vec![0; vm_size + regions * vmr_size]; + + bytes[0..vm_size].copy_from_slice( + VhostMemory { + nregions: regions as u32, + padding: 0, + } + .as_bytes(), + ); + + for (index, region) in self.mem_info.regions.lock().unwrap().iter().enumerate() { + bytes[(vm_size + index * vmr_size)..(vm_size + (index + 1) * vmr_size)] + .copy_from_slice(region.as_bytes()); + } + + let ret = unsafe { ioctl_with_ptr(self, VHOST_SET_MEM_TABLE(), bytes.as_ptr()) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_MEM_TABLE".to_string()).into()); + } + Ok(()) + } + + fn set_vring_num(&self, queue_idx: usize, num: u16) -> Result<()> { + let vring_state = VhostVringState { + index: queue_idx as u32, + num: u32::from(num), + }; + let ret = unsafe { ioctl_with_ref(self, VHOST_SET_VRING_NUM(), &vring_state) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_VRING_NUM".to_string()).into()); + } + Ok(()) + } + + fn set_vring_addr(&self, queue_config: &QueueConfig, index: usize, flags: u32) -> Result<()> { + let desc_user_addr = self + .mem_info + .addr_to_host(queue_config.desc_table) + .ok_or_else(|| { + ErrorKind::Msg(format!( + "Failed to transform desc-table address {}", + queue_config.desc_table.0 + )) + })?; + + let used_user_addr = self + .mem_info + .addr_to_host(queue_config.used_ring) + .ok_or_else(|| { + ErrorKind::Msg(format!( + "Failed to transform used ring address {}", + queue_config.used_ring.0 + )) + })?; + + let avail_user_addr = self + .mem_info + .addr_to_host(queue_config.avail_ring) + .ok_or_else(|| { + ErrorKind::Msg(format!( + "Failed to transform avail ring address {}", + queue_config.avail_ring.0 + )) + })?; + + let vring_addr = VhostVringAddr { + index: index as u32, + flags, + desc_user_addr, + used_user_addr, + avail_user_addr, + log_guest_addr: 0_u64, + }; + + let ret = unsafe { ioctl_with_ref(self, VHOST_SET_VRING_ADDR(), &vring_addr) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_VRING_ADDR".to_string()).into()); + } + Ok(()) + } + + fn set_vring_base(&self, queue_idx: usize, num: u16) -> Result<()> { + let vring_state = VhostVringState { + 
index: queue_idx as u32, + num: u32::from(num), + }; + let ret = unsafe { ioctl_with_ref(self, VHOST_SET_VRING_BASE(), &vring_state) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_VRING_BASE".to_string()).into()); + } + Ok(()) + } + + fn set_vring_call(&self, queue_idx: usize, fd: &EventFd) -> Result<()> { + let vring_file = VhostVringFile { + index: queue_idx as u32, + fd: fd.as_raw_fd(), + }; + let ret = unsafe { ioctl_with_ref(self, VHOST_SET_VRING_CALL(), &vring_file) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_VRING_CALL".to_string()).into()); + } + Ok(()) + } + + fn set_vring_kick(&self, queue_idx: usize, fd: &EventFd) -> Result<()> { + let vring_file = VhostVringFile { + index: queue_idx as u32, + fd: fd.as_raw_fd(), + }; + let ret = unsafe { ioctl_with_ref(self, VHOST_SET_VRING_KICK(), &vring_file) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_SET_VRING_KICK".to_string()).into()); + } + Ok(()) + } +} + +pub struct VhostIoHandler { + interrupt_evt: EventFd, + interrupt_status: Arc, + host_notifies: Vec, +} + +impl EventNotifierHelper for VhostIoHandler { + fn internal_notifiers(vhost_handler: Arc>) -> Vec { + let mut notifiers = Vec::new(); + let vhost = vhost_handler.clone(); + + let handler: Box Option>> = + Box::new(move |_, fd: RawFd| { + read_fd(fd); + + let v = vhost.clone(); + let v = v.lock().unwrap(); + v.interrupt_status + .fetch_or(VIRTIO_MMIO_INT_VRING, Ordering::SeqCst); + if v.interrupt_evt.write(1).is_err() { + error!("Failed to write interrupt eventfd for vhost"); + } + + None as Option> + }); + let h = Arc::new(Mutex::new(handler)); + + for host_notify in vhost_handler.lock().unwrap().host_notifies.iter() { + notifiers.push(EventNotifier::new( + NotifierOperation::AddShared, + host_notify.notify_evt.as_raw_fd(), + None, + EventSet::IN, + vec![h.clone()], + )); + } + + notifiers + } +} diff --git a/device_model/src/virtio/vhost/kernel/net.rs b/device_model/src/virtio/vhost/kernel/net.rs new file mode 100644 index 00000000..5a51d547 --- /dev/null +++ b/device_model/src/virtio/vhost/kernel/net.rs @@ -0,0 +1,267 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
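+
+// A minimal sketch of the vhost ioctl sequence this module drives (the names
+// below are illustrative; the real per-queue loop lives in `activate()`),
+// assuming a single queue whose kick/call eventfds and tap device already exist:
+//
+//     backend.set_owner()?;                         // claim the vhost-net fd
+//     backend.set_features(vhost_features)?;        // features negotiated in `realize()`
+//     backend.set_mem_table()?;                     // publish the guest memory layout
+//     backend.set_vring_num(0, queue_size)?;
+//     backend.set_vring_addr(&queue_config, 0, 0)?;
+//     backend.set_vring_base(0, 0)?;
+//     backend.set_vring_kick(0, &kick_evt)?;
+//     backend.set_vring_call(0, &call_evt)?;
+//     backend.set_backend(0, &tap.file)?;           // attach the tap to the ring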
+ +use std::cmp; +use std::fs::File; +use std::io::Write; +use std::os::unix::io::AsRawFd; +use std::sync::atomic::AtomicU32; +use std::sync::{Arc, Mutex}; + +use address_space::AddressSpace; +use machine_manager::config::NetworkInterfaceConfig; +use util::byte_code::ByteCode; +use util::epoll_context::EventNotifierHelper; +use util::num_ops::{read_u32, write_u32}; +use util::tap::Tap; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::ioctl::ioctl_with_ref; + +use super::super::super::super::micro_vm::main_loop::MainLoop; +use super::super::super::errors::{ErrorKind, Result, ResultExt}; +use super::super::super::{ + net::{build_device_config_space, create_tap, VirtioNetConfig}, + Queue, VirtioDevice, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VIRTIO_NET_F_CSUM, + VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_UFO, + VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_TYPE_NET, +}; +use super::super::{VhostNotify, VhostOps}; +use super::{VhostBackend, VhostIoHandler, VhostVringFile, VHOST_NET_SET_BACKEND}; + +/// Number of virtqueues. +const QUEUE_NUM_NET: usize = 2; +/// Size of each virtqueue. +const QUEUE_SIZE_NET: u16 = 256; +/// Feature for vhost-net to add virtio_net_hdr for RX, and strip for TX packets. +const VHOST_NET_F_VIRTIO_NET_HDR: u32 = 27; + +trait VhostNetBackend { + /// Attach virtio net ring to a raw socket, or tap device. + /// The socket must be already bound to an ethernet device, this device will be + /// used for transmit. Pass fd -1 to unbind from the socket and the transmit + /// device. This can be used to stop the ring (e.g. for migration). + /// + /// # Arguments + /// * `queue_index` - Index of the queue to modify. + /// * `fd` - EventFd that will be signaled from guest. + fn set_backend(&self, queue_index: usize, tap_file: &File) -> Result<()>; +} + +impl VhostNetBackend for VhostBackend { + /// Attach virtio net ring to a raw socket, or tap device. + fn set_backend(&self, queue_index: usize, tap_file: &File) -> Result<()> { + let vring_file = VhostVringFile { + index: queue_index as u32, + fd: tap_file.as_raw_fd(), + }; + + let ret = unsafe { ioctl_with_ref(self, VHOST_NET_SET_BACKEND(), &vring_file) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_NET_SET_BACKEND".to_string()).into()); + } + Ok(()) + } +} + +/// Network device structure. +pub struct Net { + /// Configuration of the network device. + net_cfg: NetworkInterfaceConfig, + /// Tap device opened. + tap: Option, + /// Related vhost-net kernel device. + backend: Option, + /// Bit mask of features supported by the backend. + device_features: u64, + /// Bit mask of features negotiated by the backend and the frontend. + driver_features: u64, + /// Bit mask of features supported by the vhost-net kernel. + vhost_features: u64, + /// Virtio net configurations. + device_config: VirtioNetConfig, + /// System address space. + mem_space: Arc, +} + +impl Net { + pub fn new(net_cfg: NetworkInterfaceConfig, mem_space: Arc) -> Self { + Net { + net_cfg, + tap: None, + backend: None, + device_features: 0_u64, + driver_features: 0_u64, + vhost_features: 0_u64, + device_config: VirtioNetConfig::default(), + mem_space, + } + } +} + +impl VirtioDevice for Net { + /// Realize vhost virtio network device. 
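+    ///
+    /// In outline: open `/dev/vhost-net` (or adopt the configured `vhost_fd`),
+    /// become its owner, query the backend features and mask off
+    /// `VHOST_NET_F_VIRTIO_NET_HDR` and `VIRTIO_F_ACCESS_PLATFORM`, advertise the
+    /// virtio-net feature bits on top of them, then create or adopt the tap device.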
+ fn realize(&mut self) -> Result<()> { + let backend = VhostBackend::new(&self.mem_space, "/dev/vhost-net", self.net_cfg.vhost_fd)?; + backend.set_owner()?; + + let mut vhost_features = backend.get_features()?; + vhost_features &= !(1_u64 << VHOST_NET_F_VIRTIO_NET_HDR); + vhost_features &= !(1_u64 << VIRTIO_F_ACCESS_PLATFORM); + + let mut device_features = vhost_features; + device_features |= 1 << VIRTIO_F_VERSION_1 + | 1 << VIRTIO_NET_F_CSUM + | 1 << VIRTIO_NET_F_GUEST_CSUM + | 1 << VIRTIO_NET_F_GUEST_TSO4 + | 1 << VIRTIO_NET_F_GUEST_UFO + | 1 << VIRTIO_NET_F_HOST_TSO4 + | 1 << VIRTIO_NET_F_HOST_UFO; + + if let Some(mac) = &self.net_cfg.mac { + device_features |= build_device_config_space(&mut self.device_config, mac); + } + + let host_dev_name = match self.net_cfg.host_dev_name.as_str() { + "" => None, + _ => Some(self.net_cfg.host_dev_name.as_str()), + }; + + self.tap = + create_tap(self.net_cfg.tap_fd, host_dev_name).chain_err(|| "Failed to create tap")?; + self.backend = Some(backend); + self.device_features = device_features; + self.vhost_features = vhost_features; + + Ok(()) + } + + /// Get the virtio device type, refer to Virtio Spec. + fn device_type(&self) -> u32 { + VIRTIO_TYPE_NET + } + + /// Get the count of virtio device queues. + fn queue_num(&self) -> usize { + QUEUE_NUM_NET + } + + /// Get the queue size of virtio device. + fn queue_size(&self) -> u16 { + QUEUE_SIZE_NET + } + + /// Get device features from host. + fn get_device_features(&self, features_select: u32) -> u32 { + read_u32(self.device_features, features_select) + } + + /// Set driver features by guest. + fn set_driver_features(&mut self, page: u32, value: u32) { + let mut features = write_u32(value, page); + let unsupported_features = features & !self.device_features; + if unsupported_features != 0 { + warn!( + "Received acknowledge request with unsupported feature: {:x}", + features + ); + features &= !unsupported_features; + } + self.driver_features |= features; + } + + /// Read data of config from guest. + fn read_config(&self, offset: u64, mut data: &mut [u8]) -> Result<()> { + let config_slice = self.device_config.as_bytes(); + let config_size = config_slice.len() as u64; + if offset >= config_size { + return Err(ErrorKind::DevConfigOverflow(offset, config_size).into()); + } + if let Some(end) = offset.checked_add(data.len() as u64) { + data.write_all(&config_slice[offset as usize..cmp::min(end, config_size) as usize])?; + } + + Ok(()) + } + + /// Write data to config from guest. + fn write_config(&mut self, offset: u64, data: &[u8]) -> Result<()> { + let data_len = data.len(); + let config_slice = self.device_config.as_mut_bytes(); + let config_len = config_slice.len(); + if offset as usize + data_len > config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len as u64).into()); + } + + config_slice[(offset as usize)..(offset as usize + data_len)].copy_from_slice(&data[..]); + + Ok(()) + } + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. 
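+    ///
+    /// This pushes the negotiated features and the memory table to the vhost
+    /// backend, then for each queue programs the vring size, addresses, base index
+    /// and kick/call eventfds and attaches the tap device as the backend; finally
+    /// it registers a `VhostIoHandler` with the main loop so that vring interrupts
+    /// are forwarded to the guest.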
+ fn activate( + &mut self, + _mem_space: Arc, + interrupt_evt: EventFd, + interrupt_status: Arc, + queues: Vec>>, + queue_evts: Vec, + ) -> Result<()> { + let mut host_notifies = Vec::new(); + let backend = match &self.backend { + None => return Err("Failed to get backend".into()), + Some(backend_) => backend_, + }; + + backend.set_features(self.vhost_features)?; + backend.set_mem_table()?; + + for (queue_index, queue_mutex) in queues.iter().enumerate() { + let queue = queue_mutex.lock().unwrap(); + let actual_size = queue.vring.actual_size(); + let queue_config = queue.vring.get_queue_config(); + + backend.set_vring_num(queue_index, actual_size)?; + backend.set_vring_addr(&queue_config, queue_index, 0)?; + backend.set_vring_base(queue_index, 0)?; + backend.set_vring_kick(queue_index, &queue_evts[queue_index])?; + + drop(queue); + + let host_notify = VhostNotify { + notify_evt: EventFd::new(libc::EFD_NONBLOCK) + .chain_err(|| ErrorKind::EventFdCreate)?, + queue: queue_mutex.clone(), + }; + backend.set_vring_call(queue_index, &host_notify.notify_evt)?; + host_notifies.push(host_notify); + + let tap = match &self.tap { + None => bail!("Failed to get tap"), + Some(tap_) => tap_, + }; + backend.set_backend(queue_index, &tap.file)?; + } + + let handler = VhostIoHandler { + interrupt_evt: interrupt_evt.try_clone()?, + interrupt_status, + host_notifies, + }; + + MainLoop::update_event(EventNotifierHelper::internal_notifiers(Arc::new( + Mutex::new(handler), + )))?; + + Ok(()) + } +} diff --git a/device_model/src/virtio/vhost/kernel/vsock.rs b/device_model/src/virtio/vhost/kernel/vsock.rs new file mode 100644 index 00000000..bf23f04e --- /dev/null +++ b/device_model/src/virtio/vhost/kernel/vsock.rs @@ -0,0 +1,226 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::os::unix::io::RawFd; +use std::sync::atomic::AtomicU32; +use std::sync::{Arc, Mutex}; + +use address_space::AddressSpace; +use byteorder::{ByteOrder, LittleEndian}; +use machine_manager::config::VsockConfig; +use util::epoll_context::EventNotifierHelper; +use util::num_ops::{read_u32, write_u32}; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::ioctl::ioctl_with_ref; + +use super::super::super::super::micro_vm::main_loop::MainLoop; +use super::super::super::errors::{ErrorKind, Result, ResultExt}; +use super::super::super::{Queue, VirtioDevice, VIRTIO_TYPE_VSOCK}; +use super::super::{VhostNotify, VhostOps}; +use super::{VhostBackend, VhostIoHandler, VHOST_VSOCK_SET_GUEST_CID, VHOST_VSOCK_SET_RUNNING}; + +/// Number of virtqueues. +const QUEUE_NUM_VSOCK: usize = 3; +/// Size of each virtqueue. +const QUEUE_SIZE_VSOCK: u16 = 256; +/// Backend vhost-vsock device path. +const VHOST_PATH: &str = "/dev/vhost-vsock"; + +trait VhostVsockBackend { + /// Each guest should have an unique CID which is used to route data to the guest. 
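+    ///
+    /// # Arguments
+    /// * `cid` - Context ID assigned to this guest, passed to the kernel through
+    ///   the `VHOST_VSOCK_SET_GUEST_CID` ioctl.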
+ fn set_guest_cid(&self, cid: u64) -> Result<()>; + + fn set_running(&self, start: bool) -> Result<()>; +} + +impl VhostVsockBackend for VhostBackend { + fn set_guest_cid(&self, cid: u64) -> Result<()> { + let ret = unsafe { ioctl_with_ref(&self.fd, VHOST_VSOCK_SET_GUEST_CID(), &cid) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_VSOCK_SET_GUEST_CID".to_string()).into()); + } + Ok(()) + } + + fn set_running(&self, start: bool) -> Result<()> { + let on: u32 = if start { 1 } else { 0 }; + let ret = unsafe { ioctl_with_ref(&self.fd, VHOST_VSOCK_SET_RUNNING(), &on) }; + if ret < 0 { + return Err(ErrorKind::VhostIoctl("VHOST_VSOCK_SET_RUNNING".to_string()).into()); + } + Ok(()) + } +} + +/// Vsock device structure. +pub struct Vsock { + /// Configuration of the vsock device. + vsock_cfg: VsockConfig, + /// Related vhost-vsock kernel device. + backend: Option, + /// Bit mask of features supported by the backend. + device_features: u64, + /// Bit mask of features negotiated by the backend and the frontend. + driver_features: u64, + /// Configuration of virtio vsock. + config_space: Vec, + /// System address space. + mem_space: Arc, +} + +impl Vsock { + pub fn new(vsock_cfg: VsockConfig, mem_space: Arc) -> Self { + Vsock { + vsock_cfg, + backend: None, + device_features: 0_u64, + driver_features: 0_u64, + config_space: Vec::new(), + mem_space, + } + } +} + +impl VirtioDevice for Vsock { + /// Realize vhost virtio vsock device. + fn realize(&mut self) -> Result<()> { + let vhost_fd: Option = self.vsock_cfg.vhost_fd; + let backend = VhostBackend::new(&self.mem_space, VHOST_PATH, vhost_fd)?; + + self.device_features = backend.get_features()?; + self.backend = Some(backend); + + Ok(()) + } + + /// Get the virtio device type, refer to Virtio Spec. + fn device_type(&self) -> u32 { + VIRTIO_TYPE_VSOCK + } + + /// Get the count of virtio device queues. + fn queue_num(&self) -> usize { + QUEUE_NUM_VSOCK + } + + /// Get the queue size of virtio device. + fn queue_size(&self) -> u16 { + QUEUE_SIZE_VSOCK + } + + /// Get device features from host. + fn get_device_features(&self, features_select: u32) -> u32 { + read_u32(self.device_features, features_select) + } + + /// Set driver features by guest. + fn set_driver_features(&mut self, page: u32, value: u32) { + let mut features = write_u32(value, page); + let unsupported_features = features & !self.device_features; + if unsupported_features != 0 { + warn!("Unsupported feature ack (Vsock): {:x}", features); + features &= !unsupported_features; + } + self.driver_features |= features; + } + + /// Read data of config from guest. + fn read_config(&self, offset: u64, data: &mut [u8]) -> Result<()> { + match offset { + 0 if data.len() == 8 => LittleEndian::write_u64(data, self.vsock_cfg.guest_cid), + 0 if data.len() == 4 => { + LittleEndian::write_u32(data, (self.vsock_cfg.guest_cid & 0xffff_ffff) as u32) + } + 4 if data.len() == 4 => LittleEndian::write_u32( + data, + ((self.vsock_cfg.guest_cid >> 32) & 0xffff_ffff) as u32, + ), + _ => bail!("Failed to read config: offset {} exceeds", offset), + } + Ok(()) + } + + /// Write data to config from guest. 
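+    ///
+    /// Note that `config_space` is left empty for vsock, so any non-empty write
+    /// from the guest overflows the config space and returns `DevConfigOverflow`.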
+ fn write_config(&mut self, offset: u64, data: &[u8]) -> Result<()> { + let data_len = data.len(); + let config_len = self.config_space.len(); + if offset as usize + data_len > config_len { + return Err(ErrorKind::DevConfigOverflow(offset, config_len as u64).into()); + } + + self.config_space[(offset as usize)..(offset as usize + data_len)] + .copy_from_slice(&data[..]); + + Ok(()) + } + + /// Activate the virtio device, this function is called by vcpu thread when frontend + /// virtio driver is ready and write `DRIVER_OK` to backend. + fn activate( + &mut self, + _: Arc, + interrupt_evt: EventFd, + interrupt_status: Arc, + queues: Vec>>, + queue_evts: Vec, + ) -> Result<()> { + let cid = self.vsock_cfg.guest_cid; + let mut host_notifies = Vec::new(); + // The third queue is an event-only queue that is not handled by the vhost + // subsystem (but still needs to exist). Split it off here. + let vhost_queues = queues[..2].to_vec(); + + // Preliminary setup for vhost net. + let backend = match &self.backend { + None => return Err("Failed to get backend for vsock".into()), + Some(backend_) => backend_, + }; + backend.set_owner()?; + backend.set_features(self.driver_features)?; + backend.set_mem_table()?; + + for (queue_index, queue_mutex) in vhost_queues.iter().enumerate() { + let queue = queue_mutex.lock().unwrap(); + let actual_size = queue.vring.actual_size(); + let queue_config = queue.vring.get_queue_config(); + + backend.set_vring_num(queue_index, actual_size)?; + backend.set_vring_addr(&queue_config, queue_index, 0)?; + backend.set_vring_base(queue_index, 0)?; + backend.set_vring_kick(queue_index, &queue_evts[queue_index])?; + drop(queue); + + let host_notify = VhostNotify { + notify_evt: EventFd::new(libc::EFD_NONBLOCK) + .chain_err(|| ErrorKind::EventFdCreate)?, + queue: queue_mutex.clone(), + }; + backend.set_vring_call(queue_index, &host_notify.notify_evt)?; + host_notifies.push(host_notify); + } + + backend.set_guest_cid(cid)?; + backend.set_running(true)?; + + let handler = VhostIoHandler { + interrupt_evt: interrupt_evt.try_clone()?, + interrupt_status, + host_notifies, + }; + + MainLoop::update_event(EventNotifierHelper::internal_notifiers(Arc::new( + Mutex::new(handler), + )))?; + + Ok(()) + } +} diff --git a/device_model/src/virtio/vhost/mod.rs b/device_model/src/virtio/vhost/mod.rs new file mode 100644 index 00000000..b6669f9e --- /dev/null +++ b/device_model/src/virtio/vhost/mod.rs @@ -0,0 +1,82 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +pub mod kernel; + +use std::sync::{Arc, Mutex}; + +use vmm_sys_util::eventfd::EventFd; + +use super::errors::Result; +use super::{Queue, QueueConfig}; + +/// Vhost vring call notify structure. +pub struct VhostNotify { + /// Used to register in vhost kernel, when virtio queue have io request will notify to vhost. + pub notify_evt: EventFd, + /// The related virtio queue. + pub queue: Arc>, +} + +pub trait VhostOps { + /// Set the current process as the (exclusive) owner of file descriptor + /// of the vhost backend. 
This must be run before any other vhost commands. + fn set_owner(&self) -> Result<()>; + + /// Get a bitmask of supported vhost specific features. + fn get_features(&self) -> Result; + + /// Set the vhost backend supported vhost specific features. This should be + /// a subset of supported features from VHOST_GET_FEATURES. + /// + /// # Arguments + /// * `features` - Bitmask of features to set. + fn set_features(&self, features: u64) -> Result<()>; + + /// Set memory layout. + fn set_mem_table(&self) -> Result<()>; + + /// Set number of descriptors in ring. This parameter can not be modified + /// while ring is running (bound to a device). + /// + /// # Arguments + /// * `queue_idx` - Index of the queue to set. + /// * `num` - Number of descriptors in the virtqueue. + fn set_vring_num(&self, queue_idx: usize, num: u16) -> Result<()>; + + /// Set addresses for the vring. + /// + /// # Arguments + /// * `config` - queue configuration. + fn set_vring_addr(&self, queue: &QueueConfig, index: usize, flags: u32) -> Result<()>; + + /// Set base value where queue looks for available descriptors. + /// + /// # Arguments + /// * `queue_idx` - Index of the queue to set. + /// * `last_avail_idx` - Index of the available descriptor. + fn set_vring_base(&self, queue_idx: usize, last_avail_idx: u16) -> Result<()>; + + /// Set eventfd to signal when buffers have been used. + /// + /// # Arguments + /// * `queue_idx` - Index of the queue to modify. + /// * `fd` - EventFd to trigger. + fn set_vring_call(&self, queue_idx: usize, fd: &EventFd) -> Result<()>; + + /// Set eventfd to poll for added buffers. + /// + /// # Arguments + /// * `queue_idx` - Index of the queue to modify. + /// * `fd` - EventFd that will be signaled from guest. + fn set_vring_kick(&self, queue_idx: usize, fd: &EventFd) -> Result<()>; +} diff --git a/license/LICENSE b/license/LICENSE new file mode 100644 index 00000000..9e32cdef --- /dev/null +++ b/license/LICENSE @@ -0,0 +1,127 @@ + 木兰宽松许可证, 第2版 + + 木兰宽松许可证, 第2版 + 2020年1月 http://license.coscl.org.cn/MulanPSL2 + + + 您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: + + 0. 定义 + + “软件”是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 + + “贡献”是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 + + “贡献者”是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 + + “法人实体”是指提交贡献的机构及其“关联实体”。 + + “关联实体”是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 + + 1. 授予版权许可 + + 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 + + 2. 授予专利许可 + + 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 + + 3. 无商标许可 + + “本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 + + 4. 分发限制 + + 您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 + + 5. 免责声明与责任限制 + + “软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 + + 6. 语言 + “本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 + + 条款结束 + + 如何将木兰宽松许可证,第2版,应用到您的软件 + + 如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: + + 1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; + + 2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; + + 3, 请将如下声明文本放入每个源文件的头部注释中。 + + Copyright (c) [Year] [name of copyright holder] + [Software Name] is licensed under Mulan PSL v2. 
+ You can use this software according to the terms and conditions of the Mulan PSL v2. + You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 + THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + See the Mulan PSL v2 for more details. + + + Mulan Permissive Software License,Version 2 + + Mulan Permissive Software License,Version 2 (Mulan PSL v2) + January 2020 http://license.coscl.org.cn/MulanPSL2 + + Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: + + 0. Definition + + Software means the program and related documents which are licensed under this License and comprise all Contribution(s). + + Contribution means the copyrightable work licensed by a particular Contributor under this License. + + Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. + + Legal Entity means the entity making a Contribution and all its Affiliates. + + Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. + + 1. Grant of Copyright License + + Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. + + 2. Grant of Patent License + + Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. + + 3. No Trademark License + + No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in Section 4. + + 4. Distribution Restriction + + You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. + + 5. 
Disclaimer of Warranty and Limitation of Liability + + THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + + 6. Language + + THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. + + END OF THE TERMS AND CONDITIONS + + How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software + + To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: + + i Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; + + ii Create a file named “LICENSE” which contains the whole context of this License in the first directory of your software package; + + iii Attach the statement to the appropriate annotated syntax at the beginning of each source file. + + + Copyright (c) [Year] [name of copyright holder] + [Software Name] is licensed under Mulan PSL v2. + You can use this software according to the terms and conditions of the Mulan PSL v2. + You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 + THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + See the Mulan PSL v2 for more details. diff --git a/license/Third_Party_Open_Source_Software_Notice.md b/license/Third_Party_Open_Source_Software_Notice.md new file mode 100644 index 00000000..a9674ea2 --- /dev/null +++ b/license/Third_Party_Open_Source_Software_Notice.md @@ -0,0 +1,358 @@ +THIRD PARTY OPEN SOURCE SOFTWARE NOTICE +Please note we provide an open source software notice for the third party open source software along with this software and/or this software component contributed by Huawei (in the following just “this SOFTWARE”). The open source software licenses are granted by the respective right holders. + +Warranty Disclaimer +THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES FOR MORE DETAILS. 
+ +Copyright Notice and License Texts +Software: libc 0.2.71 +Copyright notice: +Copyright (c) 2014-2020 The Rust Project Developers +License: MIT or Apache License Version 2.0 + +Copyright (C) + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +---------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Software: log 0.4.8 +Copyright notice: +Copyright (c) 2014 The Rust Project Developers +Copyright 2014-2015 The Rust Project Developers +Copyright 2015 The Rust Project Developers +License: MIT or Apache License Version 2.0 +Please see above. + +Software: byteorder 1.3.4 +Copyright notice: +Copyright (c) 2015 Andrew Gallant +License: MIT or Unlicense +Please see above. + +---------------------------------------------------------------- + +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. 
We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + +Software: serde 1.0.114 +Copyright notice: +Copyright (c) David Tolnay +Copyright (c) Erick Tryzelaar +License: MIT or Apache License Version 2.0 +Please see above. + +Software: serde_json 1.0.55 +Copyright notice: +Copyright (c) David Tolnay +Copyright (c) Erick Tryzelaar +License: MIT or Apache License Version 2.0 +Please see above. + +Software: error-chain 0.12.4 +Copyright notice: +Copyright (c) 2017 The Error-Chain Project Developers +License: MIT or Apache License Version 2.0 +Please see above. + +Software: vmm-sys-util 0.6.1 +Copyright notice: +Copyright 2019 Intel Corporation. All Rights Reserved. +Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +Copyright 2017 The Chromium OS Authors. All rights reserved. +Copyright (C) 2019 Alibaba Cloud Computing. All rights reserved. +Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +Copyright 2018 The Chromium OS Authors. All rights reserved. +Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +License: Apache License Version 2.0 or BSD 3-Clause +Please see above. + +---------------------------------------------------------------- + +Copyright (c) . All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + 1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + 3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Software: kvm-ioctls 0.5.0 +Copyright notice: +Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +Portions Copyright 2017 The Chromium OS Authors. 
All rights reserved. +Copyright 2017 The Chromium OS Authors. All rights reserved. +Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +License: MIT or Apache License Version 2.0 +Please see above. + +Software: kvm-bindings 0.2.0 +Copyright notice: +Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +License: The APACHE 2.0 License +Please see above. diff --git a/machine_manager/Cargo.toml b/machine_manager/Cargo.toml new file mode 100644 index 00000000..79fe9de8 --- /dev/null +++ b/machine_manager/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "machine_manager" +version = "0.1.0" +authors = ["Huawei StratoVirt Team"] +edition = "2018" +license = "Mulan PSL v2" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +util = { path = "../util" } +serde = { version = "1.0.114", features = ["derive"] } +serde_json = "1.0.55" +log = "0.4.8" +libc = "0.2.71" +error-chain = "0.12.4" +vmm-sys-util = "0.6.1" + +[features] +default = ["qmp"] +qmp = [] + diff --git a/machine_manager/src/config/boot_source.rs b/machine_manager/src/config/boot_source.rs new file mode 100644 index 00000000..514efd50 --- /dev/null +++ b/machine_manager/src/config/boot_source.rs @@ -0,0 +1,269 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate serde; +extern crate serde_json; + +use std::fmt; +use std::path::PathBuf; +use std::sync::Mutex; + +use serde::{Deserialize, Serialize}; + +use super::errors::{ErrorKind, Result}; +use crate::config::{ConfigCheck, Param, ParamOperation, VmConfig}; + +const MAX_STRING_LENGTH: usize = 255; +const MAX_PATH_LENGTH: usize = 4096; + +/// Config struct for boot-source. +/// Contains `kernel_file`, `kernel_cmdline` and `initrd`. +#[derive(Default, Clone, Debug, Serialize, Deserialize)] +pub struct BootSource { + /// Path of the kernel image. + pub kernel_file: PathBuf, + /// Kernel boot arguments. + pub kernel_cmdline: KernelParams, + /// Config of initrd. + pub initrd: Option, +} + +impl BootSource { + /// Create `BootSource` from `Value` structure. + /// + /// # Arguments + /// + /// * `Value` - structure can be gotten by `json_file`. + pub fn from_value(value: &serde_json::Value) -> Self { + let mut boot_source = BootSource::default(); + if value.get("kernel_image_path") != None { + boot_source.kernel_file = + PathBuf::from(&(value["kernel_image_path"].to_string().replace("\"", ""))); + } + if value.get("boot_args") != None { + boot_source.kernel_cmdline = + KernelParams::from_str((value["boot_args"]).to_string().replace("\"", "")) + } + if value.get("initrd_fs_path") != None { + boot_source.initrd = Some(InitrdConfig::new( + &(value["initrd_fs_path"].to_string().replace("\"", "")), + )) + } + boot_source + } + + /// Move all the elements of `other` into `Self.kernel_cmdline`. 
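+    ///
+    /// # Example
+    ///
+    /// A minimal usage sketch; `boot_source` stands for any mutable `BootSource`
+    /// and the appended parameter is an example only:
+    ///
+    /// ```text
+    /// let mut extra = vec![Param::from_str("maxcpus=8")];
+    /// boot_source.append_kernel_cmdline(&mut extra);
+    /// // boot_source.kernel_cmdline now ends with "maxcpus=8".
+    /// ```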
+ pub fn append_kernel_cmdline(&mut self, other: &mut Vec) { + self.kernel_cmdline.append(other); + } +} + +impl ConfigCheck for BootSource { + fn check(&self) -> Result<()> { + if self.kernel_file.to_str().unwrap().len() > MAX_PATH_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + "kernel_file path".to_string(), + MAX_PATH_LENGTH, + ) + .into()); + } + + if !self.kernel_file.is_file() { + return Err(ErrorKind::UnRegularFile("Input kernel_file".to_string()).into()); + } + + self.kernel_cmdline.check()?; + if self.initrd.is_some() { + self.initrd.as_ref().unwrap().check()?; + } + + Ok(()) + } +} + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct InitrdConfig { + /// Path of the initrd image + pub initrd_file: PathBuf, + /// Size of initrd image + pub initrd_size: u64, + pub initrd_addr: Mutex, +} + +impl InitrdConfig { + pub fn new(initrd: &str) -> Self { + let initrd_size = match std::fs::metadata(initrd) { + Ok(meta) => meta.len() as u64, + _ => panic!("initrd file init failed {:?}!", initrd), + }; + InitrdConfig { + initrd_file: PathBuf::from(initrd), + initrd_size, + initrd_addr: Mutex::new(0), + } + } +} + +impl ConfigCheck for InitrdConfig { + fn check(&self) -> Result<()> { + if self.initrd_file.to_str().unwrap().len() > MAX_STRING_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + "initrd_file".to_string(), + MAX_STRING_LENGTH, + ) + .into()); + } + + if !self.initrd_file.is_file() { + return Err(ErrorKind::UnRegularFile("Input initrd_file".to_string()).into()); + } + + Ok(()) + } +} + +impl Clone for InitrdConfig { + fn clone(&self) -> Self { + InitrdConfig { + initrd_file: self.initrd_file.to_path_buf(), + initrd_size: self.initrd_size, + initrd_addr: Mutex::new(0), + } + } +} + +/// Struct `KernelParams` used to parse kernel cmdline to config. +/// Contains a `Vec` and its `len()`. +#[derive(Default, Clone, Debug, Serialize, Deserialize)] +pub struct KernelParams { + pub params: Vec, + pub length: usize, +} + +impl ParamOperation for KernelParams { + /// Allocates an empty `KernelParams` + fn new() -> Self { + let params: Vec = Vec::new(); + let length: usize = 0; + KernelParams { params, length } + } + + /// Created `Kernel` from `String`. + fn from_str(kernel_cmdline: String) -> Self { + let split = kernel_cmdline.split(' '); + let vec = split.collect::>(); + let mut params: Vec = Vec::new(); + let mut length: usize = 0; + for item in vec { + params.push(Param::from_str(item)); + length += 1; + } + KernelParams { params, length } + } +} + +impl ConfigCheck for KernelParams { + fn check(&self) -> Result<()> { + for param in self.params.clone() { + if param.value.len() > MAX_STRING_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + "kernel params".to_string(), + MAX_STRING_LENGTH, + ) + .into()); + } + } + + Ok(()) + } +} + +impl KernelParams { + /// Push new `Param` to `KernelParams`. + pub fn push(&mut self, item: Param) { + self.params.push(item); + self.length = self + .length + .checked_add(1) + .unwrap_or_else(|| panic!("Kernel params length is too long: {}", self.length)); + } + + /// Move all the `Param` into `KernelParams`. + pub fn append(&mut self, items: &mut Vec) { + self.length = self + .length + .checked_add(items.len()) + .unwrap_or_else(|| panic!("Kernel params length is too long: {}", self.length)); + self.params.append(items); + } + + /// Check `KernelParam` whether contains `item` or not. 
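+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch; matching is done on the `param_type` part only:
+    ///
+    /// ```text
+    /// let params = KernelParams::from_str("reboot=k panic=1".to_string());
+    /// assert!(params.contains("panic"));
+    /// assert!(!params.contains("panic=1"));
+    /// ```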
+ pub fn contains(&self, item: &str) -> bool { + for i in 0..self.length { + if self.params[i].param_type == item { + return true; + } + } + false + } +} + +impl fmt::Display for KernelParams { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut vec: Vec = Vec::with_capacity(self.length); + for i in 0..self.length { + vec.push(self.params[i].to_string()); + } + write!(f, "{}", vec.join(" ")) + } +} + +impl VmConfig { + /// Update `-kernel kernel_file` config to `VmConfig` + pub fn update_kernel(&mut self, kernel_image: String) { + self.boot_source.kernel_file = PathBuf::from(kernel_image); + } + + /// Update `-append kernel_cmdline` config to `VmConfig` + pub fn update_kernel_cmdline(&mut self, cmdline: &[String]) { + let cmdline: String = cmdline.join(" "); + self.boot_source.kernel_cmdline = KernelParams::from_str(cmdline); + } + + /// Update `-initrd initrd_path` config to `VmConfig` + pub fn update_initrd(&mut self, initrd: String) { + self.boot_source.initrd = Some(InitrdConfig::new(&initrd)); + } +} + +#[cfg(test)] +mod tests { + use super::super::{Param, ParamOperation}; + use super::KernelParams; + + #[test] + fn test_kernel_params() { + let test_kernel = "reboot=k panic=1 pci=off nomodules 8250.nr_uarts=0"; + let mut test_kernel_param = KernelParams::from_str(test_kernel.to_string()); + + assert_eq!(test_kernel_param.length, 5); + + test_kernel_param.push(Param::from_str("maxcpus=8")); + assert_eq!(test_kernel_param.length, 6); + assert_eq!(test_kernel_param.contains("maxcpus"), true); + assert_eq!(test_kernel_param.contains("cpus"), false); + assert_eq!( + test_kernel_param.to_string(), + "reboot=k panic=1 pci=off nomodules 8250.nr_uarts=0 maxcpus=8" + ); + } +} diff --git a/machine_manager/src/config/chardev.rs b/machine_manager/src/config/chardev.rs new file mode 100644 index 00000000..5cf94836 --- /dev/null +++ b/machine_manager/src/config/chardev.rs @@ -0,0 +1,179 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate serde; +extern crate serde_json; + +use serde::{Deserialize, Serialize}; + +use super::errors::{ErrorKind, Result}; +use crate::config::{CmdParams, ConfigCheck, ParamOperation, VmConfig}; + +const MAX_STRING_LENGTH: usize = 255; +const MAX_PATH_LENGTH: usize = 4096; +const MAX_GUEST_CID: u64 = 4_294_967_295; +const MIN_GUEST_CID: u64 = 3; + +/// Config structure for virtio-console. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ConsoleConfig { + pub console_id: String, + pub socket_path: String, +} + +impl ConsoleConfig { + /// Create `ConsoleConfig` from `Value` structure. + /// + /// # Arguments + /// + /// * `Value` - structure can be gotten by `json_file`. 
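+    ///
+    /// # Example
+    ///
+    /// An illustrative `console` entry in the json configuration file;
+    /// the id and socket path are examples only:
+    ///
+    /// ```text
+    /// "console": [
+    ///     {
+    ///         "console_id": "charconsole0",
+    ///         "socket_path": "/tmp/console.sock"
+    ///     }
+    /// ]
+    /// ```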
+ pub fn from_value(value: &serde_json::Value) -> Option> { + serde_json::from_value(value.clone()).ok() + } +} + +impl ConfigCheck for ConsoleConfig { + fn check(&self) -> Result<()> { + if self.console_id.len() > MAX_STRING_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + "console id".to_string(), + MAX_STRING_LENGTH, + ) + .into()); + } + + if self.socket_path.len() > MAX_PATH_LENGTH { + return Err( + ErrorKind::StringLengthTooLong("socket path".to_string(), MAX_PATH_LENGTH).into(), + ); + } + + Ok(()) + } +} + +impl VmConfig { + /// Add new virtio-console device to `VmConfig`. + fn add_console(&mut self, console: ConsoleConfig) { + if let Some(mut consoles) = self.consoles.clone() { + consoles.push(console); + self.consoles = Some(consoles); + } else { + let mut consoles: Vec = Vec::new(); + consoles.push(console); + self.consoles = Some(consoles); + } + } + + /// Update '-console ...' network config to `VmConfig`. + pub fn update_console(&mut self, console_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(console_config); + let mut console = ConsoleConfig::default(); + if let Some(console_id) = cmd_params.get("id") { + console.console_id = console_id.value; + } + if let Some(console_path) = cmd_params.get("path") { + console.socket_path = console_path.value; + } + self.add_console(console); + } + + /// Get virtio-console's config from `device` and `chardev` config. + pub fn get_virtio_console(&self) -> Vec { + let mut console_cfg: Vec = Vec::new(); + if let Some(console_devs) = self.consoles.as_ref() { + for console_dev in console_devs { + console_cfg.push(console_dev.clone()) + } + } + console_cfg + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct SerialConfig { + pub stdio: bool, +} + +impl SerialConfig { + /// Create `SerialConfig` from `Value` structure. + /// + /// # Arguments + /// + /// * `Value` - structure can be gotten by `json_file`. + pub fn from_value(value: &serde_json::Value) -> Option { + serde_json::from_value(value.clone()).ok() + } +} + +impl VmConfig { + pub fn update_serial(&mut self, serial_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(serial_config); + + if let Some(serial_type) = cmd_params.get("") { + if serial_type.to_string() == "stdio" { + self.serial = Some(SerialConfig { stdio: true }); + } else { + self.serial = Some(SerialConfig { stdio: false }); + } + } + } +} + +/// Config structure for virtio-vsock. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct VsockConfig { + pub vsock_id: String, + pub guest_cid: u64, + pub vhost_fd: Option, +} + +impl VsockConfig { + /// Create `VsockConfig` from `Value` structure. + /// `Value` structure can be gotten by `json_file`. 
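+    ///
+    /// # Example
+    ///
+    /// An illustrative `vsock` entry in the json configuration file;
+    /// the id and guest-cid are examples only:
+    ///
+    /// ```text
+    /// "vsock": {
+    ///     "vsock_id": "vsock0",
+    ///     "guest_cid": 3
+    /// }
+    /// ```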
+ pub fn from_value(value: &serde_json::Value) -> Option { + serde_json::from_value(value.clone()).ok() + } +} + +impl ConfigCheck for VsockConfig { + fn check(&self) -> Result<()> { + if self.vsock_id.len() > MAX_STRING_LENGTH { + return Err( + ErrorKind::StringLengthTooLong("vsock id".to_string(), MAX_STRING_LENGTH).into(), + ); + } + + if self.guest_cid < MIN_GUEST_CID || self.guest_cid >= MAX_GUEST_CID { + return Err(ErrorKind::GuestCidError.into()); + } + + Ok(()) + } +} + +impl VmConfig { + pub fn update_vsock(&mut self, vsock_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(vsock_config); + + if let Some(device_type) = cmd_params.get("") { + if device_type.value.contains("vsock") { + let vhost_fd = cmd_params.get_value_i32("vhostfd"); + self.vsock = Some(VsockConfig { + vsock_id: cmd_params.get_value_str("id").unwrap(), + guest_cid: cmd_params.get_value_u64("guest-cid").unwrap(), + vhost_fd, + }); + } + } + } +} diff --git a/machine_manager/src/config/fs.rs b/machine_manager/src/config/fs.rs new file mode 100644 index 00000000..c901a18d --- /dev/null +++ b/machine_manager/src/config/fs.rs @@ -0,0 +1,123 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate serde; +extern crate serde_json; + +use serde::{Deserialize, Serialize}; + +use super::errors::{ErrorKind, Result}; +use crate::config::{CmdParams, ConfigCheck, ParamOperation, VmConfig}; + +const MAX_STRING_LENGTH: usize = 255; +const MAX_PATH_LENGTH: usize = 4096; +const MAX_SERIAL_NUM: usize = 20; + +/// Config struct for `drive`. +/// Contains block device's attr. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct DriveConfig { + pub drive_id: String, + pub path_on_host: String, + pub read_only: bool, + pub direct: bool, + pub serial_num: Option, +} + +impl DriveConfig { + /// Create `DriveConfig` from `Value` structure. + /// + /// # Arguments + /// + /// * `Value` - structure can be gotten by `json_file`. + pub fn from_value(value: &serde_json::Value) -> Option> { + serde_json::from_value(value.clone()).ok() + } +} + +impl Default for DriveConfig { + fn default() -> Self { + DriveConfig { + drive_id: "".to_string(), + path_on_host: "".to_string(), + read_only: false, + direct: true, + serial_num: None, + } + } +} + +impl ConfigCheck for DriveConfig { + fn check(&self) -> Result<()> { + if self.drive_id.len() > MAX_STRING_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + "drive device id".to_string(), + MAX_STRING_LENGTH, + ) + .into()); + } + + if self.path_on_host.len() > MAX_PATH_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + "drive device path".to_string(), + MAX_PATH_LENGTH, + ) + .into()); + } + + if self.serial_num.is_some() && self.serial_num.as_ref().unwrap().len() > MAX_SERIAL_NUM { + return Err(ErrorKind::StringLengthTooLong( + "drive serial number".to_string(), + MAX_SERIAL_NUM, + ) + .into()); + } + + Ok(()) + } +} + +impl VmConfig { + /// Add new block device to `VmConfig`. 
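+    ///
+    /// # Example
+    ///
+    /// An in-crate sketch; the drive id and host path are examples only:
+    ///
+    /// ```text
+    /// let mut vm_config = VmConfig::default();
+    /// vm_config.add_drive(DriveConfig {
+    ///     drive_id: "rootfs".to_string(),
+    ///     path_on_host: "/path/to/rootfs".to_string(),
+    ///     ..DriveConfig::default()
+    /// });
+    /// ```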
+ fn add_drive(&mut self, drive: DriveConfig) { + if let Some(mut drives) = self.drives.clone() { + drives.push(drive); + self.drives = Some(drives); + } else { + let mut drives: Vec = Vec::new(); + drives.push(drive); + self.drives = Some(drives); + } + } + + /// Update '-drive ...' drive config to `VmConfig`. + pub fn update_drive(&mut self, drive_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(drive_config); + let mut drive = DriveConfig::default(); + if let Some(drive_path) = cmd_params.get("file") { + drive.path_on_host = drive_path.value; + } + if let Some(drive_id) = cmd_params.get("id") { + drive.drive_id = drive_id.value; + } + if let Some(read_only) = cmd_params.get("readonly") { + drive.read_only = read_only.to_bool(); + } + if let Some(direct) = cmd_params.get("direct") { + drive.direct = direct.to_bool(); + } + drive.serial_num = cmd_params.get_value_str("serial"); + + self.add_drive(drive); + } +} diff --git a/machine_manager/src/config/machine_config.rs b/machine_manager/src/config/machine_config.rs new file mode 100644 index 00000000..6f7ebf17 --- /dev/null +++ b/machine_manager/src/config/machine_config.rs @@ -0,0 +1,143 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate serde; +extern crate serde_json; + +use serde::{Deserialize, Serialize}; + +use super::errors::{ErrorKind, Result}; +use crate::config::{CmdParams, ConfigCheck, ParamOperation, VmConfig}; + +const DEFAULT_CPUS: u8 = 1; +const DEFAULT_MEMSIZE: u64 = 128; +const MAX_NR_CPUS: u8 = 254; +const MIN_NR_CPUS: u8 = 1; +const MAX_MEMSIZE: u64 = 549_755_813_888; +const MIN_MEMSIZE: u64 = 134_217_728; +const MAX_STRING_LENGTH: usize = 255; +const M: u64 = 1024 * 1024; +const G: u64 = 1024 * 1024 * 1024; + +/// Config struct for machine-config. +/// Contains some basic Vm config about cpu, memory, name. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MachineConfig { + pub name: String, + pub nr_cpus: u8, + pub mem_size: u64, + pub omit_vm_memory: bool, +} + +impl Default for MachineConfig { + /// Set default config for `machine-config`. + fn default() -> Self { + MachineConfig { + name: "StratoVirt".to_string(), + nr_cpus: DEFAULT_CPUS, + mem_size: DEFAULT_MEMSIZE * M, + omit_vm_memory: false, + } + } +} + +impl MachineConfig { + /// Create `MachineConfig` from `Value` structure. + /// + /// # Arguments + /// + /// * `Value` - structure can be gotten by `json_file`. 
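+    ///
+    /// # Example
+    ///
+    /// An illustrative `machine-config` entry in the json configuration file
+    /// (one vcpu, 256MiB of memory):
+    ///
+    /// ```text
+    /// "machine-config": {
+    ///     "name": "StratoVirt",
+    ///     "vcpu_count": 1,
+    ///     "mem_size": 268435456
+    /// }
+    /// ```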
+ pub fn from_value(value: &serde_json::Value) -> Self { + let mut machine_config = MachineConfig::default(); + if value.get("name") != None { + machine_config.name = value["name"].to_string(); + } + if value.get("vcpu_count") != None { + machine_config.nr_cpus = value["vcpu_count"].to_string().parse::().unwrap(); + } + if value.get("mem_size") != None { + machine_config.mem_size = value["mem_size"].to_string().parse::().unwrap(); + } + if value.get("omit_vm_memory") != None { + machine_config.omit_vm_memory = + value["omit_vm_memory"].to_string().parse::().unwrap(); + } + machine_config + } +} + +impl ConfigCheck for MachineConfig { + fn check(&self) -> Result<()> { + if self.name.len() > MAX_STRING_LENGTH { + return Err( + ErrorKind::StringLengthTooLong("name".to_string(), MAX_STRING_LENGTH).into(), + ); + } + + if self.nr_cpus < MIN_NR_CPUS || self.nr_cpus > MAX_NR_CPUS { + return Err(ErrorKind::NrcpusError.into()); + } + + if self.mem_size < MIN_MEMSIZE || self.mem_size > MAX_MEMSIZE { + return Err(ErrorKind::MemsizeError.into()); + } + + Ok(()) + } +} + +impl VmConfig { + /// Update '-m' memory config to `VmConfig`. + pub fn update_memory(&mut self, mem_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(mem_config); + if let Some(mut mem_size) = cmd_params.get("") { + if mem_size.value_replace_blank("M") || mem_size.value_replace_blank("m") { + self.machine_config.mem_size = get_inner(mem_size.value_to_u64().checked_mul(M)); + } else if mem_size.value_replace_blank("G") || mem_size.value_replace_blank("g") { + self.machine_config.mem_size = get_inner(mem_size.value_to_u64().checked_mul(G)); + } else { + self.machine_config.mem_size = mem_size.value_to_u64(); + } + } else if let Some(mut mem_size) = cmd_params.get("size") { + if mem_size.value_replace_blank("M") || mem_size.value_replace_blank("m") { + self.machine_config.mem_size = get_inner(mem_size.value_to_u64().checked_mul(M)); + } else if mem_size.value_replace_blank("G") || mem_size.value_replace_blank("g") { + self.machine_config.mem_size = get_inner(mem_size.value_to_u64().checked_mul(G)); + } else { + self.machine_config.mem_size = mem_size.value_to_u64(); + } + } + } + + /// Update '-smp' cpu config to `VmConfig`. + pub fn update_cpu(&mut self, cpu_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(cpu_config); + if let Some(cpu_num) = cmd_params.get("") { + self.machine_config.nr_cpus = cpu_num.value_to_u8(); + } else if let Some(cpu_num) = cmd_params.get("cpus") { + self.machine_config.nr_cpus = cpu_num.value_to_u8(); + } + } + + /// Update '-omit_vm_memory' config to 'VmConfig'. + pub fn update_omit_vm_memory(&mut self) { + self.machine_config.omit_vm_memory = true; + } +} + +fn get_inner(outer: Option) -> T { + if let Some(x) = outer { + x + } else { + panic!("Integer overflow occurred!"); + } +} diff --git a/machine_manager/src/config/mod.rs b/machine_manager/src/config/mod.rs new file mode 100644 index 00000000..aff5bfdc --- /dev/null +++ b/machine_manager/src/config/mod.rs @@ -0,0 +1,458 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. 
+// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate serde; +extern crate serde_json; + +mod boot_source; +mod chardev; +mod fs; +mod machine_config; +mod network; + +use std::any::Any; +use std::fmt; + +use serde::{Deserialize, Serialize}; + +#[cfg(target_arch = "aarch64")] +use util::device_tree; + +pub use self::errors::Result; +pub use boot_source::*; +pub use chardev::*; +pub use fs::*; +pub use machine_config::*; +pub use network::*; + +pub mod errors { + error_chain! { + errors { + StringLengthTooLong(t: String, len: usize) { + description("Limit the length of String.") + display("Input {} string's length must be no more than {}.", t, len) + } + NrcpusError { + description("Limit the number of vcpu in StratoVirt.") + display("Number of vcpu should be more than 0 and less than 255.") + } + MemsizeError { + description("Limit the size of memory in StratoVirt.") + display("Size of memory should be less than 512G and more than 128M.") + } + GuestCidError { + description("Check legality of vsock guest-cid.") + display("Vsock guest-cid should be more than 3 and less than 4294967296.") + } + MacFormatError { + description("Check legality of vsock mac address.") + display("Mac address is illegal.") + } + UnknownVhostType { + description("Unknown vhost type.") + display("Unknown vhost type.") + } + UnRegularFile(t: String) { + description("Check legality of file.") + display("{} is not a regular File.", t) + } + } + } +} + +/// `MAX_VCPUS`: the most cpu number Vm support. +pub static MAX_VCPUS: u8 = 128_u8; + +/// Macro: From serde_json: Value $y to get member $z, use $s's from_value +/// function to convert. +/// +/// # Example +/// +/// ```text +/// config_parse!(machine_config, value, "machine-config", MachineConfig); +/// ``` +macro_rules! config_parse { + ( $x:expr, $y:expr, $z:expr, $s:tt ) => { + if let Some(tmp_value) = $y.get($z) { + $x = $s::from_value(tmp_value); + } + }; +} + +/// This main config structure for Vm, contains Vm's basic configuration and devices. +#[derive(Serialize, Deserialize, Clone, Default, Debug)] +pub struct VmConfig { + pub machine_config: MachineConfig, + pub boot_source: BootSource, + pub drives: Option>, + pub nets: Option>, + pub consoles: Option>, + pub vsock: Option, + pub serial: Option, +} + +impl VmConfig { + /// Create the `VmConfig` from `Value`. + /// + /// # Arguments + /// + /// * `Value` - structure can be gotten by `json_file`. 
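+    ///
+    /// # Example
+    ///
+    /// An illustrative json configuration file; only the sections that are
+    /// present are parsed, and all paths below are examples only:
+    ///
+    /// ```text
+    /// {
+    ///     "boot-source": {
+    ///         "kernel_image_path": "/path/to/kernel",
+    ///         "boot_args": "console=ttyS0 reboot=k panic=1 root=/dev/vda"
+    ///     },
+    ///     "machine-config": {
+    ///         "vcpu_count": 1,
+    ///         "mem_size": 268435456
+    ///     },
+    ///     "drive": [
+    ///         {
+    ///             "drive_id": "rootfs",
+    ///             "path_on_host": "/path/to/rootfs",
+    ///             "read_only": false,
+    ///             "direct": true
+    ///         }
+    ///     ],
+    ///     "serial": {
+    ///         "stdio": true
+    ///     }
+    /// }
+    /// ```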
+ pub fn create_from_value(value: serde_json::Value) -> Result { + let mut machine_config = MachineConfig::default(); + let mut boot_source = BootSource::default(); + let mut drives = None; + let mut nets = None; + let mut consoles = None; + let mut vsock = None; + let mut serial = None; + + // Use macro to use from_value function for every member + config_parse!(machine_config, value, "machine-config", MachineConfig); + config_parse!(boot_source, value, "boot-source", BootSource); + config_parse!(drives, value, "drive", DriveConfig); + config_parse!(nets, value, "net", NetworkInterfaceConfig); + config_parse!(consoles, value, "console", ConsoleConfig); + config_parse!(vsock, value, "vsock", VsockConfig); + config_parse!(serial, value, "serial", SerialConfig); + + Ok(VmConfig { + machine_config, + boot_source, + drives, + nets, + consoles, + vsock, + serial, + }) + } + + /// Healthy check for `VmConfig` + pub fn check_vmconfig(&self, is_daemonize: bool) -> Result<()> { + self.boot_source.check()?; + self.machine_config.check()?; + + if self.drives.is_some() { + for drive in self.drives.as_ref().unwrap() { + drive.check()?; + } + } + + if self.nets.is_some() { + for net in self.nets.as_ref().unwrap() { + net.check()?; + } + } + + if self.consoles.is_some() { + for console in self.consoles.as_ref().unwrap() { + console.check()?; + } + } + + if self.vsock.is_some() { + self.vsock.as_ref().unwrap().check()?; + } + + if self.boot_source.initrd.is_none() && self.drives.is_none() { + bail!("Before Vm start, set a initrd or drive_file as rootfs"); + } + + if self.serial.is_some() && self.serial.as_ref().unwrap().stdio && is_daemonize { + bail!("Serial with stdio and daemonize can't be set together"); + } + + Ok(()) + } + + /// Update argument `name` to `VmConfig`. + /// + /// # Arguments + /// + /// * `name` - The name `String` updated to `VmConfig`. + pub fn update_name(&mut self, name: String) { + self.machine_config.name = name; + } +} + +#[cfg(target_arch = "aarch64")] +impl device_tree::CompileFDT for VmConfig { + fn generate_fdt_node(&self, _fdt: &mut Vec) -> util::errors::Result<()> { + Ok(()) + } +} + +/// This trait is to cast trait object to struct. +pub trait AsAny { + fn as_any(&self) -> &dyn Any; +} + +impl AsAny for T { + fn as_any(&self) -> &dyn Any { + self + } +} + +/// This trait is to check the legality of Config structure. +pub trait ConfigCheck: AsAny + Send + Sync { + /// To check the legality of Config structure. + /// + /// # Errors + /// + /// * `StringLengthTooLong` - Limit the length of String. + /// * `NrcpusError` - Limit the number of vcpu in StratoVirt. + /// * `MemsizeError` - Limit the size of memory in StratoVirt. + /// * `GuestCidError` - Vsock guest-cid is illegel. + /// * `MacFormatError` - Mac address is illegel. + /// * `UnRegularFile` - File is illegel. + fn check(&self) -> Result<()>; +} + +/// The basic structure to parse arguments to config. +/// +/// # Notes +/// +/// The attr format such as `param_type=value` can be treated as a `Param` +/// Single attr such as `quiet` can also be treated as Param +#[derive(Default, Clone, Debug, Serialize, Deserialize)] +pub struct Param { + /// The item on the left of `=`, if no `=`, param_type is "" + pub param_type: String, + /// The item on the right of `=`, if no `=`, the whole is value + pub value: String, +} + +impl Param { + /// Converts from `&str`. + /// + /// # Arguments + /// + /// * `item` - The `str` transformed to `Param`. 
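+    ///
+    /// # Example
+    ///
+    /// A short illustration of both accepted forms:
+    ///
+    /// ```text
+    /// let p = Param::from_str("readonly=off");
+    /// // p.param_type == "readonly", p.value == "off"
+    ///
+    /// let p = Param::from_str("stdio");
+    /// // p.param_type == "", p.value == "stdio"
+    /// ```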
+ fn from_str(item: &str) -> Self { + let split = item.split('='); + let vec = split.collect::>(); + if vec.len() == 1 { + Param { + param_type: String::new(), + value: String::from(vec[0]), + } + } else { + Param { + param_type: String::from(vec[0]), + value: String::from(vec[1]), + } + } + } + + /// Converts `value` in `Param` to `u64`. + pub fn value_to_u64(&self) -> u64 { + self.value + .parse::() + .unwrap_or_else(|_| panic!("Unrecognized value to u64: {}", &self.value)) + } + + /// Converts `value` in `Param` to `u32`. + pub fn value_to_u32(&self) -> u32 { + self.value + .parse::() + .unwrap_or_else(|_| panic!("Unrecognized value to u32: {}", &self.value)) + } + + /// Converts `value` in `Param` to `u8`. + pub fn value_to_u8(&self) -> u8 { + self.value + .parse::() + .unwrap_or_else(|_| panic!("Unrecognized value to u8: {}", &self.value)) + } + + /// Replace `value`'s `str` in `Param` by blank. + /// + /// # Arguments + /// + /// * `s` - The `str` in `Param` will be replaced. + pub fn value_replace_blank(&mut self, s: &str) -> bool { + if self.value.contains(s) { + self.value = self.value.replace(s, ""); + true + } else { + false + } + } + + /// Converts `yes`,`on`,`true`,`no`,`off`,`false` in `value` to `bool`. + pub fn to_bool(&self) -> bool { + match self.value.as_ref() { + "yes" | "on" | "true" => true, + "no" | "off" | "false" => false, + _ => panic!("Can only give `yes`,`on`,`true`,`no`,`off`,`false` for boolean."), + } + } +} + +impl fmt::Display for Param { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut str1 = String::from(&self.param_type); + let param_str = if str1.is_empty() { + String::from(&self.value) + } else { + str1 += "="; + str1 + &self.value + }; + write!(f, "{}", param_str) + } +} + +/// `Operation` for `Param`. +/// +/// The trait `ParamOperation` define two function: `new()` and `from_str()`. +pub trait ParamOperation { + fn new() -> Self; + fn from_str(s: String) -> Self; +} + +/// Struct `CmdParams` used to parse arguments to config. +/// Contains a `Vec` and its `len()`. +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct CmdParams { + /// A `Vec` to restore `Param`s, each item in `Vec` is a basic Param, + /// such as `isrootfs=on`. + pub params: Vec, + /// The length of the whole cmdline, a basic param is simple one. + pub length: usize, +} + +impl ParamOperation for CmdParams { + /// Allocates an empty `CmdParams`. + fn new() -> Self { + let params: Vec = Vec::new(); + let length: usize = 0; + CmdParams { params, length } + } + + /// Created `CmdParams` from `String`. + /// + /// # Arguments + /// + /// * `cmdline_args`: The args `String` to be transformed. + fn from_str(cmdline_args: String) -> Self { + let split = cmdline_args.split(','); + let vec = split.collect::>(); + let mut params: Vec = Vec::new(); + let mut length: usize = 0; + + for item in vec { + params.push(Param::from_str(item)); + length += 1; + } + CmdParams { params, length } + } +} + +impl CmdParams { + /// Input the `Param`'s `param_type`, get its `value`. + /// + /// # Arguments + /// + /// * `item` - The item name `str` to get `Param`. + pub fn get(&self, item: &str) -> Option { + for i in 0..self.length { + if self.params[i].param_type == item { + return Some(self.params[i].clone()); + } + } + None + } + + /// Input the `Param`'s `param_type`, get its value. + /// + /// # Arguments + /// + /// * `item` - The item name `str` to get `Param`'s value `String`. 
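+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch; the cmdline string is an example only:
+    ///
+    /// ```text
+    /// let params = CmdParams::from_str("file=/path/to/rootfs,id=rootfs".to_string());
+    /// assert_eq!(params.get_value_str("id"), Some("rootfs".to_string()));
+    /// assert_eq!(params.get_value_str("serial"), None);
+    /// ```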
+ pub fn get_value_str(&self, item: &str) -> Option { + if let Some(param) = self.get(item) { + Some(param.value) + } else { + None + } + } + + /// Input the `Param`'s `param_type`, get its value to u32. + /// + /// # Arguments + /// + /// * `item` - The item name `str` to get `Param`'s value `i32`. + pub fn get_value_i32(&self, item: &str) -> Option { + if let Some(param) = self.get(item) { + Some(param.value_to_u32() as i32) + } else { + None + } + } + + /// Input the `Param`'s `param_type`, get its value to u32. + /// + /// # Arguments + /// + /// * `item` - The item name `str` to get `Param`'s value `u32`. + pub fn get_value_u64(&self, item: &str) -> Option { + if let Some(param) = self.get(item) { + Some(param.value_to_u64()) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_param() { + let test_param_str = "isrootfs=on"; + let mut test_param: Param = Param::from_str(&test_param_str); + + assert_eq!(test_param.to_string(), "isrootfs=on".to_string()); + assert_eq!(test_param.to_bool(), true); + + test_param.value = "off".to_string(); + assert_eq!(test_param.to_bool(), false); + + let test_param_str = "quiet"; + let mut test_param: Param = Param::from_str(&test_param_str); + + assert_eq!(test_param.to_string(), "quiet".to_string()); + test_param.value_replace_blank("et"); + assert_eq!(test_param.to_string(), "qui".to_string()); + + let test_param_str = "max_vcpu=8"; + let test_param: Param = Param::from_str(&test_param_str); + + assert_eq!(test_param.value_to_u8(), 8u8); + assert_eq!(test_param.value_to_u32(), 8u32); + assert_eq!(test_param.value_to_u64(), 8u64); + } + + #[test] + fn test_cmd_param() { + let test_cmdline = "socket,id=charconsole0,path=/tmp/console.sock"; + let test_cmdline_param = CmdParams::from_str(test_cmdline.to_string()); + + assert_eq!( + test_cmdline_param.get("id").unwrap().to_string(), + "id=charconsole0".to_string() + ); + assert_eq!( + test_cmdline_param.get("").unwrap().to_string(), + "socket".to_string() + ); + } +} diff --git a/machine_manager/src/config/network.rs b/machine_manager/src/config/network.rs new file mode 100644 index 00000000..49b6bae9 --- /dev/null +++ b/machine_manager/src/config/network.rs @@ -0,0 +1,163 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate serde; +extern crate serde_json; + +use serde::{Deserialize, Serialize}; + +use super::errors::{ErrorKind, Result}; +use crate::config::{CmdParams, ConfigCheck, ParamOperation, VmConfig}; + +const MAX_STRING_LENGTH: usize = 255; +const MAC_ADDRESS_LENGTH: usize = 17; + +/// Config struct for network +/// Contains network device config, such as `host_dev_name`, `mac`... 
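+///
+/// # Example
+///
+/// An illustrative `net` entry in the json configuration file; the interface
+/// id, tap name and mac address are examples only:
+///
+/// ```text
+/// "net": [
+///     {
+///         "iface_id": "net0",
+///         "host_dev_name": "tap0",
+///         "mac": "1a:2b:3c:4d:5e:6f"
+///     }
+/// ]
+/// ```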
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NetworkInterfaceConfig { + pub iface_id: String, + pub host_dev_name: String, + pub mac: Option, + pub tap_fd: Option, + pub vhost_type: Option, + pub vhost_fd: Option, +} + +impl NetworkInterfaceConfig { + /// Create `NetworkInterfacesConfig` from `Value` structure + /// `Value` structure can be gotten by `json_file` + pub fn from_value(value: &serde_json::Value) -> Option> { + serde_json::from_value(value.clone()).ok() + } + + pub fn set_mac(&mut self, mac_addr: String) { + self.mac = Some(mac_addr); + } +} + +impl Default for NetworkInterfaceConfig { + fn default() -> Self { + NetworkInterfaceConfig { + iface_id: "".to_string(), + host_dev_name: "".to_string(), + mac: None, + tap_fd: None, + vhost_type: None, + vhost_fd: None, + } + } +} + +impl ConfigCheck for NetworkInterfaceConfig { + fn check(&self) -> Result<()> { + if self.iface_id.len() > MAX_STRING_LENGTH { + return Err( + ErrorKind::StringLengthTooLong("iface id".to_string(), MAX_STRING_LENGTH).into(), + ); + } + + if self.host_dev_name.len() > MAX_STRING_LENGTH { + return Err(ErrorKind::StringLengthTooLong( + self.host_dev_name.clone(), + MAX_STRING_LENGTH, + ) + .into()); + } + + if self.mac.is_some() && !check_mac_address(self.mac.as_ref().unwrap()) { + return Err(ErrorKind::MacFormatError.into()); + } + + if let Some(vhost_type) = self.vhost_type.as_ref() { + if vhost_type != "vhost-kernel" { + return Err(ErrorKind::UnknownVhostType.into()); + } + } + + Ok(()) + } +} + +impl VmConfig { + /// Add new network device to `VmConfig` + fn add_netdev(&mut self, net: NetworkInterfaceConfig) { + if let Some(mut nets) = self.nets.clone() { + nets.push(net); + self.nets = Some(nets); + } else { + let mut nets: Vec = Vec::new(); + nets.push(net); + self.nets = Some(nets); + } + } + + /// Update '-netdev ...' 
network config to `VmConfig` + /// Some attr in `NetworkInterfaceConfig` would be found in `DeviceConfig` + pub fn update_net(&mut self, net_config: String) { + let cmd_params: CmdParams = CmdParams::from_str(net_config); + let mut net = NetworkInterfaceConfig::default(); + + if let Some(net_id) = cmd_params.get("id") { + net.iface_id = net_id.value; + } + if let Some(net_hostname) = cmd_params.get("netdev") { + net.host_dev_name = net_hostname.value; + } + if let Some(net_mac) = cmd_params.get("mac") { + net.mac = Some(net_mac.value); + } + if let Some(tap_fd) = cmd_params.get("fds") { + net.tap_fd = Some(tap_fd.value_to_u32() as i32); + } + if let Some(vhost) = cmd_params.get("vhost") { + if vhost.to_bool() { + net.vhost_type = Some("vhost-kernel".to_string()); + } + } + if let Some(vhostfd) = cmd_params.get("vhostfds") { + net.vhost_fd = Some(vhostfd.value_to_u32() as i32); + } + + self.add_netdev(net); + } +} + +fn check_mac_address(mac: &str) -> bool { + if mac.len() != MAC_ADDRESS_LENGTH { + return false; + } + + let mac_vec: Vec<&str> = mac.split(':').collect(); + if mac_vec.len() != 6 { + return false; + } + + let bit_list = [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', + 'C', 'D', 'E', 'F', + ]; + for mac_bit in mac_vec { + if mac_bit.len() != 2 { + return false; + } + let mut mac_bit_char = mac_bit.chars(); + if !bit_list.contains(&mac_bit_char.next().unwrap()) + || !bit_list.contains(&mac_bit_char.next().unwrap()) + { + return false; + } + } + + true +} diff --git a/machine_manager/src/lib.rs b/machine_manager/src/lib.rs new file mode 100644 index 00000000..1aac109e --- /dev/null +++ b/machine_manager/src/lib.rs @@ -0,0 +1,46 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! # Machine Manager +//! +//! API Interface and configuration over virtual machine. +//! +//! ## Design +//! +//! This crate offers support for: +//! 1. A communication way to handle VM outside. +//! 2. The API interface over VM inside and outside. +//! 3. Configuration for VM and its devices. + +#[macro_use] +extern crate log; +#[macro_use] +extern crate error_chain; +extern crate serde_json; + +pub mod config; +pub mod machine; +#[cfg(feature = "qmp")] +pub mod qmp; +pub mod socket; + +pub mod errors { + error_chain! { + links { + ConfigParser(crate::config::errors::Error, crate::config::errors::ErrorKind); + } + foreign_links { + Io(std::io::Error); + Json(serde_json::Error); + } + } +} diff --git a/machine_manager/src/machine.rs b/machine_manager/src/machine.rs new file mode 100644 index 00000000..2703c8d8 --- /dev/null +++ b/machine_manager/src/machine.rs @@ -0,0 +1,174 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. 
+// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate util; + +use std::os::unix::io::RawFd; + +#[cfg(feature = "qmp")] +use crate::qmp::Response; + +#[cfg(feature = "qmp")] +use crate::qmp::qmp_schema::{CacheOptions, FileOptions}; + +/// State for KVM VM. +#[derive(PartialEq, Copy, Clone)] +pub enum KvmVmState { + Created = 1, + Running = 2, + InMigrating = 3, + Migrated = 4, + Paused = 5, + Shutdown = 6, +} + +/// Event over StratoVirt lifetime. +pub enum VmEvent { + ShutdownCauseGuestReset, + ShutdownCauseGuestCrash, + ShutdownCauseFailEntry, + ShutdownCauseInternalError, +} + +unsafe impl Sync for VmEvent {} +unsafe impl Send for VmEvent {} + +/// Trait to handle virtual machine lifecycle. +/// +/// # Notes +/// +/// VM or Device Life State graph: +/// +/// `None` --`(new)`--> `Created` +/// `Created` --`(start)`--> `Running` +/// `Running` --`(pause)`--> `Paused` +/// `Paused` --`(resume)`--> `Running` +/// `KVM_VMSTATE_*` --`(destroy)`--> `None` +/// +/// **Notice**: +/// 1. Migrate state(`Migrated` and `InMigrating`), +/// not include in Life cycle, both migrate state should deal like `PAUSED` +/// state. +/// +/// 2. Snapshot state deal with `PAUSED` state. +/// +/// 3. every one concern with VM or Device state need to implement this trait, +/// will be notified when VM state changed through `lifecycle_notify` hook. +pub trait MachineLifecycle { + /// Start VM or Device, VM or Device enter running state after this call return. + fn start(&self) -> bool { + self.notify_lifecycle(KvmVmState::Created, KvmVmState::Paused) + } + + /// Pause VM or Device, VM or Device will temporarily stored in memory until it resumed + /// or destroyed. + fn pause(&self) -> bool { + self.notify_lifecycle(KvmVmState::Running, KvmVmState::Paused) + } + + /// Resume VM or Device, resume VM state to running state after this call return. + fn resume(&self) -> bool { + self.notify_lifecycle(KvmVmState::Paused, KvmVmState::Running) + } + + /// Close VM or Device, stop running. + fn destroy(&self) -> bool { + self.notify_lifecycle(KvmVmState::Running, KvmVmState::Shutdown) + } + + /// When VM or Device life state changed, notify concerned entry. + /// + /// # Arguments + /// + /// * `old` - The current `KvmVmState`. + /// * `new` - The new `KvmVmState` expected to transform. + fn notify_lifecycle(&self, old: KvmVmState, new: KvmVmState) -> bool; +} + +/// `AddressSpace` access interface of `Machine`. +/// +/// # Notes +/// RAM and peripheral mapping to the memory address space, +/// the CPU or other device can use the memory address to access the +/// certain RAM range or a certain device. +/// +/// Memory-mapped I/O(MMIO) peripheral refers to transfers using an +/// address space inside of normal memory. +/// +/// In x86 architecture, there is a special address space outside of +/// normal memory, the peripheral in the address space use port-mapped +/// I/O(PIO) mode. 
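+///
+/// # Example
+///
+/// An illustrative sketch; `machine` stands for any type implementing this
+/// trait, and the guest address is an example only:
+///
+/// ```text
+/// let mut data = [0_u8; 4];
+/// machine.mmio_read(0xd000_0000, &mut data);
+/// machine.mmio_write(0xd000_0000, &data);
+/// ```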
+pub trait MachineAddressInterface { + #[cfg(target_arch = "x86_64")] + fn pio_in(&self, port: u64, data: &mut [u8]) -> bool; + + #[cfg(target_arch = "x86_64")] + fn pio_out(&self, port: u64, data: &[u8]) -> bool; + + fn mmio_read(&self, addr: u64, data: &mut [u8]) -> bool; + + fn mmio_write(&self, addr: u64, data: &[u8]) -> bool; +} + +/// Device external api +/// +/// # Notes +/// +/// Some external api for device, which can be exposed to outer. +/// Including some query, setting and operation. +pub trait DeviceInterface { + /// Query vm running state. + #[cfg(feature = "qmp")] + fn query_status(&self) -> Response; + + /// Query each cpu's the topology info. + #[cfg(feature = "qmp")] + fn query_cpus(&self) -> Response; + + /// Query each `hotpluggable_cpus`'s topology info and hotplug message. + #[cfg(feature = "qmp")] + fn query_hotpluggable_cpus(&self) -> Response; + + /// Add a device with configuration. + fn device_add( + &self, + device_id: String, + driver: String, + addr: Option, + lun: Option, + ) -> bool; + + /// Delete a device with device id. + fn device_del(&self, device_id: String) -> bool; + + /// Creates a new block device. + fn blockdev_add( + &self, + node_name: String, + file: FileOptions, + cache: Option, + read_only: Option, + ) -> bool; + + /// Create a new network device. + fn netdev_add(&self, id: String, if_name: Option, fds: Option) -> bool; + + /// Receive a file descriptor via SCM rights and assign it a name. + #[cfg(feature = "qmp")] + fn getfd(&self, fd_name: String, if_fd: Option) -> Response; +} + +/// Machine interface which is exposed to inner hypervisor. +pub trait MachineInterface: MachineLifecycle + MachineAddressInterface {} + +/// Machine interface which is exposed to outer hypervisor. +pub trait MachineExternalInterface: MachineLifecycle + DeviceInterface {} diff --git a/machine_manager/src/qmp/mod.rs b/machine_manager/src/qmp/mod.rs new file mode 100644 index 00000000..6299ccc5 --- /dev/null +++ b/machine_manager/src/qmp/mod.rs @@ -0,0 +1,763 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! This module implements a simple way to realize QMP. +//! +//! # Qmp Introduction +//! +//! [Qmp](https://wiki.qemu.org/Documentation/QMP) is a Json-based protocol +//! which allows applications to control a VM instance. +//! It has three feature: +//! 1. Qmp server is no-async service as well as Qemu's. +//! Command + events can replace asynchronous command. +//! 2. Qmp server can only be connected a client at one time. +//! It's no situation where be communicated with many clients. +//! When it must use, can use other communication way not QMP. +//! 3. Qmp's message structure base is transformed by scripts from Qemu's +//! `qmp-schema.json`. It's can be compatible by Qemu's zoology. Those +//! 
transformed structures can be found in `machine_manager/src/qmp/qmp_schema.rs` +extern crate serde; +extern crate serde_json; + +#[allow(non_upper_case_globals)] +#[allow(non_camel_case_types)] +#[allow(non_snake_case)] +pub mod qmp_schema; + +use std::collections::BTreeMap; +use std::io::Write; +use std::os::unix::io::RawFd; +use std::sync::{Arc, RwLock}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use vmm_sys_util::terminal::Terminal; + +use crate::errors::Result; +use crate::machine::MachineExternalInterface; +use crate::socket::SocketRWHandler; +use qmp_schema as schema; +use schema::QmpCommand; + +static mut QMP_CHANNEL: Option> = None; + +/// Macro `event!`: send event to qmp-client. +/// +/// # Arguments +/// +/// * `$x` - event type +/// * `$y` - event context +/// +/// # Example +/// +/// ```text +/// #[macro_use] +/// use machine_manager::qmp::*; +/// +/// event!(SHUTDOWN; shutdown_msg); +/// event!(STOP); +/// event!(RESUME); +/// ``` +#[macro_export] +macro_rules! event { + ( $x:tt ) => {{ + QmpChannel::send_event(&$crate::qmp::qmp_schema::QmpEvent::$x { + data: Default::default(), + timestamp: $crate::qmp::create_timestamp(), + }); + }}; + ( $x:tt;$y:expr ) => {{ + QmpChannel::send_event(&$crate::qmp::qmp_schema::QmpEvent::$x { + data: $y, + timestamp: $crate::qmp::create_timestamp(), + }); + }}; +} + +/// Macro `create_command_matches!`: Generate a match statement for qmp_command +/// `$t` or `$tt`, which is combined with its handle func `$e`. +macro_rules! create_command_matches { + ( $x:expr; $(($t:tt, $e:stmt)),*; $(($tt:tt, $a:tt, $b:expr, $($tail:tt),*)),* ) => { + match $x { + $( + $crate::qmp::qmp_schema::QmpCommand::$t{ id, .. } => { + $e + id + }, + )* + $( + $crate::qmp::qmp_schema::QmpCommand::$tt{ arguments, id } => { + qmp_command_match!($a;$b;arguments;$($tail),*); + id + }, + )* + _ => None, + } + }; +} + +/// Macro: to execute handle func $y/$a with every arguments $y/$tail. +macro_rules! qmp_command_match { + ( $x:tt;$y:expr ) => { + { + $y.$x(); + } + }; + ( $x:tt;$y:expr;$z:expr ) => { + { + $z = $y.$x(); + } + }; + ( $x:tt;$y:expr;$a:expr;$($tail:tt),*) => { + { + $y.$x( + $($a.$tail),* + ); + } + }; +} + +/// Qmp greeting message. +/// +/// # Notes +/// +/// It contains the version of VM or fake Qemu version to adapt others. +#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] +pub struct QmpGreeting { + #[serde(rename = "QMP")] + qmp: Greeting, +} + +#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] +struct Greeting { + version: Version, + capabilities: Vec, +} + +#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] +struct Version { + #[serde(rename = "qemu")] + application: VersionNumber, + package: String, +} + +#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] +struct VersionNumber { + micro: u8, + minor: u8, + major: u8, +} + +impl QmpGreeting { + /// Create qmp greeting message. + /// + /// # Arguments + /// + /// * `micro` - Micro version number. + /// * `minor` - Minor version number. + /// * `major` - Major version number. 
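+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch of the serialized greeting:
+    ///
+    /// ```text
+    /// let greeting = QmpGreeting::create_greeting(1, 0, 4);
+    /// // Serializes to:
+    /// // {"QMP":{"version":{"qemu":{"micro":1,"minor":0,"major":4},
+    /// //         "package":""},"capabilities":[]}}
+    /// ```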
+ pub fn create_greeting(micro: u8, minor: u8, major: u8) -> Self { + let version_number = VersionNumber { + micro, + minor, + major, + }; + let cap: Vec = Default::default(); + let version = Version { + application: version_number, + package: "".to_string(), + }; + let greeting = Greeting { + version, + capabilities: cap, + }; + QmpGreeting { qmp: greeting } + } +} + +/// Qmp response to client +/// +/// # Notes +/// +/// It contains two kind response: `BadResponse` and `GoodResponse`. This two +/// kind response are fit by executing qmp command by success and failure. +#[derive(Debug, Serialize, Deserialize, PartialEq)] +pub struct Response { + #[serde(rename = "return", default, skip_serializing_if = "Option::is_none")] + return_: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + error: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, +} + +impl Response { + /// Create qmp response with inner `Value` and `id`. + /// + /// # Arguments + /// + /// * `v` - The `Value` of qmp `return` field. + /// * `id` - The `id` for qmp `Response`, it must be equal to `Request`'s + /// `id`. + pub fn create_response(v: Value, id: Option) -> Self { + Response { + return_: Some(v), + error: None, + id, + } + } + + /// Create a empty qmp response, `return` field will be empty. + pub fn create_empty_response() -> Self { + Response { + return_: Some(serde_json::to_value(Empty {}).unwrap()), + error: None, + id: None, + } + } + + /// Create a error qmo response with `err_class` and `id`. + /// # Arguments + /// + /// * `err_class` - The `QmpErrorClass` of qmp `error` field. + /// * `id` - The `id` for qmp `Response`, it must be equal to `Request`'s + /// `id`. + pub fn create_error_response( + err_class: schema::QmpErrorClass, + id: Option, + ) -> Result { + Ok(Response { + return_: None, + error: Some(ErrorMessage::new(&err_class)?), + id, + }) + } + + fn change_id(&mut self, id: Option) { + self.id = id; + } +} + +/// `ErrorMessage` for Qmp Response. +#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] +pub struct ErrorMessage { + #[serde(rename = "class")] + errorkind: String, + desc: String, +} + +impl ErrorMessage { + fn new(e: &schema::QmpErrorClass) -> Result { + let content = e.to_content(); + let serde_str = serde_json::to_string(&e)?; + let serde_vec: Vec<&str> = serde_str.split(':').collect(); + let class_name = serde_vec[0]; + let len: usize = class_name.len(); + Ok(ErrorMessage { + errorkind: class_name[2..len - 1].to_string(), + desc: content, + }) + } +} + +/// Empty message for QMP. +#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] +pub struct Empty {} + +/// Command trait for Deserialize and find back Response. +pub trait Command: Serialize { + type Res: DeserializeOwned; + const NAME: &'static str; + fn back(self) -> Self::Res; +} + +/// Event trait for Deserialize. +pub trait Event: DeserializeOwned { + const NAME: &'static str; +} + +/// `TimeStamp` structure for `QmpEvent`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TimeStamp { + seconds: u64, + microseconds: u64, +} + +/// Constructs a `TimeStamp` struct. 
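+///
+/// # Example
+///
+/// ```text
+/// let ts = create_timestamp();
+/// // `ts` holds the host time since UNIX_EPOCH, split into seconds and
+/// // the sub-second remainder in microseconds.
+/// ```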
+pub fn create_timestamp() -> TimeStamp { + let start = SystemTime::now(); + let since_the_epoch = start + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + let seconds = u128::from(since_the_epoch.as_secs()); + let microseconds = (since_the_epoch.as_nanos() - seconds * 1_000_000_000) / (1_000 as u128); + TimeStamp { + seconds: seconds as u64, + microseconds: microseconds as u64, + } +} + +/// Accept qmp command, analyze and exec it. +/// +/// # Arguments +/// +/// * `stream_fd` - The input stream file description. +/// * `controller` - The controller which execute actual qmp command. +/// +/// # Errors +/// +/// This function will fail when json parser failed or socket file description broke. +pub fn handle_qmp(stream_fd: RawFd, controller: &Arc) -> Result<()> { + let mut qmp_service = crate::socket::SocketHandler::new(stream_fd); + match qmp_service.decode_line() { + (Ok(None), _) => Ok(()), + (Ok(buffer), if_fd) => { + info!("QMP: <-- {:?}", buffer); + let qmp_command: schema::QmpCommand = buffer.unwrap(); + let (return_msg, shutdown_flag) = qmp_command_exec(qmp_command, controller, if_fd); + info!("QMP: --> {:?}", return_msg); + qmp_service.send_str(&return_msg)?; + + // handle shutdown command + if shutdown_flag { + let shutdown_msg = schema::SHUTDOWN { + guest: false, + reason: "host-qmp-quit".to_string(), + }; + event!(SHUTDOWN; shutdown_msg); + + std::io::stdin() + .lock() + .set_canon_mode() + .expect("Failed to set terminal to canon mode."); + std::process::exit(1); + } + + Ok(()) + } + (Err(e), _) => { + let err_resp = schema::QmpErrorClass::GenericError(format!("{}", &e)); + warn!("Qmp json parser made an error:{}", e); + qmp_service.send_str(&serde_json::to_string(&Response::create_error_response( + err_resp, None, + )?)?)?; + Ok(()) + } + } +} + +/// Create a match , where `qmp_command` and its arguments matching by handle +/// function, and exec this qmp command. +fn qmp_command_exec( + qmp_command: QmpCommand, + controller: &Arc, + if_fd: Option, +) -> (String, bool) { + let mut qmp_response = Response::create_empty_response(); + let mut shutdown_flag = false; + + // Use macro create match to cover most Qmp command + let mut id = create_command_matches!( + qmp_command.clone(); + (stop, qmp_command_match!(pause; controller)), + (cont, qmp_command_match!(resume; controller)), + (query_status, qmp_command_match!(query_status; controller; qmp_response)), + (query_cpus, qmp_command_match!(query_cpus; controller; qmp_response)), + (query_hotpluggable_cpus, + qmp_command_match!(query_hotpluggable_cpus; controller; qmp_response)); + (device_add, device_add, controller, id, driver, addr, lun), + (device_del, device_del, controller, id), + (blockdev_add, blockdev_add, controller, node_name, file, cache, read_only), + (netdev_add, netdev_add, controller, id, if_name, fds) + ); + + // Handle the Qmp command which macro can't cover + if id.is_none() { + id = match qmp_command { + QmpCommand::quit { id, .. } => { + controller.destroy(); + shutdown_flag = true; + id + } + QmpCommand::getfd { arguments, id } => { + qmp_response = controller.getfd(arguments.fd_name, if_fd); + id + } + _ => None, + } + } + + // Change response id with input qmp message + qmp_response.change_id(id); + (serde_json::to_string(&qmp_response).unwrap(), shutdown_flag) +} + +/// The struct `QmpChannel` is the only struct can handle Global variable +/// `QMP_CHANNEL`. +/// It is used to send event to qmp client and restore some file descriptor +/// which was sended by client. 
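+///
+/// # Examples
+///
+/// A rough sketch of how the channel is driven elsewhere in this module
+/// (illustrative only; `stream_fd` is a placeholder for a connected socket fd):
+///
+/// ```text
+/// QmpChannel::object_init();
+/// QmpChannel::bind_writer(SocketRWHandler::new(stream_fd));
+/// event!(STOP);
+/// QmpChannel::unbind();
+/// ```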
+pub struct QmpChannel { + /// The `writer` to send `QmpEvent`. + event_writer: RwLock>, + /// Restore file descriptor received from client. + fds: Arc>>, +} + +impl QmpChannel { + /// Constructs a `QmpChannel` in global `QMP_CHANNEL`. + pub fn object_init() { + unsafe { + if QMP_CHANNEL.is_none() { + QMP_CHANNEL = Some(Arc::new(QmpChannel { + event_writer: RwLock::new(None), + fds: Arc::new(RwLock::new(BTreeMap::new())), + })); + } + } + } + + /// Bind a `SocketRWHanler` to `QMP_CHANNEL`. + /// + /// # Arguments + /// + /// * `writer` - The `SocketRWHandler` used to communicate with client. + pub fn bind_writer(writer: SocketRWHandler) { + *Self::inner().event_writer.write().unwrap() = Some(writer); + } + + /// Unbind `SocketRWHandler` from `QMP_CHANNEL`. + pub fn unbind() { + *Self::inner().event_writer.write().unwrap() = None; + } + + /// Check whether a `SocketRWHandler` bind with `QMP_CHANNEL` or not. + pub fn is_connected() -> bool { + Self::inner().event_writer.read().unwrap().is_some() + } + + /// Restore extern file descriptor in `QMP_CHANNEL`. + /// + /// # Arguments + /// + /// * `name` - Name of file descriptor. + /// * `fd` - File descriptor sent by client. + pub fn set_fd(name: String, fd: RawFd) { + Self::inner().fds.write().unwrap().insert(name, fd); + } + + /// Get extern file descriptor restored in `QMP_CHANNEL`. + /// + /// # Arguments + /// + /// * `name` - Name of file descriptor. + pub fn get_fd(name: &str) -> Option { + match Self::inner().fds.read().unwrap().get(name) { + Some(fd) => Some(*fd), + None => None, + } + } + + /// Send a `QmpEvent` to client. + /// + /// # Arguments + /// + /// * `event` - The `QmpEvent` sent to client. + #[allow(clippy::unused_io_amount)] + pub fn send_event(event: &schema::QmpEvent) { + if Self::is_connected() { + let event_str = serde_json::to_string(&event).unwrap(); + let mut writer_unlocked = Self::inner().event_writer.write().unwrap(); + let writer = writer_unlocked.as_mut().unwrap(); + writer.flush().unwrap(); + writer.write(event_str.as_bytes()).unwrap(); + writer.write(&[b'\n']).unwrap(); + info!("EVENT: --> {:?}", event); + } + } + + fn inner() -> &'static std::sync::Arc { + unsafe { + match &QMP_CHANNEL { + Some(channel) => channel, + None => { + panic!("Qmp channel not initialized"); + } + } + } + } +} + +#[cfg(test)] +mod tests { + extern crate serde_json; + use super::*; + use std::os::unix::net::{UnixListener, UnixStream}; + + #[test] + fn test_qmp_greeting_msg() { + let greeting_msg = QmpGreeting::create_greeting(1, 0, 4); + + let json_msg = r#" + { + "QMP":{ + "version":{ + "qemu":{ + "micro": 1, + "minor": 0, + "major": 4 + }, + "package": "" + }, + "capabilities": [] + } + } + "#; + let greeting_from_json: QmpGreeting = serde_json::from_str(json_msg).unwrap(); + + assert_eq!(greeting_from_json, greeting_msg); + } + + #[test] + fn test_qmp_resp() { + // 1.Empty response and ID change; + let mut resp = Response::create_empty_response(); + resp.change_id(Some(0)); + + let json_msg = r#"{"return":{},"id":0}"#; + assert_eq!(serde_json::to_string(&resp).unwrap(), json_msg); + + resp.change_id(Some(1)); + let json_msg = r#"{"return":{},"id":1}"#; + assert_eq!(serde_json::to_string(&resp).unwrap(), json_msg); + + // 2.Normal response + let resp_value = schema::StatusInfo { + singlestep: false, + running: true, + status: schema::RunState::running, + }; + let resp = Response::create_response(serde_json::to_value(&resp_value).unwrap(), None); + + let json_msg = 
r#"{"return":{"running":true,"singlestep":false,"status":"running"}}"#; + assert_eq!(serde_json::to_string(&resp).unwrap(), json_msg); + + // 3.Error response + let qmp_err = + schema::QmpErrorClass::GenericError("Invalid Qmp command arguments!".to_string()); + let resp = Response::create_error_response(qmp_err, None).unwrap(); + + let json_msg = + r#"{"error":{"class":"GenericError","desc":"Invalid Qmp command arguments!"}}"#; + assert_eq!(serde_json::to_string(&resp).unwrap(), json_msg); + } + + #[test] + fn test_qmp_event_msg() { + let event_json = + r#"{"event":"STOP","data":{},"timestamp":{"seconds":1575531524,"microseconds":91519}}"#; + let qmp_event: schema::QmpEvent = serde_json::from_str(&event_json).unwrap(); + match qmp_event { + schema::QmpEvent::STOP { + data: _, + timestamp: _, + } => { + assert!(true); + } + _ => assert!(false), + } + } + + // Environment Preparation for UnixSocket + fn prepare_unix_socket_environment(socket_id: &str) -> (UnixListener, UnixStream, UnixStream) { + let socket_name: String = format!("test_{}.sock", socket_id); + let _ = std::fs::remove_file(&socket_name); + + let listener = UnixListener::bind(&socket_name).unwrap(); + let client = UnixStream::connect(&socket_name).unwrap(); + let (server, _) = listener.accept().unwrap(); + (listener, client, server) + } + + // Environment Recovery for UnixSocket + fn recover_unix_socket_environment(socket_id: &str) { + let socket_name: String = format!("test_{}.sock", socket_id); + std::fs::remove_file(&socket_name).unwrap(); + } + + #[test] + fn test_qmp_event_macro() { + use crate::socket::{Socket, SocketRWHandler}; + use std::io::Read; + + // Pre test. Environment preparation + QmpChannel::object_init(); + let mut buffer = [0u8; 200]; + let (listener, mut client, server) = prepare_unix_socket_environment("06"); + + // Use event! macro to send event msg to client + let socket = Socket::from_unix_listener(listener, None); + socket.bind_unix_stream(server); + QmpChannel::bind_writer(SocketRWHandler::new(socket.get_stream_fd())); + + // 1.send no-content event + event!(STOP); + let length = client.read(&mut buffer).unwrap(); + let qmp_event: schema::QmpEvent = + serde_json::from_str(&(String::from_utf8_lossy(&buffer[..length]))).unwrap(); + match qmp_event { + schema::QmpEvent::STOP { + data: _, + timestamp: _, + } => { + assert!(true); + } + _ => assert!(false), + } + + // 2.send with-content event + let shutdown_event = schema::SHUTDOWN { + guest: true, + reason: "guest-shutdown".to_string(), + }; + event!(SHUTDOWN; shutdown_event); + let length = client.read(&mut buffer).unwrap(); + let qmp_event: schema::QmpEvent = + serde_json::from_str(&(String::from_utf8_lossy(&buffer[..length]))).unwrap(); + match qmp_event { + schema::QmpEvent::SHUTDOWN { data, timestamp: _ } => { + assert_eq!(data.guest, true); + assert_eq!(data.reason, "guest-shutdown".to_string()); + } + _ => assert!(false), + } + + // After test. Environment Recover + recover_unix_socket_environment("06"); + } + + #[test] + fn test_qmp_send_response() { + use crate::socket::Socket; + use std::io::Read; + + // Pre test. Environment preparation + let mut buffer = [0u8; 300]; + let (listener, mut client, server) = prepare_unix_socket_environment("07"); + + // Use event! 
macro to send event msg to client + let socket = Socket::from_unix_listener(listener, None); + socket.bind_unix_stream(server); + + // 1.send greeting response + socket.send_response(true); + let length = client.read(&mut buffer).unwrap(); + let qmp_response: QmpGreeting = + serde_json::from_str(&(String::from_utf8_lossy(&buffer[..length]))).unwrap(); + let qmp_greeting = QmpGreeting::create_greeting(1, 0, 4); + assert_eq!(qmp_greeting, qmp_response); + + // 2.send empty response + socket.send_response(false); + let length = client.read(&mut buffer).unwrap(); + let qmp_response: Response = + serde_json::from_str(&(String::from_utf8_lossy(&buffer[..length]))).unwrap(); + let qmp_empty_response = Response::create_empty_response(); + assert_eq!(qmp_empty_response, qmp_response); + + // After test. Environment Recover + recover_unix_socket_environment("07"); + drop(socket); + } + + #[derive(Clone)] + struct TestQmpHandler { + content: usize, + } + + impl TestQmpHandler { + fn get_content(&self) -> usize { + self.content + } + + // No response no args + fn handle_qmp_type_01(&mut self) { + self.content = 1; + } + + // With response and no args + fn handle_qmp_type_02(&mut self) -> String { + self.content = 2; + "It's type 2 handler".to_string() + } + + // No response with args + fn handle_qmp_type_03(&mut self, _arguments: String) { + self.content = 3; + } + } + + fn test_handle_qmp( + qmp_command: QmpCommand, + mut handler: TestQmpHandler, + ) -> (Option, String, usize) { + let mut resp_str = String::new(); + ( + create_command_matches!( + qmp_command; + (stop, qmp_command_match!(handle_qmp_type_01; handler)), + (query_cpus, qmp_command_match!(handle_qmp_type_02; handler; resp_str)); + (device_del, handle_qmp_type_03, handler, id) + ), + resp_str, + handler.get_content(), + ) + } + + #[test] + fn test_qmp_match_macro() { + let qmp_handler = TestQmpHandler { content: 0 }; + + // 1.Build a qmp command with id and no args, no response + let qmp_command = schema::QmpCommand::stop { + arguments: Default::default(), + id: Some(0), + }; + assert_eq!( + test_handle_qmp(qmp_command, qmp_handler.clone()), + (Some(0), String::new(), 1) + ); + + // 2.Build a qmp command with id and no args, with response + let qmp_command = schema::QmpCommand::query_cpus { + arguments: Default::default(), + id: Some(0), + }; + assert_eq!( + test_handle_qmp(qmp_command, qmp_handler.clone()), + (Some(0), "It's type 2 handler".to_string(), 2) + ); + + // 3.Build a qmp command with id and with args, no response + let qmp_command = schema::QmpCommand::device_del { + arguments: schema::device_del { + id: "cpu_0".to_string(), + }, + id: Some(0), + }; + assert_eq!( + test_handle_qmp(qmp_command, qmp_handler.clone()), + (Some(0), String::new(), 3) + ); + } +} diff --git a/machine_manager/src/qmp/qmp_schema.rs b/machine_manager/src/qmp/qmp_schema.rs new file mode 100644 index 00000000..93460197 --- /dev/null +++ b/machine_manager/src/qmp/qmp_schema.rs @@ -0,0 +1,819 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
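+
+//! QMP schema definitions: the command, response and event structures that the
+//! QMP server serializes and deserializes with `serde`.
+//!
+//! A sketch of the wire traffic these types map to (values are examples only):
+//!
+//! ```text
+//! -> { "execute": "query-status" }
+//! <- { "return": { "running": true, "singlestep": false, "status": "running" } }
+//! <- { "event": "STOP", "data": {},
+//!      "timestamp": { "seconds": 1575531524, "microseconds": 91519 } }
+//! ```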
+ +extern crate serde; +extern crate serde_json; + +use serde::{Deserialize, Serialize}; +pub use serde_json::Value as Any; + +use crate::qmp::{Command, Empty, Event, TimeStamp}; + +/// A error enum for qmp +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum QmpErrorClass { + #[serde(rename = "GenericError")] + GenericError(String), + #[serde(rename = "CommandNotFound")] + CommandNotFound(String), + #[serde(rename = "DeviceNotActive")] + DeviceNotActive(String), + #[serde(rename = "DeviceNotFound")] + DeviceNotFound(String), + #[serde(rename = "KVMMissingCap")] + KVMMissingCap(String), +} + +impl QmpErrorClass { + pub fn to_content(&self) -> String { + match self { + QmpErrorClass::GenericError(s) => s.to_string(), + QmpErrorClass::CommandNotFound(s) => s.to_string(), + QmpErrorClass::DeviceNotActive(s) => s.to_string(), + QmpErrorClass::DeviceNotFound(s) => s.to_string(), + QmpErrorClass::KVMMissingCap(s) => s.to_string(), + } + } +} + +/// A enum to store all command struct +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "execute")] +pub enum QmpCommand { + #[serde(rename = "qmp_capabilities")] + qmp_capabilities { + #[serde(default)] + arguments: qmp_capabilities, + }, + quit { + #[serde(default)] + arguments: quit, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + stop { + #[serde(default)] + arguments: stop, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + cont { + #[serde(default)] + arguments: cont, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + device_add { + arguments: device_add, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + device_del { + arguments: device_del, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + netdev_add { + arguments: netdev_add, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + netdev_del { + arguments: netdev_del, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + #[serde(rename = "query-hotpluggable-cpus")] + query_hotpluggable_cpus { + #[serde(default)] + arguments: query_hotpluggable_cpus, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + #[serde(rename = "query-cpus")] + query_cpus { + #[serde(default)] + arguments: query_cpus, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + #[serde(rename = "query-status")] + query_status { + #[serde(default)] + arguments: query_status, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + getfd { + arguments: getfd, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + #[serde(rename = "blockdev-add")] + blockdev_add { + arguments: blockdev_add, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, + #[serde(rename = "blockdev-del")] + blockdev_del { + arguments: blockdev_del, + #[serde(default, skip_serializing_if = "Option::is_none")] + id: Option, + }, +} + +/// qmp_capabilities +/// +/// Enable QMP capabilities. 
+/// +/// # Examples +/// +/// ```text +/// -> { "execute": "qmp_capabilities" } +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct qmp_capabilities {} + +impl Command for qmp_capabilities { + const NAME: &'static str = "qmp_capabilities"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// quit +/// +/// This command will cause the StratoVirt process to exit gracefully. While every +/// attempt is made to send the QMP response before terminating, this is not +/// guaranteed. When using this interface, a premature EOF would not be +/// unexpected. +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "quit" } +/// <- { "return": {}} +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct quit {} + +impl Command for quit { + const NAME: &'static str = "quit"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// stop +/// +/// Stop all guest VCPU execution +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "stop" } +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct stop {} + +impl Command for stop { + const NAME: &'static str = "stop"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// cont +/// +/// Resume guest VCPU execution. +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "cont" } +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct cont {} + +impl Command for cont { + const NAME: &'static str = "cont"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// device_add +/// +/// # Arguments +/// +/// * `id` - the device's ID, must be unique. +/// * `driver` - the name of the new device's driver. +/// * `addr` - the address device insert into. +/// +/// Additional arguments depend on the type. +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "device_add", +/// "arguments": { "id": "net-0", "driver": "virtio-net-mmio", "addr": "0x0"}} +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct device_add { + #[serde(rename = "id")] + pub id: String, + #[serde(rename = "driver")] + pub driver: String, + #[serde(rename = "addr")] + pub addr: Option, + #[serde(rename = "lun")] + pub lun: Option, +} + +impl Command for device_add { + const NAME: &'static str = "device_add"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct FileOptions { + pub driver: String, + pub filename: String, +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct CacheOptions { + #[serde(rename = "no-flush")] + pub no_flush: Option, + pub direct: Option, +} + +/// blockdev_add +/// +/// # Arguments +/// +/// * `node_name` - the device's ID, must be unique. +/// * `file` - the backend file information. +/// * `cache` - if use direct io. +/// * `read_only` - if readonly. +/// +/// Additional arguments depend on the type. 
+/// +/// # Examples +/// +/// ```text +/// -> { "execute": "blockdev_add", +/// "arguments": {"node-name": "drive-0", +/// "file": {"driver": "file", "filename": "/path/to/block"}, +/// "cache": {"direct": true}, "read-only": false }} +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct blockdev_add { + #[serde(rename = "node-name")] + pub node_name: String, + pub file: FileOptions, + pub cache: Option, + #[serde(rename = "read-only")] + pub read_only: Option, +} + +impl Command for blockdev_add { + const NAME: &'static str = "blockdev-add"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// netdev_add +/// +/// # Arguments +/// +/// * `id` - the device's ID, must be unique. +/// * `ifname` - the backend tap dev name. +/// * `fds` - the file fd opened by upper level. +/// +/// Additional arguments depend on the type. +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "netdev_add", +/// "arguments": {"id": "net-0", "ifname": "tap0", "fds": 123 }} +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct netdev_add { + pub id: String, + #[serde(rename = "ifname")] + pub if_name: Option, + pub fds: Option, +} + +impl Command for netdev_add { + const NAME: &'static str = "netdev_add"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// device_del +/// +/// Remove a device from a guest +/// +/// # Arguments +/// +/// * `id` - the device's ID or QOM path. +/// +/// # Errors +/// +/// If `id` is not a valid device, DeviceNotFound. +/// +/// # Notes +/// +/// When this command completes, the device may not be removed from the +/// guest. Hot removal is an operation that requires guest cooperation. +/// This command merely requests that the guest begin the hot removal +/// process. Completion of the device removal process is signaled with a +/// DEVICE_DELETED event. Guest reset will automatically complete removal +/// for all devices. +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "device_del", +/// "arguments": { "id": "net-0" } } +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct device_del { + pub id: String, +} + +impl Command for device_del { + const NAME: &'static str = "device_del"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct blockdev_del { + #[serde(rename = "node-name")] + pub node_name: String, +} + +impl Command for blockdev_del { + const NAME: &'static str = "blockdev-del"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// netdev_del +/// +/// Remove a network backend. +/// +/// # Arguments +/// +/// * `id` - The name of the network backend to remove. +/// +/// # Errors +/// +/// If `id` is not a valid network backend, DeviceNotFound +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "netdev_del", "arguments": { "id": "net-0" } } +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct netdev_del { + pub id: String, +} + +impl Command for netdev_del { + const NAME: &'static str = "netdev_del"; + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// query-hotpluggable-cpus: +/// +/// # Returns +/// +/// A list of Hotpluggable CPU objects. 
+/// +/// # Examples +/// +/// For pc machine type started with -smp 1,maxcpus=2: +/// ```text +/// -> { "execute": "query-hotpluggable-cpus" } +/// <- {"return": [ +/// { +/// "type": "qemu64-x86_64-cpu", "vcpus-count": 1, +/// "props": {"core-id": 0, "socket-id": 1, "thread-id": 0} +/// }, +/// { +/// "qom-path": "/machine/unattached/device[0]", +/// "type": "qemu64-x86_64-cpu", "vcpus-count": 1, +/// "props": {"core-id": 0, "socket-id": 0, "thread-id": 0} +/// } +/// ]} +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct query_hotpluggable_cpus {} + +impl Command for query_hotpluggable_cpus { + const NAME: &'static str = "query-hotpluggable-cpus"; + type Res = Vec; + + fn back(self) -> Vec { + Default::default() + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct HotpluggableCPU { + #[serde(rename = "type")] + pub type_: String, + #[serde(rename = "vcpus-count")] + pub vcpus_count: isize, + #[serde(rename = "props")] + pub props: CpuInstanceProperties, + #[serde(rename = "qom-path", default, skip_serializing_if = "Option::is_none")] + pub qom_path: Option, +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct CpuInstanceProperties { + #[serde(rename = "node-id", default, skip_serializing_if = "Option::is_none")] + pub node_id: Option, + #[serde(rename = "socket-id", default, skip_serializing_if = "Option::is_none")] + pub socket_id: Option, + #[serde(rename = "thread-id", default, skip_serializing_if = "Option::is_none")] + pub thread_id: Option, + #[serde(rename = "core-id", default, skip_serializing_if = "Option::is_none")] + pub core_id: Option, +} + +/// query-cpus: +/// +/// This command causes vCPU threads to exit to userspace, which causes +/// a small interruption to guest CPU execution. This will have a negative +/// impact on realtime guests and other latency sensitive guest workloads. +/// It is recommended to use @query-cpus-fast instead of this command to +/// avoid the vCPU interruption. +/// +/// # Returns +/// +/// A list of information about each virtual CPU. 
+/// +/// # Examples +/// +/// ```text +/// -> { "execute": "query-cpus" } +/// <- { "return": [ +/// { +/// "CPU":0, +/// "current":true, +/// "halted":false, +/// "qom_path":"/machine/unattached/device[0]", +/// "arch":"x86", +/// "thread_id":3134 +/// }, +/// { +/// "CPU":1, +/// "current":false, +/// "halted":true, +/// "qom_path":"/machine/unattached/device[2]", +/// "arch":"x86", +/// "thread_id":3135 +/// } +/// ] +/// } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct query_cpus {} + +impl Command for query_cpus { + const NAME: &'static str = "query-cpus"; + type Res = Vec; + + fn back(self) -> Vec { + Default::default() + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "arch")] +pub enum CpuInfo { + #[serde(rename = "x86")] + x86 { + #[serde(rename = "current")] + current: bool, + #[serde(rename = "qom_path")] + qom_path: String, + #[serde(rename = "halted")] + halted: bool, + #[serde(rename = "props", default, skip_serializing_if = "Option::is_none")] + props: Option, + #[serde(rename = "CPU")] + CPU: isize, + #[serde(rename = "thread_id")] + thread_id: isize, + #[serde(flatten)] + #[serde(rename = "x86")] + x86: CpuInfoX86, + }, + #[serde(rename = "arm")] + Arm { + #[serde(rename = "current")] + current: bool, + #[serde(rename = "qom_path")] + qom_path: String, + #[serde(rename = "halted")] + halted: bool, + #[serde(rename = "props", default, skip_serializing_if = "Option::is_none")] + props: Option, + #[serde(rename = "CPU")] + CPU: isize, + #[serde(rename = "thread_id")] + thread_id: isize, + #[serde(flatten)] + #[serde(rename = "Arm")] + arm: CpuInfoArm, + }, +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct CpuInfoX86 {} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct CpuInfoArm {} + +/// query-status +/// +/// Query the run status of all VCPUs. +/// +/// # Returns +/// +/// `StatusInfo` reflecting all VCPUs. 
+/// +/// # Examples +/// +/// ```text +/// -> { "execute": "query-status" } +/// <- { "return": { "running": true, +/// "singlestep": false, +/// "status": "running" } } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct query_status {} + +impl Command for query_status { + const NAME: &'static str = "query-status"; + type Res = StatusInfo; + + fn back(self) -> StatusInfo { + Default::default() + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct StatusInfo { + #[serde(rename = "singlestep")] + pub singlestep: bool, + #[serde(rename = "running")] + pub running: bool, + #[serde(rename = "status")] + pub status: RunState, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum RunState { + #[serde(rename = "debug")] + debug, + #[serde(rename = "inmigrate")] + inmigrate, + #[serde(rename = "internal-error")] + internal_error, + #[serde(rename = "io-error")] + io_error, + #[serde(rename = "paused")] + paused, + #[serde(rename = "postmigrate")] + postmigrate, + #[serde(rename = "prelaunch")] + prelaunch, + #[serde(rename = "finish-migrate")] + finish_migrate, + #[serde(rename = "restore-vm")] + restore_vm, + #[serde(rename = "running")] + running, + #[serde(rename = "save-vm")] + save_vm, + #[serde(rename = "shutdown")] + shutdown, + #[serde(rename = "suspended")] + suspended, + #[serde(rename = "watchdog")] + watchdog, + #[serde(rename = "guest-panicked")] + guest_panicked, + #[serde(rename = "colo")] + colo, + #[serde(rename = "preconfig")] + preconfig, +} + +impl Default for RunState { + fn default() -> Self { + RunState::debug + } +} + +/// getfd +/// +/// Receive a file descriptor via SCM rights and assign it a name +/// +/// # Arguments +/// +/// * `fdname` - File descriptor name. +/// +/// # Examples +/// +/// ```text +/// -> { "execute": "getfd", "arguments": { "fdname": "fd1" } } +/// <- { "return": {} } +/// ``` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct getfd { + #[serde(rename = "fdname")] + pub fd_name: String, +} + +impl Command for getfd { + const NAME: &'static str = "getfd"; + + type Res = Empty; + + fn back(self) -> Empty { + Default::default() + } +} + +/// SHUTDOWN +/// +/// Emitted when the virtual machine has shut down, indicating that StratoVirt is +/// about to exit. +/// +/// # Notes +/// +/// If the command-line option "-no-shutdown" has been specified, StratoVirt +/// will not exit, and a STOP event will eventually follow the SHUTDOWN event +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SHUTDOWN { + /// If true, the shutdown was triggered by a guest request (such as + /// a guest-initiated ACPI shutdown request or other hardware-specific + /// action) rather than a host request (such as sending StratoVirt a SIGINT). + #[serde(rename = "guest")] + pub guest: bool, + pub reason: String, +} + +impl Event for SHUTDOWN { + const NAME: &'static str = "SHUTDOWN"; +} + +/// RESET +/// +/// Emitted when the virtual machine is reset +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RESET { + /// If true, the reset was triggered by a guest request (such as + /// a guest-initiated ACPI reboot request or other hardware-specific action + /// ) rather than a host request (such as the QMP command system_reset). 
+ #[serde(rename = "guest")] + pub guest: bool, +} + +impl Event for RESET { + const NAME: &'static str = "RESET"; +} + +/// STOP +/// +/// Emitted when the virtual machine is stopped +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct STOP {} + +impl Event for STOP { + const NAME: &'static str = "STOP"; +} + +/// RESUME +/// +/// Emitted when the virtual machine resumes execution +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct RESUME {} + +impl Event for RESUME { + const NAME: &'static str = "RESUME"; +} + +/// DEVICE_DELETED +/// +/// Emitted whenever the device removal completion is acknowledged by the guest. +/// At this point, it's safe to reuse the specified device ID. Device removal can +/// be initiated by the guest or by HMP/QMP commands. +/// +/// # Examples +/// +/// ```text +/// <- { "event": "DEVICE_DELETED", +/// "data": { "device": "virtio-net-mmio-0", +/// "path": "/machine/peripheral/virtio-net-mmio-0" }, +/// "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DEVICE_DELETED { + /// Device name. + #[serde(rename = "device", default, skip_serializing_if = "Option::is_none")] + pub device: Option, + /// Device path. + #[serde(rename = "path")] + pub path: String, +} + +impl Event for DEVICE_DELETED { + const NAME: &'static str = "DEVICE_DELETED"; +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "event")] +pub enum QmpEvent { + #[serde(rename = "SHUTDOWN")] + SHUTDOWN { + data: SHUTDOWN, + timestamp: TimeStamp, + }, + #[serde(rename = "RESET")] + RESET { data: RESET, timestamp: TimeStamp }, + #[serde(rename = "STOP")] + STOP { + #[serde(default)] + data: STOP, + timestamp: TimeStamp, + }, + #[serde(rename = "RESUME")] + RESUME { + #[serde(default)] + data: RESUME, + timestamp: TimeStamp, + }, + #[serde(rename = "DEVICE_DELETED")] + DEVICE_DELETED { + data: DEVICE_DELETED, + timestamp: TimeStamp, + }, +} diff --git a/machine_manager/src/socket.rs b/machine_manager/src/socket.rs new file mode 100644 index 00000000..bb3bee39 --- /dev/null +++ b/machine_manager/src/socket.rs @@ -0,0 +1,850 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use serde::Deserialize; +use std::io; +use std::io::{Read, Write}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; +use std::sync::{Arc, Mutex, RwLock}; + +use util::epoll_context::{EventNotifier, EventNotifierHelper, NotifierOperation}; +use vmm_sys_util::epoll::EventSet; + +use super::errors::Result; +use crate::machine::MachineExternalInterface; +#[cfg(feature = "qmp")] +use crate::{ + qmp::qmp_schema::QmpEvent, + qmp::{QmpChannel, QmpGreeting, Response}, +}; + +const MAX_SOCKET_MSG_LENGTH: usize = 8192; + +/// The wrapper over Unix socket and socket handler. 
+/// +/// # Example +/// +/// ```no_run +/// use std::os::unix::net::{UnixListener, UnixStream}; +/// use std::os::unix::io::AsRawFd; +/// use std::io::prelude::*; +/// +/// use machine_manager::socket::Socket; +/// +/// fn main() -> std::io::Result<()> { +/// let listener = UnixListener::bind("/path/to/my/socket")?; +/// let socket = Socket::from_unix_listener(listener, None); +/// assert!(!socket.is_connected()); +/// +/// let client_stream = UnixStream::connect("/path/to/my/socket")?; +/// let server_stream = socket.accept_unix_stream(); +/// socket.bind_unix_stream(server_stream); +/// assert!(socket.is_connected()); +/// Ok(()) +/// } +/// ``` +pub struct Socket { + /// Type for Socket + sock_type: SocketType, + /// Socket listener tuple + listener: UnixListener, + /// Socket stream with RwLock + stream: RwLock>, + /// Perform socket command + performer: Option>, +} + +impl Socket { + /// Allocates a new `Socket` with `UnixListener`. + /// + /// # Arguments + /// + /// * `listener` - The `UnixListener` bind to `Socket`. + /// * `performer` - The `VM` to perform socket command. + pub fn from_unix_listener( + listener: UnixListener, + performer: Option>, + ) -> Self { + Socket { + sock_type: SocketType::Unix, + listener, + stream: RwLock::new(None), + performer, + } + } + + /// Get listener's fd from `Socket`. + pub fn get_listener_fd(&self) -> RawFd { + self.listener.as_raw_fd() + } + + /// Accept stream and bind to Socket. + pub fn accept(&self) { + match self.sock_type { + SocketType::Unix => { + let stream = self.accept_unix_stream(); + self.bind_unix_stream(stream); + } + } + + #[cfg(feature = "qmp")] + { + QmpChannel::bind_writer(SocketRWHandler::new(self.get_stream_fd())); + self.send_response(true); + } + } + + /// Accept a new incoming connection unix stream from unix listener. + pub fn accept_unix_stream(&self) -> UnixStream { + let (stream, _) = self.listener.accept().unwrap(); + stream + } + + /// Get socket type from `Socket`. + pub fn get_socket_type(&self) -> SocketType { + self.sock_type + } + + /// Bind `Socket` with a `UnixStream`. + /// + /// # Arguments + /// + /// * `unix_stream` - The `UnixStream` bind to `Socket`. + pub fn bind_unix_stream(&self, unix_stream: UnixStream) { + let stream = SocketStream::from_unix_stream(unix_stream); + *self.stream.write().unwrap() = Some(stream); + } + + /// Unbind stream from `Socket`, reset the state. + pub fn drop_stream(&self) { + *self.stream.write().unwrap() = None; + } + + /// Confirm whether socket stream bind to `Socket` or not. + pub fn is_connected(&self) -> bool { + self.stream.read().unwrap().is_some() + } + + /// Get socket fd from `Socket`, it a private function. + pub fn get_stream_fd(&self) -> RawFd { + if self.is_connected() { + self.stream.read().unwrap().as_ref().unwrap().socket_fd + } else { + panic!("Failed to get socket fd!"); + } + } + + /// Get a `SocketHandler` from `Socket`. + pub fn get_socket_handler(&self) -> SocketHandler { + SocketHandler::new(self.get_stream_fd()) + } + + /// In qmp feature, send event to client. + /// + /// # Arguments + /// + /// * `event` - The `QmpEvent` will be sent to client. + #[cfg(feature = "qmp")] + pub fn send_event(&self, event: &QmpEvent) { + if self.is_connected() { + let mut handler = self.get_socket_handler(); + let event_str = serde_json::to_string(&event).unwrap(); + handler.send_str(&event_str).unwrap(); + info!("EVENT: --> {:?}", event); + } + } + + /// In qmp feature, send empty or greeting response to client. 
+ /// + /// # Arguments + /// + /// * `is_greeting` - Whether sending greeting response or not. + #[cfg(feature = "qmp")] + pub fn send_response(&self, is_greeting: bool) { + if self.is_connected() { + let mut handler = self.get_socket_handler(); + let resp = if is_greeting { + serde_json::to_string(&QmpGreeting::create_greeting(1, 0, 4)).unwrap() + } else { + serde_json::to_string(&Response::create_empty_response()).unwrap() + }; + handler.send_str(&resp).unwrap(); + info!("QMP: --> {:?}", resp); + } + } + + /// Create socket's accepted stream to `event_notifier`. + fn create_event_notifier( + &mut self, + shared_socket: Arc>, + ) -> Option> { + let mut notifiers = Vec::new(); + self.accept(); + + let mut handlers = Vec::new(); + let handler: Box Option>> = + Box::new(move |event, _| { + if event == EventSet::IN { + let socket_mutexed = shared_socket.lock().unwrap(); + let stream_fd = socket_mutexed.get_stream_fd(); + + #[cfg(feature = "qmp")] + { + let performer = &socket_mutexed.performer.as_ref().unwrap(); + + if let Err(e) = crate::qmp::handle_qmp(stream_fd, performer) { + error!("{}", e); + } + } + + #[cfg(not(feature = "qmp"))] + { + if let Err(e) = SocketRWHandler::new(stream_fd).read_fd() { + error!("{}", e); + } + } + } + if event & EventSet::HANG_UP == EventSet::HANG_UP { + let socket_mutexed = shared_socket.lock().unwrap(); + let stream_fd = socket_mutexed.get_stream_fd(); + let listener_fd = socket_mutexed.get_listener_fd(); + + #[cfg(feature = "qmp")] + { + QmpChannel::unbind(); + } + + Some(vec![EventNotifier::new( + NotifierOperation::Delete, + stream_fd, + Some(listener_fd), + EventSet::IN | EventSet::HANG_UP, + Vec::new(), + )]) + } else { + None + } + }); + handlers.push(Arc::new(Mutex::new(handler))); + + let notifier = EventNotifier::new( + NotifierOperation::AddShared, + self.get_stream_fd(), + Some(self.get_listener_fd()), + EventSet::IN | EventSet::HANG_UP, + handlers, + ); + + notifiers.push(notifier); + Some(notifiers) + } +} + +impl EventNotifierHelper for Socket { + fn internal_notifiers(shared_socket: Arc>) -> Vec { + let mut notifiers = Vec::new(); + + let socket = shared_socket.clone(); + let mut handlers = Vec::new(); + let handler: Box Option>> = + Box::new(move |_, _| socket.lock().unwrap().create_event_notifier(socket.clone())); + + handlers.push(Arc::new(Mutex::new(handler))); + + let notifier = EventNotifier::new( + NotifierOperation::AddShared, + shared_socket.lock().unwrap().get_listener_fd(), + None, + EventSet::IN, + handlers, + ); + + notifiers.push(notifier); + + notifiers + } +} + +/// Type for api socket. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum SocketType { + Unix = 1, +} + +/// Wrapper over UnixSteam. +#[derive(Debug)] +struct SocketStream { + /// `RawFd` for socket + socket_fd: RawFd, + /// Make `UnixStream` persistent without `drop` + persistent: Option, +} + +impl SocketStream { + fn from_unix_stream(stream: UnixStream) -> Self { + SocketStream { + socket_fd: stream.as_raw_fd(), + persistent: Some(stream), + } + } +} + +/// Wrapper over socket file description read and write message. 
+/// +/// # Examples +/// +/// ```no_run +/// use std::os::unix::net::UnixStream; +/// use std::os::unix::io::AsRawFd; +/// use std::io::prelude::*; +/// +/// use machine_manager::socket::SocketRWHandler; +/// +/// fn main() -> std::io::Result<()> { +/// let mut stream = UnixStream::connect("/path/to/my/socket")?; +/// let mut handler = SocketRWHandler::new(stream.as_raw_fd()); +/// stream.write_all(b"hello world")?; +/// let mut buffer = [0_u8; 20]; +/// let count = handler.read(&mut buffer)?; +/// println!("{}", String::from_utf8_lossy(&buffer[..count])); +/// Ok(()) +/// } +/// ``` +pub struct SocketRWHandler { + /// Socket fd to read and write message + socket_fd: RawFd, + /// Buffer to restore byte read and write with fd + buf: Vec, + /// Pos to buffer when read and write with fd + pos: usize, + /// Fds when read from fd's scm right + scm_fd: Vec, +} + +impl SocketRWHandler { + /// Allocates a new `SocketRWHandler` with a socket fd + /// + /// # Arguments + /// + /// * `r` - The file descriptor for socket. + pub fn new(r: RawFd) -> Self { + SocketRWHandler { + socket_fd: r, + buf: Vec::new(), + pos: 0, + scm_fd: Vec::new(), + } + } + + /// Get inner buf as a `String`. + pub fn get_buf_string(&mut self) -> Result { + if self.buf.len() > MAX_SOCKET_MSG_LENGTH { + bail!("The socket messege is too long."); + } + + Ok(String::from_utf8_lossy(&self.buf).trim().to_string()) + } + + /// Get the last file descriptor read from `scm_fd`. + pub fn getfd(&mut self) -> Option { + if self.scm_fd.is_empty() { + None + } else { + Some(self.scm_fd[self.scm_fd.len() - 1]) + } + } + + /// Receive bytes and scm_fd from socket file descriptor. + /// + /// # Notes + /// + /// Use [recvmsg(2)](https://linux.die.net/man/2/recvmsg) to receive + /// messages from `socket_fd`. Some fd can be passed over an `UnixSocket` + /// in a single Control Message. + /// This function can read both buffer[u8] and fd. + /// + /// # Errors + /// The socket file descriptor is broken. + fn read_fd(&mut self) -> std::io::Result<()> { + use libc::{ + c_uint, c_void, cmsghdr, iovec, msghdr, recvmsg, CMSG_DATA, CMSG_FIRSTHDR, CMSG_SPACE, + MSG_DONTWAIT, SCM_RIGHTS, SOL_SOCKET, + }; + + 'read: loop { + let tmp_buf = [0_u8; 1]; + let mut iov = iovec { + iov_base: tmp_buf.as_ptr() as *mut c_void, + iov_len: 1, + }; + + let mut cmsg_space = { + let mut space = 0; + space += + unsafe { CMSG_SPACE(std::mem::size_of::<[RawFd; 2]>() as c_uint) } as usize; + Some(Vec::::with_capacity(space)) + }; + + let (msg_control, msg_controllen) = cmsg_space + .as_mut() + .map(|v| (v.as_mut_ptr(), v.capacity())) + .unwrap_or((std::ptr::null_mut(), 0)); + + // In `musl` toolchain, msghdr has private member `__pad0` and `__pad1`, it can't be + // initialized in normal way. + let mut mhdr: msghdr = unsafe { std::mem::zeroed() }; + mhdr.msg_name = std::ptr::null_mut(); + mhdr.msg_namelen = 0; + mhdr.msg_iov = &mut iov as *mut iovec; + mhdr.msg_iovlen = 1; + mhdr.msg_control = msg_control as *mut c_void; + mhdr.msg_controllen = msg_controllen as _; + mhdr.msg_flags = 0; + + // MSG_DONTWAIT: Enables nonblocking operation, if the operation would block the call + // fails with the error EAGAIN or EWOULDBLOCK. 
When this error occurs, break loop + let ret = unsafe { recvmsg(self.socket_fd, &mut mhdr, MSG_DONTWAIT) }; + + if ret == -1 { + let sock_err = io::Error::last_os_error(); + if sock_err.kind() == io::ErrorKind::WouldBlock { + break 'read; + } else { + return Err(sock_err); + } + } + + let cmsg_hdr: Option<&cmsghdr> = unsafe { + if mhdr.msg_controllen > 0 { + cmsg_space + .as_mut() + .unwrap() + .set_len(mhdr.msg_controllen as usize); + CMSG_FIRSTHDR(&mhdr as *const msghdr) + } else { + std::ptr::null() + } + .as_ref() + }; + + if let Some(scm) = cmsg_hdr { + if scm.cmsg_level == SOL_SOCKET && scm.cmsg_type == SCM_RIGHTS { + let scm_cmsg_header = unsafe { + std::slice::from_raw_parts( + CMSG_DATA(scm), + std::mem::size_of::<[RawFd; 2]>() as usize, + ) + }; + for fd in scm_cmsg_header.iter() { + if *fd != 0 { + self.scm_fd.push(i32::from(*fd)); + } + } + } + }; + + self.buf.push(tmp_buf[0]); + if let Some(pos) = self.pos.checked_add(1) { + self.pos = pos; + } else { + return Err(io::ErrorKind::InvalidInput.into()); + } + } + Ok(()) + } + + /// Send bytes message with socket file descriptor. + /// + /// # Notes + /// Use [sendmsg(2)](https://linux.die.net/man/2/sendmsg) to send messages + /// to `socket_fd`. + /// Message is `self::buf`: Vec with `self::pos` and length. + /// + /// # Arguments + /// + /// * `length` - Length of the buf to write. + /// + /// # Errors + /// The socket file descriptor is broken. + fn write_fd(&mut self, length: usize) -> std::io::Result<()> { + use libc::{c_void, iovec, msghdr, sendmsg}; + + let mut iov = iovec { + iov_base: self.buf.as_slice()[(self.pos - length)..(self.pos - 1)].as_ptr() + as *mut c_void, + iov_len: length, + }; + + // In `musl` toolchain, msghdr has private member `__pad0` and `__pad1`, it can't be + // initialized in normal way. + let mut mhdr: msghdr = unsafe { std::mem::zeroed() }; + mhdr.msg_name = std::ptr::null_mut(); + mhdr.msg_namelen = 0; + mhdr.msg_iov = &mut iov as *mut iovec; + mhdr.msg_iovlen = 1; + mhdr.msg_control = std::ptr::null_mut(); + mhdr.msg_controllen = 0; + mhdr.msg_flags = 0; + + if unsafe { sendmsg(self.socket_fd, &mhdr, 0) } == -1 { + Err(io::Error::new( + io::ErrorKind::BrokenPipe, + "The socket pipe is broken!", + )) + } else { + Ok(()) + } + } + + /// Reset `SocketRWHandler` buffer and pos. + pub fn clear(&mut self) { + self.buf.clear(); + self.scm_fd.clear(); + self.pos = 0; + } +} + +impl Read for SocketRWHandler { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let start = self.pos; + self.read_fd()?; + + buf[0..self.pos - start].copy_from_slice(&self.buf[start..self.pos]); + Ok(self.pos - start) + } +} + +impl Write for SocketRWHandler { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.buf.extend(buf); + if let Some(pos) = self.pos.checked_add(buf.len()) { + self.pos = pos; + } else { + return Err(io::ErrorKind::InvalidInput.into()); + } + + self.write_fd(buf.len())?; + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.clear(); + Ok(()) + } +} + +/// The handler to handle socket stream and parse socket stream bytes to +/// json-string. 
+/// +/// # Examples +/// +/// ```no_run +/// use std::os::unix::net::UnixStream; +/// use std::os::unix::io::AsRawFd; +/// use std::io::prelude::*; +/// +/// use machine_manager::socket::SocketHandler; +/// +/// fn main() -> std::io::Result<()> { +/// let mut stream = UnixStream::connect("/path/to/my/socket")?; +/// let mut handler = SocketHandler::new(stream.as_raw_fd()); +/// handler.send_str(&String::from("hello world"))?; +/// let mut response = String::new(); +/// stream.read_to_string(&mut response)?; +/// println!("{}", response); +/// Ok(()) +/// } +/// ``` +pub struct SocketHandler { + /// Handler `Read` and `Write` for socket stream + stream: SocketRWHandler, + /// Buffer to leave with read result + buffer: String, +} + +impl SocketHandler { + /// Allocates a new `SocketRWHandler` with `socket_fd` + /// + /// # Arguments + /// + /// * `r` - The file descriptor for socket. + pub fn new(r: RawFd) -> Self { + SocketHandler { + stream: SocketRWHandler::new(r), + buffer: String::new(), + } + } + + /// Parse the bytes received by `SocketHandler`. + /// + /// # Notes + /// If the bytes ended with '\n', this function will remove it. And then + /// parse to Deserialize object. + pub fn decode_line<'de, D: Deserialize<'de>>( + &'de mut self, + ) -> (Result>, Option) { + self.buffer.clear(); + self.stream.clear(); + self.stream.read_fd().unwrap(); + match self.stream.get_buf_string() { + Ok(buffer) => { + self.buffer = buffer; + if self.stream.pos == 0 { + (Ok(None), None) + } else { + ( + serde_json::from_str(&self.buffer) + .map(Some) + .map_err(From::from), + self.stream.getfd(), + ) + } + } + Err(e) => (Err(e), None), + } + } + + /// Send String to `socket_fd`. + /// + /// # Arguments + /// + /// * `s` - The `String` send to `socket_fd`. + /// + /// # Errors + /// The socket file descriptor is broken. 
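+    ///
+    /// # Examples
+    ///
+    /// A usage sketch (illustrative only; `stream_fd` is a placeholder for a
+    /// connected socket fd):
+    ///
+    /// ```text
+    /// let mut handler = SocketHandler::new(stream_fd);
+    /// handler.send_str("{\"return\":{}}")?;
+    /// ```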
+ pub fn send_str(&mut self, s: &str) -> std::io::Result<()> { + self.stream.flush().unwrap(); + match self.stream.write(s.as_bytes()) { + Ok(_) => { + let _ = self.stream.write(&[b'\n'])?; + Ok(()) + } + Err(_) => Err(io::Error::new( + io::ErrorKind::BrokenPipe, + "The socket pipe is broken!", + )), + } + } +} + +#[cfg(test)] +mod tests { + use std::io::{Read, Write}; + use std::os::unix::io::{AsRawFd, RawFd}; + use std::os::unix::net::{UnixListener, UnixStream}; + use std::time::Duration; + + use serde::{Deserialize, Serialize}; + + use super::{Socket, SocketHandler, SocketRWHandler, SocketType}; + + // Environment Preparation for UnixSocket + fn prepare_unix_socket_environment(socket_id: &str) -> (UnixListener, UnixStream, UnixStream) { + let socket_name: String = format!("test_{}.sock", socket_id); + let _ = std::fs::remove_file(&socket_name); + + let listener = UnixListener::bind(&socket_name).unwrap(); + + std::thread::sleep(Duration::from_millis(100)); + let client = UnixStream::connect(&socket_name).unwrap(); + let (server, _) = listener.accept().unwrap(); + (listener, client, server) + } + + // Environment Recovery for UnixSocket + fn recover_unix_socket_environment(socket_id: &str) { + let socket_name: String = format!("test_{}.sock", socket_id); + std::fs::remove_file(&socket_name).unwrap(); + } + + fn socket_basic_rw(client_fd: RawFd, server_fd: RawFd) -> bool { + // Create `write_handler` and `read_handler` from `client_fd` and `server_fd` + let mut write_handler = SocketRWHandler::new(client_fd); + let mut read_handler = SocketRWHandler::new(server_fd); + + // Send a `buf` from `write_handler` to `read_handler` + // 1.First write + let test_buf1: [u8; 11] = [104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100]; + + assert_eq!(write_handler.write(&test_buf1).unwrap(), 11); + assert_eq!(write_handler.pos, 11); + + let mut rst_buf = [0u8; 20]; + assert_eq!(read_handler.read(&mut rst_buf).unwrap(), 11); + assert_eq!(rst_buf[..11], test_buf1); + assert_eq!(read_handler.buf, write_handler.buf); + assert_eq!(read_handler.buf[..11], test_buf1); + assert_eq!(write_handler.pos, 11); + + // 2.Second write + let test_buf2: [u8; 10] = [104, 101, 108, 108, 111, 32, 114, 117, 115, 116]; + + assert_eq!(write_handler.write(&test_buf2).unwrap(), 10); + assert_eq!(write_handler.pos, 21); + + assert_eq!(read_handler.read(&mut rst_buf).unwrap(), 10); + assert_eq!(rst_buf[..10], test_buf2); + assert_eq!(read_handler.buf, write_handler.buf); + assert_eq!(read_handler.buf[11..], test_buf2); + assert_eq!(write_handler.pos, 21); + + // 3.Use 'flush' and test third time + let test_buf3: [u8; 6] = [115, 111, 99, 107, 101, 116]; + write_handler.flush().unwrap(); + read_handler.flush().unwrap(); + assert_eq!(write_handler.pos, 0); + assert_eq!(read_handler.pos, 0); + assert!(write_handler.buf.is_empty()); + assert!(read_handler.buf.is_empty()); + + assert_eq!(write_handler.write(&test_buf3).unwrap(), 6); + assert_eq!(write_handler.pos, 6); + + assert_eq!(read_handler.read(&mut rst_buf).unwrap(), 6); + assert_eq!(rst_buf[..6], test_buf3); + assert_eq!(read_handler.buf, write_handler.buf); + assert_eq!(read_handler.buf[..6], test_buf3); + assert_eq!(write_handler.pos, 6); + + true + } + + #[test] + fn test_unix_socket_read_and_write() { + // Pre test. Environment Preparation + let (_, client, server) = prepare_unix_socket_environment("01"); + + // Test fn: socket basic read and write + assert!(socket_basic_rw(client.as_raw_fd(), server.as_raw_fd())); + + // After test. 
Environment Recover + recover_unix_socket_environment("01"); + } + + #[test] + fn test_socket_handler_sendstr() { + // Pre test. Environment Preparation + let (_, mut client, server) = prepare_unix_socket_environment("02"); + let mut handler = SocketHandler::new(server.as_raw_fd()); + + // Send a `String` with fn `sendstr` in SocketHandler + // 1.send str + handler.send_str("I am a test str").unwrap(); + let mut response = [0u8; 50]; + let length = client.read(&mut response).unwrap(); + assert_eq!( + String::from_utf8_lossy(&response[..length]), + "I am a test str\n".to_string() + ); + + // 2.send String + let message = String::from("I am a test String"); + handler.send_str(&message).unwrap(); + let length = client.read(&mut response).unwrap(); + assert_eq!( + String::from_utf8_lossy(&response[..length]), + "I am a test String\n".to_string() + ); + + // After test. Environment Recover + recover_unix_socket_environment("02"); + } + + #[derive(Serialize, Deserialize, PartialEq, Debug)] + struct JsonTestStruct { + name: String, + age: u8, + phones: Vec, + } + + #[test] + fn test_socket_handler_json_parser() { + // Pre test. Environment Preparation + let (_, mut client, server) = prepare_unix_socket_environment("03"); + let mut handler = SocketHandler::new(server.as_raw_fd()); + + // Use fn `decode_line` in `SocketHandler` to receive and parse msg to json struct + // 1.msg without '\n' or 'EOF' + let data = r#" + { + "name": "Lucky Dog", + "age": 18, + "phones": [ + "+86 01234567890", + "+86 09876543210" + ] + } + "#; + client.write(data.as_bytes()).unwrap(); + let resp_json: JsonTestStruct = match handler.decode_line() { + (Ok(buffer), _) => buffer.unwrap(), + _ => panic!("Failed to decode line!"), + }; + assert_eq!( + resp_json, + JsonTestStruct { + name: "Lucky Dog".to_string(), + age: 18u8, + phones: vec!["+86 01234567890".to_string(), "+86 09876543210".to_string()], + }, + ); + + // 2.msg with '\n' + client.write(data.as_bytes()).unwrap(); + client.write(b"\n").unwrap(); + let resp_json: JsonTestStruct = match handler.decode_line() { + (Ok(buffer), _) => buffer.unwrap(), + _ => panic!("Failed to decode line!"), + }; + assert_eq!( + resp_json, + JsonTestStruct { + name: "Lucky Dog".to_string(), + age: 18u8, + phones: vec!["+86 01234567890".to_string(), "+86 09876543210".to_string()], + }, + ); + + // After test. Environment Recover + recover_unix_socket_environment("03"); + } + + #[test] + fn test_socket_lifecycle() { + // Pre test. Environment Preparation + let (listener, _, server) = prepare_unix_socket_environment("04"); + let socket = Socket::from_unix_listener(listener, None); + + // life cycle test + // 1.Unconnected + assert_eq!(socket.is_connected(), false); + + // 2.Connected + socket.bind_unix_stream(server); + assert_eq!(socket.is_connected(), true); + assert_eq!(socket.get_socket_type(), SocketType::Unix); + + // 3.Unbind SocketStream, reset state + socket.drop_stream(); + assert_eq!(socket.is_connected(), false); + + // 4.Accept and reconnect a new UnixStream + let _new_client = UnixStream::connect("test_04.sock"); + let new_server = socket.accept_unix_stream(); + socket.bind_unix_stream(new_server); + assert_eq!(socket.is_connected(), true); + + // After test. Environment Recover + recover_unix_socket_environment("04"); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..9fb043d5 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,150 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. 
+// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +#[macro_use] +extern crate error_chain; +#[macro_use] +extern crate log; +extern crate vmm_sys_util; + +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::net::UnixListener; +use std::sync::{Arc, Mutex}; + +use vmm_sys_util::terminal::Terminal; + +use device_model::cmdline::{check_api_channel, create_args_parser, create_vmconfig}; +use device_model::{register_seccomp, LightMachine, MainLoop}; +use machine_manager::config::VmConfig; +#[cfg(feature = "qmp")] +use machine_manager::qmp::QmpChannel; +use machine_manager::socket::Socket; +use util::epoll_context::EventNotifierHelper; +use util::unix::limit_permission; +use util::{arg_parser, daemonize::daemonize, logger}; + +error_chain! { + links { + Manager(machine_manager::errors::Error, machine_manager::errors::ErrorKind); + Vm(device_model::errors::Error, device_model::errors::ErrorKind); + Util(util::errors::Error, util::errors::ErrorKind); + } + foreign_links { + Io(std::io::Error); + } +} + +quick_main!(run); + +#[allow(clippy::cast_ptr_alignment)] +fn run() -> Result<()> { + let cmd_args = create_args_parser().get_matches()?; + + if let Some(logfile_path) = cmd_args.value_of("display log") { + if logfile_path.is_empty() { + logger::init_logger_with_env(Some(Box::new(std::io::stdout()))) + .expect("logger init failed!"); + } else { + let logfile = std::fs::OpenOptions::new() + .read(false) + .write(true) + .append(true) + .create(true) + .mode(0o640) + .open(logfile_path)?; + logger::init_logger_with_env(Some(Box::new(logfile))).expect("logger init failed!"); + } + } else { + logger::init_logger_with_env(None).expect("logger init failed!"); + } + + std::panic::set_hook(Box::new(|panic_msg| { + std::io::stdin() + .lock() + .set_canon_mode() + .expect("Failed to set terminal to canon mode."); + + let panic_file = panic_msg.location().map_or("", |loc| loc.file()); + let panic_line = panic_msg.location().map_or(0, |loc| loc.line()); + if let Some(msg) = panic_msg.payload().downcast_ref::() { + error!("Panic at [{}: {}]: {}.", panic_file, panic_line, msg); + } else { + error!("Panic at [{}: {}].", panic_file, panic_line); + } + })); + + match real_main(&cmd_args) { + Ok(()) => info!("MainLoop over, Vm exit"), + Err(ref e) => { + std::io::stdin() + .lock() + .set_canon_mode() + .expect("Failed to set terminal to canon mode."); + error!("{}", error_chain::ChainedError::display_chain(e)); + } + } + + Ok(()) +} + +fn real_main(cmd_args: &arg_parser::ArgMatches) -> Result<()> { + let vm_config: VmConfig = create_vmconfig(cmd_args)?; + info!("VmConfig is {:?}", vm_config); + + if cmd_args.is_present("daemonize") { + match daemonize(cmd_args.value_of("pidfile")) { + Ok(()) => info!("Daemonize mode start!"), + Err(e) => error!("Daemonize start failed: {}", e), + } + } else { + std::io::stdin() + .lock() + .set_raw_mode() + .chain_err(|| "Failed to set terminal to raw mode.")?; + } + + #[cfg(feature = "qmp")] + QmpChannel::object_init(); + MainLoop::object_init(); + + let vm = LightMachine::new(vm_config)?; + MainLoop::set_manager(vm.clone()); + + let 
api_socket = { + let (api_path, _) = check_api_channel(&cmd_args)?; + let listener = UnixListener::bind(&api_path)?; + limit_permission(&api_path)?; + Socket::from_unix_listener(listener, Some(vm.clone())) + }; + + MainLoop::update_event(EventNotifierHelper::internal_notifiers(Arc::new( + Mutex::new(api_socket), + )))?; + + vm.realize()?; + vm.vm_start( + cmd_args.is_present("freeze_cpu"), + !cmd_args.is_present("disable-seccomp"), + )?; + + if !cmd_args.is_present("disable-seccomp") { + register_seccomp()?; + } + + loop { + if !MainLoop::run()? { + break; + } + } + + Ok(()) +} diff --git a/util/Cargo.toml b/util/Cargo.toml new file mode 100644 index 00000000..35c2d229 --- /dev/null +++ b/util/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "util" +version = "0.1.0" +authors = ["Huawei StratoVirt Team"] +edition = "2018" +license = "Mulan PSL v2" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2.71" +log = { version = "0.4.8", features = ["std"]} +error-chain = "0.12.4" +kvm-ioctls = "0.5.0" +kvm-bindings = "0.2.0" +vmm-sys-util = "0.6.1" diff --git a/util/src/aio/libaio.rs b/util/src/aio/libaio.rs new file mode 100644 index 00000000..b07a6995 --- /dev/null +++ b/util/src/aio/libaio.rs @@ -0,0 +1,122 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
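+
+//! A thin wrapper around the Linux native AIO syscalls (`io_setup`,
+//! `io_submit` and `io_getevents`). It is consumed by the higher-level `Aio`
+//! engine in `aio/mod.rs`.
+//!
+//! The sketch below is illustrative only: `file_fd`, `buf_addr` and `buf_len`
+//! are placeholders and error handling is elided.
+//!
+//! ```ignore
+//! // Reserve a context that can hold up to 128 in-flight requests.
+//! let ctx = LibaioContext::new(128)?;
+//!
+//! // Build one read request and submit it; completions are reaped later
+//! // with `get_events()`.
+//! let mut iocb = IoCb {
+//!     aio_lio_opcode: IoCmd::PREAD as u16,
+//!     aio_fildes: file_fd as u32,
+//!     aio_buf: buf_addr,
+//!     aio_nbytes: buf_len,
+//!     ..Default::default()
+//! };
+//! let mut iocbs = vec![&mut iocb as *mut IoCb];
+//! ctx.submit(1, &mut iocbs)?;
+//! let done = ctx.get_events()?;
+//! assert!(done.nr <= 128);
+//! ```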
+ +use super::Result; + +pub const IOCB_FLAG_RESFD: u32 = 1; +pub const IOCB_FLAG_IOPRIO: u32 = 1 << 1; + +#[derive(Debug, Clone)] +pub struct Iovec { + pub iov_base: u64, + pub iov_len: u64, +} + +#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Default)] +pub struct IoCb { + pub data: u64, + pub key: u32, + pub aio_reserved1: u32, + pub aio_lio_opcode: u16, + pub aio_reqprio: u16, + pub aio_fildes: u32, + pub aio_buf: u64, + pub aio_nbytes: u64, + pub aio_offset: u64, + pub aio_reserved2: u64, + pub aio_flags: u32, + pub aio_resfd: u32, +} + +#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Copy, Clone)] +pub enum IoCmd { + PREAD = 0, + PWRITE = 1, + FSYNC = 2, + FDSYNC = 3, + NOOP = 6, + PREADV = 7, + PWRITEV = 8, +} + +#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Default)] +pub struct IoEvent { + pub data: u64, + pub obj: u64, + pub res: i64, + pub res2: i64, +} + +#[allow(non_camel_case_types)] +pub enum IoContext {} + +pub struct EventResult { + pub events: Vec, + pub nr: usize, +} + +pub struct LibaioContext { + pub ctx: *mut IoContext, + pub max_size: i32, +} + +impl LibaioContext { + pub fn new(max_size: i32) -> Result { + let mut ctx = std::ptr::null_mut(); + + let ret = unsafe { libc::syscall(libc::SYS_io_setup, max_size, &mut ctx) }; + if ret < 0 { + bail!("Failed to setup aio context, return {}.", ret); + } + + Ok(LibaioContext { ctx, max_size }) + } + + pub fn submit(&self, nr: i64, iocbp: &mut Vec<*mut IoCb>) -> Result<()> { + let ret = unsafe { libc::syscall(libc::SYS_io_submit, self.ctx, nr, iocbp.as_ptr()) }; + if ret < 0 { + bail!("Failed to submit aio, return {}.", ret); + } + + Ok(()) + } + + #[allow(clippy::zero_ptr)] + pub fn get_events(&self) -> Result { + let mut events: Vec<_> = (0..self.max_size).map(|_| IoEvent::default()).collect(); + + let evt_cnt = unsafe { + libc::syscall( + libc::SYS_io_getevents, + self.ctx, + 0, + i64::from(self.max_size), + events.as_mut_ptr(), + 0 as *mut libc::timespec, + ) + }; + if evt_cnt < 0 { + bail!("Failed to get aio events, return {}.", evt_cnt); + } + + Ok(EventResult { + events, + nr: evt_cnt as usize, + }) + } +} diff --git a/util/src/aio/mod.rs b/util/src/aio/mod.rs new file mode 100644 index 00000000..f0b04015 --- /dev/null +++ b/util/src/aio/mod.rs @@ -0,0 +1,183 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
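+
+//! An asynchronous I/O engine built on top of the raw libaio wrapper.
+//!
+//! Requests (`AioCb`) are queued with `rw_aio()`, submitted to the kernel in
+//! batches of at most `max_events`, and their completions are signalled
+//! through an eventfd (`IOCB_FLAG_RESFD`). Calling `handle()` reaps finished
+//! requests and runs the registered completion callback; `rw_sync()` is the
+//! synchronous fallback based on `pread`/`pwrite`/`fdatasync`.
+//!
+//! A minimal sketch (illustrative only; `file_fd`, `buf_addr` and `buf_len`
+//! are placeholders):
+//!
+//! ```ignore
+//! // The callback runs once per completed request; `res` is the byte count
+//! // or a negative errno.
+//! let func: Arc<AioCompleteFunc<u32>> = Arc::new(Box::new(|aiocb, res| {
+//!     println!("request {} finished with {}", aiocb.iocompletecb, res);
+//! }));
+//! let mut aio = Aio::new(func)?;
+//!
+//! let mut cb = AioCb::new(0_u32);
+//! cb.file_fd = file_fd;
+//! cb.opcode = IoCmd::PREADV;
+//! cb.iovec = vec![Iovec { iov_base: buf_addr, iov_len: buf_len }];
+//! aio.rw_aio(cb)?;
+//!
+//! // When `aio.fd` (the eventfd) becomes readable, reap the completions.
+//! aio.handle()?;
+//! ```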
+ +mod libaio; +mod raw; + +use std::clone::Clone; +use std::marker::{Send, Sync}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; + +use vmm_sys_util::eventfd::EventFd; + +use super::errors::Result; +use super::link_list::{List, Node}; +pub use libaio::*; +pub use raw::*; + +type CbList = List>; +type CbNode = Node>; + +pub type AioCompleteFunc = Box, i64) + Sync + Send>; + +pub struct AioCb { + pub last_aio: bool, + pub file_fd: RawFd, + pub opcode: IoCmd, + pub iovec: Vec, + pub offset: usize, + pub process: bool, + pub iocb: Option>, + pub iocompletecb: T, +} + +impl AioCb { + pub fn new(cb: T) -> Self { + AioCb { + last_aio: true, + file_fd: 0, + opcode: IoCmd::NOOP, + iovec: Vec::new(), + offset: 0, + process: false, + iocb: None, + iocompletecb: cb, + } + } +} + +pub struct Aio { + pub ctx: Arc, + pub fd: EventFd, + pub aio_in_queue: CbList, + pub aio_in_flight: CbList, + max_events: usize, + complete_func: Arc>, +} + +impl Aio { + pub fn new(func: Arc>) -> Result { + let max_events = 128; + + Ok(Aio { + ctx: Arc::new(LibaioContext::new(max_events as i32)?), + fd: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + aio_in_queue: List::new(), + aio_in_flight: List::new(), + max_events, + complete_func: func, + }) + } + + pub fn handle(&mut self) -> Result<()> { + let evts = self.ctx.get_events()?; + for e in evts.events.iter().take(evts.nr) { + if e.res2 == 0 { + unsafe { + let node = e.data as *mut CbNode; + + (self.complete_func)(&(*node).value, e.res); + self.aio_in_flight.unlink(&(*node)); + + // free mem + if let Some(i) = (*node).value.iocb { + libc::free((*node).value.iovec.as_ptr() as *mut libc::c_void); + libc::free(i.as_ptr() as *mut libc::c_void); + }; + libc::free(node as *mut libc::c_void); + } + } + } + self.process_list() + } + + fn process_list(&mut self) -> Result<()> { + if self.aio_in_queue.len > 0 && self.aio_in_flight.len < self.max_events { + let mut iocbs = Vec::new(); + + for _ in self.aio_in_flight.len..self.max_events { + match self.aio_in_queue.pop_tail() { + Some(node) => { + iocbs.push(node.value.iocb.unwrap().as_ptr()); + self.aio_in_flight.add_head(node); + } + None => break, + } + } + + if !iocbs.is_empty() { + return self.ctx.submit(iocbs.len() as i64, &mut iocbs); + } + } + + Ok(()) + } + + pub fn rw_aio(&mut self, cb: AioCb) -> Result<()> { + let last_aio = cb.last_aio; + let opcode = cb.opcode; + let file_fd = cb.file_fd; + let iovec = (&*cb.iovec).as_ptr() as u64; + let sg_size = cb.iovec.len(); + let offset = cb.offset; + + let mut node = Box::new(Node::new(cb)); + let iocb = IoCb { + aio_lio_opcode: opcode as u16, + aio_fildes: file_fd as u32, + aio_buf: iovec, + aio_nbytes: sg_size as u64, + aio_offset: offset as u64, + aio_flags: IOCB_FLAG_RESFD, + aio_resfd: self.fd.as_raw_fd() as u32, + data: (&mut (*node) as *mut CbNode) as u64, + ..Default::default() + }; + node.value.iocb = std::ptr::NonNull::new(Box::into_raw(Box::new(iocb))); + + self.aio_in_queue.add_head(node); + if last_aio || self.aio_in_queue.len + self.aio_in_flight.len >= self.max_events { + return self.process_list(); + } + + Ok(()) + } + + pub fn rw_sync(&mut self, cb: AioCb) -> Result<()> { + let ret = match cb.opcode { + IoCmd::PREADV => { + let mut r = 0; + let mut off = cb.offset; + for iov in cb.iovec.iter() { + r = raw_read(cb.file_fd, iov.iov_base, iov.iov_len as usize, off)?; + off += iov.iov_len as usize; + } + r + } + IoCmd::PWRITEV => { + let mut r = 0; + let mut off = cb.offset; + for iov in cb.iovec.iter() { + r = raw_write(cb.file_fd, iov.iov_base, 
iov.iov_len as usize, off)?; + off += iov.iov_len as usize; + } + r + } + IoCmd::FDSYNC => raw_datasync(cb.file_fd)?, + _ => -1, + }; + (self.complete_func)(&cb, ret); + + Ok(()) + } +} diff --git a/util/src/aio/raw.rs b/util/src/aio/raw.rs new file mode 100644 index 00000000..27f6f039 --- /dev/null +++ b/util/src/aio/raw.rs @@ -0,0 +1,42 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use super::Result; +use libc::{c_void, fdatasync, pread, pwrite}; +use std::os::unix::io::RawFd; + +pub fn raw_read(fd: RawFd, buf: u64, size: usize, offset: usize) -> Result { + let ret = unsafe { pread(fd, buf as *mut c_void, size, offset as i64) as i64 }; + if ret < 0 { + bail!("Failed to pread for {}, return {}.", fd, ret); + } + + Ok(ret) +} + +pub fn raw_write(fd: RawFd, buf: u64, size: usize, offset: usize) -> Result { + let ret = unsafe { pwrite(fd, buf as *mut c_void, size, offset as i64) as i64 }; + if ret < 0 { + bail!("Failed to pwrite for {}, return {}.", fd, ret); + } + + Ok(ret) +} + +pub fn raw_datasync(fd: RawFd) -> Result { + let ret = unsafe { i64::from(fdatasync(fd)) }; + if ret < 0 { + bail!("Failed to fdatasync for {}, return {}.", fd, ret); + } + + Ok(ret) +} diff --git a/util/src/arg_parser.rs b/util/src/arg_parser.rs new file mode 100644 index 00000000..26750864 --- /dev/null +++ b/util/src/arg_parser.rs @@ -0,0 +1,631 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::collections::BTreeMap; +use std::env; +use std::io::Write; +use std::process; + +use crate::errors::{ErrorKind, Result}; + +const PREFIX_CHARS_SHORT: &str = "-"; +const PREFIX_CHARS_LONG: &str = "-"; +const HELP_SHORT: &str = "h"; +const HELP_LONG: &str = "help"; +const VERSION_SHORT: &str = "V"; +const VERSION_LONG: &str = "version"; +const FOUR_BLANK: &str = " "; +const EIGHT_BLANK: &str = " "; +const TWENTY_FOUT_BLANK: &str = " "; + +type ArgsMap = BTreeMap>; + +/// Format help type. +pub enum HelpType { + /// Argument as a Flag. + FLAGS, + /// Argument as a Option. + OPTION, + /// Argument will not output in help message. + HIDDEN, +} + +/// Structure to store `ArgParser` information, which contains a command line +/// program and all command line arguments can be used. The `ArgParser` are set +/// using the `ArgParser::get_matches` member methods to start parse process +/// cmdline. 
+/// +/// # Examples +/// +/// ```no_run +/// # use util::arg_parser::{ArgParser, Arg}; +/// let application = ArgParser::new("My Application") +/// .author("example") +/// .version("0.0.1") +/// .about("Description for application") +/// .arg( +/// Arg::with_name("arg_name") +/// ) +/// .get_matches(); +/// ``` +#[derive(Clone, Debug, Default)] +pub struct ArgParser<'a> { + name: &'a str, + version: Option<&'a str>, + author: Option<&'a str>, + about: Option<&'a str>, + args: BTreeMap<&'a str, Arg<'a>>, + allow_list: Vec, +} + +/// The structure is used to get information about arguments that were supplied +/// to the application from user. New instances of this struct are created by +/// using the `ArgParser::get_matches` methods. +#[derive(Debug, Default, Clone)] +pub struct ArgMatches<'a> { + pub args: BTreeMap<&'a str, Arg<'a>>, +} + +/// The structure of a command line argument. Used to set all the options that +/// define a valid argument for the application. +/// +/// # Examples +/// +/// ```rust +/// # use util::arg_parser::Arg; +/// let arg = Arg::with_name("name") +/// .long("name") +/// .value_name("arg_name") +/// .help("set the name of the arg.") +/// .takes_value(true); +/// ``` +#[derive(Clone, Debug, Default)] +pub struct Arg<'a> { + name: &'a str, + long: Option<&'a str>, + short: Option<&'a str>, + help: Option<&'a str>, + value_name: Option<&'a str>, + value: Option, + values: Option>, + possible_values: Option>, + required: bool, + presented: bool, + hiddable: bool, + multiple: bool, + can_no_value: bool, +} + +impl<'a> ArgParser<'a> { + /// Create a new `ArgParser` with a name. The name will be displayed to the + /// user when they use `-V` or `-h`. + /// + /// # Examples + /// + /// ```no_run + /// use util::arg_parser::ArgParser; + /// + /// let application = ArgParser::new("My Application"); + /// ``` + pub fn new(name: &'a str) -> Self { + let mut arg_parser = ArgParser::default().name(name); + + arg_parser + .allow_list + .push(format!("{}{}", PREFIX_CHARS_SHORT, HELP_SHORT)); + arg_parser + .allow_list + .push(format!("{}{}", PREFIX_CHARS_LONG, HELP_LONG)); + arg_parser + .allow_list + .push(format!("{}{}", PREFIX_CHARS_SHORT, VERSION_SHORT)); + arg_parser + .allow_list + .push(format!("{}{}", PREFIX_CHARS_LONG, VERSION_LONG)); + + arg_parser + } + + /// Set name for ArgParser. + fn name(mut self, name: &'a str) -> Self { + self.name = name; + self + } + + /// Set version for `ArgParser`. + pub fn version(mut self, version: &'a str) -> Self { + self.version = Some(version); + self + } + + /// Set author for `ArgParser`. + pub fn author(mut self, author: &'a str) -> Self { + self.author = Some(author); + self + } + + /// Set about for `ArgParser`. + pub fn about(mut self, about: &'a str) -> Self { + self.about = Some(about); + self + } + + /// Insert a new arg into `ArgParser`'s `args`. + pub fn arg(mut self, arg: Arg<'a>) -> Self { + if arg.long.is_some() { + self.allow_list + .push(format!("{}{}", PREFIX_CHARS_LONG, arg.long.unwrap())); + } + if arg.short.is_some() { + self.allow_list + .push(format!("{}{}", PREFIX_CHARS_SHORT, arg.short.unwrap())); + } + self.args.insert(arg.name, arg); + self + } + + /// Starts the parsing process.This method gets all user provided arguments + /// from [`env::args_os`] in order to allow for invalid UTF-8 code points. 
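+    ///
+    /// # Examples
+    ///
+    /// An illustrative sketch ("name" is a made-up argument):
+    ///
+    /// ```no_run
+    /// # use util::arg_parser::{ArgParser, Arg};
+    /// let matches = ArgParser::new("StratoVirt")
+    ///     .arg(Arg::with_name("name").long("name").takes_value(true))
+    ///     .get_matches()
+    ///     .unwrap();
+    /// if let Some(name) = matches.value_of("name") {
+    ///     println!("name = {}", name);
+    /// }
+    /// ```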
+ pub fn get_matches(mut self) -> Result> { + let (arg_hash, multi_vec) = parse_cmdline(&self.allow_list)?; + + if arg_hash.contains_key(HELP_SHORT) || arg_hash.contains_key(HELP_LONG) { + self.output_help(); + process::exit(0); + } + + if arg_hash.contains_key(VERSION_SHORT) || arg_hash.contains_key(VERSION_LONG) { + self.show_version(); + process::exit(0); + } + + for arg in self.args.values_mut() { + (*arg).parse_from_hash(&arg_hash, &multi_vec)?; + } + + Ok(ArgMatches::new(self.args)) + } + + fn output_help(&self) { + let mut output_base: Vec = Vec::new(); + let mut output_flags: Vec = Vec::new(); + let mut output_options: Vec = Vec::new(); + + // help base output + output_base.push(format!("{} {}", self.name, self.version.unwrap_or(""))); + output_base.push(self.author.unwrap_or("").to_string()); + output_base.push(self.about.unwrap_or("").to_string()); + + // Default FLAGS + output_flags.push(format!( + "{}{}h, {}help Prints help information", + FOUR_BLANK, PREFIX_CHARS_SHORT, PREFIX_CHARS_LONG + )); + output_flags.push(format!( + "{}{}V, {}version Prints version information", + FOUR_BLANK, PREFIX_CHARS_SHORT, PREFIX_CHARS_LONG + )); + + // FLAGS and OPTIONS + for arg in self.args.values() { + let (help_str, help_type) = (*arg).help_message(); + match help_type { + HelpType::FLAGS => { + output_flags.push(help_str); + } + HelpType::OPTION => { + output_options.push(help_str); + } + HelpType::HIDDEN => {} + } + } + + // start output using stdout now + let stdout = std::io::stdout(); + let mut handle = std::io::BufWriter::new(stdout); + + // base output + for line in output_base { + writeln!(handle, "{}", line).unwrap(); + } + + // USAGE output + writeln!(handle, "USAGE:").unwrap(); + if output_options.is_empty() { + writeln!(handle, "{}{} [FLAGS]", FOUR_BLANK, get_name()).unwrap(); + } else { + writeln!(handle, "{}{} [FLAGS] [OPTIONS]", FOUR_BLANK, get_name()).unwrap(); + } + + // FLAGS output + writeln!(handle, "FLAGS:").unwrap(); + for line in output_flags { + writeln!(handle, "{}", line).unwrap(); + } + + // OPTIONS output + if !output_options.is_empty() { + writeln!(handle, "OPTIONS:").unwrap(); + for line in output_options { + writeln!(handle, "{}", line).unwrap(); + } + } + } + + fn show_version(&self) { + let stdout = std::io::stdout(); + let mut handle = std::io::BufWriter::new(stdout); + writeln!( + handle, + "{} {}", + self.name, + self.version.unwrap_or("Unknown") + ) + .unwrap(); + } +} + +impl<'a> Arg<'a> { + /// Create a new arg with arg's name. + pub fn with_name(name: &'a str) -> Self { + let mut arg = Arg::default(); + arg.name = name; + arg + } + + /// Set long argument for arg. + pub fn long(mut self, long: &'a str) -> Self { + self.long = Some(long); + self + } + + /// Set short argument for arg. + pub fn short(mut self, short: &'a str) -> Self { + self.short = Some(short); + self + } + + /// Set help message for arg. + pub fn help(mut self, help: &'a str) -> Self { + self.help = Some(help); + self + } + + /// Set hidden, it can hid help message for this argument. + pub fn hidden(mut self, hidden: bool) -> Self { + self.hiddable = hidden; + self + } + + /// Set multiple, it can allow use argument more than once. + pub fn multiple(mut self, multiple: bool) -> Self { + self.multiple = multiple; + self + } + + /// Set value_name for help message. + pub fn value_name(mut self, value_name: &'a str) -> Self { + self.value_name = Some(value_name); + self + } + + /// Set value kind for arguments. 
+ pub fn takes_value(mut self, switch: bool) -> Self { + if switch { + self.value = Some(Default::default()); + } + self + } + + /// Set value kind for arguments. + pub fn takes_values(mut self, switch: bool) -> Self { + if switch { + self.values = Some(Vec::new()); + } + self + } + + /// Set required for arguments. + pub fn required(mut self, required: bool) -> Self { + self.required = required; + self + } + + /// Set can no value for arguments. + pub fn can_no_value(mut self, can: bool) -> Self { + self.can_no_value = can; + self + } + + /// Set default value for a argument. + pub fn default_value(mut self, value: &'a str) -> Self { + match self.value { + Some(_) => self.value = Some(value.to_string()), + None => { + if self.values.is_some() { + let values: Vec = vec![value.to_string()]; + self.values = Some(values); + } + } + } + self.presented = true; + self + } + + /// Set possible values for argument. + pub fn possible_values(mut self, values: Vec<&'a str>) -> Self { + self.possible_values = Some(values); + self + } + + /// Parse argument from a hashset. + fn parse_from_hash(&mut self, arg_hash: &ArgsMap, multi_vec: &[String]) -> Result<()> { + let long_name = self.long.unwrap().to_string(); + + if arg_hash.contains_key(&long_name) { + if !self.multiple && multi_vec.contains(&long_name) { + return Err(ErrorKind::DuplicateArgument(long_name).into()); + } + + if self.value.is_some() && (arg_hash[&long_name].len() > 1) && !self.multiple { + return Err(ErrorKind::DuplicateValue(long_name).into()); + } + + if (self.value.is_some() || self.values.is_some()) && (arg_hash[&long_name].is_empty()) + { + if self.can_no_value { + self.value = Some(Default::default()); + self.presented = true; + return Ok(()); + } else { + return Err(ErrorKind::MissingValue(long_name).into()); + } + } + + if (self.value.is_none() && self.values.is_none()) && (!arg_hash[&long_name].is_empty()) + { + return Err(ErrorKind::IllegelValue( + arg_hash[&long_name][0].to_string(), + long_name.to_string(), + ) + .into()); + } + + if self.value.is_some() { + if self.possible_value_check(&arg_hash[&long_name][0]) { + self.value = Some(arg_hash[&long_name][0].clone()); + } else { + return Err(ErrorKind::ValueOutOfPossible( + long_name, + format!("{:?}", self.possible_values), + ) + .into()); + } + } else if self.values.is_some() { + if self.possible_values_check(arg_hash[&long_name].clone()) { + self.values = Some(arg_hash[&long_name].clone()); + } else { + return Err(ErrorKind::ValueOutOfPossible( + long_name, + format!("{:?}", self.possible_values), + ) + .into()); + } + } + + self.presented = true; + } else if self.required { + return Err(ErrorKind::MissingArgument(long_name).into()); + } + + if self.short.is_some() { + let short_name = self.short.unwrap(); + if arg_hash.contains_key(short_name) { + if (self.value.is_none() && self.values.is_none()) + && (!arg_hash[short_name].is_empty()) + { + return Err(ErrorKind::IllegelValue( + arg_hash[short_name][0].to_string(), + short_name.to_string(), + ) + .into()); + } + + self.presented = true; + } else if self.required { + return Err(ErrorKind::MissingArgument(short_name.to_string()).into()); + } + } + + Ok(()) + } + + /// Produce help message for argument. 
+ fn help_message(&self) -> (String, HelpType) { + if self.hiddable { + (String::new(), HelpType::HIDDEN) + } else if self.short.is_some() { + let font_str = format!( + "{}{}{}, {}{}", + FOUR_BLANK, + PREFIX_CHARS_SHORT, + self.short.unwrap(), + PREFIX_CHARS_LONG, + self.long.unwrap_or("") + ); + let mut help_str = format!("{}{}", TWENTY_FOUT_BLANK, self.help.unwrap_or("")); + let font_offset = font_str.len(); + help_str.replace_range(..font_offset, &font_str); + (help_str, HelpType::FLAGS) + } else { + let font_str = if self.values.is_some() { + format!( + "{}{}{} <{}>...", + EIGHT_BLANK, + PREFIX_CHARS_LONG, + self.long.unwrap(), + self.value_name.unwrap_or(self.name) + ) + } else { + format!( + "{}{}{} <{}>", + EIGHT_BLANK, + PREFIX_CHARS_LONG, + self.long.unwrap(), + self.value_name.unwrap_or(self.name) + ) + }; + let mut help_str = format!( + "{}{}{}{}", + TWENTY_FOUT_BLANK, + TWENTY_FOUT_BLANK, + TWENTY_FOUT_BLANK, + self.help.unwrap_or("") + ); + let font_offset = font_str.len(); + help_str.replace_range(..font_offset, &font_str); + (help_str, HelpType::OPTION) + } + } + + fn possible_value_check(&self, value: &'a str) -> bool { + if self.possible_values.is_some() { + self.possible_values.as_ref().unwrap().contains(&value) + } else { + true + } + } + + fn possible_values_check(&self, values: Vec) -> bool { + if self.possible_values.is_some() { + for value in values { + if !self.possible_value_check(&value) { + return false; + } + } + true + } else { + true + } + } +} + +impl<'a> ArgMatches<'a> { + fn new(args: BTreeMap<&'a str, Arg<'a>>) -> Self { + let mut arg_matches = ArgMatches::default(); + arg_matches.args = args; + arg_matches + } + + /// Get the single value for `arg`. + /// + /// # Arguments + /// + /// * `arg_name` - Name of `arg`. + pub fn value_of(&self, arg_name: &'a str) -> Option { + match self.args.get(arg_name) { + Some(arg) => { + if arg.presented { + arg.value.clone() + } else { + None + } + } + None => None, + } + } + + /// Get the all values for `arg`. + /// + /// # Arguments + /// + /// * `arg_name` - Name of `arg`. + pub fn values_of(&self, arg_name: &'a str) -> Option> { + match self.args.get(arg_name) { + Some(arg) => { + if arg.presented { + arg.values.clone() + } else { + None + } + } + None => None, + } + } + + /// Confirm whether the `arg` is given or not. + /// + /// # Arguments + /// + /// * `arg_name` - Name of `arg`. + pub fn is_present(&self, arg_name: &'a str) -> bool { + self.args[arg_name].presented + } +} + +#[allow(clippy::map_entry)] +fn parse_cmdline(allow_list: &[String]) -> Result<(ArgsMap, Vec)> { + let cmd_args: Vec = env::args().collect(); + let mut arg_map: BTreeMap> = BTreeMap::new(); + let mut multi_vec: Vec = Vec::new(); + + let mut i = (0, ""); + let mut j = 1; + for cmd_arg in &cmd_args[1..] 
{ + if !allow_list.contains(&cmd_arg) && cmd_arg.starts_with(PREFIX_CHARS_SHORT) { + return Err(ErrorKind::UnexpectedArguments(cmd_arg.to_string()).into()); + } + + if cmd_arg.starts_with(PREFIX_CHARS_LONG) { + let arg_str = split_arg(cmd_arg, PREFIX_CHARS_LONG); + if arg_map.contains_key(&arg_str) { + multi_vec.push(arg_str); + } else { + arg_map.insert(arg_str, Vec::new()); + } + + i = (j, PREFIX_CHARS_LONG); + } else if cmd_arg.starts_with(PREFIX_CHARS_SHORT) { + let arg_str = split_arg(cmd_arg, PREFIX_CHARS_SHORT); + if arg_map.contains_key(&arg_str) { + multi_vec.push(arg_str); + } else { + arg_map.insert(arg_str, Vec::new()); + } + i = (j, PREFIX_CHARS_SHORT); + } else { + let arg_str = match i.1 { + PREFIX_CHARS_LONG => split_arg(&cmd_args[i.0], PREFIX_CHARS_LONG), + &_ => { + return Err(ErrorKind::UnexpectedArguments(cmd_arg.to_string()).into()); + } + }; + arg_map + .get_mut(arg_str.as_str()) + .unwrap() + .push(cmd_arg.to_string()); + } + j += 1; + } + Ok((arg_map, multi_vec)) +} + +fn get_name() -> String { + let cmd_args: Vec = env::args().collect(); + let name_str: Vec<&str> = cmd_args[0].split('/').collect(); + (*name_str.last().unwrap()).to_string() +} + +fn split_arg(arg: &str, prefix_chars: &str) -> String { + let i = prefix_chars.len(); + String::from(&arg[i..]) +} diff --git a/util/src/byte_code.rs b/util/src/byte_code.rs new file mode 100644 index 00000000..9b1dd6f3 --- /dev/null +++ b/util/src/byte_code.rs @@ -0,0 +1,69 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use std::mem::size_of; +use std::slice::{from_raw_parts, from_raw_parts_mut}; + +/// A trait bound defined for types which are safe to convert to a byte slice and +/// to create from a byte slice. +pub trait ByteCode: Default + Copy + Send + Sync { + /// Return the contents of an object (impl trait `ByteCode`) as a slice of bytes. + /// the inverse of this function is "from_bytes" + fn as_bytes(&self) -> &[u8] { + unsafe { from_raw_parts(self as *const Self as *const u8, size_of::()) } + } + + /// Return the contents of a mutable object (impl trait `ByteCode`) to a mutable slice of bytes. + /// the inverse of this function is "from_bytes_mut" + fn as_mut_bytes(&mut self) -> &mut [u8] { + unsafe { from_raw_parts_mut(self as *mut Self as *mut u8, size_of::()) } + } + + /// Creates an object (impl trait `ByteCode`) from a slice of bytes + /// + /// # Arguments + /// + /// * `data` - the slice of bytes that will be constructed as an object. + fn from_bytes(data: &[u8]) -> Option<&Self> { + if data.len() != size_of::() { + return None; + } + let obj_array = unsafe { from_raw_parts::(data.as_ptr() as *const _, data.len()) }; + Some(&obj_array[0]) + } + + /// Creates an mutable object (impl trait `ByteCode`) from a mutable slice of bytes + /// + /// # Arguments + /// + /// * `data` - the slice of bytes that will be constructed as an mutable object. 
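+    ///
+    /// # Examples
+    ///
+    /// An illustrative round-trip using the integer impls provided below:
+    ///
+    /// ```ignore
+    /// let mut raw = [0_u8; 4];
+    /// *u32::from_mut_bytes(&mut raw).unwrap() = 0x1234_5678;
+    /// assert_eq!(u32::from_bytes(&raw), Some(&0x1234_5678_u32));
+    /// ```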
+ fn from_mut_bytes(data: &mut [u8]) -> Option<&mut Self> { + if data.len() != size_of::() { + return None; + } + let obj_array = + unsafe { from_raw_parts_mut::(data.as_mut_ptr() as *mut _, data.len()) }; + Some(&mut obj_array[0]) + } +} + +// Integer types of Rust satisfy the requirements of `trait ByteCode` +impl ByteCode for usize {} +impl ByteCode for u8 {} +impl ByteCode for u16 {} +impl ByteCode for u32 {} +impl ByteCode for u64 {} +impl ByteCode for isize {} +impl ByteCode for i8 {} +impl ByteCode for i16 {} +impl ByteCode for i32 {} +impl ByteCode for i64 {} diff --git a/util/src/checksum.rs b/util/src/checksum.rs new file mode 100644 index 00000000..3755c157 --- /dev/null +++ b/util/src/checksum.rs @@ -0,0 +1,35 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use super::byte_code::ByteCode; + +pub fn checksum(slice: &[u8]) -> u8 { + let mut sum: u32 = 0; + + for byte in slice.iter() { + sum += u32::from(*byte); + sum &= 0xff; + } + + (sum & 0xff) as u8 +} + +pub fn obj_checksum(t: &T) -> u8 { + let mut sum: u32 = 0; + + for byte in t.as_bytes().iter() { + sum += u32::from(*byte); + sum &= 0xff; + } + + (sum & 0xff) as u8 +} diff --git a/util/src/daemonize.rs b/util/src/daemonize.rs new file mode 100644 index 00000000..dbb46b1c --- /dev/null +++ b/util/src/daemonize.rs @@ -0,0 +1,176 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +#![deny(missing_docs)] +//! This file implements a high level wrapper for daemonize +//! +//! # Daemonize Introduction +//! +//! [daemonize](https://linux.die.net/man/1/daemonize) to runs a command as a +//! Unix daemonize. A daemon is a process that executes in the background either +//! waiting for some event to occur, or waiting to perform some specified task +//! on a periodic basis. A typical daemon program will: +//! 1. Close all open file descriptors(especially standard input, standard +//! output and standard error). +//! 2. Change its working directory to the root filesystem, to ensure that it +//! doesn't tie up another filesystem and prevent it from being unmounted. +//! 3. Reset its umask value. +//! 4. Run in the background(i.e., fork). +//! 5. Ignore all terminal I/O signals. +//! 6. Disassociate from the control terminal. +//! 7. Disassociate from its process group, to insulate itself from signals +//! sent to the process group. +//! 8. Handle any `SIGCLD` signals. 
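+//!
+//! A minimal usage sketch (the pid file path below is an example, not a
+//! project default):
+//!
+//! ```ignore
+//! // Fork twice, detach from the controlling terminal, redirect stdio to
+//! // /dev/null and record our PID.
+//! daemonize(Some("/tmp/stratovirt.pid".to_string())).expect("daemonize failed");
+//! ```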
+ +extern crate libc; + +use std::cmp::Ordering; +use std::fs::{File, OpenOptions}; +use std::io::prelude::*; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::RawFd; +use std::path::Path; +use std::process::exit; + +use crate::errors::{ErrorKind, Result}; + +/// Write process id to pid file. +fn create_pid_file(path: &str) -> Result<()> { + let pid: u32 = std::process::id(); + + if Path::new(path).exists() { + return Err(ErrorKind::PidFileExist.into()); + } + + let mut pid_file: File = OpenOptions::new() + .write(true) + .create(true) + .mode(0o600) + .open(path)?; + write!(pid_file, "{}", pid)?; + + Ok(()) +} + +/// [fork(2)](https://man7.org/linux/man-pages/man2/fork.2.html) +/// fork() creates a new process by duplicating the calling process. The new +/// process is referred to as the child process. The calling process is referred +/// to as the parent process. +/// **libc::fork()** may have three kinds ret: +/// if ret > 0 : current process is parent process, it's not expected, so exit +/// if ret < 0 : error occurred in fork() +/// if ret = 0 : current process is child process, it's expected +/// +/// # Errors +/// +/// `DaemonFork` Error, the ret of `libc::fork()` is less than zero. +fn fork() -> Result<()> { + let ret = unsafe { libc::fork() }; + + match ret.cmp(&0) { + Ordering::Less => Err(ErrorKind::DaemonFork.into()), + Ordering::Greater => exit(0), + Ordering::Equal => Ok(()), + } +} + +/// [setsid(2)](https://man7.org/linux/man-pages/man2/setsid.2.html) +/// setsid() creates a new session if the calling process is not a process group +/// leader. The calling process is the leader of the new session. The calling +/// process also becomes the process group leader or a new process group in the +/// session. +/// The calling process will be the only process in the new process group and in +/// the new session. New session has no controlling termimal. +/// +/// # Errors +/// +/// `DaemonSetsid` Error, the ret of `libc::setsid()` is -1 +fn set_sid() -> Result<()> { + let ret = unsafe { libc::setsid() }; + + if ret == -1 { + Err(ErrorKind::DaemonSetsid.into()) + } else { + Ok(()) + } +} + +/// Redirect stdio to `/dev/null`. +/// +/// Use [dup(2)](https://man7.org/linux/man-pages/man2/dup.2.html) +/// dup2(oldfd, newfd) creates a copy of the file descriptor `oldfd`, uses the +/// file descriptor number specified in `newfd`. If the file descriptor `newfd` +/// was previously open, it is silently closed before being reused. This +/// function use `dup2` to redirect file descriptor use to `/dev/null`. +/// +/// # Errors +/// +/// `DaemonRedirectStdio` Error, the ret of `libc::open()`, `libc::dup2()`, +/// `libc::close()`is -1 +fn redirect_stdio(fd: RawFd) -> Result<()> { + unsafe { + let devnull_fd = libc::open(b"/dev/null\0" as *const [u8; 10] as _, libc::O_RDWR); + + if devnull_fd == -1 { + return Err(ErrorKind::DaemonRedirectStdio.into()); + } + + if libc::dup2(devnull_fd, fd) == -1 { + return Err(ErrorKind::DaemonRedirectStdio.into()); + } + + if libc::close(devnull_fd) == -1 { + return Err(ErrorKind::DaemonRedirectStdio.into()); + } + } + + Ok(()) +} + +/// Daemonize a process. +/// +/// # Arguments +/// +/// * `pid_file` - Path where will create pid file. +/// +/// # Notes +/// This function do five things to daemonize a process: +/// 1. Reset its umask value. +/// 2. Run in the background use fork. +/// 3. Ignore all terminal I/O signals. +/// 4. Disassociate from the control terminal. +/// 5. Write pid to pidfile. 
+pub fn daemonize(pid_file: Option) -> Result<()> { + // The first fork make parent process quit, child process inherit parent's + // session ID and have a new process ID. It can guarantee child + // process will not be the first process in a session. + fork()?; + // Create a new session for process. Now parent process quit will not + // influence stratovirt process. But stratovirt becomes the first process in + // new section. + set_sid()?; + // The second fork make stratovirt run as daemonize process. It won't be the + // first process in this session and never get terminal control. + fork()?; + // Redirect stdio to `/dev/null`. + redirect_stdio(libc::STDIN_FILENO)?; + redirect_stdio(libc::STDOUT_FILENO)?; + redirect_stdio(libc::STDERR_FILENO)?; + + // Now can record PID to file. It won't be changed again in stratovirt's + // lifetime. + if let Some(path) = pid_file { + create_pid_file(&path)?; + } + + Ok(()) +} diff --git a/util/src/device_tree.rs b/util/src/device_tree.rs new file mode 100644 index 00000000..59f96423 --- /dev/null +++ b/util/src/device_tree.rs @@ -0,0 +1,219 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use super::errors::Result; +use libc::{c_char, c_int, c_void}; +use std::ffi::CString; + +pub const CLK_PHANDLE: u32 = 1; +pub const GIC_PHANDLE: u32 = 2; +pub const GIC_ITS_PHANDLE: u32 = 3; +pub const CPU_PHANDLE_START: u32 = 10; + +pub const GIC_FDT_IRQ_TYPE_SPI: u32 = 0; +pub const GIC_FDT_IRQ_TYPE_PPI: u32 = 1; +pub const IRQ_TYPE_EDGE_RISING: u32 = 1; +pub const IRQ_TYPE_LEVEL_HIGH: u32 = 4; + +pub const FDT_MAX_SIZE: u32 = 0x1_0000; + +extern "C" { + fn fdt_create(buf: *mut c_void, bufsize: c_int) -> c_int; + fn fdt_finish_reservemap(fdt: *mut c_void) -> c_int; + fn fdt_begin_node(fdt: *mut c_void, name: *const c_char) -> c_int; + fn fdt_end_node(fdt: *mut c_void) -> c_int; + fn fdt_finish(fdt: *const c_void) -> c_int; + fn fdt_open_into(fdt: *const c_void, buf: *mut c_void, size: c_int) -> c_int; + + fn fdt_path_offset(fdt: *const c_void, path: *const c_char) -> c_int; + fn fdt_add_subnode(fdt: *mut c_void, offset: c_int, name: *const c_char) -> c_int; + fn fdt_setprop( + fdt: *mut c_void, + offset: c_int, + name: *const c_char, + val: *const c_void, + len: c_int, + ) -> c_int; +} + +pub fn create_device_tree(fdt: &mut Vec) -> Result<()> { + let mut ret = unsafe { fdt_create(fdt.as_mut_ptr() as *mut c_void, FDT_MAX_SIZE as c_int) }; + if ret < 0 { + bail!("Failed to fdt_create, return {}.", ret); + } + + ret = unsafe { fdt_finish_reservemap(fdt.as_mut_ptr() as *mut c_void) }; + if ret < 0 { + bail!("Failed to fdt_finish_reservemap, return {}.", ret); + } + + let c_str = CString::new("").unwrap(); + ret = unsafe { fdt_begin_node(fdt.as_mut_ptr() as *mut c_void, c_str.as_ptr()) }; + if ret < 0 { + bail!("Failed to fdt_begin_node, return {}.", ret); + } + + ret = unsafe { fdt_end_node(fdt.as_mut_ptr() as *mut c_void) }; + if ret < 0 { + bail!("Failed to fdt_end_node, return {}.", ret); + } + + ret = unsafe { fdt_finish(fdt.as_mut_ptr() as *mut c_void) 
}; + if ret < 0 { + bail!("Failed to fdt_finish, return {}.", ret); + } + + ret = unsafe { + fdt_open_into( + fdt.as_ptr() as *mut c_void, + fdt.as_mut_ptr() as *mut c_void, + FDT_MAX_SIZE as c_int, + ) + }; + if ret < 0 { + bail!("Failed to fdt_open_into, return {}.", ret); + } + + Ok(()) +} + +pub fn add_sub_node(fdt: &mut Vec, node_path: &str) -> Result<()> { + let names: Vec<&str> = node_path.split('/').collect(); + if names.len() < 2 { + bail!("Failed to add sub node, node_path: {} invalid.", node_path); + } + + let node_name = names[names.len() - 1]; + let pare_name = names[0..names.len() - 1].join("/"); + + let c_str = if pare_name.is_empty() { + CString::new("/").unwrap() + } else { + CString::new(pare_name).unwrap() + }; + + let offset = unsafe { fdt_path_offset(fdt.as_ptr() as *const c_void, c_str.as_ptr()) }; + if offset < 0 { + bail!("Failed to fdt_path_offset, return {}.", offset); + } + + let c_str = CString::new(node_name).unwrap(); + let ret = unsafe { fdt_add_subnode(fdt.as_mut_ptr() as *mut c_void, offset, c_str.as_ptr()) }; + if ret < 0 { + bail!("Failed to fdt_add_subnode, return {}.", ret); + } + + Ok(()) +} + +pub fn set_property( + fdt: &mut Vec, + node_path: &str, + prop: &str, + val: Option<&[u8]>, +) -> Result<()> { + let c_str = CString::new(node_path).unwrap(); + let offset = unsafe { fdt_path_offset(fdt.as_ptr() as *const c_void, c_str.as_ptr()) }; + if offset < 0 { + bail!("Failed to fdt_path_offset, return {}.", offset); + } + + let (ptr, len) = if let Some(val) = val { + (val.as_ptr() as *const c_void, val.len() as i32) + } else { + (std::ptr::null::(), 0) + }; + + let c_str = CString::new(prop).unwrap(); + let ret = unsafe { + fdt_setprop( + fdt.as_mut_ptr() as *mut c_void, + offset, + c_str.as_ptr(), + ptr, + len, + ) + }; + if ret < 0 { + bail!("Failed to fdt_setprop, return {}.", ret); + } + + Ok(()) +} + +pub fn set_property_string( + fdt: &mut Vec, + node_path: &str, + prop: &str, + val: &str, +) -> Result<()> { + set_property( + fdt, + node_path, + prop, + Some(&([val.as_bytes(), &[0_u8]].concat())), + ) +} + +pub fn set_property_u32(fdt: &mut Vec, node_path: &str, prop: &str, val: u32) -> Result<()> { + set_property(fdt, node_path, prop, Some(&val.to_be_bytes())) +} + +pub fn set_property_u64(fdt: &mut Vec, node_path: &str, prop: &str, val: u64) -> Result<()> { + set_property(fdt, node_path, prop, Some(&val.to_be_bytes())) +} + +pub fn set_property_array_u32( + fdt: &mut Vec, + node_path: &str, + prop: &str, + array: &[u32], +) -> Result<()> { + let mut bytes: Vec = Vec::new(); + for &val in array { + bytes.append(&mut val.to_be_bytes().to_vec()); + } + set_property(fdt, node_path, prop, Some(&bytes)) +} + +pub fn set_property_array_u64( + fdt: &mut Vec, + node_path: &str, + prop: &str, + array: &[u64], +) -> Result<()> { + let mut bytes: Vec = Vec::new(); + for &val in array { + bytes.append(&mut val.to_be_bytes().to_vec()); + } + set_property(fdt, node_path, prop, Some(&bytes)) +} + +pub fn dump_dtb(fdt: &[u8], file_path: &str) { + use std::fs::File; + use std::io::Write; + + let mut f = File::create(file_path).unwrap(); + for i in fdt.iter() { + f.write_all(&[*i]).expect("Unable to write data"); + } +} + +/// Trait for devices to be added to the Flattened Device Tree. +pub trait CompileFDT { + /// function to generate fdt node + /// + /// # Arguments + /// + /// * `fdt` - the fdt slice to be expended. 
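+    ///
+    /// # Examples
+    ///
+    /// An illustrative sketch; the node path, "compatible" string and
+    /// interrupt cells are made-up values:
+    ///
+    /// ```ignore
+    /// fn generate_fdt_node(&self, fdt: &mut Vec<u8>) -> Result<()> {
+    ///     add_sub_node(fdt, "/mydev")?;
+    ///     set_property_string(fdt, "/mydev", "compatible", "vendor,mydev")?;
+    ///     set_property_array_u32(
+    ///         fdt,
+    ///         "/mydev",
+    ///         "interrupts",
+    ///         &[GIC_FDT_IRQ_TYPE_SPI, 33, IRQ_TYPE_LEVEL_HIGH],
+    ///     )?;
+    ///     Ok(())
+    /// }
+    /// ```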
+ fn generate_fdt_node(&self, fdt: &mut Vec) -> Result<()>; +} diff --git a/util/src/epoll_context.rs b/util/src/epoll_context.rs new file mode 100644 index 00000000..e1f8e654 --- /dev/null +++ b/util/src/epoll_context.rs @@ -0,0 +1,576 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate vmm_sys_util; + +use std::collections::BTreeMap; +use std::os::unix::io::RawFd; +use std::sync::{Arc, Mutex, RwLock}; + +use libc::{c_void, read}; +use vmm_sys_util::epoll::{ControlOperation, Epoll, EpollEvent, EventSet}; + +use crate::errors::{ErrorKind, Result}; + +const READY_EVENT_MAX: usize = 256; + +#[derive(Debug)] +pub enum NotifierOperation { + /// Add a file descriptor to the event table, and bind a notifier to + /// it, when some event happened on it, notice the only one notifiers. + AddExclusion = 1, + /// Try to add a notifier to a file descriptor, when some event + /// also notice me, the file descriptor must be read. + AddShared = 2, + /// Change the settings associated with a file descriptor. + Modify = 4, + /// Delete a file descriptor from the event table, if has one more notifiers, + /// file descriptor not closed. + Delete = 8, +} + +enum EventStatus { + /// Event is currently monitored in epoll. + Alive = 0, + /// Event is parked, temporarily not monitored. + Parked = 1, + /// Event is removed. + Removed = 2, +} +pub type NotifierCallback = dyn Fn(EventSet, RawFd) -> Option>; +/// Epoll Event Notifier Entry. +pub struct EventNotifier { + /// Raw file descriptor + pub raw_fd: i32, + /// Notifier operation + pub op: NotifierOperation, + /// Parked fd, temporarily removed from epoll + pub parked_fd: Option, + /// The types of events for which we use this fd + pub event: EventSet, + /// Event Handler List, one fd event may have many handlers + pub handlers: Vec>>>, + /// Event status + status: EventStatus, +} + +impl EventNotifier { + /// Constructs a new `EventNotifier`. + pub fn new( + op: NotifierOperation, + raw_fd: i32, + parked_fd: Option, + event: EventSet, + handlers: Vec>>>, + ) -> Self { + EventNotifier { + raw_fd, + op, + parked_fd, + event, + handlers, + status: EventStatus::Alive, + } + } +} + +/// `EventNotifier` Factory +/// +/// When an object have some `EventNotifier` wants +/// to add to main loop, the object need to implement +/// `InternalNotifiers` trait, so `MainLoop` would be +/// easy to get notifiers, and add to epoll context. +pub trait EventNotifierHelper { + fn internal_notifiers(_: Arc>) -> Vec; +} + +/// MainLoop manager, advise continue running or stop running +pub trait MainLoopManager { + fn main_loop_should_exit(&self) -> bool; + fn main_loop_cleanup(&self) -> Result<()>; +} + +/// Main Epoll Loop Context +#[allow(clippy::vec_box)] +pub struct MainLoopContext { + /// Epoll file descriptor. + epoll: Epoll, + /// Control epoll loop running. + manager: Option>, + /// Fds registered to the `MainLoop`. + events: Arc>>>, + /// Events abandoned are stored in garbage collector. + gc: Arc>>>, + /// Temp events vector, store wait returned events. 
+ ready_events: Vec, +} + +impl MainLoopContext { + /// Constructs a new `MainLoopContext`. + pub fn new() -> Self { + MainLoopContext { + epoll: Epoll::new().unwrap(), + manager: None, + events: Arc::new(RwLock::new(BTreeMap::new())), + gc: Arc::new(RwLock::new(Vec::new())), + ready_events: vec![EpollEvent::default(); READY_EVENT_MAX], + } + } + + pub fn set_manager(&mut self, manager: Arc) { + self.manager = Some(manager); + } + + fn clear_gc(&mut self) { + let mut gc = self.gc.write().unwrap(); + gc.clear(); + } + + fn add_event(&mut self, event: EventNotifier) -> Result<()> { + // If there is one same alive event monitored, update the handlers. + // If there is one same parked event, update the handlers but warn. + // If there is no event in the map, insert the event and park the related. + let mut events_map = self.events.write().unwrap(); + if let Some(notifier) = events_map.get_mut(&event.raw_fd) { + if let NotifierOperation::AddExclusion = event.op { + return Err(ErrorKind::BadNotifierOperation.into()); + } + + let mut event = event; + notifier.handlers.append(&mut event.handlers); + if let EventStatus::Parked = notifier.status { + warn!("Parked event updated!"); + } + return Ok(()); + } + + let raw_fd = event.raw_fd; + events_map.insert(raw_fd, Box::new(event)); + let event = events_map.get(&raw_fd).unwrap(); + self.epoll.ctl( + ControlOperation::Add, + event.raw_fd, + EpollEvent::new(event.event, &**event as *const _ as u64), + )?; + + if let Some(parked_fd) = event.parked_fd { + if let Some(parked) = events_map.get_mut(&parked_fd) { + self.epoll + .ctl(ControlOperation::Delete, parked_fd, EpollEvent::default())?; + parked.status = EventStatus::Parked; + } else { + return Err(ErrorKind::NoParkedFd(parked_fd).into()); + } + } + + Ok(()) + } + + fn rm_event(&mut self, event: &EventNotifier) -> Result<()> { + // If there is one same parked event, return Error. + // If there is no event in the map, return Error. + // If there is one same alive event monitored, put the event in gc and reactivate the parked event. + let mut events_map = self.events.write().unwrap(); + match events_map.get_mut(&event.raw_fd) { + Some(notifier) => { + if let EventStatus::Parked = notifier.status { + return Err(ErrorKind::RemoveParked(event.raw_fd).into()); + } + + if let Err(error) = self.epoll.ctl( + ControlOperation::Delete, + notifier.raw_fd, + EpollEvent::default(), + ) { + let error_num = error.raw_os_error().unwrap(); + if error_num != libc::EBADF && error_num != libc::ENOENT { + return Err(ErrorKind::BadSyscall(error).into()); + } + } + notifier.status = EventStatus::Removed; + + if let Some(parked_fd) = notifier.parked_fd { + if let Some(parked) = events_map.get_mut(&parked_fd) { + self.epoll.ctl( + ControlOperation::Add, + parked_fd, + EpollEvent::new(parked.event, &**parked as *const _ as u64), + )?; + parked.status = EventStatus::Alive; + } else { + return Err(ErrorKind::NoParkedFd(parked_fd).into()); + } + } + + let event = events_map.remove(&event.raw_fd).unwrap(); + self.gc.write().unwrap().push(event); + } + None => { + return Err(ErrorKind::NoRegisterFd(event.raw_fd).into()); + } + } + + Ok(()) + } + + /// update fds registered to `MainLoop` according to the operation type. + /// + /// # Arguments + /// + /// * `notifiers` - event notifiers wanted to add to or remove from `MainLoop`. 
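+    ///
+    /// # Examples
+    ///
+    /// An illustrative sketch; `ctx` is a `MainLoopContext` and `fd` an
+    /// already opened file descriptor:
+    ///
+    /// ```ignore
+    /// // Watch `fd` for readability; the callback may return follow-up
+    /// // notifiers to register, or `None`.
+    /// let handler: Box<NotifierCallback> = Box::new(|_event, _fd| None);
+    /// let notifier = EventNotifier::new(
+    ///     NotifierOperation::AddShared,
+    ///     fd,
+    ///     None,
+    ///     EventSet::IN,
+    ///     vec![Arc::new(Mutex::new(handler))],
+    /// );
+    /// ctx.update_events(vec![notifier])?;
+    /// ```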
+ pub fn update_events(&mut self, notifiers: Vec) -> Result<()> { + for en in notifiers { + match en.op { + NotifierOperation::AddExclusion | NotifierOperation::AddShared => { + self.add_event(en)?; + } + NotifierOperation::Delete => { + self.rm_event(&en)?; + } + _ => { + return Err(ErrorKind::UnExpectedOperationType.into()); + } + } + } + + Ok(()) + } + + /// Executes `epoll.wait()` to wait for events, and call the responding callbacks. + pub fn run(&mut self) -> Result { + match &self.manager { + Some(manager) => { + if manager.main_loop_should_exit() { + manager.main_loop_cleanup()?; + return Ok(false); + } + } + None => {} + } + + let ev_count = match self + .epoll + .wait(READY_EVENT_MAX, -1, &mut self.ready_events[..]) + { + Ok(ev_count) => ev_count, + Err(e) if e.raw_os_error() == Some(libc::EINTR) => 0, + Err(e) => return Err(ErrorKind::EpollWait(e).into()), + }; + + for i in 0..ev_count { + // It`s safe because elements in self.events_map never get released in other functions + let event = unsafe { + let event_ptr = self.ready_events[i].data() as *const EventNotifier; + &*event_ptr as &EventNotifier + }; + if let EventStatus::Alive = event.status { + let mut notifiers = Vec::new(); + for i in 0..event.handlers.len() { + let handle = event.handlers[i].lock().unwrap(); + match handle(self.ready_events[i].event_set(), event.raw_fd) { + None => {} + Some(mut notifier) => { + notifiers.append(&mut notifier); + } + } + } + self.update_events(notifiers)?; + } + } + + self.clear_gc(); + + Ok(true) + } +} + +impl Default for MainLoopContext { + fn default() -> Self { + Self::new() + } +} + +pub fn read_fd(fd: RawFd) -> u64 { + let mut value: u64 = 0; + + let ret = unsafe { + read( + fd, + &mut value as *mut u64 as *mut c_void, + std::mem::size_of::(), + ) + }; + + if ret == -1 { + error!("Failed to read fd"); + } + + value +} + +#[cfg(test)] +mod test { + use super::*; + use libc::*; + use std::os::unix::io::{AsRawFd, RawFd}; + use vmm_sys_util::{epoll::EventSet, eventfd::EventFd}; + + impl MainLoopContext { + fn check_existence(&self, fd: RawFd) -> Option { + let events_map = self.events.read().unwrap(); + match events_map.get(&fd) { + None => { + return None; + } + Some(notifier) => { + if let EventStatus::Alive = notifier.status { + Some(true) + } else { + Some(false) + } + } + } + } + + fn create_event(&mut self) -> i32 { + let fd = EventFd::new(EFD_NONBLOCK).unwrap(); + let result = fd.as_raw_fd(); + let event = EventNotifier::new( + NotifierOperation::AddShared, + fd.as_raw_fd(), + None, + EventSet::OUT, + Vec::new(), + ); + self.update_events(vec![event]).unwrap(); + result + } + } + + fn generate_handler(related_fd: i32) -> Box { + Box::new(move |_, _| { + let mut notifiers = Vec::new(); + let event = EventNotifier::new( + NotifierOperation::AddShared, + related_fd, + None, + EventSet::IN, + Vec::new(), + ); + notifiers.push(event); + Some(notifiers) + }) + } + + #[test] + fn basic_test() { + let mut mainloop = MainLoopContext::new(); + let mut notifiers = Vec::new(); + let fd1 = EventFd::new(EFD_NONBLOCK).unwrap(); + let fd1_related = EventFd::new(EFD_NONBLOCK).unwrap(); + + let handler1 = generate_handler(fd1_related.as_raw_fd()); + let mut handlers = Vec::new(); + handlers.push(Arc::new(Mutex::new(handler1))); + let event1 = EventNotifier::new( + NotifierOperation::AddShared, + fd1.as_raw_fd(), + None, + EventSet::OUT, + handlers.clone(), + ); + + notifiers.push(event1); + mainloop.update_events(notifiers).unwrap(); + mainloop.run().unwrap(); + // Event1 is OUT event, so its 
handler would be executed immediately. + // Event1's handler is to add a fd1_related event, thus checking fd1 and fd1_relate would + // make a basic function test. + assert!(mainloop.check_existence(fd1.as_raw_fd()).unwrap()); + assert!(mainloop.check_existence(fd1_related.as_raw_fd()).unwrap()); + } + + #[test] + fn parked_event_test() { + let mut mainloop = MainLoopContext::new(); + let mut notifiers = Vec::new(); + let fd1 = EventFd::new(EFD_NONBLOCK).unwrap(); + let fd2 = EventFd::new(EFD_NONBLOCK).unwrap(); + + let event1 = EventNotifier::new( + NotifierOperation::AddShared, + fd1.as_raw_fd(), + None, + EventSet::OUT, + Vec::new(), + ); + let event2 = EventNotifier::new( + NotifierOperation::AddShared, + fd2.as_raw_fd(), + Some(fd1.as_raw_fd()), + EventSet::OUT, + Vec::new(), + ); + + notifiers.push(event1); + notifiers.push(event2); + mainloop.update_events(notifiers).unwrap(); + mainloop.run().unwrap(); + + // For the reason that event1 is the parked event of event2, when event2 added, event1 would + // be set to parked. + assert!(!mainloop.check_existence(fd1.as_raw_fd()).unwrap()); + assert!(mainloop.check_existence(fd2.as_raw_fd()).unwrap()); + + let event2_remove = EventNotifier::new( + NotifierOperation::Delete, + fd2.as_raw_fd(), + Some(fd1.as_raw_fd()), + EventSet::OUT, + Vec::new(), + ); + mainloop.update_events(vec![event2_remove]).unwrap(); + + // Then we remove event2, event1 will be re-activated and event2 will be deleted (removed + // from events_map to gc). + assert!(mainloop.check_existence(fd1.as_raw_fd()).unwrap()); + assert!(mainloop.check_existence(fd2.as_raw_fd()).is_none()); + } + + #[test] + fn event_handler_test() { + let mut mainloop = MainLoopContext::new(); + let mut notifiers = Vec::new(); + let fd1 = EventFd::new(EFD_NONBLOCK).unwrap(); + let fd1_related = EventFd::new(EFD_NONBLOCK).unwrap(); + let fd1_related_update = EventFd::new(EFD_NONBLOCK).unwrap(); + + let handler1 = generate_handler(fd1_related.as_raw_fd()); + let handler1_update = generate_handler(fd1_related_update.as_raw_fd()); + let event1 = EventNotifier::new( + NotifierOperation::AddShared, + fd1.as_raw_fd(), + None, + EventSet::OUT, + vec![Arc::new(Mutex::new(handler1))], + ); + + let event1_update = EventNotifier::new( + NotifierOperation::AddShared, + fd1.as_raw_fd(), + None, + EventSet::OUT, + vec![Arc::new(Mutex::new(handler1_update))], + ); + + notifiers.push(event1); + notifiers.push(event1_update); + mainloop.update_events(notifiers).unwrap(); + mainloop.run().unwrap(); + + // Firstly, event1 with handler1 would be added. Then, event1's handlers would append + // handler1_update, which would register fd1_related_update in mainloop. 
+ assert!(mainloop.check_existence(fd1_related.as_raw_fd()).unwrap()); + assert!(mainloop + .check_existence(fd1_related_update.as_raw_fd()) + .unwrap()); + } + + #[test] + fn error_operation_test() { + let mut mainloop = MainLoopContext::new(); + let fd1 = EventFd::new(EFD_NONBLOCK).unwrap(); + let leisure_fd = EventFd::new(EFD_NONBLOCK).unwrap(); + + // Delete unexist event + let event1 = EventNotifier::new( + NotifierOperation::Delete, + fd1.as_raw_fd(), + None, + EventSet::OUT, + Vec::new(), + ); + assert!(mainloop.update_events(vec![event1]).is_err()); + + // Add event with unexist parked event + let event1 = EventNotifier::new( + NotifierOperation::AddShared, + fd1.as_raw_fd(), + Some(leisure_fd.as_raw_fd()), + EventSet::OUT, + Vec::new(), + ); + assert!(mainloop.update_events(vec![event1]).is_err()); + + // Delete event with unexist parked event + let event1_delete = EventNotifier::new( + NotifierOperation::Delete, + fd1.as_raw_fd(), + Some(leisure_fd.as_raw_fd()), + EventSet::OUT, + Vec::new(), + ); + assert!(mainloop.update_events(vec![event1_delete]).is_err()); + } + + #[test] + fn error_parked_operation_test() { + let mut mainloop = MainLoopContext::new(); + let fd1 = EventFd::new(EFD_NONBLOCK).unwrap(); + let fd2 = EventFd::new(EFD_NONBLOCK).unwrap(); + + let event1 = EventNotifier::new( + NotifierOperation::AddShared, + fd1.as_raw_fd(), + None, + EventSet::OUT, + Vec::new(), + ); + mainloop.update_events(vec![event1]).unwrap(); + + let event2 = EventNotifier::new( + NotifierOperation::AddShared, + fd2.as_raw_fd(), + Some(fd1.as_raw_fd()), + EventSet::OUT, + Vec::new(), + ); + mainloop.update_events(vec![event2]).unwrap(); + + // Delete parked event + let event1 = EventNotifier::new( + NotifierOperation::Delete, + fd1.as_raw_fd(), + None, + EventSet::OUT, + Vec::new(), + ); + assert!(mainloop.update_events(vec![event1]).is_err()); + } + + #[test] + fn fd_released_test() { + let mut mainloop = MainLoopContext::new(); + let fd = mainloop.create_event(); + + // In this case, fd is already closed. But program was wrote to ignore the error. + let event = EventNotifier::new( + NotifierOperation::Delete, + fd, + None, + EventSet::OUT, + Vec::new(), + ); + + assert!(mainloop.update_events(vec![event]).is_ok()); + } +} diff --git a/util/src/kvm_ioctls_ext.rs b/util/src/kvm_ioctls_ext.rs new file mode 100644 index 00000000..b9d63fd7 --- /dev/null +++ b/util/src/kvm_ioctls_ext.rs @@ -0,0 +1,56 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +use kvm_bindings::{kvm_device_attr, KVMIO}; +use kvm_ioctls::DeviceFd; +use vmm_sys_util::errno; +use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref}; + +pub type Result = std::result::Result; + +/// Gets a specified piece of device configuration and/or state. +/// +/// See the documentation for `KVM_GET_DEVICE_ATTR`. +/// +/// # Arguments +/// +/// * `device_attr` - The device attribute to be read. 
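+///
+/// # Examples
+///
+/// An illustrative sketch; `device_fd` is a `DeviceFd` obtained via
+/// `KVM_CREATE_DEVICE`, and the `group`/`attr` values are placeholders for a
+/// device-specific attribute:
+///
+/// ```ignore
+/// let mut val: u64 = 0;
+/// let mut attr = kvm_device_attr {
+///     group: 0,
+///     attr: 0,
+///     addr: &mut val as *mut u64 as u64,
+///     flags: 0,
+/// };
+/// get_device_attr(&device_fd, &mut attr)?;
+/// ```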
+pub fn get_device_attr(device_fd: &DeviceFd, device_attr: &mut kvm_device_attr) -> Result<()> { + let ret = unsafe { + // Here we trust the kernel not to read past the end of the kvm_device_attr struct. + ioctl_with_mut_ref(device_fd, KVM_GET_DEVICE_ATTR(), device_attr) + }; + if ret != 0 { + return Err(errno::Error::last()); + } + Ok(()) +} + +/// Check a specified piece of device feature. +/// +/// See the documentation for `KVM_HAS_DEVICE_ATTR`. +/// # Arguments +/// +/// * `device_attr` - The device attribute to be check. +pub fn check_device_attr(device_fd: &DeviceFd, device_attr: &kvm_device_attr) -> Result { + let ret = unsafe { + // Here we trust the kernel not to read past the end of the kvm_device_attr struct. + ioctl_with_ref(device_fd, KVM_HAS_DEVICE_ATTR(), device_attr) + }; + if ret < 0 { + return Err(errno::Error::last()); + } + Ok(ret as u32) +} + +ioctl_iow_nr!(KVM_GET_DEVICE_ATTR, KVMIO, 0xe2, kvm_device_attr); +ioctl_iow_nr!(KVM_HAS_DEVICE_ATTR, KVMIO, 0xe3, kvm_device_attr); diff --git a/util/src/lib.rs b/util/src/lib.rs new file mode 100644 index 00000000..553fee84 --- /dev/null +++ b/util/src/lib.rs @@ -0,0 +1,130 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate libc; +#[macro_use] +extern crate error_chain; +#[macro_use] +extern crate vmm_sys_util; +#[macro_use] +extern crate log; +extern crate kvm_bindings; +extern crate kvm_ioctls; + +pub mod aio; +pub mod arg_parser; +pub mod byte_code; +pub mod checksum; +pub mod daemonize; +pub mod device_tree; +pub mod epoll_context; +pub mod kvm_ioctls_ext; +mod link_list; +pub mod num_ops; +pub mod seccomp; +pub mod tap; +pub mod unix; +#[macro_use] +pub mod logger; +#[macro_use] +pub mod offsetof; + +pub mod errors { + error_chain! 
{ + foreign_links { + KvmIoctl(kvm_ioctls::Error); + Io(std::io::Error); + Nul(std::ffi::NulError); + } + errors { + // arg_parser submodule error + MissingArgument(t: String) { + description("The required argument was not provided.") + display("Argument '{}' required, but not found.", t) + } + MissingValue(t: String) { + description("A value for args was not provided.") + display("The argument '{}' requires a value, but none was supplied.", t) + } + IllegelValue(t1: String, t2: String) { + description("A value is illegel for args.") + display("The value '{}' is illegel for argument '{}'.", t1, t2) + } + ValueOutOfPossible(t1: String, t2: String) { + description("A value for args is out of possile values.") + display("The value of argument '{}' must be in '{}'.", t1, t2) + } + UnexpectedArguments(t: String) { + description("The provided argument was not expected.") + display("Found argument '{}' which wasn't expected, or isn't valid in the context.", t) + } + DuplicateArgument(t: String) { + description("The argument was provided more than once.") + display("The argument '{}' was provided more than once.", t) + } + DuplicateValue(t: String) { + description("The argument value was provided more than once.") + display("The argument '{}' only need one value.", t) + } + // daemonize submodule error + DaemonFork { + description("Unable to fork.") + display("Unable to fork.") + } + DaemonSetsid { + description("Unable to create new session.") + display("Unable to create new session.") + } + DaemonRedirectStdio { + description("Unable to redirect standard streams to /dev/null.") + display("Unable to redirect standard streams to /dev/null.") + } + PidFileExist { + description("Pidfile path is existed yet.") + display("Pidfile path is existed yet.") + } + // epoll_context error + BadSyscall(err: std::io::Error) { + description("Return a bad syscall.") + display("Found bad syscall, error is {} .", err) + } + UnExpectedOperationType { + description("Unsupported notifier operation type.") + display("Unsupported Epoll notifier operation type.") + } + EpollWait(err: std::io::Error) { + description("Failed to execute epoll_wait syscall.") + display("Failed to execute epoll_wait syscall: {} .", err) + } + NoRegisterFd(t: i32) { + description("The fd is not registered in epoll.") + display("The fd {} is not registered in epoll.", t) + } + NoParkedFd(t: i32) { + description("Found no parked fd in registered.") + display("Found no parked fd {}.", t) + } + RemoveParked(t: i32) { + description("Remove parked event.") + display("Remove parked event whose fd is {}.", t) + } + BadNotifierOperation { + description("Bad Notifier Operation.") + display("Notifier Operation non allowed.") + } + ChmodFailed(e: i32) { + description("Chmod command failed.") + display("Chmod command failed, os error {}", e) + } + } + } +} diff --git a/util/src/link_list.rs b/util/src/link_list.rs new file mode 100644 index 00000000..982fe382 --- /dev/null +++ b/util/src/link_list.rs @@ -0,0 +1,133 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 
+// See the Mulan PSL v2 for more details. + +use std::marker::PhantomData; +use std::ptr::NonNull; + +pub struct Node { + prev: Option>>, + next: Option>>, + pub value: T, +} + +#[derive(Default)] +pub struct List { + head: Option>>, + tail: Option>>, + pub len: usize, + marker: PhantomData>>, +} + +impl Node { + pub fn new(value: T) -> Self { + Node { + prev: None, + next: None, + value, + } + } +} + +impl List { + pub fn new() -> Self { + List { + head: None, + tail: None, + len: 0, + marker: PhantomData, + } + } + + #[inline] + pub fn add_tail(&mut self, mut node: Box>) { + node.prev = self.tail; + node.next = None; + unsafe { + let node = NonNull::new(Box::into_raw(node)); + if let Some(mut t) = self.tail { + t.as_mut().next = node; + } else { + self.head = node; + self.tail = node; + } + + self.tail = node; + self.len += 1; + } + } + + #[inline] + pub fn add_head(&mut self, mut node: Box>) { + node.prev = None; + node.next = self.head; + unsafe { + let node = NonNull::new(Box::into_raw(node)); + if let Some(mut h) = self.head { + h.as_mut().prev = node; + } else { + self.head = node; + self.tail = node; + } + + self.head = node; + self.len += 1; + } + } + + #[inline] + pub fn unlink(&mut self, node: &Node) { + unsafe { + match node.prev { + Some(mut p) => p.as_mut().next = node.next, + None => self.head = node.next, + } + + match node.next { + Some(mut n) => n.as_mut().prev = node.prev, + None => self.tail = node.prev, + } + } + self.len -= 1; + } + + #[inline] + pub fn pop_tail(&mut self) -> Option>> { + self.tail.map(|node| unsafe { + let node = Box::from_raw(node.as_ptr()); + self.tail = node.prev; + + match self.tail { + None => self.head = None, + Some(mut t) => t.as_mut().next = None, + } + + self.len -= 1; + node + }) + } + + #[inline] + pub fn pop_head(&mut self) -> Option>> { + self.head.map(|node| unsafe { + let node = Box::from_raw(node.as_ptr()); + self.head = node.next; + + match self.head { + None => self.head = None, + Some(mut h) => h.as_mut().prev = None, + } + + self.len -= 1; + node + }) + } +} diff --git a/util/src/logger.rs b/util/src/logger.rs new file mode 100644 index 00000000..9548965d --- /dev/null +++ b/util/src/logger.rs @@ -0,0 +1,120 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
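+
+//! A simple logger for the VM process.
+//!
+//! Error records are written as `<timestamp>: [pid][tid][file: line]:ERROR: message`;
+//! other levels omit the file location. With `init_logger_with_env`, the level is
+//! taken from the `QUANTVISOR_LOG_LEVEL` environment variable and defaults to `Error`.
+//!
+//! The sketch below is illustrative only; the exact boxed writer type accepted by
+//! `init_logger_with_env` is an assumption here (any boxed `Write` sink the caller
+//! provides):
+//!
+//! ```ignore
+//! init_logger_with_env(Some(Box::new(std::io::stderr()))).unwrap();
+//! info!("guest started");
+//! ```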
+ +extern crate libc; +extern crate log; + +use std::io::prelude::*; +use std::sync::Mutex; + +use crate::unix::gettid; +use log::{Level, LevelFilter, Log, Metadata, Record, SetLoggerError}; + +fn format_now() -> String { + let mut ts = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + + let mut ti: libc::tm = unsafe { std::mem::zeroed() }; + unsafe { + libc::clock_gettime(libc::CLOCK_REALTIME, &mut ts); + libc::localtime_r(&ts.tv_sec, &mut ti); + } + + format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}.{:09}", + ti.tm_year + 1900, + ti.tm_mon + 1, + ti.tm_mday, + ti.tm_hour, + ti.tm_min, + ti.tm_sec, + ts.tv_nsec + ) +} + +/// Format like "%year-%mon-%dayT%hour:%min:%sec.%nsec +struct VmLogger { + handler: Option>>, + level: Level, +} + +impl Log for VmLogger { + fn enabled(&self, metadata: &Metadata) -> bool { + self.handler.is_some() && metadata.level() <= self.level + } + + fn log(&self, record: &Record) { + if self.enabled(record.metadata()) { + let pid = unsafe { libc::getpid() }; + let tid = gettid(); + + self.handler.as_ref().map(|writer| match record.level() { + Level::Error => writer.lock().unwrap().write_fmt(format_args!( + "{:<5}: [{}][{}][{}: {}]:{}: {}\n", + format_now(), + pid, + tid, + record.file().unwrap_or(""), + record.line().unwrap_or(0), + record.level(), + record.args() + )), + _ => writer.lock().unwrap().write_fmt(format_args!( + "{:<5}: [{}][{}]:{}: {}\n", + format_now(), + pid, + tid, + record.level(), + record.args() + )), + }); + } + } + + fn flush(&self) {} +} + +pub fn init_vm_logger( + level: Option, + logfile: Option>, +) -> Result<(), log::SetLoggerError> { + let buffer = match logfile { + Some(x) => Some(Mutex::new(x)), + None => None, + }; + + let logger = VmLogger { + level: level.unwrap_or(Level::Info), + handler: buffer, + }; + + log::set_boxed_logger(Box::new(logger)).map(|()| log::set_max_level(LevelFilter::Trace)) +} + +pub fn init_logger_with_env(logfile: Option>) -> Result<(), SetLoggerError> { + let level = match std::env::var("QUANTVISOR_LOG_LEVEL") { + Ok(l) => match l.to_lowercase().as_str() { + "trace" => Level::Trace, + "debug" => Level::Debug, + "info" => Level::Info, + "warn" => Level::Warn, + _ => Level::Error, + }, + _ => Level::Error, + }; + + init_vm_logger(Some(level), logfile)?; + + Ok(()) +} diff --git a/util/src/num_ops.rs b/util/src/num_ops.rs new file mode 100644 index 00000000..9a1ece44 --- /dev/null +++ b/util/src/num_ops.rs @@ -0,0 +1,140 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! This module implements some operations of Rust primitive types. + +/// Calculate the aligned-up u64 value. +/// +/// # Arguments +/// +/// * `origin` - the origin value. +/// * `align` - the alignment. 
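+///
+/// Returns `None` if rounding up would overflow `u64`. A zero `align` is not
+/// supported: the modulo below would panic with a division by zero.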
+/// +/// # Examples +/// +/// ```rust +/// extern crate util; +/// use util::num_ops::round_up; +/// +/// let value = round_up(1003 as u64, 4 as u64); +/// assert!(value == Some(1004)); +/// ``` +pub fn round_up(origin: u64, align: u64) -> Option { + match origin % align { + 0 => Some(origin), + diff => origin.checked_add(align - diff), + } +} + +/// Calculate the aligned-down u64 value. +/// +/// # Arguments +/// +/// * `origin` - the origin value. +/// * `align` - the alignment. +/// +/// # Examples +/// +/// ```rust +/// extern crate util; +/// use util::num_ops::round_down; +/// +/// let value = round_down(1003 as u64, 4 as u64); +/// assert!(value == Some(1000)); +/// ``` +pub fn round_down(origin: u64, align: u64) -> Option { + match origin % align { + 0 => Some(origin), + diff => origin.checked_sub(diff), + } +} + +/// Get the first half or second half of u64. +/// +/// # Arguments +/// +/// * `value` - The origin value to get u32 from. +/// * `page` - Value is 0 or 1, determines which half to return. +/// +/// # Examples +/// +/// ```rust +/// extern crate util; +/// use util::num_ops::read_u32; +/// +/// let value = read_u32(0x2000_1000_0000, 1); +/// assert!(value == 0x2000); +/// ``` +pub fn read_u32(value: u64, page: u32) -> u32 { + match page { + 0 => value as u32, + 1 => (value >> 32) as u32, + _ => 0_u32, + } +} + +/// Write the given u32 to the first or second half in u64, +/// returns the u64 value. +/// +/// # Arguments +/// +/// * `value` - The origin u32 value. +/// * `page` - Value is 0 or 1, determines which half to write. +/// +/// # Examples +/// +/// ```rust +/// extern crate util; +/// use util::num_ops::write_u32; +/// +/// let value = write_u32(0x1000_0000, 1); +/// assert!(value == 0x1000_0000_0000_0000); +/// ``` +pub fn write_u32(value: u32, page: u32) -> u64 { + match page { + 0 => u64::from(value), + 1 => u64::from(value) << 32, + _ => 0_u64, + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn round_up_test() { + let result = round_up(10001 as u64, 100 as u64); + assert_eq!(result, Some(10100)); + } + + #[test] + fn round_down_test() { + let result = round_down(10001 as u64, 100 as u64); + assert_eq!(result, Some(10000)); + } + + #[test] + fn test_read_u32_from_u64() { + let value = 0x1234_5678_9012_3456u64; + assert_eq!(read_u32(value, 0), 0x9012_3456u32); + assert_eq!(read_u32(value, 1), 0x1234_5678u32); + assert_eq!(read_u32(value, 2), 0); + } + + #[test] + fn test_set_u64_from_half32bit() { + assert_eq!(write_u32(0x1234_5678, 0), 0x1234_5678u64); + assert_eq!(write_u32(0x1234_5678, 1), 0x1234_5678_0000_0000u64); + assert_eq!(write_u32(0x1234_5678, 2), 0); + } +} diff --git a/util/src/offsetof.rs b/util/src/offsetof.rs new file mode 100644 index 00000000..21fd59c6 --- /dev/null +++ b/util/src/offsetof.rs @@ -0,0 +1,245 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +/// Macro: Calculate offset of specified field in a type. +#[macro_export] +macro_rules! 
__offset_of { + ($type_name:ty, $field:ident) => { + unsafe { &(*(std::ptr::null::<$type_name>())).$field as *const _ as usize } + }; +} + +/// Macro: Calculate offset of a field in a recursive type. +/// +/// # Arguments +/// +/// The Arguments is: a type name and its field name, +/// follows by a series of sub-type's name and its field's name. +/// +/// # Examples +/// +/// ```rust +/// #[macro_use] +/// extern crate util; +/// +/// fn main() { +/// struct Rectangle { +/// pub length: u64, +/// pub width: u64, +/// } +/// assert_eq!(offset_of!(Rectangle, length), 0); +/// assert_eq!(offset_of!(Rectangle, width), 8); +/// } +/// ``` +#[macro_export] +macro_rules! offset_of { + ($type_name:ty, $field:ident) => { __offset_of!($type_name, $field) }; + ($type_name:ty, $field:ident, $($sub_type_name:ty, $sub_field:ident), +) => { + __offset_of!($type_name, $field) + offset_of!($($sub_type_name, $sub_field), +) + }; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_offset_of() { + #[repr(C)] + pub struct Student { + student_id: u32, + weight: u8, + age: u8, + marks: u32, + is_male: bool, + } + assert_eq!(offset_of!(Student, student_id), 0); + assert_eq!(offset_of!(Student, weight), 4); + assert_eq!(offset_of!(Student, age), 5); + assert_eq!(offset_of!(Student, marks), 8); + assert_eq!(offset_of!(Student, is_male), 12); + + #[repr(C, packed)] + pub struct Student_packed { + student_id: u32, + weight: u8, + age: u8, + marks: u32, + is_male: bool, + } + assert_eq!(offset_of!(Student_packed, student_id), 0); + assert_eq!(offset_of!(Student_packed, weight), 4); + assert_eq!(offset_of!(Student_packed, age), 5); + assert_eq!(offset_of!(Student_packed, marks), 6); + assert_eq!(offset_of!(Student_packed, is_male), 10); + } + + #[test] + fn test_offset_of_recursive() { + mod recursive { + #[repr(C)] + pub struct grand_parent { + pub a: u8, + pub b: u32, + pub c: parent, + } + + #[repr(C)] + pub struct parent { + pub a: u16, + pub b: i32, + pub c: son, + } + + #[repr(C)] + pub struct son { + pub a: u32, + pub b: u8, + pub c: u64, + } + } + + assert_eq!(offset_of!(recursive::grand_parent, a), 0); + assert_eq!(offset_of!(recursive::grand_parent, b), 4); + assert_eq!(offset_of!(recursive::grand_parent, c), 8); + assert_eq!( + offset_of!(recursive::grand_parent, c, recursive::parent, a), + 8 + ); + assert_eq!( + offset_of!(recursive::grand_parent, c, recursive::parent, b), + 12 + ); + assert_eq!( + offset_of!(recursive::grand_parent, c, recursive::parent, c), + 16 + ); + assert_eq!( + offset_of!( + recursive::grand_parent, + c, + recursive::parent, + c, + recursive::son, + a + ), + 16 + ); + assert_eq!( + offset_of!( + recursive::grand_parent, + c, + recursive::parent, + c, + recursive::son, + b + ), + 20 + ); + assert_eq!( + offset_of!( + recursive::grand_parent, + c, + recursive::parent, + c, + recursive::son, + c + ), + 24 + ); + + mod recursive_packed { + #[repr(C, packed)] + pub struct grand_parent { + pub a: u32, + pub b: u8, + pub c: parent, + } + + #[repr(C, packed)] + pub struct parent { + pub a: u16, + pub b: i32, + pub c: son, + } + + #[repr(C, packed)] + pub struct son { + pub a: u32, + pub b: u8, + pub c: u64, + } + } + + assert_eq!(offset_of!(recursive_packed::grand_parent, a), 0); + assert_eq!(offset_of!(recursive_packed::grand_parent, b), 4); + assert_eq!(offset_of!(recursive_packed::grand_parent, c), 5); + assert_eq!( + offset_of!( + recursive_packed::grand_parent, + c, + recursive_packed::parent, + a + ), + 5 + ); + assert_eq!( + offset_of!( + recursive_packed::grand_parent, + c, + 
recursive_packed::parent, + b + ), + 7 + ); + assert_eq!( + offset_of!( + recursive_packed::grand_parent, + c, + recursive_packed::parent, + c + ), + 11 + ); + assert_eq!( + offset_of!( + recursive_packed::grand_parent, + c, + recursive_packed::parent, + c, + recursive_packed::son, + a + ), + 11 + ); + assert_eq!( + offset_of!( + recursive_packed::grand_parent, + c, + recursive_packed::parent, + c, + recursive_packed::son, + b + ), + 15 + ); + assert_eq!( + offset_of!( + recursive_packed::grand_parent, + c, + recursive_packed::parent, + c, + recursive_packed::son, + c + ), + 16 + ); + } +} diff --git a/util/src/seccomp.rs b/util/src/seccomp.rs new file mode 100644 index 00000000..8919c500 --- /dev/null +++ b/util/src/seccomp.rs @@ -0,0 +1,583 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +//! A seccomp-bpf crate. +//! +//! The crate to set bpf-filter to seccomp for process or thread. +//! +//! ## Design +//! +//! This crate offers support for: +//! 1. A quick way to set bpf-filter rules. +//! 2. Register bpf-filter rules to seccomp. +//! +//! ## Platform Support +//! +//! - `x86_64` +//! - `aarch64` +//! +//! ## Examples +//! +//! A simple code to read 1024 bytes in a regular file. +//! ```no_run +//! use std::fs::File; +//! use std::io::Read; +//! +//! let mut f: File = File::open("/path/to/file").unwrap(); +//! let mut buffer = [0u8; 1024]; +//! f.read(&mut buffer).unwrap(); +//! println!("{}", String::from_utf8_lossy(&buffer)); +//! ``` +//! +//! With seccomp to limit 1024 bytes read. +//! +//! ```should_panic +//! extern crate libc; +//! +//! use std::fs::File; +//! use std::io::Read; +//! use util::seccomp::*; +//! +//! let mut seccomp_filter = SyscallFilter::new(SeccompOpt::Trap); +//! +//! let nr_open = { +//! #[cfg(target_arch="x86_64")] +//! let nr = libc::SYS_open; +//! #[cfg(target_arch="aarch64")] +//! let nr = libc::SYS_openat; +//! nr +//! }; +//! +//! seccomp_filter.push(&mut BpfRule::new(nr_open)); +//! seccomp_filter.push(&mut BpfRule::new(libc::SYS_fcntl)); +//! seccomp_filter.push( +//! &mut BpfRule::new(libc::SYS_read) +//! .add_constraint(SeccompCmpOpt::Ne, 2, 1024) +//! ); +//! seccomp_filter.push(&mut BpfRule::new(libc::SYS_write)); +//! seccomp_filter.push(&mut BpfRule::new(libc::SYS_close)); +//! seccomp_filter.push(&mut BpfRule::new(libc::SYS_sigaltstack)); +//! seccomp_filter.push(&mut BpfRule::new(libc::SYS_munmap)); +//! seccomp_filter.push(&mut BpfRule::new(libc::SYS_exit_group)); +//! seccomp_filter.realize().unwrap(); +//! +//! let mut f: File = File::open("/path/to/file").unwrap(); +//! let mut buffer = [0u8; 1024]; +//! f.read(&mut buffer).unwrap(); +//! println!("{}", String::from_utf8_lossy(&buffer)); +//! ``` +//! This programe will be trapped. 
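+//!
+//! ## Filter layout
+//!
+//! The generated BPF program always has the same shape: an architecture check
+//! (`validate_architecture`), a load of the syscall number (`examine_syscall`),
+//! one block per `BpfRule` pushed by the user, and finally the default action
+//! (`handle_process`) for every syscall that matched no rule.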
+ +extern crate libc; + +use crate::errors::Result; +use crate::{__offset_of, offset_of}; + +// BPF Instruction classes +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L7 +const BPF_LD: u16 = 0x00; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L12 +const BPF_JMP: u16 = 0x05; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L13 +const BPF_RET: u16 = 0x06; + +// BPF ld/ldx fields +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L18 +const BPF_W: u16 = 0x00; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L24 +const BPF_ABS: u16 = 0x20; + +// BPF alu/jmp fields +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L45 +const BPF_JEQ: u16 = 0x10; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L46 +const BPF_JGT: u16 = 0x20; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L47 +const BPF_JGE: u16 = 0x30; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/bpf_common.h#L50 +const BPF_K: u16 = 0x00; + +/// BPF programs must return a 32-bit value. +/// +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/seccomp.h#L33-40 +const SECCOMP_RET_KILL: u32 = 0x0000_0000; +const SECCOMP_RET_TRAP: u32 = 0x0003_0000; +const SECCOMP_RET_ERRNO: u32 = 0x0005_0000; +const SECCOMP_RET_TRACE: u32 = 0x7ff0_0000; +const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000; +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/seccomp.h#L45 +const SECCOMP_RET_MASK: u32 = 0x0000_ffff; + +/// System call convention as an AUDIT_ARCH_* value +#[cfg(target_arch = "x86_64")] +const EM_X86_64: u32 = 62; +#[cfg(target_arch = "aarch64")] +const EM_AARCH64: u32 = 183; +const __AUDIT_ATCH_64BIT: u32 = 0x8000_0000; +const __AUDIT_ARCH_LE: u32 = 0x4000_0000; +#[cfg(target_arch = "x86_64")] +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/audit.h#L413 +const AUDIT_ARCH_X86_64: u32 = EM_X86_64 | __AUDIT_ATCH_64BIT | __AUDIT_ARCH_LE; +#[cfg(target_arch = "aarch64")] +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/audit.h#L376 +const AUDIT_ARCH_AARCH64: u32 = EM_AARCH64 | __AUDIT_ATCH_64BIT | __AUDIT_ARCH_LE; + +/// Compared operator in bpf filter rule. +#[derive(Copy, Clone, PartialEq)] +pub enum SeccompCmpOpt { + /// Equal. + Eq, + /// Not Equal. + Ne, + /// Greater than. + Gt, + /// Less than. + Lt, + /// Greater or equal. + Ge, + /// Less or equal. + Le, +} + +/// Operation defined to handle seccomp event. +/// +/// # Notes +/// These operation one-to-one correspondence with BPF-filter return value: +/// `SECCOMP_RET_KILL_PROCESS`, `SECCOMP_RET_KILL_THREAD`, `SECCOMP_RET_TRAP`, +/// `SECCOMP_RET_ERRNO`, `SECCOMP_RET_TRACE`, `SECCOMP_RET_ALLOW`, `SECCOMP_RET_LOG`. +#[derive(Copy, Clone, PartialEq, Debug)] +pub enum SeccompOpt { + /// Kill the task immediately. + Kill, + /// Disallow and force a SIGSYS. + Trap, + /// Returns an errno. + Errno(u32), + /// Pass to a tracer or disallow. + Trace(u32), + /// Allow. 
+ Allow, +} + +impl Into for SeccompOpt { + fn into(self) -> u32 { + match self { + SeccompOpt::Kill => SECCOMP_RET_KILL, + SeccompOpt::Trap => SECCOMP_RET_TRAP, + SeccompOpt::Errno(x) => SECCOMP_RET_ERRNO | (x & SECCOMP_RET_MASK), + SeccompOpt::Trace(x) => SECCOMP_RET_TRACE | (x & SECCOMP_RET_MASK), + SeccompOpt::Allow => SECCOMP_RET_ALLOW, + } + } +} + +/// The format of BPF programe executes over. +/// +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/seccomp.h#L56 +#[repr(C, packed)] +struct SeccompData { + /// System call number + nr: i32, + /// indicates system call convention as an AUDIT_ARCH_* value + arch: u32, + /// CPU IP + instruction_pointer: u64, + /// up to 6 system call arguments always stored as 64-bit values regardless + /// of the architecture + args: [u64; 6], +} + +impl SeccompData { + fn nr() -> u32 { + offset_of!(SeccompData, nr) as u32 + } + + fn arch() -> u32 { + offset_of!(SeccompData, arch) as u32 + } + + fn args(num: u32) -> u32 { + let offset_of_u64 = + offset_of!(SeccompData, args) - offset_of!(SeccompData, instruction_pointer); + offset_of!(SeccompData, args) as u32 + num * offset_of_u64 as u32 + } +} + +/// Filter block +/// +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/filter.h#L24 +#[repr(C)] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct SockFilter { + /// Actual filter code + code: u16, + /// Jump true + jt: u8, + /// Jump false + jf: u8, + /// Generic multiuse field + k: u32, +} + +/// Required for SO_ATTACH_FILTER +/// +/// See: https://elixir.bootlin.com/linux/v4.19.123/source/include/uapi/linux/filter.h#L31 +#[repr(C)] +struct SockFProg { + /// Number of filter blocks. + len: u16, + /// Point of SockFilter list. + sock_filter: *const SockFilter, +} + +#[inline(always)] +fn bpf_stmt(code: u16, k: u32) -> SockFilter { + SockFilter { + code, + jt: 0, + jf: 0, + k, + } +} + +#[inline(always)] +fn bpf_jump(code: u16, k: u32, jt: u8, jf: u8) -> SockFilter { + SockFilter { code, jt, jf, k } +} + +/// Validate the syscall's arch is correct. +fn validate_architecture() -> Vec { + vec![ + bpf_stmt(BPF_LD + BPF_W + BPF_ABS, SeccompData::arch()), + #[cfg(target_arch = "x86_64")] + bpf_jump(BPF_JMP + BPF_JEQ, AUDIT_ARCH_X86_64, 1, 0), + #[cfg(target_arch = "aarch64")] + bpf_jump(BPF_JMP + BPF_JEQ, AUDIT_ARCH_AARCH64, 1, 0), + bpf_stmt(BPF_RET + BPF_K, SECCOMP_RET_KILL), + ] +} + +/// Create a bpf-filter rule to get the syscall number from `SeccompData`. +fn examine_syscall() -> Vec { + vec![bpf_stmt(BPF_LD + BPF_W + BPF_ABS, SeccompData::nr())] +} + +/// Create a bpf-filter rule for handle syscall undefined rule. +fn handle_process(opt: SeccompOpt) -> Vec { + vec![bpf_stmt(BPF_RET + BPF_K, opt.into())] +} + +/// A wrapper structure of a list of bpf_filters for a syscall's rule. +#[derive(Debug)] +pub struct BpfRule { + /// The first bpf_filter to compare syscall number. + header_rule: SockFilter, + /// The inner rules to limit the arguments of syscall. + inner_rules: Vec, + /// The last bpf_filter to allow syscall. + tail_rule: SockFilter, +} + +impl BpfRule { + /// Create a new BpfRule to allow a syscall from a syscall number. + /// + /// # Arguments + /// * `syscall_num` - the number of system call. + pub fn new(syscall_num: i64) -> BpfRule { + BpfRule { + header_rule: bpf_jump(BPF_JMP + BPF_JEQ + BPF_K, syscall_num as u32, 0, 1), + inner_rules: Vec::new(), + tail_rule: bpf_stmt(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + } + } + + /// Allow a syscall with arguments limitation in bpf-filter. 
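+    ///
+    /// Only invocations whose argument at index `args_num` satisfies `cmp`
+    /// against `args_value` are allowed; other invocations of the same syscall
+    /// fall through to the filter's default action.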
+ /// + /// # Arguments + /// * `cmp` - Compare operator for given args_value and the raw args_value. + /// * `args_num` - The index number of system call's arguments. + /// * `args_value` - The value of args_num you want to limit. This value + /// used with `cmp` together. + pub fn add_constraint(mut self, cmp: SeccompCmpOpt, args_num: u32, args_value: u32) -> BpfRule { + if self.inner_rules.is_empty() { + self.tail_rule = bpf_stmt(BPF_LD + BPF_W + BPF_ABS, SeccompData::nr()); + } + + // Create a bpf_filter to get args in `SeccompData`. + let args_filter = bpf_stmt(BPF_LD + BPF_W + BPF_ABS, SeccompData::args(args_num)); + + // Create a bpf_filter to limit args in syscall. + let constraint_filter = match cmp { + SeccompCmpOpt::Eq => bpf_jump(BPF_JMP + BPF_JEQ + BPF_K, args_value, 0, 1), + SeccompCmpOpt::Ne => bpf_jump(BPF_JMP + BPF_JEQ + BPF_K, args_value, 1, 0), + SeccompCmpOpt::Ge => bpf_jump(BPF_JMP + BPF_JGE + BPF_K, args_value, 0, 1), + SeccompCmpOpt::Gt => bpf_jump(BPF_JMP + BPF_JGT + BPF_K, args_value, 0, 1), + SeccompCmpOpt::Le => bpf_jump(BPF_JMP + BPF_JGE + BPF_K, args_value, 1, 0), + SeccompCmpOpt::Lt => bpf_jump(BPF_JMP + BPF_JGT + BPF_K, args_value, 1, 0), + }; + + self.append(&mut vec![ + args_filter, + constraint_filter, + bpf_stmt(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + ]); + self + } + + /// Change `BpfRules` to a list of `SockFilter`. It will be used when + /// seccomp taking effect. + fn as_vec(&mut self) -> Vec { + let mut bpf_filters = vec![self.header_rule]; + bpf_filters.append(&mut self.inner_rules); + bpf_filters.push(self.tail_rule); + bpf_filters + } + + /// Add bpf_filters to `inner_rules`. + fn append(&mut self, bpf_filters: &mut Vec) { + let offset = bpf_filters.len() as u8; + + self.header_rule.jf += offset; + self.inner_rules.append(bpf_filters); + } +} + +/// This structure to create, manage, realize a seccomp rule. +#[derive(Debug)] +pub struct SyscallFilter { + /// A list of Bpf-filter. + sock_filters: Vec, + /// Operation for all syscall call not in rules. + opt: SeccompOpt, +} + +impl SyscallFilter { + /// Create a seccomp rule. + /// + /// # Arguments + /// * `opt` - Operation for all syscall call not in rules. + pub fn new(opt: SeccompOpt) -> SyscallFilter { + let mut sock_filters = Vec::new(); + sock_filters.extend(validate_architecture()); + sock_filters.extend(examine_syscall()); + + SyscallFilter { sock_filters, opt } + } + + /// Add a list of Bpf-filter rules to seccomp. + /// + /// # Arguments + /// * `bpf_rule` - The bpf syscall rule contains a list of Bpf-filters. + /// + /// # Notice + /// The flow to add new bpf-filter rules to seccomp is irreversible after + /// realized seccomp. + pub fn push(&mut self, bpf_rule: &mut BpfRule) { + self.sock_filters.append(&mut bpf_rule.as_vec()); + } + + /// Make seccomp take effect. + /// + /// # Notice + /// After use this function, all rules in seccomp will take effect whatever + /// this structure dropped or not. You can only use this function once in + /// a thread. Otherwise you will get an error. + pub fn realize(mut self) -> Result<()> { + //Add opt as a bpf_filter to sock_filters + self.sock_filters.append(&mut handle_process(self.opt)); + + let sock_bpf_vec = self.sock_filters; + + // This operation can guarantee seccomp make use for all users and subprocess. 
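+        // PR_SET_NO_NEW_PRIVS is required to install a seccomp filter without
+        // CAP_SYS_ADMIN, and the installed filter is inherited by tasks created
+        // afterwards.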
+ let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; + if ret != 0 { + bail!("Seccomp: prctl(2) set no new privs failed."); + } + + let prog = SockFProg { + len: sock_bpf_vec.len() as u16, + sock_filter: sock_bpf_vec.as_ptr(), + }; + let bpf_prog_ptr = &prog as *const SockFProg; + + // Use prctl(2) to make bpf rules take effect. + let ret = unsafe { + libc::prctl( + libc::PR_SET_SECCOMP, + libc::SECCOMP_MODE_FILTER, + bpf_prog_ptr, + ) + }; + if ret != 0 { + bail!("Seccomp: prctl(2) set seccomp filter mode failed."); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_enable_syscall() { + // a list of bpf_filter to allow `read` syscall and forbidden others + // in x86_64. + let bpf_vec = vec![ + SockFilter { + code: 0x20, + jt: 0, + jf: 0, + k: 4, + }, + #[cfg(target_arch = "x86_64")] + SockFilter { + code: 0x15, + jt: 1, + jf: 0, + k: 0xC000_003E, + }, + #[cfg(target_arch = "aarch64")] + SockFilter { + code: 0x15, + jt: 1, + jf: 0, + k: 0xC000_00B7, + }, + SockFilter { + code: 0x06, + jt: 0, + jf: 0, + k: 0, + }, + SockFilter { + code: 0x20, + jt: 0, + jf: 0, + k: 0, + }, + #[cfg(target_arch = "x86_64")] + SockFilter { + code: 0x15, + jt: 0, + jf: 1, + k: 0, + }, + #[cfg(target_arch = "aarch64")] + SockFilter { + code: 0x15, + jt: 0, + jf: 1, + k: 63, + }, + SockFilter { + code: 0x06, + jt: 0, + jf: 0, + k: 0x7fff_0000, + }, + ]; + + let mut seccomp_filter = SyscallFilter::new(SeccompOpt::Trap); + seccomp_filter.push(&mut BpfRule::new(libc::SYS_read)); + + assert_eq!(seccomp_filter.sock_filters, bpf_vec); + } + + #[test] + fn test_enable_syscall_extra() { + // a list of bpf_filter to allow read `1024` bytes in x86_64 and + // forbidden others + let bpf_vec = vec![ + SockFilter { + code: 0x20, + jt: 0, + jf: 0, + k: 4, + }, + #[cfg(target_arch = "x86_64")] + SockFilter { + code: 0x15, + jt: 1, + jf: 0, + k: 0xC000_003E, + }, + #[cfg(target_arch = "aarch64")] + SockFilter { + code: 0x15, + jt: 1, + jf: 0, + k: 0xC000_00B7, + }, + SockFilter { + code: 0x06, + jt: 0, + jf: 0, + k: 0, + }, + SockFilter { + code: 0x20, + jt: 0, + jf: 0, + k: 0, + }, + #[cfg(target_arch = "x86_64")] + SockFilter { + code: 0x15, + jt: 0, + jf: 4, + k: 0, + }, + #[cfg(target_arch = "aarch64")] + SockFilter { + code: 0x15, + jt: 0, + jf: 4, + k: 63, + }, + SockFilter { + code: 0x20, + jt: 0, + jf: 0, + k: 0x20, + }, + SockFilter { + code: 0x15, + jt: 0, + jf: 1, + k: 1024, + }, + SockFilter { + code: 0x06, + jt: 0, + jf: 0, + k: 0x7fff_0000, + }, + SockFilter { + code: 0x20, + jt: 0, + jf: 0, + k: 0, + }, + ]; + + let mut seccomp_filter = SyscallFilter::new(SeccompOpt::Trap); + seccomp_filter.push(&mut BpfRule::new(libc::SYS_read).add_constraint( + SeccompCmpOpt::Eq, + 2, + 1024, + )); + + assert_eq!(seccomp_filter.sock_filters, bpf_vec); + } +} diff --git a/util/src/tap.rs b/util/src/tap.rs new file mode 100644 index 00000000..1668a71f --- /dev/null +++ b/util/src/tap.rs @@ -0,0 +1,115 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. 
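+
+//! A thin wrapper around a TUN/TAP device.
+//!
+//! A minimal usage sketch; the interface name, the offload flags and the vnet
+//! header size below are illustrative assumptions, not values required by this
+//! module:
+//!
+//! ```ignore
+//! let tap = Tap::new(Some("tap0"), None).unwrap();
+//! tap.set_offload(TUN_F_VIRTIO).unwrap();
+//! tap.set_hdr_size(12).unwrap();
+//! ```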
+ +use std::fs::{File, OpenOptions}; +use std::io::{Read, Result as IoResult, Write}; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; + +use super::errors::{Result, ResultExt}; + +pub const TUN_F_CSUM: u32 = 1; +pub const TUN_F_TSO4: u32 = 2; +pub const TUN_F_TSO6: u32 = 4; +pub const TUN_F_UFO: u32 = 16; +pub const TUN_F_VIRTIO: u32 = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO; + +const IFF_TAP: u16 = 0x02; +const IFF_NO_PI: u16 = 0x1000; +const IFF_VNET_HDR: u16 = 0x4000; +const TUNTAP_PATH: &str = "/dev/net/tun"; + +ioctl_iow_nr!(TUNSETIFF, 84, 202, ::std::os::raw::c_int); +ioctl_iow_nr!(TUNSETOFFLOAD, 84, 208, ::std::os::raw::c_int); +ioctl_iow_nr!(TUNSETVNETHDRSZ, 84, 216, ::std::os::raw::c_int); + +#[repr(C)] +pub struct IfReq { + ifr_name: [u8; 16], + ifr_flags: u16, +} + +pub struct Tap { + pub file: File, +} + +impl Tap { + pub fn new(name: Option<&str>, fd: Option) -> Result { + let file; + + if let Some(name) = name { + if name.len() > 15 { + return Err(format!("Open tap {} failed, name too long.", name).into()); + } + + let mut ifr_name = [0_u8; 16]; + let (left, _) = ifr_name.split_at_mut(name.len()); + left.copy_from_slice(name.as_bytes()); + + let mut if_req = IfReq { + ifr_name, + ifr_flags: IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, + }; + + let file_ = OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(TUNTAP_PATH) + .chain_err(|| format!("Open {} failed.", TUNTAP_PATH))?; + + unsafe { ioctl_with_mut_ref(&file_, TUNSETIFF(), &mut if_req) }; + + file = file_; + } else if let Some(fd) = fd { + file = unsafe { + libc::fcntl(fd, libc::F_SETFL, libc::O_NONBLOCK); + File::from_raw_fd(fd) + }; + } else { + return Err("Open tap failed, unsupported operation.".into()); + } + + Ok(Tap { file }) + } + + pub fn set_offload(&self, flags: u32) -> Result<()> { + let ret = unsafe { ioctl_with_val(&self.file, TUNSETOFFLOAD(), flags as libc::c_ulong) }; + if ret < 0 { + return Err("ioctl TUNSETOFFLOAD failed.".to_string().into()); + } + + Ok(()) + } + + pub fn set_hdr_size(&self, len: u32) -> Result<()> { + let ret = unsafe { ioctl_with_ref(&self.file, TUNSETVNETHDRSZ(), &len) }; + if ret < 0 { + return Err("ioctl TUNSETVNETHDRSZ failed.".to_string().into()); + } + + Ok(()) + } + + pub fn read(&mut self, buf: &mut [u8]) -> IoResult { + self.file.read(buf) + } + + pub fn write(&mut self, buf: &[u8]) -> IoResult { + self.file.write(&buf) + } + + pub fn as_raw_fd(&self) -> RawFd { + self.file.as_raw_fd() + } +} diff --git a/util/src/unix.rs b/util/src/unix.rs new file mode 100644 index 00000000..d3bc2078 --- /dev/null +++ b/util/src/unix.rs @@ -0,0 +1,33 @@ +// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved. +// +// StratoVirt is licensed under Mulan PSL v2. +// You can use this software according to the terms and conditions of the Mulan +// PSL v2. +// You may obtain a copy of Mulan PSL v2 at: +// http://license.coscl.org.cn/MulanPSL2 +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +// See the Mulan PSL v2 for more details. + +extern crate libc; + +use super::errors::{ErrorKind, Result}; + +/// This function returns the caller's thread ID(TID). 
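+///
+/// The value comes from the raw `SYS_gettid` syscall, so it is the kernel's
+/// task id (unique system-wide) rather than the pthread handle.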
+pub fn gettid() -> u64 {
+    unsafe { libc::syscall(libc::SYS_gettid) as u64 }
+}
+
+/// This function is used to remove group and other users' permissions, using libc::chmod.
+pub fn limit_permission(path: &str) -> Result<()> {
+    let file_path = path.as_bytes().to_vec();
+    let cstr_file_path = std::ffi::CString::new(file_path).unwrap();
+    let ret = unsafe { libc::chmod(cstr_file_path.as_ptr(), 0o600) };
+
+    if ret == 0 {
+        Ok(())
+    } else {
+        Err(ErrorKind::ChmodFailed(ret).into())
+    }
+}
-- 
Gitee