[root@localhost cloud_images]# lsmod | grep vhost_net
vhost_net             262144  0
vhost                 262144  1 vhost_net
tap                   262144  1 vhost_net
tun                   262144  2 vhost_net
[root@localhost cloud_images]#
The vhost-net backend by default uses a Linux virtual bridge plus a tap device on the host, while QEMU and the guest use a virtio-net virtual NIC.

Step 1: create the Linux bridge and tap device (Fedora, CentOS, RHEL and similar distributions usually ship with a default bridge already created):

brctl addbr virbr0
brctl stp virbr0 on
ip tuntap add name virbr0-nic mode tap
ip link set dev virbr0-nic up

Step 2: add the host NIC as a port of virbr0 and move the IP configuration onto virbr0:

brctl addif virbr0 eth0
brctl addif virbr0 virbr0-nic
dhclient virbr0

Step 3: start a virtual machine from the command line:

sudo x86_64-softmmu/qemu-system-x86_64 --enable-kvm -m 5120 -drive file=/home/fang/vm/centos.img,if=virtio -net nic,model=virtio -net tap,ifname=virbr0-nic,script=no -vnc :0
Using the vhost_net backend driver
As mentioned earlier, the backend handler for virtio on the host is normally provided by user-space QEMU. If the backend processing of network I/O requests can instead be done in kernel space, efficiency improves: throughput goes up and latency goes down. Recent kernels include a driver module called "vhost-net" that acts as a kernel-level backend handler, moving the virtio-net backend work into kernel space.
When the bridged networking configuration was introduced in Chapter 4, a few of the options were related to virtio; they are described here as well.
-net tap[,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h][,vhostforce=on|off]
vnet_hdr=on|off
Controls whether the TAP device's "IFF_VNET_HDR" flag is set. "vnet_hdr=off" turns the flag off; "vnet_hdr=on" forces it on, and triggers an error if the flag is not supported. IFF_VNET_HDR is a tun/tap flag that allows large packets to be sent and received with only partial checksum processing, which improves the throughput of the virtio_net driver. (A minimal sketch of opening a tap device with this flag follows the option list below.)
vhost=on|off
Controls whether the vhost-net kernel-space backend driver is enabled; it only takes effect for virtio guests that use MSI-X[5] interrupts.
vhostforce=on|off
Controls whether vhost is forced on as the backend even for virtio guests that do not use MSI-X interrupts.
vhostfd=h
Connects to an already opened vhost network device through file descriptor h.
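As a side note on IFF_VNET_HDR: the following is a minimal, hedged sketch of how a user-space tap backend typically requests this flag when it creates the tap device. It is not QEMU's actual code; the function name and error handling are illustrative, while /dev/net/tun, TUNSETIFF and the IFF_* constants are the standard tun/tap API.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Open a tap interface and request IFF_VNET_HDR, roughly what the tap
 * backend does when "-net tap,...,vnet_hdr=on" is given. */
int open_tap_with_vnet_hdr(const char *name)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);

    if (fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    /* IFF_VNET_HDR prepends a virtio_net_hdr to each frame, so checksum and
     * GSO work can be offloaded instead of being done per packet. */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
    strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

    if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
        perror("TUNSETIFF");    /* fails if the kernel lacks IFF_VNET_HDR support */
        close(fd);
        return -1;
    }
    return fd;
}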
Starting a guest with the following command line makes the guest use virtio-net as the front-end driver while the backend processing is done by vhost-net (assuming the host kernel has the vhost-net module available):
[root@jay-linux kvm_demo]# qemu-system-x86_64 rhel6u3.img -smp 2 -m 1024 -net nic,model=virtio,macaddr=00:16:3e:22:22:22 -net tap,vnet_hdr=on,vhost=on
VNC server running on '::1:5900'
qemu-system-aarch64: network script /usr/local/bin/../etc/qemu-ifup failed with status 256
[root@localhost cloud_images]# cat qemu-ifup
#!/bin/sh
set -x

switch=virbr0

if [ -n "$1" ]; then
    ip tuntap add $1 mode tap user `whoami`
    ip link set $1 up
    sleep 0.5s
    ip link set $1 master $switch
    exit 0
else
    echo "Error: no interface specified"
    exit 1
fi
qemu-system-aarch64 -name vm2 -daemonize \
    -enable-kvm -M virt -cpu host -smp 2 -m 4096 \
    -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on \
    -numa node,memdev=mem -mem-prealloc \
    -global virtio-blk-device.scsi=off \
    -device virtio-scsi-device,id=scsi \
    -kernel vmlinuz-4.18 --append "console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1" \
    -initrd initramfs-4.18 \
    -drive file=vhuser-test1.qcow2 \
    -serial telnet:localhost:4322,server,nowait \
    -monitor telnet:localhost:4321,server,nowait \
    -net nic,model=virtio,macaddr=00:16:3e:22:22:22 -net tap,id=hostnet1,script=qemu-ifup,vnet_hdr=on,vhost=on \
    -vnc :10
When a QEMU process is passed the option -netdev tap,vhost=on, QEMU initializes the vhost file descriptor through several ioctl calls and then negotiates features, establishing the link between the host and the guest's vhost-net driver. The QEMU call chain is:
vhost_net_init -> vhost_dev_init -> vhost_net_ack_features
+ switch=virbr0
+ '[' -n tap0 ']'
++ whoami
+ ip tuntap add tap0 mode tap user root
ioctl(TUNSETIFF): Device or resource busy
+ ip link set tap0 up
+ sleep 0.5s
+ ip link set tap0 master virbr0
+ exit 0
[root@localhost cloud_images]# brctl show
bridge name     bridge id               STP enabled     interfaces
virbr0          8000.5254000d3f71       yes             enp125s0f1
                                                        tap0
                                                        virbr0-nic
[root@localhost cloud_images]# ps -elf | grep vhost
7 S root      49044      1  9  80   0 - 88035 poll_s 21:56 ?        00:00:11 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 2 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -drive file=vhuser-test1.qcow2 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -net nic,model=virtio,macaddr=00:16:3e:22:22:22 -net tap,id=hostnet1,script=qemu-ifup,vnet_hdr=on,vhost=on -vnc :10
1 S root      49065      2  0  80   0 -     0 vhost_ 21:56 ?        00:00:00 [vhost-49044]
[root@localhost ~]# cat /proc/49065/stack
[<ffff000008085ed4>] __switch_to+0x8c/0xa8
[<ffff000001e507d8>] vhost_worker+0x148/0x170 [vhost]
[<ffff0000080f8638>] kthread+0x10c/0x138
[<ffff000008084f54>] ret_from_fork+0x10/0x18
[<ffffffffffffffff>] 0xffffffffffffffff
[root@localhost ~]#
- To get the qemu vcpu thread ids:
  # ps -eLo ruser,pid,ppid,lwp,args | grep qemu-kvm | grep -v grep
- To get the vhost thread id:
  # ps -eaf | grep vhost-$pid_of_qemu-kvm | grep -v grep
[root@localhost cloud_images]# ps -eLo ruser,pid,ppid,lwp,args | grep qemu-system-aarch64 | grep vhost | grep -v grep | wc -l
20
[root@localhost cloud_images]# ps -eLo ruser,pid,ppid,lwp,args | grep qemu-system-aarch64 | grep -v grep root 48617 1 48617 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48618 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48619 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48672 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48674 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global 
virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48676 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48677 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48678 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48679 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev 
type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48680 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48681 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48682 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48683 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48684 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem 
-mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48685 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48686 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48687 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48688 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev 
socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48689 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48690 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10 root 48617 1 48782 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10
What is vhost
vhost is one backend implementation of virtio. As mentioned in the virtio introduction, virtio is a para-virtualization scheme that requires cooperating drivers on both the guest and the host side to communicate. Normally the host-side virtio driver is implemented in user-space QEMU, whereas vhost implements it in the kernel, as the kernel module vhost-net.ko. Why implement it in the kernel, and what does that buy us? Read on.

Why use vhost
In the plain virtio scheme, the guest communicates with the user-space hypervisor, which incurs several data copies and CPU privilege-level context switches. For example, when the guest sends a packet to the external network, the guest first exits to the host kernel, the host kernel then switches to QEMU to handle the guest's request, the hypervisor sends the packet out through a system call, control returns to the host kernel, and finally back to the guest. Such a long path inevitably costs performance.
vhost was proposed to improve exactly this. It is a module in the host kernel that communicates with the guest directly: data is exchanged between the guest and the host kernel through the virtqueues, so QEMU is out of the data path. QEMU does not leave the stage entirely, though; it still handles control-plane work, such as issuing control commands to KVM.
The vhost data path
In the figure below, the left half shows how vhost sends data toward the external network, and the right half shows vhost's overall data-exchange flow. QEMU is still responsible for emulating the virtio device and handling certain user-space management and control events, while vhost itself stays quite lean: an independent module that carries out the data exchange between the guest and the host kernel.

Communication between vhost and the virtio front end is event driven, based on eventfd. Guest-to-vhost notification goes through the kvm.ko module: during vhost initialization a worker thread is started to wait on an eventfd, and when the guest issues a kick event for vhost, kvm.ko fires the ioeventfd that notifies vhost; vhost then fetches data from the virtqueue's avail ring and fills in the used ring. Communication from the vhost worker thread back to the guest uses the same mechanism, except that what is sent is a call event: kvm.ko fires an irqfd that notifies the guest.
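To make the two notification paths concrete, here is a minimal, hedged sketch of how a user-space backend (QEMU, in practice) hands the two eventfds for one virtqueue to the vhost driver. The vhost_fd and queue_index values are assumed to come from earlier setup and the function name is illustrative; VHOST_SET_VRING_KICK, VHOST_SET_VRING_CALL and struct vhost_vring_file are the actual vhost uapi.

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Wire up the kick/call eventfds for one virtqueue of an already opened
 * /dev/vhost-net file descriptor. */
static int wire_vring_eventfds(int vhost_fd, unsigned int queue_index,
                               int *kick_fd_out, int *call_fd_out)
{
    struct vhost_vring_file file;
    int kick_fd, call_fd;

    /* kick: guest -> vhost. KVM's ioeventfd signals this fd on the guest's
     * notify access; the vhost worker wakes up and drains the avail ring. */
    kick_fd = eventfd(0, EFD_NONBLOCK);
    if (kick_fd < 0)
        return -1;
    file.index = queue_index;
    file.fd = kick_fd;
    if (ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file) < 0)
        return -1;

    /* call: vhost -> guest. vhost signals this fd after filling the used
     * ring; KVM's irqfd turns the signal into a guest interrupt. */
    call_fd = eventfd(0, EFD_NONBLOCK);
    if (call_fd < 0)
        return -1;
    file.index = queue_index;
    file.fd = call_fd;
    if (ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file) < 0)
        return -1;

    *kick_fd_out = kick_fd;
    *call_fd_out = call_fd;
    return 0;
}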
Summary
Event communication between vhost and KVM is implemented with eventfd, in two directions: the kick event from guest to vhost, implemented with ioeventfd, and the call event from vhost to guest, implemented with irqfd.
Vhost overview
The vhost driver in the Linux kernel provides in-kernel emulation of virtio devices for KVM. vhost moves device-emulation code that would otherwise run in QEMU into the Linux kernel, so the emulation code can reach kernel subsystems directly instead of trapping into the kernel via system calls from user space, reducing the performance loss caused by emulated I/O.
vhost-net is the host-side emulation of a vhost network device; likewise there are vhost-blk for block devices and vhost-scsi for SCSI devices.
The vhost kernel code lives in drivers/vhost/vhost.c.
The vhost driver model
The vhost driver creates a character device, /dev/vhost-net, which user space can open and drive with ioctl commands. When a QEMU process is passed the option -netdev tap,vhost=on, QEMU initializes this file descriptor with several ioctl calls and then negotiates features, establishing the link between the host and the guest's vhost-net driver. The QEMU call chain is:
vhost_net_init -> vhost_dev_init -> vhost_net_ack_features
vhost_net_init calls vhost_dev_init, which opens the /dev/vhost-net device and returns a file descriptor that serves as the vhost-net backend. The ioctl commands issued by vhost_dev_init are:
r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
The definition in the kernel is:
/* Set current process as the (exclusive) owner of this file descriptor. This
 * must be called before any other vhost command. Further calls to
 * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */
#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
Then the features supported by vhost are queried:
r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
Again, the kernel definition is:
/* Features bitmask for forward compatibility. Transport bits are used for
 * vhost specific features. */
#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
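Putting these two ioctls together, a minimal user-space sketch of the same initialization sequence (not QEMU's code, just the same calls in the same order) might look like this:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(void)
{
    uint64_t features = 0;

    /* Open the character device created by the vhost-net driver. */
    int vhost_fd = open("/dev/vhost-net", O_RDWR);
    if (vhost_fd < 0) {
        perror("open /dev/vhost-net");
        return 1;
    }

    /* Claim exclusive ownership; must be the first vhost ioctl on this fd. */
    if (ioctl(vhost_fd, VHOST_SET_OWNER, NULL) < 0) {
        perror("VHOST_SET_OWNER");
        return 1;
    }

    /* Ask the kernel which virtio/vhost feature bits it supports. */
    if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0) {
        perror("VHOST_GET_FEATURES");
        return 1;
    }

    printf("vhost features: 0x%llx\n", (unsigned long long)features);
    return 0;
}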
In QEMU, the vhost_net structure represents an opened vhost_net instance:
struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[2];
    int backend;
    NetClientState *nc;
};
After the ioctl setup, QEMU registers a MemoryListener with the following callbacks:
hdev->memory_listener = (MemoryListener) {
    .begin = vhost_begin,
    .commit = vhost_commit,
    .region_add = vhost_region_add,
    .region_del = vhost_region_del,
    .region_nop = vhost_region_nop,
    .log_start = vhost_log_start,
    .log_stop = vhost_log_stop,
    .log_sync = vhost_log_sync,
    .log_global_start = vhost_log_global_start,
    .log_global_stop = vhost_log_global_stop,
    .eventfd_add = vhost_eventfd_add,
    .eventfd_del = vhost_eventfd_del,
    .priority = 10
};
vhost_region_add maps the QEMU guest's address space into the vhost driver.
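Under the hood this ends up in a VHOST_SET_MEM_TABLE ioctl describing how guest-physical addresses map to the QEMU process's virtual addresses. A hedged, single-region sketch follows (the function name and the single region are simplifications; real setups pass several regions):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Tell the vhost driver how a guest-physical range maps into this process,
 * so the worker thread can access the virtqueues and packet buffers directly. */
static int set_mem_table(int vhost_fd, uint64_t guest_phys_addr,
                         void *userspace_addr, uint64_t size)
{
    struct vhost_memory *mem;
    int ret;

    mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
    if (!mem)
        return -1;

    mem->nregions = 1;
    mem->regions[0].guest_phys_addr = guest_phys_addr;
    mem->regions[0].memory_size = size;
    mem->regions[0].userspace_addr = (uint64_t)(uintptr_t)userspace_addr;

    ret = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);
    free(mem);
    return ret;
}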
Finally, features are negotiated:
/* Set sane init value. Override when guest acks. */
vhost_net_ack_features(net, 0);
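Later, once the guest acknowledges its feature bits, the negotiated set is pushed down to the kernel with VHOST_SET_FEATURES. A hedged sketch of that step (the masking policy shown is illustrative, not QEMU's exact logic):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Enable only the bits that both the vhost backend and the guest accepted,
 * then commit them to the kernel. */
static int ack_features(int vhost_fd, uint64_t backend_features,
                        uint64_t guest_acked_features)
{
    uint64_t features = backend_features & guest_acked_features;

    return ioctl(vhost_fd, VHOST_SET_FEATURES, &features);
}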
Meanwhile, the kernel creates a kernel thread to handle I/O events and device emulation. The kernel code is in drivers/vhost/vhost.c.
In vhost_dev_set_owner, the function that creates the worker thread is called (the thread is named vhost-<pid>, where <pid> is the QEMU process id).
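For reference, in older kernels the worker creation inside vhost_dev_set_owner() looks roughly like the simplified excerpt below (newer kernels have since reworked this code). The "vhost-%d" format is why the thread shows up as [vhost-49044] in the ps output earlier.

/* Simplified excerpt in the spirit of drivers/vhost/vhost.c (older kernels) */
struct task_struct *worker;

worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
if (IS_ERR(worker))
    return PTR_ERR(worker);

dev->worker = worker;
wake_up_process(worker);    /* the worker then loops in vhost_worker(), waiting for work */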
Virtio emulation in the kernel
vhost does not emulate a complete PCI device; instead, it limits itself to operating on the virtqueues.
The worker thread waits for data on the virtqueues. For vhost-net, when data appears in the tx queue, the worker sends it to the tap device file descriptor associated with that queue.
In the opposite direction, the worker thread also polls the tap file descriptor: when data arrives on it, the worker wakes up and moves the data into the rx queue.
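Conceptually, the worker's tx side then behaves like the sketch below. The helper names (pop_avail_desc, write_to_tap, push_used, signal_guest) are hypothetical stand-ins for the real vhost/virtqueue primitives; the real code is handle_tx() in drivers/vhost/net.c.

/* Conceptual tx loop of the vhost-net worker; all helpers are hypothetical. */
void worker_tx_loop(struct vq *tx_vq, int tap_fd)
{
    struct desc d;

    /* Woken by the kick eventfd: drain everything the guest has queued. */
    while (pop_avail_desc(tx_vq, &d)) {
        write_to_tap(tap_fd, d.addr, d.len);   /* packet leaves through the tap device */
        push_used(tx_vq, &d);                  /* hand the buffer back to the guest */
    }
    signal_guest(tx_vq);                       /* call eventfd -> irqfd -> guest interrupt */
}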
The user-space interface of vhost
Once the data is ready, how is the guest notified?
From the vhost module's dependencies we can see that vhost does not depend on the kvm module; in theory, any other application that implements the vhost interface could also use vhost to accelerate data transfer.
But precisely because vhost has no direct relationship with the kvm module, when the QEMU (KVM) guest places data on the tx virtqueue, it has no way to tell vhost directly that the data is ready.
However, vhost is given an eventfd file descriptor that is monitored by the worker thread mentioned above, so QEMU can notify vhost that data is ready by signalling this eventfd.
What QEMU does is register an ioeventfd: when the guest performs the notifying I/O access, the resulting I/O exit is caught by KVM, and KVM signals the eventfd toward vhost, telling it that the guest has prepared data. Because the worker thread monitors this eventfd, on receiving the notification it knows the guest has placed data on the tx queue and can go process it.
vhost notifies the guest by raising a guest interrupt through the irqfd mechanism provided by KVM, indicating that the buffers to be delivered have been placed in the rx virtqueue. QEMU (KVM) has registered this IRQ as a PCI event, so when it fires the guest learns that the data in kernel space is ready and its driver reads the data.
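On the KVM side, this wiring is done with the KVM_IOEVENTFD and KVM_IRQFD ioctls. A hedged sketch follows (the function name, notify_addr, len and gsi values are placeholders that depend on the virtio transport; the structures and ioctls themselves are the KVM uapi):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Register the kick eventfd as an ioeventfd on the queue-notify address and
 * the call eventfd as an irqfd for guest interrupt "gsi". */
static int wire_kvm_side(int vm_fd, int kick_fd, int call_fd,
                         uint64_t notify_addr, uint32_t gsi)
{
    struct kvm_ioeventfd ioev;
    struct kvm_irqfd irqfd;

    /* A guest write to the notify address no longer exits to QEMU: KVM just
     * signals kick_fd, which the vhost worker thread is waiting on. */
    memset(&ioev, 0, sizeof(ioev));
    ioev.addr = notify_addr;
    ioev.len = 4;               /* placeholder width for an MMIO notify register */
    ioev.fd = kick_fd;
    if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0)
        return -1;

    /* When vhost signals call_fd, KVM injects interrupt "gsi" into the guest
     * without going back through QEMU. */
    memset(&irqfd, 0, sizeof(irqfd));
    irqfd.fd = call_fd;
    irqfd.gsi = gsi;
    if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0)
        return -1;

    return 0;
}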
So, overall, a vhost instance needs to know three things:
- the guest's memory mapping, i.e. the virtqueues, used for the data transfer itself
- the QEMU kick eventfd, through which vhost receives the guest's notification; the worker thread waits on it
- the call eventfd (irqfd), used to notify the guest
All three are prepared during QEMU initialization. After that, data transfer completes entirely in kernel space without QEMU's involvement, which is why vhost-based data transfer is so efficient.