Network virtualization has a lot in common with storage virtualization: both are built on virtio, so while walking through network virtualization you will see data structures and mechanisms very similar to the storage case. But network virtualization also has its own particularities. For example, storage virtualization exposes a file on the host as a disk in the guest, whereas network virtualization must rely on the kernel protocol stack to encapsulate and decapsulate network packets. So how do the guest and the host actually communicate? Let's analyze the initialization process, starting with the Virtio Network Device:
static const TypeInfo device_type_info = {
    .name = TYPE_DEVICE,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(DeviceState),
    .instance_init = device_initfn,
    .instance_post_init = device_post_init,
    .instance_finalize = device_finalize,
    .class_base_init = device_class_base_init,
    .class_init = device_class_init,
    .abstract = true,
    .class_size = sizeof(DeviceClass),
};

static const TypeInfo virtio_device_info = {
    .name = TYPE_VIRTIO_DEVICE,
    .parent = TYPE_DEVICE,
    .instance_size = sizeof(VirtIODevice),
    .class_init = virtio_device_class_init,
    .instance_finalize = virtio_device_instance_finalize,
    .abstract = true,
    .class_size = sizeof(VirtioDeviceClass),
};

static const TypeInfo virtio_net_info = {
    .name = TYPE_VIRTIO_NET,
    .parent = TYPE_VIRTIO_DEVICE,
    .instance_size = sizeof(VirtIONet),
    .instance_init = virtio_net_instance_init,
    .class_init = virtio_net_class_init,
};

static void virtio_register_types(void)
{
    type_register_static(&virtio_net_info);
}

type_init(virtio_register_types)
The Virtio Network Device class is defined through several levels of inheritance: the parent of TYPE_VIRTIO_NET is TYPE_VIRTIO_DEVICE, the parent of TYPE_VIRTIO_DEVICE is TYPE_DEVICE, and the parent of TYPE_DEVICE is TYPE_OBJECT, where the chain ends. type_init registers these types. Each level has a class_init, used to generate the xxxClass from the TypeImpl, and an instance_init, used to initialize an instance of that class. At the TYPE_VIRTIO_NET level, the class_init function is virtio_net_class_init, and it sets the realize function of the device class to virtio_net_device_realize, just as with the storage block device.
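For reference, virtio_net_class_init looks roughly like this (an abridged sketch based on QEMU's hw/net/virtio-net.c; field names drift a little between QEMU versions, and strictly speaking it fills in VirtioDeviceClass::realize, which the generic virtio_device_realize then chains to):

static void virtio_net_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);

    dc->props = virtio_net_properties;
    dc->vmsd = &vmstate_virtio_net;
    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
    vdc->realize = virtio_net_device_realize;
    vdc->unrealize = virtio_net_device_unrealize;
    vdc->get_config = virtio_net_get_config;
    vdc->set_config = virtio_net_set_config;
......
}

virtio_net_device_realize itself is shown below: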
static void virtio_net_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIONet *n = VIRTIO_NET(dev);
    NetClientState *nc;
    int i;
......
    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);

    /*
     * We set a lower limit on RX queue size to what it always was.
     * Guests that want a smaller ring can always resize it without
     * help from us (using virtio 1 and up).
     */
    if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
        n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
        !is_power_of_2(n->net_conf.rx_queue_size)) {
......
        return;
    }

    if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
        n->net_conf.tx_queue_size > VIRTQUEUE_MAX_SIZE ||
        !is_power_of_2(n->net_conf.tx_queue_size)) {
......
        return;
    }

    n->max_queues = MAX(n->nic_conf.peers.queues, 1);
    if (n->max_queues * 2 + 1 > VIRTIO_QUEUE_MAX) {
......
        return;
    }

    n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues);
    n->curr_queues = 1;
......
    n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
                                    n->net_conf.tx_queue_size);

    for (i = 0; i < n->max_queues; i++) {
        virtio_net_add_queue(n, i);
    }

    n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
    qemu_macaddr_default_if_unset(&n->nic_conf.macaddr);
    memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac));
    n->status = VIRTIO_NET_S_LINK_UP;

    if (n->netclient_type) {
        n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
                              n->netclient_type, n->netclient_name, n);
    } else {
        n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
                              object_get_typename(OBJECT(dev)), dev->id, n);
    }
......
}
Here a VirtIODevice is created, again just as in storage virtualization, and virtio_init initializes the device. Inside the VirtIODevice structure there is an array of VirtQueue structures; these are the queues through which the virtio front end and back end pass data to each other, and there can be at most VIRTIO_QUEUE_MAX of them.
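virtio_add_queue itself simply claims the first free slot in that array. An abridged sketch, based on QEMU's hw/virtio/virtio.c (details vary slightly by version):

VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                            VirtIOHandleOutput handle_output)
{
    int i;

    /* Find the first VirtQueue slot that is not yet in use */
    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
        if (vdev->vq[i].vring.num == 0) {
            break;
        }
    }

    if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) {
        abort();
    }

    vdev->vq[i].vring.num = queue_size;
    vdev->vq[i].vring.num_default = queue_size;
    vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
    vdev->vq[i].handle_output = handle_output;

    return &vdev->vq[i];
}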
Everything so far has matched the storage case, but there are differences too. Notice the check n->max_queues * 2 + 1 > VIRTIO_QUEUE_MAX. Why multiply by 2? Because a network device needs queues in both directions, a send queue and a receive queue per queue pair (the extra 1 accounts for the control queue, ctrl_vq, created right after). Next, virtio_net_add_queue is called to initialize each queue pair, and you can see that it creates exactly those two queues, tx_vq for sending and rx_vq for receiving:
typedef struct VirtIONetQueue {
    VirtQueue *rx_vq;
    VirtQueue *tx_vq;
    QEMUTimer *tx_timer;
    QEMUBH *tx_bh;
    uint32_t tx_waiting;
    struct {
        VirtQueueElement *elem;
    } async_tx;
    struct VirtIONet *n;
} VirtIONetQueue;

static void virtio_net_add_queue(VirtIONet *n, int index)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);

    n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
                                           virtio_net_handle_rx);
    if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
        n->vqs[index].tx_vq = virtio_add_queue(vdev, n->net_conf.tx_queue_size,
                                               virtio_net_handle_tx_timer);
        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                              virtio_net_tx_timer,
                                              &n->vqs[index]);
    } else {
        n->vqs[index].tx_vq = virtio_add_queue(vdev, n->net_conf.tx_queue_size,
                                               virtio_net_handle_tx_bh);
        n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
    }
    n->vqs[index].tx_waiting = 0;
    n->vqs[index].n = n;
}
Each VirtQueue contains a vring that holds the queue's actual data. The function virtio_net_handle_rx handles the reception of network packets, and virtio_net_handle_tx_bh handles sending them; we will come back to the latter. Next, qemu_new_nic creates the NIC seen inside the virtual machine:
NICState *qemu_new_nic(NetClientInfo *info,
                       NICConf *conf,
                       const char *model,
                       const char *name,
                       void *opaque)
{
    NetClientState **peers = conf->peers.ncs;
    NICState *nic;
    int i, queues = MAX(1, conf->peers.queues);
......
    nic = g_malloc0(info->size + sizeof(NetClientState) * queues);
    nic->ncs = (void *)nic + info->size;
    nic->conf = conf;
    nic->opaque = opaque;

    for (i = 0; i < queues; i++) {
        qemu_net_client_setup(&nic->ncs[i], info, peers[i], model, name, NULL);
        nic->ncs[i].queue_index = i;
    }

    return nic;
}

static void qemu_net_client_setup(NetClientState *nc,
                                  NetClientInfo *info,
                                  NetClientState *peer,
                                  const char *model,
                                  const char *name,
                                  NetClientDestructor *destructor)
{
    nc->info = info;
    nc->model = g_strdup(model);
    if (name) {
        nc->name = g_strdup(name);
    } else {
        nc->name = assign_name(nc, model);
    }

    QTAILQ_INSERT_TAIL(&net_clients, nc, next);

    nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc);
    nc->destructor = destructor;
    QTAILQ_INIT(&nc->filters);
}
With the initialization path covered, let's look at qemu's startup. For NIC virtualization, the relevant qemu command-line arguments are the following two lines:
-netdev tap,fd=32,id=hostnet0,vhost=on,vhostfd=37
-device virtio-net-pci,netdev=hostnet0,id=net0,mac=fa:16:3e:d1:2d:99,bus=pci.0,addr=0x3
qemu's main function calls net_init_clients to initialize the network devices; it can parse both the net arguments and the netdev arguments:
int net_init_clients(Error **errp)
{
    QTAILQ_INIT(&net_clients);

    if (qemu_opts_foreach(qemu_find_opts("netdev"),
                          net_init_netdev, NULL, errp)) {
        return -1;
    }

    if (qemu_opts_foreach(qemu_find_opts("nic"),
                          net_param_nic, NULL, errp)) {
        return -1;
    }

    if (qemu_opts_foreach(qemu_find_opts("net"),
                          net_init_client, NULL, errp)) {
        return -1;
    }

    return 0;
}
net_init_clients parses these parameters. The netdev argument above goes through net_init_netdev->net_client_init->net_client_init1, and net_client_init1 dispatches to a different initialization function depending on the driver type:
static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
    const Netdev *netdev,
    const char *name,
    NetClientState *peer, Error **errp) = {
        [NET_CLIENT_DRIVER_NIC]     = net_init_nic,
        [NET_CLIENT_DRIVER_TAP]     = net_init_tap,
        [NET_CLIENT_DRIVER_SOCKET]  = net_init_socket,
        [NET_CLIENT_DRIVER_HUBPORT] = net_init_hubport,
......
};
Since the configured driver type is tap, the call chain here is net_init_tap->net_tap_init->tap_open:
#define PATH_NET_TUN "/dev/net/tun"

int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
             int vnet_hdr_required, int mq_required, Error **errp)
{
    struct ifreq ifr;
    int fd, ret;
    int len = sizeof(struct virtio_net_hdr);
    unsigned int features;

    TFR(fd = open(PATH_NET_TUN, O_RDWR));
    memset(&ifr, 0, sizeof(ifr));
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;

    if (ioctl(fd, TUNGETFEATURES, &features) == -1) {
        features = 0;
    }

    if (features & IFF_ONE_QUEUE) {
        ifr.ifr_flags |= IFF_ONE_QUEUE;
    }

    if (*vnet_hdr) {
        if (features & IFF_VNET_HDR) {
            *vnet_hdr = 1;
            ifr.ifr_flags |= IFF_VNET_HDR;
        } else {
            *vnet_hdr = 0;
        }
        ioctl(fd, TUNSETVNETHDRSZ, &len);
    }
......
    ret = ioctl(fd, TUNSETIFF, (void *) &ifr);
......
    fcntl(fd, F_SETFL, O_NONBLOCK);
    return fd;
}
tap_open opens the file "/dev/net/tun" and then drives it with ioctl calls. This is a standard Linux kernel mechanism, very much like the KVM interface itself: you open a character device file, then issue ioctl calls against that file to talk to the kernel and make use of a capability the kernel provides.
Why lean on the kernel here? Because a packet leaving the virtual machine must arrive on the host as a well-formed network packet before it can be forwarded, and producing a well-formed packet means going through a full protocol stack. The guest hands its packets to qemu, but qemu has no network protocol stack of its own, and implementing one from scratch would be far too complex, so it borrows the kernel's. qemu turns the packets the guest sends it into a byte stream and writes that stream to the "/dev/net/tun" character device, just like writing to a file. The TUN/TAP character-device driver in the kernel receives the written stream and hands it to the TUN/TAP virtual NIC driver, which turns the stream back into network packets and passes them to the TCP/IP stack; they finally emerge from the virtual TAP device tap0 as standard network packets. We will see this process later.
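To make the "write a file, get a packet" idea concrete, here is a minimal, self-contained userspace sketch of what qemu's tap backend boils down to. This is not qemu's actual code: tap_open_simple is a hypothetical helper, the device name tap0 is just an example, and running it requires CAP_NET_ADMIN.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Open /dev/net/tun and attach to a TAP interface; returns the fd.
 * Every write() on this fd is injected into the host stack as an
 * Ethernet frame, and every frame routed to the device can be read(). */
int tap_open_simple(const char *ifname)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);
    if (fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;   /* TAP = L2 frames, no extra header */
    strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
    if (ioctl(fd, TUNSETIFF, &ifr) < 0) {  /* creates/attaches the tap device */
        close(fd);
        return -1;
    }
    return fd;
}

int main(void)
{
    char frame[2048];
    int fd = tap_open_simple("tap0");
    if (fd < 0)
        return 1;
    ssize_t n = read(fd, frame, sizeof(frame)); /* one Ethernet frame per read */
    if (n > 0)
        write(fd, frame, n);                    /* inject a frame back */
    close(fd);
    return 0;
}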
Now let's move into the kernel and see what happens once "/dev/net/tun" is opened. The implementation lives in drivers/net/tun.c; it is a character-device driver and follows the usual character-device conventions:
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR(DRV_COPYRIGHT);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(TUN_MINOR);
MODULE_ALIAS("devname:net/tun");

static int __init tun_init(void)
{
......
    ret = rtnl_link_register(&tun_link_ops);
......
    ret = misc_register(&tun_miscdev);
......
    ret = register_netdevice_notifier(&tun_notifier_block);
......
}
This registers a character device, tun_miscdev; from its definition you can see that it is exactly the "/dev/net/tun" device:
static struct miscdevice tun_miscdev = {
    .minor = TUN_MINOR,
    .name = "tun",
    .nodename = "net/tun",
    .fops = &tun_fops,
};

static const struct file_operations tun_fops = {
    .owner          = THIS_MODULE,
    .llseek         = no_llseek,
    .read_iter      = tun_chr_read_iter,
    .write_iter     = tun_chr_write_iter,
    .poll           = tun_chr_poll,
    .unlocked_ioctl = tun_chr_ioctl,
    .open           = tun_chr_open,
    .release        = tun_chr_close,
    .fasync         = tun_chr_fasync,
};
qemu's tap_open function opens this character device via PATH_NET_TUN. We won't repeat the mechanics of opening a character device here; suffice it to say that at the driver layer the open ends up in tun_chr_open:
static int tun_chr_open(struct inode *inode, struct file * file)
{
    struct net *net = current->nsproxy->net_ns;
    struct tun_file *tfile;

    tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                        &tun_proto, 0);
    RCU_INIT_POINTER(tfile->tun, NULL);
    tfile->flags = 0;
    tfile->ifindex = 0;

    init_waitqueue_head(&tfile->wq.wait);
    RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);

    tfile->socket.file = file;
    tfile->socket.ops = &tun_socket_ops;

    sock_init_data(&tfile->socket, &tfile->sk);

    tfile->sk.sk_write_space = tun_sock_write_space;
    tfile->sk.sk_sndbuf = INT_MAX;

    file->private_data = tfile;
    INIT_LIST_HEAD(&tfile->next);

    sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);

    return 0;
}
Among tun_chr_open's parameters is a struct file, which represents the opened character device file "/dev/net/tun"; whatever is written to the device goes through this struct file, and its file_operations, following the character-device rules, point to tun_fops. tun_chr_open also creates a struct tun_file and makes the struct file's private_data point to it:
/* A tun_file connects an open character device to a tuntap netdevice. It
 * also contains all socket related structures
 * to serve as one transmit queue for tuntap device.
 */
struct tun_file {
    struct sock sk;
    struct socket socket;
    struct socket_wq wq;
    struct tun_struct __rcu *tun;
    struct fasync_struct *fasync;   /* only used for fasnyc */
    unsigned int flags;
    union {
        u16 queue_index;
        unsigned int ifindex;
    };
    struct list_head next;
    struct tun_struct *detached;
    struct skb_array tx_array;
};

struct tun_struct {
    struct tun_file __rcu *tfiles[MAX_TAP_QUEUES];
    unsigned int numqueues;
    unsigned int flags;
    kuid_t owner;
    kgid_t group;

    struct net_device *dev;
    netdev_features_t set_features;
    int align;
    int vnet_hdr_sz;
    int sndbuf;
    struct tap_filter txflt;
    struct sock_fprog fprog;
    /* protected by rtnl lock */
    bool filter_attached;
    spinlock_t lock;
    struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
    struct timer_list flow_gc_timer;
    unsigned long ageing_time;
    unsigned int numdisabled;
    struct list_head disabled;
    void *security;
    u32 flow_count;
    u32 rx_batched;
    struct tun_pcpu_stats __percpu *pcpu_stats;
};

static const struct proto_ops tun_socket_ops = {
    .peek_len = tun_peek_len,
    .sendmsg = tun_sendmsg,
    .recvmsg = tun_recvmsg,
};
struct tun_file has a member pointing to struct tun_struct, which in turn holds a struct net_device representing the tuntap network device on the host. struct tun_file also embeds a struct socket and a struct sock, which are needed to use the kernel's network protocol stack; both were analyzed earlier in the networking chapters. So, just as the comment on struct tun_file says, this is the pivotal data structure: the struct file for "/dev/net/tun" points to it via private_data, so it receives the data qemu writes; through its struct sock it can drive the kernel protocol stack so packets go out through the host's tuntap device; and the struct net_device of that tuntap device is under its management as well.
Back in qemu's tap_open function, after the character device file has been opened, the next step is the TUNSETIFF ioctl, which configures the host-side network interface. In the kernel this ioctl arrives at tun_chr_ioctl, which forwards to __tun_chr_ioctl:
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg, int ifreq_len)
{
    struct tun_file *tfile = file->private_data;
    struct tun_struct *tun;
    void __user* argp = (void __user*)arg;
    struct ifreq ifr;
    kuid_t owner;
    kgid_t group;
    int sndbuf;
    int vnet_hdr_sz;
    unsigned int ifindex;
    int le;
    int ret;

    if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
        _IOC_TYPE(cmd) == SOCK_IOC_TYPE) {
        if (copy_from_user(&ifr, argp, ifreq_len))
            return -EFAULT;
    }
......
    tun = __tun_get(tfile);
    if (cmd == TUNSETIFF) {
        ifr.ifr_name[IFNAMSIZ-1] = '\0';
        ret = tun_set_iff(sock_net(&tfile->sk), file, &ifr);
......
        if (copy_to_user(argp, &ifr, ifreq_len))
            ret = -EFAULT;
    }
......
__tun_chr_ioctl first copies the configuration from user space into the kernel with copy_from_user, then calls tun_set_iff to set up the tuntap network device, and finally returns the result with copy_to_user. tun_set_iff is implemented as follows:
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
    struct tun_struct *tun;
    struct tun_file *tfile = file->private_data;
    struct net_device *dev;
......
    char *name;
    unsigned long flags = 0;
    int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                 MAX_TAP_QUEUES : 1;

    if (ifr->ifr_flags & IFF_TUN) {
        /* TUN device */
        flags |= IFF_TUN;
        name = "tun%d";
    } else if (ifr->ifr_flags & IFF_TAP) {
        /* TAP device */
        flags |= IFF_TAP;
        name = "tap%d";
    } else
        return -EINVAL;

    if (*ifr->ifr_name)
        name = ifr->ifr_name;

    dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                           NET_NAME_UNKNOWN, tun_setup, queues, queues);

    err = dev_get_valid_name(net, dev, name);
    dev_net_set(dev, net);
    dev->rtnl_link_ops = &tun_link_ops;
    dev->ifindex = tfile->ifindex;
    dev->sysfs_groups[0] = &tun_attr_group;

    tun = netdev_priv(dev);
    tun->dev = dev;
    tun->flags = flags;
    tun->txflt.count = 0;
    tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
    tun->align = NET_SKB_PAD;
    tun->filter_attached = false;
    tun->sndbuf = tfile->socket.sk->sk_sndbuf;
    tun->rx_batched = 0;

    tun_net_init(dev);
    tun_flow_init(tun);

    err = tun_attach(tun, file, false);
    err = register_netdevice(tun->dev);

    netif_carrier_on(tun->dev);
    if (netif_running(tun->dev))
        netif_tx_wake_all_queues(tun->dev);

    strcpy(ifr->ifr_name, tun->dev->name);
    return 0;
}
tun_set_iff creates the struct tun_struct and the struct net_device, and registers the tuntap network device in the kernel with register_netdevice; from then on the new interface is visible on the host with ip addr.
Next, let's trace how the front-end device driver and the back-end device driver become associated, by looking at what happens when a network packet is sent from the guest. A process inside the virtual machine sends a packet; it travels through the file system and socket calls into the network protocol stack and down to the network device layer, except that here the device is not an ordinary NIC but the virtio_net driver. The virtio_net driver's code lives in the Linux kernel source tree at drivers/net/virtio_net.c:
static __init int virtio_net_driver_init(void)
{
    int ret;
......
    ret = register_virtio_driver(&virtio_net_driver);
......
}
module_init(virtio_net_driver_init);
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");

static struct virtio_driver virtio_net_driver = {
    .driver.name = KBUILD_MODNAME,
    .driver.owner = THIS_MODULE,
    .id_table = id_table,
    .validate = virtnet_validate,
    .probe = virtnet_probe,
    .remove = virtnet_remove,
    .config_changed = virtnet_config_changed,
......
};
The driver's initialization code registers virtio_net_driver with the virtio bus. Once the driver is registered and bound to a matching virtio device, its probe function is invoked, so let's look at virtnet_probe:
static int virtnet_probe(struct virtio_device *vdev)
{
    int i, err;
    struct net_device *dev;
    struct virtnet_info *vi;
    u16 max_queue_pairs;
    int mtu;
......
    /* Allocate ourselves a network device with room for our info */
    dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);

    /* Set up network device as normal. */
    dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
    dev->netdev_ops = &virtnet_netdev;
    dev->features = NETIF_F_HIGHDMA;
    dev->ethtool_ops = &virtnet_ethtool_ops;
    SET_NETDEV_DEV(dev, &vdev->dev);
......
    /* MTU range: 68 - 65535 */
    dev->min_mtu = MIN_MTU;
    dev->max_mtu = MAX_MTU;

    /* Set up our device-specific information */
    vi = netdev_priv(dev);
    vi->dev = dev;
    vi->vdev = vdev;
    vdev->priv = vi;
    vi->stats = alloc_percpu(struct virtnet_stats);
    INIT_WORK(&vi->config_work, virtnet_config_changed_work);
......
    vi->max_queue_pairs = max_queue_pairs;

    /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
    err = init_vqs(vi);
    netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
    netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

    virtnet_init_settings(dev);

    err = register_netdev(dev);
    virtio_device_ready(vdev);
    virtnet_set_queues(vi, vi->curr_queue_pairs);
......
}
virtnet_probe creates a struct net_device and registers it with register_netdev, which is what makes the NIC visible inside the guest. The other important job done in virtnet_probe is init_vqs, which initializes the send and receive virtqueues:
static int init_vqs(struct virtnet_info *vi)
{
    int ret;

    /* Allocate send & receive queues */
    ret = virtnet_alloc_queues(vi);
    ret = virtnet_find_vqs(vi);
......
    get_online_cpus();
    virtnet_set_affinity(vi);
    put_online_cpus();

    return 0;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
    int i;

    vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
    vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);

    INIT_DELAYED_WORK(&vi->refill, refill_work);
    for (i = 0; i < vi->max_queue_pairs; i++) {
        vi->rq[i].pages = NULL;
        netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
                       napi_weight);
        netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
                          napi_tx ? napi_weight : 0);

        sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
        ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
        sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
    }

    return 0;
}
Per the virtio design we saw earlier, a virtqueue is a structure sitting between the guest front end and the qemu back end, used to pass data between the two sides; for a network device there are queues in both the send and receive directions. The struct virtqueue built here is the front end's management structure for a queue. The queue instances themselves are found or created by virtnet_find_vqs, which also sets the receive queue's callback to skb_recv_done and the send queue's callback to skb_xmit_done; these callbacks are invoked to notify the driver when buffer usage changes:
static int virtnet_find_vqs(struct virtnet_info *vi)
{
    vq_callback_t **callbacks;
    struct virtqueue **vqs;
    int ret = -ENOMEM;
    int i, total_vqs;
    const char **names;

    /* Allocate space for find_vqs parameters */
    vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
    callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
    names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);

    /* Allocate/initialize parameters for send/receive virtqueues */
    for (i = 0; i < vi->max_queue_pairs; i++) {
        callbacks[rxq2vq(i)] = skb_recv_done;
        callbacks[txq2vq(i)] = skb_xmit_done;
        names[rxq2vq(i)] = vi->rq[i].name;
        names[txq2vq(i)] = vi->sq[i].name;
    }

    ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
                                     names, ctx, NULL);
......
    for (i = 0; i < vi->max_queue_pairs; i++) {
        vi->rq[i].vq = vqs[rxq2vq(i)];
        vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
        vi->sq[i].vq = vqs[txq2vq(i)];
    }
......
}
The find_vqs used here is defined in the struct virtio_config_ops *config member of the struct virtio_device inside struct virtnet_info. According to the definition of virtio_config_ops, find_vqs resolves to vp_modern_find_vqs, and from this step on things are the same as for the block device. vp_modern_find_vqs calls vp_find_vqs which, in the legacy-interrupt case, falls back to vp_find_vqs_intx. There, request_irq registers an interrupt handler, vp_interrupt: when the device writes into a queue it raises an interrupt, the vq interrupt, and the handler invokes the callback of the corresponding queue. Then, for each queue in turn, vp_setup_vq is called to allocate and initialize the virtqueue and its vring.
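The dispatch table behind that config pointer, for the modern virtio-pci transport, looks roughly like this (abridged from the kernel's drivers/virtio/virtio_pci_modern.c):

static const struct virtio_config_ops virtio_pci_config_ops = {
    .get        = vp_get,
    .set        = vp_set,
    .get_status = vp_get_status,
    .set_status = vp_set_status,
    .reset      = vp_reset,
    .find_vqs   = vp_modern_find_vqs,
    .del_vqs    = vp_del_vqs,
......
};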
As before, these data structures correspond to VirtIODevice, VirtQueue, and vring on the virtio back end, and both sides must point at the same freshly created piece of memory. The guest tells the external pci device the addresses of this shared memory by calling the iowrite functions, the dedicated way of issuing commands to external devices. At this point the two queues, send and receive, between the front-end driver and the back-end driver are fully associated, and their format is the same as for the block device.
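The iowrite-based "kick" can be seen concretely in the virtio-pci driver's notify hook, which is essentially a single register write (from the kernel's drivers/virtio/virtio_pci_common.c, lightly annotated):

/* Writing the queue index into the queue's notify register traps into
 * KVM/qemu, which then services the vring on the back-end side. */
bool vp_notify(struct virtqueue *vq)
{
    /* we write the queue's selector into the notification register to
     * signal the other end */
    iowrite16(vq->index, (void __iomem *)vq->priv);
    return true;
}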
The virtio data flow interaction mechanism
vring forwards the data flow mainly through two ring buffers.

A vring has three parts: the descriptor table desc, the available ring, and the used ring.
desc stores the descriptors, each of which describes one buffer; the available ring is where the guest side marks which descriptors are currently ready for use, while the used ring is where the host side marks which descriptors it has already consumed.
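These three parts can be read directly off the layout definitions (abridged from the kernel's include/uapi/linux/virtio_ring.h):

struct vring_desc {            /* one buffer descriptor */
    __virtio64 addr;           /* guest-physical address of the buffer */
    __virtio32 len;
    __virtio16 flags;          /* NEXT / WRITE / INDIRECT */
    __virtio16 next;           /* chaining within desc[] */
};

struct vring_avail {           /* written by the guest (driver) */
    __virtio16 flags;
    __virtio16 idx;
    __virtio16 ring[];
};

struct vring_used_elem {
    __virtio32 id;             /* head of the consumed descriptor chain */
    __virtio32 len;
};

struct vring_used {            /* written by the host (device) */
    __virtio16 flags;
    __virtio16 idx;
    struct vring_used_elem ring[];
};

struct vring {
    unsigned int num;
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
};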
Virtio implements its I/O mechanism on top of virtqueues. Each virtqueue is a queue that carries bulk data; how many queues are used depends on the device's needs. For example, the virtio network driver (virtio-net) uses two queues (one for receiving and one for sending), while the virtio block driver (virtio-blk) uses only one.
Concretely, suppose the guest wants to send data to the host. First the guest adds the buffer holding the data to the virtqueue with virtqueue_add_buf, then calls virtqueue_kick; virtqueue_kick calls virtqueue_notify, which notifies the host by writing to a register. Once the host has consumed the buffers, the guest calls virtqueue_get_buf to reclaim the completed buffers from the virtqueue.
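Mapped onto virtio-net's actual transmit path, that sequence looks roughly like the sketch below. This is a simplified illustration modeled on xmit_skb()/start_xmit() in drivers/net/virtio_net.c, not the verbatim kernel code: xmit_one_skb is a hypothetical name, error handling is trimmed, and the modern kernel API spells the add-buffer step virtqueue_add_outbuf.

/* Simplified sketch of the guest-side send path in virtio-net. */
static int xmit_one_skb(struct send_queue *sq, struct sk_buff *skb)
{
    int num_sg;

    /* describe the packet data as a scatterlist */
    sg_init_table(sq->sg, MAX_SKB_FRAGS + 2);
    num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
    if (num_sg < 0)
        return num_sg;

    /* post the buffers into the tx virtqueue; skb is the token that
     * virtqueue_get_buf() hands back once the host has consumed it */
    return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}

/* after queueing, the driver kicks the host (this lands in vp_notify above):
 *     if (virtqueue_kick_prepare(sq->vq))
 *             virtqueue_notify(sq->vq);
 */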

vm_find_vqs
 --> vm_setup_vq
      --> vring_create_virtqueue
           |--> vring_init
           |--> __vring_new_virtqueue

virtqueue_add_split
 --> dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE)
 --> vq->split.vring.desc / vq->split.vring.avail
struct vring_virtqueue {
    struct virtqueue vq;

    /* Is this a packed ring? */
    bool packed_ring;

    /* Is DMA API used? */
    bool use_dma_api;

    /* Can we use weak barriers? */
    bool weak_barriers;

    /* Other side has made a mess, don't try any more. */
    bool broken;

    /* Host supports indirect buffers */
    bool indirect;

    /* Host publishes avail event idx */
    bool event;

    /* Head of free buffer list. */
    unsigned int free_head;
    /* Number we've added since last sync. */
    unsigned int num_added;

    /* Last used index we've seen. */
    u16 last_used_idx;

    union {
        /* Available for split ring */
        struct {
            /* Actual memory layout for this queue. */
            struct vring vring;

            /* Last written value to avail->flags */
            u16 avail_flags_shadow;

            /*
             * Last written value to avail->idx in
             * guest byte order.
             */
            u16 avail_idx_shadow;

            /* Per-descriptor state. */
            struct vring_desc_state_split *desc_state;

            /* DMA address and size information */
            dma_addr_t queue_dma_addr;
            size_t queue_size_in_bytes;
        } split;

        /* Available for packed ring */
        struct {
            /* Actual memory layout for this queue. */
            struct {
                unsigned int num;
                struct vring_packed_desc *desc;
                struct vring_packed_desc_event *driver;
                struct vring_packed_desc_event *device;
            } vring;

            /* Driver ring wrap counter. */
            bool avail_wrap_counter;

            /* Device ring wrap counter. */
            bool used_wrap_counter;

            /* Avail used flags. */
            u16 avail_used_flags;

            /* Index of the next avail descriptor. */
            u16 next_avail_idx;

            /*
             * Last written value to driver->flags in
             * guest byte order.
             */
            u16 event_flags_shadow;

            /* Per-descriptor state. */
            struct vring_desc_state_packed *desc_state;
            struct vring_desc_extra_packed *desc_extra;

            /* DMA address and size information */
            dma_addr_t ring_dma_addr;
            dma_addr_t driver_event_dma_addr;
            dma_addr_t device_event_dma_addr;
            size_t ring_size_in_bytes;
            size_t event_size_in_bytes;
        } packed;
    };

    /* How to notify other side. FIXME: commonalize hcalls! */
    bool (*notify)(struct virtqueue *vq);

    /* DMA, allocation, and size information */
    bool we_own_ring;

#ifdef DEBUG
    /* They're supposed to lock for us. */
    unsigned int in_use;

    /* Figure out if their kicks are too delayed. */
    bool last_add_time_valid;
    ktime_t last_add_time;
#endif
};
Creating the virtqueue and its DMA addresses
struct virtqueue *vring_create_virtqueue(
    unsigned int index,
    unsigned int num,
    unsigned int vring_align,
    struct virtio_device *vdev,
    bool weak_barriers,
    bool may_reduce_num,
    bool context,
    bool (*notify)(struct virtqueue *),
    void (*callback)(struct virtqueue *),
    const char *name)
{
    if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
        return vring_create_virtqueue_packed(index, num, vring_align,
                vdev, weak_barriers, may_reduce_num,
                context, notify, callback, name);

    return vring_create_virtqueue_split(index, num, vring_align,
            vdev, weak_barriers, may_reduce_num,
            context, notify, callback, name);
}

dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
{
    struct vring_virtqueue *vq = to_vvq(_vq);

    BUG_ON(!vq->we_own_ring);

    if (vq->packed_ring)
        return vq->packed.ring_dma_addr;

    return vq->split.queue_dma_addr;
}

static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
                                     void (*callback)(struct virtqueue *vq),
                                     const char *name, bool ctx)
{
    struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
    struct virtio_mmio_vq_info *info;
    struct virtqueue *vq;
    unsigned long flags;
    unsigned int num;
    int err;

    if (!name)
        return NULL;

    /* Select the queue we're interested in */
    writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);

    /* Queue shouldn't already be set up. */
    if (readl(vm_dev->base + (vm_dev->version == 1 ?
            VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) {
        err = -ENOENT;
        goto error_available;
    }

    /* Allocate and fill out our active queue description */
    info = kmalloc(sizeof(*info), GFP_KERNEL);
    if (!info) {
        err = -ENOMEM;
        goto error_kmalloc;
    }

    num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
    if (num == 0) {
        err = -ENOENT;
        goto error_new_virtqueue;
    }

    /* Create the vring */
    vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev,
                                true, true, ctx, vm_notify, callback, name);
    if (!vq) {
        err = -ENOMEM;
        goto error_new_virtqueue;
    }

    /* Activate the queue */
    writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
    if (vm_dev->version == 1) {
        u64 q_pfn = virtqueue_get_desc_addr(vq) >> PAGE_SHIFT;

        /*
         * virtio-mmio v1 uses a 32bit QUEUE PFN. If we have something
         * that doesn't fit in 32bit, fail the setup rather than
         * pretending to be successful.
         */
        if (q_pfn >> 32) {
            dev_err(&vdev->dev,
                    "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n",
                    0x1ULL << (32 + PAGE_SHIFT - 30));
            err = -E2BIG;
            goto error_bad_pfn;
        }

        writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN);
        writel(q_pfn, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
    } else {
        u64 addr;

        addr = virtqueue_get_desc_addr(vq);
        writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW);
        writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH);

        addr = virtqueue_get_avail_addr(vq);
        writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW);
        writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH);

        addr = virtqueue_get_used_addr(vq);
        writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW);
        writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH);

        writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
    }

    vq->priv = info;
    info->vq = vq;

    spin_lock_irqsave(&vm_dev->lock, flags);
    list_add(&info->node, &vm_dev->virtqueues);
    spin_unlock_irqrestore(&vm_dev->lock, flags);

    return vq;
}

static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
                       struct virtqueue *vqs[],
                       vq_callback_t *callbacks[],
                       const char * const names[],
                       const bool *ctx,
                       struct irq_affinity *desc)
{
    struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
    int irq = platform_get_irq(vm_dev->pdev, 0);
    int i, err, queue_idx = 0;

    err = request_irq(irq, vm_interrupt, IRQF_SHARED,
                      dev_name(&vdev->dev), vm_dev);
    if (err)
        return err;

    for (i = 0; i < nvqs; ++i) {
        if (!names[i]) {
            vqs[i] = NULL;
            continue;
        }

        vqs[i] = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
                             ctx ? ctx[i] : false);
        if (IS_ERR(vqs[i])) {
            vm_del_vqs(vdev);
            return PTR_ERR(vqs[i]);
        }
    }

    return 0;
}

static struct virtqueue *vring_create_virtqueue_split(
    unsigned int index,
    unsigned int num,
    unsigned int vring_align,
    struct virtio_device *vdev,
    bool weak_barriers,
    bool may_reduce_num,
    bool context,
    bool (*notify)(struct virtqueue *),
    void (*callback)(struct virtqueue *),
    const char *name)
{
    struct virtqueue *vq;
    void *queue = NULL;
    dma_addr_t dma_addr;
    size_t queue_size_in_bytes;
    struct vring vring;

    /* TODO: allocate each queue chunk individually */
    for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
        queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                  &dma_addr,
                                  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
        if (queue)
            break;
        if (!may_reduce_num)
            return NULL;
    }

    queue_size_in_bytes = vring_size(num, vring_align);
    vring_init(&vring, num, queue, vring_align);

    vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
                               notify, callback, name);

    return vq;
}

/* Only available for split ring */
struct virtqueue *__vring_new_virtqueue(unsigned int index,
                                        struct vring vring,
                                        struct virtio_device *vdev,
                                        bool weak_barriers,
                                        bool context,
                                        bool (*notify)(struct virtqueue *),
                                        void (*callback)(struct virtqueue *),
                                        const char *name)
{
    unsigned int i;
    struct vring_virtqueue *vq;

    if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
        return NULL;

    vq = kmalloc(sizeof(*vq), GFP_KERNEL);
    if (!vq)
        return NULL;

    vq->packed_ring = false;
    vq->vq.callback = callback;
    vq->vq.vdev = vdev;
    vq->vq.name = name;
    vq->vq.num_free = vring.num;
    vq->vq.index = index;
    vq->we_own_ring = false;
    vq->notify = notify;
    vq->weak_barriers = weak_barriers;
    vq->broken = false;
    vq->last_used_idx = 0;
    vq->num_added = 0;
    vq->use_dma_api = vring_use_dma_api(vdev);
#ifdef DEBUG
    vq->in_use = false;
    vq->last_add_time_valid = false;
#endif

    vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
                   !context;
    vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

    if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
        vq->weak_barriers = false;

    vq->split.queue_dma_addr = 0;
    vq->split.queue_size_in_bytes = 0;

    vq->split.vring = vring;
    vq->split.avail_flags_shadow = 0;
    vq->split.avail_idx_shadow = 0;

    /* No callback? Tell other side not to bother us. */
    if (!callback) {
        vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
        if (!vq->event)
            vq->split.vring.avail->flags = cpu_to_virtio16(vdev,
                    vq->split.avail_flags_shadow);
    }

    vq->split.desc_state = kmalloc_array(vring.num,
            sizeof(struct vring_desc_state_split), GFP_KERNEL);
    if (!vq->split.desc_state) {
        kfree(vq);
        return NULL;
    }

    /* Put everything in free lists. */
    vq->free_head = 0;
    for (i = 0; i < vring.num-1; i++)
        vq->split.vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
    memset(vq->split.desc_state, 0, vring.num *
            sizeof(struct vring_desc_state_split));

    list_add_tail(&vq->vq.list, &vdev->vqs);
    return &vq->vq;
}

static inline int virtqueue_add_split(struct virtqueue *_vq,
                                      struct scatterlist *sgs[],
                                      unsigned int total_sg,
                                      unsigned int out_sgs,
                                      unsigned int in_sgs,
                                      void *data,
                                      void *ctx,
                                      gfp_t gfp)
{
    struct vring_virtqueue *vq = to_vvq(_vq);
    struct scatterlist *sg;
    struct vring_desc *desc;
    unsigned int i, n, avail, descs_used, prev, err_idx;
    int head;
    bool indirect;

    head = vq->free_head;

    if (virtqueue_use_indirect(_vq, total_sg))
        desc = alloc_indirect_split(_vq, total_sg, gfp);
    else {
        desc = NULL;
        WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
    }

    if (desc) {
        /* Use a single buffer which doesn't continue */
        indirect = true;
        /* Set up rest to use this indirect table. */
        i = 0;
        descs_used = 1;
    } else {
        indirect = false;
        desc = vq->split.vring.desc;
        i = head;
        descs_used = total_sg;
    }

    if (vq->vq.num_free < descs_used) {
        pr_debug("Can't add buf len %i - avail = %i\n",
                 descs_used, vq->vq.num_free);
        /* FIXME: for historical reasons, we force a notify here if
         * there are outgoing parts to the buffer.  Presumably the
         * host should service the ring ASAP. */
        if (out_sgs)
            vq->notify(&vq->vq);
        if (indirect)
            kfree(desc);
        END_USE(vq);
        return -ENOSPC;
    }

    for (n = 0; n < out_sgs; n++) {
        for (sg = sgs[n]; sg; sg = sg_next(sg)) {
            dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
            if (vring_mapping_error(vq, addr))
                goto unmap_release;

            desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
            desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
            desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
            prev = i;
            i = virtio16_to_cpu(_vq->vdev, desc[i].next);
        }
    }
    for (; n < (out_sgs + in_sgs); n++) {
        for (sg = sgs[n]; sg; sg = sg_next(sg)) {
            dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
            if (vring_mapping_error(vq, addr))
                goto unmap_release;

            desc[i].flags = cpu_to_virtio16(_vq->vdev,
                    VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
            desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
            desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
            prev = i;
            i = virtio16_to_cpu(_vq->vdev, desc[i].next);
        }
    }
    /* Last one doesn't continue. */
    desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);

    if (indirect) {
        /* Now that the indirect table is filled in, map it. */
        dma_addr_t addr = vring_map_single(
                vq, desc, total_sg * sizeof(struct vring_desc),
                DMA_TO_DEVICE);
        if (vring_mapping_error(vq, addr))
            goto unmap_release;

        vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev,
                VRING_DESC_F_INDIRECT);
        vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
        vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev,
                total_sg * sizeof(struct vring_desc));
    }

    /* We're using some buffers from the free list. */
    vq->vq.num_free -= descs_used;

    /* Update free pointer */
    if (indirect)
        vq->free_head = virtio16_to_cpu(_vq->vdev,
                vq->split.vring.desc[head].next);
    else
        vq->free_head = i;

    /* Store token and indirect buffer state. */
    vq->split.desc_state[head].data = data;
    if (indirect)
        vq->split.desc_state[head].indir_desc = desc;
    else
        vq->split.desc_state[head].indir_desc = ctx;

    /* Put entry in available array (but don't update avail->idx until they
     * do sync). */
    avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
    vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

    /* Descriptors and available array need to be set before we expose the
     * new available array entries. */
    virtio_wmb(vq->weak_barriers);
    vq->split.avail_idx_shadow++;
    vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
            vq->split.avail_idx_shadow);
    vq->num_added++;
}
On the qemu side, the back end picks up these queue addresses and stores them into its own VirtQueue structure in virtio_queue_set_addr:
void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr)
{
    if (!vdev->vq[n].vring.num) {
        return;
    }
    vdev->vq[n].vring.desc = addr;
    virtio_queue_update_rings(vdev, n);
}
Reference: https://blog.csdn.net/qq_33588730/article/details/105397879