dpdk例子中提供了兩種方法與linux kernel協議棧交互: TAP和KNI; 兩種方法都是創建虛擬設備用於收發報文;
TAP/TUN設備的創建
static int tap_create(char *name) { struct ifreq ifr; int fd, ret; fd = open("/dev/net/tun", O_RDWR); if (fd < 0) return fd; memset(&ifr, 0, sizeof(ifr)); /* TAP device without packet information */ ifr.ifr_flags = IFF_TAP | IFF_NO_PI; if (name && *name) rte_snprintf(ifr.ifr_name, IFNAMSIZ, name); ret = ioctl(fd, TUNSETIFF, (void *) &ifr); if (ret < 0) { close(fd); return ret; } if (name) rte_snprintf(name, IFNAMSIZ, ifr.ifr_name); return fd;
發送報文到TAP設備
for (;;) { struct rte_mbuf *pkts_burst[PKT_BURST_SZ]; unsigned i; /* 從PMD收包 */ const unsigned nb_rx = rte_eth_rx_burst(port_ids[lcore_id], 0, pkts_burst, PKT_BURST_SZ); core_stats[lcore_id].rx += nb_rx; for (i = 0; likely(i < nb_rx); i++) { struct rte_mbuf *m = pkts_burst[i]; /* 把收到的報文內容寫到TAP設備 */ /* Ignore return val from write() */ int ret = write(tap_fd, rte_pktmbuf_mtod(m, void*), rte_pktmbuf_data_len(m)); /* mbuf free */ rte_pktmbuf_free(m); if (unlikely(ret < 0)) lcore_stats[lcore_id].dropped++; else lcore_stats[lcore_id].tx++; } }
報文發送到kernel中后在TAP/TUN設備中需要通過橋接/路由進行L2/L3轉發后,到另外一個TAP/TUN設備供應用程序讀取
for (;;) { int ret; struct rte_mbuf *m = rte_pktmbuf_alloc(pktmbuf_pool); if (m == NULL) continue; /* 從TAP設備讀報文內容 */ ret = read(tap_fd, m->pkt.data, MAX_PACKET_SZ); lcore_stats[lcore_id].rx++; if (unlikely(ret < 0)) { FATAL_ERROR("Reading from %s interface failed", tap_name); } /* 轉換為mbuf */ m->pkt.nb_segs = 1; m->pkt.next = NULL; m->pkt.pkt_len = (uint16_t)ret; m->pkt.data_len = (uint16_t)ret; /* 發送報文 */ ret = rte_eth_tx_burst(port_ids[lcore_id], 0, &m, 1); if (unlikely(ret < 1)) { rte_pktmbuf_free(m); lcore_stats[lcore_id].dropped++; } else { lcore_stats[lcore_id].tx++; } }
這種方法比較簡單.但是這個方法的效率可能比較低,數據需要從用戶空間復制到內核空間,最后生成skb的時候還需要復制一次? 具體TAP/TUN的內核代碼后面有需要再進一步學習吧;
KNI的實現, example實現了以下功能
RX方向: PMD分配mbuf, 收包線程收到后把mbuf放入rx_q FIFO, kni線程從rx_q取出mbuf然后轉換為skb調用netif_rx把報文發送到協議棧中; 最后rx_q中取出來的mbuf放入free_q中, 由用戶空間的收包線程釋放;
TX方向: 從協議棧發到kni設備上的報文,kni的發包函數kni_net_tx從alloc_q中取可用的mbuf, 把skb轉換為mbuf, 並把mbuf放入tx_q中, 用戶空間的tx線程從tx_q取出mbuf並調用PMD驅動的發包函數發送報文;
/* Initialise each port */ for (port = 0; port < nb_sys_ports; port++) { /* Skip ports that are not enabled */ if (!(ports_mask & (1 << port))) continue; /* 初始化端口的收發包隊列並啟用 */ init_port(port); if (port >= RTE_MAX_ETHPORTS) rte_exit(EXIT_FAILURE, "Can not use more than " "%d ports for kni\n", RTE_MAX_ETHPORTS); /* 為每個物理端口分配一個KNI設備 */ kni_alloc(port); }
static int kni_alloc(uint8_t port_id) { uint8_t i; struct rte_kni *kni; struct rte_kni_conf conf; struct kni_port_params **params = kni_port_params_array; if (port_id >= RTE_MAX_ETHPORTS || !params[port_id]) return -1; /* 根據該端口的內核線程數決定kni設備數 */ /* 多線程模式 如果沒有指定內核線程數則每個port創建一個kni設備且不指定內核線程對應的lcore 如果指定了內核線程個數則根據內核線程個數為每個port創建相應個數的kni設備並指定內核線程對應的lcore 單線程模式 如果沒有指定內核線程數則每個port創建一個kni設備 如果指定了內核線程個數則根據內核線程個數為每個port創建相應個數的kni設備 */ params[port_id]->nb_kni = params[port_id]->nb_lcore_k ? params[port_id]->nb_lcore_k : 1; for (i = 0; i < params[port_id]->nb_kni; i++) { /* Clear conf at first */ memset(&conf, 0, sizeof(conf)); if (params[port_id]->nb_lcore_k) { rte_snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%u_%u", port_id, i);
/* 多線程模式強制綁定內核線程到某個lcore */ conf.core_id = params[port_id]->lcore_k[i]; conf.force_bind = 1; } else rte_snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%u", port_id); conf.group_id = (uint16_t)port_id; conf.mbuf_size = MAX_PACKET_SZ; /* 創建kni設備 */ /* * The first KNI device associated to a port * is the master, for multiple kernel thread * environment. */ if (i == 0) { struct rte_kni_ops ops; struct rte_eth_dev_info dev_info; memset(&dev_info, 0, sizeof(dev_info)); rte_eth_dev_info_get(port_id, &dev_info); conf.addr = dev_info.pci_dev->addr; conf.id = dev_info.pci_dev->id; memset(&ops, 0, sizeof(ops)); ops.port_id = port_id; ops.change_mtu = kni_change_mtu; ops.config_network_if = kni_config_network_interface; kni = rte_kni_alloc(pktmbuf_pool, &conf, &ops); } else kni = rte_kni_alloc(pktmbuf_pool, &conf, NULL); if (!kni) rte_exit(EXIT_FAILURE, "Fail to create kni for " "port: %d\n", port_id); params[port_id]->kni[i] = kni; } return 0; }
struct rte_kni * rte_kni_alloc(struct rte_mempool *pktmbuf_pool, const struct rte_kni_conf *conf, struct rte_kni_ops *ops) { int ret; struct rte_kni_device_info dev_info; struct rte_kni *ctx; char intf_name[RTE_KNI_NAMESIZE]; #define OBJNAMSIZ 32 char obj_name[OBJNAMSIZ]; char mz_name[RTE_MEMZONE_NAMESIZE]; const struct rte_memzone *mz; if (!pktmbuf_pool || !conf || !conf->name[0]) return NULL; /* 通過/dev/kni發送創建kni設備請求 */ /* Check FD and open once */ if (kni_fd < 0) { kni_fd = open("/dev/" KNI_DEVICE, O_RDWR); if (kni_fd < 0) { RTE_LOG(ERR, KNI, "Can not open /dev/%s\n", KNI_DEVICE); return NULL; } } /* vEthx_x (port_thread)或者vEthx(port) */ rte_snprintf(intf_name, RTE_KNI_NAMESIZE, conf->name); rte_snprintf(mz_name, RTE_MEMZONE_NAMESIZE, "KNI_INFO_%s", intf_name); /* rte_kni */ mz = kni_memzone_reserve(mz_name, sizeof(struct rte_kni), SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx = mz->addr; if (ctx->in_use) { RTE_LOG(ERR, KNI, "KNI %s is in use\n", ctx->name); goto fail; } memset(ctx, 0, sizeof(struct rte_kni)); if (ops) memcpy(&ctx->ops, ops, sizeof(struct rte_kni_ops)); memset(&dev_info, 0, sizeof(dev_info)); dev_info.bus = conf->addr.bus; dev_info.devid = conf->addr.devid; dev_info.function = conf->addr.function; dev_info.vendor_id = conf->id.vendor_id; dev_info.device_id = conf->id.device_id; dev_info.core_id = conf->core_id; dev_info.force_bind = conf->force_bind; dev_info.group_id = conf->group_id; dev_info.mbuf_size = conf->mbuf_size; rte_snprintf(ctx->name, RTE_KNI_NAMESIZE, intf_name); rte_snprintf(dev_info.name, RTE_KNI_NAMESIZE, intf_name); RTE_LOG(INFO, KNI, "pci: %02x:%02x:%02x \t %02x:%02x\n", dev_info.bus, dev_info.devid, dev_info.function, dev_info.vendor_id, dev_info.device_id); /* 初始化7個FIFO,分別用於TX RX ALLOC FREE REQ RESP SYNC */ /* TX RING */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_tx_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->tx_q = mz->addr; kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX); dev_info.tx_phys = mz->phys_addr; /* RX RING */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_rx_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->rx_q = mz->addr; kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX); dev_info.rx_phys = mz->phys_addr; /* ALLOC RING */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_alloc_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->alloc_q = mz->addr; kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX); dev_info.alloc_phys = mz->phys_addr; /* FREE RING */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_free_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->free_q = mz->addr; kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX); dev_info.free_phys = mz->phys_addr; /* Request RING */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_req_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->req_q = mz->addr; kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX); dev_info.req_phys = mz->phys_addr; /* Response RING */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_resp_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->resp_q = mz->addr; kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX); dev_info.resp_phys = mz->phys_addr; /* Req/Resp sync mem area */ rte_snprintf(obj_name, OBJNAMSIZ, "kni_sync_%s", intf_name); mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); KNI_MZ_CHECK(mz == NULL); ctx->sync_addr = mz->addr; dev_info.sync_va = mz->addr; dev_info.sync_phys = mz->phys_addr; /* MBUF mempool */ rte_snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_OBJ_NAME, pktmbuf_pool->name); mz = rte_memzone_lookup(mz_name); KNI_MZ_CHECK(mz == NULL); /* 記錄mbuf pool的虛擬地址和物理地址,用於內核線程計算偏移 */ dev_info.mbuf_va = mz->addr; dev_info.mbuf_phys = mz->phys_addr; ctx->pktmbuf_pool = pktmbuf_pool; ctx->group_id = conf->group_id; ctx->mbuf_size = conf->mbuf_size; /* 發送創建kni設備請求 */ ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); KNI_MZ_CHECK(ret < 0); ctx->in_use = 1; return ctx; fail: return NULL; }
kernel的kni設備收到ioctl后調用kni_ioctl_create創建kni設備
static int kni_ioctl_create(unsigned int ioctl_num, unsigned long ioctl_param) { int ret; struct rte_kni_device_info dev_info; struct pci_dev *pci = NULL; struct pci_dev *found_pci = NULL; struct net_device *net_dev = NULL; struct net_device *lad_dev = NULL; struct kni_dev *kni, *dev, *n; printk(KERN_INFO "KNI: Creating kni...\n"); /* Check the buffer size, to avoid warning */ if (_IOC_SIZE(ioctl_num) > sizeof(dev_info)) return -EINVAL; /* Copy kni info from user space */ ret = copy_from_user(&dev_info, (void *)ioctl_param, sizeof(dev_info)); if (ret) { KNI_ERR("copy_from_user in kni_ioctl_create"); return -EIO; } /** * Check if the cpu core id is valid for binding, * for multiple kernel thread mode. */ if (multiple_kthread_on && dev_info.force_bind && !cpu_online(dev_info.core_id)) { KNI_ERR("cpu %u is not online\n", dev_info.core_id); return -EINVAL; } /* 遍歷kni設備鏈表通過名字比較是否已經創建 */ /* Check if it has been created */ down_read(&kni_list_lock); list_for_each_entry_safe(dev, n, &kni_list_head, list) { if (kni_check_param(dev, &dev_info) < 0) { up_read(&kni_list_lock); return -EINVAL; } } up_read(&kni_list_lock); /* 虛擬設備創建 */ net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name, kni_net_init); if (net_dev == NULL) { KNI_ERR("error allocating device \"%s\"\n", dev_info.name); return -EBUSY; } kni = netdev_priv(net_dev);
/* 參數保存在priv中 */ kni->net_dev = net_dev; kni->group_id = dev_info.group_id; kni->core_id = dev_info.core_id; strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE); /* Translate user space info into kernel space info */ kni->tx_q = phys_to_virt(dev_info.tx_phys); kni->rx_q = phys_to_virt(dev_info.rx_phys); kni->alloc_q = phys_to_virt(dev_info.alloc_phys); kni->free_q = phys_to_virt(dev_info.free_phys); kni->req_q = phys_to_virt(dev_info.req_phys); kni->resp_q = phys_to_virt(dev_info.resp_phys); kni->sync_va = dev_info.sync_va; kni->sync_kva = phys_to_virt(dev_info.sync_phys); kni->mbuf_kva = phys_to_virt(dev_info.mbuf_phys); kni->mbuf_va = dev_info.mbuf_va; #ifdef RTE_KNI_VHOST kni->vhost_queue = NULL; kni->vq_status = BE_STOP; #endif kni->mbuf_size = dev_info.mbuf_size; KNI_PRINT("tx_phys: 0x%016llx, tx_q addr: 0x%p\n", (unsigned long long) dev_info.tx_phys, kni->tx_q); KNI_PRINT("rx_phys: 0x%016llx, rx_q addr: 0x%p\n", (unsigned long long) dev_info.rx_phys, kni->rx_q); KNI_PRINT("alloc_phys: 0x%016llx, alloc_q addr: 0x%p\n", (unsigned long long) dev_info.alloc_phys, kni->alloc_q); KNI_PRINT("free_phys: 0x%016llx, free_q addr: 0x%p\n", (unsigned long long) dev_info.free_phys, kni->free_q); KNI_PRINT("req_phys: 0x%016llx, req_q addr: 0x%p\n", (unsigned long long) dev_info.req_phys, kni->req_q); KNI_PRINT("resp_phys: 0x%016llx, resp_q addr: 0x%p\n", (unsigned long long) dev_info.resp_phys, kni->resp_q); KNI_PRINT("mbuf_phys: 0x%016llx, mbuf_kva: 0x%p\n", (unsigned long long) dev_info.mbuf_phys, kni->mbuf_kva); KNI_PRINT("mbuf_va: 0x%p\n", dev_info.mbuf_va); KNI_PRINT("mbuf_size: %u\n", kni->mbuf_size); KNI_DBG("PCI: %02x:%02x.%02x %04x:%04x\n", dev_info.bus, dev_info.devid, dev_info.function, dev_info.vendor_id, dev_info.device_id); pci = pci_get_device(dev_info.vendor_id, dev_info.device_id, NULL); /* Support Ethtool */ while (pci) { KNI_PRINT("pci_bus: %02x:%02x:%02x \n", pci->bus->number, PCI_SLOT(pci->devfn), PCI_FUNC(pci->devfn)); if ((pci->bus->number == dev_info.bus) && (PCI_SLOT(pci->devfn) == dev_info.devid) && (PCI_FUNC(pci->devfn) == dev_info.function)) { found_pci = pci; switch (dev_info.device_id) { #define RTE_PCI_DEV_ID_DECL_IGB(vend, dev) case (dev): #include <rte_pci_dev_ids.h> ret = igb_kni_probe(found_pci, &lad_dev); break; #define RTE_PCI_DEV_ID_DECL_IXGBE(vend, dev) \ case (dev): #include <rte_pci_dev_ids.h> ret = ixgbe_kni_probe(found_pci, &lad_dev); break; default: ret = -1; break; } KNI_DBG("PCI found: pci=0x%p, lad_dev=0x%p\n", pci, lad_dev); if (ret == 0) { kni->lad_dev = lad_dev; kni_set_ethtool_ops(kni->net_dev); } else { KNI_ERR("Device not supported by ethtool"); kni->lad_dev = NULL; } kni->pci_dev = found_pci; kni->device_id = dev_info.device_id; break; } pci = pci_get_device(dev_info.vendor_id, dev_info.device_id, pci); } if (pci) pci_dev_put(pci); /* 注冊虛擬設備 */ ret = register_netdev(net_dev); if (ret) { KNI_ERR("error %i registering device \"%s\"\n", ret, dev_info.name); kni_dev_remove(kni); return -ENODEV; } #ifdef RTE_KNI_VHOST kni_vhost_init(kni); #endif /** * Create a new kernel thread for multiple mode, set its core affinity, * and finally wake it up. */ if (multiple_kthread_on) { /* 多線程模式為每個kni設備創建一個內核線程 */ kni->pthread = kthread_create(kni_thread_multiple, (void *)kni, "kni_%s", kni->name); if (IS_ERR(kni->pthread)) { kni_dev_remove(kni); return -ECANCELED; } /* 綁定內核線程到對應的lcore */ if (dev_info.force_bind) kthread_bind(kni->pthread, kni->core_id); /* 喚醒該收包線程 */ wake_up_process(kni->pthread); } down_write(&kni_list_lock); list_add(&kni->list, &kni_list_head); up_write(&kni_list_lock); return 0; }
回到主線程的收發包
if (flag == LCORE_RX) { RTE_LOG(INFO, APP, "Lcore %u is reading from port %d\n", kni_port_params_array[i]->lcore_rx, kni_port_params_array[i]->port_id); while (1) { f_stop = rte_atomic32_read(&kni_stop); if (f_stop) break; kni_ingress(kni_port_params_array[i]); } } else if (flag == LCORE_TX) { RTE_LOG(INFO, APP, "Lcore %u is writing to port %d\n", kni_port_params_array[i]->lcore_tx, kni_port_params_array[i]->port_id); while (1) { f_stop = rte_atomic32_read(&kni_stop); if (f_stop) break; kni_egress(kni_port_params_array[i]); } }
收包線程從PMD中收包並發送給kni,然后檢查是否有需要處理的請求消息
static void kni_ingress(struct kni_port_params *p) { uint8_t i, port_id; unsigned nb_rx, num; uint32_t nb_kni; struct rte_mbuf *pkts_burst[PKT_BURST_SZ]; if (p == NULL) return; nb_kni = p->nb_kni; port_id = p->port_id; for (i = 0; i < nb_kni; i++) {
/* 從PMD驅動中收包 */
/* Burst rx from eth */ nb_rx = rte_eth_rx_burst(port_id, 0, pkts_burst, PKT_BURST_SZ); if (unlikely(nb_rx > PKT_BURST_SZ)) { RTE_LOG(ERR, APP, "Error receiving from eth\n"); return; }
/* 把所有收到的mbuf都存入rx_q中 */
/* Burst tx to kni */ num = rte_kni_tx_burst(p->kni[i], pkts_burst, nb_rx); kni_stats[port_id].rx_packets += num;
/* 處理端口的狀態變化 */ rte_kni_handle_request(p->kni[i]); if (unlikely(num < nb_rx)) { /* Free mbufs not tx to kni interface */ kni_burst_free_mbufs(&pkts_burst[num], nb_rx - num); kni_stats[port_id].rx_dropped += nb_rx - num; } } }
unsigned rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num) { /* mbuf放入tx_q */ unsigned ret = kni_fifo_put(kni->rx_q, (void **)mbufs, num); /* Get mbufs from free_q and then free them */ kni_free_mbufs(kni); return ret; } static void kni_free_mbufs(struct rte_kni *kni) { int i, ret; struct rte_mbuf *pkts[MAX_MBUF_BURST_NUM]; /* 從free_q中取mbuf並釋放 */ ret = kni_fifo_get(kni->free_q, (void **)pkts, MAX_MBUF_BURST_NUM); if (likely(ret > 0)) { for (i = 0; i < ret; i++) rte_pktmbuf_free(pkts[i]); } }
這里由於生產者和消費者都只有一個,因此FIFO中並未用到互斥/同步機制
/** * Adds num elements into the fifo. Return the number actually written */ static inline unsigned kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num) { unsigned i = 0; /* 把write和read復制到局部變量中 */ unsigned fifo_write = fifo->write; unsigned fifo_read = fifo->read; unsigned new_write = fifo_write; for (i = 0; i < num; i++) { new_write = (new_write + 1) & (fifo->len - 1); /* 空間已滿 */ if (new_write == fifo_read) break; /* 存入數據 */ fifo->buffer[fifo_write] = data[i]; fifo_write = new_write; } /* 更新FIFO的write */ fifo->write = fifo_write; return i; } /** * Get up to num elements from the fifo. Return the number actully read */ static inline unsigned kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num) { unsigned i = 0; /* 把write和read復制到局部變量中 */ unsigned new_read = fifo->read; unsigned fifo_write = fifo->write; for (i = 0; i < num; i++) { /* 沒有數據需要讀取 */ if (new_read == fifo_write) break; /* 讀取數據 */ data[i] = fifo->buffer[new_read]; new_read = (new_read + 1) & (fifo->len - 1); } /* 更新FIFO的read */ fifo->read = new_read; return i; }
發包線程
/** * Interface to dequeue mbufs from tx_q and burst tx */ static void kni_egress(struct kni_port_params *p) { uint8_t i, port_id; unsigned nb_tx, num; uint32_t nb_kni; struct rte_mbuf *pkts_burst[PKT_BURST_SZ]; if (p == NULL) return; nb_kni = p->nb_kni; port_id = p->port_id; for (i = 0; i < nb_kni; i++) { /* 從kni設備收包 */ /* Burst rx from kni */ num = rte_kni_rx_burst(p->kni[i], pkts_burst, PKT_BURST_SZ); if (unlikely(num > PKT_BURST_SZ)) { RTE_LOG(ERR, APP, "Error receiving from KNI\n"); return; } /* 發送給PMD發包 */ /* Burst tx to eth */ nb_tx = rte_eth_tx_burst(port_id, 0, pkts_burst, (uint16_t)num); kni_stats[port_id].tx_packets += nb_tx; if (unlikely(nb_tx < num)) { /* Free mbufs not tx to NIC */ kni_burst_free_mbufs(&pkts_burst[nb_tx], num - nb_tx); kni_stats[port_id].tx_dropped += num - nb_tx; } } }
unsigned rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num) { /* 從tx_q中取出mbuf */ unsigned ret = kni_fifo_get(kni->tx_q, (void **)mbufs, num); /* 申請mbuf放入allc_q */ /* Allocate mbufs and then put them into alloc_q */ kni_allocate_mbufs(kni); return ret; } static void kni_allocate_mbufs(struct rte_kni *kni) { int i, ret; struct rte_mbuf *pkts[MAX_MBUF_BURST_NUM]; /* Check if pktmbuf pool has been configured */ if (kni->pktmbuf_pool == NULL) { RTE_LOG(ERR, KNI, "No valid mempool for allocating mbufs\n"); return; } /* 每次申請MAX_MBUF_BURST_NUM個mbuf */ for (i = 0; i < MAX_MBUF_BURST_NUM; i++) { pkts[i] = rte_pktmbuf_alloc(kni->pktmbuf_pool); if (unlikely(pkts[i] == NULL)) { /* Out of memory */ RTE_LOG(ERR, KNI, "Out of memory\n"); break; } } /* No pkt mbuf alocated */ if (i <= 0) return; /* 放入allc_q */ ret = kni_fifo_put(kni->alloc_q, (void **)pkts, i); /* 隊列已滿 回收未入隊的mbuf */ /* Check if any mbufs not put into alloc_q, and then free them */ if (ret >= 0 && ret < i && ret < MAX_MBUF_BURST_NUM) { int j; for (j = ret; j < i; j++) rte_pktmbuf_free(pkts[j]); } }
下面看下內核的kni設備的收發包函數
對於kni設備的收包函數,單線程模式下,打開設備的時候會啟動kni_thread_single線程; 多線程模式下,創建kni設備的時候會啟動kni_thread_multiple線程;
static int kni_thread_single(void *unused) { int j; struct kni_dev *dev, *n; while (!kthread_should_stop()) { down_read(&kni_list_lock); for (j = 0; j < KNI_RX_LOOP_NUM; j++) { /* 單線程模式下遍歷所有kni設備 */ list_for_each_entry_safe(dev, n, &kni_list_head, list) { #ifdef RTE_KNI_VHOST kni_chk_vhost_rx(dev); #else /* 從rx_q中收包 */ kni_net_rx(dev); #endif /* 用戶空間對request的響應 */ kni_net_poll_resp(dev); } } up_read(&kni_list_lock); /* reschedule out for a while */ schedule_timeout_interruptible(usecs_to_jiffies( \ KNI_KTHREAD_RESCHEDULE_INTERVAL)); } return 0; }
收包函數, 沒有配置lo模式的時候就是kni_net_rx_normal
static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal; /* rx interface */ void kni_net_rx(struct kni_dev *kni) { /** * It doesn't need to check if it is NULL pointer, * as it has a default value */ (*kni_net_rx_func)(kni); }
/* * RX: normal working mode */ static void kni_net_rx_normal(struct kni_dev *kni) { unsigned ret; uint32_t len; unsigned i, num, num_rq, num_fq; struct rte_kni_mbuf *kva; struct rte_kni_mbuf *va[MBUF_BURST_SZ]; void * data_kva; struct sk_buff *skb; struct net_device *dev = kni->net_dev; /* 每次收包的個數必須為rx_q和free_q的最小值且不超過MBUF_BURST_SZ */ /* Get the number of entries in rx_q */ num_rq = kni_fifo_count(kni->rx_q); /* Get the number of free entries in free_q */ num_fq = kni_fifo_free_count(kni->free_q); /* Calculate the number of entries to dequeue in rx_q */ num = min(num_rq, num_fq); num = min(num, (unsigned)MBUF_BURST_SZ); /* Return if no entry in rx_q and no free entry in free_q */ if (num == 0) return; /* Burst dequeue from rx_q */ ret = kni_fifo_get(kni->rx_q, (void **)va, num); if (ret == 0) return; /* Failing should not happen */ /* mbuf轉換為skb */ /* Transfer received packets to netif */ for (i = 0; i < num; i++) { /* mbuf kva */ kva = (void *)va[i] - kni->mbuf_va + kni->mbuf_kva; len = kva->data_len; /* data kva */ data_kva = kva->data - kni->mbuf_va + kni->mbuf_kva; skb = dev_alloc_skb(len + 2); if (!skb) { KNI_ERR("Out of mem, dropping pkts\n"); /* Update statistics */ kni->stats.rx_dropped++; } else { /* Align IP on 16B boundary */ skb_reserve(skb, 2); memcpy(skb_put(skb, len), data_kva, len); skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); skb->ip_summed = CHECKSUM_UNNECESSARY; /* 發送skb到協議棧 */ /* Call netif interface */ netif_receive_skb(skb); /* Update statistics */ kni->stats.rx_bytes += len; kni->stats.rx_packets++; } } /* 通知用戶空間釋放mbuf */ /* Burst enqueue mbufs into free_q */ ret = kni_fifo_put(kni->free_q, (void **)va, num); if (ret != num) /* Failing should not happen */ KNI_ERR("Fail to enqueue entries into free_q\n"); }
kni發包函數
static int kni_net_tx(struct sk_buff *skb, struct net_device *dev) { int len = 0; unsigned ret; struct kni_dev *kni = netdev_priv(dev); struct rte_kni_mbuf *pkt_kva = NULL; struct rte_kni_mbuf *pkt_va = NULL; dev->trans_start = jiffies; /* save the timestamp */ /* Check if the length of skb is less than mbuf size */ if (skb->len > kni->mbuf_size) goto drop; /** * Check if it has at least one free entry in tx_q and * one entry in alloc_q. */ if (kni_fifo_free_count(kni->tx_q) == 0 || kni_fifo_count(kni->alloc_q) == 0) { /** * If no free entry in tx_q or no entry in alloc_q, * drops skb and goes out. */ goto drop; } /* skb轉mbuf */ /* dequeue a mbuf from alloc_q */ ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1); if (likely(ret == 1)) { void *data_kva; pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva; data_kva = pkt_kva->data - kni->mbuf_va + kni->mbuf_kva; len = skb->len; memcpy(data_kva, skb->data, len); if (unlikely(len < ETH_ZLEN)) { memset(data_kva + len, 0, ETH_ZLEN - len); len = ETH_ZLEN; } pkt_kva->pkt_len = len; pkt_kva->data_len = len; /* enqueue mbuf into tx_q */ ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1); if (unlikely(ret != 1)) { /* Failing should not happen */ KNI_ERR("Fail to enqueue mbuf into tx_q\n"); goto drop; } } else { /* Failing should not happen */ KNI_ERR("Fail to dequeue mbuf from alloc_q\n"); goto drop; } /* Free skb and update statistics */ dev_kfree_skb(skb); kni->stats.tx_bytes += len; kni->stats.tx_packets++; return NETDEV_TX_OK; drop: /* Free skb and update statistics */ dev_kfree_skb(skb); kni->stats.tx_dropped++; return NETDEV_TX_OK; }