Figure 1. KNI architecture diagram
As the diagram shows, KNI requires support from a kernel module, rte_kni.ko.
When the rte_kni module is loaded, the /dev/kni device node comes into being (the module registers a kni misc device; the filesystem node /dev/kni is created manually or through the udev mechanism). Through this node, a DPDK KNI application controls and communicates with the kernel rte_kni module.
When loading the rte_kni kernel module, several optional parameters can be specified to control its behavior:
# modinfo rte_kni.ko
lo_mode: KNI loopback mode (default=lo_mode_none):
lo_mode_none Kernel loopback disabled
lo_mode_fifo Enable kernel loopback with fifo
lo_mode_fifo_skb Enable kernel loopback with fifo and skb buffer
kthread_mode: Kernel thread mode (default=single):
single Single kernel thread mode enabled.
multiple Multiple kernel thread mode enabled.
carrier: Default carrier state for KNI interface (default=off):
off Interfaces will be created with carrier state set to off.
on Interfaces will be created with carrier state set to on.
Typically, rte_kni is loaded without any parameters, and the DPDK application can then receive packets from and send packets to the kernel network stack. Loading with no parameters means that a single kernel thread handles the kernel-side packet receive for all KNI virtual devices, loopback mode is disabled, and the default carrier state of KNI interfaces is off.
Loopback mode
For testing purposes, the lo_mode parameter can be specified when loading the rte_kni module:
# insmod kmod/rte_kni.ko lo_mode=lo_mode_fifo
The lo_mode_fifo loopback mode operates on the FIFO ring queues in kernel space: kni_fifo_get(kni->rx_q, ...) reads packets from the rx_q receive queue and kni_fifo_put(kni->tx_q, ...) writes them into the tx_q transmit queue, implementing the loopback.
# insmod kmod/rte_kni.ko lo_mode=lo_mode_fifo_skb
The lo_mode_fifo_skb loopback mode adds sk_buff copy operations on top of lo_mode_fifo. Specifically, data dequeued from rx_q is copied into a freshly allocated receive skb; a transmit skb is then allocated, the data received from rx_q is copied into it, and kni_net_tx(skb, dev) is called to send the skb. Finally the packet data is copied back into an mbuf and enqueued into tx_q with kni_fifo_put. This loopback mode is therefore much closer to the real usage path.
If the lo_mode parameter is not specified, loopback is disabled.
Default carrier state
The carrier state of the KNI virtual interfaces created by the rte_kni kernel module is controlled by the carrier option given at module load time.
If carrier=off is specified, the kernel module leaves the interface's carrier state off when the interface is administratively brought up. The DPDK application can then set the carrier state of the KNI virtual interface with rte_kni_update_link. This is useful for applications that need the KNI interface state to track the actual state of the corresponding physical interface (a sketch follows the examples below).
If carrier=on is specified, the kernel module automatically sets the carrier state to on when the interface is brought up. This is useful for applications that use the KNI interface purely as a virtual interface not backed by any physical hardware, or that do not want to set the link state explicitly via rte_kni_update_link. It is also useful for loopback-mode testing where the physical port is not connected to any link.
The following sets the default carrier state to on:
# insmod kmod/rte_kni.ko carrier=on
The following sets the default carrier state to off:
# insmod kmod/rte_kni.ko carrier=off
If the carrier parameter is not specified, the default carrier state of KNI virtual interfaces is off.
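For the carrier=off case, a minimal sketch (not from the original text; the helper name is hypothetical and the caller is assumed to own valid kni and port_id handles) of mirroring the physical port state onto the KNI interface with rte_kni_update_link:

#include <rte_ethdev.h>
#include <rte_kni.h>

static void
sync_kni_link(struct rte_kni *kni, uint16_t port_id)
{
    struct rte_eth_link link;

    /* read the current state of the physical port without waiting */
    rte_eth_link_get_nowait(port_id, &link);
    /* propagate it to the kernel-side KNI interface carrier state */
    rte_kni_update_link(kni, link.link_status);
}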
Before any KNI virtual interface is created, the rte_kni kernel module must be loaded into the kernel and configured via the rte_kni_init function (which obtains a file descriptor for the /dev/kni device node):
int
rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
{
    /* Check FD and open */
    if (kni_fd < 0) {
        kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
        if (kni_fd < 0) {
            RTE_LOG(ERR, KNI, "Can not open /dev/%s\n", KNI_DEVICE);
            return -1;
        }
    }

    return 0;
}
The module init function kni_init is also quite simple. Besides parsing the parameters described above, the important work is registering the misc device and configuring lo_mode.
static int __init
kni_init(void)
{
    int rc;

    if (kni_parse_kthread_mode() < 0) {
        pr_err("Invalid parameter for kthread_mode\n");
        return -EINVAL;
    }

    if (multiple_kthread_on == 0)
        pr_debug("Single kernel thread for all KNI devices\n");
    else
        pr_debug("Multiple kernel thread mode enabled\n");

    if (kni_parse_carrier_state() < 0) { /* carrier can be off or on, default off */
        pr_err("Invalid parameter for carrier\n");
        return -EINVAL;
    }

    if (kni_dflt_carrier == 0)
        pr_debug("Default carrier state set to off.\n");
    else
        pr_debug("Default carrier state set to on.\n");

#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS
    rc = register_pernet_subsys(&kni_net_ops);
#else
    rc = register_pernet_gen_subsys(&kni_net_id, &kni_net_ops);
#endif
    if (rc)
        return -EPERM;

    rc = misc_register(&kni_misc);
    if (rc != 0) {
        pr_err("Misc registration failed\n");
        goto out;
    }

    /* Configure the lo mode according to the input parameter */
    kni_net_config_lo_mode(lo_mode);

    return 0;

out:
#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS
    unregister_pernet_subsys(&kni_net_ops);
#else
    unregister_pernet_gen_subsys(kni_net_id, &kni_net_ops);
#endif
    return rc;
}
kni_net_config_lo_mode:
void
kni_net_config_lo_mode(char *lo_str)
{
    if (!lo_str) {
        pr_debug("loopback disabled");
        return;
    }

    if (!strcmp(lo_str, "lo_mode_none"))
        pr_debug("loopback disabled");
    else if (!strcmp(lo_str, "lo_mode_fifo")) {
        pr_debug("loopback mode=lo_mode_fifo enabled");
        kni_net_rx_func = kni_net_rx_lo_fifo;
    } else if (!strcmp(lo_str, "lo_mode_fifo_skb")) {
        pr_debug("loopback mode=lo_mode_fifo_skb enabled");
        kni_net_rx_func = kni_net_rx_lo_fifo_skb;
    } else {
        pr_debug("Unknown loopback parameter, disabled");
    }
}
lo_mode can be set to lo_mode_none, lo_mode_fifo, or lo_mode_fifo_skb; the default is lo_mode_none. The two loopback modes are rarely used in real products.
Depending on lo_mode, the function pointer kni_net_rx_func is set to a different receive function; by default it points to kni_net_rx_normal.
Through register_pernet_subsys or register_pernet_gen_subsys, kni_net_ops is registered, which guarantees that kni_init_net is called to initialize the per-namespace state of every network namespace.
Once registered as a misc device, its behavior is determined by the registered miscdevice and its file operations:
static const struct file_operations kni_fops = {
    .owner = THIS_MODULE,
    /* kni_open ensures that a namespace can open kni only once; on open it
     * stores the per-namespace private data in file->private_data for later use */
    .open = kni_open,
    .release = kni_release,
    .unlocked_ioctl = (void *)kni_ioctl,
    .compat_ioctl = (void *)kni_compat_ioctl,
};

static struct miscdevice kni_misc = {
    .minor = MISC_DYNAMIC_MINOR,
    .name = KNI_DEVICE,
    .fops = &kni_fops,
};
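For reference, the open handler that the comment above describes looks roughly like this in kni_misc.c (abridged): it enforces a single opener per network namespace and stashes the namespace in file->private_data for the subsequent ioctls.

static int
kni_open(struct inode *inode, struct file *file)
{
    struct net *net = current->nsproxy->net_ns;
    struct kni_net *knet = net_generic(net, kni_net_id);

    /* kni device can be opened by one user only per netns */
    if (test_and_set_bit(KNI_DEV_IN_USE_BIT_NUM, &knet->device_in_use))
        return -EBUSY;

    file->private_data = get_net(net);
    pr_debug("/dev/kni opened\n");

    return 0;
}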
So how is the kni device used? Mainly through its ioctl interface:
static int
kni_ioctl(struct inode *inode, uint32_t ioctl_num, unsigned long ioctl_param)
{
    int ret = -EINVAL;
    struct net *net = current->nsproxy->net_ns;

    pr_debug("IOCTL num=0x%0x param=0x%0lx\n", ioctl_num, ioctl_param);

    /*
     * Switch according to the ioctl called
     */
    switch (_IOC_NR(ioctl_num)) {
    case _IOC_NR(RTE_KNI_IOCTL_TEST):
        /* For test only, not used */
        break;
    case _IOC_NR(RTE_KNI_IOCTL_CREATE):
        ret = kni_ioctl_create(net, ioctl_num, ioctl_param);
        break;
    case _IOC_NR(RTE_KNI_IOCTL_RELEASE):
        ret = kni_ioctl_release(net, ioctl_num, ioctl_param);
        break;
    default:
        pr_debug("IOCTL default\n");
        break;
    }

    return ret;
}
RTE_KNI_IOCTL_CREATE and RTE_KNI_IOCTL_RELEASE correspond to the user-space DPDK functions rte_kni_alloc and rte_kni_release respectively, i.e. allocating and releasing a KNI interface.
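Before diving into the library implementation, here is a hypothetical application-side sketch of how rte_kni_alloc is typically called; the helper name, the "vEth%u" naming scheme, and the parameter choices are illustrative assumptions, not from the original text:

#include <stdio.h>
#include <string.h>

#include <rte_ethdev.h>
#include <rte_kni.h>
#include <rte_mbuf.h>

/* Create one KNI interface associated with a physical port. Passing
 * ops = NULL means no MTU-change/link callbacks are registered. */
static struct rte_kni *
create_kni_iface(uint16_t port_id, struct rte_mempool *mbuf_pool)
{
    struct rte_kni_conf conf;

    memset(&conf, 0, sizeof(conf));
    snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%u", port_id);
    conf.group_id = port_id;
    conf.mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE;
    /* reuse the physical port's MAC so the kernel interface matches it */
    rte_eth_macaddr_get(port_id, (struct rte_ether_addr *)conf.mac_addr);

    /* rte_kni_init() must already have opened /dev/kni at this point */
    return rte_kni_alloc(mbuf_pool, &conf, NULL);
}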
rte_kni_alloc:
struct rte_kni *
rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
          const struct rte_kni_conf *conf,
          struct rte_kni_ops *ops)
{
    int ret;
    struct rte_kni_device_info dev_info;
    struct rte_kni *kni;
    struct rte_tailq_entry *te;
    struct rte_kni_list *kni_list;

    if (!pktmbuf_pool || !conf || !conf->name[0])
        return NULL;

    /* Check if KNI subsystem has been initialized */
    if (kni_fd < 0) {
        RTE_LOG(ERR, KNI, "KNI subsystem has not been initialized. Invoke rte_kni_init() first\n");
        return NULL;
    }

    rte_mcfg_tailq_write_lock();

    kni = __rte_kni_get(conf->name);
    if (kni != NULL) {
        RTE_LOG(ERR, KNI, "KNI already exists\n");
        goto unlock;
    }

    te = rte_zmalloc("KNI_TAILQ_ENTRY", sizeof(*te), 0);
    if (te == NULL) {
        RTE_LOG(ERR, KNI, "Failed to allocate tailq entry\n");
        goto unlock;
    }

    kni = rte_zmalloc("KNI", sizeof(struct rte_kni), RTE_CACHE_LINE_SIZE);
    if (kni == NULL) {
        RTE_LOG(ERR, KNI, "KNI memory allocation failed\n");
        goto kni_fail;
    }

    strlcpy(kni->name, conf->name, RTE_KNI_NAMESIZE);

    if (ops)
        memcpy(&kni->ops, ops, sizeof(struct rte_kni_ops));
    else
        kni->ops.port_id = UINT16_MAX;

    memset(&dev_info, 0, sizeof(dev_info));
    dev_info.core_id = conf->core_id;
    dev_info.force_bind = conf->force_bind;
    dev_info.group_id = conf->group_id;
    dev_info.mbuf_size = conf->mbuf_size;
    dev_info.mtu = conf->mtu;
    dev_info.min_mtu = conf->min_mtu;
    dev_info.max_mtu = conf->max_mtu;

    memcpy(dev_info.mac_addr, conf->mac_addr, RTE_ETHER_ADDR_LEN);

    strlcpy(dev_info.name, conf->name, RTE_KNI_NAMESIZE);

    ret = kni_reserve_mz(kni); /* reserve contiguous physical memory used as the rings below */
    if (ret < 0)
        goto mz_fail;

    /* TX RING */
    kni->tx_q = kni->m_tx_q->addr;
    kni_fifo_init(kni->tx_q, KNI_FIFO_COUNT_MAX);
    dev_info.tx_phys = kni->m_tx_q->phys_addr;

    /* RX RING */
    kni->rx_q = kni->m_rx_q->addr;
    kni_fifo_init(kni->rx_q, KNI_FIFO_COUNT_MAX);
    dev_info.rx_phys = kni->m_rx_q->phys_addr;

    /* ALLOC RING */
    kni->alloc_q = kni->m_alloc_q->addr;
    kni_fifo_init(kni->alloc_q, KNI_FIFO_COUNT_MAX);
    dev_info.alloc_phys = kni->m_alloc_q->phys_addr;

    /* FREE RING */
    kni->free_q = kni->m_free_q->addr;
    kni_fifo_init(kni->free_q, KNI_FIFO_COUNT_MAX);
    dev_info.free_phys = kni->m_free_q->phys_addr;

    /* Request RING */
    kni->req_q = kni->m_req_q->addr;
    kni_fifo_init(kni->req_q, KNI_FIFO_COUNT_MAX);
    dev_info.req_phys = kni->m_req_q->phys_addr;

    /* Response RING */
    kni->resp_q = kni->m_resp_q->addr;
    kni_fifo_init(kni->resp_q, KNI_FIFO_COUNT_MAX);
    dev_info.resp_phys = kni->m_resp_q->phys_addr;

    /* Req/Resp sync mem area */
    kni->sync_addr = kni->m_sync_addr->addr;
    dev_info.sync_va = kni->m_sync_addr->addr;
    dev_info.sync_phys = kni->m_sync_addr->phys_addr;

    kni->pktmbuf_pool = pktmbuf_pool;
    kni->group_id = conf->group_id;
    kni->mbuf_size = conf->mbuf_size;

    dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;

    ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); /* create the kernel-side virtual netdev via ioctl */
    if (ret < 0)
        goto ioctl_fail;

    te->data = kni;

    kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list);
    TAILQ_INSERT_TAIL(kni_list, te, next);

    rte_mcfg_tailq_write_unlock();

    /* Allocate mbufs and then put them into alloc_q */
    kni_allocate_mbufs(kni);

    return kni;

ioctl_fail:
    kni_release_mz(kni);
mz_fail:
    rte_free(kni);
kni_fail:
    rte_free(te);
unlock:
    rte_mcfg_tailq_write_unlock();

    return NULL;
}
Each KNI interface gets its own context structure, which is allocated and then tracked by the library:
/**
 * KNI context
 */
struct rte_kni {
    char name[RTE_KNI_NAMESIZE];        /**< KNI interface name */
    uint16_t group_id;                  /**< Group ID of KNI devices */
    uint32_t slot_id;                   /**< KNI pool slot ID */
    struct rte_mempool *pktmbuf_pool;   /**< pkt mbuf mempool */
    unsigned int mbuf_size;             /**< mbuf size */

    const struct rte_memzone *m_tx_q;   /**< TX queue memzone */
    const struct rte_memzone *m_rx_q;   /**< RX queue memzone */
    const struct rte_memzone *m_alloc_q;/**< Alloc queue memzone */
    const struct rte_memzone *m_free_q; /**< Free queue memzone */

    struct rte_kni_fifo *tx_q;          /**< TX queue */
    struct rte_kni_fifo *rx_q;          /**< RX queue */
    struct rte_kni_fifo *alloc_q;       /**< Allocated mbufs queue */
    struct rte_kni_fifo *free_q;        /**< To be freed mbufs queue */

    const struct rte_memzone *m_req_q;  /**< Request queue memzone */
    const struct rte_memzone *m_resp_q; /**< Response queue memzone */
    const struct rte_memzone *m_sync_addr;/**< Sync addr memzone */

    /* For request & response */
    struct rte_kni_fifo *req_q;         /**< Request queue */
    struct rte_kni_fifo *resp_q;        /**< Response queue */
    void *sync_addr;                    /**< Req/Resp Mem address */

    struct rte_kni_ops ops;             /**< operations for request */
};
Next, rte_kni_device_info is filled in from the rte_kni_conf information:
/*
 * Struct used to create a KNI device. Passed to the kernel in IOCTL call
 */
struct rte_kni_device_info {
    char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */

    phys_addr_t tx_phys;
    phys_addr_t rx_phys;
    phys_addr_t alloc_phys;
    phys_addr_t free_phys;

    /* Used by Ethtool */
    phys_addr_t req_phys;
    phys_addr_t resp_phys;
    phys_addr_t sync_phys;
    void *sync_va;

    /* mbuf mempool */
    void *mbuf_va;
    phys_addr_t mbuf_phys;

    uint16_t group_id;           /**< Group ID */
    uint32_t core_id;            /**< core ID to bind for kernel thread */

    __extension__
    uint8_t force_bind : 1;      /**< Flag for kernel thread binding */

    /* mbuf size */
    unsigned mbuf_size;
    unsigned int mtu;
    unsigned int min_mtu;
    unsigned int max_mtu;
    uint8_t mac_addr[6];
    uint8_t iova_mode;
};
Once the KNI resources are initialized, an ioctl is issued to create the corresponding virtual netdev in the kernel:
ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
The driver then executes kni_ioctl_create:
static int
kni_ioctl_create(struct net *net, uint32_t ioctl_num,
        unsigned long ioctl_param)
{
    struct kni_net *knet = net_generic(net, kni_net_id);
    int ret;
    struct rte_kni_device_info dev_info;
    struct net_device *net_dev = NULL;
    struct kni_dev *kni, *dev, *n;

    pr_info("Creating kni...\n");
    /* Check the buffer size, to avoid warning */
    if (_IOC_SIZE(ioctl_num) > sizeof(dev_info))
        return -EINVAL;

    /* Copy kni info from user space */
    if (copy_from_user(&dev_info, (void *)ioctl_param, sizeof(dev_info)))
        return -EFAULT;

    /* Check if name is zero-ended */
    if (strnlen(dev_info.name, sizeof(dev_info.name)) == sizeof(dev_info.name)) {
        pr_err("kni.name not zero-terminated");
        return -EINVAL;
    }

    /**
     * Check if the cpu core id is valid for binding.
     */
    if (dev_info.force_bind && !cpu_online(dev_info.core_id)) {
        pr_err("cpu %u is not online\n", dev_info.core_id);
        return -EINVAL;
    }

    /* Check if it has been created */
    down_read(&knet->kni_list_lock);
    list_for_each_entry_safe(dev, n, &knet->kni_list_head, list) {
        if (kni_check_param(dev, &dev_info) < 0) {
            up_read(&knet->kni_list_lock);
            return -EINVAL;
        }
    }
    up_read(&knet->kni_list_lock);

    net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
#ifdef NET_NAME_USER
                            NET_NAME_USER,
#endif
                            kni_net_init);
    if (net_dev == NULL) {
        pr_err("error allocating device \"%s\"\n", dev_info.name);
        return -EBUSY;
    }

    dev_net_set(net_dev, net);

    kni = netdev_priv(net_dev);

    kni->net_dev = net_dev;
    kni->core_id = dev_info.core_id;
    strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);

    /* Translate user space info into kernel space info */
    if (dev_info.iova_mode) {
#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
        kni->tx_q = iova_to_kva(current, dev_info.tx_phys);
        kni->rx_q = iova_to_kva(current, dev_info.rx_phys);
        kni->alloc_q = iova_to_kva(current, dev_info.alloc_phys);
        kni->free_q = iova_to_kva(current, dev_info.free_phys);

        kni->req_q = iova_to_kva(current, dev_info.req_phys);
        kni->resp_q = iova_to_kva(current, dev_info.resp_phys);
        kni->sync_va = dev_info.sync_va;
        kni->sync_kva = iova_to_kva(current, dev_info.sync_phys);
        kni->usr_tsk = current;
        kni->iova_mode = 1;
#else
        pr_err("KNI module does not support IOVA to VA translation\n");
        return -EINVAL;
#endif
    } else {

        kni->tx_q = phys_to_virt(dev_info.tx_phys);
        kni->rx_q = phys_to_virt(dev_info.rx_phys);
        kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
        kni->free_q = phys_to_virt(dev_info.free_phys);

        kni->req_q = phys_to_virt(dev_info.req_phys);
        kni->resp_q = phys_to_virt(dev_info.resp_phys);
        kni->sync_va = dev_info.sync_va;
        kni->sync_kva = phys_to_virt(dev_info.sync_phys);
        kni->iova_mode = 0;
    }

    kni->mbuf_size = dev_info.mbuf_size;

    pr_debug("tx_phys: 0x%016llx, tx_q addr: 0x%p\n",
        (unsigned long long) dev_info.tx_phys, kni->tx_q);
    pr_debug("rx_phys: 0x%016llx, rx_q addr: 0x%p\n",
        (unsigned long long) dev_info.rx_phys, kni->rx_q);
    pr_debug("alloc_phys: 0x%016llx, alloc_q addr: 0x%p\n",
        (unsigned long long) dev_info.alloc_phys, kni->alloc_q);
    pr_debug("free_phys: 0x%016llx, free_q addr: 0x%p\n",
        (unsigned long long) dev_info.free_phys, kni->free_q);
    pr_debug("req_phys: 0x%016llx, req_q addr: 0x%p\n",
        (unsigned long long) dev_info.req_phys, kni->req_q);
    pr_debug("resp_phys: 0x%016llx, resp_q addr: 0x%p\n",
        (unsigned long long) dev_info.resp_phys, kni->resp_q);
    pr_debug("mbuf_size: %u\n", kni->mbuf_size);

    /* if user has provided a valid mac address */
    if (is_valid_ether_addr(dev_info.mac_addr))
        memcpy(net_dev->dev_addr, dev_info.mac_addr, ETH_ALEN);
    else
        /*
         * Generate random mac address. eth_random_addr() is the
         * newer version of generating mac address in kernel.
         */
        random_ether_addr(net_dev->dev_addr);

    if (dev_info.mtu)
        net_dev->mtu = dev_info.mtu;
#ifdef HAVE_MAX_MTU_PARAM
    net_dev->max_mtu = net_dev->mtu;

    if (dev_info.min_mtu)
        net_dev->min_mtu = dev_info.min_mtu;

    if (dev_info.max_mtu)
        net_dev->max_mtu = dev_info.max_mtu;
#endif

    ret = register_netdev(net_dev); /* register the netdev */
    if (ret) {
        pr_err("error %i registering device \"%s\"\n",
            ret, dev_info.name);
        kni->net_dev = NULL;
        kni_dev_remove(kni);
        free_netdev(net_dev);
        return -ENODEV;
    }

    netif_carrier_off(net_dev);

    ret = kni_run_thread(knet, kni, dev_info.force_bind); /* start the kernel receive thread */
    if (ret != 0)
        return ret;

    down_write(&knet->kni_list_lock);
    list_add(&kni->list, &knet->kni_list_head);
    up_write(&knet->kni_list_lock);

    return 0;
}
phys_to_virt converts the physical addresses of the rings into kernel virtual addresses, so user space and kernel space operate on the same physical memory. This is what makes KNI zero-copy.
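Those rings are plain lockless single-producer/single-consumer FIFOs living in that shared memory. A simplified sketch of the structure and of an enqueue operation follows (the real DPDK kni_fifo_put adds memory barriers; this is illustrative, not the exact source):

/* Shared-memory FIFO underlying rx_q/tx_q/alloc_q/free_q: it holds mbuf
 * pointers and is accessed by both the DPDK process and the kernel module. */
struct rte_kni_fifo {
    volatile unsigned write;   /* next slot to be written */
    volatile unsigned read;    /* next slot to be read */
    unsigned len;              /* ring size, a power of two */
    unsigned elem_size;        /* pointer size, for 32/64-bit compatibility */
    void *volatile buffer[];   /* the mbuf pointers themselves */
};

/* Single-producer enqueue; returns the number of elements actually written. */
static inline unsigned
fifo_put_sketch(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
    unsigned i, w = fifo->write;

    for (i = 0; i < num; i++) {
        unsigned next = (w + 1) & (fifo->len - 1);
        if (next == fifo->read)   /* full: would overtake the reader */
            break;
        fifo->buffer[w] = data[i];
        w = next;
    }
    fifo->write = w;   /* publish the new write index to the consumer */
    return i;
}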
static int
kni_run_thread(struct kni_net *knet, struct kni_dev *kni, uint8_t force_bind)
{
    /**
     * Create a new kernel thread for multiple mode, set its core affinity,
     * and finally wake it up.
     */
    if (multiple_kthread_on) {
        kni->pthread = kthread_create(kni_thread_multiple,
            (void *)kni, "kni_%s", kni->name);
        if (IS_ERR(kni->pthread)) {
            kni_dev_remove(kni);
            return -ECANCELED;
        }

        if (force_bind)
            kthread_bind(kni->pthread, kni->core_id);
        wake_up_process(kni->pthread);
    } else {
        mutex_lock(&knet->kni_kthread_lock);

        if (knet->kni_kthread == NULL) {
            knet->kni_kthread = kthread_create(kni_thread_single,
                (void *)knet, "kni_single");
            if (IS_ERR(knet->kni_kthread)) {
                mutex_unlock(&knet->kni_kthread_lock);
                kni_dev_remove(kni);
                return -ECANCELED;
            }

            if (force_bind)
                kthread_bind(knet->kni_kthread, kni->core_id);
            wake_up_process(knet->kni_kthread);
        }

        mutex_unlock(&knet->kni_kthread_lock);
    }

    return 0;
}
If the module parameter selects multiple-thread mode, a kernel thread is created for every KNI device. In single-thread mode, the code checks whether the kni thread has already been started; if not, it creates the single kernel thread kni_single, otherwise it does nothing.
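For reference, the single-thread polling loop in kni_misc.c looks roughly like this (abridged, preemption #ifdefs dropped): it walks every KNI device in the namespace, drains their queues, and yields the CPU before polling again.

static int
kni_thread_single(void *data)
{
    struct kni_net *knet = data;
    struct kni_dev *dev;
    int j;

    while (!kthread_should_stop()) {
        down_read(&knet->kni_list_lock);
        for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
            /* poll every KNI device in this namespace */
            list_for_each_entry(dev, &knet->kni_list_head, list) {
                kni_net_rx(dev);        /* drain rx_q (or run loopback) */
                kni_net_poll_resp(dev); /* wake waiters on resp_q */
            }
        }
        up_read(&knet->kni_list_lock);
        /* reschedule out for a while */
        schedule_timeout_interruptible(
            usecs_to_jiffies(KNI_KTHREAD_RESCHEDULE_INTERVAL));
    }

    return 0;
}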
How packets are exchanged between user space and kernel space:
In the ingress direction, rte_eth_rx_burst fills mbufs allocated from the mbuf mempool, and the application hands them to KNI through rx_q. The KNI kernel thread dequeues the mbufs from rx_q, converts them into skbs, and returns the original mbufs to the user-space rx thread through free_q, where the rx thread frees them. So in the ingress direction the mbufs are managed entirely by the user-space rx thread.
In the egress direction, when the kernel transmits on the KNI interface it takes an mbuf from the mbuf cache (alloc_q), copies the skb contents into it, and enqueues it into tx_q. The user-space tx thread dequeues that mbuf and completes the send; since the mbuf is freed after transmission, a new mbuf must be allocated and handed back to the kernel through alloc_q. These are the mbufs that user space fills into alloc_q, as mentioned above.
The above covers the DPDK application writing data to the KNI device, i.e. sending packets to the kernel. When the kernel sends data out through the KNI device, it follows the normal kernel transmit path and eventually reaches net_device_ops->ndo_start_xmit, which for the KNI driver is kni_net_tx.
/*
 * Transmit a packet (called by the kernel)
 */
static int
kni_net_tx(struct sk_buff *skb, struct net_device *dev)
{
    int len = 0;
    uint32_t ret;
    struct kni_dev *kni = netdev_priv(dev);
    struct rte_kni_mbuf *pkt_kva = NULL;
    void *pkt_pa = NULL;
    void *pkt_va = NULL;

    /* save the timestamp */
#ifdef HAVE_TRANS_START_HELPER
    netif_trans_update(dev);
#else
    dev->trans_start = jiffies;
#endif

    /* Check if the length of skb is less than mbuf size */
    if (skb->len > kni->mbuf_size)
        goto drop;

    /**
     * Check if it has at least one free entry in tx_q and
     * one entry in alloc_q.
     */
    if (kni_fifo_free_count(kni->tx_q) == 0 ||
            kni_fifo_count(kni->alloc_q) == 0) {
        /**
         * If no free entry in tx_q or no entry in alloc_q,
         * drops skb and goes out.
         */
        goto drop;
    }

    /* dequeue a mbuf from alloc_q */
    ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1);
    if (likely(ret == 1)) {
        void *data_kva;

        pkt_kva = get_kva(kni, pkt_pa);
        data_kva = get_data_kva(kni, pkt_kva);
        pkt_va = pa2va(pkt_pa, pkt_kva);

        len = skb->len;
        memcpy(data_kva, skb->data, len);
        if (unlikely(len < ETH_ZLEN)) {
            memset(data_kva + len, 0, ETH_ZLEN - len);
            len = ETH_ZLEN;
        }
        pkt_kva->pkt_len = len;
        pkt_kva->data_len = len;

        /* enqueue mbuf into tx_q */
        ret = kni_fifo_put(kni->tx_q, &pkt_va, 1);
        if (unlikely(ret != 1)) {
            /* Failing should not happen */
            pr_err("Fail to enqueue mbuf into tx_q\n");
            goto drop;
        }
    } else {
        /* Failing should not happen */
        pr_err("Fail to dequeue mbuf from alloc_q\n");
        goto drop;
    }

    /* Free skb and update statistics */
    dev_kfree_skb(skb);
    dev->stats.tx_bytes += len;
    dev->stats.tx_packets++;

    return NETDEV_TX_OK;

drop:
    /* Free skb and update statistics */
    dev_kfree_skb(skb);
    dev->stats.tx_dropped++;

    return NETDEV_TX_OK;
}
1. The skb length is checked first; it must not exceed the mbuf size. The code then checks whether tx_q still has a free slot and whether alloc_q still holds spare mbufs.
2. An mbuf is dequeued from alloc_q and its address converted to a kernel virtual address, the skb data is copied into it, and the mbuf is enqueued onto the tx_q transmit queue.
3. Once the packet has been handed off, the skb is freed and the statistics counters are updated.
DPDK provides two APIs, rte_kni_rx_burst and rte_kni_tx_burst, for receiving packets from KNI and sending packets to KNI:
unsigned
rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned int num)
{
    unsigned int ret = kni_fifo_get(kni->tx_q, (void **)mbufs, num);

    /* If buffers removed, allocate mbufs and then put them into alloc_q */
    if (ret)
        kni_allocate_mbufs(kni);

    return ret;
}

unsigned
rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned int num)
{
    num = RTE_MIN(kni_fifo_free_count(kni->rx_q), num);
    void *phy_mbufs[num];
    unsigned int ret;
    unsigned int i;

    for (i = 0; i < num; i++)
        phy_mbufs[i] = va2pa_all(mbufs[i]);

    ret = kni_fifo_put(kni->rx_q, phy_mbufs, num);

    /* Get mbufs from free_q and then free them */
    kni_free_mbufs(kni);

    return ret;
}
1. On receive, packets are taken directly from kni->tx_q; this is the fifo the kernel filled when it transmitted through KNI, as described above. After the packets are dequeued, the DPDK side calls kni_allocate_mbufs, which refills alloc_q with free mbufs for the kernel to use.
2. On transmit, the addresses of the packets destined for KNI are first converted to physical addresses and then enqueued into kni->rx_q (the kernel KNI implementation reads packets from this same fifo); finally kni_free_mbufs is called to free the mbufs that the kernel has finished with and returned through free_q.
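Putting the two APIs together, a hypothetical forwarding loop between one physical port and one KNI interface could look like the sketch below; PKT_BURST_SZ, queue id 0, and the helper name are illustrative assumptions, not from the original text:

#include <rte_ethdev.h>
#include <rte_kni.h>
#include <rte_mbuf.h>

#define PKT_BURST_SZ 32   /* illustrative burst size */

static void
kni_forward_loop(uint16_t port_id, struct rte_kni *kni)
{
    struct rte_mbuf *pkts[PKT_BURST_SZ];
    unsigned int nb_rx, nb_tx;

    for (;;) {
        /* NIC -> kernel: enqueue received mbufs onto kni->rx_q */
        nb_rx = rte_eth_rx_burst(port_id, 0, pkts, PKT_BURST_SZ);
        nb_tx = rte_kni_tx_burst(kni, pkts, nb_rx);
        if (nb_tx < nb_rx)
            rte_pktmbuf_free_bulk(&pkts[nb_tx], nb_rx - nb_tx);

        /* kernel -> NIC: dequeue mbufs that kni_net_tx put on kni->tx_q */
        nb_rx = rte_kni_rx_burst(kni, pkts, PKT_BURST_SZ);
        nb_tx = rte_eth_tx_burst(port_id, 0, pkts, nb_rx);
        if (nb_tx < nb_rx)
            rte_pktmbuf_free_bulk(&pkts[nb_tx], nb_rx - nb_tx);

        /* service MTU/link-state requests arriving from the kernel */
        rte_kni_handle_request(kni);
    }
}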