一. DPDK源碼版本: DPDK19.02 http://core.dpdk.org/download/
二. DPDK 初始化部分
1.初始化EAL環境,rte_eal_init();
2.解析參數,因為DPDK的參數在EAL初始化時就進行了解析,所以,這里主要解析的是我們自己的參數,可以使用getopt_long函數。
3.初始化內存池等,這里要注意放在接口的初始化之前,為接收數據包做准備。
4.初始化接口
5.啟動所有核上的線程。rte_eal_mp_remote_launch()
三. 下面詳細講解初始化過程及各步驟的作用:
2.1 EAL初始化
1)EAL功能作用:
• Intel® DPDK loading and launching
• Support for multi-process and multi-thread execution types
• Core affinity/assignment procedures
• System memory allocation/de-allocation
• Atomic/lock operations
• Time reference
• PCI bus access
• Trace and debug functions
• CPU feature identification
• Interrupt handling
• Alarm operations
ref: http://doc.dpdk.org/guides/prog_guide/ (詳細可參考文章)
2) 初始化程序: 源文件eal.c
1 /* Launch threads, called at application init(). */ 2 int 3 rte_eal_init(int argc, char **argv) 4 { 5 int i, fctret, ret; 6 pthread_t thread_id; 7 static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); 8 const char *p; 9 static char logid[PATH_MAX]; 10 char cpuset[RTE_CPU_AFFINITY_STR_LEN]; 11 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 12 13 /* checks if the machine is adequate */ 14 //檢測cpu的標識是否支持 15 //dpdk在進行cpu運行時,會考慮采用cpu高級指令來優化運算速度。 16 if (!rte_cpu_is_supported()) { 17 rte_eal_init_alert("unsupported cpu type."); 18 rte_errno = ENOTSUP; 19 return -1; 20 } 21 22 //操作靜態局部變量run_once確保函數只執行一次 23 if (!rte_atomic32_test_and_set(&run_once)) { 24 rte_eal_init_alert("already called initialization."); 25 rte_errno = EALREADY; 26 return -1; 27 } 28 29 p = strrchr(argv[0], '/'); 30 strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); 31 thread_id = pthread_self(); 32 33 //初始化結構體struct internal_config 34 eal_reset_internal_config(&internal_config); 35 36 /* set log level as early as possible */ 37 //解析命令行參數,只處理“--log-level”,保存在internal_config.log_level 38 eal_log_level_parse(argc, argv); 39 40 //獲取系統中的CPU數量 41 if (rte_eal_cpu_init() < 0) { 42 rte_eal_init_alert("Cannot detect lcores."); 43 rte_errno = ENOTSUP; 44 return -1; 45 } 46 47 /* 48 EAL初始化參數: 49 -c COREMASK:要使用的CPU core的16進制掩碼。注意core編號在不同的平台不一樣,需要事先確定好。 50 -n NUM:每個處理器socket的內存通道數 51 -b domain:bus:devid.func:網口黑名單,EAL不能使用的PCI設備(可以同時存在多個-b選項) 52 –socket-mem:在指定socket上分配大頁內存 53 -m MB:指定分配的大頁內存數,不限處理器的socket。建議使用–socket-mem代替這個參數 54 -r NUM:內存的rank數 55 -v:顯示程序版本號 56 –huge-dir:大頁內存的掛載點 57 –file-prefix:大頁內存文件的前綴 58 –proc-type:進程類型(primary,secondary,auto) 59 –xen-dom0:支持程序在Xen Domain0中非大頁內存下運行 60 –vmware-tsc-map:使用VMware TSC代替本地的RDTSC 61 –base-virtaddr :指定虛擬地址的基址 62 –vfio-intr:指定VFIO使用的中斷類型(如果不是用VFIO則無效) 63 -c是必須的,其它都是可選的。 64 */ 65 fctret = eal_parse_args(argc, argv); 66 if (fctret < 0) { 67 rte_eal_init_alert("Invalid 'command line' arguments."); 68 rte_errno = EINVAL; 69 rte_atomic32_clear(&run_once); 70 return -1; 71 } 
72 73 //根據命令行參數初始化internal_config 74 if (eal_plugins_init() < 0) { 75 rte_eal_init_alert("Cannot init plugins"); 76 rte_errno = EINVAL; 77 rte_atomic32_clear(&run_once); 78 return -1; 79 } 80 81 if (eal_option_device_parse()) { 82 rte_errno = ENODEV; 83 rte_atomic32_clear(&run_once); 84 return -1; 85 } 86 87 /* 88 主應用的情況(RTE_PROC_PRIMARY) 89 rte_eal_config_create 90 eal_runtime_config_path:獲取runtime配置文件路徑,如“/var/run/.rte_config” 91 打開文件,上鎖,mmap映射文件到內存 92 將early configuration structure(全局變量early_mem_config)拷貝到此內存中,rte_config.mem_config指向這塊內存 93 映射地址保存在rte_config.mem_config->mem_cfg_addr中,用於從應用將來映射到相同的地址 94 從應用的情況(RTE_PROC_SECONDARY) 95 rte_eal_config_attach 96 eal_runtime_config_path 97 打開文件,mmap映射文件到內存 98 rte_config.mem_config指向映射的內存 99 rte_eal_mcfg_wait_complete 100 如果struct rte_mem_config結構的magic成員沒有被寫成RTE_MAGIC,就繼續等待 101 (主應用ready后會將struct rte_mem_config結構的magic成員寫成RTE_MAGIC) 102 rte_eal_config_reattach 103 從前面mmap映射文件中獲取主應用mmap的映射地址(即rte_config.mem_config->mem_cfg_addr) 104 munmap解除先前的映射 105 指定主應用映射地址重新執行mmap映射,如果最終映射地址和指定映射地址不一致,則出錯退出 106 將rte_config.mem_config指向重新映射的內存 107 */ 108 rte_config_init(); 109 110 111 /* 112 初始化global interrupt source head 113 創建pipe 114 創建線程來等待處理中斷,線程執行函數為eal_intr_thread_main 115 線程運行循環 116 epoll_create:創建epoll文件描述符 117 epoll_ctl:把前面創建的the read end of the pipe,添加到epoll wait list中 118 遍歷以global interrupt source head為頭部的struct rte_intr_source結構鏈表 119 如果當前struct rte_intr_source結構沒有掛載的callback函數,跳過 120 把所有的uio device file descriptor,添加到epoll wait list中 121 eal_intr_handle_interrupts 122 epoll_wait:wait for an I/O event on an epoll file descriptor 123 eal_intr_process_interrupts 124 遍歷所有發生的I/O event 125 如果the read end of the pipe可用,執行read操作,函數返回 126 遍歷struct rte_intr_source結構鏈表,查找當前I/O event對應的struct rte_intr_source結構 127 根據interrupt handle type(uio/alarm/…),確定需要讀取的字節長度 128 執行文件read操作 129 如果read數據成功,執行當前struct rte_intr_source結構掛載的所有callback函數 130 調用eal_intr_process_interrupts返回負數,本次中斷處理結束返回 131 關閉epoll文件描述符 132 
如果創建線程成功,調用rte_thread_setname給線程設置名稱“eal-intr-thread” 133 pthread_setname_np 134 循環(browse all running lcores except the master lcore) 135 創建主線程與子線程通信使用的pipe 136 設置子線程狀態為WAIT 137 創建子線程,線程執行函數為eal_thread_loop 138 根據線程ID,獲取當前線程的lcore_id 139 獲取主線程向子線程通信所用管道,子線程讀取數據的file descriptor(m2s) 140 獲取子線程向主線程通信所用管道,子線程發送數據的file descriptor(s2m) 141 eal_thread_set_affinity:設置子線程cpu affinity 142 eal_thread_dump_affinity 143 線程主循環 144 等待讀取主線程發送的命令 145 設置線程狀態為RUNNING 146 向主線程發送ack 147 讀取當前lcore對應的struct lcore_config結構中的lcore_function_t類型函數指針,及調用參數 148 執行所指函數,並存儲返回值 149 設置線程狀態為FINISHED 150 如果創建線程成功,調用rte_thread_setname給線程設置名稱“lcore-slave-xx” 151 152 */ 153 if (rte_eal_intr_init() < 0) { 154 rte_eal_init_alert("Cannot init interrupt-handling thread"); 155 return -1; 156 } 157 158 /* Put mp channel init before bus scan so that we can init the vdev 159 * bus through mp channel in the secondary process before the bus scan. 160 */ 161 /* 162 多進程的情況稍微復雜一些,除了線程間的通信外,還要完成primary進程和其他secondary進程的通信 163 由模塊初始化中的下面函數完成(mp表示multiple process) 164 其內部會單獨創建一個線程用來接收來自其他進程的消息 165 */ 166 if (rte_mp_channel_init() < 0) { 167 rte_eal_init_alert("failed to init mp channel"); 168 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 169 rte_errno = EFAULT; 170 return -1; 171 } 172 } 173 174 /* register multi-process action callbacks for hotplug */ 175 //注冊一個action 176 if (rte_mp_dev_hotplug_init() < 0) { 177 rte_eal_init_alert("failed to register mp callback for hotplug"); 178 return -1; 179 } 180 181 /* 182 bus scan提供的主接口,內部會調用所有bus->scan。接口的目的是掃描所有bus下注冊的設備 183 bus下默認的設備路徑在/sys/bus/pci/devices 184 同內核掃描流程不同,DPDK只是將kernel掃描pci后建立的sysfs信息讀取出來,獲得內核已經掃描好的pci信息 185 在linux設備模型中總線類型下掛有屬於該bus的device和driver的文件夾,每個文件夾里存在具體的device指向實際的設備文件 186 /sys/bus/pci/devices/ 187 188 */ 189 if (rte_bus_scan()) { 190 rte_eal_init_alert("Cannot scan the buses for devices"); 191 rte_errno = ENODEV; 192 rte_atomic32_clear(&run_once); 193 return -1; 194 } 195 196 /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */ 197 if 
(internal_config.iova_mode == RTE_IOVA_DC) { 198 /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ 199 //獲取全局配置結構struct rte_config,初始指向全局變量early_mem_config 200 rte_eal_get_configuration()->iova_mode = 201 rte_bus_get_iommu_class(); 202 203 /* Workaround for KNI which requires physical address to work */ 204 if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && 205 rte_eal_check_module("rte_kni") == 1) { 206 rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; 207 RTE_LOG(WARNING, EAL, 208 "Some devices want IOVA as VA but PA will be used because.. " 209 "KNI module inserted\n"); 210 } 211 } else { 212 rte_eal_get_configuration()->iova_mode = 213 internal_config.iova_mode; 214 } 215 216 if (internal_config.no_hugetlbfs == 0) { 217 /* rte_config isn't initialized yet */ 218 ret = internal_config.process_type == RTE_PROC_PRIMARY ? 219 eal_hugepage_info_init() : 220 eal_hugepage_info_read(); 221 if (ret < 0) { 222 rte_eal_init_alert("Cannot get hugepage information."); 223 rte_errno = EACCES; 224 rte_atomic32_clear(&run_once); 225 return -1; 226 } 227 } 228 229 if (internal_config.memory == 0 && internal_config.force_sockets == 0) { 230 if (internal_config.no_hugetlbfs) 231 internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; 232 } 233 234 if (internal_config.vmware_tsc_map == 1) { 235 #ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT 236 rte_cycles_vmware_tsc_map = 1; 237 RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " 238 "you must have monitor_control.pseudo_perfctr = TRUE\n"); 239 #else 240 RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " 241 "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); 242 #endif 243 } 244 245 rte_srand(rte_rdtsc()); 246 247 /* 248 調用fopencookie,定義一個定制的寫日志接口 249 調用openlog打開日志 250 rte_eal_common_log_init: 251 STAILQ_INIT:初始化Singly-linked Tail queue,隊頭為log_history 252 rte_mempool_create 253 如果創建mempool失敗,調用rte_mempool_lookup 254 獲取鏈接所有mempool結構鏈表的頭結構struct rte_mempool_list 255 遍歷鏈接所有mempool結構鏈表的所有結點 256 比較struct 
rte_tailq_entry結構的data域指向的struct rte_mempool結構的名稱, 257 是否與指定名稱相同 258 返回找到的指向struct rte_mempool結構的指針,或NULL 259 */ 260 if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { 261 rte_eal_init_alert("Cannot init logging."); 262 rte_errno = ENOMEM; 263 rte_atomic32_clear(&run_once); 264 return -1; 265 } 266 267 #ifdef VFIO_PRESENT 268 if (rte_eal_vfio_setup() < 0) { 269 rte_eal_init_alert("Cannot init VFIO"); 270 rte_errno = EAGAIN; 271 rte_atomic32_clear(&run_once); 272 return -1; 273 } 274 #endif 275 /* in secondary processes, memory init may allocate additional fbarrays 276 * not present in primary processes, so to avoid any potential issues, 277 * initialize memzones first. 278 */ 279 /* 280 rte_memzone在DPDK的內存資源管理中起到的是其他資源管家的作用,默認情況下, 281 在DPDK初始化時會創建RTE_MAX_MEMZONE個rte_memzone, 282 每一個都可以記錄一個rte_ring或者rte_mempool的內存位置 283 每一個rte_ring或者rte_mempool都有一個指針回指到它關聯的rte_memzone 284 Memzone是內存分配的基本單元,mempool,malloc_heap在需要內存時,都會執行rte_memzone_reserve操作 285 rte_memzone_reserve 從memseg中分配一塊內存出來 286 */ 287 if (rte_eal_memzone_init() < 0) { 288 rte_eal_init_alert("Cannot init memzone"); 289 rte_errno = ENODEV; 290 return -1; 291 } 292 293 /* 294 1.獲取所有預留hugepage的物理地址並按物理地址進行排序 295 2.根據物理地址,虛擬地址,socket_id等將hugepages組合成memseg 296 3.將所有memseg信息在所有dpdk程序間共享 297 */ 298 if (rte_eal_memory_init() < 0) { 299 rte_eal_init_alert("Cannot init memory"); 300 rte_errno = ENOMEM; 301 return -1; 302 } 303 304 /* the directories are locked during eal_hugepage_info_init */ 305 //解鎖hugepage目錄(由前面的eal_hugepage_info_init函數加鎖) 306 eal_hugedirs_unlock(); 307 308 /* 309 1.函數將連續的memseg使用heap的方式管理起來,heap數據抽象 310 2.注冊register_mp_requests 311 3.rte_memseg_contig_walk遍歷memseg list中連續的mem seg,然后使用malloc_add_seg將這些內存加入heap的管理 312 4.heap的管理在malloc_heap_add_memory中實現 313 */ 314 if (rte_eal_malloc_heap_init() < 0) { 315 rte_eal_init_alert("Cannot init malloc heap"); 316 rte_errno = ENODEV; 317 return -1; 318 } 319 320 if (rte_eal_tailqs_init() < 0) { 321 rte_eal_init_alert("Cannot init tail queues 
for objects"); 322 rte_errno = EFAULT; 323 return -1; 324 } 325 326 //賦值全局的struct rte_intr_handle結構,調用timerfd_create函數創建定時器timer對象 327 if (rte_eal_alarm_init() < 0) { 328 rte_eal_init_alert("Cannot init interrupt-handling thread"); 329 /* rte_eal_alarm_init sets rte_errno on failure. */ 330 return -1; 331 } 332 333 /* 334 設定全局變量eal_timer_source為EAL_TIMER_TSC(TSC/HPET) 335 set_tsc_freq:設置TSC frequency(每秒鍾時鍾中斷的次數) 336 解析文件“/proc/cpuinfo”,檢查“flags”屬性中“constant_tsc”和“nonstop_tsc”是否存在 337 */ 338 if (rte_eal_timer_init() < 0) { 339 rte_eal_init_alert("Cannot init HPET or TSC timers"); 340 rte_errno = ENOTSUP; 341 return -1; 342 } 343 344 /* 345 獲取master lcore對應的numa socket 346 rte_eal_get_physmem_layout:獲取struct rte_memseg結構數組地址 347 遍歷struct rte_memseg結構數組,檢查特定struct rte_memseg結構是否存在(對應此numa socket,並且長度大於0) 348 */ 349 eal_check_mem_on_local_socket(); 350 351 /* 352 設置主線程的lcore_id 353 eal_thread_set_affinity 354 rte_sys_gettid:獲取線程的tid 355 設置線程的CPU親和性,記錄numa socket等信息 356 */ 357 eal_thread_init_master(rte_config.master_lcore); 358 359 //dump當前線程的CPU affinity 360 ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); 361 362 RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", 363 rte_config.master_lcore, (uintptr_t)thread_id, cpuset, 364 ret == 0 ? "" : "..."); 365 366 RTE_LCORE_FOREACH_SLAVE(i) { 367 368 /* 369 * create communication pipes between master thread 370 * and children 371 */ 372 if (pipe(lcore_config[i].pipe_master2slave) < 0) 373 rte_panic("Cannot create pipe\n"); 374 if (pipe(lcore_config[i].pipe_slave2master) < 0) 375 rte_panic("Cannot create pipe\n"); 376 377 lcore_config[i].state = WAIT; 378 379 /* create a thread for each lcore */ 380 ret = pthread_create(&lcore_config[i].thread_id, NULL, 381 eal_thread_loop, NULL); 382 if (ret != 0) 383 rte_panic("Cannot create thread\n"); 384 385 /* Set thread_name for aid in debugging. 
*/ 386 snprintf(thread_name, sizeof(thread_name), 387 "lcore-slave-%d", i); 388 ret = rte_thread_setname(lcore_config[i].thread_id, 389 thread_name); 390 if (ret != 0) 391 RTE_LOG(DEBUG, EAL, 392 "Cannot set name for lcore thread\n"); 393 } 394 395 /* 396 * Launch a dummy function on all slave lcores, so that master lcore 397 * knows they are all ready when this function returns. 398 */ 399 /*指示所有子線程啟動一個dummy function*/ 400 /* 401 檢查各個子線程/lcore的狀態是否處於WAIT 402 rte_eal_remote_launch:向各個子線程/lcore發送執行命令 403 獲取主線程向子線程通信所用管道,主線程發送數據的file descriptor(m2s) 404 獲取子線程向主線程通信所用管道,主線程讀取數據的file descriptor(s2m) 405 將lcore_function_t類型函數指針,及調用參數填入當前lcore對應的struct lcore_config結構 406 向子線程發送命令 407 等待讀取子線程發送的ack 408 如果最后一個參數值為CALL_MASTER(lcore handler executed by master core),主線程也執行所指函數 409 */ 410 rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); 411 rte_eal_mp_wait_lcore(); 412 413 /* initialize services so vdevs register service during bus_probe. */ 414 ret = rte_service_init(); 415 if (ret) { 416 rte_eal_init_alert("rte_service_init() failed"); 417 rte_errno = ENOEXEC; 418 return -1; 419 } 420 421 /* Probe all the buses and devices/drivers on them */ 422 if (rte_bus_probe()) { 423 rte_eal_init_alert("Cannot probe devices"); 424 rte_errno = ENOTSUP; 425 return -1; 426 } 427 428 #ifdef VFIO_PRESENT 429 /* Register mp action after probe() so that we got enough info */ 430 if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) 431 return -1; 432 #endif 433 434 /* initialize default service/lcore mappings and start running. Ignore 435 * -ENOTSUP, as it indicates no service coremask passed to EAL. 436 */ 437 ret = rte_service_start_with_defaults(); 438 if (ret < 0 && ret != -ENOTSUP) { 439 rte_errno = ENOEXEC; 440 return -1; 441 } 442 443 /* 444 * Clean up unused files in runtime directory. 
We do this at the end of 445 * init and not at the beginning because we want to clean stuff up 446 * whether we are primary or secondary process, but we cannot remove 447 * primary process' files because secondary should be able to run even 448 * if primary process is dead. 449 * 450 * In no_shconf mode, no runtime directory is created in the first 451 * place, so no cleanup needed. 452 */ 453 if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { 454 rte_eal_init_alert("Cannot clear runtime directory\n"); 455 return -1; 456 } 457 458 /* 459 如果是主應用,將全局內存配置struct rte_mem_config結構的magic成員寫成RTE_MAGIC, 460 表明主應用EAL初始化完成 461 */ 462 rte_eal_mcfg_complete(); 463 464 /* Call each registered callback, if enabled */ 465 rte_option_init(); 466 467 return fctret; 468 }
