buf_addr
當前mbuf數據緩沖區的虛擬地址。標准情況下,buf_addr指向的內存位於從mbuf頭部起始地址開始,偏移一個mbuf結構體的大小再加上私有數據大小之后的位置。如下所示:
m->buf_addr = (char *)m + sizeof(struct rte_mbuf) + priv_size;
初始化這個變量是在我們創建mbuf的mempool的時候完成的
調用鏈:rte_pktmbuf_pool_create() -> rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL); -> rte_pktmbuf_init(): m->buf_addr = (char *)m + mbuf_size;(其中mbuf_size = sizeof(struct rte_mbuf) + priv_size)
buf的物理地址
union {
rte_iova_t buf_iova;
rte_iova_t buf_physaddr; /**< deprecated */
} __rte_aligned(sizeof(rte_iova_t));
mbuf對應的物理地址(IO地址)。一般mbuf對象的物理地址在初始化mempool的時候就設置好了,存放在該mbuf所對應的mempool對象頭部(objhdr)里面,即如下結構體中的iova/physaddr字段:
struct rte_mempool_objhdr { STAILQ_ENTRY(rte_mempool_objhdr) next; /**< Next in list. */ struct rte_mempool *mp; /**< The mempool owning the object. */ RTE_STD_C11 union { rte_iova_t iova; /**< IO address of the object. */ phys_addr_t physaddr; /**< deprecated - Physical address of the object. */ }; #ifdef RTE_LIBRTE_MEMPOOL_DEBUG uint64_t cookie; /**< Debug cookie. */ #endif };
這個轉化關系如下:
m->buf_iova = rte_mempool_virt2iova(m) + sizeof(struct rte_mbuf) + priv_size;
mbuf結構體中的pkt的next字段記錄下一個segment的地址
m的pkt總長度是seg1+seg2+seg3三段數據之和。
data_off
這個變量是標識mbuf的data room開始地址到報文起始位置的偏移,默認是設置為RTE_PKTMBUF_HEADROOM(128),
我們在創建一個mbuf的mem pool的時候,會指定data room的大小,如下所示的data_room_size參數,
struct rte_mempool * rte_pktmbuf_pool_create(const char *name, unsigned int n, unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, int socket_id) { return rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size, data_room_size, socket_id, NULL); }
data_room_size表示每一個mbuf數據緩沖區(data room)的大小,其中包含headroom,因此一般會設置為不小於一個MTU加上128字節的頭部預留空間(headroom)
dpdk提供一個默認宏定義:
#define RTE_PKTMBUF_HEADROOM 128
#define RTE_MBUF_DEFAULT_DATAROOM 2048
#define RTE_MBUF_DEFAULT_BUF_SIZE (RTE_MBUF_DEFAULT_DATAROOM + RTE_PKTMBUF_HEADROOM)
所以當我們從mbuf pool alloc一塊mbuf過來的時候,都會reset一下mbuf的變量,里面就包含了重置data_off,具體如下:
static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m) { m->data_off = (uint16_t)RTE_MIN((uint16_t)RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len); } static inline void rte_pktmbuf_reset(struct rte_mbuf *m) { m->next = NULL; m->pkt_len = 0; m->tx_offload = 0; m->vlan_tci = 0; m->vlan_tci_outer = 0; m->nb_segs = 1; m->port = MBUF_INVALID_PORT; m->ol_flags = 0; m->packet_type = 0; rte_pktmbuf_reset_headroom(m); m->data_len = 0; __rte_mbuf_sanity_check(m, 1); }
static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m) { m->data_off = (uint16_t)RTE_MIN((uint16_t)RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len); }
/** * A macro that points to an offset into the data in the mbuf. * * The returned pointer is cast to type t. Before using this * function, the user must ensure that the first segment is large * enough to accommodate its data. * * @param m * The packet mbuf. * @param o * The offset into the mbuf data. * @param t * The type to cast the result into. */ #define rte_pktmbuf_mtod_offset(m, t, o) \ ((t)((char *)(m)->buf_addr + (m)->data_off + (o))) /** * A macro that points to the start of the data in the mbuf. * * The returned pointer is cast to type t. Before using this * function, the user must ensure that the first segment is large * enough to accommodate its data. * * @param m * The packet mbuf. * @param t * The type to cast the result into. */ #define rte_pktmbuf_mtod(m, t) rte_pktmbuf_mtod_offset(m, t, 0)
#define rte_pktmbuf_data_len(m) ((m)->data_len)
1、不需要分片
IP報頭和TCP報頭的長度都必須是4字節的倍數;TCP報頭的固定部分長度是20字節。 TCP頭部選項:TCP頭部的最后一個選項字段(options)是可變長的可選信息。這部分最多包含40字節,因為TCP頭部最長是60字節(其中還包含前面討論的20字節的固定部分)。 4位頭部長度(header length):標識該TCP頭部有多少個32bit字(4字節)。因為4位最大能標識15,所以TCP頭部最長是60字節。 int ipv4_hdrlen = (iph->version_ihl & RTE_IPV4_HDR_IHL_MASK) << 2; pkt_len = ntcp_payload_len + ipv4_hdrlen + (tcph->data_off >> 4) * 4; rte_pktmbuf_data_len(mbuf) = rte_pktmbuf_pkt_len(mbuf) = pkt_len + RTE_ETHER_HDR_LEN;
Mbuf
概述
DPDK mbuf實現了message buffer,可以存儲報文數據或者控制信息等。mbuf存儲在mempool中,以便在數據面提高訪問性能。
原理
DPDK把元數據(metadata)和實際數據存儲在一個mbuf中,並且使mbuf結構體盡量小,目前僅占用2個cache line,且最常訪問的成員在第1個cache line中。
mbuf從前至后主要由mbuf首部(即rte_mbuf結構體)、head room、實際數據和tailroom構成。用戶還可以在mbuf首部和head room之間加入一定長度的私有數據(private data)。head room的大小在DPDK編譯配置文件(如common_linuxapp)中指定,如 CONFIG_RTE_PKTMBUF_HEADROOM=128。mbuf的基本結構如下圖所示:

一些指針、成員或函數結果的內容在下表中列出,mbuf指針簡寫為m:
項 | 內容 |
---|---|
m | 首部,即mbuf結構體 |
m->buf_addr | headroom起始地址 |
m->data_off | data起始地址相對於buf_addr的偏移 |
m->buf_len | mbuf和priv之后內存的長度,包含headroom |
m->pkt_len | 整個mbuf鏈的data總長度 |
m->data_len | 實際data的長度 |
m->buf_addr+m->data_off | 實際data的起始地址 |
rte_pktmbuf_mtod(m) | 同上 |
rte_pktmbuf_data_len(m) | 同m->data_len |
rte_pktmbuf_pkt_len | 同m->pkt_len |
rte_pktmbuf_data_room_size | 同m->buf_len |
rte_pktmbuf_headroom | headroom長度 |
rte_pktmbuf_tailroom | 尾部剩余空間長度 |
注:data_off = MIN(headroom_len, buf_len)
上圖中的mbuf只有一個數據段。在某些情況下,比如要處理巨幀(jumbo frame)時,可以把多個mbuf鏈接起來組成一個mbuf鏈。下圖是包含3個數據段的mbuf鏈:

對於鏈式的mbuf,僅在第一個mbuf結構體中包含元數據信息。
以下代碼分別創建了兩個mbuf,給它們添加數據,最后將它們組合成鏈。在此過程中打印了上表中的一些數據,可以幫助理解各指針和長度的含義,其中省去了錯誤處理代碼。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
static int mbuf_demo(void) { int ret; struct rte_mempool* mpool; struct rte_mbuf *m, *m2; struct rte_pktmbuf_pool_private priv; priv.mbuf_data_room_size = 1600 + RTE_PKTMBUF_HEADROOM - 16; priv.mbuf_priv_size = 16; mpool = rte_mempool_create("test_pool", ITEM_COUNT, ITEM_SIZE, CACHE_SIZE, sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, &priv, rte_pktmbuf_init, NULL, 0, MEMPOOL_F_SC_GET); m = rte_pktmbuf_alloc(mpool); mbuf_dump(m); // (1) rte_pktmbuf_append(m, 1400); mbuf_dump(m); // (2) m2 = rte_pktmbuf_alloc(mpool); rte_pktmbuf_append(m2, 500); mbuf_dump(m2); ret = rte_pktmbuf_chain(m, m2); mbuf_dump(m); // (3) return 0; }
|
首先注意第8,9,16行,為了演示用戶私有數據,在創建mempool時傳入了priv,這將在每個mbuf的首部后面添加16字節的私有數據,然后才是head room。內存池對象數目、每個對象的大小和cache大小分別是:
#define ITEM_COUNT 1024
#define ITEM_SIZE (1600 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define CACHE_SIZE 32
1600是預估的一個packet的最大長度。
在(1)處,新分配了一個mbuf m,此時m的data長度為0,打印結果如下:
RTE_PKTMBUF_HEADROOM: 128
sizeof(mbuf): 128
m: 0x7fbf1a810000
m->buf_addr: 0x7fbf1a810090
m->data_off: 128
m->buf_len: 1712
m->pkt_len: 0
m->data_len: 0
m->buf_addr+m->data_off: 0x7fbf1a810110
rte_pktmbuf_mtod(m): 0x7fbf1a810110
rte_pktmbuf_data_len(m): 0
rte_pktmbuf_pkt_len(m): 0
rte_pktmbuf_headroom(m): 128
rte_pktmbuf_tailroom(m): 1584
rte_pktmbuf_data_room_size(mpool): 1712
rte_pktmbuf_priv_size(mpool): 16
用圖表示如下:

在(2)處,用rte_pktmbuf_append模擬給m填充了1400字節的data,此時打印結果如下:
m: 0x7fbf1a810000
m->buf_addr: 0x7fbf1a810090
m->data_off: 128
m->buf_len: 1712
m->pkt_len: 1400
m->data_len: 1400
m->buf_addr+m->data_off: 0x7fbf1a810110
rte_pktmbuf_mtod(m): 0x7fbf1a810110
rte_pktmbuf_data_len(m): 1400
rte_pktmbuf_pkt_len(m): 1400
rte_pktmbuf_headroom(m): 128
rte_pktmbuf_tailroom(m): 184
rte_pktmbuf_data_room_size(mpool): 1712
rte_pktmbuf_priv_size(mpool): 16
用圖表示如下:

之后創建m2並給它添加data,在(3)處將m與m2連接,m作為鏈的首節點,此時m的打印結果如下:
m: 0x7fbf1a810000
m->buf_addr: 0x7fbf1a810090
m->data_off: 128
m->buf_len: 1712
m->pkt_len: 1900
m->data_len: 1400
m->buf_addr+m->data_off: 0x7fbf1a810110
rte_pktmbuf_mtod(m): 0x7fbf1a810110
rte_pktmbuf_data_len(m): 1400
rte_pktmbuf_pkt_len(m): 1900
rte_pktmbuf_headroom(m): 128
rte_pktmbuf_tailroom(m): 184
rte_pktmbuf_data_room_size(mpool): 1712
rte_pktmbuf_priv_size(mpool): 16
注意pkt_len的變化,它已經加上了m2的500字節。如果此時打印m->next,會發現m->next == m2。
數據結構
rte_mbuf(librte_mbuf/rte_mbuf.h):
struct rte_mbuf { MARKER cacheline0; void *buf_addr; /**< Virtual address of segment buffer. */ phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */ uint16_t buf_len; /**< Length of segment buffer. */ /* next 6 bytes are initialised on RX descriptor rearm */ MARKER8 rearm_data; uint16_t data_off; /** * 16-bit Reference counter. * It should only be accessed using the following functions: * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and * rte_mbuf_refcnt_set(). The functionality of these functions (atomic, * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC * config option. */ union { rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ uint16_t refcnt; /**< Non-atomically accessed refcnt */ }; uint8_t nb_segs; /**< Number of segments. */ uint8_t port; /**< Input port. */ uint64_t ol_flags; /**< Offload features. */ /* remaining bytes are set on RX when pulling packet from descriptor */ MARKER rx_descriptor_fields1; /* * The packet type, which is the combination of outer/inner L2, L3, L4 * and tunnel types. */ union { uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */ struct { uint32_t l2_type:4; /**< (Outer) L2 type. */ uint32_t l3_type:4; /**< (Outer) L3 type. */ uint32_t l4_type:4; /**< (Outer) L4 type. */ uint32_t tun_type:4; /**< Tunnel type. */ uint32_t inner_l2_type:4; /**< Inner L2 type. */ uint32_t inner_l3_type:4; /**< Inner L3 type. */ uint32_t inner_l4_type:4; /**< Inner L4 type. */ }; }; uint32_t pkt_len; /**< Total pkt len: sum of all segments. */ uint16_t data_len; /**< Amount of data in segment buffer. */ uint16_t vlan_tci; /**< VLAN Tag Control Identifier (CPU order) */ union { uint32_t rss; /**< RSS hash result if RSS enabled */ struct { union { struct { uint16_t hash; uint16_t id; }; uint32_t lo; /**< Second 4 flexible bytes */ }; uint32_t hi; /**< First 4 flexible bytes or FD ID, dependent on PKT_RX_FDIR_* flag in ol_flags. 
*/ } fdir; /**< Filter identifier if FDIR enabled */ struct { uint32_t lo; uint32_t hi; } sched; /**< Hierarchical scheduler */ uint32_t usr; /**< User defined tags. See rte_distributor_process() */ } hash; /**< hash information */ uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */ uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */ /* second cache line - fields only used in slow path or on TX */ MARKER cacheline1 __rte_cache_aligned; union { void *userdata; /**< Can be used for external metadata */ uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */ }; struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */ struct rte_mbuf *next; /**< Next segment of scattered packet. */ /* fields to support TX offloads */ union { uint64_t tx_offload; /**< combined for easy fetch */ struct { uint64_t l2_len:7; /**< L2 (MAC) Header Length. */ uint64_t l3_len:9; /**< L3 (IP) Header Length. */ uint64_t l4_len:8;