dpdk網卡收包分析


 一個網絡報文從網卡接收到被應用處理,中間主要需要經歷兩個階段:

階段一:網卡通過其DMA硬件將收到的報文寫入到收包隊列中(入隊)
階段二:應用從收包隊列中讀取報文(出隊)
由於目前正在使用vpp/dpdk 優化waf引擎的工作,所以就看看ixgbe網卡在dpdk框架下是怎么工作的。
下面分別介紹一下 收包隊列結構 初始化(使能) 收包流程

收發包的配置和初始化,主要是配置收發隊列等。

收發包的配置最主要的工作就是配置網卡的收發隊列,設置DMA拷貝數據包的地址等。使用數據包時,只要去對應隊列取出指定地址的數據即可;主體配置函數見 rte_eth_dev_configure ;當收發隊列配置完成后,就調用設備的配置函數,進行最后的配置。(*dev->dev_ops->dev_configure)(dev),-----進入ixgbe_dev_configure()來分析其過程,主要是調用了ixgbe_check_mq_mode()來檢查隊列的模式。然后設置允許接收批量和向量的模式

2.數據包的獲取和發送,主要是從隊列中獲取到數據包或者把數據包放到隊列中。
收包隊列的構造主要是通過網卡隊列設置函數 rte_eth_rx_queue_setup設置相關參數;最后,調用到隊列的setup函數做最后的初始化。ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
socket_id, rx_conf, mp);
對於ixgbe設備,rx_queue_setup就是函數ixgbe_dev_rx_queue_setup()

說一說主要的結構體:

/* Receive Descriptor - Advanced
 *
 * pkt_addr: physical address of the packet data buffer; the NIC DMA engine
 * writes the received packet payload to memory through this address.
 * hdr_addr: header buffer address; its least-significant bit is the DD
 * (Descriptor Done) bit. Because "read" and "wb" share a union, that same
 * bit is also the low bit of status_error in the writeback view.
 *
 * When a new packet arrives, the NIC checks the DD bit of the current
 * rx_ring buffer: if it is 0 the buffer is available, so the DMA copies
 * the packet into it and then sets DD to 1. If DD is already 1, the NIC
 * considers the rx_ring full, drops the packet, and records an imissed
 * event. (0->1) */
union ixgbe_adv_rx_desc {
    struct {
        __le64 pkt_addr; /* Packet buffer address */
        __le64 hdr_addr; /* Header buffer address */
    } read;
    struct {
        struct {
            union {
                __le32 data;
                struct {
                    __le16 pkt_info; /* RSS, Pkt type */
                    __le16 hdr_info; /* Splithdr, hdrlen */
                } hs_rss;
            } lo_dword;
            union {
                __le32 rss; /* RSS Hash */
                struct {
                    __le16 ip_id; /* IP id */
                    __le16 csum; /* Packet Checksum */
                } csum_ip;
            } hi_dword;
        } lower;
        struct {
            __le32 status_error; /* ext status/error */
            __le16 length; /* Packet length */
            __le16 vlan; /* VLAN tag */
        } upper;
    } wb;  /* writeback */
};
/**
 * Structure associated with each descriptor of the RX ring of a RX queue.
 *
 * sw_ring is a ring queue built on a dynamically allocated array of
 * ixgbe_rx_entry elements; its size is configurable, typically up to 4096.
 * mbuf: pointer to the packet mbuf; an mbuf manages one packet and holds
 * both the packet metadata and the packet data.
 */
struct ixgbe_rx_entry {
    struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};
/**
 * Structure associated with each RX queue.
 */
struct ixgbe_rx_queue {
    struct rte_mempool  *mb_pool; /**< mbuf pool to populate RX ring. */
    /* rx_ring stores the physical (DMA) addresses of the packet buffers.
     * The NIC hardware uses these physical addresses to copy received
     * packets directly into the buffers. */
    volatile union ixgbe_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
    uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
    volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
    volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
    /* sw_ring stores the virtual addresses (mbuf pointers) that software
     * uses to read the packets; the physical address of a packet buffer
     * can be derived from its virtual address. */
    struct ixgbe_rx_entry *sw_ring; /**< address of RX software ring. */
    struct ixgbe_scattered_rx_entry *sw_sc_ring; /**< address of scattered Rx software ring. */
    struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
    struct rte_mbuf *pkt_last_seg; /**< Last segment of current packet. */
    uint64_t            mbuf_initializer; /**< value to init mbufs */
    uint16_t            nb_rx_desc; /**< number of RX descriptors. */
    uint16_t            rx_tail;  /**< current value of RDT register. */
    uint16_t            nb_rx_hold; /**< number of held free RX desc. */
    uint16_t rx_nb_avail; /**< nr of staged pkts ready to ret to app */
    uint16_t rx_next_avail; /**< idx of next staged pkt to ret to app */
    uint16_t rx_free_trigger; /**< triggers rx buffer allocation */
    uint8_t            rx_using_sse;
    /**< indicates that vector RX is in use */
#ifdef RTE_LIBRTE_SECURITY
    uint8_t            using_ipsec;
    /**< indicates that IPsec RX feature is in use */
#endif
#ifdef RTE_IXGBE_INC_VECTOR
    uint16_t            rxrearm_nb;     /**< number of remaining to be re-armed */
    uint16_t            rxrearm_start;  /**< the idx we start the re-arming from */
#endif
    uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
    uint16_t            queue_id; /**< RX queue index. */
    uint16_t            reg_idx;  /**< RX queue register index. */
    uint16_t            pkt_type_mask;  /**< Packet type mask for different NICs. */
    uint16_t            port_id;  /**< Device port identifier. */
    uint8_t             crc_len;  /**< 0 if CRC stripped, 4 otherwise. */
    uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
    uint8_t             rx_deferred_start; /**< not in global dev start. */
    /** flags to set in mbuf when a vlan is detected. */
    uint64_t            vlan_flags;
    uint64_t        offloads; /**< Rx offloads with DEV_RX_OFFLOAD_* */
    /** need to alloc dummy mbuf, for wraparound when scanning hw ring */
    struct rte_mbuf fake_mbuf;
    /** hold packets to return to application */
    struct rte_mbuf *rx_stage[RTE_PMD_IXGBE_RX_MAX_BURST*2];
};
View Code

 

收包隊列的啟動主要是通過調用rte_eth_dev_start

DPDK是零拷貝的,那麼分配的mem_pool中的對象是怎樣和隊列以及驅動聯系起來的呢?

設備的啟動是從rte_eth_dev_start()中開始,會調用  

diag = (*dev->dev_ops->dev_start)(dev);

找到設備啟動的真正啟動函數:ixgbe_dev_start

其中隊列初始化流程函數為:

/*
 * Populate the software ring of an RX queue with freshly allocated mbufs
 * and program each hardware descriptor with the DMA address of its buffer.
 *
 * Returns 0 on success, or -ENOMEM if the mempool runs out of mbufs.
 */
int
ixgbe_alloc_rx_queue_mbufs(struct ixgbe_rx_queue *rxq)
{
    struct ixgbe_rx_entry *rxe = rxq->sw_ring;
    uint64_t dma_addr;
    unsigned int i;

    /* Initialize software ring entries.
     *
     * Take nb_rx_desc mbuf pointers from the ring of the queue's mempool
     * and fill rxq->sw_ring with them; each pointer refers to one packet
     * buffer inside the mempool.
     * Each newly allocated mbuf is initialized and, most importantly, its
     * dma_addr is computed. The hardware descriptor (rxd) is then
     * initialized with it, telling the NIC where to DMA the packet.
     * Finally the mbuf is "placed" into the queue's sw_ring, so packets
     * received by the driver land directly in sw_ring buffers.
     */
    for (i = 0; i < rxq->nb_rx_desc; i++) {
        volatile union ixgbe_adv_rx_desc *rxd;
        struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);

        if (mbuf == NULL) {
            PMD_INIT_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
                     (unsigned) rxq->queue_id);
            return -ENOMEM;
        }

        mbuf->data_off = RTE_PKTMBUF_HEADROOM;
        mbuf->port = rxq->port_id;

        dma_addr =
            rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
        rxd = &rxq->rx_ring[i];
        rxd->read.hdr_addr = 0;
        rxd->read.pkt_addr = dma_addr;
        rxe[i].mbuf = mbuf;
    }

    return 0;
}

數據包的獲取

網卡收到報文后,先存於網卡本地的buffer-Rx(Rx FIFO)中,然后由DMA通過PCI總線將報文數據寫入操作系統的內存中,即數據報文完成入隊操作,那么數據包的獲取就是指上層應用從隊列中去取出這些數據包

業務層面獲取數據包是從rte_eth_rx_burst()開始:

int16_t nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id], rx_pkts, nb_pkts);

這里的dev->rx_pkt_burst在驅動初始化的時候已經注冊過了,對於ixgbe設備,就是ixgbe_recv_pkts()函數

/*
 * Scalar receive burst function for ixgbe: dequeue up to nb_pkts received
 * packets from the RX queue into rx_pkts, replenishing the hardware ring
 * with fresh mbufs as it goes. Returns the number of packets retrieved.
 */
uint16_t
ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts)
{
    struct ixgbe_rx_queue *rxq;
    volatile union ixgbe_adv_rx_desc *rx_ring;
    volatile union ixgbe_adv_rx_desc *rxdp;
    struct ixgbe_rx_entry *sw_ring;
    struct ixgbe_rx_entry *rxe;
    struct rte_mbuf *rxm;
    struct rte_mbuf *nmb;
    union ixgbe_adv_rx_desc rxd;
    uint64_t dma_addr;
    uint32_t staterr;
    uint32_t pkt_info;
    uint16_t pkt_len;
    uint16_t rx_id;
    uint16_t nb_rx;
    uint16_t nb_hold;
    uint64_t pkt_flags;
    uint64_t vlan_flags;

    nb_rx = 0;
    nb_hold = 0;
    rxq = rx_queue;
    rx_id = rxq->rx_tail; /* start consuming packets at the queue tail */
    rx_ring = rxq->rx_ring;
    sw_ring = rxq->sw_ring;
    vlan_flags = rxq->vlan_flags;
    while (nb_rx < nb_pkts) { /* loop to retrieve up to nb_pkts packets */
        /*
         * The order of operations here is important as the DD status
         * bit must not be read after any other descriptor fields.
         * rx_ring and rxdp are pointing to volatile data so the order
         * of accesses cannot be reordered by the compiler. If they were
         * not volatile, they could be reordered which could lead to
         * using invalid descriptor fields when read from rxd.
         */
        rxdp = &rx_ring[rx_id];
        staterr = rxdp->wb.upper.status_error;
        /* Check the DD bit: 1 means the NIC has written a packet into
         * this slot; otherwise there is nothing more to receive, stop. */
        if (!(staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
            break;
        rxd = *rxdp;

        /*
         * End of packet.
         *
         * If the IXGBE_RXDADV_STAT_EOP flag is not set, the RX packet
         * is likely to be invalid and to be dropped by the various
         * validation checks performed by the network stack.
         *
         * Allocate a new mbuf to replenish the RX ring descriptor.
         * If the allocation fails:
         *    - arrange for that RX descriptor to be the first one
         *      being parsed the next time the receive function is
         *      invoked [on the same queue].
         *
         *    - Stop parsing the RX ring and return immediately.
         *
         * This policy does not drop the packet received in the RX
         * descriptor for which the allocation of a new mbuf failed.
         * Thus, it allows that packet to be later retrieved if
         * mbufs have been freed in the mean time.
         * As a side effect, holding RX descriptors instead of
         * systematically giving them back to the NIC may lead to
         * RX ring exhaustion situations.
         * However, the NIC can gracefully prevent such situations
         * to happen by sending specific "back-pressure" flow control
         * frames to its peer(s).
         */
        PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
               "ext_err_stat=0x%08x pkt_len=%u",
               (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
               (unsigned) rx_id, (unsigned) staterr,
               (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
        /* allocate a fresh mbuf (nmb) to swap into the ring */
        nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
        if (nmb == NULL) {
            PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
                   "queue_id=%u", (unsigned) rxq->port_id,
                   (unsigned) rxq->queue_id);
            rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
            break;
        }

        nb_hold++;
       
        rxe = &sw_ring[rx_id];
        rx_id++;
        if (rx_id == rxq->nb_rx_desc)
            rx_id = 0;

        /* Prefetch next mbuf while processing current one. */
        rte_ixgbe_prefetch(sw_ring[rx_id].mbuf);

        /*
         * When next RX descriptor is on a cache-line boundary,
         * prefetch the next 4 RX descriptors and the next 8 pointers
         * to mbufs.
         */
        if ((rx_id & 0x3) == 0) {
            rte_ixgbe_prefetch(&rx_ring[rx_id]);
            rte_ixgbe_prefetch(&sw_ring[rx_id]);
        }
        /* take the received packet mbuf out of sw_ring (into rxm) */
        rxm = rxe->mbuf;
        /* put the fresh mbuf (nmb) into sw_ring in its place */
        rxe->mbuf = nmb;
        /* write the new mbuf's packet-data physical address into the
         * matching rx_ring slot and zero hdr_addr (clears the DD bit) */
        dma_addr =
            rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
        rxdp->read.hdr_addr = 0;
        rxdp->read.pkt_addr = dma_addr;

        /*
         * Initialize the returned mbuf.
         * 1) setup generic mbuf fields:
         *    - number of segments,
         *    - next segment,
         *    - packet length,
         *    - RX port identifier.
         * 2) integrate hardware offload data, if any:
         *    - RSS flag & hash,
         *    - IP checksum flag,
         *    - VLAN TCI, if any,
         *    - error flags.
         */
        pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
                      rxq->crc_len);
        /* initialize the packet metadata of the mbuf being returned */
        rxm->data_off = RTE_PKTMBUF_HEADROOM;
        rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
        rxm->nb_segs = 1;
        rxm->next = NULL;
        rxm->pkt_len = pkt_len;
        rxm->data_len = pkt_len;
        rxm->port = rxq->port_id;

        pkt_info = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
        /* Only valid if PKT_RX_VLAN set in pkt_flags */
        rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);

        pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_flags);
        pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
        pkt_flags = pkt_flags |
            ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
        rxm->ol_flags = pkt_flags;
        rxm->packet_type =
            ixgbe_rxd_pkt_info_to_pkt_type(pkt_info,
                               rxq->pkt_type_mask);

        if (likely(pkt_flags & PKT_RX_RSS_HASH))
            rxm->hash.rss = rte_le_to_cpu_32(
                        rxd.wb.lower.hi_dword.rss);
        else if (pkt_flags & PKT_RX_FDIR) {
            rxm->hash.fdir.hash = rte_le_to_cpu_16(
                    rxd.wb.lower.hi_dword.csum_ip.csum) &
                    IXGBE_ATR_HASH_MASK;
            rxm->hash.fdir.id = rte_le_to_cpu_16(
                    rxd.wb.lower.hi_dword.csum_ip.ip_id);
        }
        /*
         * Store the mbuf address into the next entry of the array
         * of returned packets.
         */
        rx_pkts[nb_rx++] = rxm;
    }
    rxq->rx_tail = rx_id;

    /*
     * If the number of free RX descriptors is greater than the RX free
     * threshold of the queue, advance the Receive Descriptor Tail (RDT)
     * register.
     * Update the RDT with the value of the last processed RX descriptor
     * minus 1, to guarantee that the RDT register is never equal to the
     * RDH register, which creates a "full" ring situation from the
     * hardware point of view...
     */
    nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
    if (nb_hold > rxq->rx_free_thresh) {
        PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
               "nb_hold=%u nb_rx=%u",
               (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
               (unsigned) rx_id, (unsigned) nb_hold,
               (unsigned) nb_rx);
        rx_id = (uint16_t) ((rx_id == 0) ?
                     (rxq->nb_rx_desc - 1) : (rx_id - 1));
        IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
        nb_hold = 0;
    }
    rxq->nb_rx_hold = nb_hold;
    return nb_rx;
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM