dpdk l2fwd


之前在helloworld中主要分析了hugepage的使用,這回在l2fwd中主要分析一下uio和PMD的實現

 

main函數中首先調用了rte_eal_init初始化eal環境,其中主要是hugepage的初始化;

/* Bring up the EAL first; a negative return aborts the application. */
ret = rte_eal_init(argc, argv);
if (ret < 0) {
    rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
}

 

 

接着創建了mbuf pool

/*
 * Create the mbuf pool, passing rte_socket_id() as the socket argument,
 * and abort the application if no pool is returned.
 */
l2fwd_pktmbuf_pool = rte_mempool_create(
    "mbuf_pool", NB_MBUF, MBUF_SIZE, 32,
    sizeof(struct rte_pktmbuf_pool_private),
    rte_pktmbuf_pool_init, NULL,
    rte_pktmbuf_init, NULL,
    rte_socket_id(), 0);
if (!l2fwd_pktmbuf_pool) {
    rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
}

 

 

然後是PMD驅動的註冊和PCI設備驅動的加載:

/* Register the compiled-in poll-mode drivers; abort if that fails. */
if (rte_pmd_init_all() < 0) {
    rte_exit(EXIT_FAILURE, "Cannot init pmd\n");
}

/* Probe the PCI devices against the registered drivers; abort on failure. */
if (rte_eal_pci_probe() < 0) {
    rte_exit(EXIT_FAILURE, "Cannot probe PCI\n");
}

 

首先是PMD驅動的註冊。目前DPDK支持的PMD有igb、igbvf、em、ixgbe、ixgbevf、virtio、vmxnet3;不過這些驅動具體對應什麼還不清楚,後面以虛擬機環境中使用的em驅動為例進行分析;

/*
 * Initialise every poll-mode driver that was enabled at build time.
 *
 * Each RTE_LIBRTE_*_PMD section below is compiled in only when the
 * matching macro is defined.  The first init routine that reports a
 * non-zero status is logged and that status is returned immediately.
 *
 * Returns 0 when every compiled-in PMD initialised, the failing PMD's
 * status otherwise, or -ENODEV when no PMD section was compiled in.
 */
static inline
int rte_pmd_init_all(void)
{
    int status = -ENODEV;

#ifdef RTE_LIBRTE_IGB_PMD
    status = rte_igb_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init igb PMD\n");
        return status;
    }
    status = rte_igbvf_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init igbvf PMD\n");
        return status;
    }
#endif /* RTE_LIBRTE_IGB_PMD */

#ifdef RTE_LIBRTE_EM_PMD
    status = rte_em_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init em PMD\n");
        return status;
    }
#endif /* RTE_LIBRTE_EM_PMD */

#ifdef RTE_LIBRTE_IXGBE_PMD
    status = rte_ixgbe_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init ixgbe PMD\n");
        return status;
    }
    status = rte_ixgbevf_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init ixgbevf PMD\n");
        return status;
    }
#endif /* RTE_LIBRTE_IXGBE_PMD */

#ifdef RTE_LIBRTE_VIRTIO_PMD
    status = rte_virtio_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init virtio PMD\n");
        return status;
    }
#endif /* RTE_LIBRTE_VIRTIO_PMD */

#ifdef RTE_LIBRTE_VMXNET3_PMD
    status = rte_vmxnet3_pmd_init();
    if (status != 0) {
        RTE_LOG(ERR, PMD, "Cannot init vmxnet3 PMD\n");
        return status;
    }
#endif /* RTE_LIBRTE_VMXNET3_PMD */

    /* still at its initial value: no PMD section was compiled in */
    if (status == -ENODEV) {
        RTE_LOG(ERR, PMD, "No PMD(s) are configured\n");
    }
    return status;
}

 

註冊EM驅動

/*
 * Register the em poll-mode driver (rte_em_pmd) with the Ethernet driver
 * layer.  Always returns 0.
 */
int
rte_em_pmd_init(void)
{
    struct eth_driver *em_drv = &rte_em_pmd;

    rte_eth_driver_register(em_drv);
    return 0;
}

/**
 * Register an Ethernet [Poll Mode] driver.
 *
 * Function invoked by the initialization function of an Ethernet driver
 * to simultaneously register itself as a PCI driver and as an Ethernet
 * Poll Mode Driver.
 * Invokes the rte_eal_pci_register() function to register the *pci_drv*
 * structure embedded in the *eth_drv* structure, after having stored the
 * address of the rte_eth_dev_init() function in the *devinit* field of
 * the *pci_drv* structure.
 * During the PCI probing phase, the rte_eth_dev_init() function is
 * invoked for each PCI [Ethernet device] matching the embedded PCI
 * identifiers provided by the driver.
 */
void
rte_eth_driver_register(struct eth_driver *eth_drv)
{
    eth_drv->pci_drv.devinit = rte_eth_dev_init;
    rte_eal_pci_register(&eth_drv->pci_drv);
}

/*
 * Register a PCI driver: append it to the tail of the global driver_list
 * that the probe code iterates over.
 */
void
rte_eal_pci_register(struct rte_pci_driver *driver)
{
    TAILQ_INSERT_TAIL(&driver_list, driver, next);
}

 

這裡PMD驅動結構(struct eth_driver)包含了PMD驅動部分和PCI驅動部分

/**
 * @internal
 * The structure associated with a PMD Ethernet driver.
 *
 * Each Ethernet driver acts as a PCI driver and is represented by a generic
 * *eth_driver* structure that holds:
 *
 * - An *rte_pci_driver* structure (which must be the first field).
 *   rte_eth_dev_init() receives a pointer to this embedded member and casts
 *   it straight back to a *struct eth_driver* pointer, which is only valid
 *   while the member sits at offset 0.
 *
 * - The *eth_dev_init* function invoked for each matching PCI device.
 *
 * - The size of the private data to allocate for each matching device
 *   (used by rte_eth_dev_init() as the allocation size of *dev_private*
 *   in the primary process).
 */
struct eth_driver {
    struct rte_pci_driver pci_drv;    /**< The PMD is also a PCI driver. */
    eth_dev_init_t eth_dev_init;      /**< Device init function. */
    unsigned int dev_private_size;    /**< Size of device private data. */
};

 

接下來探測PCI設備:如果不存在白名單,則對每個device嘗試所有已註冊的驅動(返回值被忽略,失敗不報錯);如果存在白名單,則只處理白名單中的device,且其驅動加載失敗時直接退出;

/*
 * Walk every device on device_list and offer it to the registered
 * drivers (whose devinit() is called on an id_table match).
 *
 * Without a whitelist every device is probed and the outcome is ignored.
 * With a whitelist only whitelisted devices are probed, and a probe
 * failure on one of them terminates the application through rte_exit().
 *
 * Always returns 0.
 */
int
rte_eal_pci_probe(void)
{
    struct rte_pci_device *pdev;

    TAILQ_FOREACH(pdev, &device_list, next) {
        if (!eal_dev_whitelist_exists()) {
            /* no whitelist: best effort, result intentionally unused */
            pci_probe_all_drivers(pdev);
            continue;
        }

        if (!pcidev_is_whitelisted(pdev)) {
            continue;
        }

        /* an explicitly requested device must be usable */
        if (pci_probe_all_drivers(pdev) < 0) {
            rte_exit(EXIT_FAILURE, "Requested device " PCI_PRI_FMT
                    " cannot be used\n", pdev->addr.domain, pdev->addr.bus,
                    pdev->addr.devid, pdev->addr.function);
        }
    }

    return 0;
}

對於每個device,依次嘗試driver_list中的每個driver是否可以加載;帶RTE_PCI_DRV_MULTIPLE標記的驅動會對同一device重複加載,直到加載失敗為止,第三方(非Intel)網卡驅動可能需要這種方式;

/*
 * Offer one PCI device to every registered driver in turn.
 *
 * The device's blacklisted flag is refreshed first.  Each driver is tried
 * through rte_eal_pci_probe_one_driver(): a negative result stops the
 * search, a positive result means "this driver does not match" and the
 * next driver is tried, and zero means the driver took the device.
 * A driver flagged RTE_PCI_DRV_MULTIPLE is then probed again and again
 * on the same (non-blacklisted) device until it stops returning 0; this
 * is needed by non-Intel NIC drivers provided by third parties such as
 * 6WIND.
 *
 * Returns 0 when a driver accepted the device, -1 on error or when no
 * driver was found.
 */
static int
pci_probe_all_drivers(struct rte_pci_device *dev)
{
    struct rte_pci_driver *drv;

    dev->blacklisted = !!is_blacklisted(dev);

    TAILQ_FOREACH(drv, &driver_list, next) {
        int status = rte_eal_pci_probe_one_driver(drv, dev);

        if (status < 0) {
            /* hard error: give up on this device */
            return -1;
        }
        if (status > 0) {
            /* driver does not handle this device, try the next one */
            continue;
        }

        /*
         * Blacklisted devices make the probe return 0 without doing
         * anything, so they must be excluded from the repeat loop.
         */
        if (!dev->blacklisted && (drv->drv_flags & RTE_PCI_DRV_MULTIPLE)) {
            while (rte_eal_pci_probe_one_driver(drv, dev) == 0)
                ;
        }
        return 0;
    }

    return -1;
}

 

驅動加載

/*
 * Try to bind one driver to one device.
 *
 * The driver's id_table is scanned up to its zero vendor_id terminator.
 * On the first entry matching the device's identifiers the device's
 * resources are prepared (depending on the build option and driver flags)
 * and the driver's devinit() hook is called.
 *
 * Returns devinit()'s result on a match, 0 without initialising when the
 * device is blacklisted, -1 when resource preparation fails, and 1 when
 * the table holds no matching entry.
 */
int
rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *dev)
{
    struct rte_pci_id *entry;

    /* author's note: the id tables live in rte_pci_dev_ids.h */
    for (entry = dr->id_table; entry->vendor_id != 0; entry++) {
        struct rte_pci_addr *loc;

        /*
         * Skip entries whose identifiers differ from the device's;
         * PCI_ANY_ID in the table matches any value for that field.
         */
        if ((entry->vendor_id != dev->id.vendor_id &&
                entry->vendor_id != PCI_ANY_ID) ||
            (entry->device_id != dev->id.device_id &&
                entry->device_id != PCI_ANY_ID) ||
            (entry->subsystem_vendor_id != dev->id.subsystem_vendor_id &&
                entry->subsystem_vendor_id != PCI_ANY_ID) ||
            (entry->subsystem_device_id != dev->id.subsystem_device_id &&
                entry->subsystem_device_id != PCI_ANY_ID)) {
            continue;
        }

        /* this driver matches the device */
        loc = &dev->addr;

        RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
                loc->domain, loc->bus, loc->devid, loc->function,
                dev->numa_node);

        RTE_LOG(DEBUG, EAL, "  probe driver: %x:%x %s\n", dev->id.vendor_id,
                dev->id.device_id, dr->name);

        /* blacklisted devices are never initialised; not an error */
        if (dev->blacklisted) {
            RTE_LOG(DEBUG, EAL, "  Device is blacklisted, not initializing\n");
            return 0;
        }

#ifdef RTE_EAL_UNBIND_PORTS
        if (dr->drv_flags & RTE_PCI_DRV_NEED_IGB_UIO) {
            /* unbind driver and load uio resources for Intel NICs */
            if (pci_switch_module(dr, dev, 1, IGB_UIO_NAME) < 0) {
                return -1;
            }
        } else if ((dr->drv_flags & RTE_PCI_DRV_FORCE_UNBIND) &&
                   (rte_eal_process_type() == RTE_PROC_PRIMARY)) {
            /* unbind current driver */
            if (pci_unbind_kernel_driver(dev) < 0) {
                return -1;
            }
        }
#else
        /*
         * Just map the resources for Intel NICs (author's note: the
         * device's uio mapping address and size are fetched first and
         * then mapped through /dev/uioX).
         */
        if ((dr->drv_flags & RTE_PCI_DRV_NEED_IGB_UIO) &&
                pci_uio_map_resource(dev) < 0) {
            return -1;
        }
#endif

        /* remember which driver owns the device */
        dev->driver = dr;

        /* hand over to the PCI driver's devinit() hook */
        return dr->devinit(dr, dev);
    }

    /* a positive value tells the caller that the driver did not match */
    return 1;
}

 

映射PCI地址空間到用戶空間的過程

/*
 * Map the PCI resources of a device into virtual memory through its UIO
 * device node.
 *
 * Secondary processes (except for PCI_VENDOR_ID_QUMRANET devices) are
 * handed to pci_uio_map_secondary(), which uses already recorded details.
 * For PCI_VENDOR_ID_QUMRANET devices only the port-IO start and size are
 * read from sysfs and stored in mem_resource[0]; no mapping is made on
 * that path.  Otherwise every non-empty BAR that has a UIO map with the
 * same physical address and size is mapped, and the details are recorded
 * in a uio_resource that is appended to uio_res_list.
 *
 * Returns 0 on success and a negative value on failure.  The uio_resource
 * record is released on every failure path taken after its allocation.
 * NOTE(review): BARs that were already mapped before a later failure are
 * not unmapped here; no unmap helper is visible from this function.
 */
static int
pci_uio_map_resource(struct rte_pci_device *dev)
{
    int i, j;
    char dirname[PATH_MAX];
    char filename[PATH_MAX];
    char devname[PATH_MAX]; /* contains the /dev/uioX */
    void *mapaddr;
    int uio_num;
    unsigned long start,size;
    uint64_t phaddr;
    uint64_t offset;
    uint64_t pagesz;
    ssize_t nb_maps;
    struct rte_pci_addr *loc = &dev->addr;
    struct uio_resource *uio_res;
    struct uio_map *maps;

    dev->intr_handle.fd = -1;

    /* only the primary process creates the mappings */
    /* secondary processes - use already recorded details */
    if ((rte_eal_process_type() != RTE_PROC_PRIMARY) &&
        (dev->id.vendor_id != PCI_VENDOR_ID_QUMRANET))
        return (pci_uio_map_secondary(dev));

    /*
     * find uio resource (author's note: the uio device id bound to this
     * device is found through a sysfs path such as
     * /sys/bus/pci/devices/0000:02:01.0/uio/uio0)
     */
    uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname));
    if (uio_num < 0) {
        RTE_LOG(WARNING, EAL, "  "PCI_PRI_FMT" not managed by UIO driver, "
                "skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
        return -1;
    }

    /* port-IO only path for PCI_VENDOR_ID_QUMRANET devices */
    if(dev->id.vendor_id == PCI_VENDOR_ID_QUMRANET) {
        /* get portio size */
        rte_snprintf(filename, sizeof(filename),
             "%s/portio/port0/size", dirname);
        if (eal_parse_sysfs_value(filename, &size) < 0) {
            RTE_LOG(ERR, EAL, "%s(): cannot parse size\n",
                __func__);
            return -1;
        }

        /* get portio start */
        rte_snprintf(filename, sizeof(filename),
             "%s/portio/port0/start", dirname);
        if (eal_parse_sysfs_value(filename, &start) < 0) {
            RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n",
                __func__);
            return -1;
        }
        dev->mem_resource[0].addr = (void *)(uintptr_t)start;
        dev->mem_resource[0].len =  (uint64_t)size;
        RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx with size=0x%lx\n", start, size);
        /* rte_virtio_pmd does not need any other bar even if available */
        return (0);
    }

    /* allocate the mapping details for secondary processes*/
    if ((uio_res = rte_zmalloc("UIO_RES", sizeof (*uio_res), 0)) == NULL) {
        RTE_LOG(ERR, EAL,
            "%s(): cannot store uio mmap details\n", __func__);
        return (-1);
    }

    rte_snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
    rte_snprintf(uio_res->path, sizeof(uio_res->path), "%s", devname);
    memcpy(&uio_res->pci_addr, &dev->addr, sizeof(uio_res->pci_addr));

    /*
     * collect info about device mappings: all maps of the uio device are
     * recorded in uio_res->maps and their count is returned
     */
    if ((nb_maps = pci_uio_get_mappings(dirname, uio_res->maps,
            sizeof (uio_res->maps) / sizeof (uio_res->maps[0])))
            < 0) {
        /* the record was never linked anywhere: release it */
        rte_free(uio_res);
        return (nb_maps);
    }

    uio_res->nb_maps = nb_maps;

    /* Map all BARs */
    pagesz = sysconf(_SC_PAGESIZE);

    maps = uio_res->maps;
    for (i = 0; i != PCI_MAX_RESOURCE; i++) {

        /*
         * dev->mem_resource was filled in earlier (author's note: by
         * rte_eal_init -> rte_eal_pci_init, read from a file such as
         * /sys/bus/pci/devices/0000:02:01.0/resource):
         *
         * physical start     physical end       flags (10th bit = IO memory)
         * 0x00000000fd5a0000 0x00000000fd5bffff 0x0000000000140204
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x00000000fdff0000 0x00000000fdffffff 0x0000000000140204
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x0000000000002000 0x000000000000203f 0x0000000000040101
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x00000000e7b00000 0x00000000e7b0ffff 0x000000000004e200
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         * 0x0000000000000000 0x0000000000000000 0x0000000000000000
         */
        /* skip empty BAR */
        if ((phaddr = dev->mem_resource[i].phys_addr) == 0)
            continue;

        /* look for the uio map with this BAR's physical address and size */
        for (j = 0; j != nb_maps && (phaddr != maps[j].phaddr ||
                dev->mem_resource[i].len != maps[j].size);
                j++)
            ;

        /*
         * if matching map is found, then use it: pci_map_resource() is
         * given the /dev/uioX name and an offset of j pages to map it
         */
        if (j != nb_maps) {
            offset = j * pagesz;
            if (maps[j].addr != NULL ||
                    (mapaddr = pci_map_resource(dev,
                    NULL, devname, (off_t)offset,
                    (size_t)maps[j].size)) == NULL) {
                /* map already in use or mapping failed: drop the record */
                rte_free(uio_res);
                return (-1);
            }

            maps[j].addr = mapaddr;
            maps[j].offset = offset;
            dev->mem_resource[i].addr = mapaddr;
        }
    }

    /* publish the record on uio_res_list */
    TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);

    return (0);
}

 

回到pci驅動的初始化rte_eth_dev_init

/*
 * PCI devinit hook installed by rte_eth_driver_register() for Ethernet
 * PMDs.
 *
 * Obtains an rte_eth_dev entry, allocates the PMD's private data (primary
 * process only), fills in the generic fields and finally calls the PMD's
 * own eth_dev_init() function.
 *
 * pci_drv: the PCI driver; it is the first member of a struct eth_driver
 *          and is cast back to that type.
 * pci_dev: the PCI device being initialised.
 *
 * Returns 0 on success, -ENOMEM when no rte_eth_dev entry is available,
 * or the PMD init function's non-zero status after undoing the private
 * data allocation and the port count.
 */
static int
rte_eth_dev_init(struct rte_pci_driver *pci_drv,
         struct rte_pci_device *pci_dev)
{
    struct eth_driver    *eth_drv;
    struct rte_eth_dev *eth_dev;
    int diag;

    eth_drv = (struct eth_driver *)pci_drv;

    /*
     * author's note: rte_eth_dev_allocate() allocates or looks up the
     * memzone named rte_eth_dev_data and returns this port's entry from
     * the global rte_eth_devices array
     */
    eth_dev = rte_eth_dev_allocate();
    if (eth_dev == NULL)
        return -ENOMEM;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY){
        /* allocate the PMD's private data */
        eth_dev->data->dev_private = rte_zmalloc("ethdev private structure",
                  eth_drv->dev_private_size,
                  CACHE_LINE_SIZE);
        if (eth_dev->data->dev_private == NULL)
            rte_panic("Cannot allocate memzone for private port data\n");
    }
    eth_dev->pci_dev = pci_dev;
    eth_dev->driver = eth_drv;
    eth_dev->data->rx_mbuf_alloc_failed = 0;

    /* init user callbacks */
    TAILQ_INIT(&(eth_dev->callbacks));

    /*
     * Set the default maximum frame size.
     */
    eth_dev->data->max_frame_size = ETHER_MAX_LEN;

    /*
     * Invoke PMD device initialization function; we are currently inside
     * the PCI driver's init hook, this call enters the PMD's own init.
     */
    diag = (*eth_drv->eth_dev_init)(eth_drv, eth_dev);
    if (diag == 0)
        return (0);

    /*
     * Initialisation failed: release the private data and fix up the port
     * count.  Both ids are printed in hex to match their "0x" prefix.
     */
    PMD_DEBUG_TRACE("driver %s: eth_dev_init(vendor_id=0x%x device_id=0x%x)"
            " failed\n", pci_drv->name,
            (unsigned) pci_dev->id.vendor_id,
            (unsigned) pci_dev->id.device_id);
    if (rte_eal_process_type() == RTE_PROC_PRIMARY)
        rte_free(eth_dev->data->dev_private);
    nb_ports--;
    return diag;
}

 

PMD驅動的初始化過程

/*
 * PMD-level init function of the em driver, called for each matching
 * device.
 *
 * Installs the device operations and burst functions, then (primary
 * process only) initialises the hardware, allocates the MAC address
 * table, clears the shadow VFTA and registers the interrupt callback.
 *
 * Returns 0 on success, -ENODEV when the hardware cannot be initialised
 * and -ENOMEM when the MAC address table cannot be allocated.
 */
static int
eth_em_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
        struct rte_eth_dev *eth_dev)
{
    struct rte_pci_device *pci_dev = eth_dev->pci_dev;
    struct e1000_hw *hw =
        E1000_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
    struct e1000_vfta *shadow_vfta =
        E1000_DEV_PRIVATE_TO_VFTA(eth_dev->data->dev_private);
    int hw_failed;

    eth_dev->dev_ops = &eth_em_ops;
    eth_dev->rx_pkt_burst = (eth_rx_burst_t)&eth_em_recv_pkts;
    eth_dev->tx_pkt_burst = (eth_tx_burst_t)&eth_em_xmit_pkts;

    /*
     * A secondary process stops here because the primary has already done
     * the remaining work; it only has to pick the scattered RX function
     * when scattered RX is in use.
     */
    if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
        if (eth_dev->data->scattered_rx) {
            eth_dev->rx_pkt_burst =
                (eth_rx_burst_t)&eth_em_recv_scattered_pkts;
        }
        return 0;
    }

    hw->device_id = pci_dev->id.device_id;
    hw->hw_addr = (void *)pci_dev->mem_resource[0].addr;

    /* For ICH8 support we'll need to map the flash memory BAR */

    /* em_hw_init() is attempted only when the function setup succeeded */
    hw_failed = (e1000_setup_init_funcs(hw, TRUE) != E1000_SUCCESS);
    if (!hw_failed) {
        hw_failed = (em_hw_init(hw) != 0);
    }
    if (hw_failed) {
        PMD_INIT_LOG(ERR, "port_id %d vendorID=0x%x deviceID=0x%x: "
            "failed to init HW",
            eth_dev->data->port_id, pci_dev->id.vendor_id,
            pci_dev->id.device_id);
        return -ENODEV;
    }

    /* one ETHER_ADDR_LEN slot per receive-address register entry */
    eth_dev->data->mac_addrs = rte_zmalloc("e1000",
            ETHER_ADDR_LEN * hw->mac.rar_entry_count, 0);
    if (eth_dev->data->mac_addrs == NULL) {
        PMD_INIT_LOG(ERR, "Failed to allocate %d bytes needed to "
            "store MAC addresses",
            ETHER_ADDR_LEN * hw->mac.rar_entry_count);
        return -ENOMEM;
    }

    /* the permanent MAC address goes into the first slot */
    ether_addr_copy((struct ether_addr *) hw->mac.addr,
        eth_dev->data->mac_addrs);

    /* start with an all-zero shadow VFTA */
    memset(shadow_vfta, 0, sizeof(*shadow_vfta));

    PMD_INIT_LOG(INFO, "port_id %d vendorID=0x%x deviceID=0x%x\n",
            eth_dev->data->port_id, pci_dev->id.vendor_id,
            pci_dev->id.device_id);

    /* have eth_em_interrupt_handler called for this device's interrupts */
    rte_intr_callback_register(&(pci_dev->intr_handle),
        eth_em_interrupt_handler, (void *)eth_dev);

    return 0;
}

PMD驅動初始化主要是一些硬件相關的寄存器初始化以及函數指針的初始化,細節就不再分析了;函數的最後註冊了一個中斷處理函數,下面主要分析中斷處理的過程;

/*
 * Register a callback for an interrupt handle.
 *
 * The callback is appended to the source in intr_sources whose fd equals
 * intr_handle->fd; a new source is created when none exists yet.  When
 * the source had no callbacks before (or is new), one byte is written to
 * intr_pipe.writefd so that the interrupt thread rebuilds its wait list.
 *
 * intr_handle: handle whose fd identifies the interrupt source (author's
 *              note: for PCI devices this is the /dev/uioX descriptor).
 * cb, cb_arg:  function to call and the argument passed to it.
 *
 * Returns 0 on success, -EINVAL on invalid parameters, -ENOMEM when an
 * allocation fails and -EPIPE when the notification write fails.
 */
int
rte_intr_callback_register(struct rte_intr_handle *intr_handle,
            rte_intr_callback_fn cb, void *cb_arg)
{
    struct rte_intr_source *source;
    struct rte_intr_callback *entry;
    int status;
    int notify = 0;

    /* reject unusable arguments before touching any shared state */
    if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
        RTE_LOG(ERR, EAL,
            "Registering with invalid input parameter\n");
        return -EINVAL;
    }

    /* the list node describing this callback */
    entry = rte_zmalloc("interrupt callback list", sizeof(*entry), 0);
    if (entry == NULL) {
        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
        return -ENOMEM;
    }
    entry->cb_fn = cb;
    entry->cb_arg = cb_arg;

    rte_spinlock_lock(&intr_lock);

    /* attach to the existing source for this fd, if there is one */
    TAILQ_FOREACH(source, &intr_sources, next) {
        if (source->intr_handle.fd != intr_handle->fd) {
            continue;
        }

        /* first callback on this source: the thread is not waiting on it */
        if (TAILQ_EMPTY(&source->callbacks)) {
            notify = 1;
        }

        TAILQ_INSERT_TAIL(&(source->callbacks), entry, next);
        status = 0;
        break;
    }

    /* the loop ran off the end: no source for this fd yet, create one */
    if (source == NULL) {
        source = rte_zmalloc("interrupt source list", sizeof(*source), 0);
        if (source != NULL) {
            source->intr_handle = *intr_handle;
            TAILQ_INIT(&source->callbacks);
            TAILQ_INSERT_TAIL(&(source->callbacks), entry, next);
            TAILQ_INSERT_TAIL(&intr_sources, source, next);
            notify = 1;
            status = 0;
        } else {
            RTE_LOG(ERR, EAL, "Can not allocate memory\n");
            rte_free(entry);
            status = -ENOMEM;
        }
    }

    rte_spinlock_unlock(&intr_lock);

    /*
     * Tell the interrupt thread, through the pipe it waits on with
     * epoll_wait, that a new fd has to be added to its wait list.
     */
    if (notify && write(intr_pipe.writefd, "1", 1) < 0) {
        return -EPIPE;
    }

    return status;
}

 

在rte_eal_init初始化過程中調用了rte_eal_intr_init,rte_eal_intr_init裡面會創建一個中斷處理線程

/*
 * Set up interrupt handling: initialise the global list of interrupt
 * sources, create the wake-up pipe and start the interrupt thread.
 *
 * Returns 0 on success, -1 when the pipe cannot be created, and the
 * negated pthread_create() result when the thread cannot be started.
 */
int
rte_eal_intr_init(void)
{
    int rc;

    /* start with an empty list of interrupt sources */
    TAILQ_INIT(&intr_sources);

    /*
     * The pipe is added to the epoll set of the interrupt thread and is
     * written to whenever that set has to be rebuilt.
     */
    if (pipe(intr_pipe.pipefd) < 0) {
        return -1;
    }

    /*
     * Host thread that waits on the fds of intr_sources and invokes the
     * matching callbacks.
     */
    rc = pthread_create(&intr_thread, NULL, eal_intr_thread_main, NULL);
    if (rc != 0) {
        RTE_LOG(ERR, EAL,
            "Failed to create thread for interrupt handling\n");
    }

    return -rc;
}

 

/**
 * Body of the interrupt thread.
 *
 * Each iteration creates a fresh epoll instance, adds the wake-up pipe
 * and the fd of every interrupt source that has callbacks, and then lets
 * eal_intr_handle_interrupts() serve them.  When that call returns the
 * epoll instance is closed and the set is built again.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never return;
 */
static __attribute__((noreturn)) void *
eal_intr_thread_main(__rte_unused void *arg)
{
    /* event template for the wake-up pipe; only data.fd is set at run time */
    static struct epoll_event pipe_event = {
        .events = EPOLLIN | EPOLLPRI,
    };
    struct epoll_event ev;

    /* host thread, never break out */
    for (;;) {
        struct rte_intr_source *source;
        unsigned nb_fds = 0;
        int epfd;

        epfd = epoll_create(1);
        if (epfd < 0)
            rte_panic("Cannot create epoll instance\n");

        /*
         * The pipe comes first: registering a new interrupt handler writes
         * to it, which makes this thread rebuild the set from intr_sources.
         */
        pipe_event.data.fd = intr_pipe.readfd;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                        &pipe_event) < 0) {
            rte_panic("Error adding fd to %d epoll_ctl, %s\n",
                    intr_pipe.readfd, strerror(errno));
        }
        nb_fds++;

        rte_spinlock_lock(&intr_lock);

        /* add the fd of every source that has at least one callback */
        TAILQ_FOREACH(source, &intr_sources, next) {
            if (TAILQ_EMPTY(&source->callbacks))
                continue;

            ev.events = EPOLLIN | EPOLLPRI;
            ev.data.fd = source->intr_handle.fd;

            if (epoll_ctl(epfd, EPOLL_CTL_ADD,
                    source->intr_handle.fd, &ev) < 0) {
                rte_panic("Error adding fd %d epoll_ctl, %s\n",
                    source->intr_handle.fd, strerror(errno));
            } else {
                nb_fds++;
            }
        }

        rte_spinlock_unlock(&intr_lock);

        /* wait for events on the set and run the matching callbacks */
        eal_intr_handle_interrupts(epfd, nb_fds);

        /* back here means the list of fds to monitor must be rebuilt */
        close(epfd);
    }
}

 

/*
 * Handle the events reported for the interrupt epoll set.
 *
 * events: array of ready epoll events.
 * nfds:   number of entries in events.
 *
 * For every event the matching interrupt source is looked up by fd, the
 * fd is read to clear its ready state and, when data was read, every
 * callback registered on the source is called with intr_lock released.
 *
 * Returns -1 as soon as the wake-up pipe is found readable (after reading
 * from it) so that the wait list gets rebuilt; returns 0 once all events
 * have been processed.
 */
static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
    int n, bytes_read;
    struct rte_intr_source *src;
    struct rte_intr_callback *cb;
    union rte_intr_read_buffer buf;
    struct rte_intr_callback active_cb;

    for (n = 0; n < nfds; n++) {

        /*
         * intr_pipe.readfd being ready means the epoll wait list has to
         * be rebuilt, so this loop is abandoned.
         */
        /**
         * if the pipe fd is ready to read, return out to
         * rebuild the wait list.
         */
        if (events[n].data.fd == intr_pipe.readfd){
            int r = read(intr_pipe.readfd, buf.charbuf,
                    sizeof(buf.charbuf));
            RTE_SET_USED(r);
            return -1;
        }

        /* otherwise it is an interrupt fd: find its source under the lock */
        rte_spinlock_lock(&intr_lock);
        TAILQ_FOREACH(src, &intr_sources, next)
            if (src->intr_handle.fd ==
                    events[n].data.fd)
                break;
        if (src == NULL){
            /* no source owns this fd; skip the event */
            rte_spinlock_unlock(&intr_lock);
            continue;
        }

        /* mark this interrupt source as active and release the lock. */
        src->active = 1;
        rte_spinlock_unlock(&intr_lock);

        /* author's note: the em driver only needs the first two cases */
        /* set the length to be read for different handle type */
        switch (src->intr_handle.type) {
        case RTE_INTR_HANDLE_UIO:
            bytes_read = 4;
            break;
        case RTE_INTR_HANDLE_ALARM:
            bytes_read = sizeof(uint64_t);
            break;
        default:
            bytes_read = 1;
            break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         * bytes_read is the requested length going in and the result of
         * read() coming out.
         */
        bytes_read = read(events[n].data.fd, &buf, bytes_read);

        if (bytes_read < 0)
            RTE_LOG(ERR, EAL, "Error reading from file "
                "descriptor %d: %s\n", events[n].data.fd,
                            strerror(errno));
        else if (bytes_read == 0)
            RTE_LOG(ERR, EAL, "Read nothing from file "
                "descriptor %d\n", events[n].data.fd);

        /* invoke the callbacks */
        /* grab a lock, again to call callbacks and update status. */
        rte_spinlock_lock(&intr_lock);

        if (bytes_read > 0) {

            /*
             * Finally, call all callbacks.
             * NOTE(review): the iteration continues from cb after the lock
             * was dropped; presumably removal paths honour src->active so
             * the list is not modified meanwhile -- confirm against the
             * unregister code, which is not visible here.
             */
            TAILQ_FOREACH(cb, &src->callbacks, next) {

                /* make a copy and unlock. */
                active_cb = *cb;
                rte_spinlock_unlock(&intr_lock);

                /* call the actual callback */
                active_cb.cb_fn(&src->intr_handle,
                    active_cb.cb_arg);

                /* get the lock back. */
                rte_spinlock_lock(&intr_lock);
            }
        }

        /* we done with that interrupt source, release it. */
        src->active = 0;
        rte_spinlock_unlock(&intr_lock);
    }

    return 0;
}

 

對於E1000驅動,其註冊的callback eth_em_interrupt_handler裡面處理了link狀態的變化:收到link down消息則關閉收發包,收到link up消息則開啟收發包;

/*
 * Interrupt callback registered by the em PMD.
 *
 * handle: the interrupt handle that fired (unused).
 * param:  the rte_eth_dev that was passed at registration time.
 */
static void
eth_em_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
                            void *param)
{
    struct rte_eth_dev *eth_dev = param;

    /* author's note: reads the registers to see whether the hardware state changed */
    eth_em_interrupt_get_status(eth_dev);

    /* author's note: programs the RX/TX registers according to the link state */
    eth_em_interrupt_action(eth_dev);

    /*
     * Run the user callbacks for the link-status-change event; the author
     * notes these are registered with rte_eth_dev_callback_register().
     */
    _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC);
}

 

後面還有收發包隊列的初始化,待分析;


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM