1、A bit of background on why epoll exists. Using Java as an example, the most primitive server code looks like this:
ServerSocket ss = new ServerSocket(8888);   // create the listening socket once, outside the loop
System.out.println("Server started....");
while (true) {
    Socket s = ss.accept();                 // blocking point 1: wait for a client to connect
    System.out.println("Client " + s.getInetAddress() + " connected to the server");
    BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream()));
    // read the message sent by the client
    String mess = br.readLine();            // blocking point 2: wait for the client to send data
    System.out.println("Client: " + mess);
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(s.getOutputStream()));
    bw.write(mess + "\n");
    bw.flush();
}
In a single thread, an endless loop keeps accepting client connections and their data. The code above has two choke points. The first is accept: the server listens on port 8888 for incoming client connections, and if none arrives it just blocks and the code cannot move on. The second is readLine: once a connection is established the server waits for the client to send data, and if nothing arrives it blocks again and the code after it still cannot run. The flaw is obvious: a single thread can handle only one client's connection and data at a time, blocking whenever there is nothing to do, so connection requests from other clients simply cannot be served.
Since the problem above is caused by using a single thread, wouldn't switching to multiple threads fix it? Each thread does its own accept and readLine, so even if it blocks, only that one thread is stalled; whenever a new client connects, the server just spins up a dedicated thread for it, the clients don't affect each other, and the problem is perfectly solved? ( ̄▽ ̄)" If it were really that simple, epoll would never have been needed! Multithreading has its own flaws: a connected client will not necessarily keep sending data, yet the server still has to dedicate a thread to that connection and stay ready to receive. Opening a socket (which the Linux kernel represents as a file descriptor, fd) plus a thread for every client visibly eats the server's hardware resources. If a large number of clients connect but never send anything, the server ends up holding a pile of threads and sockets; the CPU gets burned on polling and context switching between all those threads, and memory can be exhausted too (isn't that exactly a DDoS attack?), until the server can't get any real work done. So what now?
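For reference, a minimal thread-per-connection echo server in C might look like the sketch below (this is not from the original article; port 8888, the echo behaviour and all names are illustrative assumptions, and error handling is omitted). Every accepted client costs one fd plus one thread, which is exactly the overhead described above.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <pthread.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

static void *handle_client(void *arg)
{
    int fd = *(int *)arg;
    free(arg);
    char buf[1024];
    ssize_t n;
    /* blocks here until this particular client sends something */
    while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
        send(fd, buf, n, 0);                 /* echo it back */
    close(fd);
    return NULL;
}

int main(void)
{
    int listenfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8888);
    bind(listenfd, (struct sockaddr *)&addr, sizeof(addr));
    listen(listenfd, 128);
    for (;;) {
        int *connfd = malloc(sizeof(int));
        *connfd = accept(listenfd, NULL, NULL);              /* blocking point 1 */
        pthread_t tid;
        pthread_create(&tid, NULL, handle_client, connfd);   /* one thread per client */
        pthread_detach(tid);
    }
}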
2、Looking back at the problems above, they all boil down to accept and readLine blocking. Could a single thread plus non-blocking calls solve this? That is exactly where epoll came from!
(1) Following the usual approach, let's first look at the relevant structures. eventpoll holds the root node of the red-black tree and the ready list (rdllist); since it holds the root node, an instance of this structure clearly has to be created first.
struct eventpoll {
    ...
    /* Root of the red-black tree; this tree stores every event added to
       this epoll instance, i.e. everything this epoll is monitoring */
    struct rb_root rbr;
    /* The doubly linked list rdllist holds the ready events that satisfy
       the conditions and will be returned to user space via epoll_wait */
    struct list_head rdllist;
    ...
};
epitem: the node type that makes up the red-black tree. The field that ties it to epoll's actual business is epoll_event.
struct epitem {
    ...
    // red-black tree node
    struct rb_node rbn;
    // doubly linked list node
    struct list_head rdllink;
    // the file handle (fd + struct file) this event refers to
    struct epoll_filefd ffd;
    // the eventpoll instance this item belongs to
    struct eventpoll *ep;
    // the event types we are interested in
    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
    // the next epitem instance
    struct epitem *next;
    ...
}; // one of these holds the information associated with each monitored event
Judging by the English comment in the source, the epoll_event structure has just two fields: the events and the data (typically the fd). It ties each event to its fd so that events and fds never get mixed up; it would be rather embarrassing if data arriving on process A's socket were mistakenly mapped to process B's socket!
struct epoll_event {
    __u32 events;
    __u64 data;
} EPOLL_PACKED;
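For comparison, the user-space header <sys/epoll.h> declares data as a union (epoll_data_t), which is what lets an application remember which fd (or pointer) each event belongs to. A small hedged illustration; connfd is assumed to be an already-connected socket created elsewhere:

typedef union epoll_data {
    void     *ptr;
    int       fd;
    uint32_t  u32;
    uint64_t  u64;
} epoll_data_t;

struct epoll_event {
    uint32_t     events;    /* epoll events (EPOLLIN, EPOLLOUT, ...) */
    epoll_data_t data;      /* user data variable */
};

/* typical usage: remember the fd inside the event itself */
struct epoll_event ev;
ev.events  = EPOLLIN;       /* interested in "readable" */
ev.data.fd = connfd;        /* so when the event fires we know exactly whose socket it is */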
The relationships among these three structures are shown in the diagram below. Three functions are marked on it, and it is precisely these three functions that build the red-black tree and the two doubly linked lists.
Note: there are two linked lists in that diagram. The struct behind the task wait list is shown below; the most important members are the second and third fields. When a process has no data to read it is put on this wait_queue; when the NIC receives data it notifies the CPU via an interrupt to fetch the data, and then the third field, the callback func, is executed.
struct __wait_queue {
    unsigned int flags;
    void *private;              // pointer to the waiting task's task_struct
    wait_queue_func_t func;     // callback executed when the task is woken up
    struct list_head task_list;
};
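For context, this is the classic way kernel code sleeps on such a wait queue (a simplified, hedged sketch rather than anything quoted from eventpoll.c; my_wq and condition are placeholders). DECLARE_WAITQUEUE fills private with current and func with default_wake_function; epoll instead installs its own func, ep_poll_callback, on the monitored socket's wait queue.

/* Simplified sleep/wake pattern on a wait queue (illustrative only). */
DECLARE_WAIT_QUEUE_HEAD(my_wq);             /* the wait queue head */

/* sleeper side */
DECLARE_WAITQUEUE(wait, current);           /* private = current, func = default_wake_function */
add_wait_queue(&my_wq, &wait);
while (!condition) {                        /* 'condition' is a placeholder */
    set_current_state(TASK_INTERRUPTIBLE);
    if (condition)
        break;
    schedule();                             /* give up the CPU until someone wakes us */
}
set_current_state(TASK_RUNNING);
remove_wait_queue(&my_wq, &wait);

/* waker side (e.g. the path that runs after the NIC interrupt delivered data) */
wake_up_interruptible(&my_wq);              /* walks the queue and calls each entry's func */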
The relationships between instances of these structures:
(2) Since the root node of the red-black tree lives inside the eventpoll structure, an eventpoll instance must be created first; this happens in epoll_create, as follows:
/*
 * Open an eventpoll file descriptor.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    /* Check the EPOLL_* constant for consistency. */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    /*
     * Create the internal data structure ("struct eventpoll").
     * Allocate an eventpoll instance and initialise the red-black tree
     * root and the list heads inside it.
     */
    error = ep_alloc(&ep);
    if (error < 0)
        return error;
    /*
     * Creates all the items needed to setup an eventpoll file. That is,
     * a file structure and a free file descriptor.
     */
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                              O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;

out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}
As its name says, the core job of this create function is to produce an instance of the eventpoll structure, which is done by calling ep_alloc.
/* Allocate and initialise an eventpoll instance */
static int ep_alloc(struct eventpoll **pep)
{
    int error;
    struct user_struct *user;
    struct eventpoll *ep;

    user = get_current_user();
    error = -ENOMEM;
    ep = kzalloc(sizeof(*ep), GFP_KERNEL);
    if (unlikely(!ep))
        goto free_uid;

    spin_lock_init(&ep->lock);
    mutex_init(&ep->mtx);
    init_waitqueue_head(&ep->wq);
    init_waitqueue_head(&ep->poll_wait);
    INIT_LIST_HEAD(&ep->rdllist);   /* initialise the head of the ready list */
    ep->rbr = RB_ROOT;              /* root of the red-black tree */
    ep->ovflist = EP_UNACTIVE_PTR;
    ep->user = user;
    *pep = ep;

    return 0;

free_uid:
    free_uid(user);
    return error;
}
Once the red-black tree root and the list heads have been created, the next step is building the whole tree and the lists, and that is all done in epoll_ctl. First, the types of epoll operations; there are three: add, delete and modify.
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
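Seen from user space, these three strings correspond to the op argument of epoll_ctl(2): EPOLL_CTL_ADD, EPOLL_CTL_MOD and EPOLL_CTL_DEL. A small hedged sketch (epfd and fd are assumed to exist already):

struct epoll_event ev;
ev.events  = EPOLLIN;                       /* care about readability                  */
ev.data.fd = fd;
epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);    /* "ADD": insert fd into the interest set  */

ev.events = EPOLLIN | EPOLLOUT;             /* now also care about writability         */
epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);    /* "MOD": modify the registered events     */

epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);   /* "DEL": stop monitoring fd altogether    */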
epoll_ctl is also a system call; its core code is below (a lot of defensive checks at the start of the function are omitted so they don't get in the way of the main flow). It is just a switch: the three op values map to the three operations. In essence it lets the kernel know which events the user cares about, and which functions must be called back when those events occur.
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
{
    int error;
    int full_check = 0;
    struct fd f, tf;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;
    struct eventpoll *tep = NULL;
    ..........
    /*
     * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     * Look up the epitem for this fd in the red-black tree.
     */
    epi = ep_find(ep, tf.file, fd);

    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            epds.events |= POLLERR | POLLHUP;
            /* insert an epoll node: both the list node and the red-black tree node */
            error = ep_insert(ep, &epds, tf.file, fd, full_check);
        } else
            error = -EEXIST;
        if (full_check)
            clear_tfile_check_list();
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            /* remove an epoll node: both the list node and the red-black tree node */
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                epds.events |= POLLERR | POLLHUP;
                /* modify an epoll node: both the list node and the red-black tree node */
                error = ep_modify(ep, epi, &epds);
            }
        } else
            error = -ENOENT;
        break;
    }
    ..........
}
As the code above shows, the real core functions are ep_insert, ep_remove and ep_modify. Let's look at ep_insert first (the error-handling code at the end is omitted to keep the focus); it mainly does the following:
- builds the wait_queue (implemented with a linked list)
- builds the red-black tree
- registers the callback to run once the task is woken up
/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                     struct file *tfile, int fd, int full_check)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    long user_watches;
    struct epitem *epi;
    struct ep_pqueue epq;

    user_watches = atomic_long_read(&ep->user->epoll_watches);
    if (unlikely(user_watches >= max_user_watches))
        return -ENOSPC;
    if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
        return -ENOMEM;

    /* Item initialization follow here ... */
    INIT_LIST_HEAD(&epi->rdllink);   /* initialise this item's ready-list link */
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    if (epi->event.events & EPOLLWAKEUP) {
        error = ep_create_wakeup_source(epi);
        if (error)
            goto error_create_wakeup_source;
    } else {
        RCU_INIT_POINTER(epi->ws, NULL);
    }

    /* Initialize the poll table using the queue callback */
    epq.epi = epi;
    /* register the callback used while queueing on the target file's wait queue */
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function. Note that after
     * this operation completes, the poll callback can start hitting
     * the new item.
     */
    revents = ep_item_poll(epi, &epq.pt);

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;

    /* Add the current item to the list of active epoll hook for this file */
    spin_lock(&tfile->f_lock);
    list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_lock);

    /*
     * Add the current item to the RB tree. All RB tree operations are
     * protected by "mtx", and ep_insert() is called with "mtx" held.
     * This is where the epitem node is inserted into the red-black tree.
     */
    ep_rbtree_insert(ep, epi);

    /* now check if we've created too many backpaths */
    error = -EINVAL;
    if (full_check && reverse_path_check())
        goto error_remove_epi;

    /* We have to drop the new item inside our item list to keep track of it */
    spin_lock_irqsave(&ep->lock, flags);

    /* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);   /* append this event to the tail of the ready list */
        ep_pm_stay_awake(epi);

        /* Notify waiting tasks that events are available.
           Once the NIC has received data, the waiting tasks must be woken up
           and the previously registered callback executed. */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);   /* wake up tasks sleeping on ep->wq (e.g. in epoll_wait) */
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    spin_unlock_irqrestore(&ep->lock, flags);

    atomic_long_inc(&ep->user->epoll_watches);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 0;
}
Compared with insertion, deleting a node is much easier: it is simply removed from the red-black tree and from the ready list.
/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
    unsigned long flags;
    struct file *file = epi->ffd.file;

    /*
     * Removes poll wait queue hooks. We _have_ to do this without holding
     * the "ep->lock" otherwise a deadlock might occur. This because of the
     * sequence of the lock acquisition. Here we do "ep->lock" then the wait
     * queue head lock when unregistering the wait queue. The wakeup callback
     * will run by holding the wait queue head lock and will call our callback
     * that will try to get "ep->lock".
     */
    ep_unregister_pollwait(ep, epi);

    /* Remove the current item from the list of epoll hooks */
    spin_lock(&file->f_lock);
    list_del_rcu(&epi->fllink);
    spin_unlock(&file->f_lock);

    rb_erase(&epi->rbn, &ep->rbr);   /* remove from the red-black tree */

    spin_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        list_del_init(&epi->rdllink);   /* remove from the ready list */
    spin_unlock_irqrestore(&ep->lock, flags);

    wakeup_source_unregister(ep_wakeup_source(epi));

    /*
     * At this point it is safe to free the eventpoll item. Use the union
     * field epi->rcu, since we are trying to minimize the size of
     * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     * use of the rbn field.
     */
    call_rcu(&epi->rcu, epi_rcu_free);

    atomic_long_dec(&ep->user->epoll_watches);

    return 0;
}
Finally there is epoll_wait, also a system call. To highlight the main path, the defensive code at the beginning is omitted; the core is a call to ep_poll.
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
{
    int error;
    struct fd f;
    struct eventpoll *ep;
    ..........
    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = f.file->private_data;

    /* Time to fish for events ... */
    error = ep_poll(ep, events, maxevents, timeout);
}
The logic of ep_poll is not complicated either: within the allowed timeout it checks the ready list; if the list is not empty, some events are ready, and those events are then pushed up to user space.
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
{
    int res = 0, eavail, timed_out = 0;
    unsigned long flags;
    u64 slack = 0;
    wait_queue_t wait;
    ktime_t expires, *to = NULL;

    if (timeout > 0) {
        struct timespec64 end_time = ep_set_mstimeout(timeout);

        slack = select_estimate_accuracy(&end_time);
        to = &expires;
        *to = timespec64_to_ktime(end_time);
    } else if (timeout == 0) {
        /*
         * Avoid the unnecessary trip to the wait queue loop, if the
         * caller specified a non blocking operation.
         */
        timed_out = 1;
        spin_lock_irqsave(&ep->lock, flags);
        goto check_events;
    }

fetch_events:
    spin_lock_irqsave(&ep->lock, flags);

    if (!ep_events_available(ep)) {
        /* If no event has happened yet, give up the CPU; once an event
         * arrives we will be woken up by ep_poll_callback().
         *
         * We don't have any available event to return to the caller.
         * We need to sleep here, and we will be wake up by
         * ep_poll_callback() when events will become available.
         */
        init_waitqueue_entry(&wait, current);   /* put the current task on the wait queue and set the wakeup callback */
        __add_wait_queue_exclusive(&ep->wq, &wait);

        for (;;) {
            /*
             * We don't want to sleep if the ep_poll_callback() sends us
             * a wakeup in between. That's why we set the task state
             * to TASK_INTERRUPTIBLE before doing the checks.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (ep_events_available(ep) || timed_out)   /* break out of the loop if events are ready or we timed out */
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            spin_unlock_irqrestore(&ep->lock, flags);
            /* Voluntarily yield the CPU and go to sleep; when an event arrives the
             * task is scheduled back in, continues the for loop, and breaks out above. */
            if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
                timed_out = 1;

            spin_lock_irqsave(&ep->lock, flags);
        }

        __remove_wait_queue(&ep->wq, &wait);   /* take ourselves off the wait queue */
        __set_current_state(TASK_RUNNING);     /* after waking up, set the task state back to running */
    }

check_events:
    /* Is it worth to try to dig for events ? */
    eavail = ep_events_available(ep);   /* check the ready list; if it is not empty, events have occurred */

    spin_unlock_irqrestore(&ep->lock, flags);

    /*
     * Try to transfer events to user space. In case we get 0 events and
     * there's still timeout left over, we go trying again in search of
     * more luck.
     * Copy the ready events to user space so the application can handle them.
     */
    if (!res && eavail &&
        !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
        goto fetch_events;

    return res;
}
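The three timeout branches above are exactly the user-visible semantics of the timeout argument of epoll_wait(2). As a quick hedged reminder (epfd, events and MAX_EVENTS are assumed to be set up elsewhere):

n = epoll_wait(epfd, events, MAX_EVENTS, 0);     /* timeout == 0: check the ready list once, never block     */
n = epoll_wait(epfd, events, MAX_EVENTS, 500);   /* timeout  > 0: block for at most 500 ms                   */
n = epoll_wait(epfd, events, MAX_EVENTS, -1);    /* timeout  < 0: block until an event (or a signal) arrives */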
epoll_create, epoll_ctl and epoll_wait may look like a lot of code above, but they are actually very simple to use; a minimal sketch follows, and a complete example is given at the end of the article. First create an epoll instance with epoll_create, then add the fd-to-event mappings with epoll_ctl. Personally I think the essence of the epoll idea shows up in epoll_wait: the function lets you choose how long to wait (i.e. block). When it runs, it returns and lets execution continue as soon as the ready list contains ready events, or once the timeout expires, instead of sitting there blocked and dumbly waiting forever; and because all of this runs in a single-threaded loop, it neatly solves the multithreading-plus-blocking problem from the start of the article.
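A minimal sketch of that usage pattern (hedged: error handling is omitted, listen_fd is assumed to be a listening socket created elsewhere, and the constants are arbitrary):

#include <sys/epoll.h>

#define MAX_EVENTS 64

void event_loop(int listen_fd)
{
    struct epoll_event ev, events[MAX_EVENTS];

    int epfd = epoll_create1(0);            /* 1. create the eventpoll instance */

    ev.events  = EPOLLIN;
    ev.data.fd = listen_fd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);   /* 2. register fd + interested events */

    for (;;) {
        /* 3. wait at most 500 ms; returns as soon as the ready list is non-empty */
        int n = epoll_wait(epfd, events, MAX_EVENTS, 500);
        for (int i = 0; i < n; i++) {
            /* events[i].data.fd tells us exactly which fd is ready:          */
            /* accept() on listen_fd, or recv()/send() on a connected socket. */
        }
    }
}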
When the NIC receives data, i.e. an event occurs, the kernel walks the wait_queue, wakes each entry and runs its callback func. For the entries that epoll installs on the socket's wait queue, that callback is ep_poll_callback; the generic wakeup path that ends up invoking these callbacks is __wake_up_common, whose core code is below:
/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 *
 * Runs the wakeup callback of every entry on the wait queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        /* invoke the entry's callback function */
        if (curr->func(curr, mode, wake_flags, key) &&
            (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
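The body of ep_poll_callback itself is not quoted in this article. Conceptually it does two things, roughly as in the heavily simplified sketch below (this is not the verbatim kernel source; locking, the ovflist corner cases and the exclusive-wakeup handling are all omitted):

/* Simplified idea of ep_poll_callback (illustrative, not the real source). */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct epitem *epi = ep_item_from_wait(wait);   /* recover the epitem from the wait entry */
    struct eventpoll *ep = epi->ep;

    /* 1. put this epitem on the ready list if it is not there yet */
    if (!ep_is_linked(&epi->rdllink))
        list_add_tail(&epi->rdllink, &ep->rdllist);

    /* 2. wake up any task sleeping in ep_poll()/epoll_wait() on ep->wq */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);

    return 1;
}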
On Windows, debugging with vmware + kali + qemu + vscode looks like this:
A breakpoint inside the epoll callback: the top-left pane shows the values of the variables, and the bottom-left pane shows the call stack. This is extremely useful in reverse engineering: for example, breaking on malloc lets you find out which function is allocating memory, and therefore where encrypted data is about to be stored.
3、A complete epoll demo, to give an intuitive feel for what epoll does:
#include <iostream>
#include <string>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <errno.h>

using namespace std;

#define MAXLINE 100
#define OPEN_MAX 100
#define LISTENQ 20
#define SERV_PORT 5000
#define INFTIM 1000

void setnonblocking(int sock)
{
    int opts;
    opts = fcntl(sock, F_GETFL);
    if (opts < 0) {
        perror("fcntl(sock,GETFL)");
        exit(1);
    }
    opts = opts | O_NONBLOCK;
    if (fcntl(sock, F_SETFL, opts) < 0) {
        perror("fcntl(sock,SETFL,opts)");
        exit(1);
    }
}

int main(int argc, char *argv[])
{
    int i, maxi, listenfd, connfd, sockfd, epfd, nfds, portnumber;
    ssize_t n;
    char line[MAXLINE];
    socklen_t clilen;
    string szTemp("");

    if (2 == argc) {
        if ((portnumber = atoi(argv[1])) < 0) {
            fprintf(stderr, "Usage:%s portnumber\a\n", argv[0]);
            return 1;
        }
    } else {
        fprintf(stderr, "Usage:%s portnumber\a\n", argv[0]);
        return 1;
    }

    // epoll_event variables: ev is used to register events,
    // the array carries back the events that need handling
    struct epoll_event ev, events[20];

    // create an epoll handle; size hints to the kernel how many fds will be monitored
    epfd = epoll_create(256);

    // the epoll-specific file descriptor above will also be used to handle accept
    struct sockaddr_in clientaddr;
    struct sockaddr_in serveraddr;
    listenfd = socket(AF_INET, SOCK_STREAM, 0);

    // put the socket into non-blocking mode
    //setnonblocking(listenfd);

    // the fd to monitor
    ev.data.fd = listenfd;
    // the events to monitor: readable when data arrives, edge-triggered
    ev.events = EPOLLIN | EPOLLET;
    // register the epoll event, i.e. the monitored fd plus its events, passed in via ev;
    // under the hood this is an insert/delete/modify/lookup on the red-black tree
    epoll_ctl(epfd, EPOLL_CTL_ADD, listenfd, &ev);

    bzero(&serveraddr, sizeof(serveraddr));
    /* configure the server socket's address information */
    serveraddr.sin_family = AF_INET;
    const char *local_addr = "127.0.0.1";
    inet_aton(local_addr, &(serveraddr.sin_addr));
    serveraddr.sin_port = htons(portnumber);
    bind(listenfd, (sockaddr *)&serveraddr, sizeof(serveraddr));
    listen(listenfd, LISTENQ);

    maxi = 0;
    for ( ; ; ) {
        // wait for epoll events; returns the number of events to handle, nfds
        // (0 means the call timed out)
        nfds = epoll_wait(epfd, events, 20, 500);

        // handle all events that occurred
        for (i = 0; i < nfds; ++i) {
            // a new client connected to the bound listening socket: establish the connection
            if (events[i].data.fd == listenfd) {
                // accept is only called once a connection request has actually arrived,
                // so no time is wasted blocking
                clilen = sizeof(clientaddr);
                connfd = accept(listenfd, (sockaddr *)&clientaddr, &clilen);
                if (connfd < 0) {
                    perror("connfd < 0");
                    exit(1);
                }
                //setnonblocking(connfd);

                char *str = inet_ntoa(clientaddr.sin_addr);
                cout << "accept a connection from " << str << endl;

                // the fd used for read operations
                ev.data.fd = connfd;
                // register the read event, edge-triggered
                ev.events = EPOLLIN | EPOLLET;
                // register ev: the newly accepted fd must likewise be monitored
                // for incoming/outgoing data, edge-triggered, etc.
                epoll_ctl(epfd, EPOLL_CTL_ADD, connfd, &ev);
            }
            // an already-connected client sent data: read it in
            else if (events[i].events & EPOLLIN) {
                cout << "EPOLLIN" << endl;
                if ((sockfd = events[i].data.fd) < 0)
                    continue;
                // recv is only called once the NIC has received data, copying it to
                // user space without wasting time blocking
                if ((n = recv(sockfd, line, sizeof(line) - 1, 0)) < 0) {
                    // Connection reset: the other end has already hung up, yet we are
                    // still trying to read/write on its dead socket fd
                    if (errno == ECONNRESET) {
                        close(sockfd);
                        events[i].data.fd = -1;
                    } else
                        std::cout << "readline error" << std::endl;
                    continue;
                } else if (n == 0) { // the peer closed the connection
                    close(sockfd);
                    events[i].data.fd = -1;
                    continue;
                }
                line[n] = '\0';
                szTemp = "";
                szTemp += line;
                szTemp = szTemp.substr(0, szTemp.find('\r')); /* remove the enter key */
                memset(line, 0, MAXLINE); /* clear the buffer */
                cout << "Readin: " << szTemp << endl;

                // the fd used for write operations
                ev.data.fd = sockfd;
                // register the write event
                ev.events = EPOLLOUT | EPOLLET;
                // change the event monitored on sockfd to EPOLLOUT, i.e. the fd is
                // writable and data can be sent; this modifies the red-black tree
                epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
            }
            else if (events[i].events & EPOLLOUT) { // there is data to send
                sockfd = events[i].data.fd;
                szTemp = "Server:" + szTemp + "\n";
                send(sockfd, szTemp.c_str(), szTemp.size(), 0);

                // the fd used for read operations
                ev.data.fd = sockfd;
                // register the read event again
                ev.events = EPOLLIN | EPOLLET;
                // change the event monitored on sockfd back to EPOLLIN, i.e. the fd is
                // readable and data can be received; this modifies the red-black tree
                epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
            }
        } // end: handle all events that occurred
    } // end: wait for epoll events
    close(epfd);
    return 0;
}
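To try the demo out (assuming the source file is saved as epoll_demo.cpp, a name chosen here purely for illustration), compile it with g++ epoll_demo.cpp -o epoll_demo, start it with ./epoll_demo 5000, and then connect from another terminal with nc 127.0.0.1 5000: every line you type should come back prefixed with "Server:".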
Summary:
1、What the red-black tree is for: when an event occurs, the matching epitem can be found quickly by fd (the epitems that are found are chained into a list and passed to user space for further processing), which is far faster than walking a linked list.
2、Where linked lists fit in the kernel: as queues or stacks, where every stored node is going to be processed (in other words, the list is always traversed) and there is no need to search for a particular node.
3、At the bottom, epoll events are ultimately triggered by interrupts: when the NIC receives data, it notifies the operating system via an interrupt to fetch the data, and that in turn fires the epoll event.
References:
1、https://www.bilibili.com/video/BV15z4y1m7Gt/?spm_id_from=333.788.recommend_more_video.-1 — epoll source code walkthrough
2、https://os.51cto.com/article/649405.html — how epoll works
3、https://www.bilibili.com/video/BV1Sq4y1q7Gv?zw https://zhuanlan.zhihu.com/p/445453676 — setting up a Linux kernel debugging environment
4、https://blog.csdn.net/Eunice_fan1207/article/details/99674021 — Linux kernel analysis: the epoll I/O-multiplexing kernel source
5、https://www.cnblogs.com/carekee/articles/2760693.html — epoll usage example