linux下的網絡編程離不開socket,中文被翻譯為套接字。任何網絡通信都必須先建立socket,再通過socket給對方收發數據!數據接收的demo代碼如下:
/*
 * Minimal TCP server demo: create a socket, bind it to port SET_PORT on
 * all local interfaces, listen, and accept a single connection.
 *
 * Fixes vs. the original snippet:
 *  - htons(_INT_PORT) referenced an undefined macro; the intended constant
 *    is SET_PORT (3490).
 *  - <netinet/in.h> was missing (struct sockaddr_in, htons, INADDR_ANY),
 *    as were <stdio.h>/<stdlib.h>/<unistd.h> for perror/exit codes/close.
 *  - accept() requires a (struct sockaddr *) cast for the peer address and
 *    a socklen_t * (not int *) for the length argument.
 *  - every syscall result is now checked and the descriptors are closed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#define SET_PORT 3490

int main(void)
{
    int sockfd, new_fd;
    struct sockaddr_in my_addr;     /* local (listening) address */
    struct sockaddr_in their_addr;  /* peer address filled in by accept() */
    socklen_t sin_size;             /* accept() wants socklen_t, not int */

    sockfd = socket(PF_INET, SOCK_STREAM, 0);
    if (sockfd == -1) {
        perror("socket");
        return EXIT_FAILURE;
    }

    memset(&my_addr, 0, sizeof my_addr);   /* zeroes sin_zero as well */
    my_addr.sin_family = AF_INET;
    my_addr.sin_port = htons(SET_PORT);    /* BUG FIX: was htons(_INT_PORT) — undefined */
    my_addr.sin_addr.s_addr = INADDR_ANY;  /* bind to all local interfaces */

    /* bind the socket to the local address */
    if (bind(sockfd, (struct sockaddr *)&my_addr, sizeof my_addr) == -1) {
        perror("bind");
        close(sockfd);
        return EXIT_FAILURE;
    }

    /* start listening with a backlog of 10 pending connections */
    if (listen(sockfd, 10) == -1) {
        perror("listen");
        close(sockfd);
        return EXIT_FAILURE;
    }

    /* accept one incoming connection */
    sin_size = sizeof their_addr;
    new_fd = accept(sockfd, (struct sockaddr *)&their_addr, &sin_size);
    if (new_fd == -1) {
        perror("accept");
        close(sockfd);
        return EXIT_FAILURE;
    }

    close(new_fd);
    close(sockfd);
    return 0;
}
可以看出,需要先調用socket函數建立socket,再綁定套接字,最后監聽和接收數據。 這個socket到底是啥?linux在內核中又是怎么使用的呢?
1、(1)socket是個結構體,字段不多,但是嵌套了其他結構體,各種嵌套的關系標識如下:
- proto_ops:用戶層調用的各種接口就是在這里注冊的(篇幅有限,截圖的字段不全)
- wq:等待該socket的進程隊列和異步通知隊列;換句話說:同一個socket可能有多個進程都在等待使用!
- sock:應該是socket結構體最核心的嵌套結構體了(篇幅有限,截圖的字段不全)!
(2)socket結構體有了,接下來就是創建和初始化了!linux內核創建socket的函數是__sock_create,核心代碼如下:
/*
 * __sock_create - kernel-side socket creation (author's excerpt; the
 * "........." markers stand for code elided in the original article).
 * Two key steps are shown: sock_alloc() allocates the socket object, and
 * pf->create() hands off to the address family for protocol-specific setup.
 */
int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;
	.........
	/*
	 * Allocate the socket and allow the family to set things up. if
	 * the protocol is 0, the family is instructed to select an appropriate
	 * default.
	 * Essence: the socket structure is created embedded in an inode, so the
	 * superblock can index and manage all sockets uniformly.
	 */
	sock = sock_alloc();
	.........
	/*
	 * The family-specific part of the socket is created here. For AF_INET
	 * this ends up calling inet_create, registered in af_inet.c as:
	 *   static const struct net_proto_family inet_family_ops = {
	 *       .family = PF_INET,
	 *       .create = inet_create,
	 *       .owner  = THIS_MODULE,
	 *   };
	 */
	err = pf->create(net, sock, protocol, kern);
	..................
}
創建socket的核心函數就2個:sock_alloc,還有pf->create!先看第一個sock_alloc,代碼如下:
/**
 * sock_alloc - allocate a socket
 *
 * Allocate a new inode and socket object. The two are bound together
 * and initialised. The socket is then returned. If we are out of inodes
 * NULL is returned.
 *
 * Why allocate an inode when what we asked for is a socket?
 * 1. sockets need management too: stored behind an inode they can be
 *    indexed and managed uniformly through the superblock
 * 2. the socket's attribute fields naturally live in the inode
 * 3. it fits the "everything is a file" philosophy
 */
struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	/* allocate an inode from the (sockfs) superblock */
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
	if (!inode)
		return NULL;

	/* inode and socket are bound together: SOCKET_I maps the inode to its
	 * socket, so the socket can be addressed via the inode for management */
	sock = SOCKET_I(inode);

	kmemcheck_annotate_bitfield(sock, type); /* mark shadow memory: this region is now in use */
	inode->i_ino = get_next_ino();
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_op = &sockfs_inode_ops;

	this_cpu_add(sockets_in_use, 1);
	return sock;
}
本質上就是分配一個inode,然后和socket結構體綁定,通過inode尋址socket結構體!socket結構體有了,接下來就是在socket內部嵌套的sock結構體了!其生成和初始化的工作都是在inet_create內部完成的,代碼如下:
/*
 * inet_create - AF_INET family hook invoked from __sock_create():
 * looks up the type/protocol pair in inetsw[], allocates the struct sock
 * via sk_alloc(), initialises it with sock_init_data(), then runs the
 * protocol's own init hook.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	sock->state = SS_UNCONNECTED; /* initial state is of course "not connected" */

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d", PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(!answer_prot->slab);

	err = -ENOBUFS;
	/* allocate storage for the sock instance (CPU cache or heap) and initialise it */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
	if (!sk)
		goto out;

	err = 0;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	/* 1. cast to inet_sock for the remaining initialisation;
	 * 2. inet and sk are unchanged pointers to the same memory, so both
	 *    can be used interchangeably below */
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	/* 1. initialise the sk_buff read/write/error queues
	 * 2. link the socket and sock instances together
	 * 3. install the default sock callbacks
	 * 4. initialise the remaining sock fields */
	sock_init_data(sock, sk);

	sk->sk_destruct = inet_sock_destruct;           /* destructor callback */
	sk->sk_protocol = protocol;                     /* protocol type */
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	/* sk and inet are used alternately for initialisation */
	inet->uc_ttl = -1;
	inet->mc_loop = 1;
	inet->mc_ttl = 1;
	inet->mc_all = 1;
	inet->mc_index = 0;
	inet->mc_list = NULL;
	inet->rcv_tos = 0;

	sk_refcnt_debug_inc(sk); /* reference count + 1 */

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		err = sk->sk_prot->hash(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
整個邏輯並不復雜,先是調用sk_alloc函數生成sock實例,再調用sock_init_data初始化sock實例,並和socket實例關聯,所以我個人認為sock_init_data是最核心的函數,如下:
/*
 * sock_init_data:
 * 1. initialise the sk_buff read/write/error queues
 * 2. link the socket and sock instances together
 * 3. install the default sock callbacks
 * 4. initialise the remaining sock fields
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	/* initialise the sk_buff read/write/error queues */
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head = NULL;

	/* initialise the timer */
	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;

	/* here, at last, the socket and sock instances are linked together */
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;       /* callback: state changed */
	sk->sk_data_ready = sock_def_readable;       /* callback: data available to read */
	sk->sk_write_space = sock_def_write_space;   /* callback: write buffer available */
	sk->sk_error_report = sock_def_error_report; /* callback: I/O error occurred */
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
上面有幾個回調函數,其實實現的邏輯的代碼結構基本是一樣的:
/*
 * Default Socket Callbacks. Called when the sock's state changes.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq)) /* some process is blocked on this socket */
		/* wake all processes waiting on this socket; the core work is
		 * running each waiter's wakeup callback */
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

/* called when the sock has input data ready to be read */
static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		/* wake the processes waiting for data; again the core work is
		 * running the wakeup callbacks */
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND);

	/* Handle the async-notification queue.
	 * Checks whether the application is waiting in a recv()-style call;
	 * if not, a SIGIO signal is sent to announce that data is readable.
	 * The second argument is the handling mode, the third the I/O type
	 * being announced. */
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
當有可讀數據的時候,肯定第一時間通知相應的進程來讀取數據,核心是通過sk_wake_async函數實現的;而sk_wake_async最終調用了kill_fasync_rcu來給排隊等待的隊列發出SIGIO信號,通知這些隊列中的進程來取數據了!異步的好處在這里就凸顯了:進程不用在這里空轉等數據,而是可以釋放cpu去執行其他進程的代碼;等socket有數據后再通過類似中斷的形式通知等待的進程來取數據了!
/*
 * rcu_read_lock() is held.
 * Despite the "kill" in the name, this actually walks the fasync list and
 * sends the SIGIO-style signal to each registered owner process.
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
	while (fa) {
		struct fown_struct *fown;
		unsigned long flags;

		if (fa->magic != FASYNC_MAGIC) {
			printk(KERN_ERR "kill_fasync: bad magic number in "
			       "fasync_struct!\n");
			return;
		}
		spin_lock_irqsave(&fa->fa_lock, flags);
		if (fa->fa_file) {
			fown = &fa->fa_file->f_owner;
			/* Don't send SIGURG to processes which have not set a
			   queued signum: SIGURG has its own default signalling
			   mechanism. */
			if (!(sig == SIGURG && fown->signum == 0))
				send_sigio(fown, fa->fa_fd, band);
		}
		spin_unlock_irqrestore(&fa->fa_lock, flags);
		fa = rcu_dereference(fa->fa_next);
	}
}