套接字之相關系統調用的調用流程


 最近一直在讀內核網絡協議棧源碼,這裡以ipv4/tcp為例對socket相關系統調用的流程做一個簡要整理,這些相關系統調用的內部細節雖然各有不同,但其調用流程則基本一致;

調用流程:

(1)系統調用 –> (2)查找socket –> (3)執行socket的對應操作函數  –> (4)執行傳輸層協議的對應操作函數;

中間核心數據結構為inetsw_array[],位於af_inet.c,以第一個元素type=SOCK_STREAM,protocol=IPPROTO_TCP為例,該類型適用於tcp協議,當創建tcp socket時,其操作socket->ops賦值為&inet_stream_ops,對應的傳輸控制塊操作sock->sk_prot賦值為&tcp_prot;

 1 /* Upon startup we insert all the elements in inetsw_array[] into
 2  * the linked list inetsw.
 3  */
 4 static struct inet_protosw inetsw_array[] =
 5 {
 6     {    /* SOCK_STREAM / IPPROTO_TCP entry */
 7         .type =       SOCK_STREAM,
 8         .protocol =   IPPROTO_TCP,
 9         .prot =       &tcp_prot,        /* transport-layer operations; assigned to sock->sk_prot when a tcp socket is created */
10         .ops =        &inet_stream_ops, /* socket-layer operations; assigned to socket->ops when a tcp socket is created */
11         .flags =      INET_PROTOSW_PERMANENT |
12                   INET_PROTOSW_ICSK,
13     },
14 
15     {    /* SOCK_DGRAM / IPPROTO_UDP entry */
16         .type =       SOCK_DGRAM,
17         .protocol =   IPPROTO_UDP,
18         .prot =       &udp_prot,
19         .ops =        &inet_dgram_ops,
20         .flags =      INET_PROTOSW_PERMANENT,
21        },
22 
23        {    /* SOCK_DGRAM / IPPROTO_ICMP entry; uses ping_prot with the raw socket-layer ops */
24         .type =       SOCK_DGRAM,
25         .protocol =   IPPROTO_ICMP,
26         .prot =       &ping_prot,
27         .ops =        &inet_sockraw_ops,
28         .flags =      INET_PROTOSW_REUSE,
29        },
30 
31        {    /* SOCK_RAW entry; IPPROTO_IP acts as a wild card here */
32            .type =       SOCK_RAW,
33            .protocol =   IPPROTO_IP,    /* wild card */
34            .prot =       &raw_prot,
35            .ops =        &inet_sockraw_ops,
36            .flags =      INET_PROTOSW_REUSE,
37        }
38 };

 

查看inet_stream_ops結構會發現,其中包含了各種socket系統調用的對應的處理函數;

 1 const struct proto_ops inet_stream_ops = {    /* socket-layer operation table for PF_INET stream sockets; one handler per socket operation */
 2     .family           = PF_INET,
 3     .owner           = THIS_MODULE,
 4     .release       = inet_release,
 5     .bind           = inet_bind,    /* reached from the bind system call through sock->ops->bind */
 6     .connect       = inet_stream_connect,
 7     .socketpair       = sock_no_socketpair,
 8     .accept           = inet_accept,
 9     .getname       = inet_getname,
10     .poll           = tcp_poll,
11     .ioctl           = inet_ioctl,
12     .listen           = inet_listen,
13     .shutdown       = inet_shutdown,
14     .setsockopt       = sock_common_setsockopt,
15     .getsockopt       = sock_common_getsockopt,
16     .sendmsg       = inet_sendmsg,
17     .recvmsg       = inet_recvmsg,
18     .mmap           = sock_no_mmap,
19     .sendpage       = inet_sendpage,
20     .splice_read       = tcp_splice_read,
21     .read_sock       = tcp_read_sock,
22     .peek_len       = tcp_peek_len,
23 #ifdef CONFIG_COMPAT    /* compat handlers are only present when CONFIG_COMPAT is defined */
24     .compat_setsockopt = compat_sock_common_setsockopt,
25     .compat_getsockopt = compat_sock_common_getsockopt,
26     .compat_ioctl       = inet_compat_ioctl,
27 #endif
28 };

 

具體實例,以tcp bind系統調用為例:

 1 SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
 2 {
 3     struct socket *sock;
 4     struct sockaddr_storage address;
 5     int err, fput_needed;
 6 
 7     /* Look up the socket for fd; fput_needed records whether the file reference count must be dropped afterwards */
 8     sock = sockfd_lookup_light(fd, &err, &fput_needed);
 9     if (sock) {
10         /* Copy the address from user space into kernel space */
11         err = move_addr_to_kernel(umyaddr, addrlen, &address);
12         if (err >= 0) {
13             /* Security-module check for bind */
14             err = security_socket_bind(sock,
15                            (struct sockaddr *)&address,
16                            addrlen);
17             if (!err)
18                 /* Invoke the socket's own bind operation */
19                 err = sock->ops->bind(sock,
20                               (struct sockaddr *)
21                               &address, addrlen);
22         }
23 
24         /* Drop the file reference if fput_needed says so */
25         fput_light(sock->file, fput_needed);
26     }
27     return err;    /* when the lookup failed, err is whatever sockfd_lookup_light stored in it */
28 }

 

上面的sock->ops->bind操作實際是調用了inet_stream_ops.bind,即inet_bind:

 1 /* Bind an address to the socket */
 2 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 3 {
 4     /*  unrelated code omitted */
 5     /* If the socket has its own bind function then use it. (RAW) */
 6     /* 
 7         If the transport control block provides its own bind
 8         operation, call it; currently only raw implements one.
 9     */
10     if (sk->sk_prot->bind) {
11         err = sk->sk_prot->bind(sk, uaddr, addr_len);
12         goto out;
13     }
14     
15     /* unrelated code omitted */
16 
17     /* 
18         If a port was given, or bind_address_no_port is not set,
19         bind the port through the protocol's own get_port function.
20     */
21     if ((snum || !inet->bind_address_no_port) &&
22         sk->sk_prot->get_port(sk, snum)) {
23 
24         /* Binding failed: reset the source addresses */
25         inet->inet_saddr = inet->inet_rcv_saddr = 0;
26 
27         /* The port is in use */
28         err = -EADDRINUSE;
29         goto out_release_sock;
30     }
31 
32    /* unrelated code omitted */
33 out_release_sock:
34     release_sock(sk);
35 out:
36     return err;
37 }

 

上面的sk->sk_prot->bind以及sk->sk_prot->get_port為具體傳輸層實現的對應操作函數,其中只有raw socket實現了bind操作,我們不關注,而以tcp的get_port操作為例,實際上也就是調用了tcp_prot.get_port,具體tcp實現為inet_csk_get_port;(該函數尚未分析,後續補充)

 1 /* Obtain a reference to a local port for the given sock,
 2  * if snum is zero it means select any available local port.
 3  * We try to allocate an odd port (and leave even ports for connect())
 4  */
 5 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 6 {
 7     bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 8     struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 9     int ret = 1, port = snum;    /* ret stays 1 (failure) until the port has been bound */
10     struct inet_bind_hashbucket *head;
11     struct net *net = sock_net(sk);
12     struct inet_bind_bucket *tb = NULL;
13     kuid_t uid = sock_i_uid(sk);
14 
15     if (!port) {    /* no port requested: let inet_csk_find_open_port pick one */
16         head = inet_csk_find_open_port(sk, &tb, &port);    /* presumably returns with head->lock held, since the exit path unlocks it - confirm */
17         if (!head)
18             return ret;
19         if (!tb)
20             goto tb_not_found;
21         goto success;
22     }
23     head = &hinfo->bhash[inet_bhashfn(net, port,
24                       hinfo->bhash_size)];
25     spin_lock_bh(&head->lock);
26     inet_bind_bucket_for_each(tb, &head->chain)    /* look for an existing bucket for this port in this net namespace */
27         if (net_eq(ib_net(tb), net) && tb->port == port)
28             goto tb_found;
29 tb_not_found:
30     tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
31                      net, head, port);
32     if (!tb)
33         goto fail_unlock;
34 tb_found:
35     if (!hlist_empty(&tb->owners)) {    /* port already has owners: decide whether it may be shared */
36         if (sk->sk_reuse == SK_FORCE_REUSE)
37             goto success;
38 
39         if ((tb->fastreuse > 0 && reuse) ||
40             sk_reuseport_match(tb, sk))
41             goto success;
42         if (inet_csk_bind_conflict(sk, tb, true, true))
43             goto fail_unlock;
44     }
45 success:
46     if (hlist_empty(&tb->owners)) {    /* bucket has no owners yet: seed its reuse state from this socket (the test must be "empty"; inverting it swaps the two branches) */
47         tb->fastreuse = reuse;
48         if (sk->sk_reuseport) {
49             tb->fastreuseport = FASTREUSEPORT_ANY;
50             tb->fastuid = uid;
51             tb->fast_rcv_saddr = sk->sk_rcv_saddr;
52             tb->fast_ipv6_only = ipv6_only_sock(sk);
53 #if IS_ENABLED(CONFIG_IPV6)
54             tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
55 #endif
56         } else {
57             tb->fastreuseport = 0;
58         }
59     } else {    /* bucket already has owners */
60         if (!reuse)
61             tb->fastreuse = 0;
62         if (sk->sk_reuseport) {
63             /* We didn't match or we don't have fastreuseport set on
64              * the tb, but we have sk_reuseport set on this socket
65              * and we know that there are no bind conflicts with
66              * this socket in this tb, so reset our tb's reuseport
67              * settings so that any subsequent sockets that match
68              * our current socket will be put on the fast path.
69              *
70              * If we reset we need to set FASTREUSEPORT_STRICT so we
71              * do extra checking for all subsequent sk_reuseport
72              * socks.
73              */
74             if (!sk_reuseport_match(tb, sk)) {
75                 tb->fastreuseport = FASTREUSEPORT_STRICT;
76                 tb->fastuid = uid;
77                 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
78                 tb->fast_ipv6_only = ipv6_only_sock(sk);
79 #if IS_ENABLED(CONFIG_IPV6)
80                 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
81 #endif
82             }
83         } else {
84             tb->fastreuseport = 0;
85         }
86     }
87     if (!inet_csk(sk)->icsk_bind_hash)    /* attach the socket to the bucket unless it is already bound to one */
88         inet_bind_hash(sk, tb, port);
89     WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
90     ret = 0;
91 
92 fail_unlock:
93     spin_unlock_bh(&head->lock);
94     return ret;
95 }

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM