Analyzing the socket function
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { int retval; struct socket *sock; int flags; ...... if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; retval = sock_create(family, type, protocol, &sock); ...... retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); ...... return retval; }
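For orientation, here is a minimal user-space sketch (illustrative only, with error handling reduced to the bare minimum) of the call that enters this system call; family, type and protocol are exactly the three arguments of SYSCALL_DEFINE3 above, and SOCK_NONBLOCK could be OR-ed into the type to trigger the O_NONBLOCK handling at the top of the syscall:

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    /* family = AF_INET, type = SOCK_STREAM, protocol = 0
     * (0 lets the kernel pick IPPROTO_TCP for a stream socket) */
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) {
        perror("socket");
        return 1;
    }
    printf("got socket fd %d\n", fd);
    close(fd);
    return 0;
}

Back in the kernel, sock_create ends up in __sock_create: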
int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; ...... sock = sock_alloc(); ...... sock->type = type; ...... pf = rcu_dereference(net_families[family]); ...... err = pf->create(net, sock, protocol, kern); ...... *res = sock; return 0; }
Here a struct socket is allocated first. Next the family parameter comes into play: there is a net_families array, and the family parameter is used as an index into it to find the corresponding struct net_proto_family.
/* Supported address families. */ #define AF_UNSPEC 0 #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ ...... #define AF_INET6 10 /* IP version 6 */ ...... #define AF_MPLS 28 /* MPLS */ ...... #define AF_MAX 44 /* For now.. */ #define NPROTO AF_MAX struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
//net/ipv4/af_inet.c static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, /* used by the socket system call to create the socket */ ...... }
static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; /* Look for the requested type/protocol pair. */ lookup_protocol: list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } ...... sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; ...... sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ...... inet = inet_sk(sk); inet->nodefrag = 0; if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } inet->inet_id = 0; sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_all = 1; inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; if (inet->inet_num) { inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); } ...... }
In inet_create, we first see a loop, list_for_each_entry_rcu. This is where the second parameter, type, comes into play, because the loop walks inetsw[sock->type].
static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, } }
The socket is responsible for the interface exposed upward to user space and for the association with the file system, while the sock faces downward and hooks into the kernel network protocol stack.
In sk_alloc, the prot of the matched struct inet_protosw *answer entry (tcp_prot for a TCP socket) is assigned to the sk_prot member of struct sock *sk.
tcp_prot is defined as follows; it contains a large number of function pointers, all of them the protocol-stack actions that sit beneath the sock.
struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .get_port = inet_csk_get_port, ...... }
The bind function
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); if (err >= 0) { err = sock->ops->bind(sock, (struct sockaddr *) &address, addrlen); } fput_light(sock->file, fput_needed); } return err; }
1. sockfd_lookup_light finds the struct socket structure from the fd file descriptor.
2. The sockaddr is then copied from user space into kernel space, and the bind function in the ops of struct socket is called.
3. According to the setup done when the socket was created, that is the bind function of inet_stream_ops, i.e. inet_bind.
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); unsigned short snum; ...... snum = ntohs(addr->sin_port); ...... inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; /* Make sure we are allowed to bind here. */ if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) { ...... } inet->inet_sport = htons(inet->inet_num); inet->inet_daddr = 0; inet->inet_dport = 0; sk_dst_reset(sk); }
Inside bind, the get_port function of sk_prot, i.e. inet_csk_get_port, is called to check whether the port conflicts and whether binding is allowed.
If it is allowed, the local address inet_saddr and local port inet_sport of struct inet_sock are set, while the peer address inet_daddr and peer port inet_dport are both initialized to 0.
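As a user-space counterpart, here is a sketch with a hypothetical address and port (error handling trimmed); note how the port goes through htons before bind, which is why inet_bind reads it back with ntohs(addr->sin_port):

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);              /* network byte order */
    addr.sin_addr.s_addr = htonl(INADDR_ANY); /* becomes inet_saddr */

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
        perror("bind");
    close(fd);
    return 0;
}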
The listen function
SYSCALL_DEFINE2(listen, int, fd, int, backlog) { struct socket *sock; int err, fput_needed; int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; if ((unsigned int)backlog > somaxconn) backlog = somaxconn; err = sock->ops->listen(sock, backlog); fput_light(sock->file, fput_needed); } return err; }
1. In listen, we again use sockfd_lookup_light to find the struct socket structure from the fd file descriptor.
2. We then call the listen function in the ops of struct socket.
3. According to the setup done when the socket was created, that is the listen function of inet_stream_ops, i.e. inet_listen.
int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; old_state = sk->sk_state; /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { err = inet_csk_listen_start(sk, backlog); } sk->sk_max_ack_backlog = backlog; }
If this socket is not yet in the TCP_LISTEN state, inet_csk_listen_start is called to enter the listening state.
int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err = -EADDRINUSE; reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { ...... } ...... }
Here a new structure, inet_connection_sock, comes into play. It starts out as a struct inet_sock; inet_csk is really just a forced type cast that reinterprets the memory as the larger structure.
struct inet_connection_sock is fairly complex, full of queues for various states, assorted timeouts, congestion-control fields and so on. When we say TCP is connection-oriented, we mean that both the client and the server keep a structure that maintains the state of the connection, and this is that structure.
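The cast trick is ordinary C, not anything kernel-specific. A minimal sketch of the idiom (hypothetical structure names, not the kernel's): because the "base" structure is the first member of the "derived" one, a pointer to the base can be reinterpreted as a pointer to the derived structure, which is exactly what inet_csk(), inet_sk() and friends do.

#include <stdio.h>
#include <stdlib.h>

struct base_sock {
    int state;
};

struct conn_sock {              /* plays the role of inet_connection_sock */
    struct base_sock base;      /* must be the first member               */
    int accept_queue_len;
};

static inline struct conn_sock *to_conn_sock(struct base_sock *sk)
{
    return (struct conn_sock *)sk;   /* the same trick as inet_csk(sk) */
}

int main(void)
{
    struct conn_sock *cs = calloc(1, sizeof(*cs));
    struct base_sock *sk = &cs->base;     /* pass around the base pointer */

    to_conn_sock(sk)->accept_queue_len = 5;
    printf("%d\n", cs->accept_queue_len); /* prints 5 */
    free(cs);
    return 0;
}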
The accept function
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen) { return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags) { struct socket *sock, *newsock; struct file *newfile; int err, len, newfd, fput_needed; struct sockaddr_storage address; ...... sock = sockfd_lookup_light(fd, &err, &fput_needed); newsock = sock_alloc(); newsock->type = sock->type; newsock->ops = sock->ops; newfd = get_unused_fd_flags(flags); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); err = sock->ops->accept(sock, newsock, sock->file->f_flags, false); if (upeer_sockaddr) { if (newsock->ops->getname(newsock, (struct sockaddr *)&address, &len, 2) < 0) { } err = move_addr_to_user(&address, len, upeer_sockaddr, upeer_addrlen); } fd_install(newfd, newfile); ...... }
The implementation of accept confirms what was said about how sockets work: the original socket is the listening socket. Here we find that original struct socket and create a new newsock based on it; this is the connection socket. In addition, a new struct file and fd are created and associated with the new socket.
Calling sock->ops->accept of struct socket means calling the accept function of inet_stream_ops, i.e. inet_accept.
int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk1 = sock->sk; int err = -EINVAL; struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); sock_rps_record_flow(sk2); sock_graft(sk2, newsock); newsock->state = SS_CONNECTED; }
inet_accept calls sk1->sk_prot->accept of struct sock, i.e. the accept function of tcp_prot, which is inet_csk_accept.
/* * This will accept the next outstanding connection. */ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *req; struct sock *newsk; int error; if (sk->sk_state != TCP_LISTEN) goto out_err; /* Find already established connection */ if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); error = inet_csk_wait_for_connect(sk, timeo); } req = reqsk_queue_remove(queue, sk); newsk = req->sk; ...... } /* * Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. */ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) { struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; }
The implementation of inet_csk_accept confirms the two-queue logic described earlier. If icsk_accept_queue is empty, inet_csk_wait_for_connect is called to wait; while waiting, schedule_timeout yields the CPU and the process state is set to TASK_INTERRUPTIBLE.
When the process is woken up again, it checks whether icsk_accept_queue is still empty and also calls signal_pending to see whether a signal needs handling. Once icsk_accept_queue is non-empty, inet_csk_wait_for_connect returns, and a struct sock object is taken from the queue and assigned to newsk.
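For reference, the user-space side of this section looks like the following sketch (illustrative port, error handling mostly omitted); accept blocks inside inet_csk_wait_for_connect until the accept queue is non-empty:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    listen(fd, 128);                        /* backlog, clamped to somaxconn */

    for (;;) {
        int conn = accept(fd, NULL, NULL);  /* new connection socket and fd  */
        if (conn < 0)
            break;
        /* ... read/write on conn ... */
        close(conn);
    }
    close(fd);
    return 0;
}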
The connect function
Under what circumstances does icsk_accept_queue become non-empty? Only once the three-way handshake has completed. So let us analyze the three-way handshake next.
The three-way handshake is normally initiated by the client calling connect.
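On the client side, the corresponding user-space sketch is the following (hypothetical server address 127.0.0.1:8080, error handling trimmed); the connect call below is what enters the system call we are about to read:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in srv;

    memset(&srv, 0, sizeof(srv));
    srv.sin_family = AF_INET;
    srv.sin_port = htons(8080);
    inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

    if (connect(fd, (struct sockaddr *)&srv, sizeof(srv)) < 0) {
        perror("connect");          /* blocks until the handshake finishes */
        return 1;
    }
    write(fd, "hello\n", 6);        /* exercises the send path shown later */
    close(fd);
    return 0;
}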
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); err = move_addr_to_kernel(uservaddr, addrlen, &address); err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, sock->file->f_flags); }
The beginning of connect's implementation should look familiar: sockfd_lookup_light is again used to find the struct socket structure from the fd file descriptor.
We then call the connect function in the ops of struct socket; according to the setup done when the socket was created, that is the connect function of inet_stream_ops, i.e. inet_stream_connect.
/* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; int err; long timeo; switch (sock->state) { ...... case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; err = sk->sk_prot->connect(sk, uaddr, addr_len); sock->state = SS_CONNECTING; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ...... if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } sock->state = SS_CONNECTED; }
If the socket is in the SS_UNCONNECTED state, sk->sk_prot->connect of struct sock is called, i.e. the connect function of tcp_prot: tcp_v4_connect.
In tcp_v4_connect, ip_route_connect essentially performs a route selection. Why?
Because a SYN packet is about to be sent for the handshake, and that requires the full four-tuple: source address, source port, destination address, destination port.
The destination address and port are the server's and already known, and the source port is randomly assigned on the client side; but which source address should be used? A route has to be selected: whichever NIC the packet will leave through, that NIC's IP address is the one to fill in.
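You can watch the same source-address decision from user space with the ip tool; for example (illustrative output, reusing the eth0 address that appears in the ip addr listing later in this article):

$ ip route get 10.173.32.1
10.173.32.1 dev eth0 src 10.173.32.47 uid 1000
    cache

The src field is the address the kernel would stamp into packets sent toward that destination.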
Next, before sending the SYN, the client socket's state is set to TCP_SYN_SENT.
Then the TCP sequence number, write_seq, is initialized, and tcp_connect is called to do the sending.
/* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; ...... tcp_connect_init(sk); ...... buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); ...... tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); ...... tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { tp->snd_nxt = TCP_SKB_CB(buff)->seq; tp->pushed_seq = TCP_SKB_CB(buff)->seq; } ...... /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; }
In tcp_connect there is another new structure, struct tcp_sock. If you open it up you will find it is an extension of struct inet_connection_sock: struct inet_connection_sock sits at the start of struct tcp_sock and is accessed through a forced type cast, the same trick as before.
struct tcp_sock maintains still more TCP state; as before, we will analyze its fields as we encounter them.
Next, tcp_init_nondata_skb initializes a SYN packet, tcp_transmit_skb sends it out, and inet_csk_reset_xmit_timer arms a timer so that the SYN is retransmitted if no answer comes back.
TCP-layer processing
static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, .early_demux_handler = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .netns_ok = 1, .icmp_strict_tag_validation = 1, }
Reception goes through the handler of this struct net_protocol, and the function called is tcp_v4_rcv.
The call chain from there is tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_state_process.
tcp_rcv_state_process, as its name suggests, handles the state changes triggered by receiving a packet.
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; switch (sk->sk_state) { ...... case TCP_LISTEN: ...... if (th->syn) { acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; if (!acceptable) return 1; consume_skb(skb); return 0; } ...... }
At this point the server is in the TCP_LISTEN state and the incoming packet is a SYN, so the code above runs and calls icsk->icsk_af_ops->conn_request.
The operations attached to struct inet_connection_sock are of type inet_connection_sock_af_ops; according to the definition below (ipv4_specific), what actually gets called is tcp_v4_conn_request.
const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, };
tcp_v4_conn_request calls tcp_conn_request. That function is also quite long; inside, it calls send_synack, which in practice is tcp_v4_send_synack.
We will not follow the transmission details; from the comment we can tell that after receiving a SYN, a SYN-ACK is sent in reply, and once the reply is out the server is in TCP_SYN_RECV.
int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { ...... af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : TCP_SYNACK_COOKIE); ...... } /* * Send a SYN-ACK after having received a SYN. */ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type) {......}
It is the same TCP stack on the client side, so the process differs little from the server's: the packet again reaches tcp_rcv_state_process, except that the client is currently in the TCP_SYN_SENT state, so it enters the branch below.
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; switch (sk->sk_state) { ...... case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); tcp_data_snd_check(sk); return 0; } ...... }
tcp_rcv_synsent_state_process calls tcp_send_ack to send an ACK, the third packet of the handshake; once it is sent, the client is in the TCP_ESTABLISHED state.
Then it is the server's turn to receive a packet again, and once more tcp_rcv_state_process handles it. Since the server is currently in TCP_SYN_RECV, yet another branch is taken. Upon receiving this packet the server also enters TCP_ESTABLISHED, and the three-way handshake is complete.
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; ...... switch (sk->sk_state) { case TCP_SYN_RECV: if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); tp->copied_seq = tp->rcv_nxt; tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); if (sk->sk_socket) sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); break; ...... }
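Putting the pieces together, the handshake as traced through these functions runs as follows (states as named in the kernel):
- the client calls connect; tcp_v4_connect sends a SYN and the client moves to TCP_SYN_SENT;
- the server, in TCP_LISTEN, receives the SYN and replies with a SYN-ACK via tcp_v4_send_synack, moving to TCP_SYN_RECV;
- the client, in TCP_SYN_SENT, receives the SYN-ACK, replies with an ACK via tcp_send_ack, and moves to TCP_ESTABLISHED;
- the server, in TCP_SYN_RECV, receives that ACK and also moves to TCP_ESTABLISHED; the handshake is complete and the connection lands on icsk_accept_queue for accept to pick up.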
The socket write operation
To send network packets we can use the write system call on the socket file.
For a socket, its file_operations are defined as follows:
static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, };
Following the file-system write path, what gets called is sock_write_iter:
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; ...... res = sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; }
In sock_write_iter, the previously created socket structure is retrieved from the VFS struct file, and sock_sendmsg is called.
sock_sendmsg in turn calls sock_sendmsg_nosec.
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); ...... }
The tcp_sendmsg function
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; long timeo; ...... /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); while (msg_data_left(msg)) { int copy = 0; int max = size_goal; skb = tcp_write_queue_tail(sk); if (tcp_send_head(sk)) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; } if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; ...... first_skb = skb_queue_empty(&sk->sk_write_queue); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, first_skb); ...... skb_entail(sk, skb); copy = size_goal; max = size_goal; ...... } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); /* Where to copy to? */ if (skb_availroom(skb) > 0) { /* We have some space in skb head. Superb! */ copy = min_t(int, copy, skb_availroom(skb)); err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); ...... } else { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); ...... copy = min_t(int, copy, pfrag->size - pfrag->offset); ...... err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); ...... pfrag->offset += copy; } ...... tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; ...... } ...... }
msg is the data the user wants to write; it has to be copied into the kernel protocol stack in order to be sent.
Inside the kernel protocol stack, packet data is always managed by struct sk_buff, so the first task is to find free memory and copy the data the user wants to write into territory managed by a struct sk_buff.
The second task is to send that struct sk_buff.
while (msg_data_left(msg)):
- Step one: tcp_write_queue_tail takes the last struct sk_buff from the TCP write queue sk_write_queue, which is lined with sk_buffs waiting to be sent. Why the last one? Because only the last one may still have spare room, if the data supplied on the previous call was too little to fill it.
- Step two: tcp_send_mss computes the MSS, the Max Segment Size. What is that? Packets transmitted on the network have a size limit, and that limit starts at the lowest layer.
The MTU (Maximum Transmission Unit) is a layer-2 notion. Taking Ethernet as an example, the MTU is 1500 bytes; in front come 6 bytes of destination MAC address, 6 bytes of source MAC address and 2 bytes of EtherType, and at the end a 4-byte CRC, 1518 bytes in total.
At the IP layer, if an IP datagram carried over Ethernet is longer than the MTU, it has to be fragmented. At the TCP layer there is the MSS (Maximum Segment Size), equal to the MTU minus the IP header and minus the TCP header; in other words, the largest payload TCP can carry without causing fragmentation.
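A quick worked number (assuming no IP or TCP options): with the standard Ethernet MTU of 1500 bytes, MSS = 1500 - 20 (IPv4 header) - 20 (TCP header) = 1460 bytes, so each full segment carries at most 1460 bytes of payload.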
- Step three: if copy is less than or equal to zero, the last struct sk_buff has no room left, so sk_stream_alloc_skb is called to allocate a new struct sk_buff, and skb_entail puts the newly allocated sk_buff at the tail of the queue.
To reduce the cost of memory copies, some network devices support scatter/gather I/O: as the name suggests, the IP layer does not have to gather the data through memory copies; the scattered data stays where it is and is gathered at the device layer. In this mode the packet data is not kept in one contiguous region but in the discrete fragments referenced by struct skb_shared_info, whose member skb_frag_t frags[MAX_SKB_FRAGS] points to an array of pages, so contiguity is no longer guaranteed.
- Step four: after the comment /* Where to copy to? */ there is an if-else branch. The if branch uses skb_add_data_nocache to copy the data into the contiguous data area; the else branch uses skb_copy_to_page_nocache to copy the data into the page area referenced by struct skb_shared_info, which need not be contiguous.
- Step five: now the packets actually get sent. In the first case too many segments have accumulated, so __tcp_push_pending_frames is called to send them; in the second case this is the first packet and must go out immediately, so tcp_push_one is called. Either way, both __tcp_push_pending_frames and tcp_push_one end up calling tcp_write_xmit to send the packets.
The tcp_write_xmit function
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; ...... max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; ...... tso_segs = tcp_init_tso_segs(skb, mss_now); ...... cwnd_quota = tcp_cwnd_test(tp, skb); ...... if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } ...... limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, max_segs), nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; ...... if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } ...... }
TSO (TCP Segmentation Offload): if the packet being sent is very large, it has to be segmented as described above. Segmentation can be done by the protocol-stack code in the kernel, but that costs CPU; the alternative is to defer it to the NIC hardware, which requires a NIC that can segment large packets automatically and which lowers the CPU load.
tcp_init_tso_segs calls tcp_set_skb_tso_segs; then tcp_mss_split_point is called to compute the split limit. It computes max_len = mss_now * max_segs and derives the limit on the assumption that no splitting is done here, so in the check that follows tso_fragment is mostly not called, and the splitting is left to the NIC underneath.
The congestion window (cwnd): to avoid firing off packets blindly and flooding the network, a window is defined; only data inside the window may be sent, anything beyond it may not, and this throttles the sending rate.
How big is that window? It follows the well-known curve of congestion-window evolution:
At the beginning the window is only one MSS; this phase is called slow start. Growth is very fast at first, doubling each round. Once a threshold, ssthresh, is reached, growth becomes linear, which is called congestion avoidance. When does the network count as truly congested? When packet loss occurs. On loss, one method is to drop straight back to one MSS and repeat the doubling-then-linear process. If that feels too aggressive, a second method is to drop to half of the current cwnd and then grow linearly.
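As a toy illustration of that curve (plain user-space C, not kernel code; the loss round and the initial ssthresh are made up), counting the window in segments:

#include <stdio.h>

int main(void)
{
    unsigned int cwnd = 1, ssthresh = 16;    /* units: segments (MSS) */

    for (int round = 0; round < 20; round++) {
        int loss = (round == 12);            /* pretend a loss happens here */

        printf("round %2d  cwnd %3u  ssthresh %3u\n", round, cwnd, ssthresh);

        if (loss) {
            ssthresh = cwnd / 2;   /* remember half the window as threshold */
            cwnd = 1;              /* first reaction above: back to one MSS */
                                   /* the gentler second reaction would be:
                                    * cwnd = ssthresh;                      */
        } else if (cwnd < ssthresh) {
            cwnd *= 2;             /* slow start: double every round        */
        } else {
            cwnd += 1;             /* congestion avoidance: linear growth   */
        }
    }
    return 0;
}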
In the code, tcp_cwnd_test subtracts from the current snd_cwnd the packets already in flight inside the window; the remainder is cwnd_quota, i.e. how much can still be sent.
The receive window (rwnd), also known as the sliding window: if the congestion window exists for fear of flooding the network, reducing the sending rate when loss appears, then the sliding window exists for fear of overwhelming the receiver, and controls the sending rate for that reason.
The sliding window is really the receiver telling the sender how much it is able to take in; beyond that capacity it cannot cope.
Because of the sliding window, the sender's buffer is divided into four parts:
- Part one: sent and already acknowledged. These packets are completely done with, no longer needed, and the space can be reclaimed.
- Part two: sent but not yet acknowledged. The sender has to wait on these; if delivery fails they must be retransmitted, so they cannot be deleted.
- Part three: not yet sent but queued for sending. This corresponds to capacity the receiver still has free, so it can be sent right away and the receiver will cope.
- Part four: not sent and not to be sent for now. This already exceeds the receiver's capacity; sending it would be more than the receiver can take.
Because of the sliding window, the receiver's buffer is likewise divided into three parts:
- Part one: received and acknowledged. These have been fully received and can be handed to the application layer.
- Part two: not yet fully received but receivable very soon. Some of these packets have arrived but are not yet acknowledged, so they do not count as complete; others have not arrived at all. Together they are the largest amount the receiver can still accept.
- Part three: not received and not receivable. This is beyond the receiver's capacity.
During the packet exchange, the receiver sends the size of its part two to the sender as the AdvertisedWindow, and the sender adjusts its sending rate accordingly.
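A concrete set of made-up numbers: suppose the oldest unacknowledged byte is 9, the next byte to send is 13, and the receiver advertises a window of 8. Then bytes up to 8 are part one (sent and acknowledged), 9 to 12 are part two (in flight), 13 to 16 are part three (may be sent immediately), and 17 onward are part four (must wait until the window slides forward).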
The tcp_snd_wnd_test function checks the relation between the sk_buff's end_seq and tcp_wnd_end(tp), i.e. whether this sk_buff falls inside what the sliding window allows. If it does not, sending is being limited, and is_rwnd_limited is set to true.
Next, the tcp_mss_split_point function gets called:
static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; ...... return needed; }
1. Check whether segmentation is needed because the data exceeds the MSS.
2. Check the other condition, whether the data stays within what the sliding window allows; if the window permits less than the data at hand, segmentation is needed here too, i.e. tso_fragment has to be called.
At the end of each loop iteration, tcp_transmit_skb is called to actually send a packet.
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcphdr *th; int err; tp = tcp_sk(sk); skb->skb_mstamp = tp->tcp_mstamp; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); skb_push(skb, tcp_header_size); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); th->check = 0; th->urg_ptr = 0; ...... tcp_options_write((__be32 *)(th + 1), tp, &opts); th->window = htons(min(tp->rcv_wnd, 65535U)); ...... err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); ...... }
tcp_transmit_skb is fairly long and mainly does two things. The first is to fill in the TCP header; holding the code up against the TCP header format makes the fields easy to recognize.
There is the source port, set to inet_sport; the destination port, set to inet_dport; the sequence number, set to tcb->seq;
and the acknowledgment number, set to tp->rcv_nxt. All the flags are taken from tcb->tcp_flags, the options from opts, and the window size from tp->rcv_wnd.
Once everything is set, the queue_xmit method of icsk_af_ops is called; icsk_af_ops points to ipv4_specific, so the function invoked is ip_queue_xmit.
const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, }
The ip_queue_xmit function
Starting from ip_queue_xmit, we enter the IP layer's sending logic.
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; rt = skb_rtable(skb); /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); if (!rt) { __be32 daddr; /* Use correct destination address if we have options. */ daddr = inet->inet_daddr; ...... rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); packet_routed: /* OK, we know where to send it, allocate and build IP header. */ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); /* Transport layer set skb->h.foo itself. */ if (inet_opt && inet_opt->opt.optlen) { iph->ihl += inet_opt->opt.optlen >> 2; ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; res = ip_local_out(net, sk, skb); ...... }
1. Select a route
That is, decide which NIC this packet should leave through.
This is mainly done by the ip_route_output_ports function.
The call chain is: ip_route_output_ports -> ip_route_output_flow -> __ip_route_output_key -> ip_route_output_key_hash -> ip_route_output_key_hash_rcu.
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; ...... err = fib_lookup(net, fl4, res, 0); ...... make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); ...... }
ip_route_output_key_hash_rcu first calls fib_lookup.
FIB stands for Forwarding Information Base; it is simply what we usually call the routing table.
static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; ...... tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); ...... }
There can be several routing tables; normally there is a main table, RT_TABLE_MAIN, and fib_table_lookup searches within it.
Once a route is found, ip_route_output_key_hash_rcu calls __mkroute_output to create a struct rtable representing the routing entry that was found.
That structure is allocated by the rt_dst_alloc function.
struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool nopolicy, bool noxfrm, bool will_cache) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, (will_cache ? 0 : DST_HOST) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_gateway = 0; rt->rt_uses_gateway = 0; rt->rt_table_id = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; }
Finally a struct rtable instance is returned, and part one is complete.
2. Prepare the IP header and fill in its fields
The type of service is set from tos; the flag bits record in frag_off whether fragmentation is allowed. If it is not allowed and the packet runs into an MTU too small to pass, an ICMP error is sent back.
TTL is the packet's time to live: to keep a stray IP packet from living forever, every router it passes decrements the TTL by one, and when it reaches zero the packet "dies".
protocol refers to the protocol of the layer above, here TCP.
The source and destination addresses are set by ip_copy_addrs.
3. Call ip_local_out to send the IP packet
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); skb->protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); }
ip_local_out first calls __ip_local_out, which in turn calls nf_hook. What is that? nf stands for Netfilter, a Linux kernel mechanism for attaching hook functions at key points of packet sending and forwarding; these hooks can intercept packets and intervene in how they are handled.
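The NF_INET_LOCAL_OUT hook used here is where the familiar OUTPUT chain of iptables hangs for locally generated packets, so a rule such as the following (shown purely for illustration) would be evaluated at exactly this point in the send path:

iptables -A OUTPUT -p tcp --dport 80 -j DROP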
ip_local_out then calls dst_output, which is where the data is actually sent.
/* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return skb_dst(skb)->output(net, sk, skb); }
What gets called here is the output function of the dst member of struct rtable.
In rt_dst_alloc we saw that this output function points to ip_output.
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); }
In ip_output we again meet the familiar NF_HOOK. This time it is NF_INET_POST_ROUTING, i.e. the POSTROUTING chain; once that is processed, ip_finish_output is called.
The ip_finish_output function
Starting from ip_finish_output, the packet-sending logic moves from layer 3 down to layer 2. ip_finish_output eventually calls ip_finish_output2.
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; u32 nexthop; ...... nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); res = neigh_output(neigh, skb); return res; } ...... }
In ip_finish_output2, the next hop is first taken from the struct rtable routing entry. The next hop is necessarily on the same LAN as this machine and reachable over layer 2, so __ipv4_neigh_lookup_noref is used to look up how to reach it at layer 2.
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); }
__ipv4_neigh_lookup_noref looks up the next hop's MAC address in the local ARP table. The ARP table is defined as follows:
struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .id = "arp_cache", ...... .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, };
If no matching entry is found in the ARP table, __neigh_create is called to create one.
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; memcpy(n->primary_key, pkey, key_len); n->dev = dev; dev_hold(dev); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { ...... } ...... if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock)); n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } ...... rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(nht->hash_buckets[hash_val], n); ...... }
__neigh_create first calls neigh_alloc to create a struct neighbour, which maintains the MAC address and ARP-related information. The name is easy to understand: machines on the same LAN, reachable by MAC address, are of course neighbours.
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; ...... n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; ...... }
In neigh_alloc, a struct neighbour is allocated and initialized.
Two of its members matter most here:
One is arp_queue: work from the upper layers that needs ARP to obtain a MAC address is placed on this queue.
The other is the timer, set up so that after a while neigh_timer_handler is invoked to process those ARP tasks.
__neigh_create then calls the constructor of arp_tbl, i.e. arp_constructor, which is where the ARP operations arp_hh_ops are wired up.
static int arp_constructor(struct neighbour *neigh) { __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; ...... neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); ...... neigh->ops = &arp_hh_ops; ...... neigh->output = neigh->ops->output; ...... } static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, };
Finally, __neigh_create puts the newly created struct neighbour into a hash table. From the code it is easy to see this is a chained hash table, an array of linked lists: the hash value hash_val is computed to pick the bucket, the bucket's list is walked to look for a matching entry, and if none is found the new entry is linked into that bucket.
Back in ip_finish_output2, after __neigh_create, neigh_output is called to send the packet.
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) { ...... return n->output(n, skb); }
According to arp_hh_ops, the operations of struct neighbour defined above, output points to neigh_resolve_output.
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { if (!neigh_event_send(neigh, skb)) { ...... rc = dev_queue_xmit(skb); } ...... }
Inside neigh_resolve_output, neigh_event_send first triggers an event to see whether ARP can be activated.
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { int rc; bool immediate_probe = false; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/2); neigh_add_timer(neigh, next); immediate_probe = true; } ...... } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { ....... __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_Bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); ....... }
Activating ARP splits into two cases:
The first case is immediate activation, i.e. immediate_probe.
The other case is delayed activation, which merely sets a timer.
The packet is then placed on arp_queue. For immediate activation, neigh_probe is called directly; for delayed activation, neigh_timer_handler fires when the timer expires.
Inside that handler, what gets called is still neigh_probe.
static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); ...... if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); ...... }
solicit points to arp_solicit, where we can find the call to arp_send_dst, which creates and sends an ARP packet; the result is recorded in the struct dst_entry.
static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; ...... skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); ...... skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); }
Back in neigh_resolve_output, once the ARP work is done, dev_queue_xmit can be called to send the layer-2 packet.
/** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit * @accel_priv: private data used for L2 forwarding offload * * Queue a buffer for transmission to a network device. */ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; struct Qdisc *q; ...... txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } ...... }
There is another variable here of type struct Qdisc. What is that? If we run ip addr on a Linux machine, for every NIC we see output like the following:
# ip addr 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc pfifo_fast state UP group default qlen 1000 link/ether fa:16:3e:75:99:08 brd ff:ff:ff:ff:ff:ff inet 10.173.32.47/21 brd 10.173.39.255 scope global noprefixroute dynamic eth0 valid_lft 67104sec preferred_lft 67104sec inet6 fe80::f816:3eff:fe75:9908/64 scope link valid_lft forever preferred_lft forever
qdisc is short for queueing discipline: whenever the kernel needs to send packets through a network interface, it has to enqueue them according to the qdisc (queueing discipline) configured for that interface.
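The qdisc of an interface can be inspected or changed from user space with the tc tool; for example (commands shown for illustration), tc qdisc show dev eth0 lists the pfifo_fast qdisc seen in the output above, and tc qdisc replace dev eth0 root fq_codel would swap in a different queueing discipline.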
Next, __dev_xmit_skb starts the packet transmission.
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { ...... rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { ...... __qdisc_run(q); } ...... } void __qdisc_run(struct Qdisc *q) { int quota = dev_tx_weight; int packets; while (qdisc_restart(q, &packets)) { /* * Ordered by possible occurrence: Postpone processing if * 1. we've exceeded packet quota * 2. another process needs the CPU; */ quota -= packets; if (quota <= 0 || need_resched()) { __netif_schedule(q); break; } } qdisc_run_end(q); }
__dev_xmit_skb puts the request onto the queue and then calls __qdisc_run to process the queued data.
qdisc_restart does the actual sending. Another job of the qdisc is to control the packet transmission rate, so if the quota is exceeded the work has to be rescheduled, which means calling __netif_schedule.
static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); }
__netif_schedule calls __netif_reschedule, which raises the NET_TX_SOFTIRQ softirq.
The handler for NET_TX_SOFTIRQ is net_tx_action, which takes care of sending packets.
static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); ...... if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock; head = head->next_sched; ...... qdisc_run(q); } } }
net_tx_action again calls qdisc_run, which again calls __qdisc_run and then qdisc_restart to send packets.
static inline int qdisc_restart(struct Qdisc *q, int *packets) { struct netdev_queue *txq; struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) return 0; root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); }
qdisc_restart takes a packet off the Qdisc's queue and then calls sch_direct_xmit to send it.
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; if (likely(skb)) { if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); } ...... if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ ret = qdisc_qlen(q); } else { /* Driver returned NETDEV_TX_BUSY - requeue skb */ ret = dev_requeue_skb(skb, q); } ...... }
In sch_direct_xmit, dev_hard_start_xmit is called to transmit; if the transmission does not succeed, it returns NETDEV_TX_BUSY.
That means the NIC is busy, so dev_requeue_skb is called to put the packet back on the queue.
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; rc = xmit_one(skb, dev, txq, next != NULL); skb = next; if (netif_xmit_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } ...... }
dev_hard_start_xmit contains a while loop: each iteration takes one sk_buff and calls xmit_one to send it. The chain is xmit_one -> netdev_start_xmit -> __netdev_start_xmit.
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) { skb->xmit_more = more ? 1 : 0; return ops->ndo_start_xmit(skb, dev); }
At this point we have reached the device driver layer. In drivers/net/ethernet/intel/ixgb/ixgb_main.c we can see the operations defined for this NIC.
static const struct net_device_ops ixgb_netdev_ops = { .ndo_open = ixgb_open, .ndo_stop = ixgb_close, .ndo_start_xmit = ixgb_xmit_frame, .ndo_set_rx_mode = ixgb_set_multi, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ixgb_set_mac, .ndo_change_mtu = ixgb_change_mtu, .ndo_tx_timeout = ixgb_tx_timeout, .ndo_vlan_rx_add_vid = ixgb_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ixgb_vlan_rx_kill_vid, .ndo_fix_features = ixgb_fix_features, .ndo_set_features = ixgb_set_features, };
Here we can find the definition of ndo_start_xmit, which points to ixgb_xmit_frame.
static netdev_tx_t ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) { struct ixgb_adapter *adapter = netdev_priv(netdev); ...... if (count) { ixgb_tx_queue(adapter, count, vlan_id, tx_flags); /* Make sure there is space in the ring for the next send. */ ixgb_maybe_stop_tx(netdev, &adapter->tx_ring, DESC_NEEDED); } ...... return NETDEV_TX_OK; }
In ixgb_xmit_frame we obtain the adapter corresponding to this NIC and then place the frame onto the hardware NIC's queue.
Summary of the send path
- VFS layer: the write system call finds the struct file and, per its file_operations, calls sock_write_iter, which calls sock_sendmsg.
- Socket layer: struct socket is obtained from the private_data of struct file and, per its ops, inet_sendmsg is called.
- Sock layer: struct sock is obtained from the sk of struct socket and, per its sk_prot, tcp_sendmsg is called.
- TCP layer: tcp_sendmsg calls tcp_write_xmit, which calls tcp_transmit_skb; this is where TCP's connection-oriented logic is implemented.
- IP layer: struct sock is extended to struct inet_connection_sock and, per its icsk_af_ops, ip_queue_xmit is called.
- IP layer: ip_route_output_ports calls fib_lookup to search the routing table. FIB stands for Forwarding Information Base, i.e. the routing table. Another job of the IP layer is filling in the IP header.
- One more thing the IP layer has to do is pass through the iptables rules (the Netfilter hooks).
- MAC layer: the IP layer calls ip_finish_output to enter the MAC layer.
- The MAC layer needs ARP to obtain the MAC address, so ___neigh_lookup_noref is called to find the neighbour on the same subnet; when needed it calls neigh_probe to send the ARP request.
- With the MAC address in hand, dev_queue_xmit can be called to send the layer-2 packet; it calls __dev_xmit_skb, which puts the request onto the qdisc queue.
- Device layer: sending the packet triggers the NET_TX_SOFTIRQ softirq to process the queued data; the handler of this softirq is net_tx_action.
- In the softirq handler, packets are taken off the queue and the network device's transmit function ixgb_xmit_frame is called to place them onto the device's own queue.