1. Background
TCP provides reliable communication through a mechanism called Positive Acknowledgement with Retransmission (PAR). The transport-layer protocol data unit (PDU) is called a segment. A device using PAR resends a data unit until it receives an acknowledgement. If the receiver finds an incoming data unit damaged (verified with the transport layer's checksum, which serves for error detection), it discards that segment, so the sender must retransmit any data unit for which no acknowledgement arrives. On top of this mechanism, the sender (client) and receiver (server) exchange three segments to establish a reliable TCP connection. The exchange works as follows:
- Step 1 (SYN): the client wants to establish a connection with the server, so it sends a segment with the SYN (synchronize sequence numbers) flag set, which tells the server that the client wants to start communicating and with which sequence number its segments will begin.
- Step 2 (SYN + ACK): the server responds to the client's request with both the SYN and ACK bits set. ACK (acknowledgement) acknowledges the segment that was received, and SYN indicates with which sequence number the server's own segments will begin.
- Step 3 (ACK): in the last step the client acknowledges the server's response; with a reliable connection established on both sides, actual data transfer can begin.
Steps 1 and 2 establish and acknowledge the connection parameter (the sequence number) for one direction; steps 2 and 3 establish and acknowledge it for the other direction. Together they set up full-duplex communication.
Note: the initial sequence numbers are chosen randomly when the connection between client and server is established.
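To make the three steps concrete, here is a minimal user-space sketch (my own illustration, not part of the kernel analysis below; the loopback port 8888 is an arbitrary choice). The client's connect() drives step 1, the kernel behind the listening socket answers with step 2, the client's kernel completes step 3, and only then does accept() return.

/* minimal sketch: a loopback client/server pair whose system calls
 * drive the kernel paths analyzed below */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8888);                 /* arbitrary example port */
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    int lfd = socket(AF_INET, SOCK_STREAM, 0);   /* __sys_socket */
    if (lfd < 0 || bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(lfd, 128) < 0) {                  /* __sys_listen */
        perror("server setup");
        return 1;
    }

    if (fork() == 0) {                           /* child acts as the client */
        int cfd = socket(AF_INET, SOCK_STREAM, 0);
        /* connect() blocks until the three-way handshake completes */
        if (connect(cfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
            perror("connect");
            exit(1);
        }
        puts("client: handshake done");
        close(cfd);
        exit(0);
    }

    /* accept() dequeues the request from the full-connection queue */
    int fd = accept(lfd, NULL, NULL);            /* __sys_accept4 */
    if (fd >= 0)
        puts("server: connection established");
    wait(NULL);
    close(fd);
    close(lfd);
    return 0;
}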
2. Experiment
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[AUDITSC_ARGS];
    unsigned long a0, a1;
    int err;
    unsigned int len;

    if (call < 1 || call > SYS_SENDMMSG)
        return -EINVAL;
    call = array_index_nospec(call, SYS_SENDMMSG + 1);

    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
    if (err)
        return err;

    a0 = a[0];
    a1 = a[1];

    switch (call) {
    case SYS_SOCKET:        /* call = 1 */
        err = __sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:          /* call = 2 */
        err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:       /* call = 3 */
        err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:        /* call = 4 */
        err = __sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:        /* call = 5 */
        err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                            (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:   /* call = 6 */
        err = __sys_getsockname(a0, (struct sockaddr __user *)a1,
                                (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:   /* call = 7 */
        err = __sys_getpeername(a0, (struct sockaddr __user *)a1,
                                (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:    /* call = 8 */
        err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:          /* call = 9 */
        err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], NULL, 0);
        break;
    case SYS_SENDTO:        /* call = 11 */
        err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                           (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:          /* call = 10 */
        err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                             NULL, NULL);
        break;
    case SYS_RECVFROM:      /* call = 12 */
        err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                             (struct sockaddr __user *)a[4],
                             (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:      /* call = 13 */
        err = __sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:    /* call = 14 */
        err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
        break;
    case SYS_GETSOCKOPT:    /* call = 15 */
        err = __sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                               (int __user *)a[4]);
        break;
    case SYS_SENDMSG:       /* call = 16 */
        err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1,
                            a[2], true);
        break;
    case SYS_SENDMMSG:      /* call = 20 */
        err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2],
                             a[3], true);
        break;
    case SYS_RECVMSG:       /* call = 17 */
        err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1,
                            a[2], true);
        break;
    case SYS_RECVMMSG:      /* call = 19 */
        if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
            err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                 a[2], a[3],
                                 (struct __kernel_timespec __user *)a[4],
                                 NULL);
        else
            err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                 a[2], a[3], NULL,
                                 (struct old_timespec32 __user *)a[4]);
        break;
    case SYS_ACCEPT4:       /* call = 18 */
        err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                            (int __user *)a[2], a[3]);
        break;
    default:
        err = -EINVAL;
        break;
    }
    return err;
}
In the previous lab we found that sys_socketcall dispatches to a handler based on the value of call. With the gdb information it is then easy to locate the four functions __sys_socket, __sys_connect, __sys_listen and __sys_accept4, so we open gdb, attach, and set breakpoints on them, as shown in the figure.
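As a side note, the dispatch can also be observed from user space on architectures that still multiplex through socketcall (32-bit x86, for instance; on x86-64 each socket operation has its own syscall). A hedged sketch, passing call = SYS_SOCKET so the switch above takes the __sys_socket branch:

/* sketch: invoking the socketcall multiplexer directly; only compiled
 * on architectures that define __NR_socketcall */
#include <linux/net.h>      /* SYS_SOCKET and the other call numbers */
#include <stdio.h>
#include <sys/socket.h>     /* AF_INET, SOCK_STREAM */
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
#ifdef __NR_socketcall
    /* copied into a[] by copy_from_user() in the kernel code above */
    unsigned long args[3] = { AF_INET, SOCK_STREAM, 0 };
    long fd = syscall(__NR_socketcall, SYS_SOCKET, args);
    printf("socketcall(SYS_SOCKET) -> %ld\n", fd);
#endif
    return 0;
}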
In other words, TCP communication is carried out through these calls, so next we will read their source code one by one to see how the TCP three-way handshake is actually implemented.
(1) First, __sys_socket. The code is as follows:
int __sys_socket(int family, int type, int protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency.  */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        return retval;

    return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}
This function mainly calls sock_create and sock_map_fd. Their source code is as follows:
int sock_create(int family, int type, int protocol, struct socket **res)
{
    return __sock_create(current->nsproxy->net_ns, family, type, protocol,
                         res, 0);
}
It simply calls __sock_create, whose source code is:
int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     * Check protocol is in range
     */
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    if (type < 0 || type >= SOCK_MAX)
        return -EINVAL;

    /* Compatibility. This uglymoron is moved from INET layer to here to
       avoid deadlock in module load. */
    if (family == PF_INET && type == SOCK_PACKET) {
        pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                     current->comm);
        family = PF_PACKET;
    }

    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;

    /*
     * Allocate the socket and allow the family to set things up. if
     * the protocol is 0, the family is instructed to select an appropriate
     * default.
     */
    sock = sock_alloc();
    if (!sock) {
        net_warn_ratelimited("socket: no more sockets\n");
        return -ENFILE; /* Not exactly a match, but its the
                           closest posix thing */
    }

    sock->type = type;

#ifdef CONFIG_MODULES
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */
    if (rcu_access_pointer(net_families[family]) == NULL)
        request_module("net-pf-%d", family);
#endif

    rcu_read_lock();
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    if (!pf)
        goto out_release;

    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */
    if (!try_module_get(pf->owner))
        goto out_release;

    /* Now protected by module ref count */
    rcu_read_unlock();

    err = pf->create(net, sock, protocol, kern);
    if (err < 0)
        goto out_module_put;

    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;

    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */
    module_put(pf->owner);
    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;
    *res = sock;

    return 0;

out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;

out_release:
    rcu_read_unlock();
    goto out_sock_release;
}
struct socket *sock_alloc(void)
{
    struct inode *inode;
    struct socket *sock;

    inode = new_inode_pseudo(sock_mnt->mnt_sb);
    if (!inode)
        return NULL;

    sock = SOCKET_I(inode);

    inode->i_ino = get_next_ino();
    inode->i_mode = S_IFSOCK | S_IRWXUGO;
    inode->i_uid = current_fsuid();
    inode->i_gid = current_fsgid();
    inode->i_op = &sockfs_inode_ops;

    return sock;
}
EXPORT_SYMBOL(sock_alloc);
As we can see, sock_alloc is called to allocate the socket and set up its associated state.
sock_map_fd() mainly initializes the socket's *file pointer. After sock_map_fd() runs, the socket is associated through this *file pointer with a file managed by the VFS, so the usual file operations such as read, write, lseek and ioctl can be performed on it.
static int sock_map_fd(struct socket *sock, int flags)
{
    struct file *newfile;
    int fd = get_unused_fd_flags(flags);
    if (unlikely(fd < 0)) {
        sock_release(sock);
        return fd;
    }

    newfile = sock_alloc_file(sock, flags, NULL);
    if (likely(!IS_ERR(newfile))) {
        fd_install(fd, newfile);
        return fd;
    }

    put_unused_fd(fd);
    return PTR_ERR(newfile);
}
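For reference, here is how the flag handling in __sys_socket looks from user space; a small sketch of my own, not from the report's kernel listings:

/* sketch: the type argument reaching __sys_socket can carry
 * SOCK_NONBLOCK and SOCK_CLOEXEC OR-ed into it, which is exactly what
 * "flags = type & ~SOCK_TYPE_MASK" above separates out */
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
    /* the kernel splits this into type = SOCK_STREAM,
     * flags = SOCK_NONBLOCK | SOCK_CLOEXEC */
    int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
    if (fd < 0) {
        perror("socket");
        return 1;
    }
    printf("fd = %d\n", fd);    /* the fd returned by sock_map_fd() */
    return 0;
}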
(2) Next, __sys_connect. The source code is:
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;

    /* look up the socket from the file descriptor */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        goto out;

    /* copy the address into kernel space */
    err = move_addr_to_kernel(uservaddr, addrlen, &address);
    if (err < 0)
        goto out_put;

    err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
    if (err)
        goto out_put;

    /* for stream sockets, sock->ops is inet_stream_ops --> inet_stream_connect
     * for datagram sockets, sock->ops is inet_dgram_ops --> inet_dgram_connect */
    err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
                             sock->file->f_flags);
out_put:
    fput_light(sock->file, fput_needed);
out:
    return err;
}
Since we are using TCP, a stream socket is certainly what we have, so we continue with inet_stream_connect. Setting a breakpoint in gdb leads us to its definition:
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags)
{
    int err;

    lock_sock(sock->sk);
    err = __inet_stream_connect(sock, uaddr, addr_len, flags);
    release_sock(sock->sk);
    return err;
}
The socket lock guarantees that the operation is atomic. The __inet_stream_connect function it calls looks like this:
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
long timeo;
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
}
//check the socket state
switch (sock->state) {
default:
err = -EINVAL;
goto out;
case SS_CONNECTED:
err = -EISCONN;
goto out;
case SS_CONNECTING:
err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED: //not connected yet, so initiating a connection takes this path
err = -EISCONN;
if (sk->sk_state != TCP_CLOSE)
goto out;
//the main handler; for TCP this ultimately calls tcp_v4_connect()
err = sk->sk_prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out;
sock->state = SS_CONNECTING;
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
*/
//for a non-blocking call, this is the error code that is eventually returned
err = -EINPROGRESS;
break;
}
//get the connect timeout; for a non-blocking connect it is 0
//the timeout can be set with the SO_SNDTIMEO socket option
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
tcp_sk(sk)->fastopen_req &&
tcp_sk(sk)->fastopen_req->data ? 1 : 0;
/* Error code is set above */
//when non-blocking, timeo is 0 and we return right away; otherwise arm a
//timer, schedule away and wait in inet_wait_for_connect() until timeout
if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
}
...
}
This function inspects the socket state and acts accordingly. sk->sk_prot points to tcp_prot, so sk->sk_prot->connect ultimately resolves to tcp_v4_connect().
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    __be16 orig_sport, orig_dport;
    __be32 daddr, nexthop;
    struct flowi4 *fl4;
    struct rtable *rt;
    int err;
    struct ip_options_rcu *inet_opt;
    ...
    nexthop = daddr = usin->sin_addr.s_addr;    /* set next hop and destination address */
    inet_opt = rcu_dereference_protected(inet->inet_opt,
                                         sock_owned_by_user(sk));
    if (inet_opt && inet_opt->opt.srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet_opt->opt.faddr;
    }

    orig_sport = inet->inet_sport;  /* source port */
    orig_dport = usin->sin_port;    /* destination port */
    fl4 = &inet->cork.fl.u.ip4;
    /* look up a route for the current addresses and create a route cache entry */
    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                          RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                          IPPROTO_TCP, orig_sport, orig_dport, sk);
    ...
    if (!inet->inet_saddr)
        /* the socket is not bound to an IP address: use the one the
         * route lookup returned */
        inet->inet_saddr = fl4->saddr;
    /* inet_rcv_saddr is the locally bound IP address, i.e. the source address */
    inet->inet_rcv_saddr = inet->inet_saddr;

    if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        if (likely(!tp->repair))
            tp->write_seq = 0;
    }

    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
        tcp_fetch_timewait_stamp(sk, &rt->dst);

    inet->inet_dport = usin->sin_port;  /* destination port */
    inet->inet_daddr = daddr;           /* destination address */

    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

    tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

    tcp_set_state(sk, TCP_SYN_SENT);    /* the socket enters SYN-SENT */
    /* bind the IP and port, and add the sock to the hash table */
    err = inet_hash_connect(&tcp_death_row, sk);
    if (err)
        goto failure;

    sk_set_txhash(sk);

    /* redo the route lookup with the new port: if the client never called
     * bind(), inet_hash_connect() above picked an ephemeral port, so the
     * source port may have changed */
    rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                           inet->inet_sport, inet->inet_dport, sk);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        rt = NULL;
        goto failure;
    }
    /* OK, now commit destination to socket.  */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->dst);

    if (!tp->write_seq && likely(!tp->repair))
        /* generate the initial sequence number */
        tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                   inet->inet_daddr,
                                                   inet->inet_sport,
                                                   usin->sin_port);

    inet->inet_id = tp->write_seq ^ jiffies;

    /* hand over from the socket layer to the TCP layer: build the SYN
     * segment and send it */
    err = tcp_connect(sk);
    ...
}
The client has initiated the connection on its own; the first handshake step is complete.
While analyzing the connect() system call we already saw the SYN segment being sent, so now the server has to respond. When the SYN segment reaches the TCP layer it is taken over by tcp_v4_rcv().
int tcp_v4_rcv(struct sk_buff *skb)
{
    const struct iphdr *iph;
    const struct tcphdr *th;
    struct sock *sk;
    int ret;
    struct net *net = dev_net(skb->dev);
    ...
    /* checksum verification, i.e. an integrity check */
    if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
        goto csum_error;

    th = tcp_hdr(skb);  /* the TCP header */
    iph = ip_hdr(skb);  /* the IP header */
    TCP_SKB_CB(skb)->seq = ntohl(th->seq);
    TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                skb->len - th->doff * 4);
    TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
    TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
    TCP_SKB_CB(skb)->tcp_tw_isn = 0;
    TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
    TCP_SKB_CB(skb)->sacked = 0;

    /* look up the connection by source/destination address in the
     * established hash table and then the listen hash table; for a
     * connection that is just being set up this returns the entry
     * from the listen hash table */
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    if (!sk)
        goto no_tcp_socket;

process:
    /* if the socket is in TIME_WAIT, take the matching path */
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;
    ...
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);
    sk_mark_napi_id(sk, skb);   /* record the napi id */
    skb->dev = NULL;

    bh_lock_sock_nested(sk);
    tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
    ret = 0;
    if (!sock_owned_by_user(sk)) {  /* sk is not locked by a user, i.e. not in use */
        /* check whether the segment should first go to the prequeue */
        if (!tcp_prequeue(sk, skb))
            ret = tcp_v4_do_rcv(sk, skb);   /* the main processing function */
    /* if a user holds the socket, the packet goes to the backlog
     * (it is not quite clear why the limit is sk_rcvbuf + sk_sndbuf) */
    } else if (unlikely(sk_add_backlog(sk, skb,
                                       sk->sk_rcvbuf + sk->sk_sndbuf))) {
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
    bh_unlock_sock(sk);

    sock_put(sk);

    return ret;
    ...
do_time_wait:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }

    if (skb->len < (th->doff << 2)) {
        inet_twsk_put(inet_twsk(sk));
        goto bad_packet;
    }
    if (tcp_checksum_complete(skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto csum_error;
    }
    /* handle a segment received in the TIME_WAIT state */
    switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    case TCP_TW_SYN: {
        struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                &tcp_hashinfo,
                                                iph->saddr, th->source,
                                                iph->daddr, th->dest,
                                                inet_iif(skb));
        if (sk2) {
            inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
            inet_twsk_put(inet_twsk(sk));
            sk = sk2;
            goto process;
        }
        /* Fall through to ACK */
    }
    case TCP_TW_ACK:
        tcp_v4_timewait_ack(sk, skb);
        break;
    case TCP_TW_RST:
        tcp_v4_send_reset(sk, skb);
        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    case TCP_TW_SUCCESS:;
    }
    goto discard_it;
}
After the SYN arrives, the kernel first checks whether it belongs to an already established connection, using __inet_lookup_skb() to search for a matching socket.
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
                                             struct sk_buff *skb,
                                             const __be16 sport,
                                             const __be16 dport)
{
    /* the sk_buff has a pointer to its sock, skb->sk; but for an skb of a
     * connection that is not yet established it is NULL, so we fall
     * through into __inet_lookup() */
    struct sock *sk = skb_steal_sock(skb);
    const struct iphdr *iph = ip_hdr(skb);

    if (sk)
        return sk;
    else
        return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
                             iph->saddr, sport,
                             iph->daddr, dport, inet_iif(skb));
}

static inline struct sock *__inet_lookup(struct net *net,
                                         struct inet_hashinfo *hashinfo,
                                         const __be32 saddr, const __be16 sport,
                                         const __be32 daddr, const __be16 dport,
                                         const int dif)
{
    u16 hnum = ntohs(dport);
    /* search the established hash table */
    struct sock *sk = __inet_lookup_established(net, hashinfo,
                                                saddr, sport, daddr, hnum, dif);

    /* fall back to the listen hash table */
    return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
                                         daddr, hnum, dif);
}
For our SYN the connection is eventually found in the listen hash table: it is the server's listening socket.
Next, if this listening socket is not currently in use, the segment would be considered for the prequeue; but because this is a SYN segment and no process is waiting to receive data yet, it never enters the prequeue's real processing.
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    /* the prequeue is skipped when /proc/sys/net/ipv4/tcp_low_latency
     * (low latency, default 0) is set, or when no user has called a
     * receive function yet: ucopy.task is set to the receiving process
     * in recvmsg(), so for the first SYN segment we return here */
    if (sysctl_tcp_low_latency || !tp->ucopy.task)
        return false;

    if (skb->len <= tcp_hdrlen(skb) &&
        skb_queue_len(&tp->ucopy.prequeue) == 0)
        return false;

    if (likely(sk->sk_rx_dst))
        skb_dst_drop(skb);
    else
        skb_dst_force_safe(skb);

    /* append to the tail of the prequeue */
    __skb_queue_tail(&tp->ucopy.prequeue, skb);
    tp->ucopy.memory += skb->truesize;
    /* if the prequeue has grown past the socket's receive buffer,
     * move its segments to the backlog */
    if (tp->ucopy.memory > sk->sk_rcvbuf) {
        struct sk_buff *skb1;

        BUG_ON(sock_owned_by_user(sk));
        /* unlink from the prequeue */
        while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
            sk_backlog_rcv(sk, skb1);   /* put it on the backlog */
            NET_INC_STATS_BH(sock_net(sk),
                             LINUX_MIB_TCPPREQUEUEDROPPED);
        }

        tp->ucopy.memory = 0;
    /* if this is now the only segment on the prequeue, wake a sleeping
     * process to collect it */
    } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
        /* wake only one of the processes sleeping on sk, to avoid a
         * thundering herd (how the one to wake is chosen is not
         * examined here) */
        wake_up_interruptible_sync_poll(sk_sleep(sk),
                                        POLLIN | POLLRDNORM | POLLRDBAND);
        /* no ACK pending to send: reset the delayed-ACK timer */
        if (!inet_csk_ack_scheduled(sk))
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                      (3 * tcp_rto_min(sk)) / 4,
                                      TCP_RTO_MAX);
    }
    return true;
}
Since the SYN does not enter the prequeue, processing continues in tcp_v4_do_rcv(), the main segment-processing function.
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
    ...
    /* a SYN segment takes this branch */
    if (sk->sk_state == TCP_LISTEN) {
        /* look for a matching half-open (SYN_RECV) socket */
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;
        /* as shown below, for a SYN segment the function returns its own
         * argument, i.e. nsk == sk */
        if (nsk != sk) {
            sock_rps_save_rxhash(nsk, skb);
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    } else
        sock_rps_save_rxhash(sk, skb);

    /* this is where segments in all states other than ESTABLISHED and
     * TIME_WAIT end up */
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    return 0;
    ...
}
Half-open sockets are looked up by tcp_v4_hnd_req().
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = tcp_hdr(skb);
    const struct iphdr *iph = ip_hdr(skb);
    struct sock *nsk;
    struct request_sock **prev;
    /* Find possible connection requests. */
    /* search the half-open queue; for a first SYN nothing is found */
    struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                   iph->saddr, iph->daddr);

    if (req)
        return tcp_check_req(sk, skb, req, prev, false);

    /* search the established hash table once more, in case a
     * retransmitted SYN already set up the connection in the meantime;
     * for our SYN this also returns NULL */
    nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
                                  th->source, iph->daddr, th->dest,
                                  inet_iif(skb));
    if (nsk) {
        if (nsk->sk_state != TCP_TIME_WAIT) {
            bh_lock_sock(nsk);
            return nsk;
        }
        inet_twsk_put(inet_twsk(nsk));
        return NULL;
    }

#ifdef CONFIG_SYN_COOKIES
    if (!th->syn)
        sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
    /* so in the end the original listening socket is returned: for a SYN
     * segment this function effectively does nothing */
    return sk;
}
Adding the request to the half-open queue happens through icsk->icsk_af_ops->conn_request, and we know icsk->icsk_af_ops points to ipv4_specific.
const struct inet_connection_sock_af_ops ipv4_specific = {
    .queue_xmit     = ip_queue_xmit,
    .send_check     = tcp_v4_send_check,
    .rebuild_header = inet_sk_rebuild_header,
    .sk_rx_dst_set  = inet_sk_rx_dst_set,
    .conn_request   = tcp_v4_conn_request,
    ...
};
So insertion into the half-open queue is handled by tcp_v4_conn_request().
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    /* Never answer to SYNs send to broadcast or multicast */
    if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        goto drop;

    return tcp_conn_request(&tcp_request_sock_ops,
                            &tcp_request_sock_ipv4_ops, sk, skb);

drop:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
    return 0;
}
tcp_v4_conn_request() is a thin wrapper around tcp_conn_request().
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb)
{
    struct tcp_options_received tmp_opt;
    struct request_sock *req;
    struct tcp_sock *tp = tcp_sk(sk);
    struct dst_entry *dst = NULL;
    __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
    bool want_cookie = false, fastopen;
    struct flowi fl;
    struct tcp_fastopen_cookie foc = { .len = -1 };
    int err;

    /* if the syncookies option (/proc/sys/net/ipv4/tcp_syncookies) is 2,
     * or the half-open queue is already full, and isn was not chosen by
     * tcp_timewait_state_process(), decide whether a syncookie should
     * be sent */
    if ((sysctl_tcp_syncookies == 2 ||
         inet_csk_reqsk_queue_is_full(sk)) && !isn) {
        want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
        /* if no syncookie is to be sent, simply drop the segment */
        if (!want_cookie)
            goto drop;
    }

    /* if the full-connection queue is full and more than one request in
     * the half-open queue has never had its SYN+ACK retransmitted,
     * drop the segment */
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
        goto drop;
    }

    /* everything checks out: allocate a request_sock representing the
     * request (the memory comes from TCP's slab cache) */
    req = inet_reqsk_alloc(rsk_ops);
    if (!req)
        goto drop;

    inet_rsk(req)->ireq_family = sk->sk_family;

    /* af_ops is tcp_request_sock_ipv4_ops; this structure matters, keep
     * it in mind */
    tcp_rsk(req)->af_specific = af_ops;

    tcp_clear_options(&tmp_opt);
    tmp_opt.mss_clamp = af_ops->mss_clamp;
    tmp_opt.user_mss = tp->rx_opt.user_mss;
    /* parse the request's TCP options: timestamps, window size,
     * fast open and so on */
    tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

    if (want_cookie && !tmp_opt.saw_tstamp)
        tcp_clear_options(&tmp_opt);

    tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; /* remember whether timestamps are on */
    /* record the options just parsed into the freshly allocated
     * request_sock, i.e. into req */
    tcp_openreq_init(req, &tmp_opt, skb);

    af_ops->init_req(req, sk, skb);

    if (security_inet_conn_request(sk, skb, req))
        goto drop_and_free;

    /* no syncookie to send, and isn was not chosen by
     * tcp_timewait_state_process() */
    if (!want_cookie && !isn) {
        /* if fast recycling of TIME_WAIT connections is enabled,
         * i.e. /proc/sys/net/ipv4/tcp_tw_recycle is set */
        if (tcp_death_row.sysctl_tw_recycle) {
            bool strict;

            /* route lookup */
            dst = af_ops->route_req(sk, &fl, req, &strict);

            if (dst && strict &&
                /* mainly checks whether this collides with an older
                 * connection from the same IP; this is where packet
                 * drops in NAT environments come from */
                !tcp_peer_is_proven(req, dst, true,
                                    tmp_opt.saw_tstamp)) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                goto drop_and_release;
            }
        }
        /* Kill the following clause, if you dislike this way. */
        /* if syncookies are disabled... */
        else if (!sysctl_tcp_syncookies &&
                 /* ...and the half-open queue already holds more than 3/4
                  * of max_syn_backlog... */
                 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                  (sysctl_max_syn_backlog >> 2)) &&
                 /* ...and this connection conflicts with an old one */
                 !tcp_peer_is_proven(req, dst, false,
                                     tmp_opt.saw_tstamp)) {
            /* quite possibly under a SYN flood attack */
            pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
                        rsk_ops->family);
            goto drop_and_release;
        }

        /* generate a random initial sequence number */
        isn = af_ops->init_seq(skb);
    }
    if (!dst) {
        dst = af_ops->route_req(sk, &fl, req, NULL);
        if (!dst)
            goto drop_and_free;
    }

    tcp_ecn_create_request(req, skb, sk, dst);

    /* if a syncookie is wanted, produce one */
    if (want_cookie) {
        isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
        req->cookie_ts = tmp_opt.tstamp_ok;
        if (!tmp_opt.tstamp_ok)
            inet_rsk(req)->ecn_ok = 0;
    }

    tcp_rsk(req)->snt_isn = isn;
    tcp_openreq_init_rwin(req, sk, dst);
    fastopen = !want_cookie &&
               tcp_try_fastopen(sk, skb, req, &foc, dst);
    /* here tcp_v4_send_synack() is invoked to send the SYN+ACK segment */
    err = af_ops->send_synack(sk, dst, &fl, req,
                              skb_get_queue_mapping(skb), &foc);
    if (!fastopen) {
        if (err || want_cookie)
            goto drop_and_free;

        tcp_rsk(req)->listener = NULL;
        /* after sending, add the request to the half-open queue and start
         * the SYNACK timer; both are done by
         * inet_csk_reqsk_queue_hash_add() */
        af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
    }

    return 0;
    ...
}
To join the half-open queue, a request_sock is first created to represent the client's request and then initialized. Note req->ts_recent, which will be used several times later: it records the time at which the peer sent its segment (provided the peer enabled the timestamp option).
static inline void tcp_openreq_init(struct request_sock *req,
                                    struct tcp_options_received *rx_opt,
                                    struct sk_buff *skb)
{
    struct inet_request_sock *ireq = inet_rsk(req);

    req->rcv_wnd = 0;   /* So that tcp_send_synack() knows! */
    req->cookie_ts = 0;
    tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
    tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
    tcp_rsk(req)->snt_synack = tcp_time_stamp;
    tcp_rsk(req)->last_oow_ack_time = 0;
    req->mss = rx_opt->mss_clamp;
    /* if the peer enabled timestamps, record this value: the time at
     * which the peer sent its SYN segment */
    req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
    ireq->tstamp_ok = rx_opt->tstamp_ok;    /* timestamp option flag */
    ireq->sack_ok = rx_opt->sack_ok;
    ireq->snd_wscale = rx_opt->snd_wscale;
    ireq->wscale_ok = rx_opt->wscale_ok;
    ireq->acked = 0;
    ireq->ecn_ok = 0;
    ireq->ir_rmt_port = tcp_hdr(skb)->source;
    /* the destination port, i.e. the server's listening port */
    ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
}
Once the request is initialized, it must be vetted. The main check is whether it conflicts with the previous communication from the same IP, performed by tcp_peer_is_proven().
#define TCP_PAWS_MSL    60  /* Per-host timestamps are invalidated
                             * after this time. It should be equal
                             * (or greater than) TCP_TIMEWAIT_LEN
                             * to provide reliability equal to one
                             * provided by timewait state.
                             */
#define TCP_PAWS_WINDOW 1   /* Replay window for per-host
                             * timestamps. It must be less than
                             * minimal timewait lifetime.
                             */
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
                        bool paws_check, bool timestamps)
{
    struct tcp_metrics_block *tm;
    bool ret;

    if (!dst)
        return false;
    rcu_read_lock();
    tm = __tcp_get_metrics_req(req, dst);
    if (paws_check) {
        /* if the last TCP communication from this IP happened within the
         * past 60 seconds... */
        if (tm &&
            (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
            /* ...and its timestamp is newer than this segment's, or
             * timestamps are disabled, the request is rejected; this
             * shows how turning on fast recycling easily drops packets
             * behind NAT */
            ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
             !timestamps))
            ret = false;
        else
            ret = true;
    } else {
        if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
            ret = true;
        else
            ret = false;
    }
    rcu_read_unlock();

    return ret;
}
The SYN+ACK segment itself is sent by tcp_v4_send_synack().
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              u16 queue_mapping,
                              struct tcp_fastopen_cookie *foc)
{
    const struct inet_request_sock *ireq = inet_rsk(req);
    struct flowi4 fl4;
    int err = -1;
    struct sk_buff *skb;

    /* obtain the route */
    if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
        return -1;

    /* build the SYN+ACK segment; it uses the user's send-buffer memory */
    skb = tcp_make_synack(sk, dst, req, foc);

    if (skb) {
        __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

        skb_set_queue_mapping(skb, queue_mapping);
        /* hand the segment to the IP layer, which builds the IP header
         * and sends the packet */
        err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                    ireq->ir_rmt_addr,
                                    ireq->opt);
        err = net_xmit_eval(err);
    }

    return err;
}
After the SYN+ACK is sent, the connection is placed on the half-open queue and our SYNACK timer is started. This happens through af_ops->queue_hash_add which, as the structure above shows, means calling inet_csk_reqsk_queue_hash_add().
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
    const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
                                 inet_rsk(req)->ir_rmt_port,
                                 lopt->hash_rnd, lopt->nr_table_entries);

    /* add the request to the half-open queue */
    reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
    /* update the half-open queue statistics and start the SYNACK timer */
    inet_csk_reqsk_queue_added(sk, timeout);
}
All that remains is to encapsulate the segment at the IP layer and send it out through the NIC.
With the SYN+ACK sent, connection establishment returns to the client along with this second handshake segment. Once the client receives the SYN+ACK, it considers the connection established. We again start the analysis at the TCP layer, with tcp_v4_rcv().
int tcp_v4_rcv(struct sk_buff *skb)
{
    ...
    /* look up the connection in the established and listen hash tables;
     * the earlier connect() call already put this socket into the
     * established hash table, so when the server's SYN+ACK arrives the
     * matching connection is found in the established table */
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    if (!sk)
        goto no_tcp_socket;
    ...
    ret = 0;
    if (!sock_owned_by_user(sk)) {  /* sk is not locked by a user, i.e. not in use */
        if (!tcp_prequeue(sk, skb))
            ret = tcp_v4_do_rcv(sk, skb);   /* the main processing function */
    }
    ...
}
And once more we arrive at the familiar tcp_v4_do_rcv().
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        ...
    }

    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) {
        ...
    } else
        sock_rps_save_rxhash(sk, skb);

    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    return 0;
    ...
}
But this time our socket is in the SYN_SENT state, so we go straight into tcp_rcv_state_process().
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                          const struct tcphdr *th, unsigned int len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock *req;
    int queued = 0;
    bool acceptable;
    u32 synack_stamp;

    tp->rx_opt.saw_tstamp = 0;

    switch (sk->sk_state) {
    ...
    case TCP_SYN_SENT:
        /* enter the SYN+ACK processing path */
        queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
        if (queued >= 0)
            return queued;

        /* Do step6 onward by hand. */
        tcp_urg(sk, skb, th);
        __kfree_skb(skb);
        tcp_data_snd_check(sk);
        return 0;
    }
    ...
}
Descending one more level, we finally land in tcp_rcv_synsent_state_process().
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                         const struct tcphdr *th,
                                         unsigned int len)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_fastopen_cookie foc = { .len = -1 };
    int saved_clamp = tp->rx_opt.mss_clamp;

    /* parse the TCP options */
    tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
    if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
        tp->rx_opt.rcv_tsecr -= tp->tsoffset;

    if (th->ack) {  /* handle segments carrying the ACK flag */
        /* if the acknowledged number is at or below what has already been
         * acknowledged, or above the next sequence number we will send,
         * the segment is invalid: send a RST */
        if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
            after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
            goto reset_and_undo;

        /* if timestamps are enabled and the echoed timestamp is set... */
        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
            /* ...but it does not fall between the time the SYN was sent
             * and now, consider the segment invalid */
            !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
                     tcp_time_stamp)) {
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
            goto reset_and_undo;
        }

        if (th->rst) {  /* an ACK segment must not carry the RST flag */
            tcp_reset(sk);
            goto discard;
        }

        if (!th->syn)   /* apart from the flags above plus SYN, drop everything */
            goto discard_and_undo;

        TCP_ECN_rcv_synack(tp, th);

        tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
        /* confirm that the ACK's acknowledgement number is sane */
        tcp_ack(sk, skb, FLAG_SLOWPATH);

        /* Ok.. it's good. Set up sequence numbers and
         * move to established.
         */
        tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
        tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

        /* RFC1323: The window in SYN & SYN/ACK segments is
         * never scaled.
         */
        tp->snd_wnd = ntohs(th->window);

        if (!tp->rx_opt.wscale_ok) {
            tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
            tp->window_clamp = min(tp->window_clamp, 65535U);
        }

        /* if the connection supports the timestamp option */
        if (tp->rx_opt.saw_tstamp) {
            tp->rx_opt.tstamp_ok = 1;
            tp->tcp_header_len =
                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
            tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
            tcp_store_ts_recent(tp);    /* record the peer's timestamp */
        } else {
            tp->tcp_header_len = sizeof(struct tcphdr);
        }

        if (tcp_is_sack(tp) && sysctl_tcp_fack)
            tcp_enable_fack(tp);

        tcp_mtup_init(sk);  /* MTU probing initialization */
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
        tcp_initialize_rcv_mss(sk);

        /* Remember, tcp_poll() does not lock socket!
         * Change state from SYN-SENT only after copied_seq
         * is initialized. */
        tp->copied_seq = tp->rcv_nxt;

        smp_mb();

        /* the connection is up: push the state to ESTABLISHED and wake
         * every process sleeping on this socket */
        tcp_finish_connect(sk, skb);

        /* fast open handling */
        if ((tp->syn_fastopen || tp->syn_data) &&
            tcp_rcv_fastopen_synack(sk, skb, &foc))
            return -1;

        /* the ACK is not sent immediately in the following cases:
         * 1. there is data waiting to be sent
         * 2. the user set the TCP_DEFER_ACCEPT option
         * 3. quickack mode is disabled (toggled via TCP_QUICKACK)
         */
        if (sk->sk_write_pending ||
            icsk->icsk_accept_queue.rskq_defer_accept ||
            icsk->icsk_ack.pingpong) {
            /* set ICSK_ACK_SCHED: an ACK is pending but not sent now */
            inet_csk_schedule_ack(sk);
            /* time the last packet was received */
            icsk->icsk_ack.lrcvtime = tcp_time_stamp;
            /* enter quickack mode and set how many ACK segments may be
             * sent in that mode */
            tcp_enter_quickack_mode(sk);
            /* arm the delayed-ACK timer with a 200 ms timeout: the ACK
             * is delayed by at most 200 ms */
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                      TCP_DELACK_MAX, TCP_RTO_MAX);

discard:
            __kfree_skb(skb);
            return 0;
        } else {
            tcp_send_ack(sk);   /* otherwise send the ACK right away:
                                 * the third handshake segment */
        }
        return -1;
    }

    /* no ACK flag; a segment carrying only RST is dropped */
    if (th->rst) {
        goto discard_and_undo;
    }

    /* PAWS check. */
    if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
        tcp_paws_reject(&tp->rx_opt, 0))
        goto discard_and_undo;

    /* a SYN received in SYN_SENT means a simultaneous open */
    if (th->syn) {
        /* We see SYN without ACK. It is attempt of
         * simultaneous connect with crossed SYNs.
         * Particularly, it can be connect to self.
         */
        /* move the connection to SYN_RECV */
        tcp_set_state(sk, TCP_SYN_RECV);
    }
    ...
}
As we know, on receiving this SYN+ACK the client enters the ESTABLISHED state; that happens mainly inside tcp_finish_connect().
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);

    /* from the client's point of view the connection is now up:
     * set the state to ESTABLISHED */
    tcp_set_state(sk, TCP_ESTABLISHED);

    if (skb != NULL) {
        icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
        security_inet_conn_established(sk, skb);
    }

    /* Make sure socket is routed, for correct metrics.  */
    icsk->icsk_af_ops->rebuild_header(sk);

    /* initialize the TCP metrics that cache per-route connection data */
    tcp_init_metrics(sk);

    /* initialize congestion control */
    tcp_init_congestion_control(sk);

    /* record the timestamp of the last sent packet */
    tp->lsndtime = tcp_time_stamp;

    /* initialize the receive and send buffers */
    tcp_init_buffer_space(sk);

    /* if the SO_KEEPALIVE option was used, arm the keepalive timer */
    if (sock_flag(sk, SOCK_KEEPOPEN))
        inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

    if (!tp->rx_opt.snd_wscale)
        __tcp_fast_path_on(tp, tp->snd_wnd);
    else
        tp->pred_flags = 0;

    if (!sock_flag(sk, SOCK_DEAD)) {
        /* points to sock_def_wakeup: wake every process sleeping on
         * this socket */
        sk->sk_state_change(sk);
        /* if the process requested async notification, send SIGIO to
         * tell it the socket is writable */
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
    }
}
tcp_finish_connect() performs these important operations:
- Push the socket state to ESTABLISHED, which means that as far as the client is concerned, the connection is now established.
- Initialize routing metrics, congestion control and related parameters.
- If the user enabled the keepalive timer, it starts taking effect here, tracking how long the connection has been idle (a user-space example follows this list).
- Finally, wake up all processes sleeping on this socket; any process that requested asynchronous notification is sent SIGIO to signal writability.
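As a user-space aside (my own sketch, with arbitrary interval values), this is how the keepalive behaviour referred to above is enabled on a connected socket, arming the timer that tcp_finish_connect() checks via SOCK_KEEPOPEN:

/* sketch: enabling TCP keepalive on a connected socket fd */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int enable_keepalive(int fd)
{
    int on = 1, idle = 60, intvl = 10, cnt = 5;

    /* arms the keepalive timer (SOCK_KEEPOPEN in the kernel code above) */
    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
        return -1;
    /* idle seconds before the first probe */
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
    /* seconds between successive probes */
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
    /* unanswered probes before the connection is declared dead */
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
    return 0;
}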
The last step is sending the third handshake segment, the ACK. It is not necessarily sent right away; it is delayed in the following cases:
- there happens to be data waiting to be sent;
- the user set the TCP_DEFER_ACCEPT option;
- quickack mode is disabled, which can be controlled with the TCP_QUICKACK option (see the sketch below this list).
Even when delayed, the ACK goes out after at most 200 ms, enforced by the delayed-ACK timer.
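For illustration, here is a hedged user-space sketch of the TCP_QUICKACK option mentioned above; note the flag is not sticky, and the kernel may re-enter quickack mode on its own later:

/* sketch: preferring delayed ACKs by clearing quickack mode */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void prefer_delayed_acks(int fd)
{
    int off = 0;
    /* 0 = leave quickack mode, allowing ACKs to be delayed
     * (by up to roughly 200 ms, as described above) */
    setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &off, sizeof(off));
}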
When the ACK is sent immediately, tcp_send_ack() does the job.
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
    struct sk_buff *buff;

    /* If we have been reset, we may not send again. */
    if (sk->sk_state == TCP_CLOSE)
        return;

    tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);

    /* We are not putting this on the write queue, so
     * tcp_transmit_skb() will set the ownership to this
     * sock.
     */
    buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
    if (buff == NULL) { /* allocation failed */
        /* as with the delayed ACK described above, schedule the ACK to
         * be sent a little later */
        inet_csk_schedule_ack(sk);
        /* the timeout is 200 ms */
        inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
        /* arm the delayed-ACK timer */
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                  TCP_DELACK_MAX, TCP_RTO_MAX);
        return;
    }

    /* Reserve space for headers and prepare control bits. */
    skb_reserve(buff, MAX_TCP_HEADER);
    /* initialize an skb that carries no data */
    tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

    /* We do not want pure acks influencing TCP Small Queues or fq/pacing
     * too much.
     * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
     * We also avoid tcp_wfree() overhead (cache line miss accessing
     * tp->tsq_flags) by using regular sock_wfree()
     */
    skb_set_tcp_pure_ack(buff);

    /* Send it off, this clears delayed acks for us. */
    skb_mstamp_get(&buff->skb_mstamp);
    /* the same transmit path as before; from here on it is the IP layer */
    tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
Once the client has sent the third handshake segment, the ACK, the client side already considers the connection established; the server still has to receive this ACK before connection setup is finally complete on its side as well.
The incoming ACK is again handled at the TCP layer by tcp_v4_rcv(), the TCP layer's entry point.
int tcp_v4_rcv(struct sk_buff *skb)
{
    ...
    /* look up the connection in the established and listen hash tables.
     * when the server received the client's SYN, the socket stayed in
     * LISTEN, so on receiving the client's ACK (the third handshake
     * segment) the connection is again found in the listen hash table.
     * one puzzle: if the socket is still in LISTEN here, why does every
     * description (and even netstat) say the server enters SYN_RECV as
     * soon as the SYN is received? */
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    if (!sk)
        goto no_tcp_socket;
    ...
    ret = 0;
    if (!sock_owned_by_user(sk)) {  /* sk is not locked by a user, i.e. not in use */
        if (!tcp_prequeue(sk, skb))
            ret = tcp_v4_do_rcv(sk, skb);   /* the main processing function */
    ...
}
The request was put on the half-open queue when the SYN arrived, so at the third handshake step the matching request can be found there.
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = tcp_hdr(skb);
    const struct iphdr *iph = ip_hdr(skb);
    struct sock *nsk;
    struct request_sock **prev;
    /* the request was added to the half-open queue at the first handshake
     * step, so this time it is found */
    struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                   iph->saddr, iph->daddr);
    /* found the earlier request */
    if (req)
        /* create a socket from this req and return it */
        return tcp_check_req(sk, skb, req, prev, false);
    ...
}

struct request_sock *inet_csk_search_req(const struct sock *sk,
                                         struct request_sock ***prevp,
                                         const __be16 rport, const __be32 raddr,
                                         const __be32 laddr)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
    struct request_sock *req, **prev;

    /* walk the half-open queue looking for the matching request */
    for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport,
                                                lopt->hash_rnd,
                                                lopt->nr_table_entries)];
         (req = *prev) != NULL;
         prev = &req->dl_next) {
        const struct inet_request_sock *ireq = inet_rsk(req);

        if (ireq->ir_rmt_port == rport &&
            ireq->ir_rmt_addr == raddr &&
            ireq->ir_loc_addr == laddr &&
            AF_INET_FAMILY(req->rsk_ops->family)) {
            WARN_ON(req->sk);
            *prevp = prev;
            break;
        }
    }

    return req;
}
Having found the half-open request, a new socket is created from its information; this is the job of tcp_check_req().
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req,
                           struct request_sock **prev,
                           bool fastopen)
{
    struct tcp_options_received tmp_opt;
    struct sock *child;
    const struct tcphdr *th = tcp_hdr(skb);
    __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
    bool paws_reject = false;

    BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));

    tmp_opt.saw_tstamp = 0;
    if (th->doff > (sizeof(struct tcphdr)>>2)) {
        tcp_parse_options(skb, &tmp_opt, 0, NULL);  /* parse the TCP options */

        if (tmp_opt.saw_tstamp) {   /* the timestamp option is enabled */
            /* this is in fact the time the client sent its SYN;
             * req->ts_recent was recorded when the SYN arrived */
            tmp_opt.ts_recent = req->ts_recent;
            /* per the comments, ts_recent_stamp is the time at which
             * ts_recent was recorded; here it is reconstructed backwards,
             * which looks clearly off: if the SYN+ACK was never
             * retransmitted (req->num_timeout == 0), ts_recent_stamp
             * becomes "one second ago", yet the SYN certainly did not
             * arrive a whole second ago, since connection setup takes
             * only a few milliseconds... */
            tmp_opt.ts_recent_stamp = get_seconds() -
                ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
            /* check for timestamp wraparound by comparing the timestamps
             * of the first and third handshake segments; returns false
             * when there is no wraparound */
            paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
        }
    }

    /* if the received sequence number equals the original SYN's, this is
     * a retransmitted SYN; if its timestamp has not wrapped, resend the
     * SYN+ACK and refresh the half-open entry's expiry time */
    if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
        flg == TCP_FLAG_SYN && !paws_reject) {
        if (!tcp_oow_rate_limited(sock_net(sk), skb,
                                  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
                                  &tcp_rsk(req)->last_oow_ack_time) &&
            !inet_rtx_syn_ack(sk, req)) /* within the rate limit: resend the SYN+ACK */
            req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
                               TCP_RTO_MAX) + jiffies; /* refresh the expiry time */
        return NULL;
    }

    /* the ACK's acknowledgement number is wrong: return the listen socket */
    if ((flg & TCP_FLAG_ACK) && !fastopen &&
        (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
        return sk;

    /* Also, it would be not so bad idea to check rcv_tsecr, which
     * is essentially ACK extension and too early or too late values
     * should cause reset in unsynchronized states.
     */

    /* RFC793: "first check sequence number". */

    /* the timestamp has wrapped, or the sequence number is outside the
     * window: send an ACK and drop the segment */
    if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq,
                                      TCP_SKB_CB(skb)->end_seq,
                                      tcp_rsk(req)->rcv_nxt,
                                      tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
        /* Out of window: send ACK and drop. */
        if (!(flg & TCP_FLAG_RST))
            req->rsk_ops->send_ack(sk, skb, req);
        if (paws_reject)
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
        return NULL;
    }

    /* In sequence, PAWS is OK. */

    /* timestamps are on, and the sequence number does not exceed what we
     * expect to receive next */
    if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
        req->ts_recent = tmp_opt.rcv_tsval; /* update ts_recent to the third
                                             * segment's timestamp */

    /* clear the SYN flag */
    if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
        /* Truncate SYN, it is out of window starting
           at tcp_rsk(req)->rcv_isn + 1. */
        flg &= ~TCP_FLAG_SYN;
    }

    /* RFC793: "second check the RST bit" and
     *         "fourth, check the SYN bit"
     */
    if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
        goto embryonic_reset;
    }

    /* ACK sequence verified above, just make sure ACK is
     * set.  If ACK not set, just silently drop the packet.
     *
     * XXX (TFO) - if we ever allow "data after SYN", the
     * following check needs to be removed.
     */
    if (!(flg & TCP_FLAG_ACK))
        return NULL;

    /* For Fast Open no more processing is needed (sk is the
     * child socket).
     */
    if (fastopen)
        return sk;

    /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
    /* TCP_DEFER_ACCEPT is set and this ACK carries no data:
     * drop it for now */
    if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
        TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
        inet_rsk(req)->acked = 1;   /* mark that an ACK has been received */
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
        return NULL;
    }

    /* OK, ACK is valid, create big socket and
     * feed this segment to it. It will repeat all
     * the tests. THIS SEGMENT MUST MOVE SOCKET TO
     * ESTABLISHED STATE. If it will be dropped after
     * socket is created, wait for troubles.
     */
    /* at last a normal ACK: create a new socket and return it */
    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
    if (child == NULL)
        goto listen_overflow;

    /* unlink the old request from the half-open queue */
    inet_csk_reqsk_queue_unlink(sk, req, prev);
    /* free the unlinked request and update the half-open queue
     * statistics; if the half-open queue is now empty, stop the
     * SYNACK timer */
    inet_csk_reqsk_queue_removed(sk, req);

    /* add the newly created socket to the full-connection queue and
     * update that queue's statistics */
    inet_csk_reqsk_queue_add(sk, req, child);
    return child;
    ...
}
After the request is found on the half-open queue, it still has to be compared against the incoming segment to check for timestamp wraparound (assuming the timestamps option is enabled); tcp_paws_reject() performs this check.
static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
                                   int rst)
{
    /* check whether the timestamp has wrapped */
    if (tcp_paws_check(rx_opt, 0))
        return false;

    /* a third handshake segment normally carries no RST flag;
     * the other condition is that more than TCP_PAWS_MSL (60 s) has
     * passed since this IP last communicated, i.e. one TIME_WAIT
     * lifetime */
    if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
        return false;

    return true;
}

static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
                                  int paws_win)
{
    /* rx_opt->ts_recent is the time the SYN was sent and
     * rx_opt->rcv_tsval is the time the client sent the third handshake
     * segment; as long as the timestamp has not wrapped, this condition
     * holds and we normally return here */
    if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
        return true;

    /* more than 24 days since the last segment from this IP: unlikely */
    if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))
        return true;

    /* timestamps are not enabled */
    if (!rx_opt->ts_recent)
        return true;

    return false;
}
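A standalone sketch (my own, not kernel code) of the signed-difference trick used in tcp_paws_check() above: casting the difference of two u32 timestamps to s32 keeps the "not newer than" test correct even after the counter wraps around 2^32.

/* sketch: wraparound-safe timestamp comparison */
#include <stdint.h>
#include <stdio.h>

/* returns 1 if timestamp a is no newer than timestamp b */
static int ts_not_after(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) <= 0;
}

int main(void)
{
    /* ordinary case: 100 is older than 200 */
    printf("%d\n", ts_not_after(100, 200));            /* prints 1 */
    /* wraparound: 0xFFFFFFF0 was generated just before the counter
     * wrapped past 0, so it still counts as older than 0x10 */
    printf("%d\n", ts_not_after(0xFFFFFFF0u, 0x10u));  /* prints 1 */
    /* a plain unsigned comparison would get the second case wrong,
     * since 0xFFFFFFF0 > 0x10 */
    return 0;
}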
Once we are sure the timestamp has not wrapped, we check whether this is a retransmitted SYN; if so, the SYN+ACK is resent and the SYNACK timer is reset.
The next important point: if the TCP_DEFER_ACCEPT option (deferred accept) is enabled and the received ACK carries no data, the ACK is dropped for now; the request is merely marked as having been ACKed, and the real connection establishment waits until the client sends data.
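A hedged user-space sketch of this option (the 5-second value is an arbitrary example; the kernel rounds it to a number of SYNACK retransmission intervals, which is what req->num_timeout is compared against in tcp_check_req()):

/* sketch: enabling TCP_DEFER_ACCEPT on a listening socket so that
 * accept() tends to return only once the client has sent data */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int listen_deferred(int lfd)
{
    int secs = 5;   /* how long bare ACKs may be deferred */
    if (setsockopt(lfd, IPPROTO_TCP, TCP_DEFER_ACCEPT,
                   &secs, sizeof(secs)) < 0)
        return -1;
    return listen(lfd, 128);
}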
After all these checks it is finally time to create the new socket, so inet_csk(sk)->icsk_af_ops->syn_recv_sock takes the stage. Our familiar icsk_af_ops appears again; it points to ipv4_specific:
const struct inet_connection_sock_af_ops ipv4_specific = {
    .queue_xmit     = ip_queue_xmit,
    .send_check     = tcp_v4_send_check,
    .rebuild_header = inet_sk_rebuild_header,
    .sk_rx_dst_set  = inet_sk_rx_dst_set,
    .conn_request   = tcp_v4_conn_request,
    .syn_recv_sock  = tcp_v4_syn_recv_sock,
    ...
};
So creating the new socket is done by tcp_v4_syn_recv_sock().
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
    struct inet_request_sock *ireq;
    struct inet_sock *newinet;
    struct tcp_sock *newtp;
    struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
    struct tcp_md5sig_key *key;
#endif
    struct ip_options_rcu *inet_opt;

    /* drop the segment if the full-connection queue is full */
    if (sk_acceptq_is_full(sk))
        goto exit_overflow;

    /* create a new socket that will handle the established connection,
     * while the original socket keeps listening for new requests */
    newsk = tcp_create_openreq_child(sk, req, skb);
    if (!newsk)
        goto exit_nonewsk;
    ...
    /* set up the new socket's port; usually it shares the listening
     * socket's port */
    if (__inet_inherit_port(sk, newsk) < 0)
        goto put_and_exit;
    /* insert the new socket into the established hash table */
    __inet_hash_nolisten(newsk, NULL);

    return newsk;
    ...
}
It checks whether the full-connection queue is full, dropping the segment if so; otherwise it calls on tcp_create_openreq_child() to create the new socket.
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
                                      struct sk_buff *skb)
{
    /* create the child socket */
    struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

    /* what follows is the new socket's various initializations */
    if (newsk != NULL) {
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct tcp_request_sock *treq = tcp_rsk(req);
        struct inet_connection_sock *newicsk = inet_csk(newsk);
        struct tcp_sock *newtp = tcp_sk(newsk);
        ...
        /* initialize the new socket's timers */
        tcp_init_xmit_timers(newsk);
        ...
        /* if timestamps are enabled */
        if (newtp->rx_opt.tstamp_ok) {
            /* record the third handshake segment's send time; ts_recent
             * was already updated in the flow above */
            newtp->rx_opt.ts_recent = req->ts_recent;
            newtp->rx_opt.ts_recent_stamp = get_seconds();
            newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
        } else {
            newtp->rx_opt.ts_recent_stamp = 0;
            newtp->tcp_header_len = sizeof(struct tcphdr);
        }
        ...
    }
    return newsk;
}

struct sock *inet_csk_clone_lock(const struct sock *sk,
                                 const struct request_sock *req,
                                 const gfp_t priority)
{
    struct sock *newsk = sk_clone_lock(sk, priority);

    if (newsk != NULL) {
        struct inet_connection_sock *newicsk = inet_csk(newsk);

        /* the newly created socket starts out in SYN_RECV */
        newsk->sk_state = TCP_SYN_RECV;
        newicsk->icsk_bind_hash = NULL;

        /* record the remote port and the server-side port */
        inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
        inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
        inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);

        newsk->sk_write_space = sk_stream_write_space;

        inet_sk(newsk)->mc_list = NULL;

        newicsk->icsk_retransmits = 0;  /* retransmission count */
        newicsk->icsk_backoff = 0;      /* backoff exponent */
        newicsk->icsk_probes_out = 0;

        /* Deinitialize accept_queue to trap illegal accesses. */
        memset(&newicsk->icsk_accept_queue, 0,
               sizeof(newicsk->icsk_accept_queue));

        security_inet_csk_clone(newsk, req);
    }
    return newsk;
}
In inet_csk_clone_lock() we finally see a socket's state become SYN_RECV, and only now, while handling the third handshake segment. So much for every description claiming the server enters SYN_RECV as soon as it receives the SYN (even netstat reports it that way).
After the new socket is created it has to be filed away: its port is processed and it is inserted into the bind hash table, so that later lookups can find it.
int __inet_inherit_port(struct sock *sk, struct sock *child)
{
    struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
    unsigned short port = inet_sk(child)->inet_num;
    const int bhash = inet_bhashfn(sock_net(sk), port,
                                   table->bhash_size);
    struct inet_bind_hashbucket *head = &table->bhash[bhash];
    struct inet_bind_bucket *tb;

    spin_lock(&head->lock);
    tb = inet_csk(sk)->icsk_bind_hash;
    /* normally the new socket uses the same port as the original one */
    if (tb->port != port) {
        /* NOTE: using tproxy and redirecting skbs to a proxy
         * on a different listener port breaks the assumption
         * that the listener socket's icsk_bind_hash is the same
         * as that of the child socket. We have to look up or
         * create a new bind bucket for the child here. */
        inet_bind_bucket_for_each(tb, &head->chain) {
            if (net_eq(ib_net(tb), sock_net(sk)) &&
                tb->port == port)
                break;
        }
        if (!tb) {
            tb = inet_bind_bucket_create(table->bind_bucket_cachep,
                                         sock_net(sk), head, port);
            if (!tb) {
                spin_unlock(&head->lock);
                return -ENOMEM;
            }
        }
    }
    /* add the new socket to the bind hash table */
    inet_bind_hash(child, tb, port);
    spin_unlock(&head->lock);

    return 0;
}
Joining the bind hash table alone is not enough, though: it only stores the bound IP and port. The following actions are still needed:
- insert the new socket into the established hash table;
- unlink the old request from the half-open queue and update the half-open statistics;
- add the new socket to the full-connection (accept) queue.
Adding to the full-connection queue is done by inet_csk_reqsk_queue_add():
static inline void inet_csk_reqsk_queue_add(struct sock *sk,
                                            struct request_sock *req,
                                            struct sock *child)
{
    reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child);
}

static inline void reqsk_queue_add(struct request_sock_queue *queue,
                                   struct request_sock *req,
                                   struct sock *parent,
                                   struct sock *child)
{
    /* point the request's sk at the newly created sock, so that a later
     * accept() can follow this request on the full-connection queue to
     * the new sock and start communicating */
    req->sk = child;
    sk_acceptq_added(parent);

    if (queue->rskq_accept_head == NULL)
        queue->rskq_accept_head = req;
    else
        queue->rskq_accept_tail->dl_next = req;

    queue->rskq_accept_tail = req;
    req->dl_next = NULL;
}

static inline void sk_acceptq_added(struct sock *sk)
{
    /* update the count of connections on the full-connection queue */
    sk->sk_ack_backlog++;
}
One thing to note: the function that accounts a request into the half-open queue is inet_csk_reqsk_queue_added(), which differs from the full-connection queue function by a single word, add versus added; don't mix them up.
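From user space, accept() is what consumes the full-connection queue populated above: each call hands back one req->sk as a new file descriptor. A minimal sketch (the listening fd is set up as in the first example; error handling is trimmed for brevity):

/* sketch: draining the full-connection queue with accept() */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

void serve(int lfd)
{
    for (;;) {
        struct sockaddr_in peer;
        socklen_t len = sizeof(peer);
        /* dequeues the head of icsk_accept_queue (rskq_accept_head);
         * blocks while the queue is empty */
        int fd = accept(lfd, (struct sockaddr *)&peer, &len);
        if (fd < 0) {
            perror("accept");
            break;
        }
        printf("new connection, fd = %d\n", fd);
        close(fd);
    }
}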
Once the newly created socket is returned, processing continues in tcp_child_process().
int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
    int ret = 0;
    int state = child->sk_state;

    /* the new socket is not locked by a user */
    if (!sock_owned_by_user(child)) {
        /* yes, it is this function again: the handler for sockets in
         * every other state */
        ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
                                    skb->len);
        /* Wakeup parent, send SIGIO */
        if (state == TCP_SYN_RECV && child->sk_state != state)
            parent->sk_data_ready(parent, 0);
    } else {
        /* Alas, it is possible again, because we do lookup
         * in main socket hash table and lock on listening
         * socket does not protect us more.
         */
        /* a user holds the socket: add the skb to the backlog */
        __sk_add_backlog(child, skb);
    }

    bh_unlock_sock(child);
    sock_put(child);
    return ret;
}
Next the socket has to move from SYN_RECV to ESTABLISHED, and once again tcp_rcv_state_process() does the work.
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                          const struct tcphdr *th, unsigned int len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock *req;
    int queued = 0;
    bool acceptable;
    u32 synack_stamp;

    tp->rx_opt.saw_tstamp = 0;
    ...
    req = tp->fastopen_rsk; /* fast open handling */
    ...
    /* step 5: check the ACK field */
    /* validate the ACK's acknowledgement number */
    acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
                                  FLAG_UPDATE_TS_RECENT) > 0;

    switch (sk->sk_state) {
    /* this is the newly created socket, so its state is SYN_RECV */
    case TCP_SYN_RECV:
        if (!acceptable)
            return 1;

        /* Once we leave TCP_SYN_RECV, we no longer need req
         * so release it.
         */
        if (req) {  /* the fast open path */
            synack_stamp = tcp_rsk(req)->snt_synack;
            tp->total_retrans = req->num_retrans;
            reqsk_fastopen_remove(sk, req, false);
        } else {    /* the regular (non fast open) path */
            synack_stamp = tp->lsndtime;
            /* Make sure socket is routed, for correct metrics. */
            icsk->icsk_af_ops->rebuild_header(sk);
            tcp_init_congestion_control(sk);    /* init congestion control */

            /* MTU probing initialization */
            tcp_mtup_init(sk);
            tp->copied_seq = tp->rcv_nxt;
            /* initialize the receive and send buffer space */
            tcp_init_buffer_space(sk);
        }
        smp_mb();
        /* the server-side connection finally reaches its destination:
         * ESTABLISHED */
        tcp_set_state(sk, TCP_ESTABLISHED);
        /* calls sock_def_wakeup to wake every process on this sock's
         * wait queue */
        sk->sk_state_change(sk);

        /* Note, that this wakeup is only for marginal crossed SYN case.
         * Passively open sockets are not waked up, because
         * sk->sk_sleep == NULL and sk->sk_socket == NULL.
         */
        /* the server side is a passive open, so this branch is not taken */
        if (sk->sk_socket)
            sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
        ...
    }

    /* step 6: check the URG bit */
    tcp_urg(sk, skb, th);

    /* step 7: process the segment text */
    switch (sk->sk_state) {
    ...
    case TCP_ESTABLISHED:
        /* the socket is now ESTABLISHED and can receive and
         * process data */
        tcp_data_queue(sk, skb);
        queued = 1;
        break;
    }

    /* tcp_data could move socket to TIME-WAIT */
    if (sk->sk_state != TCP_CLOSE) {
        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
    }

    if (!queued) {
discard:
        __kfree_skb(skb);
    }
    return 0;
}
At last the server side also reaches the ESTABLISHED state, and a reliable connection has been set up.