1. The Data Send/Receive Process in TCP
A Brief Introduction to TCP
TCP is a connection-oriented, reliable, byte-stream-based transport-layer protocol. Its full name is the Transmission Control Protocol, and its core mechanisms are connection establishment (the three-way handshake), the sliding window, and congestion control. The layout of the TCP segment header is shown in the figure below.
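For orientation, the same header fields can also be sketched as a C struct. This is only a simplified view, not the kernel's actual definition (struct tcphdr in include/uapi/linux/tcp.h packs the 4-bit data offset, reserved bits and flags into endianness-dependent bitfields):

#include <stdint.h>

/* Simplified TCP header layout, for illustration only. */
struct tcp_header {
	uint16_t source;	/* source port */
	uint16_t dest;		/* destination port */
	uint32_t seq;		/* sequence number of the first data byte */
	uint32_t ack_seq;	/* next byte expected from the peer (if ACK set) */
	uint16_t off_flags;	/* 4-bit data offset, reserved bits and the
				 * URG/ACK/PSH/RST/SYN/FIN flags */
	uint16_t window;	/* receive window advertised by the sender */
	uint16_t check;		/* checksum over header and payload */
	uint16_t urg_ptr;	/* urgent pointer */
	/* options (0-40 bytes) may follow, then the payload */
};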

We skip the connection-establishment phase here and look at the TCP sliding window and congestion control.
TCP Sliding Window
TCP is a full-duplex protocol: both ends of a session can send and receive at the same time, so the sliding window is split into a send window and a receive window. The send window covers the bytes that are "sent but not yet acknowledged" and "not yet sent but allowed by the peer"; the receive window covers the bytes that are "not yet received but the peer is allowed to send". The reliability of byte-stream transfer over sliding windows comes from acknowledgement and retransmission: the send window moves its left edge only after an ACK for already-sent bytes arrives, and the receive window moves its left edge only when all preceding bytes have been received (no gaps are allowed). The sliding-window mechanism is illustrated in the figure below.
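As a minimal illustration of the send-window bookkeeping described above (hypothetical names that mirror TCP terminology, ignoring 32-bit sequence wraparound; not kernel code):

#include <stdint.h>

/* Send window: bytes in [snd_una, snd_una + wnd) may be outstanding. */
struct send_window {
	uint32_t snd_una;	/* oldest byte sent but not yet acknowledged */
	uint32_t snd_nxt;	/* next byte to be sent */
	uint32_t wnd;		/* window size granted by the receiver */
};

/* Bytes that may still be sent without waiting for an ACK. */
static uint32_t usable_window(const struct send_window *w)
{
	return w->snd_una + w->wnd - w->snd_nxt;
}

/* A cumulative ACK up to (but not including) `ack` slides the left edge;
 * the receiver may also resize the window in the same segment. */
static void on_ack(struct send_window *w, uint32_t ack, uint32_t new_wnd)
{
	if (ack > w->snd_una && ack <= w->snd_nxt)
		w->snd_una = ack;
	w->wnd = new_wnd;
}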
TCP Congestion Control
TCP adjusts its sending rate according to network congestion. Four algorithms implement this: slow start, congestion avoidance, fast retransmit and fast recovery. Three parameters are involved: the congestion window (cwnd) limits how much data the sender may have outstanding before acknowledgements arrive, the receiver's advertised window (rwnd) limits how much data the receiver can accept, and the slow-start threshold (ssthresh) decides whether slow start or congestion avoidance is used. In terms of the sliding window above, rwnd is the receive window, and the sender's usable window is bounded by both rwnd and cwnd (effectively min(cwnd, rwnd)). cwnd grows with each acknowledgement: exponentially (roughly doubling per RTT) during slow start, then linearly once it reaches ssthresh (congestion avoidance). A retransmission timeout sends the sender back to slow start, while three duplicate ACKs trigger fast retransmit and fast recovery, i.e. cwnd is halved and then grows linearly. The whole mechanism is summarised in the figure below.
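In code, the cwnd evolution can be summarised roughly as follows. This is a textbook-style (Reno-like) sketch with cwnd counted in MSS-sized segments and invented names; it is not how the kernel structures its congestion-control modules:

/* Textbook-style congestion control state, cwnd counted in segments. */
struct cc_state {
	unsigned int cwnd;	/* congestion window */
	unsigned int ssthresh;	/* slow-start threshold */
	unsigned int acked;	/* ACK counter used during congestion avoidance */
};

/* Called for every ACK that acknowledges new data. */
static void on_new_ack(struct cc_state *cc)
{
	if (cc->cwnd < cc->ssthresh) {
		cc->cwnd++;		/* slow start: roughly doubles per RTT */
	} else if (++cc->acked >= cc->cwnd) {
		cc->acked = 0;		/* congestion avoidance: about +1 per RTT */
		cc->cwnd++;
	}
}

/* Three duplicate ACKs: fast retransmit, then fast recovery. */
static void on_triple_dupack(struct cc_state *cc)
{
	cc->ssthresh = cc->cwnd / 2 > 2 ? cc->cwnd / 2 : 2;
	cc->cwnd = cc->ssthresh;	/* halve, then resume linear growth */
	cc->acked = 0;
}

/* Retransmission timeout: fall all the way back to slow start. */
static void on_timeout(struct cc_state *cc)
{
	cc->ssthresh = cc->cwnd / 2 > 2 ? cc->cwnd / 2 : 2;
	cc->cwnd = 1;
	cc->acked = 0;
}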

2. Source Code Analysis of send and recv
The common entry point for TCP socket system calls is the socketcall multiplexer, defined via SYSCALL_DEFINE2 in linux/net/socket.c. Inspecting its dispatch shows that send/sendto and recv/recvfrom in fact correspond to only two internal calls: __sys_sendto and __sys_recvfrom.
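This mapping is easy to see from the thin wrappers in net/socket.c, which in recent kernels look roughly like this: send is sendto without a destination address, and recv is recvfrom without a source-address buffer.

SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
		unsigned int, flags)
{
	return __sys_sendto(fd, buff, len, flags, NULL, 0);
}

SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
		unsigned int, flags)
{
	return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
}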

The source code of these two functions:
__sys_sendto
int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
		 struct sockaddr __user *addr, int addr_len)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err;
	struct msghdr msg;
	struct iovec iov;
	int fput_needed;

	/* Wrap the user buffer in a single-segment iov and attach it to msg. */
	err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
	if (unlikely(err))
		return err;
	/* Resolve the file descriptor to its struct socket. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	/* Copy the destination address into kernel space, if one was given. */
	if (addr) {
		err = move_addr_to_kernel(addr, addr_len, &address);
		if (err < 0)
			goto out_put;
		msg.msg_name = (struct sockaddr *)&address;
		msg.msg_namelen = addr_len;
	}
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	msg.msg_flags = flags;
	/* Hand the assembled msghdr to the protocol's sendmsg implementation. */
	err = sock_sendmsg(sock, &msg);

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
}
__sys_recvfrom
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
		   struct sockaddr __user *addr, int __user *addr_len)
{
	struct socket *sock;
	struct iovec iov;
	struct msghdr msg;
	struct sockaddr_storage address;
	int err, err2;
	int fput_needed;

	/* Wrap the user buffer in a single-segment iov and attach it to msg. */
	err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter);
	if (unlikely(err))
		return err;
	/* Resolve the file descriptor to its struct socket. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	/* Save some cycles and don't copy the address if not needed */
	msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
	/* We assume all kernel code knows the size of sockaddr_storage */
	msg.msg_namelen = 0;
	msg.msg_iocb = NULL;
	msg.msg_flags = 0;
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	/* Hand the msghdr to the protocol's recvmsg implementation. */
	err = sock_recvmsg(sock, &msg, flags);

	/* Copy the sender's address back to user space, if requested. */
	if (err >= 0 && addr != NULL) {
		err2 = move_addr_to_user(&address,
					 msg.msg_namelen, addr, addr_len);
		if (err2 < 0)
			err = err2;
	}
	fput_light(sock->file, fput_needed);
out:
	return err;
}
As the listings show, these two system calls only look up the struct socket behind the file descriptor, wrap the user buffer (and, if given, the peer address) into a struct msghdr, and hand it to sock_sendmsg / sock_recvmsg; neither the sliding window nor congestion control appears here.
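From here the call descends through the socket layer into the protocol code. A condensed sketch of the send-side hand-off (simplified; the real sock_sendmsg() also runs an LSM security hook and uses indirect-call optimisations):

/* Simplified: sock_sendmsg() ends up calling the handler in the socket's
 * ops table. For an AF_INET stream socket that is inet_sendmsg(), which
 * forwards to sk->sk_prot->sendmsg(), i.e. tcp_sendmsg() for TCP. */
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
	return sock->ops->sendmsg(sock, msg, msg_data_left(msg));
}

It is only from tcp_sendmsg() downwards that the send queue, the peer's advertised window and the congestion state come into play; the receive side descends symmetrically through sock_recvmsg() to tcp_recvmsg().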
However, the socketcall dispatcher contains four more calls related to sending and receiving data: __sys_sendmsg, __sys_sendmmsg, __sys_recvmsg and __sys_recvmmsg.

Their source code:
__sys_sendmsg
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
		   bool forbid_cmsg_compat)
{
	int fput_needed, err;
	struct msghdr msg_sys;
	struct socket *sock;

	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
		return -EINVAL;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);

	fput_light(sock->file, fput_needed);
out:
	return err;
}
__sys_sendmmsg
int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
		   unsigned int flags, bool forbid_cmsg_compat)
{
	int fput_needed, err, datagrams;
	struct socket *sock;
	struct mmsghdr __user *entry;
	struct compat_mmsghdr __user *compat_entry;
	struct msghdr msg_sys;
	struct used_address used_address;
	unsigned int oflags = flags;

	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
		return -EINVAL;

	if (vlen > UIO_MAXIOV)
		vlen = UIO_MAXIOV;

	datagrams = 0;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	used_address.name_len = UINT_MAX;
	entry = mmsg;
	compat_entry = (struct compat_mmsghdr __user *)mmsg;
	err = 0;
	flags |= MSG_BATCH;

	/* Send up to vlen messages with a single system call, stopping at the
	 * first error or as soon as a message could not be sent in full. */
	while (datagrams < vlen) {
		if (datagrams == vlen - 1)
			flags = oflags;

		if (MSG_CMSG_COMPAT & flags) {
			err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
					     &msg_sys, flags, &used_address, MSG_EOR);
			if (err < 0)
				break;
			err = __put_user(err, &compat_entry->msg_len);
			++compat_entry;
		} else {
			err = ___sys_sendmsg(sock,
					     (struct user_msghdr __user *)entry,
					     &msg_sys, flags, &used_address, MSG_EOR);
			if (err < 0)
				break;
			err = put_user(err, &entry->msg_len);
			++entry;
		}

		if (err)
			break;
		++datagrams;
		if (msg_data_left(&msg_sys))
			break;
		cond_resched();
	}

	fput_light(sock->file, fput_needed);

	/* We only return an error if no datagrams were able to be sent */
	if (datagrams != 0)
		return datagrams;

	return err;
}
__sys_recvmsg
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
		   bool forbid_cmsg_compat)
{
	int fput_needed, err;
	struct msghdr msg_sys;
	struct socket *sock;

	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
		return -EINVAL;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);

	fput_light(sock->file, fput_needed);
out:
	return err;
}
__sys_recvmmsg
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
		   unsigned int vlen, unsigned int flags,
		   struct __kernel_timespec __user *timeout,
		   struct old_timespec32 __user *timeout32)
{
	int datagrams;
	struct timespec64 timeout_sys;

	/* The optional timeout only bounds how long this call keeps waiting
	 * for further datagrams; the remaining time is copied back below. */
	if (timeout && get_timespec64(&timeout_sys, timeout))
		return -EFAULT;

	if (timeout32 && get_old_timespec32(&timeout_sys, timeout32))
		return -EFAULT;

	if (!timeout && !timeout32)
		return do_recvmmsg(fd, mmsg, vlen, flags, NULL);

	datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);

	if (datagrams <= 0)
		return datagrams;

	if (timeout && put_timespec64(&timeout_sys, timeout))
		datagrams = -EFAULT;

	if (timeout32 && put_old_timespec32(&timeout_sys, timeout32))
		datagrams = -EFAULT;

	return datagrams;
}
Looking at these four functions, __sys_sendmsg and __sys_recvmsg are again thin wrappers around ___sys_sendmsg / ___sys_recvmsg, while the mmsg variants merely loop to send or receive several messages per system call; the timeout handled in __sys_recvmmsg only limits how long the call waits for more datagrams and has nothing to do with TCP retransmission. No sliding-window or congestion-control logic appears at this level, so it must be implemented further down, in the TCP protocol stack itself, whose handlers are registered when the protocol stack is initialized.
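This is consistent with where the state actually lives: the window and congestion variables are fields of struct tcp_sock and are driven by tcp_sendmsg(), tcp_recvmsg() and the ACK/timer processing code, not by the system-call layer. An abridged excerpt of the relevant fields (from include/linux/tcp.h; exact layout and comments vary between kernel versions):

struct tcp_sock {
	/* ... */
	u32	rcv_nxt;	/* next sequence number expected from the peer */
	u32	snd_nxt;	/* next sequence number to be sent */
	u32	snd_una;	/* oldest unacknowledged sequence number */
	u32	snd_wnd;	/* send window advertised by the peer */
	u32	rcv_wnd;	/* current receive window */
	u32	snd_cwnd;	/* congestion window */
	u32	snd_ssthresh;	/* slow-start threshold */
	/* ... */
};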
3. Runtime Tracing
Set breakpoints on __sys_sendto, __sys_recvfrom, __sys_sendmsg, __sys_sendmmsg, __sys_recvmsg and __sys_recvmmsg, then run reply/hi.
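A hypothetical session for setting the breakpoints (assuming the kernel is run under qemu with a gdb remote stub listening on port 1234; the commands are illustrative):

(gdb) target remote:1234
(gdb) break __sys_sendto
(gdb) break __sys_recvfrom
(gdb) break __sys_sendmsg
(gdb) break __sys_sendmmsg
(gdb) break __sys_recvmsg
(gdb) break __sys_recvmmsg
(gdb) continue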

Only __sys_sendto and __sys_recvfrom are hit (each is called twice). This shows that for TCP send and recv the system-call layer merely resolves the file descriptor to a socket and packages the user buffer for the layers below; the mechanisms that actually govern the complete data transfer are implemented at a lower level and never appear here.
