-
重要結構體
-
struct socket結構體
/**
 * struct socket - general BSD socket
 * @state: socket state (socket_state), e.g. connected or not connected
 * @type: socket type (%SOCK_STREAM, etc)
 * @flags: socket flags (%SOCK_NOSPACE, etc)
 * @ops: protocol-specific socket operations
 * @file: pointer to the struct file associated with this socket
 * @sk: protocol-side representation of the socket (struct sock); keeping
 *      it behind a pointer separates this structure from any one protocol
 * @wq: wait queue
 */
struct socket {
socket_state state;
kmemcheck_bitfield_begin(type);
short type;
kmemcheck_bitfield_end(type);
unsigned long flags;
struct socket_wq __rcu *wq;
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
};
-
struct socket的創建
// socket() 本質上是 glibc 中的函數,執行的實際上是 sys_socketcall() 系統調用。
// sys_socketcall() 幾乎是所有的socket函數的入口,
// 也就是 bind,connect 等函數都是需要 sys_socketcall() 作為入口,函數如下:
// include/linux/syscalls.h
asmlinkage long sys_socketcall(int call, unsigned long __user *args);
// net/socket.c
/*
 * socketcall - multiplexed entry point for the socket system calls.
 * @call: selects the operation (SYS_SOCKET, SYS_BIND, ...); values outside
 *        1..SYS_SENDMMSG are rejected
 * @args: user-space array holding the arguments of the selected call
 *
 * Copies nargs[call] bytes of arguments from user space, hands them to
 * audit_socketcall(), then dispatches to the matching sys_*() handler.
 * Returns the handler's result, -EINVAL for an out-of-range @call or an
 * argument block larger than the local buffer, -EFAULT if the copy from
 * user space fails, or the error reported by audit_socketcall().
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
unsigned long a[AUDITSC_ARGS];
unsigned long a0, a1;
int err;
unsigned int len;
if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;
// nargs[call] is the size in bytes of the argument block for this call.
len = nargs[call];
if (len > sizeof(a))
return -EINVAL;
/* copy_from_user should be SMP safe. */
if (copy_from_user(a, args, len))
return -EFAULT;
err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
if (err)
return err;
a0 = a[0];
a1 = a[1];
// Select and run the handler that corresponds to the requested call.
switch (call) {
case SYS_SOCKET: // socket() ends up here, in sys_socket()
err = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_CONNECT:
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_LISTEN:
err = sys_listen(a0, a1);
break;
// ... remaining cases elided in this excerpt ...
default:
err = -EINVAL;
break;
}
return err;
}
// include/linux/syscalls.h
asmlinkage long sys_socket(int, int, int);
// net/socket.c
/*
 * socket - create a socket and attach it to a new file descriptor.
 * @family:   protocol family, passed through to sock_create()
 * @type:     socket type; bits outside SOCK_TYPE_MASK may carry
 *            SOCK_CLOEXEC and/or SOCK_NONBLOCK
 * @protocol: protocol number, passed through to sock_create()
 *
 * Returns the value of sock_map_fd() on success, -EINVAL when @type
 * carries unknown flag bits, or the negative error reported by
 * sock_create() / sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int retval;

	/* Check the SOCK_* constants for consistency. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Split @type into creation flags and the bare socket type. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK when its value differs from O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Build the struct socket itself. */
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;

	/* Tie the socket to the file system; drop it again on failure. */
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		sock_release(sock);

	return retval;
}
-
sock_create()函數
// include/linux/net.h
int sock_create(int family, int type, int proto, struct socket **res);
// net/socket.c
/*
 * sock_create - create a socket in the calling task's network namespace.
 *
 * Thin wrapper around __sock_create() that supplies
 * current->nsproxy->net_ns as the namespace and 0 for @kern.
 * Returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
// include/linux/net.h
int __sock_create(struct net *net, int family, int type, int proto,
struct socket **res, int kern);
// net/socket.c
/*
 * __sock_create - validate the request and allocate a struct socket.
 * @net:      network namespace
 * @family:   protocol family; must satisfy 0 <= family < NPROTO
 * @type:     socket type; must satisfy 0 <= type < SOCK_MAX
 * @protocol: protocol number, forwarded to the security hook
 * @res:      output pointer for the new socket
 * @kern:     forwarded to security_socket_create()
 *
 * Returns 0 on success, -EAFNOSUPPORT / -EINVAL for out-of-range
 * arguments, the error from security_socket_create(), or -ENFILE when
 * sock_alloc() fails. Parts of the body are elided in this excerpt.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
// Check that the protocol family is within range.
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX) // check the socket type
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/ // Compatibility path: (PF_INET, SOCK_PACKET) is rewritten to PF_PACKET.
if (family == PF_INET && type == SOCK_PACKET) {
// static: the message is printed only the first time this path is hit.
static int warned;
if (!warned) {
warned = 1;
pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
// Security-module check.
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/ // sock_alloc() provides the struct socket.
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
// ... code elided in this excerpt ...
return 0;
// ... error paths elided in this excerpt ...
}
EXPORT_SYMBOL(__sock_create);
-
sock_alloc()函數解析,被上面的__sock_create()函數調用
// net/socket.c
/*
 * sock_alloc - allocate a socket together with its inode.
 *
 * Obtains a new pseudo inode on the socket mount's superblock, looks up
 * the struct socket paired with it via SOCKET_I(), and initialises the
 * inode's number, mode, owner and inode operations. Also bumps the
 * per-cpu sockets_in_use counter.
 *
 * Returns the socket, or NULL if no inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
	struct inode *inode = new_inode_pseudo(sock_mnt->mnt_sb);
	struct socket *sock;

	if (!inode)
		return NULL;

	sock = SOCKET_I(inode);
	kmemcheck_annotate_bitfield(sock, type);

	inode->i_ino  = get_next_ino();
	inode->i_mode = S_IFSOCK | S_IRWXUGO;	/* socket file, rwx for all */
	inode->i_uid  = current_fsuid();	/* caller's fs uid */
	inode->i_gid  = current_fsgid();	/* caller's fs gid */
	inode->i_op   = &sockfs_inode_ops;

	this_cpu_add(sockets_in_use, 1);

	return sock;
}
// 申請一個 socket 結構體 ,名字為 sock
// 申請一個新的節點和一個新的 socket 項目, 綁定他們兩個並且初始化
// 如果申請inode 失敗返回 NULL, 或者返回sock
// 接下來我們再看到 SOCKET_I(inode);
// include/net/sock.h
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
// 然后我們發現,返回的是 struct socket_alloc 里的 socket 成員,也就是和這個 inode 放在同一個結構體里的 socket 結構體。
// 我們可以分析一個 container_of() 這個是怎么定義的。
// include/linux/kernel.h
/*
 * container_of - get a pointer to the structure that contains a member.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing structure
 * @member: name of the member inside @type
 *
 * __mptr is declared with the member's own type, so @ptr is first stored
 * in a correctly typed temporary. Subtracting offsetof(type, member) from
 * that address (as a char pointer, i.e. in bytes) yields the start of the
 * enclosing structure. Uses a ({ ... }) statement expression and typeof.
 */
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
// typeof 取得 member 的類型,並把 ptr 指針臨時保存為這個類型的 __mptr
// 然后用這個 __mptr 指針減去 member 在 type 中的偏移量。
// 得到的就是 type 這個結構體的頭指針。
// offsetof include/linux/stddef.h
/*
 * offsetof - byte offset of MEMBER within TYPE.
 *
 * Any earlier definition is dropped first. When __compiler_offsetof is
 * defined it is used; otherwise the offset is computed by taking the
 * address of MEMBER in a TYPE placed at address 0 and converting it to
 * size_t.
 */
#undef offsetof
#ifdef __compiler_offsetof
#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
#else
#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
#endif
// 這里有點難理解:container_of() 最后得到的結果是 type 這個結構體的頭指針。
// 所以說 SOCKET_I() 先得到 struct socket_alloc 的頭指針,再返回其中 socket 成員的地址。
// include/net/sock.h
/*
 * struct socket_alloc - a socket and its inode kept in one structure.
 * @socket:    the socket itself
 * @vfs_inode: the inode paired with @socket; SOCKET_I() applies
 *             container_of() to this member to get back to @socket
 */
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
-
回到 __sock_create() 繼續分析
// net/socket.c --> __sock_create()
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
// No entry is installed in net_families[] for this family yet, so request
// the module named "net-pf-<family>".
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
如果內核配置(例如在 make menuconfig 中)開啟了 CONFIG_MODULES(可加載模塊支持),則會編譯並運行上面這個部分。
里面先是檢查對應的協議族的操作表是否已經安裝,如果沒有安裝則使用 request_module 進行安裝,現在都是在 TCP/IP協議下進行分析,所以 family 是 AF_INET , 也就是 2 , 所以實際檢查的全局變量是 net_families[2], 這個全局變量是在系統初始化時由 net/ipv4/af_inet.c 文件進行安裝,具體代碼是:
// net/ipv4/af_inet.c
/*
 * inet_init - boot-time initialisation of the IPv4 socket layer.
 *
 * Registers the tcp/udp/raw/ping struct proto objects, registers the
 * inet family with the socket layer, adds the base L4 protocols,
 * populates the inetsw[] lists from inetsw_array and then initialises
 * the individual protocol modules.
 *
 * Returns 0 on success. If a proto_register() call fails, the protos
 * registered so far are unregistered in reverse order and that error is
 * returned.
 *
 * NOTE(review): in this excerpt the definitions of inetsw_array and
 * tcp_prot are pasted into the middle of the function body for
 * reference; they are not statements of inet_init() itself.
 */
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
// Register each protocol's struct proto.
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
/*
* Add all the base protocols.
*/
// Add each protocol; a failure is only logged, not treated as fatal.
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
// The entries of this key table are registered one by one further down.
// ******************************************************
// inetsw_array: array of struct inet_protosw; each entry names the
// struct proto (e.g. tcp_prot) and proto_ops for one type/protocol pair.
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_REUSE,
},
// ... further entries elided in this excerpt ...
}
// tcp_prot is defined in net/ipv4/tcp_ipv4.c
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock, // init hook; inet_create() calls it through sk->sk_prot->init
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
// ***********************************************************
// Back in inet_init(): register every entry of inetsw_array.
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
// Initialise the individual protocol modules.
/*
* Set the ARP module up
*/
arp_init();
/*
* Set the IP module up
*/
ip_init();
tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type);
ip_tunnel_core_init();
rc = 0;
out:
return rc;
// Unwind labels: each one undoes a registration and falls through to the
// next, so the protos are unregistered in reverse order.
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
goto out;
}
fs_initcall(inet_init);
-
很粗淺的看完協議那一部分之后我們回到 __sock_create()
// net/socket.c
// Note this call through the family's create callback.
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
// 先看一個 inet_protosw 結構體
// include/net/protocol.h
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
/* Link in the per-type list that inet_create() walks (inetsw[type]). */
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
/* Protocol implementation; inet_create() passes it to sk_alloc(). */
struct proto *prot;
/* Socket operations; inet_create() copies this into sock->ops. */
const struct proto_ops *ops;
unsigned char flags; /* See INET_PROTOSW_* below. */
};
// 上面的 create 函數對應的是 net/ipv4/af_inet.c 里面的 inet_create 函數
// net/ipv4/af_inet.c
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
int try_loading_module = 0;
int err;
// 檢查協議是否在范圍之內
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
// 設置狀態為未連接
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
// 遍歷尋找請求的協議類型
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
// 遍歷 inetsw[] 數組,其實就是次數而已
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
// 檢查對應的協議,然后再選擇合適的協議
/* Check the non-wild match. */
// 找到對應的協議,如果找到對應的協議,但是protocol 不是 IPPRORO_IP,則直接退出
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
// 如果沒有對應的協議則返回錯誤碼
err = -EPROTONOSUPPORT;
}
// 如果沒有加載模塊的保護措施
if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
// 檢查通用性,只有root 權限然后使用原始套接字
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
// 對socket 的操作集合進行了互聯。
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
/* 此處調用sk_alloc分配一個struct sock,該結構體龐大,其作用是網絡層對socket的表示,意思就是IP協議下有很多東西比如IP地址,網卡接口,端口等等信息需要再socket層中有所體現從而使編程者方便使用,然后就利用指針等形式把內容進行一定程度上的映射。sk_alloc首先對sock->proto和sock_creator進行設置,設置成當前協議對應的proto調用sk_prot_alloc()根據是否提供了slab緩存而判斷是使用slab緩存還是通用緩存。只要分配成功,則調用sock_lock_init()對緩存進行初始化,主要是對sock鎖、等待隊列以及進程數據結構中的網絡空間結構進行分配。初始化完了后調用sock_net_set()函數對網絡空間結構進行記錄,然后最后增加一個net計數器。至此回到inet_create,判斷是否成功分配 */
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
err = 0;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
// 返回一個 struct inet_sock 的指針給 inet
inet = inet_sk(sk);
// 判斷是不是面向連通
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
// 判斷是不是原始套接字,如果是,新建IP頭部。
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
// 判斷是否采用路徑 MTU 發現算法
if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
// 進一步初始化結構體 sk (struct sock)
// sock_init_data: 初始化接收,發送,錯誤信息隊列,三個隊列都是雙向鏈表,屬於sk_buff_head 結構體,其中會把 sk_buff 結構體串聯在一起,初始化數據包發送定時器,變量,(主要是函數指針)
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
// 這里,就是調用了協議里面的 init 函數 tcp_v4_init_sock
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
-
tcp_v4_init_sock函數
static int tcp_v4_init_sock(struct sock *sk)
{
// 強制轉換類型
struct inet_connection_sock *icsk = inet_csk(sk);
// 調用這個進行初始化 ,里面就時關於tcp 的一些初始化了,到此為止
tcp_init_sock(sk);
// ipv4 專用操作
icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
return 0;
}
-
到此, sock_create 分析完畢
-
最后回到 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
// net/socket.c
// sock_create() was analysed above.
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
// Map the socket into the file system (allocate a descriptor for it).
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
if (retval < 0)
goto out_release;
// net/socket.c
/*
 * sock_map_fd - give a socket a file descriptor.
 * @sock:  socket to expose
 * @flags: passed to get_unused_fd_flags() and sock_alloc_file()
 *
 * Reserves an unused descriptor, allocates a file for @sock and installs
 * it in that descriptor. Returns the descriptor on success; otherwise
 * the negative error from get_unused_fd_flags(), or the error encoded in
 * the pointer returned by sock_alloc_file() (the reserved descriptor is
 * given back in that case).
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *filp;
	int fd;

	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	/* Allocate the file object that backs the socket. */
	filp = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(filp))) {
		put_unused_fd(fd);
		return PTR_ERR(filp);
	}

	fd_install(fd, filp);
	return fd;
}
// 這里所展現的意思是,把socket當成一個文件節點進行操作,open, read,write ,ioctl 等
