轉載請注明:http://blog.chinaunix.net/uid-20788636-id-4408276.html
1.2 sock_map_fd函數
在用戶空間創建了一個socket后,返回值是一個文件描述符,下面分析一下創建socket時怎么和文件描述符聯系的。在SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)最后調用sock_map_fd進行關聯,其中返回的retval就是用戶空間獲取的文件描述符fd,sock就是調用sock_create創建成功的socket.
sock_map_fd()主要用於對socket的*file指針初始化,經過sock_map_fd()操作后,socket就通過其*file指針與VFS管理的文件進行了關聯,便可以進行文件的各種操作,如read、write、lseek、ioctl等.
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
static int sock_map_fd(struct socket *sock, int flags)
{
struct file *newfile;
int fd = get_unused_fd_flags(flags);//根據flags獲取沒有使用的fd,具體分析見1.2.1
if (unlikely(fd < 0))
return fd;
newfile = sock_alloc_file(sock, flags, NULL);
if (likely(!IS_ERR(newfile))) {
fd_install(fd, newfile);
return fd;
}
put_unused_fd(fd);
return PTR_ERR(newfile);
}
1.2.1 get_unused_fd_flags函數
get_unused_fd_flags()函數調用__alloc_fd分配一個新的可用的fd
int __alloc_fd(struct files_struct *files,
unsigned start, unsigned end, unsigned flags)
{
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
/*得到本進程的文件描述符表*/
fdt = files_fdtable(files);
fd = start;//從start開始,這里的start為0
/* files->next_fd為上一次查找確定的下一個可用空閑的文件描述符,這樣可以提高獲取的效率,如果fd小於files->next_fd的話就可以直接使用next_fd */
if (fd < files->next_fd)
fd = files->next_fd;
/*當fd小於目前進程支持的最大的描述符號,那么可以通過fds_bits位圖,從fd位開始查找,找到下一個0位,即下一個空閑描述符。*/
if (fd < fdt->max_fds)
fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
/*
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
if (fd >= end)
goto out;
/* 如需要則擴展文件描述符表 */
error = expand_files(files, fd);
if (error < 0)
goto out;
/*
* If we needed to expand the fs array we
* might have blocked - try again.
*/
if (error)
goto repeat;
/*
設置next_fd,用於下次加速查找空閑的fd。
當start大於next_fd時,不會設置next_fd以避免文件描述符的不連續
*/
if (start <= files->next_fd)
files->next_fd = fd + 1;
/* 將fd添加到已打開的文件描述符表中 */
__set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd;
#if 1
/* Sanity check */
if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out:
spin_unlock(&files->file_lock);
return error;
}
1.2.2 sock_alloc_file函數
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
struct qstr name = { .name = "" };
struct path path;
struct file *file;
if (dname) {//這里的dname為空
name.name = dname;
name.len = strlen(name.name);
} else if (sock->sk) {
/*這里的name應該是TCP 根據struct proto tcp_prot */
name.name = sock->sk->sk_prot_creator->name;
name.len = strlen(name.name);
}
/*申請一個新的dentry,其中sock_mnt->mnt_sb在前面已經分析過了,是一個sock_fs_type文件系統掛載點,*/
path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
if (unlikely(!path.dentry))
return ERR_PTR(-ENOMEM);
path.mnt = mntget(sock_mnt);
/*將文件操作的函數綁定到inode,對於dentry是在sockfs_mount函數中sockfs_dentry_operations,該函數在sock_init是調用,在前面有分析 */
d_instantiate(path.dentry, SOCK_INODE(sock));
SOCK_INODE(sock)->i_fop = &socket_file_ops;
/*申請新的file,將path,file,關聯起來*/
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
&socket_file_ops);
if (unlikely(IS_ERR(file))) {
/* drop dentry, keep inode */
ihold(path.dentry->d_inode);
path_put(&path);
return file;
}
sock->file = file;//sock->file和剛分配的file關聯起來
file->f_flags = O_RDWR | (flags & O_NONBLOCK);//設置file的標志
file->private_data = sock;//file的私有數據指針指向sock.
return file;
}
Socket創建流程圖
附錄:對於sk_alloc分配的內存大小問題分析
在分析中經常看到此種類型的強制轉換inet = inet_sk(sk);,其中inet被定義為struct inet_sock *inet;結構體,我們看結構體的定義sock結構體的大小小於struct inet_sock,這樣是無法進行強制類型轉換的,但在實際分配的過程中sock分配的大小為tcp_sock的大小,而該結構足夠大。
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
sock_net_set(sk, get_net(net));
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
sock_update_netprioidx(sk);
}
return sk;
}
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;
/*這里分配內存空間時,分為兩種情況,第一種情況是從高速緩存上分配,第二種是普通的分配*/
slab = prot->slab;
if (slab != NULL) {
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);---------------------(1)
if (!sk)
return sk;
if (priority & __GFP_ZERO) {
if (prot->clear_sk)
prot->clear_sk(sk, prot->obj_size);
else
sk_prot_clear_nulls(sk, prot->obj_size);
}
} else
sk = kmalloc(prot->obj_size, priority);---------------------------(2)
if (sk != NULL) {
kmemcheck_annotate_bitfield(sk, flags);
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
}
return sk;
out_free_sec:
security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
else
kfree(sk);
return NULL;
}
(1)第一種情況:sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO) 這里的slap等於slab = prot->slab;也就是函數傳遞過來的struct proto *prot,再看一下這個結構體是怎么定義的?在inet_create函數中sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);,這里的answer_prot為answer_prot = answer->prot;在看一下answer->prot是如何來的?
在inet_ctreate函數中通過遍歷inetsw數組獲取到struct inet_protosw *answer;
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}
其中inetsw的定義下面類型的數組,如果是SOCK_STREAM類型的socket,這里的prot = tcp_prot
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
再看一下
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
在af_inet.c文件中的inet_init函數中的
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
if (!sysctl_local_reserved_ports)
goto out;
//該函數是注冊tcp_prot,在該函數中對tcp_prot->slab進行內存分配
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out_free_reserved_ports;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
/*
* Add all the base protocols.
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. 對inetsw進行初始化操作*/
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
/*將inetsw_array 加入到對於的inetsw鏈表中,就可以在inet_create 函數中進行遍歷*/
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
/*
* Set the ARP module up
*/
arp_init();
/*
* Set the IP module up
*/
ip_init();
tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type);
rc = 0;
out:
return rc;
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
out_free_reserved_ports:
kfree(sysctl_local_reserved_ports);
goto out;
}
在proto_register函數中,主要是關注prot->slab進行了初始化。
int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
SLAB_HWCACHE_ALIGN | prot->slab_flags,
NULL);// 這里的餓prot->obj_size為.obj_size = sizeof(struct tcp_sock),
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
goto out;
}
……………………..
}
(2)對於第二種情況,主要prot->obj_size,就是struct proto tcp_prot 中初始化的.obj_size = sizeof(struct tcp_sock)。sk = kmalloc(prot->obj_size, priority);---------------------------(2)
下面是五個相關的數據結構,tcp_sock結構體占用的空間是最大的,所以在分配內存空間時,都是分配的tcp_sock的大小,這樣在后面進行強制轉換的過程中可以保證正確。