linux下的網絡編程離不開socket,中文被翻譯為套接字。任何網絡通信都必須先建立socket,再通過socket給對方收發數據!數據接收的demo代碼如下:
/*
 * Minimal TCP server demo: create a socket, bind it to port SET_PORT on
 * all local interfaces, listen, and accept a single connection.
 *
 * Fixes vs. the original snippet:
 *  - htons(_INT_PORT) referenced an undefined macro; the intended constant
 *    is SET_PORT (3490).
 *  - <netinet/in.h> was missing (struct sockaddr_in, htons, INADDR_ANY),
 *    as were <stdio.h>/<stdlib.h>/<unistd.h> for perror/exit codes/close.
 *  - accept() requires a (struct sockaddr *) cast for the peer address and
 *    a socklen_t * (not int *) for the length argument.
 *  - every syscall result is now checked and the descriptors are closed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#define SET_PORT 3490

int main(void)
{
    int sockfd, new_fd;
    struct sockaddr_in my_addr;     /* local (listening) address */
    struct sockaddr_in their_addr;  /* peer address filled in by accept() */
    socklen_t sin_size;             /* accept() wants socklen_t, not int */

    sockfd = socket(PF_INET, SOCK_STREAM, 0);
    if (sockfd == -1) {
        perror("socket");
        return EXIT_FAILURE;
    }

    memset(&my_addr, 0, sizeof my_addr);   /* zeroes sin_zero as well */
    my_addr.sin_family = AF_INET;
    my_addr.sin_port = htons(SET_PORT);    /* BUG FIX: was htons(_INT_PORT) — undefined */
    my_addr.sin_addr.s_addr = INADDR_ANY;  /* bind to all local interfaces */

    /* bind the socket to the local address */
    if (bind(sockfd, (struct sockaddr *)&my_addr, sizeof my_addr) == -1) {
        perror("bind");
        close(sockfd);
        return EXIT_FAILURE;
    }

    /* start listening with a backlog of 10 pending connections */
    if (listen(sockfd, 10) == -1) {
        perror("listen");
        close(sockfd);
        return EXIT_FAILURE;
    }

    /* accept one incoming connection */
    sin_size = sizeof their_addr;
    new_fd = accept(sockfd, (struct sockaddr *)&their_addr, &sin_size);
    if (new_fd == -1) {
        perror("accept");
        close(sockfd);
        return EXIT_FAILURE;
    }

    close(new_fd);
    close(sockfd);
    return 0;
}
可以看出,需要先調用socket函數建立socket,再綁定套接字,最后監聽和接收數據。 這個socket到底是啥?linux在內核中又是怎么使用的呢?
1、(1)socket是個結構體,字段不多,但是嵌套了其他結構體,各種嵌套的關系標識如下:
- proto_ops:用戶層調用的各種接口就是在這里注冊的(篇幅有限,截圖的字段不全)
- wq:等待該socket的進程隊列和異步通知隊列;換句話說:同一個socket可能有多個進程都在等待使用!
- sock:應該是socket結構體最核心的嵌套結構體了(篇幅有限,截圖的字段不全)!
(2)socket結構體有了,接下來就是創建和初始化了!linux內核創建socket的函數是__sock_create,核心代碼如下:
/*
 * __sock_create - kernel-side socket creation (author's excerpt; the
 * "........." markers stand for code elided in the original article).
 * Two key steps are shown: sock_alloc() allocates the socket object, and
 * pf->create() hands off to the address family for protocol-specific setup.
 */
int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;
	.........
	/*
	 * Allocate the socket and allow the family to set things up. if
	 * the protocol is 0, the family is instructed to select an appropriate
	 * default.
	 * Essence: the socket structure is created embedded in an inode, so the
	 * superblock can index and manage all sockets uniformly.
	 */
	sock = sock_alloc();
	.........
	/*
	 * The family-specific part of the socket is created here. For AF_INET
	 * this ends up calling inet_create, registered in af_inet.c as:
	 *   static const struct net_proto_family inet_family_ops = {
	 *       .family = PF_INET,
	 *       .create = inet_create,
	 *       .owner  = THIS_MODULE,
	 *   };
	 */
	err = pf->create(net, sock, protocol, kern);
	..................
}
創建socket的核心函數就2個:sock_alloc,還有pf->create!先看第一個sock_alloc,代碼如下:
/**
 * sock_alloc - allocate a socket
 *
 * Allocate a new inode and socket object. The two are bound together
 * and initialised. The socket is then returned. If we are out of inodes
 * NULL is returned.
 *
 * Why allocate an inode when what we asked for is a socket?
 * 1. sockets need management too: stored behind an inode they can be
 *    indexed and managed uniformly through the superblock
 * 2. the socket's attribute fields naturally live in the inode
 * 3. it fits the "everything is a file" philosophy
 */
struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	/* allocate an inode from the (sockfs) superblock */
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
	if (!inode)
		return NULL;

	/* inode and socket are bound together: SOCKET_I maps the inode to its
	 * socket, so the socket can be addressed via the inode for management */
	sock = SOCKET_I(inode);

	kmemcheck_annotate_bitfield(sock, type); /* mark shadow memory: this region is now in use */
	inode->i_ino = get_next_ino();
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_op = &sockfs_inode_ops;

	this_cpu_add(sockets_in_use, 1);
	return sock;
}
本質上就是分配一個inode,然后和socket結構體綁定,通過inode尋址socket結構體!socket結構體有了,接下來就是在socket內部嵌套的sock結構體了!其生成和初始化的工作都是在inet_create內部完成的,代碼如下:
/*
 * inet_create - AF_INET family hook invoked from __sock_create():
 * looks up the type/protocol pair in inetsw[], allocates the struct sock
 * via sk_alloc(), initialises it with sock_init_data(), then runs the
 * protocol's own init hook.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	sock->state = SS_UNCONNECTED; /* initial state is of course "not connected" */

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d", PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(!answer_prot->slab);

	err = -ENOBUFS;
	/* allocate storage for the sock instance (CPU cache or heap) and initialise it */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
	if (!sk)
		goto out;

	err = 0;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	/* 1. cast to inet_sock for the remaining initialisation;
	 * 2. inet and sk are unchanged pointers to the same memory, so both
	 *    can be used interchangeably below */
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	/* 1. initialise the sk_buff read/write/error queues
	 * 2. link the socket and sock instances together
	 * 3. install the default sock callbacks
	 * 4. initialise the remaining sock fields */
	sock_init_data(sock, sk);

	sk->sk_destruct = inet_sock_destruct;           /* destructor callback */
	sk->sk_protocol = protocol;                     /* protocol type */
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	/* sk and inet are used alternately for initialisation */
	inet->uc_ttl = -1;
	inet->mc_loop = 1;
	inet->mc_ttl = 1;
	inet->mc_all = 1;
	inet->mc_index = 0;
	inet->mc_list = NULL;
	inet->rcv_tos = 0;

	sk_refcnt_debug_inc(sk); /* reference count + 1 */

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		err = sk->sk_prot->hash(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
整個邏輯並不復雜,先是調用sk_alloc函數生成sock實例,再調用sock_init_data初始化sock實例,並和socket實例關聯,所以我個人認為sock_init_data是最核心的函數,如下:
/*
 * sock_init_data:
 * 1. initialise the sk_buff read/write/error queues
 * 2. link the socket and sock instances together
 * 3. install the default sock callbacks
 * 4. initialise the remaining sock fields
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	/* initialise the sk_buff read/write/error queues */
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head = NULL;

	/* initialise the timer */
	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;

	/* here, at last, the socket and sock instances are linked together */
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;       /* callback: state changed */
	sk->sk_data_ready = sock_def_readable;       /* callback: data available to read */
	sk->sk_write_space = sock_def_write_space;   /* callback: write buffer available */
	sk->sk_error_report = sock_def_error_report; /* callback: I/O error occurred */
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
上面有幾個回調函數,其實實現的邏輯的代碼結構基本是一樣的:
/*
 * Default Socket Callbacks. Called when the sock's state changes.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq)) /* some process is blocked on this socket */
		/* wake all processes waiting on this socket; the core work is
		 * running each waiter's wakeup callback */
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

/* called when the sock has input data ready to be read */
static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		/* wake the processes waiting for data; again the core work is
		 * running the wakeup callbacks */
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND);

	/* Handle the async-notification queue.
	 * Checks whether the application is waiting in a recv()-style call;
	 * if not, a SIGIO signal is sent to announce that data is readable.
	 * The second argument is the handling mode, the third the I/O type
	 * being announced. */
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
當有可讀數據的時候,肯定第一時間通知相應的進程來讀取數據,核心是通過sk_wake_async函數實現的;而sk_wake_async最終調用了kill_fasync_rcu來給排隊等待的隊列發出SIGIO信號,通知這些隊列中的進程來取數據了!異步的好處在這里就凸顯了:進程不用在這里空轉等數據,而是可以釋放cpu去執行其他進程的代碼;等socket有數據后再通過類似中斷的形式通知等待的進程來取數據了!
/*
 * rcu_read_lock() is held.
 * Despite the "kill" in the name, this actually walks the fasync list and
 * sends the SIGIO-style signal to each registered owner process.
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
	while (fa) {
		struct fown_struct *fown;
		unsigned long flags;

		if (fa->magic != FASYNC_MAGIC) {
			printk(KERN_ERR "kill_fasync: bad magic number in "
			       "fasync_struct!\n");
			return;
		}
		spin_lock_irqsave(&fa->fa_lock, flags);
		if (fa->fa_file) {
			fown = &fa->fa_file->f_owner;
			/* Don't send SIGURG to processes which have not set a
			   queued signum: SIGURG has its own default signalling
			   mechanism. */
			if (!(sig == SIGURG && fown->signum == 0))
				send_sigio(fown, fa->fa_fd, band);
		}
		spin_unlock_irqrestore(&fa->fa_lock, flags);
		fa = rcu_dereference(fa->fa_next);
	}
}