Socket bind系統調用簡要分析


主要查看 Linux kernel 源碼:net/socket.c、net/ipv4/af_inet.c 以及 net/ipv4/inet_connection_sock.c 文件

1.1 bind分析

#include <sys/types.h> /* See NOTES */
#include <sys/socket.h>
int bind(int sockfd, const struct sockaddr *addr, socklen_t addrlen);

其中的參數解釋如下。
·sockfd :表示要綁定地址的套接字描述符。
·addr :表示綁定到套接字的地址。
·addrlen :表示綁定的地址長度。
返回值:0 表示成功;-1 表示出錯,並設置 errno 指明具體的錯誤原因。

因為 Linux 的套接字是針對多種協議族的,而每個協議族都可以有不同的地址類型,所以 Linux 套接字中與地址相關的系統調用統一使用了一個公共結構體 struct sockaddr,並要求
調用者將實際的地址參數(例如 struct sockaddr_in *)強制類型轉換為 struct sockaddr *,以此來避免編譯警告。

/*
 *	Bind a name to a socket. Nothing much to do here since it's
 *	the protocol's responsibility to handle the local address.
 *
 *	We move the socket address to kernel space before we call
 *	the protocol layer (having also checked the address is ok).
 *
 *	fd       - file descriptor of the socket to bind
 *	umyaddr  - user-space pointer to the address to bind to
 *	addrlen  - length of the address passed in umyaddr
 *
 *	Returns the value of err: the result of sock->ops->bind() when every
 *	earlier step succeeded, otherwise the first error encountered.  When
 *	the lookup fails, err is whatever sockfd_lookup_light() stored through
 *	its &err argument.
 */

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;
	/* Find the socket that belongs to file descriptor fd. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		/* Copy the address from user space into the kernel buffer. */
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			/* A non-zero result here skips the protocol bind below. */
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			/*
			 * Dispatch to the bind operation installed in sock->ops
			 * (inet_bind() for inet_stream_ops / inet_dgram_ops,
			 * according to the surrounding article).
			 */
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      &address, addrlen);
		}
		/*
		 * NOTE(review): presumably drops the file reference taken by
		 * sockfd_lookup_light() when fput_needed is set - confirm.
		 */
		fput_light(sock->file, fput_needed);
	}
	return err;
}

 如果 sock->ops 指向 inet_stream_ops(SOCK_STREAM,即 TCP),那么 sock->ops->bind 就指向 inet_bind;

如果 sock->ops 指向 inet_dgram_ops(SOCK_DGRAM,即 UDP),那么 sock->ops->bind 同樣指向 inet_bind:

/*
 * inet_bind - socket-layer bind operation for AF_INET sockets.
 *
 * According to the surrounding article, the socket-layer operation table
 * for SOCK_STREAM sockets is inet_stream_ops and its bind member is
 * inet_bind().
 *
 * sock     - socket being bound
 * uaddr    - address to bind to, interpreted as struct sockaddr_in
 * addr_len - length of uaddr in bytes
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL        addr_len too short, or the socket is not in TCP_CLOSE
 *                  state / already has inet_num set
 *   -EAFNOSUPPORT  family is neither AF_INET nor (AF_UNSPEC + INADDR_ANY)
 *   -EADDRNOTAVAIL address is not acceptable for a local bind
 *   -EACCES        port below PROT_SOCK without CAP_NET_BIND_SERVICE
 *   -EADDRINUSE    sk_prot->get_port() refused the port
 * When sk->sk_prot->bind exists, its return value is passed through as is.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;/* struct sock attached to this socket */
	struct inet_sock *inet = inet_sk(sk);/* inet_sock view of sk */
	unsigned short snum;
	int chk_addr_ret;
	int err;

	/* If the socket has its own bind function then use it. (RAW)
	 * Used by raw sockets; per the article, the TCP protocol instance
	 * tcp_prot does not provide this function pointer.
	 */
	if (sk->sk_prot->bind) {
		err = sk->sk_prot->bind(sk, uaddr, addr_len);
		goto out;
	}
	/* The address must be at least as large as struct sockaddr_in. */
	err = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	/* The address family must be AF_INET, with one exception below. */
	if (addr->sin_family != AF_INET) {
		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
		 * only if s_addr is INADDR_ANY.
		 */
		err = -EAFNOSUPPORT;
		if (addr->sin_family != AF_UNSPEC ||
		    addr->sin_addr.s_addr != htonl(INADDR_ANY))
			goto out;
	}
	/* Classify the requested IP address; the result is compared against
	 * RTN_LOCAL, RTN_MULTICAST and RTN_BROADCAST below.
	 */
	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	/* Reject the address with -EADDRNOTAVAIL only when ALL of these hold:
	 *  - sysctl_ip_nonlocal_bind is off (per the article: the sysctl that
	 *    allows binding to non-local IP addresses),
	 *  - neither inet->freebind nor inet->transparent is set on the socket,
	 *  - the address is not INADDR_ANY,
	 *  - the address was not classified as local, multicast or broadcast.
	 */
	err = -EADDRNOTAVAIL;
	if (!sysctl_ip_nonlocal_bind &&
	    !(inet->freebind || inet->transparent) &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
	    chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST &&
	    chk_addr_ret != RTN_BROADCAST)
		goto out;

	snum = ntohs(addr->sin_port);
	err = -EACCES;
	/* snum == 0 asks the system to pick an unused port, so it is always
	 * allowed here.  A non-zero port below PROT_SOCK (ports 1-1023
	 * according to the article) requires CAP_NET_BIND_SERVICE.
	 */
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
		goto out;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	lock_sock(sk);

	/* Check these errors (active socket, double bind).
	 * Fail if the socket is not in its initial TCP_CLOSE state or if
	 * inet_num is already set (a port is already bound).  A socket can be
	 * bound to at most one port, whereas one port may be shared by
	 * several sockets.
	 */
	err = -EINVAL;
	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
		goto out_release_sock;
	/* Record the bound address. */
	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->inet_saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here.
	 * Per the article: for TCP, sk_prot is tcp_prot and get_port is
	 * inet_csk_get_port().  A zero return means the port is usable; on a
	 * non-zero return the addresses recorded above are cleared again.
	 */
	if (sk->sk_prot->get_port(sk, snum)) {
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out_release_sock;
	}
	/* inet_rcv_saddr is the bound address; per the article it is used to
	 * look up the socket when data is received.
	 */
	if (inet->inet_rcv_saddr)
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)/* the caller requested a specific local port */
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->inet_sport = htons(inet->inet_num); /* bound port, network byte order */
	/* Clear any peer address/port. */
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
	sk_dst_reset(sk);
	err = 0;
out_release_sock:
	release_sock(sk);
out:
	return err;
}

 SOCK_STREAM套接口的TCP層操作函數集實例為tcp_prot,其中端口綁定函數為inet_csk_get_port()

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 *
 * sk   - socket that wants the port
 * snum - requested port in host byte order, or 0 to autoselect
 *
 * Returns 0 when sk ends up associated with a bind bucket for the port,
 * and 1 on failure (range exhausted, bind conflict, or bucket allocation
 * failure).  Bottom halves are disabled for the duration of the call and
 * the bind-hash bucket lock taken during the search is released on both
 * the success and the fail_unlock paths.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	/* attempts bounds how many times the autoselect search is restarted
	 * via "goto again" after a conflict is found at tb_found.
	 */
	int ret, attempts = 5;
	struct net *net = sock_net(sk);
	/* smallest_size/smallest_rover remember the reusable bucket with the
	 * fewest owners seen during the autoselect scan; -1 means none seen.
	 */
	int smallest_size = -1, smallest_rover;

	/* Disable bottom halves (per the article: to keep softirqs from
	 * preempting this process-context code).
	 */
	local_bh_disable();

	/* snum == 0: the system picks a port for the sock. */
	if (!snum) {
		int remaining, rover, low, high;

again:
		inet_get_local_port_range(&low, &high); /* fetch the allowed port range */
		remaining = (high - low) + 1;/* number of ports in the range */
		/* Start the scan at a random port inside [low, high]. */
		smallest_rover = rover = net_random() % remaining + low;

		smallest_size = -1;
		do {
			/* Skip ports that are marked as reserved. */
			if (inet_is_reserved_local_port(rover))
				goto next_nolock;/* advance rover and continue */
			/* Select the hash bucket for this port number. */
			head = &hashinfo->bhash[inet_bhashfn(net, rover,
					hashinfo->bhash_size)];
			spin_lock(&head->lock);
			/* Walk the bucket chain from its head. */
			inet_bind_bucket_for_each(tb, node, &head->chain)
				/* A bind bucket already exists for this port in this netns. */
				if (net_eq(ib_net(tb), net) && tb->port == rover) {
					if (tb->fastreuse > 0 &&
					    sk->sk_reuse &&
					    sk->sk_state != TCP_LISTEN &&
					    (tb->num_owners < smallest_size || smallest_size == -1)) {
						/* Remember the least-shared reusable port so far. */
						smallest_size = tb->num_owners;
						smallest_rover = rover;
						/* When more sockets are bound than there are
						 * ports in the range and this bucket does not
						 * conflict, take it immediately.
						 */
						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
							snum = smallest_rover;
							goto tb_found;
						}
					}
					/* Check for a bind conflict (relax == false) to see
					 * whether this in-use port can be shared.
					 */
					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
						snum = rover;
						goto tb_found;
					}
					goto next;/* port cannot be shared, try the next one */
				}
			/* No bucket matched: the port is unused.  Leave the loop
			 * with head->lock still held.
			 */
			break;
		next:
			spin_unlock(&head->lock);
		next_nolock:
			/* Advance to the next port, wrapping from high back to low. */
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0) {
			/* Fall back to the least-shared reusable port, if any. */
			if (smallest_size != -1) {
				snum = smallest_rover;
				goto have_snum;
			}
			goto fail;
		}
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover; /* the automatically selected free port */
	} else {
have_snum:
		/* A specific port is wanted: look for an existing bucket. */
		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum)
				goto tb_found;/* a bucket for this port already exists */
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	/* A bucket exists; decide whether sk may join its owners. */
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		/* Fast path: bucket and socket both allow reuse, the socket is
		 * not listening, and no reusable candidate was recorded by an
		 * autoselect scan (smallest_size == -1).
		 */
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
		    smallest_size == -1) {
			goto success;
		} else {
			ret = 1;
			/* Conflict check, this time with relax == true. */
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
				/* Restart the autoselect scan while attempts remain;
				 * this jumps back into the !snum branch above.
				 */
				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
				    smallest_size != -1 && --attempts >= 0) {
					spin_unlock(&head->lock);
					goto again;
				}

				goto fail_unlock;
			}
		}
	}
tb_not_found:
	ret = 1;
	/* Create the bucket when none exists; fail if creation returns NULL. */
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;
	/* No socket is bound to this port yet: fastreuse is decided by this
	 * first owner alone.
	 */
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		/* A non-reusable or listening socket joins: turn fastreuse off. */
		tb->fastreuse = 0;
success:
	/* Attach sk to the bucket unless it already has one. */
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
我們可以指定系統自動分配端口號時,端口的區間:

/proc/sys/net/ipv4/ip_local_port_range,默認為:32768 61000

也可以指定要保留的端口區間:

/proc/sys/net/ipv4/ip_local_reserved_ports,默認為空


端口綁定沖突
面向連接的、傳輸層的協議族相關的操作函數集:

/*
 * Pointers to address related TCP functions
 * (i.e. things that depend on the address family)
 *
 * Excerpt only: each "..." line stands for members omitted by the article.
 */
struct inet_connection_sock_af_ops {
    ...
    /*
     * Called with the socket being bound (sk), a bind bucket (tb) and a
     * relax flag; the callers shown in inet_csk_get_port() treat a
     * non-zero return as "binding sk to this bucket conflicts".
     */
    int (*bind_conflict) (const struct sock *sk, const struct inet_bind_bucket *tb, bool relax);
    ...
};
/*
 * IPv4 instance of the address-family operations table.
 * Excerpt only: each "..." line stands for initializers omitted by the article.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
    ...
    .bind_conflict = inet_csk_bind_conflict, /* decides whether binding the port conflicts */
    ...
};

/*
 * inet_csk_bind_conflict - check whether binding sk to the port represented
 * by bind bucket tb conflicts with a socket already bound there.
 *
 * sk    - socket that wants to bind
 * tb    - bind bucket whose owners list holds the sockets already bound
 * relax - when false, the additional check in the second inner "if" is
 *         applied, so two reusable, non-listening sockets with overlapping
 *         addresses also count as a conflict
 *
 * Returns non-zero if the walk over tb->owners stopped early on a
 * conflicting socket, 0 otherwise.
 */
int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 *
	 * Walk every sock bound to this port.
	 */

	sk_for_each_bound(sk2, node, &tb->owners) {
		/*
		 * Condition 1: a different socket that is not IPv6-only
		 * (as reported by inet_v6_ipv6only()), and the bound devices
		 * overlap - either side has no bound device, or both are
		 * bound to the same device.
		 */
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			/*
			 * Condition 3 (any one suffices):
			 *   3.1 this socket does not allow reuse
			 *   3.2 the already-bound socket does not allow reuse
			 *   3.3 the already-bound socket is listening
			 * combined with
			 * Condition 2: the bound IP addresses overlap - either
			 * side has a zero rcv_saddr, or both are equal.
			 */
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
				    sk2_rcv_saddr == sk_rcv_saddr(sk))
					break;
			}
			/*
			 * Strict mode (relax == false): even when both sockets
			 * allow reuse and sk2 is not listening, overlapping
			 * addresses are still treated as a conflict.
			 */
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {
				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);

				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
				    sk2_rcv_saddr == sk_rcv_saddr(sk))
					break;
			}
		}
	}
	/*
	 * node is the loop cursor: it is non-NULL only if one of the breaks
	 * above ended the walk early.
	 * NOTE(review): assumes sk_for_each_bound leaves node NULL once the
	 * list is exhausted - confirm against the macro definition.
	 */
	return node != NULL;
}

 Q: 什么情況下會出現沖突呢?

A: 同時符合以下條件才會沖突:

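1. 綁定的設備有重疊:兩個socket綁定在同一設備上,或者其中任意一方沒有綁定設備(sk_bound_dev_if為0,即由系統自動選擇設備)

2. 綁定的IP地址有重疊:兩個socket綁定的IP地址相同,或者其中任意一方綁定的是通配地址(rcv_saddr為0,即由系統自動選擇IP)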

3 以下條件有一個成立:

    3.1 要綁定的socket不允許重用

    3.2 已綁定的socket不允許重用

    3.3 已綁定的socket處於監聽狀態  

    3.4 relax參數為false

 我們看到系統自動選擇端口時,傳入的relax為false,因此即使雙方都允許重用、已綁定的socket也不處於監聽狀態,只要設備和IP地址有重疊,仍然會被判定為沖突。
tcp: bind() fix autoselection to share ports

The current code checks for conflicts when the application requests a specific port.

If there is no conflict, then the request is granted.

On the other hand, the port autoselection done by the kernel fails when all ports are bound

even when there is a port with no conflict available.

The fix changes port autoselection to check if there is a conflict and use it if not.


作者的意思是,在系統自動選擇端口時,判斷可重用端口的主要條件為:

tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN

而其實不符合此條件、但通過bind_conflict()檢查的端口也是可以重用的
簡單的把上面那幾行代碼就這樣加進去,系統自動選擇端口的思路就變為:

1. 隨機選取一個端口。

2. 檢查其是否被使用了。

    2.1 沒有被使用,那么就是這個端口了,退出:)

    2.2 被使用了,檢查重用是否有沖突。

          2.2.1 沒有沖突,就重用這個端口,退出!

          2.2.2 有沖突,繼續遍歷。

3. 端口號加1(超過范圍上限時回到下限),重復步驟2,直到找到可用端口或遍歷完整個范圍。

 

 Linux 內核后來增加了 SO_REUSEPORT,允許多個進程監聽同一個端口,這種情況也需要考慮。

這是為了解決驚群問題。驚群問題有很多解決方案,為什么要選擇這一種,后面再一一細說。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM