bind()实现源码分析

bind()

内核版本:3.10.0-514.16.1.el7.x86_64
下述源码分析均以tcp socket为背景

1
2
3
#include <sys/types.h>
#include <sys/socket.h>
int bind(int sockfd, struct sockaddr *my_addr, socklen_t addrlen);
  • socket文件描述符
  • 要绑定的承载地址和端口的结构体 struct sockaddr
  • 第二个参数struct sockaddr的长度

该函数负责绑定套接字的地址和端口,按照绑定者身份来分,会存在两种情况

情况1:绑定者为客户端,主动发起请求方,绑定地址和端口成功后,会使用该地址和端口进行发包
一般情况下,客户端的地址和端口都是其自动选择的,不需要绑定动作。
情况2:绑定者为服务端,被动连接接收方,绑定地址和端口成功后,客户端只能向该地址和端口发送连接请求。服务端往往需要绑定地址和端口。如果服务端存在多网卡情况,其只需要绑定服务端口即可,其目的地址就是客户端访问的目的地址。

sys_bind

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * bind(2) system-call entry point.
 *
 * Looks up the socket behind @fd, copies the caller's address from user
 * space into a kernel-side sockaddr_storage, runs the security hook and
 * finally dispatches to the protocol family's bind operation through
 * sock->ops->bind.
 *
 * The return value is whatever the last executed step produced: the value
 * sockfd_lookup_light() left in err when no socket was found, a negative
 * result from move_addr_to_kernel(), a non-zero result from
 * security_socket_bind(), or the result of sock->ops->bind().
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct sockaddr_storage kaddr;
	struct socket *sock;
	int fput_needed;
	int err;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	/* Bring the user-space address into kernel memory before using it. */
	err = move_addr_to_kernel(umyaddr, addrlen, &kaddr);
	if (err < 0)
		goto put_sock;

	err = security_socket_bind(sock, (struct sockaddr *)&kaddr, addrlen);
	if (err)
		goto put_sock;

	/* For a TCP socket the article identifies this hook as inet_bind(). */
	err = sock->ops->bind(sock, (struct sockaddr *)&kaddr, addrlen);

put_sock:
	/* Pairs with sockfd_lookup_light() above. */
	fput_light(sock->file, fput_needed);
	return err;
}
  • sockfd_lookup_light 根据fd从当前进程中取出对应的socket;move_addr_to_kernel 把地址参数从用户空间拷贝到内核空间
  • bind系统调用最重要函数为sock->ops->bind
  • 在TCP协议情况下inet_stream_ops中bind成员函数为inet_bind
  • 后续为对此函数的分析

inet_bind

实现较为复杂,现在版本和原始版本相比,支持端口复用了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/*
 * inet_bind - bind an AF_INET socket to a local address and port.
 *
 * @sock:     socket being bound
 * @uaddr:    kernel copy of the caller's address, read as struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * Returns 0 on success or a negative errno: -EINVAL (address too short, or
 * the socket is not in TCP_CLOSE / already has a local port), -EAFNOSUPPORT,
 * -EADDRNOTAVAIL, -EACCES or -EADDRINUSE. When the protocol supplies its own
 * sk_prot->bind hook, that hook's return value is passed through unchanged.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
int err;

/* If the socket has its own bind function then use it. (RAW) */
/* NOTE(review): per the original annotation, TCP's proto does not set this
 * hook, so a TCP socket falls through to the generic path below. */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -EINVAL;
/* Length check: the address must hold at least a struct sockaddr_in. */
if (addr_len < sizeof(struct sockaddr_in))
goto out;

/* Address-family check. Only two combinations are accepted:
 * 1. sin_family is AF_INET
 * 2. sin_family is AF_UNSPEC and the address is INADDR_ANY
 * Anything else leaves inet_bind with -EAFNOSUPPORT.
 */
if (addr->sin_family != AF_INET) {
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
* only if s_addr is INADDR_ANY.
*/
err = -EAFNOSUPPORT;
if (addr->sin_family != AF_UNSPEC ||
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
}

/* Classify the requested IP address. The result is compared below with
 * RTN_LOCAL (local address), RTN_MULTICAST and RTN_BROADCAST; per the
 * original note RTN_UNICAST is another possible result.
 */
chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);

/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
/* Reject the address unless non-local binding is allowed (the
 * ip_nonlocal_bind sysctl, freebind or transparent), or the address is
 * INADDR_ANY, or it was classified as local, multicast or broadcast.
 */
if (!net->ipv4_sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
goto out;

snum = ntohs(addr->sin_port);
err = -EACCES;
/*
 * A non-zero port below PROT_SOCK (1024 according to the article) needs
 * CAP_NET_BIND_SERVICE in the namespace's user_ns; otherwise the bind
 * fails with -EACCES.
 */
if (snum && snum < PROT_SOCK &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;

/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
lock_sock(sk);

/* Check these errors (active socket, double bind). */
err = -EINVAL;
/* bind is only valid on a fresh socket: state must be TCP_CLOSE and no
 * local port may have been assigned yet (inet_num == 0).
 */
if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
/* inet_rcv_saddr is the address used for hash lookups, inet_saddr the
 * source address for transmit. For multicast/broadcast binds the transmit
 * address is cleared to 0.
 */
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */

/* Make sure we are allowed to bind here. */
/* get_port returns 0 when the port could be obtained and non-zero
 * otherwise; on failure both addresses are reset and -EADDRINUSE is
 * returned. Per the article, for TCP sk->sk_prot->get_port is
 * inet_csk_get_port.
 */
if (sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
/*
 * Record in sk->sk_userlocks which parts were explicitly bound: the
 * address (non-zero rcv_saddr) and/or the port (non-zero snum).
 */
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
EXPORT_SYMBOL(inet_bind);

  • 绑定地址长度和协议检查 长度异常返回-EINVAL 表示参数异常,协议不支持 -EAFNOSUPPORT
  • 通过inet_addr_type对绑定地址进行类型检查:地址必须是本机地址、组播地址或广播地址,否则返回 -EADDRNOTAVAIL(地址不可用);地址为INADDR_ANY,或开启了ip_nonlocal_bind/freebind/transparent时不做此限制
  • 如果端口小于1024 ,必须为超级权限ns_capable 否则 err = -EACCES 权限不允许
  • sk->sk_prot->get_port = inet_csk_get_port 四层端口检查,看是否被使用
  • 更新sk->sk_userlocks标记,代表地址和端口已经被绑定

扩展函数:
inet_csk_get_port TCP四层端口检查
inet_addr_type 地址类型判别
ns_capable 超级权限检查

inet_csk_get_port

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/*
 * inet_csk_get_port - obtain a local port for a connection-oriented socket.
 *
 * @sk:   socket requesting the port
 * @snum: requested port, or 0 to have one chosen automatically from the
 *        local port range
 *
 * Returns 0 when the socket ends up attached to a bind bucket for the port,
 * and 1 on failure (no usable port, bind conflict, or bucket allocation
 * failure).
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk);
int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;

/* Bottom halves stay disabled until the matching local_bh_enable() at fail:. */
local_bh_disable();
/*
 * No port was requested.
 */
if (!snum) {/* pick a port automatically */
int remaining, rover, low, high;

again:
/* Fetch the configured local port range. */
inet_get_local_port_range(net, &low, &high);/* fills low/high */
if (attempt_half) {
int half = low + ((high - low) >> 1);

if (attempt_half == 1)
high = half;
else
low = half;
}
/* Number of ports in the range being searched. */
remaining = (high - low) + 1;
/* Start the scan at a random port inside the range. */
smallest_rover = rover = net_random() % remaining + low;

smallest_size = -1;
do {
/* Skip reserved ports (per the original note these are configured via /proc/sys/net/ipv4/ip_local_reserved_ports). */
if (inet_is_reserved_local_port(rover))
goto next_nolock;/* move on to the next port */

/* Pick the bind-hash bucket chain for this port, lock it, then walk it. */
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {

/* The port already has a bucket. If the bucket's fast-reuse flags admit
 * this socket and it has fewer owners than the best candidate so far,
 * remember it as the fallback candidate. */
if (((tb->fastreuse > 0 &&
sk->sk_reuse &&
sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {

/* Record the owner count and the port. */
smallest_size = tb->num_owners;
smallest_rover = rover;

/* More sockets are bound than there are ports in the range: take this
 * candidate right away if the strict conflict check passes. */
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {

/* No conflict: use it. */
snum = smallest_rover;
goto tb_found;
}
}

/* Strict (relax == false) conflict check for the current port. */
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
goto tb_found;
}
/* Port is in the chain and conflicts: try the next one. */
goto next;
}
break;/* no bucket for this port, so it is free; leave the loop with head->lock held */
next:
spin_unlock(&head->lock);
next_nolock:
/* Past the top of the range: wrap around to the bottom. */
if (++rover > high)
rover = low;
} while (--remaining > 0);/* visits each port in the range at most once */

/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
/* Range exhausted: fall back to the remembered reusable port if there is
 * one, otherwise retry on the other half of the range, otherwise fail. */
if (remaining <= 0) {
if (smallest_size != -1) {
snum = smallest_rover;
goto have_snum;
}
if (attempt_half == 1) {
/* OK we now try the upper half of the range */
attempt_half = 2;
goto again;
}
goto fail;
}
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
/* A free port was found. */
snum = rover;
} else {
/* A specific port was requested: look for its bucket in the chain; finding
 * one means the port already has at least a bucket. */
have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;/* a bucket for this port exists */
}

/* No bucket in the chain; one is created at tb_not_found. */
tb = NULL;
goto tb_not_found;

tb_found:
if (!hlist_empty(&tb->owners)) {

/* A socket marked SK_FORCE_REUSE may always share the port. */
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;

if (((tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) {
/* Fast path: the bucket's fastreuse flag admits this non-listening
 * sk_reuse socket, or its fastreuseport flag admits this sk_reuseport
 * socket with a matching uid, and no fallback candidate was recorded.
 * The socket is attached to the bucket at success:.
 */
goto success;
} else {
ret = 1;
/* Relaxed (relax == true) conflict check. On conflict, automatic selection
 * is restarted only when reuse applies, a fallback candidate had been
 * recorded (smallest_size != -1) and attempts remain; otherwise fail.
 * Without a conflict, control falls through to tb_not_found with tb set.
 */
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}

goto fail_unlock;
}
}
}
tb_not_found:
ret = 1;
/* No bucket was found for the port: create one. */
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) {/* bucket has no owners yet: initialise the fast flags from this socket */
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
else
tb->fastreuse = 0;

/* The socket has sk_reuseport set (SO_REUSEPORT per the original note). */
if (sk->sk_reuseport) {
tb->fastreuseport = 1;
tb->fastuid = uid;
} else
tb->fastreuseport = 0;
} else {/* bucket already has owners: the fast flags can only be cleared here */
if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
if (tb->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
tb->fastreuseport = 0;
}
success:
/* The port is usable: attach sk to the bucket unless it is already attached. */
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);/* per the original note, this places sk on tb->owners */
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;

fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}

如果端口为0;则自动选取端口选择过程如下:

先在[low,half] or [half,high]中随机选取一个端口,作为循环获取端口的起始端口,开始以下流程

步骤1: 保留端口检查,不满足,端口加1,重试次数减1,继续从步骤1开始

步骤2: 从当前端口映射的hash桶中取出列表头,遍历检查该端口是否被使用
        步骤2-1:没有被使用,直接退出循环,tb为NULL,创建tb,跳转到tb_not_found将该端口连同创建的tb加入该hash桶的链表中,sk也被放到tb->owners中管理,结束退出
        步骤2-2: 端口被使用了,检查端口使用是否冲突
        步骤2-2-1:没有冲突,退出循环,跳转到tb_found,复用检查成功,sk被放到tb->owners中,结束退出
        步骤2-2-2:存在冲突,直接端口+1,继续循环查找

步骤3:如果[low,half]区间已经全部查找完毕仍未找到可用端口(且attempt_half为1),则将attempt_half置为2,改在[half,high]区间中重新随机选择一个起始端口,回到步骤1

attempt_half

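sk->sk_reuse == SK_CAN_REUSE 时,attempt_half 初始值为 1
先取端口范围 [low, half];该区间耗尽后 attempt_half 置为 2,再取端口范围 [half, high]
否则 attempt_half 为 0
不做折半,直接在整个端口范围 [low, high] 内选择

  • 该值决定上述自动选择端口流程所使用的端口区间
  • 如果sk->sk_reuse被置SK_CAN_REUSE标记,则先从低半区间[low, half]选择端口,找不到时再尝试高半区间[half, high]
  • 否则attempt_half为0,代码不会进入折半分支,直接在整个[low, high]区间内选择端口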

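smallest_size和smallest_rover

这两个变量的作用曾让我疑惑了很久。
在3.10版本的自动选端口循环中,只要当前端口通过了bind_conflict检查就会被直接使用(goto tb_found),smallest_size和smallest_rover只是顺带记录"允许快速复用且owner数最少"的候选端口。
3.10版本的端口查找原则是:确定端口查找区间,随机选择起始端口,只要该端口能复用就直接使用;只有当整个区间遍历完(remaining <= 0)仍未找到端口时,才会回退使用smallest_rover(goto have_snum)。与3.2 kernel相比,"优先选择复用端口数较小的端口"这一原则已经被大大弱化。
因此这两个变量并非完全没有使用,但只在上述兜底路径以及tb_found中对smallest_size的判断里起作用。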

inet_get_local_port_range

1
2
3
4
5
6
7
8
9
10
11
/*
 * Read the local port range configured for @net into *low and *high.
 *
 * Both bounds are sampled between read_seqbegin() and read_seqretry() on
 * the range's lock, and the sampling is repeated for as long as
 * read_seqretry() asks for a retry.
 */
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	unsigned int start;

	for (;;) {
		start = read_seqbegin(&net->ipv4_sysctl_local_ports.lock);

		*low = net->ipv4_sysctl_local_ports.range[0];
		*high = net->ipv4_sysctl_local_ports.range[1];

		/* Stop once the read section completed without a retry request. */
		if (!read_seqretry(&net->ipv4_sysctl_local_ports.lock, start))
			break;
	}
}
1
2
sysctl -a|grep ip_local_port_range
net.ipv4.ip_local_port_range = 32768 60999
  • 上述读取端口范围是用户态的ip_local_port_range,默认是3w多以后的,可以调整此参数扩大端口范围
  • 上述read_seqbegin/read_seqretry这种读取数据的方式是一种顺序锁(seqlock),适用于读多写少的场景,后续会专门写博文研究

tcp端口冲突检查

inet_csk(sk)->icsk_af_ops->bind_conflict

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

/*
 * Address-family operations table for connection sockets over IPv4.
 * tcp_v4_init_sock() installs it as icsk->icsk_af_ops.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
/* Port-conflict check invoked by inet_csk_get_port() through icsk_af_ops. */
.bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};

/*
 * Per-socket initialisation for TCP over IPv4.
 *
 * Runs the common tcp_init_sock() setup and then points the connection
 * socket's address-family operations at ipv4_specific, which is how
 * icsk_af_ops->bind_conflict becomes inet_csk_bind_conflict. When
 * CONFIG_TCP_MD5SIG is enabled the TCP-level af_specific table is set too.
 *
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *conn = inet_csk(sk);

	tcp_init_sock(sk);

	/* Install the IPv4 address-family operations table. */
	conn->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

  • 从上文得知inet_csk(sk)->icsk_af_ops->bind_conflict 函数是inet_csk_bind_conflict
  • af_ops在tcp_v4_init_sock初始化

inet_csk_bind_conflict分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * inet_csk_bind_conflict - check whether binding @sk would clash with a
 * socket already attached to bind bucket @tb.
 *
 * @sk:    socket that wants to bind
 * @tb:    bind bucket for the port; its owners list is walked
 * @relax: when false, an additional strict check is applied to pairs of
 *         sk_reuse sockets (see the second inner test)
 *
 * Returns non-zero when the walk stopped early on a conflicting owner and
 * 0 otherwise (the return expression is sk2 != NULL, which presumes the
 * iterator leaves sk2 NULL once the list is exhausted).
 */
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax)
{
struct sock *sk2;
int reuse = sk->sk_reuse;
int reuseport = sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);

/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/

sk_for_each_bound(sk2, &tb->owners) {

/* Only consider sk2 when it is a different socket, is not IPv6-only, and
 * the two sockets can share a device: either one is not bound to a device
 * or both are bound to the same one. Sockets bound to different devices
 * never conflict. */
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
/*
 * True when NEITHER sharing rule applies to this pair (rules A and B are
 * spelled out in the comment inside the branch).
 */
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
/*
 * The condition above is the negation of (A || B), where
 * A: reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN
 * B: reuseport
 *    && sk2->sk_reuseport
 *    && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2)))
 * If either A or B holds, this branch is skipped and sk2 does not
 * conflict here.
 * A holds when both sockets have sk_reuse set and the owner is not
 * listening.
 * B holds when both sockets have sk_reuseport set and the owner is in
 * TIME_WAIT, or both have sk_reuseport set and share the same uid.
 * With neither rule available, the pair conflicts when the receive
 * addresses overlap: either one is 0 (wildcard) or both are equal.
 */
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
/* Strict mode (relax == false): even when both sockets have sk_reuse set
 * and the owner is not listening, overlapping receive addresses still
 * count as a conflict. */
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {

if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
}
}
return sk2 != NULL;
}

在端口自动选择时可以重用端口条件为:

a设备不同
b绑定ip地址不同
c要绑定sock和已绑定sock地址允许重用,且已绑定socket不处于监听状态
d 链上sock和待检查sock开启端口复用且链表上状态为TW
e 链上sock和待检查sock开启端口复用且两个sock的uid相同

关于条件c的补充说明:即使c满足,也需要结合relax的值来确定,relax为true时可复用,为false时不能复用
自动端口时候relax为false,所以条件c消失,仅仅剩下a、b、d、e四个条件