tcp socket发送缓冲区

tcp socket发送缓冲区探究结论

1: 未设置SO_SNDBUF时,sk->sk_sndbuf的值由tcp_finish_connect->tcp_init_buffer_space->tcp_sndbuf_expand决定:TCP协议栈会自行计算出一个值sndmem(本例中为46080),sk_sndbuf取sndmem与net.ipv4.tcp_wmem[2](4194304)中的较小值

2: 设置SO_SNDBUF后,tcp_sndbuf_expand将不会再被调用,其值情况完全由sock_setsockopt决定

2-1: 设置值较小 value < 2304 { SOCK_MIN_SNDBUF(4608)/2 }

sk_sndbuf = 4608

2-2: 设置值适中 { SOCK_MIN_SNDBUF(4608)/2 } < value < net.core.wmem_max

sk_sndbuf = value*2

2-3: 设置值较大 value > net.core.wmem_max

sk_sndbuf = net.core.wmem_max* 2

默认情况下(未设置SO_SNDBUF)

net.core.wmem_default = 212992
net.core.wmem_max = 212992
net.ipv4.tcp_wmem = 4096 16384 4194304

  • TCPsocket未connect之前 sendbuf:16384 sk->sk_sndbuf是sysctl_tcp_wmem[1]的值
  • connect之后,sendbuf:46080

通过调试机制可知,sendbuf默认大小为sysctl_tcp_wmem[1],即16384
connect连接到服务端后,sendbuf变为46080,该值不是上述配置中的任何一个值

原因探究

阶段1:tcp_init_sock初始化,sk->sk_sndbuf = sysctl_tcp_wmem[1]

阶段2:主动连接进入ESTABLISHED状态时,在状态切换过程中调用tcp_sndbuf_expand调整sk_sndbuf

stp脚本探测结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
tcp_v4_connect[2017/6/20,10:57:56]local=0.0.0.0:3000,remote=0.0.0.0:0 state:CLOSE,sndbubf 0:16384
tcp_v4_connect return [2017/6/20,10:57:56]local=192.168.55.178:3000,remote=180.97.33.108:80 state:SYN_SENT,sndbubf 1280:16384
tcp_input:302 return [2017/6/20,10:57:56]local=192.168.55.178:3000,remote=180.97.33.108:80 state:ESTABLISHED,sndbubf 0:16384 sndmem : 46080 permss 2304
0xffffffff815c3527 : tcp_sndbuf_expand+0x67/0x90 [kernel]
0xffffffff815c7ba8 : tcp_init_buffer_space+0x178/0x190 [kernel]
0xffffffff815cbbae : tcp_finish_connect+0x6e/0x120 [kernel]
0xffffffff815cc297 : tcp_rcv_state_process+0x637/0xf20 [kernel]
0xffffffff815d5ffb : tcp_v4_do_rcv+0x17b/0x340 [kernel]
0xffffffff815d76d9 : tcp_v4_rcv+0x799/0x9a0 [kernel]
0xffffffff815b1094 : ip_local_deliver_finish+0xb4/0x1f0 [kernel]
0xffffffff815b1379 : ip_local_deliver+0x59/0xd0 [kernel]
0xffffffff815b0d1a : ip_rcv_finish+0x8a/0x350 [kernel]
0xffffffff815b16a6 : ip_rcv+0x2b6/0x410 [kernel]
0xffffffff815700d2 : __netif_receive_skb_core+0x582/0x800 [kernel]
0xffffffff81570368 : __netif_receive_skb+0x18/0x60 [kernel]
0xffffffff815703f0 : netif_receive_skb_internal+0x40/0xc0 [kernel]
0xffffffff81571578 : napi_gro_receive+0xd8/0x130 [kernel]
0xffffffffa00472fc [e1000]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * Raise sk->sk_sndbuf so that it can hold a window's worth of worst-case
 * sized skbs. The buffer is only ever enlarged here, and the new value is
 * capped at sysctl_tcp_wmem[2].
 */
static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO: every frame occupies its own skb, and
	 * skb->head is kmalloc()ed from a power-of-two sized area of memory.
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	per_mss = roundup_pow_of_two(per_mss);
	/* Account for the sk_buff structure that accompanies each frame. */
	per_mss += SKB_DATA_ALIGN(sizeof(struct sk_buff));

	/* Segment count: the largest of the initial cwnd, the current cwnd
	 * and reordering + 1.
	 */
	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2):
	 * Cubic needs a 1.7 factor, rounded up to 2 to leave an extra
	 * cushion (the application might react slowly to POLLOUT).
	 */
	sndmem = 2 * nr_segs * per_mss;

	/* Already large enough: leave the current value alone. */
	if (sk->sk_sndbuf >= sndmem)
		return;

	sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}

设置发送缓冲区大小为较小值

1
2
3
4
5
6
7
8
9
socklen_t sendbuflen = 0;
socklen_t len = sizeof(sendbuflen);
getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, &len);
printf("default,sendbuf:%d\n", sendbuflen);

socklen_t sendbuflen = 100;
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, len);
getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, &len);
printf("now,sendbuf:%d\n", sendbuflen);

输出信息如下:
default,sendbuf:16384
now,sendbuf:4608

输出信息总结:设置sendbuf为100时,没有生效,反而设置出来一个较大的值4608

原因探究

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/

/*
* Excerpt: only the SO_SNDBUF case and the default case are shown; the
* "···" lines mark code omitted from this listing.
*/
int sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
···
case SO_SNDBUF:
/* Don't error on this BSD doesn't and if you think
* about it this is right. Otherwise apps have to
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
/* Clamp the requested size to sysctl_wmem_max (compared as u32). */
val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
/* Record that the send buffer size was set explicitly. */
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
/* Store twice the clamped value, but never less than SOCK_MIN_SNDBUF. */
sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
···
default:
/* Option not handled by any case above. */
ret = -ENOPROTOOPT;
break;
}
release_sock(sk);
return ret;
}

/* 2048 bytes plus the aligned size of struct sk_buff. */
#define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))
/* Lower bound applied to sk_sndbuf by the SO_SNDBUF handler: twice TCP_SKB_MIN_TRUESIZE. */
#define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2)

设置socket选项SO_SNDBUF会触发系统调用,最终调用sock_setsockopt函数,其处理设置选项过程如上:
其会将用户设置的缓冲区大小乘以2,然后和SOCK_MIN_SNDBUF(4608)比较,取较大值
因此最终较小的缓冲区设置值100没有生效(100*2=200,小于4608),生效的是4608

设置发送缓冲区大小为中间值

缓冲区系统设置值大小:
net.core.wmem_max = 212992
net.ipv4.tcp_wmem = 4096 16384 4194304

实验动作将缓冲区大小设置为3000

1
2
3
4
5
6
7
8
9
socklen_t sendbuflen = 0;
socklen_t len = sizeof(sendbuflen);
getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, &len);
printf("default,sendbuf:%d\n", sendbuflen);

socklen_t sendbuflen = 3000;
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, len);
getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, &len);
printf("now,sendbuf:%d\n", sendbuflen);

实验程序输出:
default,sendbuf:16384
now,sendbuf:6000
输出信息总结:设置大小3000生效,sndbuf大小会被设置成为3000*2

设置发送缓冲区大小为较大值

缓冲区系统设置值大小:
net.core.wmem_max = 212992
net.ipv4.tcp_wmem = 4096 16384 4194304

实验动作将缓冲区大小设置为230000

1
2
3
4
5
6
7
8
9
socklen_t sendbuflen = 0;
socklen_t len = sizeof(sendbuflen);
getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, &len);
printf("default,sendbuf:%d\n", sendbuflen);

socklen_t sendbuflen = 230000;
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, len);
getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void*)&sendbuflen, &len);
printf("now,sendbuf:%d\n", sendbuflen);

实验程序输出:
default,sendbuf:16384
now,sendbuf:425984
实验结果分析:设置大小230000(大于系统net.core.wmem_max的212992),sendbuf最终结果为212992*2

原因探究

1
2
3
4
5
6
7
8
case SO_SNDBUF:
/* Clamp the requested size to sysctl_wmem_max (compared as u32). */
val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
/* Record that the send buffer size was set explicitly. */
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
/* Store twice the clamped value, but never less than SOCK_MIN_SNDBUF. */
sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;

val为用户set的值,其在选择时候会同sysctl_wmem_max比较,选取一个较小的值,如果设置值大于sysctl_wmem_max值的话,val就取系统wmem的最大值。

如上可知:230000 > net.core.wmem_max ,所以用户设置SO_SNDBUF选项最大只能取net.core.wmem_max,所以最终sk_sndbuf值为net.core.wmem_max*2 即425984

其它说明

tcp socket用变量sk_wmem_queued记录当前发送队列占用的缓冲区大小,
判断发送缓冲区是否还有剩余空间的函数如下:

1
2
3
4
5
6
7
8
/*
 * Report whether the socket's send buffer can take more data.
 *
 * Returns false once sk_wmem_queued has reached sk_sndbuf. Otherwise the
 * decision is delegated to the protocol's stream_memory_free hook when one
 * is installed, and is true when there is no hook.
 */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
	if (sk->sk_wmem_queued < sk->sk_sndbuf) {
		if (sk->sk_prot->stream_memory_free)
			return sk->sk_prot->stream_memory_free(sk);

		return true;
	}

	return false;
}

从上述判别中我们可以知道,发送缓冲区记录和比对单位均是字节