connect()实现源码分析

connect()

内核版本:3.10.0-514.16.1.el7.x86_64
下述源码分析均以tcp socket为背景

用户态函数

int connect(int sockfd, const struct sockaddr *addr,socklen_t addrlen);
参数:

sockfd socket文件描述符(进程打开文件表中的索引下标)
addr 要连接的服务端的地址
addrlen addr的长度

返回值:

-1 失败
strerror(errno)可帮助获取失败原因
常见失败原因有:
ETIMEDOUT Connection timed out 服务端一直未回复syn ack,尝试多次syn后返回
ECONNREFUSED Connection refused 服务端端口没有开启,回复rst
EHOSTUNREACH No route to host 服务端在同局域网内,arp请求获取不到对方mac

0 成功

用法:

1
2
3
4
5
6
/* Describe the server endpoint: IPv4 address 180.97.33.108, TCP port 80. */
struct sockaddr_in remote_addr;
memset(&remote_addr, 0, sizeof(remote_addr));
remote_addr.sin_family = AF_INET;
remote_addr.sin_addr.s_addr = inet_addr("180.97.33.108");
remote_addr.sin_port = htons(80);	/* port must be in network byte order */
/* fd is a socket created earlier; pass the size of the object actually filled in. */
connect(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr));

实例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define SERVER_PORT 20000

/*
 * Print a one-line usage hint on standard output.
 *
 * name: program name to show in the hint (the caller passes argv[0]).
 */
void usage(char *name)
{
    fprintf(stdout, "usage: %s IP\n", name);
}
/*
 * Demo client: create a TCP socket and connect() it to the IPv4 address
 * given in argv[1], port SERVER_PORT.
 *
 * Returns 0 when the connection is established; otherwise prints the reason
 * and exits with status 1.
 */
int main(int argc, char **argv)
{
    struct sockaddr_in server_addr;
    int client_fd;
    int saved_errno;

    if (argc < 2) {
        usage(argv[0]);
        exit(EXIT_FAILURE);
    }

    client_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (client_fd < 0) {
        printf("create socket error, exit!\n");
        exit(EXIT_FAILURE);
    }

    /* Build the server endpoint; port goes in network byte order. */
    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(SERVER_PORT);

    /* inet_aton() returns 0 when the string is not a valid IPv4 address. */
    if (inet_aton(argv[1], &server_addr.sin_addr) == 0) {
        printf("invalid IPv4 address %s, exit!\n", argv[1]);
        close(client_fd);
        exit(EXIT_FAILURE);
    }

    if (connect(client_fd, (struct sockaddr *)&server_addr,
                sizeof(server_addr)) < 0) {
        /* Save errno first: the printf below is allowed to overwrite it. */
        saved_errno = errno;
        printf("can not connect to %s, exit!\n", argv[1]);
        printf("%s\n", strerror(saved_errno));
        close(client_fd);
        exit(EXIT_FAILURE);
    }

    close(client_fd);
    return 0;
}

运行方法:
[root@localhost socketdemo]# gcc connect.c -o connect
[root@localhost socketdemo]# ./connect 192.168.55.181

系统调用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
 * socketcall: multiplexed entry point for the socket family of calls.
 *
 * call: selector for the requested operation (SYS_SOCKET, SYS_BIND,
 *       SYS_CONNECT, ...); values outside 1..SYS_SENDMMSG are rejected.
 * args: user-space pointer to the array of arguments for that operation.
 *
 * Returns the result of the dispatched sys_* handler, -EINVAL for an
 * unknown selector or an oversized argument block, -EFAULT when the
 * arguments cannot be copied in, or the audit hook's error.
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
unsigned long a[AUDITSC_ARGS];
unsigned long a0, a1;
int err;
unsigned int len;

if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;

/* nargs[call] is a byte count: it is bounded by sizeof(a) here and
 * divided by sizeof(unsigned long) below to get the argument count.
 */
len = nargs[call];
if (len > sizeof(a))
return -EINVAL;

/* copy_from_user should be SMP safe. */
if (copy_from_user(a, args, len))
return -EFAULT;

err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
if (err)
return err;

a0 = a[0];
a1 = a[1];

/* Dispatch on the selector; the excerpt elides the remaining cases. */
switch (call) {
case SYS_SOCKET:
err = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_CONNECT:
/* connect(fd, addr, addrlen): a1 is reinterpreted as the user address pointer */
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
break;
...

default:
err = -EINVAL;
break;
}
return err;
}
  • 系统调用sys_socketcall会携带(fd,serveraddr,serveraddrlen)参数(注:socketcall是i386等32位架构上的多路复用入口;x86_64上connect有独立的系统调用号,会直接进入sys_connect)
  • 系统调用处理函数sys_socketcall会将参数从用户态拷入到内核态局部变量a中
  • 调用sys_connect函数 sys_connect(a0, (struct sockaddr __user *)a1, a[2]);

sys_connect执行入口分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * connect(2) system call entry point.
 *
 * fd:        descriptor of the socket to connect
 * uservaddr: user-space pointer to the peer address
 * addrlen:   length in bytes of the address at uservaddr
 *
 * Returns the value produced by the socket's ->connect() operation, or the
 * error from whichever earlier step (lookup, address copy, security hook)
 * failed.
 */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	/* Map fd to its struct socket; err is passed by pointer so the
	 * lookup can report why it failed.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	/* Copy the peer address from user space into the kernel buffer. */
	err = move_addr_to_kernel(uservaddr, addrlen, &address);
	if (err < 0)
		goto out_put;

	/* Give the security hook a chance to veto the connection. */
	err = security_socket_connect(sock, (struct sockaddr *)&address,
				      addrlen);
	if (err)
		goto out_put;

	/* Protocol-family handler; the file's f_flags are forwarded so the
	 * handler can see O_NONBLOCK.
	 */
	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
				 sock->file->f_flags);
out_put:
	/* Balance the reference taken by sockfd_lookup_light(). */
	fput_light(sock->file, fput_needed);
out:
	return err;
}
  • 根据fd描述符从当前进程current的files指针中的struct fdtable中的fd成员取出file
  • fdt->fd是一个数组用来管理当前进程的file指针
  • 从file的private_data中获取到socket变量
  • 把connect连接的服务端地址存入内核空间中move_addr_to_kernel
  • sock->ops->connect 以tcp为例,此处会调用inet_stream_ops函数集合中的inet_stream_connect

inet_stream_connect分析

1
2
3
4
5
6
7
8
9
10
/*
 * Locked wrapper around __inet_stream_connect(): takes the socket lock,
 * performs the connect, and drops the lock again.
 *
 * Returns whatever __inet_stream_connect() returns.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	int ret;

	lock_sock(sk);
	ret = __inet_stream_connect(sock, uaddr, addr_len, flags);
	release_sock(sk);

	return ret;
}

inet_stream_connect() 为tcp socket时候connect动作调用的函数
该函数在持有sock锁的情况下调用__inet_stream_connect函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
 * Core of connect() for stream sockets.
 *
 * sock:     socket being connected
 * uaddr:    peer address (already copied into kernel space by the caller)
 * addr_len: length of uaddr in bytes
 * flags:    file flags; O_NONBLOCK selects the non-blocking behaviour
 *
 * Returns 0 once the connection is established, or a negative errno:
 * -EINVAL, -EISCONN, -EALREADY, -EINPROGRESS, the value of
 * sock_intr_errno() when interrupted by a signal, or the socket error
 * (falling back to -ECONNABORTED) when the socket ended up in TCP_CLOSE.
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
long timeo;

/* Reject an address too short to even hold the sa_family field. */
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
/* AF_UNSPEC is handled as a disconnect request: call the protocol's
 * disconnect() and set the socket state from its result.
 */
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
}


switch (sock->state) {
/* Any state not listed below is rejected. */
default:
err = -EINVAL;
goto out;
/* The connection to the peer is already established. */
case SS_CONNECTED:
err = -EISCONN;
goto out;
/* A connection attempt is already in progress. */
case SS_CONNECTING:
err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
/* Not connected yet: this is the normal entry state. */
case SS_UNCONNECTED:
err = -EISCONN;
/* Unconnected at the socket level but the sock is not in TCP_CLOSE: fail. */
if (sk->sk_state != TCP_CLOSE)
goto out;
/* Protocol connect handler; for TCP this is tcp_v4_connect, which sends the SYN. */
err = sk->sk_prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out;

/* SYN is on its way: move from unconnected to connecting. */
sock->state = SS_CONNECTING;

/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
*/
err = -EINPROGRESS;
break;
}

/* Timeout for the wait below. Per the article: non-zero for a blocking
 * socket, 0 when O_NONBLOCK is set.
 */
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

/* With the SYN sent, wait for the rest of the handshake. */
/*
 * Blocking socket:
 *   inet_wait_for_connect() waits for the protocol stack and ends when
 *   1. timeo runs out   -> connect returns the err set above
 *                          (-EINPROGRESS for a fresh attempt)
 *   2. a signal arrives -> connect returns sock_intr_errno(timeo)
 *   3. the handshake finishes -> connect returns 0
 * Non-blocking socket:
 *   timeo is 0, so connect returns immediately with the err set above
 *   while the stack keeps working on the handshake.
 */
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
tcp_sk(sk)->fastopen_req &&
tcp_sk(sk)->fastopen_req->data ? 1 : 0;

/* Error code is set above */
if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;

err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
}

/* Connection was closed by RST, timeout, ICMP error
* or another process disconnected us.
*/
if (sk->sk_state == TCP_CLOSE)
goto sock_error;

/* sk->sk_err may be not zero now, if RECVERR was ordered by user
* and error was received after socket entered established state.
* Hence, it is handled normally after connect() return successfully.
*/
/* Handshake complete: the connection is established. */
sock->state = SS_CONNECTED;
err = 0;
out:
return err;

/* Failure path: report the socket error and disconnect. */
sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
if (sk->sk_prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
goto out;
}
  • __inet_stream_connect检查地址长度和协议族
  • 检查sock状态,正常情况下状态为SS_UNCONNECTED
  • sk->sk_prot->connect tcp_v4_connect来发送syn
  • 在syn包发完以后会有两种处理情况

情况1:立即返回,针对于非阻塞socket,此时协议栈正在处理握手connect会返回-EINPROGRESS
情况2:阻塞运行

阻塞时间超时后,connect返回-EINPROGRESS
收到信号,connect返回-ERESTARTSYS或-EINTR(由sock_intr_errno(timeo)决定)

inet_wait_for_connect函数分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * Sleep until the socket leaves the SYN_SENT/SYN_RECV states, the timeout
 * runs out, or a signal is pending for the current task.
 *
 * sk:        socket being connected; its lock is released before each
 *            sleep and re-acquired afterwards
 * timeo:     time left to wait
 * writebias: amount added to sk->sk_write_pending for the duration of
 *            the wait
 *
 * Returns the remaining timeout; the loop exits with 0 once it has run out.
 */
static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
/* Declare the wait-queue entry; per the article its wake-up callback is
 * autoremove_wake_function.
 */
DEFINE_WAIT(wait);

prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending += writebias;

/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
* Connect() does not allow to get error notifications
* without closing the socket.
*/
while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
release_sock(sk);/* about to sleep: drop the socket lock */


timeo = schedule_timeout(timeo);
/*
* schedule_timeout() sleeps until the timeout expires.
* Woken early (e.g. by a signal): returns the time still remaining.
* Timeout expired: returns 0.
*/

/* Awake again: re-take the socket lock. */
lock_sock(sk);

/* Leave the loop when a signal is pending or the timeout ran out. */
if (signal_pending(current) || !timeo)
break;
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}

/* Done waiting: take the task off the wait queue and mark it TASK_RUNNING. */
finish_wait(sk_sleep(sk), &wait);
sk->sk_write_pending -= writebias;
return timeo;
}
  • DEFINE_WAIT宏很重要,其设置了唤醒时把等待项从队列中删除的回调函数autoremove_wake_function
  • 睡眠前进程被设置成TASK_INTERRUPTIBLE状态
  • SO_SNDTIMEO选项对上述的睡眠非常重要
  • SO_SNDTIMEO被设置,则睡眠时间会按照设置值
  • SO_SNDTIMEO没有被设置,则在握手结束或收到信号之前一直阻塞
  • 睡眠结束,进程从睡眠队列中删除,并标记为TASK_RUNNING

prepare_to_wait实现分析

1
2
3
4
5
6
7
8
9
10
11
/*
 * Queue a wait entry on a wait queue and set the current task's state.
 *
 * q:     wait-queue head to sleep on
 * wait:  the caller's wait entry
 * state: task state to set (the connect path passes TASK_INTERRUPTIBLE)
 */
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;

/* Clear the exclusive flag on this entry. */
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
/* Add the entry only if it is not already linked into a queue. */
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
/* The state is changed while q->lock is still held. */
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}
  • prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
  • 把wait放入q队列中,设置当前进程状态为TASK_INTERRUPTIBLE
  • TASK_INTERRUPTIBLE 是一种可中断的睡眠状态
  • 处于TASK_INTERRUPTIBLE状态的进程可以被信号唤醒并处理信号

阻塞socket唤醒机制

[root@localhost stp]# stap bt.stp sock_def_wakeup

WARNING: Missing unwind data for a module, rerun with ‘stap -d e1000’
—————-START————————-
In process [swapper/2]
RIP: ffffffff81558150
RSP: ffff88003fd03970 EFLAGS: 00000246
RAX: 0000000000004308 RBX: ffff88003a82a6c0 RCX: 0000000000000000
RDX: 0000000050000000 RSI: 0000000000ca00c8 RDI: ffff88003a82a6c0
RBP: ffff88003fd03988 R08: ffff88003db89708 R09: ffff88003e001800
R10: ffffffff815dabca R11: 0000000000000000 R12: ffff88001bfa3700
R13: ffff880002db6762 R14: 0000000000000218 R15: ffff880002db675a
FS: 0000000000000000(0000) GS:ffff88003fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00007ffaf3049072 CR3: 000000003b0b7000 CR4: 00000000000406e0
0xffffffff81558150 : sock_def_wakeup+0x0/0x40 [kernel]
0xffffffff815cbc09 : tcp_finish_connect+0xc9/0x120 [kernel]
0xffffffff815cc297 : tcp_rcv_state_process+0x637/0xf20 [kernel]
0xffffffff815d5ffb : tcp_v4_do_rcv+0x17b/0x340 [kernel]
0xffffffff815d76d9 : tcp_v4_rcv+0x799/0x9a0 [kernel]
0xffffffff815b1094 : ip_local_deliver_finish+0xb4/0x1f0 [kernel]
0xffffffff815b1379 : ip_local_deliver+0x59/0xd0 [kernel]
0xffffffff815b0d1a : ip_rcv_finish+0x8a/0x350 [kernel]
0xffffffff815b16a6 : ip_rcv+0x2b6/0x410 [kernel]
0xffffffff815700d2 : __netif_receive_skb_core+0x582/0x800 [kernel]
0xffffffff81570368 : __netif_receive_skb+0x18/0x60 [kernel]
0xffffffff815703f0 : netif_receive_skb_internal+0x40/0xc0 [kernel]
0xffffffff81571578 : napi_gro_receive+0xd8/0x130 [kernel]
0xffffffffa00472fc [e1000]
—————-END————————-

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * Finish an active open: switch the socket to TCP_ESTABLISHED, initialise
 * the per-connection state, and wake up the tasks waiting on the socket.
 *
 * sk:  socket whose handshake has completed
 * skb: packet that completed the handshake; may be NULL, in which case
 *      the rx-dst and security hooks are skipped
 */
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);

tcp_set_state(sk, TCP_ESTABLISHED);

if (skb != NULL) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
}

/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);

tcp_init_metrics(sk);

tcp_init_congestion_control(sk);

/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
*/
tp->lsndtime = tcp_time_stamp;

tcp_init_buffer_space(sk);

/* Start the keepalive timer only when SOCK_KEEPOPEN is set on the socket. */
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
else
tp->pred_flags = 0;

if (!sock_flag(sk, SOCK_DEAD)) {
/* Handshake done: wake every task sleeping on the socket. In the
 * backtrace shown in this article, sk_state_change resolves to
 * sock_def_wakeup.
 */
sk->sk_state_change(sk);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
}
  • sock_def_wakeup ->wake_up_interruptible_all
  • 上述过程发生在三次握手完成后,TCP从SYN_SENT或者SYN_RECV切换到ESTABLISHED状态的时候
  • tcp_finish_connect->sk->sk_state_change[sock_def_wakeup]
  • 此次唤醒是全部唤醒sk上等待队列的进程