socket()实现源码分析

socket()

内核版本:3.10.0-514.16.1.el7.x86_64

1
2
3
4
#include <sys/types.h>          /* See NOTES */
#include <sys/socket.h>
/* Creates a communication endpoint; returns a new file descriptor, or -1 with errno set. */
int socket(int domain, int type, int protocol);
/* Example: IPv4 stream (TCP) socket; protocol 0 lets the kernel pick the default for the type. */
fd = socket(PF_INET, SOCK_STREAM, 0);

(1).接口说明:

按照顺序可传入如下参数:

  • PF_INET
  • SOCK_STREAM,SOCK_DGRAM,SOCK_RAW
  • IPPROTO_TCP,IPPROTO_UDP,IPPROTO_IP

返回值说明(成功返回文件描述符;失败返回-1,errno可能取值如下)

  • EAFNOSUPPORT 不支持地址类型
  • EMFILE 进程文件表溢出
  • ENFILE 系统已打开的文件总数达到上限
  • EINVAL 参数domain/type/protocol不合法
  • EACCES 权限不允许
  • ENOBUFS/ENOMEM 内存不足
  • EPROTONOSUPPORT domain指定的类型不支持参数type或者protocol

(2).内核调用栈

socket

(3).结构体说明

struct socket

面向用户态的结构体
基于虚拟文件系统创建
创建socket时最先创建的结构体

struct sock

网络层socket

struct inet_sock

INET域socket表示
提供INET域的一些属性,TTL、 组播、 地址 、端口

struct raw_sock、struct udp_sock、struct inet_connection_sock

是对struct inet_sock的扩展
struct raw_sock要处理ICMP
struct udp_sock udp协议socket
struct inet_connection_sock面向连接socket
struct tcp_sock TCP协议socket ,对inet_connection_sock扩展,增加了滑动窗口、拥塞控制等属性
struct inet_timewait_sock网络层超时控制使用
struct tcp_timewait_sock TCP协议超时控制使用

(4).struct socket创建源码分析

(4.1).sock_alloc函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * sock_alloc - allocate a struct socket together with its backing inode.
 *
 * Takes a fresh inode from the superblock of the sockfs mount (sock_mnt),
 * maps it to its socket with SOCKET_I(), fills in the inode number, mode,
 * owner and inode operations, and increments the per-CPU sockets_in_use
 * counter.
 *
 * Returns the socket, or NULL when no inode could be obtained.
 */
static struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;

/* new_inode_pseudo() allocates through alloc_inode() for this superblock. */
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
return NULL;

/* Inode and socket live in one struct socket_alloc (see sock_alloc_inode). */
sock = SOCKET_I(inode);

kmemcheck_annotate_bitfield(sock, type);
/* Present the socket to the VFS as an S_IFSOCK inode with the current task's fsuid/fsgid. */
inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_op = &sockfs_inode_ops;

/* Account for the new socket on this CPU. */
this_cpu_add(sockets_in_use, 1);
return sock;
}
  • 一起申请两块内存struct socket和struct inode
  • 两块内存用struct socket_alloc联系起来
  • inode是linux用来刻画一个存放在内存中的文件的
  • socket是一种网络文件类型,可以通过文件描述符使用read和write等文件操作函数操作socket
  • 有了inode就支持了虚拟文件系统的操作

(4.2).sock_alloc->new_inode_pseudo->alloc_inode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * new_inode_pseudo - obtain a new inode for superblock @sb.
 *
 * Allocates through alloc_inode(); on success clears i_state while holding
 * i_lock and initialises the i_sb_list list head.
 *
 * Returns the inode, or NULL when alloc_inode() fails.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);

if (inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
/* Only the list head is initialised here; nothing is linked onto it. */
INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
/*
 * alloc_inode - allocate and initialise an inode for superblock @sb.
 *
 * Uses the superblock's own s_op->alloc_inode hook when one is set, otherwise
 * falls back to the generic inode_cachep cache. If inode_init_always()
 * reports failure the inode is released again (through s_op->destroy_inode
 * when set, else back to inode_cachep).
 *
 * Returns the inode, or NULL on allocation or initialisation failure.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
struct inode *inode;

/* For sockfs this hook is sock_alloc_inode (see sockfs_ops). */
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb);
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

if (!inode)
return NULL;

/* Undo the allocation with the matching release path on init failure. */
if (unlikely(inode_init_always(sb, inode))) {
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
else
kmem_cache_free(inode_cachep, inode);
return NULL;
}

return inode;
}
  • alloc_inode获取内存有两种方式 1.通过超级块自己的s_op->alloc_inode分配 2.从通用的inode高速缓存inode_cachep中分配

(4.3).alloc_inode -> sock_alloc_inode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * sock_alloc_inode - alloc_inode hook of sockfs (installed through sockfs_ops).
 *
 * Allocates one struct socket_alloc from sock_inode_cachep plus a separately
 * kmalloc'ed struct socket_wq, then resets the embedded socket to
 * SS_UNCONNECTED with no flags, ops, sk or file attached.
 *
 * Returns the embedded VFS inode, or NULL if either allocation fails.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
struct socket_wq *wq;

ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
wq = kmalloc(sizeof(*wq), GFP_KERNEL);
if (!wq) {
/* Do not leak the socket_alloc when the wait queue cannot be allocated. */
kmem_cache_free(sock_inode_cachep, ei);
return NULL;
}
/* Prepare the wait queue and attach it to the socket. */
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
RCU_INIT_POINTER(ei->socket.wq, wq);

/* Start from a clean, unconnected socket. */
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;

/* Callers get the inode half; the socket half shares the same allocation. */
return &ei->vfs_inode;
}
  • socket结构体最终会调用上述函数申请内存
  • 该函数会在sock_init中被注册和挂载到系统上

(4.4).sock_init 中sock_alloc_inode挂载过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* Excerpt from sock_init(): register the sockfs filesystem type first. */
err = register_filesystem(&sock_fs_type);
if (err)
goto out_fs;
/* Mount it with kern_mount() and keep the result in sock_mnt (used by sock_alloc). */
sock_mnt = kern_mount(&sock_fs_type);
if (IS_ERR(sock_mnt)) {
err = PTR_ERR(sock_mnt);
goto out_mount;
...
/*
 * Filesystem type descriptor for "sockfs"; its mount callback is
 * sockfs_mount and its superblock is torn down with kill_anon_super.
 */
static struct file_system_type sock_fs_type = {
.name = "sockfs",
.mount = sockfs_mount,
.kill_sb = kill_anon_super,
};
/*
 * sockfs_mount - mount callback of sock_fs_type.
 *
 * Delegates to mount_pseudo(), passing sockfs_ops as the super_operations
 * (this is how sock_alloc_inode becomes the alloc_inode hook), the sockfs
 * dentry operations and SOCKFS_MAGIC. flags, dev_name and data are unused.
 */
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_pseudo(fs_type, "socket:", &sockfs_ops,
&sockfs_dentry_operations, SOCKFS_MAGIC);
}
/*
 * Superblock operations of sockfs: inodes are allocated by sock_alloc_inode
 * and released by sock_destroy_inode; statfs uses the generic simple_statfs.
 */
static const struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
.destroy_inode = sock_destroy_inode,
.statfs = simple_statfs,
};
  • sock_init -> register_filesystem/kern_mount -> sock_fs_type -> sockfs_mount -> sockfs_ops -> sock_alloc_inode

(4.5).pf->create 即TCP/IP协议族的创建函数inet_create初始化步骤

(4.5.1).PF_INET协议族的create函数inet_create会被注册

1
2
3
4
5
6
7
/* Register the PF_INET family; the return value is deliberately ignored. */
(void)sock_register(&inet_family_ops);

/* PF_INET family descriptor: sockets of this family are created by inet_create. */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};

(4.5.2).注册过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * sock_register - register a protocol family's operations.
 * @ops: family descriptor; ops->family selects the net_families[] slot.
 *
 * Returns 0 on success, -ENOBUFS when ops->family is not below NPROTO, or
 * -EEXIST when the slot is already occupied.
 */
int sock_register(const struct net_proto_family *ops)
{
int err;

if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
NPROTO);
return -ENOBUFS;
}
/* Writers are serialised by net_family_lock; the slot is published with RCU. */
spin_lock(&net_family_lock);
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);
/* NOTE: this message is printed on the -EEXIST path as well. */
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
return err;
}
  • 协议族选项ops会根据协议族类型PF_INET被放置到net_families系统全局变量中

(4.5.3).__sock_create使用过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
socket.c/__sock_create
...
/* Excerpt from __sock_create(): look up the family registered by sock_register(). */
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
/* Default error for both "no such family" and "module is going away". */
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;

/* Now protected by module ref count */
rcu_read_unlock();

/* For PF_INET this hook is inet_create (see inet_family_ops). */
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
  • 根据socket传输过来的协议族PF_INET查找全局变量net_families获取ops
  • 通过ops->create调用inet_create根据具体协议创建网络层socket struct sock

(4.6).inet_create都干了什么?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/*
 * inet_create - create the PF_INET side of a socket (the .create hook of
 * inet_family_ops).
 * @net:      struct net the new sock is attached to
 * @sock:     the struct socket being set up
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard in the lookup
 * @kern:     non-zero skips the CAP_NET_RAW check for SOCK_RAW sockets
 *
 * Returns 0 on success or a negative errno: -EINVAL (protocol out of range),
 * -ESOCKTNOSUPPORT / -EPROTONOSUPPORT (no matching inetsw entry), -EPERM
 * (raw socket without CAP_NET_RAW), -ENOBUFS (sk_alloc failed), or the error
 * returned by the protocol's init hook.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
int try_loading_module = 0;
int err;

if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;

sock->state = SS_UNCONNECTED;// Step 1: mark the socket as SS_UNCONNECTED

/* Look for the requested type/protocol pair. */
lookup_protocol:
/* Stays -ESOCKTNOSUPPORT only if the list for this socket type is empty. */
err = -ESOCKTNOSUPPORT;
rcu_read_lock();// Step 2: find the inetsw entry for this type/protocol; it carries the inet-level handlers (connect, bind, accept, listen, ...)
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
/* This entry did not match; remains set if the loop runs out of entries. */
err = -EPROTONOSUPPORT;
}

/* No match: try loading a module (at most two attempts) and look up again. */
if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}

/* User-space raw sockets require CAP_NET_RAW in the namespace's user_ns. */
err = -EPERM;
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
// Step 3: hand the protocol's inet-level operation set to the socket's ops
sock->ops = answer->ops;
/* Copy what is needed out of the entry before leaving the RCU read section. */
answer_prot = answer->prot;
answer_flags = answer->flags;
rcu_read_unlock();

WARN_ON(answer_prot->slab == NULL);

err = -ENOBUFS;
// Step 4: allocate the struct sock and store the protocol operation set in it
// (sk_alloc sets sk->sk_prot = sk->sk_prot_creator = answer_prot)
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;

err = 0;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
// Step 5: initialise the inet_sock view of the new sock
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

inet->nodefrag = 0;

/* Raw sockets use the protocol number as inet_num; IPPROTO_RAW also sets hdrincl. */
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}

if (net->sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;

inet->inet_id = 0;

/* Generic sock initialisation; receives both the socket and the sock. */
sock_init_data(sock, sk);

sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

/* Defaults for the unicast TTL and multicast settings. */
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;

sk_refcnt_debug_inc(sk);

if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
// Step 6: call the protocol's init hook (tcp_v4_init_sock for TCP, per tcp_prot.init)
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
/* Protocol init failed: release the half-built sock and report the error. */
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
  • 设置socket状态SS_UNCONNECTED
  • 根据协议类型找到具体的协议类型操作集合,例如协议处理函数集合tcp_prot和inet层处理函数集合inet_stream_ops
  • socket->ops 获得协议操作集合inet_stream_ops
  • 申请sock,并把tcp_prot赋值给它 sk->sk_prot = sk->sk_prot_creator=tcp_prot
  • 把申请的sock和inet_sock进行初始化
  • sk->sk_prot->init(sk) 调用tcp_prot的init函数(tcp_v4_init_sock)深度初始化TCP相关信息

尽管流程主要干了上述的事情,仍需要深入探究的问题是:
a. inet_protosw初始化过程如何?
b. inet_sock和sock是什么关系?
c. 从inet_protosw获取的prot和ops哪些结构体上会记录使用?

(4.6.1).inet_protosw初始化过程如何?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/*
 * Built-in PF_INET (type, protocol) pairs. Each entry binds a socket type and
 * protocol number to a struct proto (.prot, stored in the sock by sk_alloc)
 * and a socket-level operation set (.ops, stored in the socket by inet_create).
 */
static struct inet_protosw inetsw_array[] =
{
/* SOCK_STREAM / TCP: permanent, and flagged INET_PROTOSW_ICSK. */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* SOCK_DGRAM / UDP: permanent. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},

/* SOCK_DGRAM / ICMP, backed by ping_prot. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_REUSE,
},
/* SOCK_RAW: IPPROTO_IP makes this entry match any requested protocol. */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
/* Excerpt from inet_init(): register every built-in inetsw_array entry. */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);

/*
 * inet_register_protosw - add an inet_protosw entry to the global inetsw
 * table, on the list selected by p->type.
 *
 * Types >= SOCK_MAX are rejected, and an existing permanent entry with the
 * same protocol is never overridden; both cases are only logged via pr_err().
 */
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;

/* Updates of inetsw are serialised by inetsw_lock. */
spin_lock_bh(&inetsw_lock);

if (p->type >= SOCK_MAX)
goto out_illegal;

/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);

/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol)
break;
/* Track the last permanent entry seen: the insertion point. */
last_perm = lh;
}

/* Not a conflict; answer is non-NULL after the loop only on a break. */
answer = NULL;
}
if (answer)
goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
return;
out_permanent:
pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
  • inet_init 会把inet_protosw放入全局的inetsw中
  • inet_protosw很重要,其prot成员含有协议的具体操作函数tcp_close,tcp_v4_connect,tcp_recvmsg等
  • inet_protosw的ops成员还包含inet层操作函数 inet_bind,inet_accept,inet_listen等

(4.6.2). inet_sock和sock是什么关系?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/*
 * sk_alloc - allocate a struct sock for the given family and protocol.
 * @net:      struct net to attach the sock to (a reference is taken with get_net)
 * @family:   protocol family stored in sk_family
 * @priority: allocation flags; __GFP_ZERO is added before allocating
 * @prot:     protocol descriptor stored in sk_prot and sk_prot_creator
 *
 * Returns the new sock, or NULL when sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;

/* The memory comes from sk_prot_alloc(), sized/cached according to prot. */
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
sock_net_set(sk, get_net(net));
atomic_set(&sk->sk_wmem_alloc, 1);

sock_update_classid(sk);
sock_update_netprioidx(sk);
}

return sk;
}

/*
 * sk_prot_alloc - allocate the raw memory for a sock of protocol @prot.
 *
 * Allocates from prot->slab when the protocol has one, otherwise kmallocs
 * prot->obj_size bytes. Afterwards runs the security hook and takes a
 * reference on the protocol's owning module.
 *
 * Returns the sock, or NULL if the allocation, the security hook or the
 * module reference fails (the memory is freed again in the latter two cases).
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;

slab = prot->slab;
if (slab != NULL) {
/* __GFP_ZERO is stripped here; zeroing is done explicitly just below. */
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
if (priority & __GFP_ZERO) {
/* Prefer the protocol's own clear routine when it provides one. */
if (prot->clear_sk)
prot->clear_sk(sk, prot->obj_size);
else
sk_prot_clear_nulls(sk, prot->obj_size);
}
} else
sk = kmalloc(prot->obj_size, priority);// allocation size is the protocol's obj_size

if (sk != NULL) {
kmemcheck_annotate_bitfield(sk, flags);

if (security_sk_alloc(sk, family, priority))
goto out_free;

if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
}

return sk;

/* Unwind in reverse order: security state first, then the memory itself. */
out_free_sec:
security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
else
kfree(sk);
return NULL;
}

从上述调用链 sk_alloc -> sk_prot_alloc 可以看出,申请的内存大小由prot->obj_size决定;以TCP为例,tcp_prot定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
 * Protocol descriptor for TCP, referenced by the SOCK_STREAM/IPPROTO_TCP
 * entry of inetsw_array. It supplies the protocol-level handlers and, through
 * .obj_size, the allocation size used by sk_prot_alloc(): a full
 * struct tcp_sock.
 */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
/* Called from inet_create() through sk->sk_prot->init. */
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
/* Copied into sk->sk_backlog_rcv by inet_create(). */
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
/* Every TCP sock is allocated as a whole struct tcp_sock. */
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
  • struct tcp_sock 包含struct inet_sock(经由struct inet_connection_sock) 包含 struct sock
  • 上述结构体为互相包含的关系
  • 实际上在申请sock时候,申请内存大小为tcp_sock大小,也就是说三个结构体共同诞生了

(4.6.3). 从inet_protosw获取的prot和ops哪些结构体上会记录使用?

struct socket会在inet_create函数中获取到ops
sock->ops = answer->ops;
struct sock在sk_alloc函数中获取prot
sk->sk_prot = sk->sk_prot_creator = prot;

(5).socket与文件系统

socket与文件系统关联通过sock_map_fd完成

其步骤如下:

1:获取fd get_unused_fd_flags

该函数从当前进程管理的files获取可用的fd

2:申请file sock_alloc_file

将struct socket放到file的private_data管理 file->private_data = sock

3:将file根据当前fd安装到current->files中

files有一个指针fdt
fdt->fd是一个类型为file指针的数组,数组下标为fd
rcu_assign_pointer(fdt->fd[fd], file); 将file按照fd为数组下标放到current->files管理