A minimal walkthrough of VXLAN encapsulation and decapsulation

The VXLAN setup is as follows: two VirtualBox VMs connected over a 192.168.56.0/24 host-only underlay network; each VM runs an OVS bridge with a network namespace (left / right) attached through a veth pair, and the two namespaces share the 10.0.0.0/24 overlay subnet.

Environment setup script

# VirtualBox VM 1
ip netns add left
ip link add name veth1 type veth peer name sw1-p1
ip link set dev veth1 netns left
ip netns exec left ifconfig veth1 10.0.0.1/24 up

ovs-vsctl add-br sw1
ovs-vsctl add-port sw1 sw1-p1
ip link set sw1-p1 up

# VirtualBox VM 2
ip netns add right
ip link add name veth1 type veth peer name sw2-p1
ip link set dev veth1 netns right
ip netns exec right ifconfig veth1 10.0.0.2/24 up

ovs-vsctl add-br sw2
ovs-vsctl add-port sw2 sw2-p1
ip link set sw2-p1 up
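
The script above only wires each namespace to its local bridge; the VXLAN tunnel port itself (the tun0 seen in the ovs-ofctl output further down) still has to be added on each side. A minimal sketch, assuming the two VMs reach each other via the 192.168.56.102 / 192.168.56.103 underlay addresses that appear in the outputs below:

# VirtualBox VM 1 (underlay 192.168.56.102)
ovs-vsctl add-port sw1 tun0 -- set interface tun0 type=vxlan options:remote_ip=192.168.56.103

# VirtualBox VM 2 (underlay 192.168.56.103)
ovs-vsctl add-port sw2 tun0 -- set interface tun0 type=vxlan options:remote_ip=192.168.56.102

With both tunnel ports in place, a ping from left (10.0.0.1) to right (10.0.0.2) produces the VXLAN traffic analyzed below.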

A rough description of how the VXLAN implementation works

  • The default VXLAN UDP port is 4789; a kernel-owned socket with no owning process listens on it:
[root@localhost ~]# netstat -nulp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
udp 0 0 0.0.0.0:14803 0.0.0.0:* 2250/dhclient
udp 0 0 0.0.0.0:12221 0.0.0.0:* 723/dhclient
udp 0 0 0.0.0.0:68 0.0.0.0:* 2250/dhclient
udp 0 0 0.0.0.0:68 0.0.0.0:* 723/dhclient
udp 0 0 127.0.0.1:323 0.0.0.0:* 674/chronyd
udp 0 0 0.0.0.0:4789 0.0.0.0:* -
udp6 0 0 :::48083 :::* 723/dhclient
udp6 0 0 ::1:323 :::* 674/chronyd
udp6 0 0 :::4789 :::* -
udp6 0 0 :::42780 :::* 2250/dhclient
[root@localhost ~]#
  • The vxlan_sys_4789 interface created by the system backs this in-kernel UDP socket listening on destination port 4789:
[root@localhost ~]# ip addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN qlen 1
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000
link/ether 08:00:27:69:6e:c9 brd ff:ff:ff:ff:ff:ff
inet 10.0.2.15/24 brd 10.0.2.255 scope global dynamic enp0s3
valid_lft 86079sec preferred_lft 86079sec
inet6 fe80::236e:8cc4:b25d:f30b/64 scope link
valid_lft forever preferred_lft forever
3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000
link/ether 08:00:27:87:25:16 brd ff:ff:ff:ff:ff:ff
inet 192.168.56.102/24 brd 192.168.56.255 scope global dynamic enp0s8
valid_lft 706sec preferred_lft 706sec
inet6 fe80::8a01:cdf3:b4e3:5db6/64 scope link
valid_lft forever preferred_lft forever
4: ovs-system: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
link/ether a2:b4:8e:70:bb:cd brd ff:ff:ff:ff:ff:ff
5: sw1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
link/ether 56:78:07:db:b7:49 brd ff:ff:ff:ff:ff:ff
6: vxlan_sys_4789: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 65470 qdisc noqueue master ovs-system state UNKNOWN qlen 1000
link/ether 42:b4:6c:96:43:9c brd ff:ff:ff:ff:ff:ff
7: sw1-p1@if8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master ovs-system state UP qlen 1000
link/ether 6e:f1:2a:cb:97:e4 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet6 fe80::6cf1:2aff:fecb:97e4/64 scope link
valid_lft forever preferred_lft forever
  • The VXLAN traffic can be captured on the physical port. Note the outer frame is 148 bytes while the inner frame is 98 bytes: VXLAN adds 50 bytes of overhead (14 outer Ethernet + 20 IP + 8 UDP + 8 VXLAN):
[root@localhost ~]# tcpdump -i enp0s8 -ne udp
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on enp0s8, link-type EN10MB (Ethernet), capture size 262144 bytes
01:34:59.020741 08:00:27:92:8d:41 > 08:00:27:87:25:16, ethertype IPv4 (0x0800), length 148: 192.168.56.103.54252 > 192.168.56.102.4789: VXLAN, flags [I] (0x08), vni 0
3a:72:db:83:a4:1f > 12:d8:2b:1f:c1:52, ethertype IPv4 (0x0800), length 98: 10.0.0.2 > 10.0.0.1: ICMP echo request, id 2426, seq 40, length 64
01:34:59.020889 08:00:27:87:25:16 > 08:00:27:92:8d:41, ethertype IPv4 (0x0800), length 148: 192.168.56.102.52961 > 192.168.56.103.4789: VXLAN, flags [I] (0x08), vni 0
12:d8:2b:1f:c1:52 > 3a:72:db:83:a4:1f, ethertype IPv4 (0x0800), length 98: 10.0.0.1 > 10.0.0.2: ICMP echo reply, id 2426, seq 40, length 64
^C
  • To hand the decapsulated traffic to Open vSwitch, a port of type vxlan must exist on the OVS bridge, e.g. tun0 (created by the ovs-vsctl commands shown after the setup script above):

[root@localhost ~]# ovs-ofctl show sw1
OFPT_FEATURES_REPLY (xid=0x2): dpid:0000567807dbb749
n_tables:254, n_buffers:256
capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
 1(sw1-p1): addr:6e:f1:2a:cb:97:e4
     config:     0
     state:      0
     current:    10GB-FD COPPER
     speed: 10000 Mbps now, 0 Mbps max
 3(tun0): addr:c2:58:e0:88:9d:1e
     config:     0
     state:      0
     speed: 0 Mbps now, 0 Mbps max
 LOCAL(sw1): addr:56:78:07:db:b7:49
     config:     PORT_DOWN
     state:      LINK_DOWN
     speed: 0 Mbps now, 0 Mbps max
OFPT_GET_CONFIG_REPLY (xid=0x4): frags=normal miss_send_len=
  • Notes:

When VXLAN traffic arrives from the peer, its destination UDP port is 4789. On receiving a packet for this port, the kernel decapsulates the VXLAN payload; since the vxlan interface is attached to OVS, the decapsulated frame is handed to Open vSwitch for forwarding.

Conversely, if Open vSwitch forwards traffic to the configured vxlan port, that port encapsulates the packet in VXLAN, and the encapsulated packet then goes through the host's normal L2/L3 forwarding.

VXLAN receive path

The Open vSwitch VXLAN receive path works as follows.

By default, a UDP packet destined to port 4789 is intercepted in the kernel and handed to vxlan_rcv(), which decapsulates it and queues the inner frame onto the vxlan device's gro_cells (gcells); a sketch of how this hook is registered follows the trace below.

0xffffffff8156efa0 : __napi_schedule+0x0/0x50 [kernel]
0xffffffffa045d67b : vxlan_rcv+0x99b/0xb00 [vxlan]
0xffffffff815e2818 : udp_queue_rcv_skb+0x1f8/0x4f0 [kernel]
0xffffffff815e355a : __udp4_lib_rcv+0x54a/0x880 [kernel]
0xffffffff815e3dfa : udp_rcv+0x1a/0x20 [kernel]
0xffffffff815b1584 : ip_local_deliver_finish+0xb4/0x1f0 [kernel]
0xffffffff815b1869 : ip_local_deliver+0x59/0xd0 [kernel]
0xffffffff815b120a : ip_rcv_finish+0x8a/0x350 [kernel]
0xffffffff815b1b96 : ip_rcv+0x2b6/0x410 [kernel]
0xffffffff81570062 : __netif_receive_skb_core+0x582/0x800 [kernel]
0xffffffff815702f8 : __netif_receive_skb+0x18/0x60 [kernel]
0xffffffff81570380 : netif_receive_skb_internal+0x40/0xc0 [kernel]
0xffffffff81571498 : napi_gro_receive+0xd8/0x130 [kernel]
0xffffffffa00472fc : e1000_clean_rx_irq+0x2ac/0x4f0 [e1000]
0xffffffffa0047d31 : e1000_clean+0x281/0x8f0 [e1000]
0xffffffff81570b20 : net_rx_action+0x170/0x380 [kernel]
0xffffffff8108f63f : __do_softirq+0xef/0x280 [kernel]
0xffffffff8169919c : call_softirq+0x1c/0x30 [kernel]
0xffffffff8102d365 : do_softirq+0x65/0xa0 [kernel]
0xffffffff8108f9d5 : irq_exit+0x115/0x120 [kernel]
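
Why does a plain UDP packet end up in vxlan_rcv()? When the vxlan module creates its kernel UDP socket, it registers vxlan_rcv as the socket's encap_rcv callback, which is why udp_queue_rcv_skb() in the trace diverts port-4789 packets into the vxlan module instead of delivering them to a userspace socket. A minimal sketch, paraphrased from vxlan_socket_create() in drivers/net/vxlan.c (helper and field names vary across kernel versions):

/* sock is the kernel UDP socket bound to 0.0.0.0:4789 -- the "-" entry
 * (no owning process) in the netstat output earlier. */
struct udp_tunnel_sock_cfg tunnel_cfg;

tunnel_cfg.sk_user_data = vs;        /* per-socket vxlan state (struct vxlan_sock) */
tunnel_cfg.encap_type   = 1;         /* tells udp_queue_rcv_skb() to divert to encap_rcv */
tunnel_cfg.encap_rcv    = vxlan_rcv; /* the decapsulation entry point seen in the trace */
setup_udp_tunnel_sock(net, sock, &tunnel_cfg);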

When the softirq fires, net_rx_action() invokes gro_cell_poll(), which dequeues the skbs from the gcells and feeds them back into the stack; the path ends in ovs_vport_receive() under __netif_receive_skb_core(), handing the packet to the Open vSwitch pipeline (a sketch of gro_cell_poll() follows the trace):

0xffffffffa043ea40 : ovs_vport_receive+0x0/0xd0 [openvswitch]
0xffffffffa043fc8e : netdev_frame_hook+0xde/0x160 [openvswitch]
0xffffffff8156fcc2 : __netif_receive_skb_core+0x1e2/0x800 [kernel]
0xffffffff815702f8 : __netif_receive_skb+0x18/0x60 [kernel]
0xffffffff81570380 : netif_receive_skb_internal+0x40/0xc0 [kernel]
0xffffffff81571498 : napi_gro_receive+0xd8/0x130 [kernel]
0xffffffffa045a30a : gro_cell_poll+0x7a/0xc0 [vxlan]
0xffffffff81570b20 : net_rx_action+0x170/0x380 [kernel]
0xffffffff8108f63f : __do_softirq+0xef/0x280 [kernel]
0xffffffff8169919c : call_softirq+0x1c/0x30 [kernel]
0xffffffff8102d365 : do_softirq+0x65/0xa0 [kernel]
0xffffffff8108f9d5 : irq_exit+0x115/0x120 [kernel]
0xffffffff81699d38 : do_IRQ+0x58/0xf0 [kernel]
0xffffffff8168eded : ret_from_intr+0x0/0x15 [kernel]
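
gro_cell_poll() itself is just an ordinary NAPI poll handler draining the queue that vxlan_rcv() filled. A simplified sketch, paraphrased from the kernel's gro_cells code (locking omitted; details vary by version):

static int gro_cell_poll(struct napi_struct *napi, int budget)
{
	struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
	struct sk_buff *skb;
	int work_done = 0;

	while (work_done < budget) {
		skb = __skb_dequeue(&cell->napi_skbs); /* queued earlier by vxlan_rcv */
		if (!skb)
			break;
		napi_gro_receive(napi, skb);           /* re-enters __netif_receive_skb_core */
		work_done++;
	}

	if (work_done < budget)
		napi_complete(napi);
	return work_done;
}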

Once handed to Open vSwitch, the packet is processed inside OVS no differently from ordinary traffic, since it has already been decapsulated. It is therefore forwarded to the namespace left.

The packet is then placed on a per-CPU backlog queue, where it waits for the protocol stack of the left namespace to consume it (a sketch of the veth hand-off follows the trace). Note the bottom of this trace: this particular packet was re-injected from userspace by ovs-vswitchd over netlink, hence ovs_packet_cmd_execute(), which happens for the first packet of a flow after a flow-table miss upcall.

0xffffffff8156f130 : enqueue_to_backlog+0x0/0x170 [kernel]
0xffffffff8156f2e5 : netif_rx_internal+0x45/0x120 [kernel]
0xffffffff8156f3de : dev_forward_skb+0x1e/0x30 [kernel]
0xffffffffa03a34ba : veth_xmit+0x2a/0x60 [veth]
0xffffffff8156f8a1 : dev_hard_start_xmit+0x171/0x3b0 [kernel]
0xffffffff81572656 : __dev_queue_xmit+0x466/0x570 [kernel]
0xffffffff81572770 : dev_queue_xmit+0x10/0x20 [kernel]
0xffffffffa03881d4 : ovs_vport_send+0x44/0xb0 [openvswitch]
0xffffffffa037a300 : do_output.isra.31+0x40/0x150 [openvswitch]
0xffffffffa037b74d : do_execute_actions+0x73d/0x890 [openvswitch]
0xffffffffa037b8e1 : ovs_execute_actions+0x41/0x130 [openvswitch]
0xffffffffa037e929 : ovs_packet_cmd_execute+0x2c9/0x2f0 [openvswitch]
0xffffffff815a6d5a : genl_family_rcv_msg+0x20a/0x430 [kernel]
0xffffffff815a7011 : genl_rcv_msg+0x91/0xd0 [kernel]
0xffffffff815a4f89 : netlink_rcv_skb+0xa9/0xc0 [kernel]
0xffffffff815a54b8 : genl_rcv+0x28/0x40 [kernel]
0xffffffff815a467d : netlink_unicast+0xed/0x1b0 [kernel]
0xffffffff815a4a5e : netlink_sendmsg+0x31e/0x690 [kernel]
0xffffffff81555ef0 : sock_sendmsg+0xb0/0xf0 [kernel]
0xffffffff81556799 : ___sys_sendmsg+0x3a9/0x3c0 [kernel]
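
The hop into the namespace is done by the veth driver: veth_xmit() hands the skb straight to its peer device, and dev_forward_skb() finishes with netif_rx(), i.e. the enqueue_to_backlog() at the top of the trace. A rough sketch, paraphrased from drivers/net/veth.c (stats and error handling omitted):

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *rcv;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer); /* veth1 inside the left namespace */
	if (likely(rcv))
		/* scrub the skb, retarget skb->dev to the peer, then
		 * netif_rx() -> enqueue_to_backlog() on this CPU */
		dev_forward_skb(rcv, skb);
	rcu_read_unlock();
	return NETDEV_TX_OK;
}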

When the protocol stack in namespace left receives the packet and finds it addressed to a local interface, it answers with an ICMP echo reply directly (the gist of icmp_echo() is sketched after the trace):

0xffffffff815e8040 : icmp_rcv+0x0/0x380 [kernel]
0xffffffff815b1584 : ip_local_deliver_finish+0xb4/0x1f0 [kernel]
0xffffffff815b1869 : ip_local_deliver+0x59/0xd0 [kernel]
0xffffffff815b120a : ip_rcv_finish+0x8a/0x350 [kernel]
0xffffffff815b1b96 : ip_rcv+0x2b6/0x410 [kernel]
0xffffffff81570062 : __netif_receive_skb_core+0x582/0x800 [kernel]
0xffffffff815702f8 : __netif_receive_skb+0x18/0x60 [kernel]
0xffffffff8157159e : process_backlog+0xae/0x170 [kernel]
0xffffffff81570b20 : net_rx_action+0x170/0x380 [kernel]
0xffffffff8108f63f : __do_softirq+0xef/0x280 [kernel]
0xffffffff8169919c : call_softirq+0x1c/0x30 [kernel]
0xffffffff8102d365 : do_softirq+0x65/0xa0 [kernel]
0xffffffff8108e894 : local_bh_enable+0x94/0xa0 [kernel]
0xffffffffa037e930 : ovs_packet_cmd_execute+0x2d0/0x2f0 [openvswitch]
0xffffffff815a6d5a : genl_family_rcv_msg+0x20a/0x430 [kernel]
0xffffffff815a7011 : genl_rcv_msg+0x91/0xd0 [kernel]
0xffffffff815a4f89 : netlink_rcv_skb+0xa9/0xc0 [kernel]
0xffffffff815a54b8 : genl_rcv+0x28/0x40 [kernel]
0xffffffff815a467d : netlink_unicast+0xed/0x1b0 [kernel]
0xffffffff815a4a5e : netlink_sendmsg+0x31e/0x690 [kernel]
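
The reply itself is cheap: icmp_rcv() dispatches echo requests to icmp_echo(), which flips the ICMP type and sends the same payload back out. The gist, paraphrased from net/ipv4/icmp.c (sysctl checks omitted; the return type differs across versions):

static void icmp_echo(struct sk_buff *skb)
{
	struct icmp_bxm icmp_param;

	icmp_param.data.icmph      = *icmp_hdr(skb);
	icmp_param.data.icmph.type = ICMP_ECHOREPLY; /* request -> reply */
	icmp_param.skb             = skb;
	icmp_param.offset          = 0;
	icmp_param.data_len        = skb->len;
	icmp_param.head_len        = sizeof(struct icmphdr);
	icmp_reply(&icmp_param, skb); /* back out through the normal IP output path */
}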

VXLAN transmit path

Eventually the packet is sent from the Open vSwitch side to the vxlan port, whose transmit routine is entered through dev_hard_start_xmit(). Because this is a vxlan port, the packet has to be encapsulated first; the actual encapsulation happens in udp_tunnel_xmit_skb() and iptunnel_xmit(), which finally call ip_local_out_sk() to send the encapsulated packet as a locally generated one. From that point on, the L2/L3 forwarding and route lookup simply reuse the host's normal transmit path, so they are not detailed here. A sketch of the encapsulation core follows the trace below.

0xffffffff815fbfc0 : iptunnel_xmit+0x0/0x1a0 [kernel]
0xffffffffa02b12b3 : udp_tunnel_xmit_skb+0xe3/0x100 [udp_tunnel]
0xffffffffa039a253 : vxlan_xmit_one+0x7e3/0xb60 [vxlan]
0xffffffffa039b81f : vxlan_xmit+0x41f/0xce0 [vxlan]
0xffffffff8156f8a1 : dev_hard_start_xmit+0x171/0x3b0 [kernel]
0xffffffff81572656 : __dev_queue_xmit+0x466/0x570 [kernel]
0xffffffff81572770 : dev_queue_xmit+0x10/0x20 [kernel]
0xffffffffa03881d4 : ovs_vport_send+0x44/0xb0 [openvswitch]
0xffffffffa037a300 : do_output.isra.31+0x40/0x150 [openvswitch]
0xffffffffa037b74d : do_execute_actions+0x73d/0x890 [openvswitch]
0xffffffffa037b8e1 : ovs_execute_actions+0x41/0x130 [openvswitch]
0xffffffffa037f445 : ovs_dp_process_packet+0x85/0x110 [openvswitch]
0xffffffffa0387aac : ovs_vport_receive+0x6c/0xd0 [openvswitch]
0xffffffffa0388c8e : netdev_frame_hook+0xde/0x160 [openvswitch]
0xffffffff8156fcc2 : __netif_receive_skb_core+0x1e2/0x800 [kernel]
0xffffffff815702f8 : __netif_receive_skb+0x18/0x60 [kernel]
0xffffffff8157159e : process_backlog+0xae/0x170 [kernel]
0xffffffff81570b20 : net_rx_action+0x170/0x380 [kernel]
0xffffffff8108f63f : __do_softirq+0xef/0x280 [kernel]
0xffffffff8169919c : call_softirq+0x1c/0x30 [kernel]
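
The heart of the encapsulation in vxlan_xmit_one() looks roughly like this (a heavily simplified sketch; struct and parameter names follow drivers/net/vxlan.c but differ between kernel versions):

struct vxlanhdr *vxh;

/* prepend the 8-byte VXLAN header in front of the inner Ethernet frame */
vxh = (struct vxlanhdr *)__skb_push(skb, sizeof(*vxh));
vxh->vx_flags = htonl(VXLAN_HF_VNI); /* the "I" flag (0x08) seen in tcpdump */
vxh->vx_vni   = htonl(vni << 8);     /* 24-bit VNI in the upper three bytes */

/* udp_tunnel_xmit_skb() adds the outer UDP header (dst port 4789, hashed
 * src port), iptunnel_xmit() adds the outer IP header, and finally
 * ip_local_out() sends it as a locally generated packet. */
udp_tunnel_xmit_skb(rt, sk, skb, src_ip, dst_ip, tos, ttl, df,
		    src_port, dst_port, xnet, nocheck);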

Choosing the UDP source port for VXLAN packets

Judging from the implementation, the UDP source port is derived by hashing the inner flow (the pre-encapsulation source/destination addresses and ports), which is why the two directions in the tcpdump output above use different source ports (54252 vs. 52961); finer details are left for a later write-up. The function is udp_flow_src_port(), and a note on its call site follows it:

static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
                                       int min, int max, bool use_eth)
{
    u32 hash;

    if (min >= max) {
        /* Use default range */
        inet_get_local_port_range(net, &min, &max);
    }

    hash = skb_get_hash(skb);
    if (unlikely(!hash) && use_eth) {
        /* Can't find a normal hash, caller has indicated an Ethernet
         * packet so use that to compute a hash.
         */
        hash = jhash(skb->data, 2 * ETH_ALEN,
                     (__force u32) skb->protocol);
    }

    /* Since this is being sent on the wire obfuscate hash a bit
     * to minimize possibility that any useful information to an
     * attacker is leaked. Only upper 16 bits are relevant in the
     * computation for 16 bit port value.
     */
    hash ^= hash << 16;

    return htons((((u64) hash * (max - min)) >> 32) + min);
}
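
For reference, a hedged sketch of the call site in vxlan_xmit_one() (member names differ across kernel trees, e.g. vxlan->cfg.port_min in newer ones). Because skb_get_hash() is computed from the inner headers before encapsulation, each inner flow gets a stable outer source port, letting underlay ECMP/RSS spread different tunneled flows across paths; the final multiply-shift maps the 32-bit hash into [min, max).

/* hypothetical call site, paraphrased from vxlan_xmit_one() */
src_port = udp_flow_src_port(dev_net(dev), skb,
                             vxlan->port_min, vxlan->port_max, true);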