最近在学习calico, calico通过添加路由表实现容器之间,容器与主机,跨节点容器等之间的通信。
calico把容器的veth网卡独立出来, 没有挂载到网桥上, 而且也没有地址。
没有挂载到网桥上,可能是为了更好地管理pod的网络以及使用arp代理。
然后想到,如果没有独立出来,而是挂到网桥上,网桥没有地址, 可不可以通信。 理论上是可以的,完全可以通过路由实现。 只是实验还是要做的。
很简单,使用ip命令完全可以搞定。
一个测试
创建两个网络命名空间
创建test1与test2网络命名空间
[root@client ~]# ip netns add test1
[root@client ~]# ip netns add test2
创建主机节点上的网桥
创建网桥tt0并up。
[root@client ~]# ip link add tt0 type bridge
[root@client ~]# ip link set tt0 up
[root@client ~]# ip link list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: ens192: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
link/ether 00:0c:29:35:02:ec brd ff:ff:ff:ff:ff:ff
3: tt0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/ether 2a:6c:f5:aa:79:fc brd ff:ff:ff:ff:ff:ff
创建两个veth网卡
创建两对veth网卡: veth1/vetha 和 veth2/vethb
[root@client ~]# ip link add veth1 type veth peer name vetha
[root@client ~]# ip link add veth2 type veth peer name vethb
[root@client ~]# ip link list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: ens192: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
link/ether 00:0c:29:35:02:ec brd ff:ff:ff:ff:ff:ff
3: tt0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/ether 2a:6c:f5:aa:79:fc brd ff:ff:ff:ff:ff:ff
4: vetha@veth1: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 4e:29:32:cd:51:09 brd ff:ff:ff:ff:ff:ff
5: veth1@vetha: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 62:24:34:76:70:20 brd ff:ff:ff:ff:ff:ff
6: vethb@veth2: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 6a:af:60:1a:1d:5f brd ff:ff:ff:ff:ff:ff
7: veth2@vethb: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 32:f1:5a:77:3b:41 brd ff:ff:ff:ff:ff:ff
把veth网卡的一端放到网络命名空间里
[root@client ~]# ip link set vetha netns test1
[root@client ~]# ip link set vethb netns test2
[root@client ~]# ip link list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: ens192: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
link/ether 00:0c:29:35:02:ec brd ff:ff:ff:ff:ff:ff
3: tt0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/ether 2a:6c:f5:aa:79:fc brd ff:ff:ff:ff:ff:ff
5: veth1@if4: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 62:24:34:76:70:20 brd ff:ff:ff:ff:ff:ff link-netnsid 0
7: veth2@if6: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 32:f1:5a:77:3b:41 brd ff:ff:ff:ff:ff:ff link-netnsid 1
把veth网卡另一端放到tt0网桥并up
[root@client ~]# ip link set veth1 master tt0
[root@client ~]# ip link set veth2 master tt0
[root@client ~]# ip link set veth1 up
[root@client ~]# ip link set veth2 up
[root@client ~]# ip link list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: ens192: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
link/ether 00:0c:29:35:02:ec brd ff:ff:ff:ff:ff:ff
3: tt0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000
link/ether 32:f1:5a:77:3b:41 brd ff:ff:ff:ff:ff:ff
5: veth1@if4: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue master tt0 state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
link/ether 62:24:34:76:70:20 brd ff:ff:ff:ff:ff:ff link-netnsid 0
7: veth2@if6: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue master tt0 state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
link/ether 32:f1:5a:77:3b:41 brd ff:ff:ff:ff:ff:ff link-netnsid 1
查看一下网络命名空间里的状态
test1的:
[root@client ~]# ip netns exec test1 ip addr list
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
4: vetha@if5: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default qlen 1000
link/ether 4e:29:32:cd:51:09 brd ff:ff:ff:ff:ff:ff link-netnsid 0
[root@client ~]# ip netns exec test1 ip route list
[root@client ~]#
test2也是这种情况, 只有网卡,没有ip没有路由。
两个网络命名空间分别配置地址
给网络命名空间里的网卡配置地址并up。
test1:
[root@client ~]# ip netns exec test1 ip link set vetha up
[root@client ~]# ip netns exec test1 ip addr add 192.168.10.1/32 dev vetha
这里的子网掩码无所谓, 因为calico里对容器设置的是32,所以我这里也用32。
[root@client ~]# ip netns exec test1 ip addr list
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
4: vetha@if5: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 4e:29:32:cd:51:09 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 192.168.10.1/32 scope global vetha
valid_lft forever preferred_lft forever
inet6 fe80::4c29:32ff:fecd:5109/64 scope link
valid_lft forever preferred_lft forever
test2:
[root@client ~]# ip netns exec test2 ip link set vethb up
[root@client ~]# ip netns exec test2 ip addr add 192.168.10.2/32 dev vethb
[root@client ~]# ip netns exec test2 ip addr list
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
6: vethb@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 6a:af:60:1a:1d:5f brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 192.168.10.2/32 scope global vethb
valid_lft forever preferred_lft forever
inet6 fe80::68af:60ff:fe1a:1d5f/64 scope link
valid_lft forever preferred_lft forever
如果上面的掩码不是32,而是比如24, 内核会自动生成192.168.10.0/24的直连路由, 现在两个网络空间之间就已经可以互通了。使用32位掩码时没有这条路由, 所以下面需要手动添加路由。
网络命名空间添加路由
test1:
[root@client ~]# ip netns exec test1 ip route list
[root@client ~]# ip netns exec test1 ip route list
[root@client ~]# ip netns exec test1 ip route add default dev vetha
[root@client ~]# ip netns exec test1 ip route list
default dev vetha scope link
test2:
[root@client ~]# ip netns exec test2 ip route list
[root@client ~]# ip netns exec test2 ip route add default dev vethb
[root@client ~]# ip netns exec test2 ip route list
default dev vethb scope link
主机添加路由
添加默认路由之后,两个网络空间之间已经通了,但是与主机之间还不通。主机可以收到arp请求,但是因为主机上没有到192.168.10.0/24的路由(不是一个网络),就丢弃了。所以需要在主机上添加一条指向tt0网桥的路由。
主机现在的路由:
[root@client ~]# ip route list
default via 172.100.102.1 dev ens192 proto static metric 100
172.100.102.0/24 dev ens192 proto kernel scope link src 172.100.102.90 metric 100
[root@client ~]# ip route add 192.168.10.0/24 dev tt0
[root@client ~]# ip route list
default via 172.100.102.1 dev ens192 proto static metric 100
172.100.102.0/24 dev ens192 proto kernel scope link src 172.100.102.90 metric 100
192.168.10.0/24 dev tt0 scope link
测试通信
与主机的通信,172.100.102.90是主机的地址。
[root@client ~]# ip netns exec test1 ping 172.100.102.90
PING 172.100.102.90 (172.100.102.90) 56(84) bytes of data.
64 bytes from 172.100.102.90: icmp_seq=1 ttl=64 time=0.040 ms
64 bytes from 172.100.102.90: icmp_seq=2 ttl=64 time=0.025 ms
网络空间之间通信
[root@client ~]# ip netns exec test1 ping 192.168.10.2
PING 192.168.10.2 (192.168.10.2) 56(84) bytes of data.
64 bytes from 192.168.10.2: icmp_seq=1 ttl=64 time=0.026 ms
主机到网络空间的通信
[root@client ~]# ping 192.168.10.1
PING 192.168.10.1 (192.168.10.1) 56(84) bytes of data.
64 bytes from 192.168.10.1: icmp_seq=1 ttl=64 time=0.016 ms
网络空间与自己,是不通的, 因为lo网卡没有up。
[root@client ~]# ip netns exec test1 ping 192.168.10.1
PING 192.168.10.1 (192.168.10.1) 56(84) bytes of data.
^C
把lo网卡up起来就可以了(ip netns exec test1 ip link set lo up), 这个不是问题。
后续
一般情况下,主机访问其他网络时, 会先通过arp请求获取网关的mac地址, 但是这里的默认路由只指定了出口网卡(default dev vetha scope link), 没有指定网关, 可以说本身就是网关。
所以不会发送获取网关mac的arp请求, 而是把所有目标地址都当作直连地址, 直接广播目标ip的arp请求。
[root@client ~]# ip netns exec test1 ping 172.100.102.90
[root@client ~]# tcpdump -vv -nn -i tt0
tcpdump: listening on tt0, link-type EN10MB (Ethernet), capture size 262144 bytes
12:37:28.530360 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.10.1 tell 172.100.102.90, length 28
12:37:28.530377 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.90 tell 192.168.10.1, length 28
12:37:28.530386 ARP, Ethernet (len 6), IPv4 (len 4), Reply 172.100.102.90 is-at 32:f1:5a:77:3b:41, length 28
12:37:28.530394 ARP, Ethernet (len 6), IPv4 (len 4), Reply 192.168.10.1 is-at 4e:29:32:cd:51:09, length 28
如果访问外部的节点,同样是这样。
[root@client ~]# ip netns exec test1 ping 172.100.102.91
12:38:14.441766 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.91 tell 192.168.10.1, length 28
12:38:15.442378 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.91 tell 192.168.10.1, length 28
12:38:16.444394 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.91 tell 192.168.10.1, length 28
12:38:18.441424 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.91 tell 192.168.10.1, length 28
12:38:19.442346 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.91 tell 192.168.10.1, length 28
12:38:20.444394 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.91 tell 192.168.10.1, length 28
只不过没有应答,访问肯定也是不通的。
calico的实现以及用ip命令实现
calico的实现
calico的实现是arp代理。
首先,容器的路由是这样子的:
[root@k8s-node1 netns]# ip netns exec tst1 ip route list
default via 169.254.1.1 dev eth0
169.254.1.1 dev eth0 scope link
伪造了一个网关地址169.254.1.1,好让容器可以对它发起arp请求。
[root@k8s-node1 netns]# tcpdump -nn -vv -i cali0fcdfd9451d
12:48:18.086711 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 169.254.1.1 tell 10.6.36.68, length 28
12:48:18.086746 ARP, Ethernet (len 6), IPv4 (len 4), Reply 169.254.1.1 is-at ee:ee:ee:ee:ee:ee, length 28
然后主机上对应的veth网卡,开启proxy_arp。这样,在容器的arp请求过来以后,系统会返回这个网卡mac地址的arp响应, 然后问题就解决了,容器的后续请求就发送到主机上了,再由主机去路由。
下面再来看一下calico一些不同的地方:
看一下容器的arp列表, tst1是容器的网络命名空间:
[root@k8s-node1 netns]# ip netns exec tst1 ip neigh show
169.254.1.1 dev eth0 lladdr ee:ee:ee:ee:ee:ee STALE
以及,主机上的veth网卡的mac:
15: cali0fcdfd9451d@if3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1440 qdisc noqueue state UP mode DEFAULT group default
link/ether ee:ee:ee:ee:ee:ee brd ff:ff:ff:ff:ff:ff link-netnsid 0
16: calie9597d34cad@if3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1440 qdisc noqueue state UP mode DEFAULT group default
link/ether ee:ee:ee:ee:ee:ee brd ff:ff:ff:ff:ff:ff link-netnsid 1
发现都是ee:ee:ee:ee:ee:ee, 可能就是因为这个mac地址只是容器与主机之间交互用的,所以calico就把mac固定了。不同的容器由主机隔离了,不在同一个网络里,也就不会有影响。
自己来实现一下
linux上开启arp代理很简单: 只需要修改一个参数, 另外网卡不能挂载在网桥上。
这个参数: /proc/sys/net/ipv4/conf/veth2/proxy_arp
我这里现在还是这状态:
14:31:20.018362 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.70 tell 192.168.10.2, length 28
14:31:21.020407 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.70 tell 192.168.10.2, length 28
[root@client ~]# cat /proc/sys/net/ipv4/conf/veth2/proxy_arp
0
修改参数为1
[root@client ~]# echo 1 > /proc/sys/net/ipv4/conf/veth2/proxy_arp
现在还是不行的,需要把veth2网卡从tt0网桥上摘下来。
[root@client ~]# ip link set veth2 nomaster
然后还有路由,之前只有tt0网桥的路由。
[root@client ~]# ip route add 192.168.10.2 dev veth2
然后再ping就有东西了:
[root@client ~]# ip netns exec test2 ping 172.100.102.70
tcpdump抓包
[root@client ~]# tcpdump -nn -vv -i veth2
14:55:46.811647 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 172.100.102.70 tell 192.168.10.2, length 28
14:55:47.570338 ARP, Ethernet (len 6), IPv4 (len 4), Reply 172.100.102.70 is-at 32:f1:5a:77:3b:41, length 28
14:55:47.570359 IP (tos 0x0, ttl 64, id 34084, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.10.2 > 172.100.102.70: ICMP echo request, id 8169, seq 1, length 64
14:55:47.811371 IP (tos 0x0, ttl 64, id 34608, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.10.2 > 172.100.102.70: ICMP echo request, id 8169, seq 2, length 64
抓包看到的mac地址,可以发现就是veth2网卡的mac。
开启路由转发,添加SNAT就可以通信了。
[root@client ~]# echo 1 > /proc/sys/net/ipv4/ip_forward
[root@client ~]# iptables -t nat -A POSTROUTING -s 192.168.10.0/24 -j SNAT --to-source 172.100.102.90
[root@client ~]# ip netns exec test2 ping 172.100.102.70
PING 172.100.102.70 (172.100.102.70) 56(84) bytes of data.
64 bytes from 172.100.102.70: icmp_seq=1 ttl=63 time=0.116 ms
^C
--- 172.100.102.70 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.116/0.116/0.116/0.000 ms
[root@client ~]# ip netns exec test2 ping www.baidu.com
PING www.a.shifen.com (61.135.169.121) 56(84) bytes of data.
64 bytes from 61.135.169.121 (61.135.169.121): icmp_seq=1 ttl=51 time=608 ms
64 bytes from 61.135.169.121 (61.135.169.121): icmp_seq=2 ttl=51 time=4.63 ms
^C
现在网络命名空间里的路由还没有修改:
[root@client ~]# ip netns exec test2 ip route list
default dev vethb scope link
没有像calico那样在网络命名空间里添加伪造的网关地址(169.254.1.1),这样也可以通信。只是不清楚是否稳定,毕竟calico选择添加了地址。
研究过程中有时需要清理arp表,使用这个命令。
ip netns exec test2 ip neigh flush dev vethb