Q: What does the environment look like?
A: VirtualBox is installed on the local machine, three virtual machines were created in it, and Kubernetes was installed on them.
# kubectl version
Client Version: version.Info{Major:"1", Minor:"13", GitVersion:"v1.13.3", GitCommit:"721bfa751924da8d1680787490c54b9179b1fed0", GitTreeState:"clean", BuildDate:"2019-02-01T20:08:12Z", GoVersion:"go1.11.5", Compiler:"gc", Platform:"linux/amd64"}
Server Version: version.Info{Major:"1", Minor:"13", GitVersion:"v1.13.3", GitCommit:"721bfa751924da8d1680787490c54b9179b1fed0", GitTreeState:"clean", BuildDate:"2019-02-01T20:00:57Z", GoVersion:"go1.11.5", Compiler:"gc", Platform:"linux/amd64"}
There are three nodes in total:
[root@master ~]# kubectl get nodes -o wide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
master Ready master 58d v1.13.3 192.168.99.110 <none> CentOS Linux 7 (Core) 3.10.0-957.5.1.el7.x86_64 docker://18.9.2
node1 Ready <none> 58d v1.13.3 192.168.99.111 <none> CentOS Linux 7 (Core) 3.10.0-957.5.1.el7.x86_64 docker://18.9.2
node2 Ready <none> 58d v1.13.3 192.168.99.112 <none> CentOS Linux 7 (Core) 3.10.0-957.5.1.el7.x86_64 docker://18.9.2
Each VM has two virtual NICs, one on 10.0.2.0/24 and the other on 192.168.99.0/24:
[root@master ~]# ip addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 08:00:27:81:e2:68 brd ff:ff:ff:ff:ff:ff
inet 10.0.2.15/24 brd 10.0.2.255 scope global noprefixroute dynamic enp0s3
valid_lft 85457sec preferred_lft 85457sec
inet6 fe80::41da:5105:3c83:a04c/64 scope link noprefixroute
valid_lft forever preferred_lft forever
3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 08:00:27:b9:a7:d9 brd ff:ff:ff:ff:ff:ff
inet 192.168.99.110/24 brd 192.168.99.255 scope global noprefixroute enp0s8
valid_lft forever preferred_lft forever
inet6 fe80::a00:27ff:feb9:a7d9/64 scope link
valid_lft forever preferred_lft forever
Q: What is the problem?
A: Two Kubernetes pods cannot communicate with each other:
[root@master ~]# kubectl get pods -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
my-release-tomcat-7c876666dc-kwz9k 1/1 Running 8 58d 10.244.1.30 node1 <none> <none>
nginx-685db758cc-w8sns 1/1 Running 1 23h 10.244.2.16 node2 <none> <none>
sleep-754684654f-9wf48 1/1 Running 5 45d 10.244.1.32 node1 <none> <none>
Of these, the pod "sleep-754684654f-9wf48" on node1 can ping the pod "my-release-tomcat-7c876666dc-kwz9k", which is also on node1, but cannot ping the pod "nginx-685db758cc-w8sns" on node2.
Q: How was it debugged?
A: The approach:
- First, find which veth on the host corresponds to the sleep pod
- Then check how the node routes the traffic leaving that veth
- Since flannel is in use, finally locate the VXLAN configuration used for cross-node traffic (the commands for each step are sketched just below)
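In shell form, the plan is roughly the following (a sketch only; the pod name, veth name and gateway IP have to be replaced with the values found on your own cluster):
# step 1: map eth0@ifN inside the pod to the host interface whose index is N
kubectl exec -it <sleep-pod> -- ip addr
ip addr                      # run on the node hosting the pod
# step 2: see how the node forwards the unreachable pod IP
ip route
ip neigh show <gateway-ip>
bridge fdb show
# step 3: inspect the VXLAN device created by flannel
ip -d link show flannel.1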
Finding the veth
The method used is fairly simple. First, run ip addr inside the pod:
[root@master ~]# kubectl exec -it sleep-754684654f-9wf48 -- ip addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
3: eth0@if9: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1450 qdisc noqueue state UP
link/ether 0a:58:0a:f4:01:20 brd ff:ff:ff:ff:ff:ff
inet 10.244.1.32/24 scope global eth0
valid_lft forever preferred_lft forever
注意"eth0@if9"最后的9,到该pod运行的node1上执行ip addr:
[root@node1 ~]# ip addr
9: veth9e0f264f@if3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue master cni0 state UP group default
link/ether 0a:e1:de:d3:00:8c brd ff:ff:ff:ff:ff:ff link-netnsid 2
inet6 fe80::8e1:deff:fed3:8c/64 scope link
valid_lft forever preferred_lft forever
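On a node with many interfaces it can be quicker to filter by the index directly (a small convenience one-liner, using the index 9 noted above):
[root@node1 ~]# ip -o link | grep '^9: '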
Note the leading number 9, which is this veth's interface index, so it should be the one. How can that be confirmed? Listen on the veth with tcpdump while running ping inside the pod:
# on node1
[root@node1 ~]# tcpdump -vv -ni veth9e0f264f icmp
tcpdump: listening on veth9e0f264f, link-type EN10MB (Ethernet), capture size 262144 bytes
22:19:51.184285 IP (tos 0x0, ttl 64, id 58774, offset 0, flags [DF], proto ICMP (1), length 84)
10.244.1.32 > 10.244.1.30: ICMP echo request, id 2816, seq 0, length 64
22:19:51.184327 IP (tos 0x0, ttl 64, id 53793, offset 0, flags [none], proto ICMP (1), length 84)
10.244.1.30 > 10.244.1.32: ICMP echo reply, id 2816, seq 0, length 64
22:19:52.184522 IP (tos 0x0, ttl 64, id 59673, offset 0, flags [DF], proto ICMP (1), length 84)
10.244.1.32 > 10.244.1.30: ICMP echo request, id 2816, seq 1, length 64
Run the ping from the master node:
[root@master ~]# kubectl exec -it sleep-754684654f-9wf48 -- ping 10.244.1.30
PING 10.244.1.30 (10.244.1.30): 56 data bytes
64 bytes from 10.244.1.30: seq=0 ttl=64 time=0.130 ms
64 bytes from 10.244.1.30: seq=1 ttl=64 time=0.106 ms
64 bytes from 10.244.1.30: seq=2 ttl=64 time=0.136 ms
64 bytes from 10.244.1.30: seq=3 ttl=64 time=0.105 ms
64 bytes from 10.244.1.30: seq=4 ttl=64 time=0.132 ms
64 bytes from 10.244.1.30: seq=5 ttl=64 time=0.079 ms
64 bytes from 10.244.1.30: seq=6 ttl=64 time=0.110 ms
tcpdump captured the ICMP packets sent by ping, which proves that this veth is the peer of eth0 inside the container.
Meanwhile, pinging the other pod, the one on node2, fails:
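An optional cross-check, assuming ethtool is available on the node, is to read the veth's statistics; the veth driver reports a peer_ifindex, which should match the index of eth0 inside the pod (3 here, since the pod shows "3: eth0@if9"):
[root@node1 ~]# ethtool -S veth9e0f264f
# should report peer_ifindex: 3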
[root@master ~]# kubectl exec -it sleep-754684654f-9wf48 -- ping 10.244.2.16
PING 10.244.2.16 (10.244.2.16): 56 data bytes
^C
--- 10.244.2.16 ping statistics ---
106 packets transmitted, 0 packets received, 100% packet loss
Checking the routing table
Run the ip route command on node1:
[root@node1 ~]# ip route
default via 10.0.2.2 dev enp0s3 proto dhcp metric 100
default via 192.168.99.1 dev enp0s8 proto static metric 101
10.0.2.0/24 dev enp0s3 proto kernel scope link src 10.0.2.15 metric 100
10.244.0.0/24 via 10.244.0.0 dev flannel.1 onlink
10.244.1.0/24 dev cni0 proto kernel scope link src 10.244.1.1
10.244.2.0/24 via 10.244.2.0 dev flannel.1 onlink
172.17.0.0/16 dev docker0 proto kernel scope link src 172.17.0.1
192.168.99.0/24 dev enp0s8 proto kernel scope link src 192.168.99.111 metric 101
Note "10.244.2.0/24 via 10.244.2.0 dev flannel.1 onlink". The destination we could not ping above is 10.244.2.16, so according to ip route this address is forwarded through flannel.1. The onlink flag tells the kernel to treat the gateway 10.244.2.0 as directly reachable on flannel.1, so its link-layer address is resolved from the neighbor table:
[root@node1 ~]# ip neigh show 10.244.2.0
10.244.2.0 dev flannel.1 lladdr 46:7c:e8:d5:f7:29 PERMANENT
Continue the lookup with bridge fdb show:
[root@node1 ~]# bridge fdb show | grep 46:7c:e8:d5:f7:29
46:7c:e8:d5:f7:29 dev flannel.1 dst 10.0.2.15 self permanent
Use ip link to inspect the link-layer information:
[root@node1 ~]# ip -d link show flannel.1
5: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN mode DEFAULT group default
link/ether a6:f2:81:7e:a2:dc brd ff:ff:ff:ff:ff:ff promiscuity 0
vxlan id 1 local 10.0.2.15 dev enp0s3 srcport 0 0 dstport 8472 nolearning ageing 300 noudpcsum noudp6zerocsumtx noudp6zerocsumrx addrgenmode eui64 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535
The -d flag shows detailed information. Note "vxlan id 1 local 10.0.2.15 dev enp0s3 srcport 0 0 dstport 8472": the VXLAN tunnel is built on the interface holding 10.0.2.15 (enp0s3) and uses UDP port 8472.
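To see where the encapsulated packets actually go, one can also (optionally) capture the VXLAN UDP traffic on the underlay interface while repeating the failing ping:
[root@node1 ~]# tcpdump -ni enp0s3 udp port 8472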
Q: What was the root cause and how was it fixed?
A: The three VMs are connected to each other over 192.168.99.0/24, while 10.0.2.0/24 provides no connectivity between them. However, flannel by default binds to the first non-loopback interface at install time, so it used the 10.0.2.0 interface instead of the 192.168.99.0 one. (Notice that the VXLAN endpoint recorded for node2 in the fdb above, 10.0.2.15, is the same address as node1's own local address on enp0s3: on the VirtualBox NAT network every VM receives the same 10.0.2.15, so the encapsulated packets can never reach node2.)
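One way to double-check which address flannel registered for each node is to look at the node annotations; recent flannel versions typically record it in flannel.alpha.coreos.com/public-ip (treat the annotation name as an assumption and verify it on your version):
[root@master ~]# kubectl describe node node1 | grep -i flannel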
The fix is to uninstall flannel and then reinstall it.
The command originally used to install it was:
kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/a70459be0084506e4ec919aa1c114638878db11b/Documentation/kube-flannel.yml
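Since the manifest has to be edited for the fix, it helps to have a local copy (curl shown here as one option; wget works equally well):
curl -o kube-flannel.yml https://raw.githubusercontent.com/coreos/flannel/a70459be0084506e4ec919aa1c114638878db11b/Documentation/kube-flannel.yml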
Before uninstalling:
[root@master ~]# kubectl get pods --all-namespaces
NAMESPACE NAME READY STATUS RESTARTS AGE
default my-release-tomcat-7c876666dc-kwz9k 1/1 Running 8 58d
default nginx-685db758cc-w8sns 1/1 Running 1 24h
default sleep-754684654f-9wf48 1/1 Running 5 45d
kube-system coredns-86c58d9df4-jpbvn 1/1 Running 11 58d
kube-system coredns-86c58d9df4-jvtdb 1/1 Running 12 58d
kube-system etcd-master 1/1 Running 13 58d
kube-system kube-apiserver-master 1/1 Running 13 58d
kube-system kube-controller-manager-master 1/1 Running 11 58d
kube-system kube-flannel-ds-amd64-c5n9q 1/1 Running 9 58d
kube-system kube-flannel-ds-amd64-n96rq 1/1 Running 14 58d
kube-system kube-flannel-ds-amd64-wtrxx 1/1 Running 8 58d
kube-system kube-proxy-hhn9w 1/1 Running 11 58d
kube-system kube-proxy-hj2mv 1/1 Running 9 58d
kube-system kube-proxy-qqrzn 1/1 Running 9 58d
kube-system kube-scheduler-master 1/1 Running 12 58d
kube-system tiller-deploy-dbb85cb99-fr5th 1/1 Running 5 45d
Uninstall with:
kubectl delete -f kube-flannel.yml
where kube-flannel.yml is the downloaded flannel installation YAML. After the command runs:
[root@master ~]# kubectl get pods --all-namespaces
NAMESPACE NAME READY STATUS RESTARTS AGE
default my-release-tomcat-7c876666dc-kwz9k 1/1 Running 8 58d
default nginx-685db758cc-w8sns 1/1 Running 1 24h
default sleep-754684654f-9wf48 1/1 Running 5 45d
kube-system coredns-86c58d9df4-jpbvn 1/1 Running 11 58d
kube-system coredns-86c58d9df4-jvtdb 1/1 Running 12 58d
kube-system etcd-master 1/1 Running 13 58d
kube-system kube-apiserver-master 1/1 Running 13 58d
kube-system kube-controller-manager-master 1/1 Running 11 58d
kube-system kube-proxy-hhn9w 1/1 Running 11 58d
kube-system kube-proxy-hj2mv 1/1 Running 9 58d
kube-system kube-proxy-qqrzn 1/1 Running 9 58d
kube-system kube-scheduler-master 1/1 Running 12 58d
kube-system tiller-deploy-dbb85cb99-fr5th 1/1 Running 5 45d
The flannel-related pods are gone.
Modify the following section of kube-flannel.yml:
containers:
- name: kube-flannel
image: quay.io/coreos/flannel:v0.11.0-amd64
command:
- /opt/bin/flanneld
args:
- --ip-masq
- --kube-subnet-mgr
- --iface=enp0s8
That is, add "--iface=enp0s8" at the appropriate place under args; enp0s8 is the correct interface.
Then run "kubectl apply -f kube-flannel.yml".
After it completes:
[root@master ~]# kubectl get pods --all-namespaces -o wide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default my-release-tomcat-7c876666dc-kwz9k 1/1 Running 8 58d 10.244.1.30 node1 <none> <none>
default nginx-685db758cc-w8sns 1/1 Running 1 24h 10.244.2.16 node2 <none> <none>
default sleep-754684654f-9wf48 1/1 Running 5 45d 10.244.1.32 node1 <none> <none>
kube-system coredns-86c58d9df4-jpbvn 1/1 Running 11 58d 10.244.0.25 master <none> <none>
kube-system coredns-86c58d9df4-jvtdb 1/1 Running 12 58d 10.244.0.24 master <none> <none>
kube-system etcd-master 1/1 Running 13 58d 192.168.99.110 master <none> <none>
kube-system kube-apiserver-master 1/1 Running 13 58d 192.168.99.110 master <none> <none>
kube-system kube-controller-manager-master 1/1 Running 11 58d 192.168.99.110 master <none> <none>
kube-system kube-flannel-ds-amd64-6jxs4 1/1 Running 0 8s 192.168.99.111 node1 <none> <none>
kube-system kube-flannel-ds-amd64-l7nx9 1/1 Running 0 8s 192.168.99.110 master <none> <none>
kube-system kube-flannel-ds-amd64-mk2s6 1/1 Running 0 8s 192.168.99.112 node2 <none> <none>
kube-system kube-proxy-hhn9w 1/1 Running 11 58d 192.168.99.110 master <none> <none>
kube-system kube-proxy-hj2mv 1/1 Running 9 58d 192.168.99.112 node2 <none> <none>
kube-system kube-proxy-qqrzn 1/1 Running 9 58d 192.168.99.111 node1 <none> <none>
kube-system kube-scheduler-master 1/1 Running 12 58d 192.168.99.110 master <none> <none>
kube-system tiller-deploy-dbb85cb99-fr5th 1/1 Running 5 45d 10.244.1.31 node1 <none> <none>
Now try the ping again:
[root@master ~]# kubectl exec -it sleep-754684654f-9wf48 -- ping 10.244.2.16
PING 10.244.2.16 (10.244.2.16): 56 data bytes
64 bytes from 10.244.2.16: seq=0 ttl=62 time=1.683 ms
64 bytes from 10.244.2.16: seq=1 ttl=62 time=0.554 ms
64 bytes from 10.244.2.16: seq=2 ttl=62 time=0.730 ms
^C
--- 10.244.2.16 ping statistics ---
3 packets transmitted, 3 packets received, 0% packet loss
10.244.2.16 can now be pinged successfully.
Finally, run ip link show on node1 again:
[root@node1 ~]# ip -d link show flannel.1
10: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN mode DEFAULT group default
link/ether 2a:7a:3b:bb:7d:75 brd ff:ff:ff:ff:ff:ff promiscuity 0
vxlan id 1 local 192.168.99.111 dev enp0s8 srcport 0 0 dstport 8472 nolearning ageing 300 noudpcsum noudp6zerocsumtx noudp6zerocsumrx addrgenmode eui64 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535
可以看到"vxlan id 1 local 192.168.99.111 dev enp0s8 srcport 0 0 dstport 8472"已经不一样了。