Terway
ENI独占模式
源码分析
// podNetworkType decides the pod's network type from the daemon run mode.
// In VPC mode a pod opts into an exclusive ENI either via the podNeedEni
// annotation or by requesting the ENI device-plugin resource.
// (Abridged excerpt — the final return/default case is elided, so this
// does not compile as-is; confirm against upstream Terway source.)
func podNetworkType(daemonMode string, pod *corev1.Pod) string {
switch daemonMode {
case daemon.ModeENIMultiIP:
// ENI multi-IP mode: pods share ENIs, one secondary IP each.
return daemon.PodNetworkTypeENIMultiIP
case daemon.ModeVPC:
podAnnotation := pod.GetAnnotations()
useENI := false
// The annotation counts as "true" unless it is empty, "false" or "0".
if needEni, ok := podAnnotation[podNeedEni]; ok && (needEni != "" && needEni != ConditionFalse && needEni != "0") {
useENI = true
}
// Requesting the ENI device-plugin resource in any container also
// forces exclusive-ENI networking for the pod.
for _, c := range pod.Spec.Containers {
if _, ok := c.Resources.Requests[deviceplugin.ENIResName]; ok {
useENI = true
break
}
}
if useENI {
return daemon.PodNetworkTypeVPCENI
}
return daemon.PodNetworkTypeVPCIP
case daemon.ModeENIOnly:
// ENI-only (exclusive) mode: every pod gets a whole ENI.
return daemon.PodNetworkTypeVPCENI
}
}
ENI独占模式时,对应的POD网络模式是VPC-ENI,此时的网络资源请求类型就不一样了
// Excerpt (paraphrased): pick the RPC IP type / resource request from the
// pod network type. NOTE(review): a Go switch has no `else` — the block
// below is a shorthand for an elided branch in the full source; confirm
// against upstream before reuse.
switch pod.PodNetworkType {
case daemon.PodNetworkTypeVPCENI:
// Exclusive-ENI pods report IPType VPC-ENI back to the CNI plugin.
reply.IPType = rpc.IPType_TypeVPCENI
else {
// The allocation request handed to the local pool allocator.
req := &eni.LocalIPRequest{}
resourceRequests = append(resourceRequests, req)
}
}
对于VPC-ENI类型,可以看到此时的网络资源请求类型是LocalIPRequest
// Allocate hands out an IP for the pod described by cni. It first tries to
// reuse an available address from the local pool; if none is free it
// registers demand (allocatingV4) and wakes the factory worker via the
// shared cond var. The result is delivered asynchronously on respCh.
// (Abridged excerpt — IPv6 handling, locking and the worker callback are
// elided; `lo` is presumably the LocalIPRequest — TODO confirm upstream.)
func (l *Local) Allocate(ctx context.Context, cni *daemon.CNI, request ResourceRequest) (chan *AllocResp, []Trace) {
// Number of v4/v6 addresses the factory worker still needs to create.
expectV4 := 0
expectV6 := 0
if l.enableIPv4 {
ipv4 := l.ipv4.PeekAvailable(cni.PodID, lo.IPv4)
// Pool full: nothing available and in-flight allocations already reach
// the ENI's IP capacity, so report Full instead of blocking forever.
if ipv4 == nil && len(l.ipv4)+l.allocatingV4 >= l.cap {
return nil, []Trace{{Condition: Full}}
} else if ipv4 == nil {
expectV4 = 1
}
}
// Publish the demand and wake factoryAllocWorker.
l.allocatingV4 += expectV4
l.cond.Broadcast()
respCh := make(chan *AllocResp)
go l.allocWorker(ctx, cni, lo, respCh, func() {
...
})
return respCh, nil
}
LocalIPRequest这种类型分配IP的流程相对复杂一点,这里它会维护一个IP可用集合,分配IP的时候就是遍历这个集合,从中获取可用的IP
所谓可用的IP就是集合中还没绑定POD的那些IP
// PeekAvailable returns the first IP in the set that is in the valid state
// and not yet bound to a pod, or nil when no address is free.
// NOTE(review): podID and prefer are unused in this excerpt — presumably
// the full implementation prefers a previously assigned address; confirm
// against upstream.
func (s Set) PeekAvailable(podID string, prefer netip.Addr) *IP {
	for _, candidate := range s {
		if candidate.status != ipStatusValid || candidate.podID != "" {
			continue
		}
		return candidate
	}
	return nil
}
如果集合中没有可用IP,它会通过条件变量(sync.Cond)通知其它协程帮它分配IP,然后自己挂起等待;等IP分配好了之后,再遍历集合去获取可用的IP
// allocWorker loops until an address becomes available in the local pool,
// packaging it into an AllocResp for the caller. When the pool is empty it
// parks on the cond var and retries after factoryAllocWorker broadcasts.
// (Abridged excerpt — locking, IPv6, ctx cancellation and the response
// send are elided; `ipv4` is declared in the full source.)
func (l *Local) allocWorker(ctx context.Context, cni *daemon.CNI, request *LocalIPRequest, respCh chan *AllocResp, onErrLocked func()) {
for {
resp := &AllocResp{}
var ip types.IPSet2
if l.enableIPv4 {
ipv4 = l.ipv4.PeekAvailable(cni.PodID, request.IPv4)
if ipv4 == nil {
// Nothing free yet: sleep until the factory worker broadcasts
// after adding fresh IPs to the pool, then re-scan.
l.cond.Wait()
continue
}
ip.IPv4 = ipv4.ip
}
return
}
}
这里没有可用IP时它会通过l.cond.Broadcast()
去唤醒其它协程帮它分配IP,在其它协程帮它分配好IP之前,它通过l.cond.Wait()
将自己挂起,等待其它协程唤醒自己
可见真正干活的是另外的协程
// factoryAllocWorker is the background goroutine that actually creates
// ENIs/IPs. It sleeps on the cond var until an Allocate() call raises
// allocatingV4/allocatingV6, batches the accumulated demand into one cloud
// API call, then broadcasts so blocked allocWorkers re-scan the pool.
// (Abridged excerpt — error paths and the "ENI exists, add secondary IPs"
// branch are elided.)
func (l *Local) factoryAllocWorker(ctx context.Context) {
l.cond.L.Lock()
log := logf.FromContext(ctx)
for {
// No outstanding demand: park until a Broadcast from Allocate.
if l.allocatingV4 <= 0 && l.allocatingV6 <= 0 {
l.cond.Wait()
continue
}
// wait a small period
// Drop the lock while sleeping so more requesters can register demand;
// this coalesces several requests into a single API call.
l.cond.L.Unlock()
time.Sleep(300 * time.Millisecond)
l.cond.L.Lock()
if l.eni == nil {
// create eni
// Batch size is capped; at least one v4 IP is always requested.
v4Count := min(l.batchSize, max(l.allocatingV4, 1))
v6Count := min(l.batchSize, l.allocatingV6)
l.status = statusCreating
// Unlock around the slow, rate-limited cloud API call.
l.cond.L.Unlock()
err := l.rateLimitEni.Wait(ctx)
eni, ipv4Set, ipv6Set, err := l.factory.CreateNetworkInterface(v4Count, v6Count, l.eniType)
l.cond.L.Lock()
l.eni = eni
// Demand has been (at least partially) satisfied; clamp at zero.
l.allocatingV4 -= v4Count
l.allocatingV6 -= v6Count
l.allocatingV4 = max(l.allocatingV4, 0)
l.allocatingV6 = max(l.allocatingV6, 0)
primary, err := netip.ParseAddr(eni.PrimaryIP.IPv4.String())
if err == nil {
for _, v := range ipv4Set {
// Record each new address, marking the ENI's primary IP.
l.ipv4.Add(NewValidIP(v, netip.MustParseAddr(v.String()) == primary))
}
}
l.status = statusInUse
}
// Wake every allocWorker blocked in Wait() to re-scan the pool.
l.cond.Broadcast()
}
}
这个协程就是真正分配IP的了,在不需要分配IP的时候,即l.allocatingV4 <= 0
,它会一直挂起,等待被需要分配IP的协程唤醒
一旦有分配IP的需求进来,它就会被唤醒干活了
// CreateNetworkInterface creates an ENI in the node's VSwitch, attaches it
// to this ECS instance, waits until the instance metadata service reflects
// the new IPs, and fills in the VSwitch CIDR and gateway address.
// (Abridged excerpt — error handling and the IPv6 path are elided; the
// wiring of `trunk`, `vswID` and `v6Set` lives in the elided parts.)
func (a *Aliyun) CreateNetworkInterface(ipv4, ipv6 int, eniType string) (*daemon.ENI, []netip.Addr, []netip.Addr, error) {
ctx, cancel := context.WithTimeout(a.ctx, time.Second*60)
defer cancel()
// 1. create eni
var eni *client.NetworkInterface
var vswID string
// Retry ENI creation with exponential backoff.
err := wait.ExponentialBackoffWithContext(a.ctx, backoff.Backoff(backoff.ENICreate), func(ctx context.Context) (bool, error) {
// Pick a VSwitch in this zone (the node's own, discovered via metadata).
vsw, innerErr := a.vsw.GetOne(ctx, a.openAPI, a.zoneID, a.vSwitchOptions)
eni, innerErr = a.openAPI.CreateNetworkInterface(ctx, trunk, vswID, a.securityGroupIDs, a.resourceGroupID, ipv4, ipv6, a.eniTags)
return true, nil
})
r := &daemon.ENI{
ID: eni.NetworkInterfaceID,
MAC: eni.MacAddress,
VSwitchID: eni.VSwitchID,
Type: eni.Type,
}
r.PrimaryIP.SetIP(eni.PrivateIPAddress)
// Collect every private IP reported by the create call.
v4Set, err := func() ([]netip.Addr, error) {
var ips []netip.Addr
for _, v := range eni.PrivateIPSets {
addr, err := netip.ParseAddr(v.PrivateIpAddress)
ips = append(ips, addr)
}
return ips, nil
}()
// 2. attach eni
err = a.openAPI.AttachNetworkInterface(ctx, eni.NetworkInterfaceID, a.instanceID, "")
// 3. wait metadata ready & update cidr
// Poll the 100.100.100.200 metadata service until the ENI's IPs show up,
// which proves the attach completed on the instance side.
err = validateIPInMetadata(ctx, v4Set, func() []netip.Addr {
exists, err := metadata.GetIPv4ByMac(r.MAC)
return exists
})
prefix, err := metadata.GetVSwitchCIDR(eni.MacAddress)
r.VSwitchCIDR.SetIPNet(prefix.String())
gw, err := metadata.GetENIGatewayAddr(eni.MacAddress)
r.GatewayIP.SetIP(gw.String())
return r, v4Set, v6Set, nil
}
这里主要就是和阿里云 云主机相关的一些交互了
- 首先查询vswitch,vswitch id 就是当前云主机所在的vswitch,可以通过metadata获取到
curl http://100.100.100.200/latest/meta-data/vswitch-id
vsw-8vbddxzcxxxxxxp1evxd6r
然后通过ECS客户端开通ENI,关联的vswitch就是上面的这个
然后将ENI绑定到当前云主机,当前云主机有一个唯一的实例ID,也是通过metadata获取
curl http://100.100.100.200/latest/meta-data/instance-id
i-8vb4cxxxxxxxxxxxxxzahaxyv
- 然后确保这个ENI已经绑定到了当前云主机上,并且IP也分配到了,也是通过metadata获取
curl http://100.100.100.200/latest/meta-data/network/interfaces/macs/00:11:22:33:44:55/private-ipv4s
192.168.128.15
- 然后查询ENI所属的vswitch的CIDR,也是通过metadata获取
curl http://100.100.100.200/latest/meta-data/network/interfaces/macs/00:11:22:33:44:55/vswitch-cidr-block
192.168.128.0/24
- 然后查询ENI的网关,也是通过metadata获取
curl http://100.100.100.200/latest/meta-data/network/interfaces/macs/00:11:22:33:44:55/gateway
192.168.128.253
上述ENI准备好了之后,就会把对应的IP地址加入到集合里,然后唤醒需要分配IP的协程即可
有了IP之后,就会转换为网络配置
// ToRPC converts the locally allocated IP plus its ENI attributes into the
// NetConf message handed back to the CNI plugin. ServiceCIDR is left nil
// here and filled in later by the caller.
func (l *LocalIPResource) ToRPC() []*rpc.NetConf {
	basic := &rpc.BasicInfo{
		PodIP:       l.IP.ToRPC(),
		PodCIDR:     l.ENI.VSwitchCIDR.ToRPC(),
		GatewayIP:   l.ENI.GatewayIP.ToRPC(),
		ServiceCIDR: nil,
	}
	eniInfo := &rpc.ENIInfo{
		MAC:       l.ENI.MAC,
		Trunk:     false,
		Vid:       0,
		GatewayIP: l.ENI.GatewayIP.ToRPC(),
	}
	conf := &rpc.NetConf{
		BasicInfo:    basic,
		ENIInfo:      eniInfo,
		Pod:          nil,
		IfName:       "",
		ExtraRoutes:  nil,
		DefaultRoute: true,
	}
	return []*rpc.NetConf{conf}
}
然后补充Service CIDR,获取方式和前面VPC模式是一样的
// Patch in the cluster Service CIDR from the kube client — ToRPC leaves
// BasicInfo.ServiceCIDR nil, so it is filled in here.
c.BasicInfo.ServiceCIDR = n.k8s.GetServiceCIDR().ToRPC()
有了网络配置后,就可以开始配置网卡了,由于此时的ipType 对应的是VPC-ENI,所以对应的网卡配置类型为独占ENI
// getDatePath maps the RPC IP type to a datapath implementation.
// NOTE(review): "DatePath" is presumably an upstream typo for "DataPath";
// the name is kept so callers still resolve. (Abridged excerpt — the other
// IPType cases and the final return are elided.)
func getDatePath(ipType rpc.IPType, vlanStripType types.VlanStripType, trunk bool) types.DataPath {
switch ipType {
case rpc.IPType_TypeVPCENI:
// VPC-ENI pods use the exclusive-ENI datapath.
return types.ExclusiveENI
}
}
因为已经分配好了IP地址,所以这里就不需要IPAM插件了,直接使用分配好的IP地址即可
// Excerpt: for the exclusive-ENI datapath no IPAM plugin runs — the daemon
// already allocated the address, so the setup config carries the container
// IP and gateway directly. (Closing braces are elided in this excerpt.)
switch setupCfg.DP {
case types.ExclusiveENI:
if setupCfg.ContainerIfName == args.IfName {
containerIPNet = setupCfg.ContainerIPNet
gatewayIPSet = setupCfg.GatewayIP
}
err = datapath.NewExclusiveENIDriver().Setup(setupCfg, cniNetns)
最后再看下网卡配置过程
// Setup moves the whole ENI device into the container netns, configures
// its address and default route, and additionally creates a veth pair
// (container side veth1, host side cfg.HostVETHName) used for Service
// traffic. (Abridged excerpt — error checks after each call are elided.)
func (r *ExclusiveENI) Setup(cfg *types.SetupConfig, netNS ns.NetNS) error {
// 1. move link in
nicLink, err := netlink.LinkByIndex(cfg.ENIIndex)
hostNetNS, err := ns.GetCurrentNS()
defer hostNetNS.Close()
// The ENI is handed to the pod wholesale — this is "exclusive" mode.
err = utils.LinkSetNsFd(nicLink, netNS)
// 2. setup addr and default route
err = netNS.Do(func(netNS ns.NetNS) error {
// 2.1 setup addr
contLink, err := netlink.LinkByName(nicLink.Attrs().Name)
contCfg := generateContCfgForExclusiveENI(cfg, contLink)
err = nic.Setup(contLink, contCfg)
// for now we only create slave link for eth0
if !cfg.DisableCreatePeer && cfg.ContainerIfName == "eth0" {
err = veth.Setup(&veth.Veth{
IfName: cfg.HostVETHName, // name for host ns side
PeerName: defaultVethForENI,
}, hostNetNS)
var mac net.HardwareAddr
// Read the host peer's MAC so the container side can install a
// permanent ARP entry for the 169.254.1.1 gateway.
err = hostNetNS.Do(func(netNS ns.NetNS) error {
hostPeer, innerErr := netlink.LinkByName(cfg.HostVETHName)
mac = hostPeer.Attrs().HardwareAddr
return innerErr
})
veth1, err := netlink.LinkByName(defaultVethForENI)
veth1Cfg := generateVeth1Cfg(cfg, veth1, mac)
return nic.Setup(veth1, veth1Cfg)
}
return nil
})
// Configure the host-side veth: 169.254.1.1/32 plus a /32 route to the pod.
hostPeer, err := netlink.LinkByName(cfg.HostVETHName)
hostPeerCfg := generateHostSlaveCfg(cfg, hostPeer)
err = nic.Setup(hostPeer, hostPeerCfg)
return nil
}
容器内的网卡配置时, 首先直接将ENI设备移到容器命名空间内,可见这种模式下容器是直接分配的ENI网卡
然后配置容器ENI网卡名称、设置ENI网卡的IP地址、默认路由
// generateContCfgForExclusiveENI builds the nic.Conf for the ENI device
// inside the container: the pod IP address and, when requested, a default
// route via the ENI gateway. (Abridged excerpt — the dangling `else`
// belongs to an elided `if`; the string route Dst is illustrative, the
// real netlink field type is *net.IPNet.)
func generateContCfgForExclusiveENI(cfg *types.SetupConfig, link netlink.Link) *nic.Conf {
var addrs []*netlink.Addr
var routes []*netlink.Route
var rules []*netlink.Rule
var sysctl map[string][]string
else {
addrs = utils.NewIPNetToMaxMask(cfg.ContainerIPNet)
}
if cfg.ContainerIPNet.IPv4 != nil {
// add default route
if cfg.DefaultRoute {
// Produces: default via <ENI gateway> dev eth0 onlink
routes = append(routes, &netlink.Route{
LinkIndex: link.Attrs().Index,
Scope: netlink.SCOPE_UNIVERSE,
Dst: "0.0.0.0/0",
Gw: cfg.GatewayIP.IPv4,
Flags: int(netlink.FLAG_ONLINK),
})
}
}
contCfg := &nic.Conf{
IfName: cfg.ContainerIfName,
MTU: cfg.MTU,
Addrs: addrs,
Routes: routes,
Rules: rules,
SysCtl: sysctl,
}
return contCfg
}
设置ENI网卡名称为eth0、然后设置的IP地址就是ENI的IP地址、然后添加默认路由,注意这里默认路由的网关设备就是ENI的网关地址
default via 192.168.128.253 dev eth0 onlink
如此,ENI网卡就配置好了,但是还需要一个veth网卡
// Excerpt (repeated from Setup): create the service veth pair — container
// side named veth1, host side cfg.HostVETHName (the calixxx-style link).
err = veth.Setup(&veth.Veth{
IfName: cfg.HostVETHName, // name for host ns side
PeerName: "veth1",
}, hostNetNS)
veth网卡在容器内的网卡名称就是veth1
,在宿主机上的名称就是calixxxxxxxxxxx
然后配置容器内veth网卡的名称、配置veth网卡的IP地址、配置veth网卡的默认路由、配置veth网卡的静态ARP
// generateVeth1Cfg builds the nic.Conf for the container-side veth
// (veth1): a link-scope route to 169.254.1.1, a Service CIDR route via
// that gateway, and a permanent ARP entry pointing 169.254.1.1 at the
// host peer's MAC. (Abridged excerpt — literals such as "10.96.0.0/12",
// "192.168.128.15/32" and the bare `IP: 169.254.1.1` are illustrative
// sample values, not the real netlink field types.)
func generateVeth1Cfg(cfg *types.SetupConfig, link netlink.Link, peerMAC net.HardwareAddr) *nic.Conf {
var routes []*netlink.Route
var neighs []*netlink.Neigh
var sysctl map[string][]string
if cfg.ContainerIPNet.IPv4 != nil {
// 169.254.1.1 dev veth1
routes = append(routes, &netlink.Route{
LinkIndex: link.Attrs().Index,
Scope: netlink.SCOPE_LINK,
Dst: "169.254.1.1",
})
if cfg.ServiceCIDR != nil && cfg.ServiceCIDR.IPv4 != nil {
// Service traffic leaves through veth1 toward the host side.
routes = append(routes, &netlink.Route{
LinkIndex: link.Attrs().Index,
Dst: "10.96.0.0/12",
Gw: "169.254.1.1/32",
Flags: int(netlink.FLAG_ONLINK),
})
}
// Static ARP: 169.254.1.1 resolves to the host-side veth's MAC.
neighs = append(neighs, &netlink.Neigh{
LinkIndex: link.Attrs().Index,
IP: 169.254.1.1,
HardwareAddr: peerMAC,
State: netlink.NUD_PERMANENT,
})
}
contCfg := &nic.Conf{
IfName: "veth1",
MTU: cfg.MTU,
Addrs: "192.168.128.15/32",
Routes: routes,
Neighs: neighs,
SysCtl: sysctl,
}
return contCfg
}
设置容器内veth网卡的名称为veth1
veth1网卡的IP地址仍为ENI的IP地址
然后是veth1默认路由
169.254.1.1 dev veth1 scope link
10.96.0.0/12 via 169.254.1.1 dev veth1 onlink
然后是静态ARP,对应的MAC地址就是宿主机上calixxxxxxxxxx设备的MAC地址
? (169.254.1.1) at da:44:55:66:77:88 [ether] on eth0
最后是宿主机上的veth网卡配置
// generateHostSlaveCfg builds the nic.Conf for the host-side veth (the
// calixxx-style link): address 169.254.1.1/32 plus a /32 link route back
// to the pod IP, i.e. the return path for Service traffic. (Abridged
// excerpt — `sysctl` is declared in an elided part, and the literal IPs
// are illustrative sample values.)
func generateHostSlaveCfg(cfg *types.SetupConfig, link netlink.Link) *nic.Conf {
var addrs []*netlink.Addr
var routes []*netlink.Route
if cfg.ContainerIPNet.IPv4 != nil {
addrs = append(addrs, &netlink.Addr{
IPNet: "169.254.1.1/32",
})
// add route to container
// Produces: <pod IP>/32 dev calixxx scope link
routes = append(routes, &netlink.Route{
LinkIndex: link.Attrs().Index,
Scope: netlink.SCOPE_LINK,
Dst: "192.168.128.15/32",
})
}
contCfg := &nic.Conf{
IfName: cfg.HostVETHName,
MTU: cfg.MTU,
Addrs: addrs,
Routes: routes,
SysCtl: sysctl,
}
return contCfg
}
首先设置宿主机上的veth网卡名称为calixxxxxxxxxxx
设置calixxxxxxxxxxxxxx网卡IP地址为169.254.1.1/32
设置calixxxxxxxxxxxxxx网卡的默认路由
192.168.128.15/32 dev calixxxxxxxxxxxxxx scope link
可以看到这种模式下,容器内是有两个网卡的,其中ENI网卡直连的是VPC;另外的veth网卡是处理Service请求的