Contents
1. Kubernetes HA cluster binary deployment
1.1 Kubernetes HA cluster binary deployment v1.24.2
1.2 Scaling out
Adding a master
Adding a node
1.3 Upgrading Kubernetes
1.4 Upgrading containerd
2. etcd backup and restore based on snapshots
Kubernetes HA cluster binary deployment v1.24.2
Note: a production cluster uses 3 masters and 2 harbor nodes.
The node layout required for the HA cluster is as follows:
Type | Server IP | Hostname | VIP |
---|---|---|---|
k8s master1 | 192.168.50.190 | k8s-master1.example.com | 192.168.50.188 |
k8s master2 | 192.168.50.191 | k8s-master2.example.com | 192.168.50.188 |
k8s master3 | 192.168.50.199 | k8s-master3.example.com | 192.168.50.188 |
harbor1 | 192.168.50.194 | k8s-harbor1.example.com | |
etcd node 1 | 192.168.50.192 | k8s-etcd1.example.com | |
etcd node 2 | 192.168.50.193 | k8s-etcd2.example.com | |
etcd node 3 | 192.168.50.195 | k8s-etcd3.example.com | |
Haproxy1 | 192.168.50.196 | k8s-ha1.example.com | 192.168.50.188 |
Haproxy2 | 192.168.50.200 | k8s-ha2.example.com | 192.168.50.188 |
node 1 | 192.168.50.197 | k8s-node1.example.com | |
node 2 | 192.168.50.198 | k8s-node2.example.com | |
deploy node | 192.168.50.196 | k8s-deploy.example.com | |
Deploy haproxy and keepalived for HA load balancing
Deploy on the ha servers:
apt install keepalived haproxy -y
Configure keepalived
Locate the keepalived sample template:
find / -name keepalived.*
/usr/share/doc/keepalived/samples/keepalived.conf.vrrp
cp /usr/share/doc/keepalived/samples/keepalived.conf.vrrp /etc/keepalived/keepalived.conf
Edit the configuration:
vim /etc/keepalived/keepalived.conf
! Configuration File for keepalived

global_defs {
   notification_email {
     acassen
   }
   notification_email_from Alexandre.Cassen@firewall.loc
   smtp_server 192.168.200.1
   smtp_connect_timeout 30
   router_id LVS_DEVEL
}

vrrp_instance VI_1 {
    state MASTER
    interface eth0
    garp_master_delay 10
    smtp_alert
    virtual_router_id 51
    priority 100
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass 1111
    }
    virtual_ipaddress {
        192.168.50.188 dev eth0 label eth0:0
    }
}
Start and enable the service:
systemctl start keepalived.service
systemctl enable keepalived.service
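On the second ha node (k8s-ha2) the same file is reused with state BACKUP and a lower priority so the VIP fails over automatically; a minimal sketch, assuming the interface is also eth0 and 80 as the backup priority:
vrrp_instance VI_1 {
    state BACKUP
    interface eth0
    garp_master_delay 10
    virtual_router_id 51      # must match the MASTER
    priority 80               # lower than the MASTER's 100
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass 1111        # must match the MASTER
    }
    virtual_ipaddress {
        192.168.50.188 dev eth0 label eth0:0
    }
}
After keepalived is running on both nodes, ip addr show eth0 should list 192.168.50.188 only on the current MASTER.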
Configure haproxy (on both ha servers)
vim /etc/haproxy/haproxy.cfg
Add the following configuration:
listen k8s-6443
    bind 192.168.50.188:6443
    mode tcp
    server 192.168.50.190 192.168.50.190:6443 check inter 3s fall 3 rise 3
    server 192.168.50.191 192.168.50.191:6443 check inter 3s fall 3 rise 3
    server 192.168.50.199 192.168.50.199:6443 check inter 3s fall 3 rise 3
Check the configuration file syntax:
haproxy -c -f /etc/haproxy/haproxy.cfg
Start the service and check its status:
systemctl start haproxy
systemctl enable haproxy
systemctl status haproxy
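Because only one ha node holds the VIP at a time, haproxy on the standby node may refuse to bind 192.168.50.188:6443. A commonly used companion setting (an assumption about the environment, applied on both ha nodes) allows binding to a non-local address:
echo "net.ipv4.ip_nonlocal_bind = 1" >> /etc/sysctl.conf
sysctl -p                      # apply the setting immediately
systemctl restart haproxy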
Deploy harbor
For production, two harbor nodes in an HA setup are recommended.
Install docker from binaries:
# tar xvf docker-19.03.15-binary-install.tar.gz
# ./docker-install.sh
Install Docker Compose:
wget https://github.com/docker/compose/releases/download/v2.3.4/docker-compose-linux-x86_64
chmod a+x docker-compose-linux-x86_64
mv docker-compose-linux-x86_64 /usr/bin/docker-compose
Verify the installation:
docker-compose version
Harbor offline installation
Download the harbor installer:
mkdir /apps
cd /apps/
wget https://github.com/goharbor/harbor/releases/download/v2.5.3/harbor-offline-installer-v2.5.3.tgz
tar xzvf harbor-offline-installer-v2.5.3.tgz
Harbor TLS certificate signing
Create a self-signed CA:
mkdir /apps/harbor/certs
cd /apps/harbor/certs
openssl genrsa -out ca.key 4096
openssl req -x509 -new -nodes -sha512 -days 3650 \
  -subj "/C=CN/ST=Beijing/L=Beijing/O=example/OU=Personal/CN=example.com" \
  -key ca.key \
  -out ca.crt
Create the server (domain) certificate request:
touch /root/.rnd
openssl genrsa -out harbor.example.com.key 4096
openssl req -sha512 -new \
  -subj "/C=CN/ST=Beijing/L=Beijing/O=example/OU=Personal/CN=example.com" \
  -key harbor.example.com.key \
  -out harbor.example.com.csr
Prepare the x509 v3 extension file for signing:
cat > v3.ext <<-EOF
authorityKeyIdentifier=keyid,issuer
basicConstraints=CA:FALSE
keyUsage = digitalSignature, nonRepudiation, keyEncipherment, dataEncipherment
extendedKeyUsage = serverAuth
subjectAltName = @alt_names

[alt_names]
DNS.1=example.com
DNS.2=example.net
DNS.3=harbor.example.com
EOF
Sign the certificate with the self-signed CA:
openssl x509 -req -sha512 -days 3650 \
  -extfile v3.ext \
  -CA ca.crt -CAkey ca.key -CAcreateserial \
  -in harbor.example.com.csr \
  -out harbor.example.com.crt
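To confirm the SubjectAltName entries from v3.ext were picked up, the signed certificate can be inspected (standard openssl usage):
openssl x509 -in harbor.example.com.crt -noout -text | grep -A1 "Subject Alternative Name"
# expected: DNS:example.com, DNS:example.net, DNS:harbor.example.com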
Configure harbor
cd /apps/harbor/
cp harbor.yml.tmpl harbor.yml   # copy the default configuration file
vim harbor.yml
Update the hostname (a domain name is recommended):
hostname: harbor.example.com
Configure the certificate paths:
certificate: /apps/harbor/certs/harbor.example.com.crt
private_key: /apps/harbor/certs/harbor.example.com.key
Install harbor:
./install.sh --with-trivy --with-chartmuseum
Log in to harbor
Create a public project named baseimages.
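Before other hosts can push to the new registry, the self-signed CA has to be trusted by their docker client; a sketch of the usual steps (run on the deploy node; the test image is illustrative, admin/Harbor12345 is the harbor default password):
mkdir -p /etc/docker/certs.d/harbor.example.com
scp 192.168.50.194:/apps/harbor/certs/ca.crt /etc/docker/certs.d/harbor.example.com/
echo "192.168.50.194 harbor.example.com" >> /etc/hosts
docker login harbor.example.com        # admin / Harbor12345
docker pull centos:7.9.2009
docker tag centos:7.9.2009 harbor.example.com/baseimages/centos:7.9.2009
docker push harbor.example.com/baseimages/centos:7.9.2009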
Deploying Kubernetes v1.24 with kubeasz
1. Base system configuration
2 CPU / 4 GB RAM / 40 GB disk (this sizing is for testing only)
OS: minimal install of Ubuntu 20.04.3 or CentOS 7 Minimal
Configure basic networking, package sources, SSH access, hostnames, etc.
2. Install the required tools (on the deploy node)
Install git and ansible:
apt install git ansible -y
3. Set up passwordless SSH (on the deploy node)
The deploy node must be able to SSH into every node (including itself) without a password, and a python symlink must be created on each node. $IP stands for the addresses of all nodes; answer yes and enter the root password when prompted.
ssh-keygen
apt install sshpass -y
cat key.sh
#!/bin/bash
IP="
192.168.50.190
192.168.50.191
192.168.50.199
192.168.50.193
192.168.50.195
192.168.50.192
192.168.50.197
"
for node in ${IP};do
sshpass -p 123456 ssh-copy-id -o StrictHostKeyChecking=no ${node}
echo "${node} ssh key copied"
ssh ${node} ln -s /usr/bin/python3 /usr/bin/python
echo "${node} /usr/bin/python3 symlink created"
done
If any node was missed by the script, the python symlink can also be created manually:
ssh $IP ln -s /usr/bin/python3 /usr/bin/python
Orchestrate the k8s installation from the deploy node
kubeasz 3.3.1 is used, which deploys Kubernetes 1.24.
Download ezdown, the tool script that fetches the project source, binaries and offline container images; kubeasz 3.3.1 is used as the example.
export release=3.3.1
wget https://github.com/easzlab/kubeasz/releases/download/${release}/ezdown
chmod +x ./ezdown
ezdown can be edited to match the deployment requirements (component versions, images, etc.).
Download the kubeasz code, binaries and default container images (run ./ezdown with no arguments to see all options):
# using mirrors for mainland China
./ezdown -D
docker ps
docker images
Create a cluster configuration instance
# create a new cluster named k8s-cluster1
cd /etc/kubeasz/
./ezctl new k8s-cluster1
2022-07-25 23:25:07 DEBUG generate custom cluster files in /etc/kubeasz/clusters/k8s-cluster1
2022-07-25 23:25:07 DEBUG set versions
2022-07-25 23:25:08 DEBUG cluster k8s-cluster1: files successfully created.
2022-07-25 23:25:08 INFO next steps 1: to config '/etc/kubeasz/clusters/k8s-cluster1/hosts'
2022-07-25 23:25:08 INFO next steps 2: to config '/etc/kubeasz/clusters/k8s-cluster1/config.yml'
Edit the hosts file
Specify the master nodes, etcd nodes, worker nodes, VIP, container runtime, network plugin, service IP and pod IP ranges, and other settings:
vim /etc/kubeasz/clusters/k8s-cluster1/hosts
[etcd]
192.168.50.192
192.168.50.193
192.168.50.195
# master node(s)
[kube_master]
192.168.50.190
192.168.50.191
# work node(s)
[kube_node]
192.168.50.197
# --------- Main Variables ---------------
# Secure port for apiservers
SECURE_PORT="6443"
# Cluster container-runtime supported: docker, containerd
# if k8s version >= 1.24, docker is not supported
CONTAINER_RUNTIME="containerd"
# Network plugins supported: calico, flannel, kube-router, cilium, kube-ovn
CLUSTER_NETWORK="calico"
# Service proxy mode of kube-proxy: 'iptables' or 'ipvs'
PROXY_MODE="ipvs"
# K8S Service CIDR, not overlap with node(host) networking
SERVICE_CIDR="10.100.0.0/16"
# Cluster CIDR (Pod CIDR), not overlap with node(host) networking
CLUSTER_CIDR="10.200.0.0/16"
# NodePort Range
NODE_PORT_RANGE="30000-62767"
# Cluster DNS Domain
CLUSTER_DNS_DOMAIN="cluster.local"
# -------- Additional Variables (don't change the default value right now) ---
# Binaries Directory
bin_dir="/usr/local/bin"
Other cluster component options can be adjusted in the config.yml file:
/etc/kubeasz/clusters/k8s-cluster1/config.yml
Push the easzlab/pause:3.7 image to harbor:
docker push harbor.example.com/baseimages/pause:3.7
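The push above assumes the image pulled by ezdown (easzlab/pause:3.7) was first re-tagged for the private registry and that docker is already logged in to harbor:
docker login harbor.example.com
docker tag easzlab/pause:3.7 harbor.example.com/baseimages/pause:3.7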
Update the sandbox (pause) image address in config.yml:
# [containerd] base sandbox container image
SANDBOX_IMAGE: "harbor.example.com/baseimages/pause:3.7"
Add the harbor domain to /etc/hosts on every node and master:
echo "192.168.50.194 harbor.example.com" >> /etc/hosts
Master node certificate configuration
Multiple IPs and domain names can be added (for example a public IP and domain):
MASTER_CERT_HOSTS:
- "192.168.50.188"
- "api.example.com"
# pod subnet mask length per node (determines how many pod IPs each node can allocate)
# if flannel runs with --kube-subnet-mgr, it reads this setting to assign a pod subnet to each node
#https://github.com/coreos/flannel/issues/847
NODE_CIDR_LEN: 21
# maximum number of pods per node
MAX_PODS: 500
# disable automatic coredns installation
dns_install: "no"
ENABLE_LOCAL_DNS_CACHE: false
# disable automatic metrics-server installation
metricsserver_install: "no"
# disable automatic dashboard installation
dashboard_install: "no"
Start installing the cluster
The installation is done step by step.
Step 01: cluster setup (prerequisites):
./ezctl setup k8s-cluster1 01
Install etcd:
./ezctl setup k8s-cluster1 02
The etcd deployment fails with:
TASK [etcd : 创建etcd证书请求] ***********************************************************************************************************************************************************
fatal: [192.168.50.195]: FAILED! => {"msg": "Failed to get information on remote file (/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd-csr.json): /bin/sh: 1: /usr/bin/python: not found\n"}
fatal: [192.168.50.193]: FAILED! => {"msg": "Failed to get information on remote file (/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd-csr.json): /bin/sh: 1: /usr/bin/python: not found\n"}
fatal: [192.168.50.192]: FAILED! => {"msg": "Failed to get information on remote file (/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd-csr.json): /bin/sh: 1: /usr/bin/python: not found\n"}
PLAY RECAP *************************************************************************************************************************************************************************
192.168.50.192 : ok=3 changed=0 unreachable=0 failed=1 skipped=0 rescued=0 ignored=0
192.168.50.193 : ok=3 changed=0 unreachable=0 failed=1 skipped=0 rescued=0 ignored=0
192.168.50.195 : ok=3 changed=0 unreachable=0 failed=1 skipped=0 rescued=0 ignored=0
Create the python symlink on each etcd node:
ln -s /usr/bin/python3 /usr/bin/python
Then run the etcd step again:
./ezctl setup k8s-cluster1 02
Verify the etcd cluster status
systemctl status etcd   # check the service status
journalctl -u etcd      # check the service logs
Run the following on any etcd node:
# health check executed on an etcd node
export NODE_IPS="192.168.50.192 192.168.50.193 192.168.50.195"
for ip in ${NODE_IPS}; do
ETCDCTL_API=3 etcdctl \
--endpoints=https://${ip}:2379 \
--cacert=/etc/kubernetes/ssl/ca.pem \
--cert=/etc/kubernetes/ssl/etcd.pem \
--key=/etc/kubernetes/ssl/etcd-key.pem \
endpoint health; done
https://192.168.50.192:2379 is healthy: successfully committed proposal: took = 11.791111ms
https://192.168.50.193:2379 is healthy: successfully committed proposal: took = 35.345335ms
https://192.168.50.195:2379 is healthy: successfully committed proposal: took = 22.612324ms
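The same loop can be run with endpoint status to also see the leader, raft term and database size of each member (table output is supported by etcdctl):
for ip in ${NODE_IPS}; do
  ETCDCTL_API=3 etcdctl \
  --write-out=table \
  --endpoints=https://${ip}:2379 \
  --cacert=/etc/kubernetes/ssl/ca.pem \
  --cert=/etc/kubernetes/ssl/etcd.pem \
  --key=/etc/kubernetes/ssl/etcd-key.pem \
  endpoint status; done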
Install the container runtime
Configure the internal private registry (around line 147 of the template):
vim roles/containerd/templates/config.toml.j2
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."harbor.example.com"]
endpoint = ["https://harbor.example.com"]
[plugins."io.containerd.grpc.v1.cri".registry.configs."harbor.example.com".tls]
insecure_skip_verify = true
[plugins."io.containerd.grpc.v1.cri".registry.configs."harbor.example.com".auth]
username = "admin"
password = "Harbor12345"
./ezctl setup k8s-cluster1 03
Verify that the node and master can pull images from harbor:
crictl pull harbor.example.com/baseimages/pause:3.7
Install the kube_master nodes
./ezctl setup k8s-cluster1 04
Verify the master cluster
Check the main components on a master node:
# check the service status
systemctl status kube-apiserver
systemctl status kube-controller-manager
systemctl status kube-scheduler
# check the service logs
journalctl -u kube-apiserver
journalctl -u kube-controller-manager
journalctl -u kube-scheduler
Running kubectl get componentstatus shows:
NAME STATUS MESSAGE ERROR
scheduler Healthy ok
controller-manager Healthy ok
etcd-0 Healthy {"health": "true"}
etcd-2 Healthy {"health": "true"}
etcd-1 Healthy {"health": "true"}
Install the kube_node nodes
./ezctl setup k8s-cluster1 05
Verify the node status
systemctl status kubelet     # check status
systemctl status kube-proxy
journalctl -u kubelet        # check logs
journalctl -u kube-proxy
Run kubectl get node:
kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 1h v1.24.2
192.168.50.191 Ready,SchedulingDisabled master 1h v1.24.2
192.168.50.197 Ready node 1h v1.24.2
192.168.50.198 Ready node 1h v1.24.2
Install the network plugin
Push the calico images to the harbor private registry:
docker tag calico/node:v3.19.4 harbor.example.com/baseimages/calico-node:v3.19.4
docker push harbor.example.com/baseimages/calico-node:v3.19.4
docker tag calico/pod2daemon-flexvol:v3.19.4 harbor.example.com/baseimages/calico-pod2daemon-flexvol:v3.19.4
docker push harbor.example.com/baseimages/calico-pod2daemon-flexvol:v3.19.4
docker tag calico/cni:v3.19.4 harbor.example.com/baseimages/calico-cni:v3.19.4
docker push harbor.example.com/baseimages/calico-cni:v3.19.4
docker tag calico/kube-controllers:v3.19.4 harbor.example.com/baseimages/calico-kube-controllers:v3.19.4
docker push harbor.example.com/baseimages/calico-kube-controllers:v3.19.4
Update the image addresses in the calico template:
vim roles/calico/templates/calico-v3.19.yaml.j2
- name: install-cni
image: harbor.example.com/baseimages/calico-cni:v3.19.4
- name: flexvol-driver
image: harbor.example.com/baseimages/calico-pod2daemon-flexvol:v3.19.4
- name: calico-node
image: harbor.example.com/baseimages/calico-node:v3.19.4
- name: calico-kube-controllers
image: harbor.example.com/baseimages/calico-kube-controllers:v3.19.4
Install the calico network plugin:
./ezctl setup k8s-cluster1 06
Verify the calico network
After calico is installed, verify as follows. (Wait for the images to finish downloading; this can be slow even when registry mirrors were configured earlier. Make sure the containers below are running before moving on to the remaining verification steps.)
kubectl get pod --all-namespaces
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-7979997d97-zrxxg 1/1 Running 1 (92m ago) 3h3m
kube-system calico-node-778xl 1/1 Running 0 3h3m
kube-system calico-node-rb7fw 1/1 Running 1 (91m ago) 3h3m
kube-system calico-node-tgx2x 1/1 Running 0 3h3m
kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-7979997d97-zrxxg 1/1 Running 1 (109m ago) 3h20m
kube-system calico-node-778xl 1/1 Running 0 3h20m
kube-system calico-node-rb7fw 1/1 Running 1 (108m ago) 3h20m
kube-system calico-node-tgx2x 1/1 Running 0 3h20m
kubectl create ns myserver
kubectl run net-test1 --image=centos:7.9.2009 sleep 100000000 -n myserver
kubectl run net-test2 --image=centos:7.9.2009 sleep 100000000 -n myserver
kubectl run net-test3 --image=centos:7.9.2009 sleep 100000000 -n myserver
kubectl get pod -n myserver -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
net-test1 1/1 Running 0 3h24m 10.200.218.2 192.168.50.197 <none> <none>
net-test2 1/1 Running 0 3h22m 10.200.218.3 192.168.50.197 <none> <none>
net-test3 1/1 Running 0 3h22m 10.200.218.4 192.168.50.197 <none> <none>
kubectl exec -it net-test1 -n myserver -- bash
Test that the container can reach the public internet, a node, and another container:
[root@net-test1 /]# ping 223.6.6.6
PING 223.6.6.6 (223.6.6.6) 56(84) bytes of data.
64 bytes from 223.6.6.6: icmp_seq=1 ttl=115 time=4.80 ms
64 bytes from 223.6.6.6: icmp_seq=2 ttl=115 time=4.80 ms
^C
--- 223.6.6.6 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 4.801/4.801/4.802/0.069 ms
[root@net-test1 /]# ping 192.168.50.197
PING 192.168.50.197 (192.168.50.197) 56(84) bytes of data.
64 bytes from 192.168.50.197: icmp_seq=1 ttl=64 time=0.094 ms
^C
--- 192.168.50.197 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.094/0.094/0.094/0.000 ms
[root@net-test1 /]# ping 10.200.218.4
PING 10.200.218.4 (10.200.218.4) 56(84) bytes of data.
64 bytes from 10.200.218.4: icmp_seq=1 ttl=63 time=0.111 ms
64 bytes from 10.200.218.4: icmp_seq=2 ttl=63 time=0.057 ms
^C
--- 10.200.218.4 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1032ms
rtt min/avg/max/mdev = 0.057/0.084/0.111/0.027 ms
Kubernetes cluster maintenance: node maintenance
Add a master node
Set up passwordless SSH:
ssh-copy-id 192.168.50.199
Create the python symlink:
ssh 192.168.50.199 ln -s /usr/bin/python3 /usr/bin/python
Add the new master node:
./ezctl add-master k8s-cluster1 192.168.50.199
Verify the services on the new master:
systemctl status kube-apiserver
systemctl status kube-controller-manager
systemctl status kube-scheduler
Check the new master's service logs:
journalctl -u kube-apiserver -f
Check the cluster nodes; the new master is Ready with pod scheduling disabled:
kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 28h v1.24.2
192.168.50.191 Ready,SchedulingDisabled master 28h v1.24.2
192.168.50.197 Ready node 28h v1.24.2
192.168.50.198 Ready node 29h v1.24.2
192.168.50.199 Ready,SchedulingDisabled master 2h v1.24.2   # newly added master
Configuration files updated by the add-master operation:
/etc/kubeasz/clusters/k8s-cluster1/hosts
/etc/kubeasz/clusters/k8s-cluster1/ssl/
/etc/kubeasz/clusters/k8s-cluster1/yml/calico.yaml
kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 46h v1.24.2
192.168.50.191 NotReady,SchedulingDisabled master 46h v1.24.2
192.168.50.197 Ready node 46h v1.24.2
192.168.50.199 Ready,SchedulingDisabled master 44m v1.24.2   # newly added master
Delete a kube_master node
The rough flow (see the del-master function in ezctl and playbooks/33.delmaster.yml): check that the node can be removed, migrate/evict its pods, remove the master services and files, remove the node services and files, delete the node from the cluster, remove it from the ansible hosts file, update kubeconfig on the ansible control host, and update kube-lb (haproxy) on the worker nodes.
Operation:
./ezctl del-master k8s-cluster1 192.168.50.199   # assuming 192.168.50.199 is the node to delete
Add a node
1. Configure the hostname and /etc/hosts (harbor private registry domain).
2. Set up passwordless SSH:
ssh-copy-id 192.168.50.198
# some operating systems also need the python symlink
ssh 192.168.50.198 ln -s /usr/bin/python3 /usr/bin/python
3. Add the node:
./ezctl add-node k8s-cluster1 192.168.50.198
4. Verify the new node:
$ kubectl get node
# verify the calico or flannel pod status on the new node
$ kubectl get pod -n kube-system
Delete a kube_node node
./ezctl del-node k8s-cluster1 192.168.50.198
Upgrade Kubernetes
Download Kubernetes v1.24.3:
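The release tarballs can be fetched on the deploy node first; the URLs below follow the standard Kubernetes release layout for v1.24.3:
cd /usr/local/src
wget https://dl.k8s.io/v1.24.3/kubernetes.tar.gz
wget https://dl.k8s.io/v1.24.3/kubernetes-client-linux-amd64.tar.gz
wget https://dl.k8s.io/v1.24.3/kubernetes-node-linux-amd64.tar.gz
wget https://dl.k8s.io/v1.24.3/kubernetes-server-linux-amd64.tar.gz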
tar xf kubernetes-client-linux-amd64.tar.gz
tar xf kubernetes-node-linux-amd64.tar.gz
tar xf kubernetes-server-linux-amd64.tar.gz
tar xf kubernetes.tar.gz
The new binaries are extracted under /usr/local/src/kubernetes/server/bin.
Upgrade master1 (192.168.50.190)
On every node, remove the master being upgraded from kube-lb:
vim /etc/kube-lb/conf/kube-lb.conf
stream {
    upstream backend {
        server 192.168.50.199:6443    max_fails=2 fail_timeout=3s;
        #server 192.168.50.190:6443    max_fails=2 fail_timeout=3s;    # master being upgraded, commented out
        server 192.168.50.191:6443    max_fails=2 fail_timeout=3s;
    }
}
systemctl reload kube-lb.service
On master1, replace the following binaries:
kube-apiserver
kube-controller-manager
kube-proxy
kube-scheduler
kubectl
kubelet
Stop the services:
systemctl stop kube-apiserver kube-controller-manager kube-scheduler kube-proxy kubelet
Replace the binaries (copied from the deploy node):
root@k8s-ha1:/usr/local/src/kubernetes/server/bin# scp kube-apiserver kube-controller-manager kube-scheduler kube-proxy kubelet kubectl 192.168.50.190:/usr/local/bin/
Verify the version:
/usr/local/bin/kube-controller-manager --version
Kubernetes v1.24.3
Start the services:
systemctl start kube-apiserver kube-controller-manager kube-scheduler kube-proxy kubelet
Check the versions:
kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 2d3h v1.24.3   # upgraded
192.168.50.191 NotReady,SchedulingDisabled master 2d3h v1.24.2
192.168.50.197 Ready node 2d3h v1.24.2
192.168.50.198 Ready node 3h39m v1.24.2
192.168.50.199 Ready,SchedulingDisabled master 5h43m v1.24.2
Bring the upgraded master back online (on every node):
vim /etc/kube-lb/conf/kube-lb.conf
stream {
    upstream backend {
        server 192.168.50.199:6443    max_fails=2 fail_timeout=3s;
        server 192.168.50.190:6443    max_fails=2 fail_timeout=3s;
        server 192.168.50.191:6443    max_fails=2 fail_timeout=3s;
    }
}
systemctl reload kube-lb.service
Then upgrade the other two masters following the same steps:
1. On every node, remove the master being upgraded from kube-lb.
2. Prepare the new binaries.
3. Stop the services.
4. Replace the binaries.
5. Start the services.
6. Check the version.
7. Bring the upgraded master back online in kube-lb.
Upgrade the worker nodes
Upgrade the nodes one at a time.
Drain the pods from the node:
kubectl drain 192.168.50.198 --ignore-daemonsets --force
Check the node status:
kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 2d14h v1.24.3
192.168.50.191 NotReady,SchedulingDisabled master 2d14h v1.24.2
192.168.50.197 Ready node 2d14h v1.24.2
192.168.50.198 Ready,SchedulingDisabled node 15h v1.24.2   # drained
192.168.50.199 Ready,SchedulingDisabled master 17h v1.24.3
Stop the services (run on the node):
root@k8s-node2:~# systemctl stop kubelet kube-proxy.service
Replace the binaries (run from the deploy node):
scp kubelet kube-proxy kubectl 192.168.50.198:/usr/local/bin/
Start the services (run on the node):
root@k8s-node2:~# systemctl start kubelet kube-proxy.service
Verify the version:
root@k8s-node2:~# kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 2d15h v1.24.3
192.168.50.191 NotReady,SchedulingDisabled master 2d15h v1.24.2
192.168.50.197 Ready node 2d15h v1.24.2
192.168.50.198 Ready,SchedulingDisabled node 15h v1.24.3
192.168.50.199 Ready,SchedulingDisabled master 17h v1.24.3
Make the node schedulable again (run from the deploy node):
kubectl uncordon 192.168.50.198
node/192.168.50.198 uncordoned
Check that the node status is back to normal:
kubectl get node
NAME STATUS ROLES AGE VERSION
192.168.50.190 Ready,SchedulingDisabled master 2d15h v1.24.3
192.168.50.191 NotReady,SchedulingDisabled master 2d15h v1.24.2
192.168.50.197 Ready node 2d15h v1.24.2
192.168.50.198 Ready node 15h v1.24.3
192.168.50.199 Ready,SchedulingDisabled master 17h v1.24.3
Upgrade the remaining nodes one by one following the same order:
1. Drain the pods from the node.
2. Stop the services on the node.
3. Replace the binaries.
4. Start the services.
5. Verify the version.
6. Make the node schedulable again.
7. Check the node status.
Replace the kubeasz bin files
(On the deploy node, so that later kubeasz deployments use the upgraded version.)
cp kube-apiserver kube-controller-manager kube-scheduler kube-proxy kubelet kubectl /etc/kubeasz/bin/
Upgrade containerd, runc and crictl
Download containerd:
wget https://github.com/containerd/containerd/releases/download/v1.6.6/containerd-1.6.6-linux-amd64.tar.gz
tar xf containerd-1.6.6-linux-amd64.tar.gz
Download runc:
cd /usr/local/src/bin
wget https://github.com/opencontainers/runc/releases/download/v1.1.3/runc.amd64
mv runc.amd64 runc
chmod +x runc
Download crictl:
wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.24.2/crictl-v1.24.2-linux-amd64.tar.gz
tar xf crictl-v1.24.2-linux-amd64.tar.gz
rm -f crictl-v1.24.2-linux-amd64.tar.gz
Download nerdctl:
wget https://github.com/containerd/nerdctl/releases/download/v0.22.0/nerdctl-0.22.0-linux-amd64.tar.gz
tar xf nerdctl-0.22.0-linux-amd64.tar.gz
rm -f nerdctl-0.22.0-linux-amd64.tar.gz
Copy everything to the kubeasz containerd-bin directory (used for later deployments):
cp ./* /etc/kubeasz/bin/containerd-bin/
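A quick sanity check of the staged binaries (run from /etc/kubeasz/bin/containerd-bin/; each tool supports a version flag):
cd /etc/kubeasz/bin/containerd-bin/
./containerd --version
./runc --version
./crictl --version
./nerdctl --version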
For reference, the kubeasz containerd role copies these binaries:
/etc/kubeasz/roles/containerd/tasks/main.yml
- name: 下载 containerd 二进制文件
copy: src={{ base_dir }}/bin/containerd-bin/{{ item }} dest={{ bin_dir }}/{{ item }} mode=0755
with_items:
- containerd
- containerd-shim
- containerd-shim-runc-v1
- containerd-shim-runc-v2
- crictl
- ctr
- runc
Upgrade containerd on the worker nodes
Drain the pods from the node (run from the deploy node):
kubectl drain 192.168.50.197 --ignore-daemonsets --force
kubectl get node
Stop the node services kubelet, kube-proxy and containerd, then reboot:
systemctl stop kubelet kube-proxy containerd
systemctl disable kubelet kube-proxy containerd
reboot
Replace the binaries on the node (from the containerd-bin directory on the deploy node):
scp ./* 192.168.50.197:/usr/local/bin/
Start the services:
systemctl enable kubelet kube-proxy containerd
systemctl start kubelet kube-proxy containerd
Make the node schedulable again:
kubectl uncordon 192.168.50.197
Check the services and runtime versions:
kubectl get node -o wide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
192.168.50.190 Ready,SchedulingDisabled master 2d17h v1.24.3 192.168.50.190 <none> Ubuntu 20.04.3 LTS 5.4.0-122-generic containerd://1.6.4
192.168.50.191 NotReady,SchedulingDisabled master 2d17h v1.24.2 192.168.50.191 <none> Ubuntu 20.04.3 LTS 5.4.0-122-generic containerd://1.6.4
192.168.50.197 Ready node 2d17h v1.24.3 192.168.50.197 <none> Ubuntu 20.04.3 LTS 5.4.0-122-generic containerd://1.6.6   # upgraded
192.168.50.198 Ready node 17h v1.24.3 192.168.50.198 <none> Ubuntu 20.04.3 LTS 5.4.0-81-generic containerd://1.6.4
192.168.50.199 Ready,SchedulingDisabled master 19h v1.24.3 192.168.50.199 <none> Ubuntu 20.04.3 LTS 5.4.0-122-generic containerd://1.6.4
Upgrade the remaining nodes one by one following the same order:
1. Drain the pods from the node.
2. Stop the node services kubelet, kube-proxy and containerd.
3. Replace the binaries.
4. Start the services.
5. Make the node schedulable again.
6. Check the services and runtime versions.
kubectl drain 192.168.50.198 --ignore-daemonsets --force
systemctl stop kubelet kube-proxy containerd
systemctl disable kubelet kube-proxy containerd
reboot
scp ./* 192.168.50.198:/usr/local/bin/
kubectl uncordon 192.168.50.198
kubectl get node
Upgrade containerd on the master nodes
1. On every node, remove the master being upgraded from kube-lb:
vim /etc/kube-lb/conf/kube-lb.conf
systemctl reload kube-lb.service
2. Stop the master's kubelet, kube-proxy and containerd services:
systemctl stop kubelet kube-proxy containerd
systemctl disable kubelet kube-proxy containerd
reboot
3. Replace the binaries:
scp ./* 192.168.50.190:/usr/local/bin/
4. Start the services:
systemctl enable kubelet kube-proxy containerd
systemctl start kubelet kube-proxy containerd
5. Bring the upgraded master back online in kube-lb:
vim /etc/kube-lb/conf/kube-lb.conf
systemctl reload kube-lb.service
6. Check the services and runtime versions:
kubectl get node -o wide
etcd backup and restore based on snapshots
Method 1 (not recommended):
Back up the data:
etcdctl snapshot save /data/etcd-backup/etcd-snapshot-202207301730.db
{"level":"info","ts":"2022-07-30T17:30:32.876+0800","caller":"snapshot/v3_snapshot.go:65","msg":"created temporary db file","path":"/data/etcd-backup/etcd-snapshot-202207301730.db.part"}
{"level":"info","ts":"2022-07-30T17:30:32.906+0800","logger":"client","caller":"v3/maintenance.go:211","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":"2022-07-30T17:30:32.906+0800","caller":"snapshot/v3_snapshot.go:73","msg":"fetching snapshot","endpoint":"127.0.0.1:2379"}
{"level":"info","ts":"2022-07-30T17:30:33.069+0800","logger":"client","caller":"v3/maintenance.go:219","msg":"completed snapshot read; closing"}
{"level":"info","ts":"2022-07-30T17:30:33.076+0800","caller":"snapshot/v3_snapshot.go:88","msg":"fetched snapshot","endpoint":"127.0.0.1:2379","size":"3.3 MB","took":"now"}
{"level":"info","ts":"2022-07-30T17:30:33.076+0800","caller":"snapshot/v3_snapshot.go:97","msg":"saved","path":"/data/etcd-backup/etcd-snapshot-202207301730.db"}
Snapshot saved at /data/etcd-backup/etcd-snapshot-202207301730.db
Inspect the backup:
etcdctl --write-out=table snapshot status /data/etcd-backup/etcd-snapshot-202207301730.db
Deprecated: Use `etcdutl snapshot status` instead.
+----------+----------+------------+------------+
| HASH | REVISION | TOTAL KEYS | TOTAL SIZE |
+----------+----------+------------+------------+
| 90bbad6d | 382573 | 1037 | 3.3 MB |
+----------+----------+------------+------------+
Simulate data loss by deleting a pod:
kubectl delete pod net-test3 -n myserver
Restore the data
Run on etcd1:
rm -rf /var/lib/etcd/
etcdctl snapshot restore /data/etcd-backup/etcd-snapshot-202207301730.db --data-dir="/var/lib/etcd"
This restores the snapshot into a new, empty data directory.
Restart etcd:
systemctl restart etcd
Restore the data on all three etcd nodes one by one, as sketched below.
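When restoring member by member, each node's restore normally also needs the member name and initial-cluster parameters that match its etcd.service unit; a sketch for the first node, with member names as placeholders taken from the systemd unit:
# on 192.168.50.192, with etcd stopped and the old data directory removed
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd-backup/etcd-snapshot-202207301730.db \
  --name etcd-192.168.50.192 \
  --initial-cluster etcd-192.168.50.192=https://192.168.50.192:2380,etcd-192.168.50.193=https://192.168.50.193:2380,etcd-192.168.50.195=https://192.168.50.195:2380 \
  --initial-advertise-peer-urls https://192.168.50.192:2380 \
  --data-dir /var/lib/etcd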
Verify the cluster state:
kubectl get pod -n myserver
Script to back up etcd data automatically:
mkdir /data/etcd-backup/ -p
#!/bin/bash
source /etc/profile
DATE=`date +%Y-%m-%d_%H-%M-%S`
etcdctl snapshot save /data/etcd-backup/etcd-snapshot-${DATE}.db
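To actually run the backup on a schedule and prune old snapshots, a cron entry and a simple retention rule can be added; the script path, the 02:30 schedule and the 7-day retention below are assumptions:
# optional retention step appended to the backup script: keep 7 days of snapshots
find /data/etcd-backup/ -name "etcd-snapshot-*.db" -mtime +7 -delete

# crontab -e on the etcd node (hypothetical script path)
30 2 * * * /bin/bash /data/scripts/etcd-backup.sh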
Method 2 (recommended): back up and restore etcd with kubeasz
kubeasz backup:
./ezctl backup k8s-cluster1
Or run the ansible command manually:
ansible-playbook -i clusters/k8s-cluster1/hosts -e @clusters/k8s-cluster1/config.yml playbooks/94.backup.yml
Backup files (taken while the three net-test pods existed):
ls clusters/k8s-cluster1/backup/
snapshot.db snapshot_202207301646.db
Simulate data loss by deleting a pod:
kubectl delete pod net-test3 -n myserver
Back up again (data after one pod was deleted):
./ezctl backup k8s-cluster1
ls clusters/k8s-cluster1/backup/
snapshot.db snapshot_202207301646.db snapshot_202207301649.db
root@k8s-ha1:/etc/kubeasz# ls clusters/k8s-cluster1/backup/ -l
total 9540
-rw------- 1 root root 3252256 Jul 30 16:49 snapshot.db   # the latest backup, used for the restore
-rw------- 1 root root 3252256 Jul 30 16:46 snapshot_202207301646.db
-rw------- 1 root root 3252256 Jul 30 16:49 snapshot_202207301649.db
Restore the cluster and verify
The etcd backup to restore can be selected in roles/cluster-restore/defaults/main.yml (choose one from the backup directory above); by default the most recent backup is used.
After the restore, allow some time for pods, services and other resources to be recreated.
./ezctl restore k8s-cluster1
Or run the ansible command manually:
ansible-playbook -i clusters/k8s-cluster1/hosts -e @clusters/k8s-cluster1/config.yml playbooks/95.restore.yml
Restore to the backup that still contained the three net-test pods:
mv snapshot_202207301646.db snapshot.db
root@k8s-ha1:/etc/kubeasz/clusters/k8s-cluster1/backup# /etc/kubeasz/ezctl restore k8s-cluster1
Check the pods (the three net-test pods are back):
kubectl get pod -n myserver
NAME READY STATUS RESTARTS AGE
linux70-nginx-deployment-55dc5fdcf9-8bv4t 1/1 Running 0 36m
linux70-nginx-deployment-55dc5fdcf9-hwvbj 1/1 Running 0 36m
linux70-nginx-deployment-55dc5fdcf9-jjzx6 1/1 Running 0 36m
linux70-nginx-deployment-55dc5fdcf9-m6fjt 1/1 Running 0 36m
net-test1 1/1 Running 0 33m
net-test2 1/1 Running 0 21m
net-test3 1/1 Running 0 21m