一:环境需求
1:dev-box节点机器:作为远程操作控制机器,独立于集群之外,另外系统必须是ubuntu16.04,其他版本系统官方未测试过,内存最好4G,硬盘100GB+
2:master节点机器:作为整个集群的核心,主要部署Kubernetes的主节点组件(etcd,kube-proxy,kube-controller-manager,pause等)以及openpai服务。目前尚无高可用解决方案,所以master节点只有一台。另外系统必须是ubuntu16.04,其他版本系统官方未测试过,内存建议给45G+,CPU最好8核心+,硬盘500GB+
3:worker节点机器:作为整个集群的工作机器,主要部署Kubernetes的node节点组件(kubectl,kubelet等)。另外系统必须是ubuntu16.04,其他版本系统官方未测试过,内存建议给20G+,CPU最好8核心+,硬盘500GB+
整个集群间网络互通
二: 注意事项
* 所有机器上的docker已经被正确安装,/etc/docker/daemon.json中必须有配置,不然执行python环境检查时会报错。
* dev-box机器上必须能够免密登陆到master和worker节点上
* worker节点种类有NvidiaGPUworker型(需要安装Nvidia驱动,建议安装Nvidia-418),CPUworker型,EnflameDTU型。目前只试过CPUworker型,其他类型worker的部署方式差别不大。
* 如果要使用GPUworker,请参考[GPUworker官方文档](https://openpai.readthedocs.io/zh_CN/latest/manual/cluster-admin/installation-faqs-and-troubleshooting.html#how-to-install-gpu-driver)
三:开始部署
3.1:优化
#临时加永久关闭swap分区
swapoff -a #临时关闭
vim /etc/fstab #删除或注释掉swap挂载行并保存,即可永久关闭swap(无需重启)
cat /etc/hosts #在/etc/hosts中配置各节点的主机名解析,集群所有节点都需要做
10.12.50.200 worker
10.12.50.201 dev-box
10.12.50.202 master
apt install ntp #各节点安装NTP服务,保证时间同步,集群所有节点都需要做;安装后会自动生成配置文件/etc/ntp.conf
systemctl status ntp #查看服务状态
git clone https://github.com/microsoft/pai.git #在dev-box机器上拉取openpai的repo
3.2:配置layout.yaml文件
#在dev-box机器上checkout,选择需要安装的OpenPAI版本
cd pai #切换到pai目录下
git checkout v1.6.0 #选择v1.6.0版本
#在dev-box机器上编辑并详解layout.yaml文件
vim /root/pai/contrib/kubespray/config/layout.yaml
# GPU cluster example
# This is a cluster with one master node and one active worker node
# (a second worker entry is kept commented out at the bottom as a template).

# machine-sku defines the hardware profiles ("SKUs"); every entry in
# machine-list below refers to one of them through `machine-type`.
machine-sku:
  master-machine: # define a machine sku
    # the resource requirements for all the machines of this sku
    # We use the same memory format as Kubernetes, e.g. Gi, Mi
    # Reference: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory
    mem: 45Gi  # maximum allocatable memory; leave headroom, e.g. write 45Gi on a 50 GB machine
    cpu:
      # the number of CPU vcores
      vcore: 4  # 4 cores here because of limited hardware; 8 cores are recommended for GPU setups
  # Worker SKU. No real GPU server was available for this walkthrough, so only
  # the memory size and CPU core count were adjusted; the GPU part is skipped
  # later during installation.
  gpu-machine:
    computing-device:
      # For `type`, please follow the same format specified in device plugin.
      # For example, `nvidia.com/gpu` is for NVIDIA GPU, `amd.com/gpu` is for AMD GPU,
      # and `enflame.com/dtu` is for Enflame DTU.
      # Reference: https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
      type: nvidia.com/gpu  # NVIDIA GPU resource type (the default choice)
      model: K80
      count: 4
    mem: 18Gi  # 18 GB of memory
    cpu:
      vcore: 4  # 4 CPU cores

# machine-list enumerates the actual machines of the cluster.
# NOTE(review): confirm that each `hostname` matches the real hostname of the
# machine at `hostip`.
machine-list:
  - hostname: pai-master # name of the machine, **do not** use upper case alphabet letters for hostname
    hostip: 10.12.50.202  # IP of the OpenPAI master node
    machine-type: master-machine # only one master-machine supported; refers to the master-machine SKU above
    pai-master: "true"
  - hostname: pai-worker1
    hostip: 10.12.50.200  # IP of the OpenPAI worker node
    machine-type: gpu-machine  # refers to the gpu-machine SKU above
    pai-worker: "true"
  # For more worker nodes, add one entry per node; keep this commented out otherwise.
  # - hostname: pai-worker2
  #   hostip: 10.0.0.3
  #   machine-type: gpu-machine
  #   pai-worker: "true"
3.3:配置config.yaml文件
#在dev-box机器上编辑并详解config.yaml文件
vim /root/pai/contrib/kubespray/config/config.yaml
user: root  # SSH user name the dev-box uses to log in to the cluster nodes
# SSH password of that user. Quoted so that an all-digit password stays a
# string instead of being parsed as an integer. Example value only.
password: "123456"
docker_image_tag: v1.6.0  # OpenPAI image tag (latest release used here); change as needed

# Optional

#######################################################################
#                    OpenPAI Customized Settings                      #
#######################################################################
# For machines that are not CPU, GPU or Enflame workers, uncomment the next
# line and change the value to false.
# enable_hived_scheduler: true
enable_docker_cache: false  # docker cache disabled
docker_cache_storage_backend: "azure" # or "filesystem"
docker_cache_azure_account_name: ""
docker_cache_azure_account_key: ""
docker_cache_azure_container_name: "dockerregistry"
docker_cache_fs_mount_path: "/var/lib/registry"
docker_cache_remote_url: "https://registry-1.docker.io"
docker_cache_htpasswd: ""
enable_marketplace: "false"

#############################################
# Ansible-playbooks' inventory hosts' vars. #
#############################################
# NOTE(review): this is a placeholder path. Replace it with the real private
# key file, or comment the line out when logging in with user/password.
ssh_key_file_path: /path/to/you/key/file

#####################################
# OpenPAI's service image registry. #
#####################################
# Users in mainland China keep the following options commented out. If
# docker.io is reachable, you need your own docker.io account and password
# to pull the related images.
# docker_registry_domain: docker.io
# docker_registry_namespace: openpai
# docker_registry_username: exampleuser
# docker_registry_password: examplepasswd

################################################################
# OpenPAI's daemon qos config.                                 #
# By default, the QoS class for PAI daemon is BestEffort.      #
# If you want to promote QoS class to Burstable or Guaranteed, #
# you should set the value to true.                            #
################################################################
# Keep it off; when enabled, each node needs roughly 1G more memory on average.
qos-switch: "false"

###########################################################################################
#                                 Pre-check setting                                       #
###########################################################################################
docker_check: false  # check of the docker version and engine
resource_check: true
########################################################################################
# Advanced docker configuration. If you are not familiar with them, don't change them. #
########################################################################################
# Advanced docker usage in OpenPAI; the official advice is to keep the
# defaults unless you are familiar with these options.
# docker_data_root: /mnt/docker
# docker_config_file_path: /etc/docker/daemon.json
# docker_iptables_enabled: false

## An obvious use case is allowing insecure-registry access to self hosted registries.
## Can be ipaddress and domain_name.
## example define 172.19.16.11 or mirror.registry.io
## (if you run your own private image registry, you can define it here)
# openpai_docker_insecure_registries:
#   - mirror.registry.io
#   - 172.19.16.11

## Add other registry,example China registry mirror.
## (if your private image registry is located in mainland China, you can define it here)
# openpai_docker_registry_mirrors:
#   - https://registry.docker-cn.com
#   - https://mirror.aliyuncs.com

#######################################################################
#                       kubespray setting                             #
#######################################################################
# If you couldn't access to gcr.io or docker.io, please configure it.
# These settings are the download locations of the images needed to deploy
# Kubernetes.
# NOTE(review): the values below still point at gcr.io / quay.io / docker.io;
# users who cannot reach them must replace them with reachable mirrors.
gcr_image_repo: "gcr.io"
kube_image_repo: "gcr.io/google-containers"  # base Kubernetes images, e.g. coredns, proxy
quay_image_repo: "quay.io"
docker_image_repo: "docker.io"
etcd_image_repo: "quay.io/coreos/etcd"  # etcd database image
pod_infra_image_repo: "gcr.io/google_containers/pause-{{ image_arch }}"
kubeadm_download_url: "https://storage.googleapis.com/kubernetes-release/release/{{ kubeadm_version }}/bin/linux/{{ image_arch }}/kubeadm"
hyperkube_download_url: "https://storage.googleapis.com/kubernetes-release/release/{{ kube_version }}/bin/linux/{{ image_arch }}/hyperkube"

# Calico network plugin
# openpai_kube_network_plugin: calico

# Extra variables handed to kubespray; the entries must be nested under this key.
openpai_kubespray_extra_var:
  # Pod root ("pause") container image. The official example uses
  # gcr.azk8s.cn/google_containers, which the author found to be shut down,
  # so mirrorgooglecontainers is used instead.
  pod_infra_image_repo: "mirrorgooglecontainers/pause-{{ image_arch }}"
  # Image of the DNS autoscaler
  dnsautoscaler_image_repo: "mirrorgooglecontainers/cluster-proportional-autoscaler-{{ image_arch }}"
  # NOTE(review): the next four entries still use gcr.azk8s.cn, which is
  # reported above as shut down - verify they can be pulled.
  tiller_image_repo: "gcr.azk8s.cn/kubernetes-helm/tiller"
  registry_proxy_image_repo: "gcr.azk8s.cn/google_containers/kube-registry-proxy"
  metrics_server_image_repo: "gcr.azk8s.cn/google_containers/metrics-server-amd64"
  addon_resizer_image_repo: "gcr.azk8s.cn/google_containers/addon-resizer"
  # Kubernetes dashboard image
  dashboard_image_repo: "mirrorgooglecontainers/kubernetes-dashboard-{{ image_arch }}"

#######################################################################
#                     host daemon port setting                        #
#######################################################################
host_daemon_port_start: 40000
host_daemon_port_end: 65535
3.4:安装K8S
cd /root/pai/contrib/kubespray #开始安装Kubernetes
/bin/bash quick-start-kubespray.sh -v #-v查看完整日志