1.环境
GPU: `Tesla T4`
OS: `CentOS-7`
Kernel: `3.10.0-1160.105.1.el7.x86_64`
2. 驱动安装
./NVIDIA-Linux-x86_64-535.129.03.run
3.nvidia-docker 安装
- docker
```bash
[root@skywalking-0001 ~]# wget -O /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo
[root@skywalking-0001 ~]# yum install -y yum-utils device-mapper-persistent-data lvm2
[root@skywalking-0001 ~]# yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
[root@skywalking-0001 ~]# sed -i 's+download.docker.com+mirrors.aliyun.com/docker-ce+' /etc/yum.repos.d/docker-ce.repo
[root@skywalking-0001 ~]# yum makecache fast
[root@skywalking-0001 ~]# yum -y install docker-ce-19.03.9-3.el7
# 配置镜像加速器
[root@skywalking-0001 ~]# setenforce 0 && systemctl stop firewalld
[root@skywalking-0001 ~]# mkdir -p /etc/docker
[root@skywalking-0001 ~]# tee /etc/docker/daemon.json <<-'EOF'
{
  "registry-mirrors": ["https://6ysopw9t.mirror.aliyuncs.com"]
}
EOF
# 修改docker 工作目录
[root@skywalking-0001 ~]# mkdir /data/docker
# 在ExecStart=/usr/bin/dockerd -H fd:// --containerd=/run/containerd/containerd.sock 后追加 --data-root=/data/docker
[root@skywalking-0001 ~]# vim /usr/lib/systemd/system/docker.service
[root@skywalking-0001 ~]# systemctl daemon-reload && systemctl restart docker
```
- Nvidia-docker

```bash
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo
sudo yum install -y nvidia-container-toolkit
sudo yum install -y nvidia-docker2
sudo systemctl restart docker
```
- 验证

```bash
docker run --gpus all nvidia/cuda:10.0-base /bin/sh -c "while true; do echo hello world; sleep 1; done"
docker run --help | grep -i gpus
```
4.常见问题
- Unable to load: nvidia-installer ncurses v6 user interface

需要禁用nouveau

临时禁用:

```bash
modprobe -r nouveau
```

永久禁用:(需要重启)

```bash
mv /etc/modprobe.d/nvidia-installer-disable-nouveau.conf /etc/modprobe.d/blacklist.conf
mv /boot/initramfs-$(uname -r).img /boot/initramfs-3.10.0-1160.el7.x86_64.img.bak
dracut -v /boot/initramfs-$(uname -r).img $(uname -r)
reboot
```
- ERROR: Unable to find the development tool `cc` in your path

```bash
yum install gcc gcc-c++ -y
```
- ERROR: Unable to find the kernel source tree for the currently running kernel

```bash
yum install kernel-devel -y
./NVIDIA-Linux-x86_64-535.129.03.run --kernel-source-path=/usr/src/kernels/3.10.0-1160.105.1.el7.x86_64
```
- ERROR: Unable to load the kernel module 'nvidia.ko'

`uname -r` 和 `ls /usr/src/kernels`,查看是否一致

如果不一致:

```bash
yum update
reboot
./NVIDIA-Linux-x86_64-535.129.03.run
```