#!/bin/bash -x
#PBS -q gpu
#PBS -j oe
#PBS -l nodes=1:ppn=1
#
# PBS GPU job script.
#
# Directives above: submit to the "gpu" queue, merge stderr into stdout
# (-j oe), and request one node with one processor per node.
#
# The body activates the python3.12_cuda environment, puts the CUDA 12.2
# toolkit on the library/executable search paths, exposes only GPU 0 to the
# job, and prints the current GPU status with nvidia-smi.

# Run from the directory the job was submitted from. Fall back to the current
# directory when PBS_O_WORKDIR is unset (script started outside PBS), and stop
# instead of running the job in the wrong directory if the cd fails.
cd "${PBS_O_WORKDIR:-.}" || exit 1

# your script ###
# NOTE(review): presumably a conda environment named python3.12_cuda -- confirm.
source activate python3.12_cuda

# Prepend the CUDA libraries. The ${VAR:+...} form adds the ':' separator only
# when LD_LIBRARY_PATH already has a value; a trailing ':' would create an
# empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PATH="${PATH}:/usr/local/cuda-12.2/bin"

# Expose only GPU 0 to CUDA programs started by this job.
export CUDA_VISIBLE_DEVICES=0

# Print driver/GPU status (utilisation, memory, running processes) to the job log.
nvidia-smi
运行 GPU 任务需要 CUDA 软件;使用 nvidia-smi 可以查看各显卡当前的负载。
比如,我们的 GPU 节点有两块显卡:0 号为 RTX 4090(24G 显存),1 号为 RTX 4090(24G 显存),其中 0 号显卡的负载(GPU-Util)为 22%。
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10 Driver Version: 535.86.10 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4090 On | 00000000:65:00.0 Off | Off |
| 32% 30C P2 60W / 450W | 831MiB / 24564MiB | 22% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce RTX 4090 On | 00000000:98:00.0 Off | Off |
| 32% 22C P8 20W / 450W | 3MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 5066 G /usr/bin/X 9MiB |
| 0 N/A N/A 5119 G /usr/bin/gnome-shell 12MiB |
| 0 N/A N/A 7984 C python 396MiB |
| 0 N/A N/A 32633 C python 390MiB |
+---------------------------------------------------------------------------------------+