- PKG_MEAM
- PKG_REAXFF
- PKG_GPU
- PKG_VORONOI
- DOWNLOAD_VORO=no
- VORO_INCLUDE_DIR=${INSTALL_PREFIX}/include/voro++
- VORO_LIBRARY=${INSTALL_PREFIX}/lib/libvoro++.a
- PKG_KOKKOS
- Kokkos_ENABLE_CUDA
- Kokkos_ARCH_VOLTA70
- Kokkos_ENABLE_CUDA_LAMBDA
- 部分步骤依赖于网络代理
- 需要执行 `unset CMAKE_PREFIX_PATH` 跳过conda提供的cuda环境
- 建议更改 `/tmp` 位置以避免临时文件过大
- voro++-0.4.6只提供了静态文件安装,需要修改config.mk文件支持动态库导出,这对于deepmd是必须的
- `export TORCH_CUDA_ARCH_LIST="7.0"` 手动指定目标架构,只需要编译支持V100的目标代码,可以加速编译
- `lmp -k on g 1 -sf kk` 建议使用kokkos运行
- 需清理makefile产生的中间文件和最终二进制来避免cmake构建出错
- lammps (20220623)
- deepmd (https://github.com/hztttt/deepmd-kit.git)(spin_virial f4fec1d)
- libtorch-cxx11-abi-shared-with-deps-2.5.1+cu121.zip
- voro++-0.4.6.tar.gz
- 系统 glibc>=2.28 (tensorflow使用cxx11abi,所以pytorch也需要cxx11abi)
- cuda<=12.2 (本分区驱动最大支持cuda版本)
- gcc>=9
- openmpi
- cudnn
- nccl
NVIDIA container image for PyTorch, release 23.07
该环境内包含
- Ubuntu 22.04
- NVIDIA CUDA® 12.1.1
- NVIDIA cuDNN 8.9.3
- NVIDIA NCCL 2.18.3
- OpenMPI 4.1.4+
包含安装编译需要的二进制依赖
- apptainer 通过 environment modules 导入
- pt_23.07.tar 通过本地上传至集群
- 00_image_create.sh 由于所需内存较大,所以需要计算节点执行
docker pull nvcr.io/nvidia/pytorch:23.07-py3
docker save -o pt_23.07.tar nvcr.io/nvidia/pytorch:23.07-py3
scp -P2222 pt_23.07.tar scv6266@[email protected]:~/run/dev260127
sbatch 00_image_create.sh
- files/bashrc 替换容器内bashrc用于conda
- files/condarc 替换容器内conda源
- 01_env_create.sh
- pkgs/torch-2.5.1+cu121-cp310-cp310-linux_x86_64.whl (可选项,可以提前下好缓存)
ALL_PREFIX=/data/run01/$USER/dev260127
module load apptainer/1.2.4
apptainer -v exec \
--fakeroot \
-B /data -B /data01 -B /data02 \
-B ${ALL_PREFIX}/files/bashrc:/root/.bashrc \
-B ${ALL_PREFIX}/files/condarc:/root/.condarc \
-B ${ALL_PREFIX}/tmp:/tmp \
${ALL_PREFIX}/sif/dev260128.sif \
bash ${ALL_PREFIX}/01_env_create.sh
- pkgs/libtorch-cxx11-abi-shared-with-deps-2.5.1+cu121.zip torch2.5.1不具备cxx11abi,需要使用libtorch,请将此包解压至libtorch
- files/bashrc 将libtorch添加到环境变量中
- http代理, pip安装deepmd时需要联网
- 02_1_build_deepmd.sh py接口和c++接口都进行构建
- 02_build_deepmd.sh deepmd会尝试寻找cuda运行时,因此推荐在计算节点编译
sbatch 02_build_deepmd.sh
- pkgs/libtorch-cxx11-abi-shared-with-deps-2.5.1+cu121.zip
- pkgs/voro++-0.4.6.tar.gz 解压至voro++-0.4.6/使用
- files/bashrc 提供deepmd-kit的c++接口库, voro++的库
- files/nvcc_wrapper 提供编译接口,需要手动修改
  host_compiler="g++"
  default_arch="sm_70"
  nvcc_compiler=nvcc
- files/VORONOI_config.mk 需要手动修改CFLAGS支持lammps动态库导出
  CFLAGS=-Wall -ansi -pedantic -O3 -fPIC
- 03_1_build_lammps.sh 修改lammps的cmake/CMakeLists.txt来使用built-in模式构建deepmd-kit扩展,只需要执行一次
- 03_2_build_lammps.sh lammps构建脚本
- 03_3_build_voro.sh voro++构建脚本,由此拉起apptainer
- 03_4_build_voro.sh voro++构建脚本,此处实际执行,会检查是否包含fPIC符号
- 03_build_lammps.sh 提交lammps构建脚本
bash 03_3_build_voro.sh
sbatch 03_build_lammps.sh
- files/bashrc 允许root用户执行
- install deepmd/voro++/lammps的二进制
- libtorch
- python_env python环境
- 10_1_run.sh 计算脚本
- 10_run.sh 提交计算脚本
sbatch 10_run.sh
#!/bin/bash
# Load the shell environment from ~/.bashrc.
source ~/.bashrc
# User-defined variables: installation root and the conda environment
# directory located directly beneath it.
ALL_PREFIX=/data/run01/$USER/dev260127
CONDA_ENV_NAME=python_env
# Activate the conda environment by path; stop here if activation fails so
# the runs below never execute against the wrong environment.
conda activate "${ALL_PREFIX}/${CONDA_ENV_NAME}" || exit 1
# Smoke-test LAMMPS: print its help text on a single MPI rank.
mpirun -n 1 lmp -h
# Run the voronoi example. Abort if the example directory is missing,
# otherwise lmp would look for in.voronoi in whatever the current directory is.
cd "${ALL_PREFIX}/lammps_23Jun2022_NVT_sep/examples/voronoi" || exit 1
# -k on g 1 -sf kk: enable Kokkos with one GPU and apply the kk suffix.
mpirun -n 1 lmp -k on g 1 -sf kk -in in.voronoi
#!/bin/bash
#SBATCH --gpus=1
# Batch launcher: starts the Apptainer container with one GPU requested and
# runs 10_1_run.sh inside it.

# User-defined variables
ALL_PREFIX=/data/run01/$USER/dev260127
# Load the apptainer module
module load apptainer/1.2.4
# Start the container from the home directory; abort if it is unreachable.
cd ~ || exit 1
# Bind mounts: the data volumes, the project bashrc as root's bashrc inside
# the container, and a project-local tmp directory as /tmp.
# All ${ALL_PREFIX} expansions are quoted so the paths are never word-split.
apptainer -v exec \
--nv --fakeroot \
-B /data -B /data01 -B /data02 \
-B "${ALL_PREFIX}/files/bashrc:/root/.bashrc" \
-B "${ALL_PREFIX}/tmp:/tmp" \
"${ALL_PREFIX}/sif/dev260128.sif" \
bash "${ALL_PREFIX}/10_1_run.sh"