Don't forget: the hostfile-creation block in the script below (from `mkdir -p /fsx/home-$(whoami)/hostfiles` through the `for` loop) writes a DeepSpeed hostfile. You can export its path to a variable and then launch with `deepspeed --hostfile=$YOUR_EXPORTED_VARIABLE ...`.
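For example, a minimal launch sketch using the generated hostfile (`train.py` and `ds_config.json` are hypothetical placeholders; the flags assume the standard DeepSpeed launcher):

```bash
# Minimal sketch: launch multinode training with the generated hostfile.
# train.py and ds_config.json are hypothetical placeholders.
export HOSTFILE_PATH=/fsx/home-$(whoami)/hostfiles/hosts_$SLURM_JOBID

deepspeed --hostfile=$HOSTFILE_PATH \
          --master_addr $MASTER_ADDR \
          --master_port $MASTER_PORT \
          train.py --deepspeed --deepspeed_config ds_config.json
```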
```bash
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB     # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8    # 8 tasks per node (one per GPU)
#SBATCH --cpus-per-task=6      # Number of cores per task
#SBATCH --hint=nomultithread   # We get physical cores, not logical
#SBATCH --gres=gpu:8           # Number of GPUs
#SBATCH --output=%x_%j.out     # Set this to the dir where you want SLURM output to go
#SBATCH --error=%x_%j.out      # Set this to the dir where you want SLURM errors to go
#SBATCH --exclusive            # Turn off node sharing
#SBATCH --comment=elm

module load openmpi
module load cuda/11.4

# Build the DeepSpeed hostfile: one line per allocated node, 8 GPU slots each.
mkdir -p /fsx/home-$(whoami)/hostfiles
hostfile=/fsx/home-$(whoami)/hostfiles/hosts_$SLURM_JOBID
rm "$hostfile" &> /dev/null # for consecutive calls to this script in interactive jobs
for i in $(scontrol show hostnames "$SLURM_NODELIST"); do
    echo "$i slots=8" >> "$hostfile"
done

# Rendezvous info for distributed training: rank-0 host, port, node count.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# NCCL / EFA networking configuration.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
export NCCL_PROTO=simple
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib
export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0

# OpenMPI settings.
export OMPI_MCA_pml="^cm"
export OMPI_MCA_btl="tcp,self"
export OMPI_MCA_btl_tcp_if_exclude="lo,docker1"
export OMPI_MCA_plm_rsh_no_tree_spawn=1

# Caches for torch extensions and the HF cache; can be changed.
export TORCH_EXTENSIONS_DIR=/fsx/codeSeCodegen/extensions
export XDG_CACHE_HOME=/fsx/codeSeCodegen/hf_cache

# Activate the virtual environment; can be changed.
source /fsx/codeSeCodegen/codeSeEnv/bin/activate

srun --comment elm --pty bash -i
```
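For reference, the hostfile written by the loop above contains one `<hostname> slots=8` line per allocated node; with two nodes it would look something like this (hostnames are illustrative):

```
gpu-node-001 slots=8
gpu-node-002 slots=8
```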
main.sh was used for a sanity run. You can instead use main2.sh, which contains everything needed to run multinode DeepSpeed.
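A usage sketch, assuming the script above is saved as main2.sh (the log filename follows the `%x_%j.out` pattern set in the `#SBATCH` header):

```bash
sbatch main2.sh          # submit the job
squeue -u $USER          # confirm it is queued/running
tail -f elm_<jobid>.out  # follow the combined stdout/stderr log
```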