#!/bin/bash --login
# @file utils.sh
# @brief `ezpz` helper script with functions to make life ez.
# @description
# This file provides multiple helper functions, all prefixed with "ezpz_"
# - `ezpz_setup_job`
# - `ezpz_setup_python`
# - ...
#
if [[ "$(command -v setopt)" ]]; then
setopt aliases
elif [[ "$(command -v shopt)" ]]; then
shopt -s expand_aliases
fi
RESET="\e[0m"
BLACK="\e[1;30m"
RED="\e[1;31m"
GREEN="\e[1;32m"
YELLOW="\e[1;33m"
BLUE="\e[1;34m"
MAGENTA="\e[1;35m"
CYAN="\e[1;36m"
# WHITE="\e[1;37m"
# BACKGROUND_BLACK="\e[1;40m"
# BACKGROUND_RED="\e[1;41m"
# BACKGROUND_GREEN="\e[1;42m"
# BACKGROUND_YELLOW="\e[1;43m"
# BACKGROUND_BLUE="\e[1;44m"
# BACKGROUND_MAGENTA="\e[1;45m"
# BACKGROUND_CYAN="\e[1;46m"
# BACKGROUND_WHITE="\e[1;47m"
# BRIGHT_BLACK="\e[1;90m"
# BRIGHT_RED="\e[1;91m"
# BRIGHT_GREEN="\e[1;92m"
# BRIGHT_YELLOW="\e[1;93m"
BRIGHT_BLUE="\e[1;94m"
# BRIGHT_MAGENTA="\e[1;95m"
# BRIGHT_CYAN="\e[1;96m"
# BRIGHT_WHITE="\e[1;97m"
# BACKGROUND_BRIGHT_BLACK="\e[1;100m"
# BACKGROUND_BRIGHT_RED="\e[1;101m"
# BACKGROUND_BRIGHT_GREEN="\e[1;102m"
# BACKGROUND_BRIGHT_YELLOW="\e[1;103m"
# BACKGROUND_BRIGHT_BLUE="\e[1;104m"
# BACKGROUND_BRIGHT_MAGENTA="\e[1;105m"
# BACKGROUND_BRIGHT_CYAN="\e[1;106m"
# BACKGROUND_BRIGHT_WHITE="\e[1;107m"
HOSTNAME=$(hostname)
PBS_ENV_FILE="${HOME}/.pbsenv"
SLURM_ENV_FILE="${HOME}/.slurmenv"
# HEADER_LINE="┌─────────────────────────────────────────────────────────────────────┐\n"
# FOOTER_LINE="└─────────────────────────────────────────────────────────────────────┘\n"
# HEADER="\n"
# FOOTER="\n"
###############################################################################
# Check if running in DEBUG=1 mode.
# - If so, this will print each command before it is run and exit if any of
# them return a nonzero exit status.
###############################################################################
if [[ -n "${DEBUG-}" ]]; then # to use: `DEBUG=1 bash train_llama_alcf.sh`
printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!"
# NOTE: `ezpz_get_shell_name` is defined later in this file, so detect the shell inline here
_shell_name="${SHELL##*/}"
if [[ "${_shell_name}" == "zsh" ]]; then
echo "No debug"
# set -x # o pipefail
else
set -euxo pipefail
fi
fi
###############################################################################
# Print (but DO NOT EXECUTE !!) each command that would be run.
#
# Enable with: NOOP=1 PBS_O_WORKDIR=$(pwd) bash train_llama_alcf.sh
###############################################################################
if [[ -v NOOP ]]; then # to use: `NOOP=1 bash train_llama_alcf.sh`
echo "Run NOOP mode"
set -o noexec # same as set -n
fi
# @description Get name of shell.
# Strip off `/bin/` substr from "${SHELL}" env var and return this string.
#
# @example
# $ echo "${SHELL}"
# /bin/zsh
# $ ezpz_get_shell_name
# zsh
ezpz_get_shell_name() {
echo "${SHELL}" | sed -e "s/\/bin\///g"
}
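# @description Return a timestamp of the form YYYY-mm-dd-HHMMSS.
#
# @example
# Illustrative (actual output depends on the current time):
# $ ezpz_get_tstamp
# 2025-02-03-202700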
ezpz_get_tstamp() {
printf "%s" "$(date "+%Y-%m-%d-%H%M%S")"
}
####################
# ezpz_qsme_running
#
# prints 1 line for each running job owned by $USER
#
# each line of the form:
#
# <jobid> <elapsed_time> <node0> <node1> <node2> ...
####################
ezpz_qsme_running() {
qstat -u "${USER}" -n1rw | sed -e "s/\/0\*208/\ /g" | tr "+|." "\ " | awk '{a = ""; for (i = 13 ; i <= NF ; i++) a = a " " $i; print $1 a}' | grep -vE "aurora-pbs|Req|Job|\-\-"
}
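# @example
# Illustrative output (jobid / hostnames below are hypothetical):
# $ ezpz_qsme_running
# 2126584 17:15:43 x4712c5s2b0n0 x4712c6s0b0n0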
###############################
# ezpz_get_jobid_from_hostname
#
# Identify jobid containing "$(hostname)" from all active (running) jobs owned
# by the $USER.
#
# Example:
# --------
# Look for `$(hostname)` in output from `ezpz_qsme_running`, and print the first
# column
#
# | jobid | host0 | host1 | host2 |
# |:---------:|:--------:|:---------:|:--------:|
# | jobid0 | host00 | host10 | host20 |
# | jobid1 | host01 | host11 | host21 |
# | jobid2 | host02 | host12 | host22 |
#
###############################
ezpz_get_jobid_from_hostname() {
# jobid=$(ezpz_qsme_running | sed 's/\/.*\ /\ /g' | sed 's/\/.*//g' | grep "$(hostname | sed 's/\..*//g')" | awk '{print $1}')
jobid=$(ezpz_qsme_running | grep "^[0-9]" | grep "$(hostname)" | awk '{print $1}')
echo "${jobid}"
}
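# @example
# Illustrative (jobid below is hypothetical):
# $ ezpz_get_jobid_from_hostname
# 2126584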
#######################
# Unset all `PBS_*` environment
# variables
# (note: `PBS_O_WORKDIR` is
# preserved and re-exported)
#######################
ezpz_reset_pbs_vars() {
wd="${PBS_O_WORKDIR:-${WORKING_DIR:-$(pwd)}}"
vars=($(printenv | grep -iE "^PBS" | tr "=" " " | awk '{print $1}'))
for v in "${vars[@]}"; do echo "Unsetting $v" && unset -v "${v}"; done
export PBS_O_WORKDIR="${wd}"
}
######################################
# ezpz_get_pbs_nodefile_from_hostname
#
# Return path to PBS_NODEFILE corresponding to the jobid that was identified as
# containing the (currently active, determined by `$(hostname)`) host.
#
# Example:
# --------
# Look for $(hostname) in output from `ezpz_qsme_running`
#
# | jobid | host0 | host1 | host2 |
# |:---------:|:--------:|:---------:|:--------:|
# | jobid0 | host00 | host10 | host20 |
# | jobid1 | host01 | host11 | host21 |
# | jobid2 | host02 | host12 | host22 |
#
# then, once we've identified the `jobid` containing `$(hostname)`, we can use
# that to reconstruct the path to our jobs' `PBS_NODEFILE`, which is located at
#
# ```bash
# /var/spool/pbs/aux/${jobid}
# ```
######################################
ezpz_get_pbs_nodefile_from_hostname() {
jobid=$(ezpz_get_jobid_from_hostname)
if [[ -n "${jobid}" ]]; then
match=$(/bin/ls /var/spool/pbs/aux/ | grep "${jobid}")
hostfile="/var/spool/pbs/aux/${match}"
if [[ -f "${hostfile}" ]]; then
export PBS_NODEFILE="${hostfile}"
_pbs_jobid=$(echo "${PBS_NODEFILE}" | tr "/" " " | awk '{print $NF}')
export PBS_JOBID="${_pbs_jobid}"
echo "${hostfile}"
fi
fi
}
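# @example
# Illustrative (the exact filename depends on the running job's id):
# $ ezpz_get_pbs_nodefile_from_hostname
# /var/spool/pbs/aux/<jobid>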
ezpz_save_dotenv() {
if [[ "$#" -ne 1 ]]; then
estr="[error]"
# echo "Expected exactly one argument, specifying outputdir. Received $#"
printf "%s Expected one argument (outdir). Received: %s" "$(printRed "${estr}")" "$#"
else
outdir="$1"
mkdir -p "${outdir}"
module list
dotenv_file="${outdir}/.env"
echo "Saving environment to ${dotenv_file}"
printenv | grep -v "LS_COLORS" >"${dotenv_file}"
export DOTENV_FILE="${dotenv_file}"
fi
}
######################################################################
# ezpz_get_machine_name: Return current machine name, as lowercase string
######################################################################
ezpz_get_machine_name() {
if [[ $(hostname) == x4* || $(hostname) == aurora* ]]; then
machine="aurora"
elif [[ $(hostname) == x1* || $(hostname) == uan* ]]; then
machine="sunspot"
elif [[ $(hostname) == sophia* ]]; then
machine="sophia"
elif [[ $(hostname) == x3* || $(hostname) == polaris* ]]; then
if [[ "${PBS_O_HOST:-}" == sirius* ]]; then
machine="sirius"
else
machine="polaris"
fi
elif [[ $(hostname) == frontier* ]]; then
machine="frontier"
elif [[ $(hostname) == nid* ]]; then
machine="perlmutter"
else
machine=$(hostname)
fi
echo "${machine}"
}
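# @example
# e.g., when run from an Aurora login or compute node:
# $ ezpz_get_machine_name
# aurora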
ezpz_check_and_kill_if_running() {
# kill $(ps aux | grep -E "$USER.+(mpi|main.py)" | grep -v grep | awk '{print $2}')
RUNNING_PIDS=$(lsof -i:29500 -Fp | head -n 1 | sed 's/^p//')
if [[ -n "${RUNNING_PIDS}" ]]; then
echo "Caught ${RUNNING_PIDS}" && kill "${RUNNING_PIDS}"
else
echo "Not currently running. Continuing!"
fi
}
#################################
# ezpz_get_slurm_running_jobid
# Returns the SLURM_JOBID(s) of any currently-running Slurm jobs owned by ${USER}
#################################
ezpz_get_slurm_running_jobid() {
if [[ -n $(command -v sacct) ]]; then
jobid=$(sacct --format=JobID,NodeList%-30,state%20 --user "${USER}" -s R | grep -Ev "\.int|\.ext|^JobID|^---" | awk '{print $1}')
echo "${jobid}"
fi
}
ezpz_get_slurm_running_nodelist() {
if [[ -n $(command -v sacct) ]]; then
slurm_nodelist=$(sacct --format=JobID,NodeList%-30,state%20 --user "${USER}" -s R | grep -Ev "\.int|\.ext|^JobID|^---" | awk '{print $2}')
echo "${slurm_nodelist}"
fi
}
ezpz_make_slurm_nodefile() {
if [[ "$#" == 1 ]]; then
outfile="$1"
else
outfile="nodefile"
fi
snodelist="${SLURM_NODELIST:-$(ezpz_get_slurm_running_nodelist)}"
if [[ -n $(command -v scontrol) ]]; then
scontrol show hostname "${snodelist}" >"${outfile}"
echo "${outfile}"
fi
}
ezpz_setup_srun() {
# if [[ $(hostname) == login* || $(hostname) == nid* ]]; then
export NHOSTS="${SLURM_NNODES:-1}"
export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}"
export HOSTFILE="${HOSTFILE:-$(ezpz_make_slurm_nodefile "$@")}"
export NGPUS="$((NHOSTS * NGPU_PER_HOST))"
export SRUN_EXEC="srun -l -u --verbose -N${SLURM_NNODES} -n$((SLURM_NNODES * SLURM_GPUS_ON_NODE))"
# export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose"
# else
# echo "Skipping ezpz_setup_srun() on $(hostname)"
# fi
}
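# Example usage (a minimal sketch; assumes an active Slurm allocation, and
# `train.py` here is a hypothetical placeholder for your own script):
#
# $ ezpz_setup_srun
# $ ${SRUN_EXEC} python3 train.py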
############################################################################
# save_ds_env
#
# Save important environment variables to .deepspeed_env, which will be
# forwarded to ALL ranks with DeepSpeed
############################################################################
ezpz_save_ds_env() {
echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env"
{
echo "PATH=${PATH}"
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
echo "http_proxy=${http_proxy:-}"
echo "https_proxy=${https_proxy:-}"
echo "CFLAGS=${CFLAGS}"
echo "PYTHONUSERBASE=$PYTHONUSERBASE"
} >.deepspeed_env
}
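# The resulting .deepspeed_env is a flat KEY=VALUE file, e.g. (values below
# are hypothetical):
#
# PATH=/usr/bin:/bin
# LD_LIBRARY_PATH=/usr/lib64
# ...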
###########################
# Setup conda on Frontier
###########################
ezpz_setup_conda_frontier() {
if [[ -z "${CONDA_PREFIX:-}" ]]; then
module load PrgEnv-gnu/8.5.0
module load craype-accel-amd-gfx90a
module load rocm
micromamba activate /lustre/orion/csc613/scratch/foremans/envs/micromamba/py3.10-torch2.2-rocm5.7
# module load miniforge3/23.11.0-0
# eval "$(/autofs/nccs-svm1_sw/frontier/miniforge3/23.11.0-0/bin/conda "shell.$(echo $SHELL | tr '/' ' ' | awk '{print $NF}')" hook)"
# conda activate pytorch2.2-rocm5.7
fi
}
###########################
# Setup conda on Sunspot
###########################
ezpz_setup_conda_sunspot() {
###### check if CONDA_PREFIX non-empty ################
if [[ -z "${CONDA_PREFIX:-}" ]]; then
module use /opt/aurora/24.180.1/modulefiles
module load frameworks/2024.2.1_u1
# module use /soft/preview-modulefiles/24.086.0
# module load frameworks/2024.04.15.002.lua
# module use /soft/preview-modulefiles/24.086.0 ; module load frameworks/2024.04.15.002.lua
# source "${WORKING_DIR}/ALCF/sunspot-env-2024-q2.sh"
fi
}
###########################
# Setup conda on Aurora
###########################
ezpz_setup_conda_aurora() {
if [[ -z "${CONDA_PREFIX:-}" ]]; then
# NOTE: Updated 2024-10-08 [@saforem2]
module load frameworks
module load mpich
else
printf "Caught CONDA_PREFIX=%s from environment, using this!" "${CONDA_PREFIX}"
fi
}
########################
# Setup conda on Sirius
########################
ezpz_setup_conda_sirius() {
if [[ -z "${CONDA_PREFIX:-}" && -z "${VIRTUAL_ENV-}" ]]; then
export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba
shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}')
eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell "${shell_name}")"
micromamba activate 2024-04-23
else
echo "Found existing python at: $(which python3)"
fi
}
# ########################
# # Setup conda on Sophia
# ########################
ezpz_setup_conda_sophia() {
if [[ -z "${CONDA_PREFIX:-}" ]]; then
module load conda
conda activate base
else
echo "Caught CONDA_PREFIX=${CONDA_PREFIX}"
fi
}
########################
# Setup conda on Polaris
########################
ezpz_setup_conda_polaris() {
# unset MPICH_GPU_SUPPORT_ENABLED
###### check if CONDA_PREFIX non-empty ################
if [[ -z "${CONDA_PREFIX:-}" ]]; then
# if so, load the default conda/2024-04-29
# module and activate base environment
module use /soft/modulefiles
module load conda
conda activate base
else
echo "Caught CONDA_PREFIX=${CONDA_PREFIX}"
fi
}
ezpz_setup_conda() {
# machine_name=$(ezpz_get_machine_name)
# if [[ "${machine_name}" == "aurora" ]]; then
machine_name=$(ezpz_get_machine_name)
# echo "machine name: ${machine_name}"
if [[ "${machine_name}" == "aurora" ]]; then
ezpz_setup_conda_aurora
elif [[ "${machine_name}" == "sophia" ]]; then
ezpz_setup_conda_sophia
elif [[ "${machine_name}" == "sunspot" ]]; then
ezpz_setup_conda_sunspot
elif [[ "${machine_name}" == "polaris" ]]; then
if [[ "${PBS_O_HOST:-}" == sirius* ]]; then
ezpz_setup_conda_sirius
else
ezpz_setup_conda_polaris
fi
elif [[ $(hostname) == frontier* ]]; then
ezpz_setup_conda_frontier
elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then
echo "Running on Perlmutter !!"
module load pytorch
source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate"
else # ------------------------------------- [Unknown] -------------------
echo "Unknown hostname $(hostname)"
exit 1
fi
# # ----- [Perlmutter @ NERSC] -------------------------------------
}
########################
# setup_venv_from_conda
#
# Build (if necessary) a virtual environment
# on top of the active conda and
# activate it.
# ######################
ezpz_setup_venv_from_conda() {
if [[ -z "${CONDA_PREFIX:-}" ]]; then
echo "!! No CONDA_PREFIX var found." # Exiting."
# exit 1
else
echo "Found conda at: ${CONDA_PREFIX}"
CONDA_NAME=$(echo "${CONDA_PREFIX}" | tr '\/' '\t' | sed -E 's/mconda3|\/base//g' | awk '{print $NF}')
export CONDA_NAME
if [[ -z "${VIRTUAL_ENV:-}" ]]; then
echo "No VIRTUAL_ENV found in environment!"
echo " - Trying to setup from ${CONDA_PREFIX}"
export VENV_DIR="${WORKING_DIR}/venvs/${CONDA_NAME}"
echo " - Using VENV_DIR=${VENV_DIR}"
if [[ ! -f "${VENV_DIR}/bin/activate" ]]; then
printf "\n - Creating a new virtual env on top of %s in %s\n" "$(printBlue "${CONDA_NAME}")" "$(printGreen "${VENV_DIR}")"
mkdir -p "${VENV_DIR}"
python3 -m venv "${VENV_DIR}" --system-site-packages
source "${VENV_DIR}/bin/activate" || exit
elif [[ -f "${VENV_DIR}/bin/activate" ]]; then
echo " - Found existing venv, activating from $(printBlue "${VENV_DIR}")"
source "${VENV_DIR}/bin/activate" || exit
else
printf "\n [!! %s]: Unable to locate %s\n" "$(printRed "ERROR")" "$(printMagenta "${VENV_DIR}/bin/activate")"
fi
fi
fi
}
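# Example usage (a minimal sketch; assumes a conda env is already active and
# that ${WORKING_DIR} has been set, e.g. by `utils_main`):
#
# $ ezpz_setup_venv_from_conda
# $ which python3
# ${WORKING_DIR}/venvs/${CONDA_NAME}/bin/python3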
##############################################################################
# `setup_python`:
#
# 1. Setup `conda`
# - if `conda` nonempty, and `venv` empty, use `conda` to setup `venv`.
# - if `venv` nonempty, and `conda` empty, use the `venv` as-is
# - if `venv` nonempty and `conda` nonempty, use these
# - if `conda` empty and `venv` empty:
# - if `hostname == x4*`, we're on Aurora
# - if `hostname == x1*`, we're on Sunspot
# - if `hostname == x3*`, we're on Polaris
# - if `hostname == nid*`, we're on Perlmutter
# - otherwise, you're on your own
#
# 2. Activate (creating, if necessary) a `venv` on top of `base` conda
# - use the $CONDA_PREFIX to create a venv in
# `${WORKING_DIR}/venvs/${CONDA_NAME}`
# - activate and use this
#
# 3. Print info about which python we're using
##############################################################################
ezpz_setup_python() {
virtual_env="${VIRTUAL_ENV:-}"
conda_prefix="${CONDA_PREFIX:-}"
if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then
echo "No conda_prefix OR virtual_env found in environment..."
echo "Setting up conda..."
ezpz_setup_conda
elif [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then
echo "No virtual environment found."
echo "Using conda from: ${conda_prefix}"
echo "Setting up venv from ${CONDA_PROMPT_MODIFIER:-}"
ezpz_setup_venv_from_conda
elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then
echo "No conda found."
echo "Using virtual_env from: ${virtual_env}"
elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then
echo "Using virtual_env: ${virtual_env} on top of conda from: ${conda_prefix}"
else
echo "Unable to setup python environment. Exiting"
exit 1
fi
if [[ -z "${virtual_env}" ]]; then
ezpz_setup_venv_from_conda
fi
printf "[python] Using ${MAGENTA}%s${RESET}\n" "$(which python3)"
python_exec=$(which python3)
export PYTHON_EXEC="${python_exec}"
}
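# Example usage (a minimal sketch; the printed path is illustrative):
#
# $ source ezpz/bin/utils.sh
# $ ezpz_setup_python
# [python] Using ${WORKING_DIR}/venvs/${CONDA_NAME}/bin/python3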
whereAmI() {
python3 -c 'import os; print(os.getcwd())'
}
join_by() {
local d=${1-} f=${2-}
if shift 2; then
printf %s "$f" "${@/#/$d}"
fi
}
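# @example
# $ join_by ', ' a b c
# a, b, c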
ezpz_parse_hostfile() {
if [[ "$#" != 1 ]]; then
echo "Expected exactly one argument: hostfile"
echo "Received: $#"
fi
hf="$1"
num_hosts=$(ezpz_get_num_hosts "${hf}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus=$((num_hosts * num_gpus_per_host))
echo "${num_hosts}" "${num_gpus_per_host}" "${num_gpus}"
}
ezpz_get_dist_launch_cmd() {
if [[ "$#" != 1 ]]; then
echo "Expected exactly one argument: hostfile"
echo "Received: $#"
fi
hf="$1"
mn=$(ezpz_get_machine_name)
num_hosts=$(ezpz_get_num_hosts "${hf}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus="$((num_hosts * num_gpus_per_host))"
num_cores_per_host=$(getconf _NPROCESSORS_ONLN)
num_cpus_per_host=$((num_cores_per_host / 2))
depth=$((num_cpus_per_host / num_gpus_per_host))
if [[ "${mn}" == "sophia" ]]; then
dist_launch_cmd="mpirun -n ${num_gpus} -N ${num_gpus_per_host} --hostfile ${hostfile} -x PATH -x LD_LIBRARY_PATH"
else
dist_launch_cmd="mpiexec --verbose --envall -n ${num_gpus} -ppn ${num_gpus_per_host} --hostfile ${hostfile} --cpu-bind depth -d ${depth}"
fi
if [[ "${mn}" == "aurora" ]]; then
dist_launch_cmd="${dist_launch_cmd} --no-vni"
fi
echo "${dist_launch_cmd}"
}
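# @example
# Illustrative (flags and counts vary by machine; shown for a hypothetical
# PBS hostfile with 2 hosts and 12 GPUs per host):
# $ ezpz_get_dist_launch_cmd nodefile
# mpiexec --verbose --envall -n 24 -ppn 12 --hostfile nodefile --cpu-bind depth -d 8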
ezpz_save_pbs_env() {
printf "\n[${BLUE}%s${RESET}]\n" "ezpz_save_pbs_env"
if [[ "$#" == 0 ]]; then
hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
elif [[ "$#" == 1 ]]; then
printf " • Caught ${BLUE}%s${RESET} arguments\n" "$#"
hostfile="$1"
elif [[ "$#" == 2 ]]; then
printf " • Caught ${BLUE}%s${RESET} arguments\n" "$#"
hostfile="$1"
jobenv_file="$2"
else
hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
fi
if [[ -n $(printenv | grep PBS_JOBID) ]]; then
PBS_VARS=$(env | grep PBS)
echo "${PBS_VARS[*]}" >"${jobenv_file}"
if [[ "${hostfile}" != "${PBS_NODEFILE:-}" ]]; then
printf "\n"
printf " • Caught ${RED}%s${RESET} != ${RED}%s${RESET} \n" "hostfile" "PBS_NODEFILE"
printf " • hostfile: ${RED}%s${RESET}\n" "${hostfile}"
printf " • PBS_NODEFILE: ${RED}%s${RESET}\n" "${PBS_NODEFILE}"
printf "\n"
fi
printf " • Using:\n"
printf " • hostfile: ${BLUE}%s${RESET}\n" "${hostfile}"
printf " • jobenv_file: ${BLUE}%s${RESET}\n" "${jobenv_file}"
sed -i 's/^PBS/export\ PBS/g' "${jobenv_file}"
sed -i 's/^HOSTFILE/export\ HOSTFILE/g' "${jobenv_file}"
# dist_env=$(ezpz_parse_hostfile "${hostfile}")
num_hosts=$(ezpz_get_num_hosts "${hostfile}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus="$((num_hosts * num_gpus_per_host))"
num_cores_per_host=$(getconf _NPROCESSORS_ONLN)
num_cpus_per_host=$((num_cores_per_host / 2))
depth=$((num_cpus_per_host / num_gpus_per_host))
dist_launch_cmd=$(ezpz_get_dist_launch_cmd "${hostfile}")
# dist_launch_cmd="mpiexec --verbose --envall -n ${num_gpus} -ppn ${num_gpus_per_host} --hostfile ${hostfile} --cpu-bind depth -d ${depth}"
# dist_env=()
# dist_env+=($(ezpz_parse_hostfile "$(ezpz_get_pbs_nodefile_from_hostname)"))
# num_hosts="${dist_env[1]}"
# num_gpus_per_host="${dist_env[2]}"
# num_gpus="${dist_env[3]}"
# dist_launch_cmd=$(ezpz_get_dist_launch_cmd "${hostfile}")
# num_hosts=$(ezpz_get_num_hosts "${hostfile}")
# num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
# num_gpus=$(( num_hosts * num_gpus_per_host ))
printf " to calculate:\n"
printf " • num_hosts: ${BLUE}%s${RESET}\n" "${num_hosts}"
printf " • num_cores_per_host: ${BLUE}%s${RESET}\n" "${num_cores_per_host}"
printf " • num_cpus_per_host: ${BLUE}%s${RESET}\n" "${num_cpus_per_host}"
printf " • num_gpus_per_host: ${BLUE}%s${RESET}\n" "${num_gpus_per_host}"
printf " • depth: ${BLUE}%s${RESET}\n" "${depth}"
printf " • num_gpus: ${BLUE}%s${RESET}\n" "${num_gpus}"
# getNumGPUs
# NGPUS="$(( NHOSTS * NGPU_PER_HOST ))"
# export DIST_LAUNCH="mpiexec --verbose --envall -n ${num_gpus} -ppn ${num_gpus_per_host} --hostfile ${hostfile} --cpu-bind depth -d 16"
export DIST_LAUNCH="${dist_launch_cmd}"
export ezlaunch="${DIST_LAUNCH}"
# printf "Caught ${BLUE}HOSTFILE${RESET} != ${BLUE}PBS_NODEFILE${RESET} \n"
printf " • DIST_LAUNCH: ${BLUE}%s${RESET}\n" "${DIST_LAUNCH}"
fi
export HOSTFILE="${hostfile}"
export JOBENV_FILE="${jobenv_file}"
printf " • Setting:\n"
printf " • HOSTFILE: ${BLUE}%s${RESET}\n" "${HOSTFILE}"
printf " • JOBENV_FILE: ${BLUE}%s${RESET}\n\n" "${JOBENV_FILE}"
}
ezpz_save_slurm_env() {
printf "\n[${BLUE}%s${RESET}]\n" "ezpz_save_slurm_env"
if [[ "$#" == 0 ]]; then
# hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
hostfile="${HOSTFILE:-$(ezpz_make_slurm_nodefile)}"
jobenv_file="${JOBENV_FILE:-${SLURM_ENV_FILE}}"
elif [[ "$#" == 1 ]]; then
printf " • Caught ${BLUE}%s${RESET} arguments\n" "$#"
hostfile="$1"
jobenv_file="${JOBENV_FILE:-${SLURM_ENV_FILE}}"
elif [[ "$#" == 2 ]]; then
printf " • Caught ${BLUE}%s${RESET} arguments\n" "$#"
hostfile="$1"
jobenv_file="$2"
else
hostfile="${HOSTFILE:-$(ezpz_make_slurm_nodefile)}"
jobenv_file="${JOBENV_FILE:-${SLURM_ENV_FILE}}"
fi
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
SLURM_VARS=$(env | grep SLU)
echo "${SLURM_VARS[*]}" >"${jobenv_file}"
# if [[ "${hostfile}" != "${SLURM_NODEFILE:-}" ]]; then
# printf "\n"
# printf " • Caught ${RED}%s${RESET} != ${RED}%s${RESET} \n" "hostfile" "SLURM_NODEFILE"
# printf " • hostfile: ${RED}%s${RESET}\n" "${hostfile}"
# printf " • SLURM_NODEFILE: ${RED}%s${RESET}\n" "${SLURM_NODEFILE}"
# printf "\n"
# fi
printf " • Using:\n"
printf " • hostfile: ${BLUE}%s${RESET}\n" "${hostfile}"
printf " • jobenv_file: ${BLUE}%s${RESET}\n" "${jobenv_file}"
sed -i 's/^SLURM/export\ SLURM/g' "${jobenv_file}"
sed -i 's/^HOSTFILE/export\ HOSTFILE/g' "${jobenv_file}"
num_hosts=$(ezpz_get_num_hosts "${hostfile}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus="$((num_hosts * num_gpus_per_host))"
# dist_env=$(ezpz_parse_hostfile "${hostfile}")
# dist_env=()
# dist_env+=("$(ezpz_parse_hostfile "$(ezpz_make_slurm_nodefile)")")
# num_hosts="${dist_env[1]}"
# num_gpus_per_host="${dist_env[2]}"
# num_gpus="${dist_env[3]}"
# dist_launch_cmd="srun -N ${num_hosts} -n ${num_gpus} -l -u --verbose"
dist_launch_cmd="srun -l -u --verbose -N${SLURM_NNODES} -n$((SLURM_NNODES * SLURM_GPUS_ON_NODE))"
printf " to calculate:\n"
printf " • num_hosts: ${BLUE}%s${RESET}\n" "${num_hosts}"
printf " • num_gpus_per_host: ${BLUE}%s${RESET}\n" "${num_gpus_per_host}"
printf " • num_gpus: ${BLUE}%s${RESET}\n" "${num_gpus}"
export DIST_LAUNCH="${dist_launch_cmd}"
export ezlaunch="${DIST_LAUNCH}"
printf " • DIST_LAUNCH: ${BLUE}%s${RESET}\n" "${DIST_LAUNCH}"
fi
export HOSTFILE="${hostfile}"
export JOBENV_FILE="${jobenv_file}"
printf " • Setting:\n"
printf " • HOSTFILE: ${BLUE}%s${RESET}\n" "${HOSTFILE}"
printf " • JOBENV_FILE: ${BLUE}%s${RESET}\n\n" "${JOBENV_FILE}"
}
ezpz_setup_host_slurm() {
printf "[${CYAN}%s${RESET}]\n" "ezpz_setup_host_slurm"
mn=$(ezpz_get_machine_name)
scheduler_type=$(ezpz_get_scheduler_type)
if [[ "${scheduler_type}" == "slurm" ]]; then
#########################################
# If no arguments passed ("$#" == 0):
#
# - `hostfile` assigned to the first non-empty variable from:
# 1. `HOSTFILE`
# 2. `PBS_NODEFILE`
# - `jobenv_file` assigned to the first non-empty variable from:
# 1. `JOBENV_FILE`
# 2. `PBS_ENV_FILE`
if [[ "$#" == 0 ]]; then
hostfile="${HOSTFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
printf " • Using hostfile: ${CYAN}%s${RESET}\n" "${hostfile}"
printf " • Found in environment:\n"
if [[ -n "${HOSTFILE:-}" ]]; then
printf " • HOSTFILE: ${CYAN}%s${RESET}\n" "${HOSTFILE}"
fi
# if [[ "${hostfile}" != "${PBS_NODEFILE}" ]]; then
elif [[ "$#" == 1 ]]; then
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
hostfile="$1"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
printf " • hostfile=${CYAN}%s${RESET}\n" "${hostfile}"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
printf " • hostfile=${CYAN}%s${RESET}\n" "${hostfile}"
printf " • jobenv_file=${CYAN}%s${RESET}\n" "${jobenv_file}"
else
echo "Expected exactly 0, 1, or 2 arguments, received: $#"
fi
printf " • Writing SLURM vars to: ${CYAN}%s${RESET}\n" "${jobenv_file}"
if [[ "${mn}" == "frontier" ]]; then
export GPU_TYPE="AMD"
_hostfile=$(ezpz_make_slurm_nodefile)
export HOSTFILE="${_hostfile}"
ezpz_save_slurm_env "$@"
elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then
export GPU_TYPE="NVIDIA"
_hostfile="$(ezpz_make_slurm_nodefile)"
export HOSTFILE="${_hostfile}"
ezpz_save_slurm_env "$@"
fi
fi
}
ezpz_setup_host_pbs() {
printf "[${CYAN}%s${RESET}]\n" "ezpz_setup_host_pbs"
mn=$(ezpz_get_machine_name)
scheduler_type=$(ezpz_get_scheduler_type)
if [[ "${scheduler_type}" == "pbs" ]]; then
#########################################
# If no arguments passed ("$#" == 0):
#
# - `hostfile` assigned to the first non-empty variable from:
# 1. `HOSTFILE`
# 2. `PBS_NODEFILE`
# - `jobenv_file` assigned to the first non-empty variable from:
# 1. `JOBENV_FILE`
# 2. `PBS_ENV_FILE`
if [[ "$#" == 0 ]]; then
# hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
# jobenv_file="$(ezpz_get_jobenv_file)"
# jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
hostfile="${HOSTFILE:-$(ezpz_get_pbs_nodefile_from_hostname)}"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
printf " • Using hostfile: ${CYAN}%s${RESET}\n" "${hostfile}"
printf " • Found in environment:\n"
if [[ -n "${HOSTFILE:-}" ]]; then
printf " • HOSTFILE: ${CYAN}%s${RESET}\n" "${HOSTFILE}"
fi
# if [[ "${hostfile}" != "${PBS_NODEFILE}" ]]; then
if [[ "${scheduler_type}" == "pbs" && "${hostfile}" != "${PBS_NODEFILE:-}" ]]; then
printf " • PBS_NODEFILE: ${CYAN}%s${RESET}\n" "${PBS_NODEFILE}"
fi
elif [[ "$#" == 1 ]]; then
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
hostfile="$1"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
printf " • hostfile=${CYAN}%s${RESET}\n" "${hostfile}"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
printf " • hostfile=${CYAN}%s${RESET}\n" "${hostfile}"
printf " • jobenv_file=${CYAN}%s${RESET}\n" "${jobenv_file}"
else
echo "Expected exactly 0, 1, or 2 arguments, received: $#"
fi
hn=$(hostname)
if [[ "${hn}" == x* || "${hn}" == "sophia*" ]]; then
printf " • Writing PBS vars to: ${CYAN}%s${RESET}\n" "${jobenv_file}"
if [[ "${mn}" == "polaris" || "${mn}" == "sirius" || "${mn}" == "sophia*" ]]; then
export GPU_TYPE="NVIDIA"
ezpz_save_pbs_env "$@"
elif [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
export GPU_TYPE="INTEL"
export NGPU_PER_TILE=6
export NTILE_PER_HOST=2
export NGPU_PER_HOST=$((NGPU_PER_TILE * NTILE_PER_HOST))
ezpz_save_pbs_env "$@"
fi
fi
fi
}
ezpz_setup_host() {
mn=$(ezpz_get_machine_name)
scheduler_type=$(ezpz_get_scheduler_type)
# printf "[${CYAN}%s${RESET}]\n" "ezpz_setup_host"
if [[ "${scheduler_type}" == "pbs" ]]; then
ezpz_setup_host_pbs "$@"
elif [[ "${scheduler_type}" ]]; then
ezpz_setup_host_slurm "$@"
else
echo "Unknown scheduler: ${scheduler_type} on ${mn}"
fi
}
###########################
# ezpz_setup_host_old (legacy implementation of `ezpz_setup_host`)
#
# takes 0, 1, or 2 arguments
#
# 0.
# - hostfile: Look for $HOSTFILE or $PBS_NODEFILE from environment
# - jobenv_file: Look for $JOBENV_FILE or $PBS_ENV_FILE from environment
#
# 1. hostfile: Specific hostfile to use
#
# 2.
# - hostfile: Specific hostfile to use
# - jobenv_file: Specific `.jobenv` file to use
#
#
# Then, if `hostname` starts with:
#
# - `x3*`: We're on Polaris, with 4 Nvidia A100s per node
# - `x4*` or `x1*`: We're on Aurora or Sunspot with 12 Intel PVCs per node
# - `nid` or `login`: We're on Perlmutter with 4 Nvidia A100s per node
#
# if we're on any of the ALCF systems (`x[1-4]*`), we call `ezpz_save_pbs_env`,
# passing along any received arguments
###########################
ezpz_setup_host_old() {
printf "[${CYAN}%s${RESET}]\n" "ezpz_setup_host"
mn=$(ezpz_get_machine_name)
scheduler_type=$(ezpz_get_scheduler_type)
#########################################
# If no arguments passed ("$#" == 0):
#
# - `hostfile` assigned to the first non-empty variable from:
# 1. `HOSTFILE`
# 2. `PBS_NODEFILE`
# - `jobenv_file` assigned to the first non-empty variable from:
# 1. `JOBENV_FILE`
# 2. `PBS_ENV_FILE`
if [[ "$#" == 0 ]]; then
# hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
# jobenv_file="$(ezpz_get_jobenv_file)"
# jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
if [[ "${scheduler_type}" == "slurm" ]]; then
hostfile="${HOSTFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}"
elif [[ "${scheduler_type}" == "pbs" ]]; then
hostfile="${HOSTFILE:-$(ezpz_get_pbs_nodefile_from_hostname)}"
else
echo "Unknown scheduler: $(ezpz_get_scheduler_type)"
fi
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
printf " • Using hostfile: ${CYAN}%s${RESET}\n" "${hostfile}"
printf " • Found in environment:\n"
if [[ -n "${HOSTFILE:-}" ]]; then
printf " • HOSTFILE: ${CYAN}%s${RESET}\n" "${HOSTFILE}"
fi
# if [[ "${hostfile}" != "${PBS_NODEFILE}" ]]; then
if [[ "${scheduler_type}" == "pbs" && "${hostfile}" != "${PBS_NODEFILE:-}" ]]; then
printf " • PBS_NODEFILE: ${CYAN}%s${RESET}\n" "${PBS_NODEFILE}"
fi
elif [[ "$#" == 1 ]]; then
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
hostfile="$1"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
printf " • hostfile=${CYAN}%s${RESET}\n" "${hostfile}"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
printf " • Caught ${CYAN}%s${RESET} arguments\n" "$#"
printf " • hostfile=${CYAN}%s${RESET}\n" "${hostfile}"
printf " • jobenv_file=${CYAN}%s${RESET}\n" "${jobenv_file}"
else
echo "Expected exactly 0, 1, or 2 arguments, received: $#"
fi
if [[ "${scheduler_type:-}" == "pbs" ]]; then # && "${hostfile}" != "${PBS_NODEFILE:-}" ]]; then
# if [[ $(hostname) == x3* ]]; then
hn=$(hostname)
if [[ "${hn}" == x* || "${hn}" == "sophia*" ]]; then
printf " • Writing PBS vars to: ${CYAN}%s${RESET}\n" "${jobenv_file}"
if [[ "${mn}" == "polaris" || "${mn}" == "sirius" || "${mn}" == "sophia" ]]; then
export GPU_TYPE="NVIDIA"
ezpz_save_pbs_env "$@"
elif [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
export GPU_TYPE="INTEL"
export NGPU_PER_TILE=6
export NTILE_PER_HOST=2
export NGPU_PER_HOST=$((NGPU_PER_TILE * NTILE_PER_HOST))
ezpz_save_pbs_env "$@"
fi
fi
elif [[ "${scheduler_type:-}" == "slurm" ]]; then
printf " • Writing SLURM vars to: ${CYAN}%s${RESET}\n" "${jobenv_file}"
if [[ "${mn}" == "frontier" ]]; then
export GPU_TYPE="AMD"
_hostfile=$(ezpz_make_slurm_nodefile)
export HOSTFILE="${_hostfile}"
ezpz_save_slurm_env "$@"
elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then
export GPU_TYPE="NVIDIA"
_hostfile="$(ezpz_make_slurm_nodefile)"
export HOSTFILE="${_hostfile}"
ezpz_save_slurm_env "$@"
fi
else
echo "!! Unknown scheduler !! Neither 'pbs' nor 'slurm' ?? ${scheduler_type:-}"
echo " Unexpected hostname: $(hostname)"
export GPU_TYPE="NONE"
HOSTFILE="hostfile"
hostname >"${HOSTFILE}"
fi
}
ezpz_print_hosts() {
if [[ "$#" == 0 ]]; then
hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}}"
elif [[ "$#" == 1 ]]; then
hostfile="$1"
else
# hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE}}}"
hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}}"
fi
counter=0
for f in $(/bin/cat "${hostfile}"); do
# printf "│ • [host:%s] - \e[1;34m%s\e[0m\n" "${counter}" "${f}"
printf " • [host:${MAGENTA}%s${RESET}] - ${MAGENTA}%s${RESET}\n" "${counter}" "${f}"
counter=$((counter + 1))
done
}
ezpz_get_num_xpus() {
python3 -c 'import intel_extension_for_pytorch as ipex; print(ipex.xpu.device_count())'
}
ezpz_get_num_gpus_nvidia() {
if [[ -n "$(command -v nvidia-smi)" ]]; then
num_gpus=$(nvidia-smi -L | wc -l)
else
num_gpus=$(python3 -c 'import torch; print(torch.cuda.device_count())')
fi
export NGPU_PER_HOST="${num_gpus}"
echo "${num_gpus}"
}
ezpz_get_num_gpus_per_host() {
mn=$(ezpz_get_machine_name)
# export NGPU_PER_HOST=12
if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
ngpu_per_host=12
elif [[ "${mn}" == "frontier" ]]; then
ngpu_per_host=8
else
ngpu_per_host=$(ezpz_get_num_gpus_nvidia)
fi
export NGPU_PER_HOST="${ngpu_per_host}"
echo "${ngpu_per_host}"
}
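# @example
# e.g., on Aurora:
# $ ezpz_get_num_gpus_per_host
# 12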
ezpz_get_num_hosts() {
if [[ "$#" == 0 ]]; then
hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}}"
elif [[ "$#" == 1 ]]; then
hostfile="$1"
else
hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}}"
fi
if [[ -n "${hostfile}" ]]; then
nhosts=$(wc -l <"${hostfile}")
elif [[ -n "${SLURM_NNODES:-}" ]]; then
nhosts=${SLURM_NNODES:-1}
else
nhosts=1
fi
if [[ -n "${nhosts}" ]]; then
export NHOSTS="${nhosts}"
fi
echo "${nhosts}"
}
ezpz_get_num_gpus_total() {
num_hosts=$(ezpz_get_num_hosts "$@")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus=$((num_hosts * num_gpus_per_host))
echo "${num_gpus}"
}
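# @example
# Illustrative (with 2 hosts and 12 GPUs per host):
# $ ezpz_get_num_gpus_total
# 24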
ezpz_get_jobenv_file() {
mn=$(ezpz_get_machine_name)
if [[ "${mn}" == "aurora" || "${mn}" == "polaris" || "${mn}" == "sunspot" || "${mn}" == "sirius" || "${mn}" == "sophia" ]]; then
echo "${JOBENV_FILE:-${PBS_ENV_FILE}}"
elif [[ "${mn}" == "frontier" || "${mn}" == "perlmutter" || -n "${SLURM_JOB_ID:-}" ]]; then
echo "${JOBENV_FILE:-${SLURM_ENV_FILE}}"
fi
}
ezpz_get_scheduler_type() {
mn=$(ezpz_get_machine_name)
if [[ "${mn}" == "aurora" || "${mn}" == "polaris" || "${mn}" == "sunspot" || "${mn}" == "sirius" || "${mn}" == "sophia" ]]; then
echo "pbs"
elif [[ "${mn}" == "frontier" || "${mn}" == "perlmutter" || -n "${SLURM_JOB_ID:-}" ]]; then
echo "slurm"
fi
}
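# @example
# e.g., on any of the ALCF PBS machines (aurora, polaris, ...):
# $ ezpz_get_scheduler_type
# pbs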
ezpz_write_job_info() {
if [[ "$#" == 0 ]]; then
hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}}"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
# jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
elif [[ "$#" == 1 ]]; then
hostfile="$1"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
else
hostfile="${HOSTFILE:-${PBS_NODEFILE:-${NODEFILE:-$(ezpz_make_slurm_nodefile)}}}"
# jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
fi
# printf "[ezpz_write_job_info] Caught jobenv_file: %s\n" "${jobenv_file}"
# printf "[ezpz_write_job_info] Caught hostfile: %s\n" "${hostfile}"
# getNumGPUs
# dist_env=$(ezpz_parse_hostfile "${hostfile}")
# num_hosts=$(ezpz_get_num_hosts "${hostfile}")
# num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
# num_gpus="$((num_hosts * num_gpus_per_host))"
# num_hosts="${dist_env[1]}"
# num_gpus_per_host="${dist_env[2]}"
# num_gpus="${dist_env[3]}"
# dist_launch_cmd=$(ezpz_get_dist_launch_cmd "${hostfile}")
scheduler_type=$(ezpz_get_scheduler_type)
num_hosts=$(ezpz_get_num_hosts "${hostfile}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus="$((num_hosts * num_gpus_per_host))"
num_cores_per_host=$(getconf _NPROCESSORS_ONLN)
num_cpus_per_host=$((num_cores_per_host / 2))
depth=$((num_cpus_per_host / num_gpus_per_host))
if [[ "${scheduler_type}" == "pbs" ]]; then
# dist_launch_cmd="mpiexec --verbose --envall -n ${num_gpus} -ppn ${num_gpus_per_host} --hostfile ${hostfile} --cpu-bind depth -d ${depth}"
dist_launch_cmd=$(ezpz_get_dist_launch_cmd "${hostfile}")
elif [[ "${scheduler_type}" == "slurm" ]]; then
# dist_launch_cmd="srun -N ${num_hosts} -n ${num_gpus} -l -u --verbose"
dist_launch_cmd="srun -l -u --verbose -N${SLURM_NNODES} -n$((SLURM_NNODES * SLURM_GPUS_ON_NODE))"
else
echo "Unknown scheduler!"
fi
if [[ -f "${hostfile:-}" ]]; then
# NOTE: word-splitting is intentional here so `join_by` receives one argument per host
HOSTS=$(join_by ', ' $(/bin/cat "${hostfile}"))
export NHOSTS="${num_hosts}"
export NGPU_PER_HOST="${num_gpus_per_host}"
export NGPUS="${num_gpus}"
{
echo "export HOSTFILE=${hostfile}"
echo "export NHOSTS=${NHOSTS}"
echo "export NGPU_PER_HOST=${NGPU_PER_HOST}"
echo "export NGPUS=${NGPUS}"
} >>"${jobenv_file}"
export LAUNCH="${dist_launch_cmd}"
export DIST_LAUNCH="${dist_launch_cmd}"
export ezlaunch="${DIST_LAUNCH}"
# if [[ -n "${DIST_LAUNCH:-}" ]]; then
# echo "alias LAUNCH='${DIST_LAUNCH}'"
# fi
alias launch="${LAUNCH}"
printf "[${MAGENTA}%s${RESET}]\n" "HOSTS"
# printf "hostfile: ${MAGENTA}%s${RESET}\n" "${hostfile}"
ezpz_print_hosts "${hostfile}"
printf "\n"
printf "[${BRIGHT_BLUE}%s${RESET}]\n" "DIST INFO"
printf " • NGPUS=${BRIGHT_BLUE}%s${RESET}\n" "$NGPUS"
printf " • NHOSTS=${BRIGHT_BLUE}%s${RESET}\n" "${NHOSTS}"
printf " • NGPU_PER_HOST=${BRIGHT_BLUE}%s${RESET}\n" "${NGPU_PER_HOST}"
printf " • HOSTFILE=${BRIGHT_BLUE}%s${RESET}\n" "${hostfile}"
printf " • DIST_LAUNCH=${BRIGHT_BLUE}%s${RESET}\n" "${DIST_LAUNCH}"
printf "\n"
if [[ -n "$(command -v launch)" ]]; then
printf "[${GREEN}%s${RESET}]:\n" "LAUNCH"
printf " • To launch across all available GPUs, use: ${GREEN}%s${RESET}\n" "launch"
printf "\n"
printf " ${GREEN}launch${RESET} = ${GREEN}%s${RESET}\n" "${LAUNCH}"
# printf " '${GREEN}launch${RESET}' ( = ${GREEN}%s${RESET} )\n" "${LAUNCH}"
fi
printf "\n"
# echo "export HOSTFILE=${hostfile}" >> "${JOBENV_FILE}"
# echo "┌────────────────────────────────────────────────────────────────────────────────"
# echo "│ YOU ARE HERE: $(whereAmI)"
# echo "│ Run 'source ./bin/getjobenv' in a NEW SHELL to automatically set env vars "
# echo "└────────────────────────────────────────────────────────────────────────────────"
# export NHOSTS="${NHOSTS}"
# export NGPU_PER_HOST="${NGPU_PER_HOST}"
# export NGPUS="${NGPUS}"
fi
}
ezpz_save_deepspeed_env() {
echo "Saving to .deepspeed_env"
echo "PATH=${PATH}" >.deepspeed_env
[ "${LD_LIBRARY_PATH}" ] && echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >>.deepspeed_env
[ "${CFLAGS}" ] && echo "CFLAGS=${CFLAGS}" >>.deepspeed_env
[ "${PYTHONUSERBASE}" ] && echo "PYTHONUSERBASE=${PYTHONUSERBASE}" >>.deepspeed_env
[ "${http_proxy}" ] && echo "http_proxy=${http_proxy}" >>.deepspeed_env
[ "${https_proxy}" ] && echo "https_proxy=${https_proxy}" >>.deepspeed_env
}
ezpz_get_pbs_env() {
if [[ "$#" == 1 ]]; then
hostfile="$1"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
else
hostfile="${HOSTFILE:-$(ezpz_get_pbs_nodefile_from_hostname)}"
jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
fi
printf "\n"
printf "[${BLUE}ezpz_get_pbs_env${RESET}]: Caught ${BLUE}%s${RESET} arguments\n" "$#"
printf " • hostfile: ${BLUE}%s${RESET}\n" "${hostfile}"
printf " • jobenv_file: ${BLUE}%s${RESET}\n" "${jobenv_file}"
if [[ $(hostname) == x3* || $(hostname) == x1* || $(hostname) == x4* || $(hostname) == sophia* ]]; then
if grep -q "$(hostname)" "${hostfile:-}"; then
num_hosts=$(ezpz_get_num_hosts "${hostfile}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus="$((num_hosts * num_gpus_per_host))"
dist_launch=$(ezpz_get_dist_launch_cmd "${hostfile}")
export DIST_LAUNCH="${dist_launch}"
export ezlaunch="${DIST_LAUNCH}"
else
echo "$(hostname) not found in ${hostfile} ... ?"
fi
else
echo "Skipping ezpz_get_pbs_env() on $(hostname)"
fi
# printf "%s" "${FOOTER}"
}
ezpz_get_slurm_env() {
if [[ -n "${SLURM_JOB_ID}" ]]; then
export JOBENV_FILE="${SLURM_ENV_FILE}"
# export jobenv_file="${JOBENV_FILE:-$(ezpz_get_jobenv_file)}"
# shellcheck source="${HOME}/.slurmenv"
[ -f "${JOBENV_FILE}" ] && source "${JOBENV_FILE}"
export DIST_LAUNCH="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose"
export ezlaunch="${DIST_LAUNCH}"
else
echo "Skipping ezpz_get_slurm_env() on $(hostname)"
fi
}
ezpz_get_job_env() {
if [[ "$#" == 1 ]]; then
hostfile="$1"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
else
# jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
scheduler_type=$(ezpz_get_scheduler_type)
if [[ "${scheduler_type}" == pbs ]]; then
hostfile="${HOSTFILE:-${PBS_NODEFILE:-$(ezpz_get_pbs_nodefile_from_hostname)}}"
jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
ezpz_get_pbs_env "$@"
elif [[ "${scheduler_type}" == "slurm" ]]; then
hostfile="${HOSTFILE:-$(ezpz_make_slurm_nodefile)}"
jobenv_file="${SLURM_ENV_FILE}"
ezpz_get_slurm_env "$@"
else
echo "[ezpz_get_job_env] Unknown scheduler ${scheduler_type}"
fi
fi
if [[ -f "${hostfile:-}" ]]; then
nhosts=$(wc -l <"${hostfile}")
local nhosts="${nhosts}"
export LAUNCH="${DIST_LAUNCH}"
export ezlaunch="${DIST_LAUNCH}"
alias launch="${DIST_LAUNCH}"
export HOSTFILE="${hostfile}"
export NHOSTS="${nhosts}"
export NGPU_PER_HOST="${NGPU_PER_HOST}"
export NGPUS="${NGPUS}"
export WORLD_SIZE="${NGPUS}"
hosts_arr=$(/bin/cat "${HOSTFILE}")
export HOSTS_ARR="${hosts_arr}"
HOSTS="$(join_by ', ' "$(/bin/cat "${HOSTFILE}")")"
export HOSTS="${HOSTS}"
fi
}
ezpz_print_job_env() {
if [[ "$#" == 0 ]]; then
hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
elif [[ "$#" == 1 ]]; then
hostfile="$1"
elif [[ "$#" == 2 ]]; then
hostfile="$1"
jobenv_file="$2"
else
hostfile="${HOSTFILE:-${PBS_NODEFILE}}"
jobenv_file="${JOBENV_FILE:-${PBS_ENV_FILE}}"
fi
num_hosts=$(ezpz_get_num_hosts "${hostfile}")
num_gpus_per_host=$(ezpz_get_num_gpus_per_host)
num_gpus="$((num_hosts * num_gpus_per_host))"
printf "\n"
printf " [${MAGENTA}%s${RESET}]:\n" "HOSTS"
ezpz_print_hosts "${hostfile}"
printf "\n"
printf " [${BRIGHT_BLUE}%s${RESET}]:\n" "DIST INFO"
printf " • NGPUS=${BRIGHT_BLUE}%s${RESET}\n" "${num_gpus}"
printf " • NHOSTS=${BRIGHT_BLUE}%s${RESET}\n" "${num_hosts}"
printf " • NGPU_PER_HOST=${BRIGHT_BLUE}%s${RESET}\n" "${num_gpus_per_host}"
printf " • HOSTFILE=${BRIGHT_BLUE}%s${RESET}\n" "${hostfile}"
printf " • LAUNCH=${BRIGHT_BLUE}%s${RESET}\n" "${LAUNCH}"
printf " • DIST_LAUNCH=${BRIGHT_BLUE}%s${RESET}\n" "${DIST_LAUNCH}"
printf "\n"
printf " [${GREEN}%s${RESET}]:\n" "LAUNCH"
printf " • To launch across all available GPUs, use:\n"
printf " '${GREEN}launch${RESET}' ( = ${GREEN}%s${RESET} )\n" "${LAUNCH}"
printf "\n"
}
ezpz_getjobenv_main() {
ezpz_get_job_env "$@"
ezpz_setup_host "$@"
ezpz_write_job_info "$@"
# ezpz_print_job_env "$@"
}
ezpz_savejobenv_main() {
# printf "${BLACK}%s${RESET}\n" "${LOGO}"
# printf "${BLACK}[ezpz]${RESET}\n" "${LOGO_DOOM}"
ezpz_setup_host "$@"
ezpz_write_job_info "$@"
}
ezpz_setup_alcf() {
mn=$(ezpz_get_machine_name)
hn=$(hostname)
local mn="${mn}"
local hn="${hn}"
printf "\n"
printf "[%s ${YELLOW}%s${RESET}]\n" "🍋" "ezpz/bin/utils.sh"
printf "\n"
printf " • USER=${BLACK}%s${RESET}\n" "${USER}"
printf " • MACHINE=${BLACK}%s${RESET}\n" "${mn}"
printf " • HOST=${BLACK}%s${RESET}\n" "${hn}"
printf " • TSTAMP=${BLACK}%s${RESET}\n" "$(ezpz_get_tstamp)"
printf "\n"
if [[ -n "${PBS_NODEFILE:-}" ]]; then
ezpz_savejobenv_main "$@"
elif [[ -n "${SLURM_JOB_ID:-}" ]]; then
ezpz_savejobenv_main "$@"
else
scheduler_type=$(ezpz_get_scheduler_type)
if [[ "${scheduler_type}" == "pbs" ]]; then
_pbs_nodefile=$(ezpz_get_pbs_nodefile_from_hostname)
export PBS_NODEFILE="${_pbs_nodefile}"
ezpz_getjobenv_main "$@"
elif [[ "${scheduler_type}" == "slurm" ]]; then
running_nodes=$(ezpz_get_slurm_running_nodelist)
if [[ -n "${running_nodes}" ]]; then
snodelist=$(scontrol show hostname "${running_nodes}")
_slurm_job_id=$(ezpz_get_slurm_running_jobid)
export SLURM_JOB_ID="${_slurm_job_id}"
export SLURM_NODELIST="${running_nodes}"
ezpz_getjobenv_main "$@"
fi
fi
fi
}
ezpz_setup_job() {
mn=$(ezpz_get_machine_name)
hn=$(hostname)
local mn="${mn}"
local hn="${hn}"
printf "\n"
printf "[%s ${YELLOW}%s${RESET}]\n" "🍋" "ezpz/bin/utils.sh"
# printf "[${RED}%s${RESET}]\n" "ezpz/bin/utils.sh"
# printf "\n"
# printf "[${BLACK}%s${RESET}]\n" "$(ezpz_get_tstamp)"
printf " • USER=${YELLOW}%s${RESET}\n" "${USER}"
printf " • MACHINE=${YELLOW}%s${RESET}\n" "${mn}"
printf " • HOST=${YELLOW}%s${RESET}\n" "${hn}"
printf " • TSTAMP=${YELLOW}%s${RESET}\n" "$(ezpz_get_tstamp)"
printf "\n"
if [[ -n "${PBS_NODEFILE:-}" ]]; then
ezpz_savejobenv_main "$@"
elif [[ -n "${SLURM_JOB_ID:-}" ]]; then
ezpz_savejobenv_main "$@"
else
scheduler_type=$(ezpz_get_scheduler_type)
if [[ "${scheduler_type}" == "pbs" ]]; then
_pbs_nodefile=$(ezpz_get_pbs_nodefile_from_hostname)
if [[ -f "${_pbs_nodefile}" ]]; then
export PBS_NODEFILE="${_pbs_nodefile}"
ezpz_getjobenv_main "$@"
else
echo "[${mn}] @ [${hn}] No compute node found !!"
fi
elif [[ "${scheduler_type}" == "slurm" ]]; then
running_nodes=$(ezpz_get_slurm_running_nodelist)
if [[ -n "${running_nodes}" ]]; then
snodelist=$(scontrol show hostname "${running_nodes}")
_slurm_job_id=$(ezpz_get_slurm_running_jobid)
export SLURM_JOB_ID="${_slurm_job_id}"
export SLURM_NODELIST="${running_nodes}"
ezpz_getjobenv_main "$@"
fi
fi
fi
}
ezpz_setup_env() {
ezpz_setup_python && ezpz_setup_job
}
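# Example usage (a minimal sketch; run from inside an active PBS or Slurm
# job, after sourcing this file):
#
# $ source ezpz/bin/utils.sh
# $ ezpz_setup_env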
###############################################
# Helper functions for printing colored text
###############################################
printBlack() {
printf "\e[1;30m%s\e[0m\n" "$@"
}
printRed() {
printf "\e[1;31m%s\e[0m\n" "$@"
}
printGreen() {
printf "\e[1;32m%s\e[0m\n" "$@"
}
printYellow() {
printf "\e[1;33m%s\e[0m\n" "$@"
}
printBlue() {
printf "\e[1;34m%s\e[0m\n" "$@"
}
printMagenta() {
printf "\e[1;35m%s\e[0m\n" "$@"
}
printCyan() {
printf "\e[1;36m%s\e[0m\n" "$@"
}
##################
# utils_main
#
# This will get called automatically when running:
#
# ```bash
# $ cd Megatron-DeepSpeed
# $ PBS_O_WORKDIR=$(pwd) source ALCF/utils.sh
# ```
#
# - This will set `"${WORKING_DIR}"`, according to:
# 1. if `${PBS_O_WORKDIR}` is non-empty, use this
# 2. else, if `${SLURM_SUBMIT_DIR}` is non-empty, use this
# 3. else, use `$(pwd)`
#
# this is crucial since many of the functions above use paths
# which are defined relative to this "${WORKING_DIR}"
# (e.g. virtual environment, location of executables, etc.)
##################
utils_main() {
# for debug mode, run with `DEBUG=1`
if [[ -n "${DEBUG:-}" ]]; then
set -euxo pipefail
fi
if [[ -n "${PBS_O_WORKDIR:-}" ]]; then
WORKING_DIR="${PBS_O_WORKDIR}"
elif [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
WORKING_DIR="${SLURM_SUBMIT_DIR}"
else
echo "Unable to detect PBS or SLURM working directory info..."
WORKING_DIR=$(python3 -c 'import os; print(os.getcwd())')
echo "Using ${WORKING_DIR} as working directory..."
fi
export WORKING_DIR="${WORKING_DIR}"
printf "Using WORKING_DIR: %s\n" "${WORKING_DIR}"
}
utils_main
if [[ -n "${DEBUG:-}" ]]; then set +x; fi