Skip to content

Instantly share code, notes, and snippets.

@swhitt
Last active September 9, 2025 01:19
Show Gist options
  • Select an option

  • Save swhitt/3eece69e8bc096427c3c684548f508ed to your computer and use it in GitHub Desktop.

Select an option

Save swhitt/3eece69e8bc096427c3c684548f508ed to your computer and use it in GitHub Desktop.
SageMaker setup script with GPU monitoring, ML utilities, data tools (DuckDB/parquet), and customized bash environment
#!/usr/bin/env bash
#
# SageMaker setup script.
#
# Usage:
#   curl -sSL https://gist.githubusercontent.com/swhitt/3eece69e8bc096427c3c684548f508ed/raw/setup.sh | bash
#
# Environment:
#   DEBUG  set to 1 to trace every executed command (set -x).

echo "🚀 SageMaker Setup Script"

# Opt-in command tracing for troubleshooting.
case "${DEBUG:-0}" in
  1) set -x ;;
esac

# Stop at the first unhandled command failure.
set -o errexit

# Delimiter lines shared by the functions that write and look for the
# managed configuration block.
declare -r START_MARK="# >>> sagemaker-setup >>>"
declare -r END_MARK="# <<< sagemaker-setup <<<"
# Print a progress message on stdout, prefixed with "▶ ".
log() {
  printf '▶ %s\n' "$*"
}
# Print an error message on stderr, prefixed with "✗ ".
error() {
  printf '✗ %s\n' "$*" >&2
}
# Print a success message on stdout, prefixed with "✓ ".
success() {
  printf '✓ %s\n' "$*"
}
#######################################
# Install optional command-line helpers into the user's home directory.
#
# Every install is best effort: a failure is logged and the function moves
# on, so a missing network, pip, conda or unzip never aborts the setup.
#
# Installs:
#   gpustat, nvidia-ml-py  via pip3 --user   (only when pip3 exists)
#   nvtop                  via conda-forge   (only when conda exists)
#   duckdb, jq             release binaries into $HOME/.local/bin
#
# Globals:  HOME (read)
# Outputs:  progress messages through log/success
# Returns:  0, including when individual installs fail
#######################################
install_tools() {
  log "Installing essential tools..."
  mkdir -p "$HOME/.local/bin" "$HOME/bin"

  if command -v pip3 >/dev/null 2>&1; then
    log "Installing GPU and ML tools..."
    if timeout 30 pip3 install --user --quiet --upgrade gpustat nvidia-ml-py >/dev/null 2>&1; then
      success "GPU tools installed"
    else
      log "GPU tools install failed"
    fi
  fi

  # Install conda tools if conda is available
  if command -v conda >/dev/null 2>&1; then
    log "Installing nvtop via conda..."
    if timeout 60 conda install -y -c conda-forge nvtop >/dev/null 2>&1; then
      success "nvtop installed"
    else
      log "nvtop install failed"
    fi
  fi

  # Downloads are staged in a private scratch directory and only moved into
  # place once complete, so a failed download never removes or corrupts a
  # binary that is already installed.
  local staging
  if ! staging=$(mktemp -d 2>/dev/null); then
    log "DuckDB install failed"
    log "jq install failed"
    return 0
  fi

  # Install DuckDB for data analysis.
  # curl -f turns HTTP errors into a non-zero exit instead of saving the
  # error page; unzip/chmod/mv are part of the condition so that their
  # failure is reported rather than terminating the script under `set -e`.
  log "Installing DuckDB..."
  mkdir -p "$staging/duckdb"
  if timeout 30 curl -fsL "https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip" -o "$staging/duckdb.zip" >/dev/null 2>&1 \
    && unzip -q -o "$staging/duckdb.zip" -d "$staging/duckdb" >/dev/null 2>&1 \
    && [[ -f "$staging/duckdb/duckdb" ]] \
    && chmod +x "$staging/duckdb/duckdb" \
    && mv -f "$staging/duckdb/duckdb" "$HOME/.local/bin/duckdb"; then
    success "DuckDB installed"
  else
    log "DuckDB install failed"
  fi

  log "Installing jq..."
  if timeout 30 curl -fsL "https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64" -o "$staging/jq" >/dev/null 2>&1 \
    && chmod +x "$staging/jq" \
    && mv -f "$staging/jq" "$HOME/.local/bin/jq"; then
    success "jq installed"
  else
    log "jq install failed"
  fi

  rm -rf -- "$staging"
  return 0
}
#######################################
# Write the helper scripts into $HOME/bin and mark them executable.
#
# Scripts created:
#   gpumem    one line per GPU with used / total / free memory
#   gpumon    live nvidia-smi table refreshed every 0.5s
#   pq        metadata, schema and first rows of a parquet file
#   ckpt      summary of a SafeTensors or PyTorch checkpoint
#   slice-pq  write a row range of a parquet file to a new file
#
# The heredoc bodies are written verbatim (quoted delimiter), so they must
# stay at column 0 and the Python ones must keep their own indentation.
#
# Globals:  HOME (read)
# Outputs:  progress messages through log/success
#######################################
create_utilities() {
  log "Creating utility scripts..."
  # Do not rely on another step having created the target directory.
  mkdir -p "$HOME/bin"
  rm -f "$HOME/bin/gpumem" "$HOME/bin/gpumon" "$HOME/bin/pq" "$HOME/bin/ckpt" "$HOME/bin/slice-pq"

  cat > "$HOME/bin/gpumem" << 'EOF'
#!/usr/bin/env bash
# Show used / total / free memory for every GPU.
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=index,memory.used,memory.total,memory.free --format=csv,noheader | \
    awk -F', ' '{printf "GPU %s: %s / %s (Free: %s)\n", $1, $2, $3, $4}'
else
  echo "nvidia-smi not available"
fi
EOF
  chmod +x "$HOME/bin/gpumem"

  cat > "$HOME/bin/gpumon" << 'EOF'
#!/usr/bin/env bash
# Real-time GPU monitoring
watch -n 0.5 'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits | column -t -s ","'
EOF
  chmod +x "$HOME/bin/gpumon"

  cat > "$HOME/bin/pq" << 'EOF'
#!/usr/bin/env python3
"""Print metadata, schema and the first rows of a parquet file."""
import sys

try:
    import pandas as pd
    import pyarrow.parquet as pq
    from pathlib import Path

    if len(sys.argv) < 2:
        print("Usage: pq <file.parquet> [nrows]")
        sys.exit(1)

    file = sys.argv[1]
    nrows = int(sys.argv[2]) if len(sys.argv) > 2 else 5

    meta = pq.read_metadata(file)
    schema = pq.read_schema(file)
    print(f"File: {Path(file).name}")
    print(f"Rows: {meta.num_rows:,}")
    print(f"Columns: {len(schema.names)}")
    print("\nSchema:")
    for field in schema:
        print(f" {field.name:30} {str(field.type):20}")

    print(f"\nFirst {nrows} rows:")
    df = pd.read_parquet(file)
    # Truncate binary columns to avoid pages of output
    pd.set_option('display.max_colwidth', 50)
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].notna().any():
                sample = str(df[col].dropna().iloc[0]) if not df[col].dropna().empty else ""
                # Long or non-printable samples are treated as blobs.
                if len(sample) > 100 or any(ord(c) < 32 or ord(c) > 126 for c in sample[:100]):
                    df[col] = df[col].astype(str).str[:20] + '...'
    print(df.head(nrows).to_string(max_colwidth=50))
except ImportError:
    print("Error: pandas/pyarrow not available. Install with: pip install pandas pyarrow")
    sys.exit(1)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
EOF
  chmod +x "$HOME/bin/pq"

  cat > "$HOME/bin/ckpt" << 'EOF'
#!/usr/bin/env python3
"""Summarise the tensors stored in a SafeTensors or PyTorch checkpoint."""
import sys
from pathlib import Path


def inspect_checkpoint(path):
    """Print the size and the first ten entries of the checkpoint at `path`."""
    path = Path(path)
    try:
        if path.suffix == '.safetensors' or 'safetensors' in path.name:
            from safetensors import safe_open
            with safe_open(path, framework="pt") as f:
                keys = list(f.keys())
                print(f"SafeTensors checkpoint: {len(keys)} tensors")
                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")
                # Sample some keys
                sample = keys[:10]
                for k in sample:
                    tensor = f.get_tensor(k)
                    print(f" {k}: {list(tensor.shape)} ({tensor.dtype})")
                if len(keys) > 10:
                    print(f" ... and {len(keys) - 10} more")
        else:
            import torch
            # NOTE(security): torch.load unpickles the file, which can run
            # arbitrary code. Only inspect checkpoints from trusted sources.
            ckpt = torch.load(path, map_location='cpu')
            if isinstance(ckpt, dict):
                print(f"PyTorch checkpoint: {len(ckpt)} keys")
                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")
                for i, (k, v) in enumerate(ckpt.items()):
                    if i >= 10:
                        print(f" ... and {len(ckpt) - 10} more")
                        break
                    if hasattr(v, 'shape'):
                        print(f" {k}: {list(v.shape)} ({v.dtype})")
                    else:
                        print(f" {k}: {type(v).__name__}")
    except ImportError as e:
        print(f"Error: Required package not available - {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: ckpt <checkpoint_file>")
        sys.exit(1)
    inspect_checkpoint(sys.argv[1])
EOF
  chmod +x "$HOME/bin/ckpt"

  cat > "$HOME/bin/slice-pq" << 'EOF'
#!/usr/bin/env python3
"""Write a row range of a parquet file to a new parquet file.

With only a file argument, print its row count and the usage text.
"""
import sys
import pandas as pd
from pathlib import Path

if len(sys.argv) == 2:
    # Just show row count
    input_file = sys.argv[1]
    if not Path(input_file).exists():
        print(f"Error: {input_file} not found")
        sys.exit(1)
    df = pd.read_parquet(input_file)
    print(f"{input_file}: {len(df):,} rows")
    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
    print("Examples:")
    print(" slice-pq file.parquet 1000 # first 1000 rows")
    print(" slice-pq file.parquet 500:1500 # rows 500-1499")
    print(" slice-pq file.parquet 1000: # from row 1000 to end")
    print(" slice-pq file.parquet :500 # first 500 rows")
    sys.exit(0)

if len(sys.argv) != 3:
    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
    sys.exit(1)

input_file = sys.argv[1]
slice_arg = sys.argv[2]

# Parse slice argument (supports N or start:end format)
try:
    if ':' in slice_arg:
        start_str, end_str = slice_arg.split(':', 1)
        start = int(start_str) if start_str else None
        end = int(end_str) if end_str else None
    else:
        start, end = None, int(slice_arg)
except ValueError:
    # Report a bad range as a usage error instead of a traceback.
    print(f"Error: invalid slice '{slice_arg}'")
    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
    sys.exit(1)

if not Path(input_file).exists():
    print(f"Error: {input_file} not found")
    sys.exit(1)

# Generate output filename
input_path = Path(input_file)
if start is None:
    suffix = f"_{end}"
elif end is None:
    suffix = f"_{start}_end"
else:
    suffix = f"_{start}_{end}"
output_file = input_path.stem + suffix + input_path.suffix

print(f"Reading {input_file}...")
df = pd.read_parquet(input_file)
print(f"Original: {len(df):,} rows")

# Apply slice
df_slice = df.iloc[start:end] if start is not None else df.head(end)
if start is None:
    slice_desc = f"first {end}"
elif end is None:
    slice_desc = f"rows {start} to end"
else:
    slice_desc = f"rows {start} to {end-1}"
print(f"Slicing to: {len(df_slice):,} rows ({slice_desc})")

df_slice.to_parquet(output_file)
print(f"Saved: {output_file}")
EOF
  chmod +x "$HOME/bin/slice-pq"

  success "Utility scripts created"
}
#######################################
# Install the managed configuration block at the end of ~/.bashrc.
#
# An existing ~/.bashrc is first copied to ~/.bashrc.bak. Any previous
# managed block (the lines from START_MARK through END_MARK) is removed
# before the new one is appended, so running the setup again does not
# duplicate it.
#
# Globals:  HOME, START_MARK, END_MARK (read)
# Outputs:  progress messages through log/success/error
# Returns:  non-zero if the previous block could not be removed
#######################################
setup_shell() {
  log "Setting up shell configuration..."
  local bashrc="$HOME/.bashrc"

  if [[ -f "$bashrc" ]]; then
    cp "$bashrc" "$bashrc.bak"

    if grep -qF -- "$START_MARK" "$bashrc"; then
      log "Removing existing configuration..."
      # One sed pass deletes every START..END range. The result is written
      # back with cat so the file keeps its inode, mode and symlink target.
      local scratch
      scratch=$(mktemp)
      if sed "/$START_MARK/,/$END_MARK/d" "$bashrc" > "$scratch"; then
        cat "$scratch" > "$bashrc"
        rm -f -- "$scratch"
      else
        rm -f -- "$scratch"
        error "Could not remove the existing configuration from $bashrc"
        return 1
      fi
    fi

    # Make sure the start marker lands on a line of its own even when the
    # file does not end with a newline.
    if [[ -s "$bashrc" && -n "$(tail -c 1 "$bashrc")" ]]; then
      echo >> "$bashrc"
    fi
  fi

  cat >> "$bashrc" << 'EOF'
# >>> sagemaker-setup >>>
# Only run in interactive shells
case $- in
  *i*) ;;
  *) return ;;
esac

export PATH="$HOME/bin:$HOME/.local/bin:$PATH"

# Print " (branch)" for the prompt when inside a git work tree.
__git_branch() {
  if command -v git >/dev/null 2>&1; then
    local branch
    branch=$(git branch 2>/dev/null | grep '^\*' | cut -d' ' -f2- 2>/dev/null || echo "")
    [[ -n "$branch" ]] && echo " ($branch)"
  fi
}

if [[ -t 1 ]] && command -v tput >/dev/null 2>&1 && tput colors >/dev/null 2>&1 && [[ $(tput colors) -ge 8 ]]; then
  PS1='\[\033[32m\]\u@\h\[\033[0m\]:\[\033[34m\]\w\[\033[33m\]$(__git_branch)\[\033[0m\]\$ '
else
  PS1='\u@\h:\w$(__git_branch)\$ '
fi

alias ll='ls -lhF'
alias la='ls -lhAF'
alias lt='ls -lhFtr'
alias ..='cd ..'
alias ...='cd ../..'
alias -- -='cd -'
alias rm='rm -i'
alias mv='mv -i'
alias cp='cp -i'
alias grep='grep --color=auto'
alias df='df -h'
alias du='du -h'
alias free='free -h'
alias gpu='nvidia-smi'
alias gpuw='watch -n1 nvidia-smi'
alias py='python3'
alias ipy='ipython'

HISTSIZE=100000
HISTFILESIZE=100000
HISTCONTROL=ignoreboth:erasedups
HISTTIMEFORMAT='%F %T '
HISTIGNORE='ls:ll:cd:pwd:exit:date:* --help:gpu:gpuw'
# Enable the options outright: `shopt -q` only reports whether an option is
# already on, so testing it first would never turn histappend on.
shopt -s histappend cmdhist 2>/dev/null
# Append the history sync once, even if this file is sourced repeatedly.
if [[ "${PROMPT_COMMAND:-}" != *"history -a; history -n"* ]]; then
  PROMPT_COMMAND="${PROMPT_COMMAND:+$PROMPT_COMMAND; }history -a; history -n"
fi

# Run a SQL query through the DuckDB CLI.
sql() {
  local query="${1:?Usage: sql 'SELECT ... FROM file.ext ...'}"
  duckdb -c "$query"
}

# List data files (parquet/csv/json/jsonl) under a directory as "path size".
# Usage: data_files [-a] [dir]    (-a also searches .conda/.local/.cache)
data_files() {
  local show_all=false
  if [[ "${1:-}" == "-a" ]]; then
    show_all=true
    shift
  fi
  local dir="${1:-.}"

  # The find arguments are kept in an array so the directory is passed as
  # data and never re-parsed by the shell.
  local -a skip=()
  if [[ "$show_all" == false ]]; then
    skip=(-not -path '*/.conda/*' -not -path '*/.local/*' -not -path '*/.cache/*')
  fi

  find "$dir" -type f "${skip[@]}" \
    \( -name "*.parquet" -o -name "*.csv" -o -name "*.json" -o -name "*.jsonl" \) \
    -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
}

# List model files under a directory as "path size".
model_files() {
  local dir="${1:-.}"
  find "$dir" -type f \( \
    -name "*.pt" -o \
    -name "*.pth" -o \
    -name "*.bin" -o \
    -name "*.safetensors" -o \
    -name "*.ckpt" \
    \) -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
}

# Force-kill every process whose command line contains "python".
killpy() {
  local procs
  procs=$(pgrep -f python | wc -l)
  if [[ $procs -gt 0 ]]; then
    echo "Killing $procs Python processes..."
    pkill -9 -f python
  else
    echo "No Python processes running"
  fi
}

# SageMaker URL generator
url() {
  local space_name="${1:-$SAGEMAKER_SPACE_NAME}"
  if [[ -z "$space_name" ]]; then
    echo "Usage: url <space-name>"
    echo "Set SAGEMAKER_SPACE_NAME env var or pass space name as argument"
    return 1
  fi
  if ! command -v aws >/dev/null 2>&1; then
    echo "AWS CLI not available"
    return 1
  fi

  echo "Looking up space info for: $space_name"
  local space_info
  space_info=$(timeout 10 aws sagemaker list-spaces --query "Spaces[?SpaceName=='$space_name'].[DomainId,OwnershipSettingsSummary.OwnerUserProfileName]" --output text 2>/dev/null)
  if [[ -z "$space_info" ]]; then
    echo "Error: Could not find space '$space_name'"
    return 1
  fi

  # The text output is tab separated; only the first matching space is used.
  local domain_id profile
  IFS=$'\t' read -r domain_id profile <<< "$space_info"
  echo "Generating URL for space: $space_name, profile: $profile, domain: $domain_id"
  timeout 10 aws sagemaker create-presigned-domain-url \
    --domain-id "$domain_id" \
    --user-profile-name "$profile" \
    --space-name "$space_name" \
    --query 'AuthorizedUrl' \
    --output text 2>/dev/null || echo "Failed to generate URL (check AWS CLI setup)"
}

# Terminate Python processes so the GPU memory they hold is released.
clear_gpu() {
  if command -v pkill >/dev/null 2>&1; then
    pkill -f python 2>/dev/null || true
    echo "GPU processes cleared"
  else
    echo "pkill not available"
  fi
}

echo "🚀 SageMaker tools loaded:"
echo " url [space] - Generate SageMaker URL"
echo " gpumem - Show GPU memory usage"
echo " gpumon - Real-time GPU monitoring"
echo " ckpt file.pt - Inspect PyTorch/SafeTensors checkpoints"
echo " pq data.parquet - View parquet files"
echo " slice-pq file.parquet [N|start:end] - Extract parquet rows"
echo " sql 'SELECT...' - Query data files with SQL"
echo " data_files [-a] - Find data files (csv, parquet, json)"
echo " model_files - Find model files (pt, safetensors, etc)"
echo " killpy - Kill Python processes"
echo " gpu/gpuw - nvidia-smi (watch mode)"
# <<< sagemaker-setup <<<
EOF
  success "Shell configuration added to ~/.bashrc"
}
#######################################
# Make login shells load ~/.bashrc.
#
# Appends a line that sources ~/.bashrc to ~/.bash_profile unless the
# profile already exists and mentions "bashrc" somewhere.
#
# Globals:  HOME (read)
# Outputs:  a log message when the profile was changed
#######################################
setup_bash_profile() {
  local profile_path="$HOME/.bash_profile"

  # Nothing to do when the profile already refers to bashrc.
  if [[ -f "$profile_path" ]] && grep -q "bashrc" "$profile_path"; then
    return 0
  fi

  # shellcheck disable=SC2016
  echo 'if [[ -f "$HOME/.bashrc" ]]; then source "$HOME/.bashrc"; fi' >> "$profile_path"
  log "Updated ~/.bash_profile to source ~/.bashrc"
}
#######################################
# Check that the setup left the expected artifacts behind.
#
# Verifies that $HOME/bin/gpumem is executable and that ~/.bashrc contains
# the managed block's start marker. Every problem is reported before the
# summary, whether or not the caller runs under `set -e`.
#
# Globals:  HOME, START_MARK (read)
# Outputs:  progress on stdout (log/success), problems on stderr (error)
# Returns:  0 when no issue was found, 1 otherwise
#######################################
verify_setup() {
  log "Verifying setup..."
  local issues=0

  # Counters use $(( )) assignments: `((issues++))` yields exit status 1
  # when the old value is 0, which would terminate the script under
  # `set -e` before the remaining checks and the summary could run.
  if [[ ! -x "$HOME/bin/gpumem" ]]; then
    error "gpumem script missing"
    issues=$((issues + 1))
  fi

  # -F: the marker is a literal string, not a regular expression.
  if [[ ! -f "$HOME/.bashrc" ]] || ! grep -qF -- "$START_MARK" "$HOME/.bashrc"; then
    error "bashrc configuration missing"
    issues=$((issues + 1))
  fi

  if ((issues == 0)); then
    success "Setup verification passed"
    return 0
  fi

  error "Setup verification found $issues issues"
  return 1
}
#######################################
# Run every setup step in order, then report the verification result.
#
# Globals:  HOME (read)
# Outputs:  progress and a final summary on stdout
# Returns:  1 when verify_setup reports problems
#######################################
main() {
  echo "Starting SageMaker setup..."

  local step
  for step in install_tools create_utilities setup_shell setup_bash_profile; do
    "$step"
  done

  # Best effort: a failure to create the history file is ignored.
  touch "$HOME/.bash_history" 2>/dev/null || true

  if ! verify_setup; then
    printf '%s\n' \
      "" \
      "❌ Setup completed with issues" \
      "Check the error messages above" \
      ""
    return 1
  fi

  printf '%s\n' \
    "" \
    "✅ Setup completed successfully!" \
    "" \
    "To activate the new configuration:" \
    " source ~/.bashrc" \
    "" \
    "Or start a new shell session" \
    ""
}
# Entry point: run main with the script's own command-line arguments.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment