Last active
September 9, 2025 01:19
-
-
Save swhitt/3eece69e8bc096427c3c684548f508ed to your computer and use it in GitHub Desktop.
SageMaker setup script with GPU monitoring, ML utilities, data tools (DuckDB/parquet), and customized bash environment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# SageMaker setup script
# Run with: curl -sSL https://gist.githubusercontent.com/swhitt/3eece69e8bc096427c3c684548f508ed/raw/setup.sh | bash

echo "🚀 SageMaker Setup Script"

# Opt-in command tracing: export DEBUG=1 before running to see every command.
case "${DEBUG:-0}" in
  1) set -x ;;
esac

# Abort on the first unhandled error.
set -e

# Markers delimiting the managed section appended to ~/.bashrc (see setup_shell).
readonly START_MARK="# >>> sagemaker-setup >>>"
readonly END_MARK="# <<< sagemaker-setup <<<"

# Logging helpers: progress/success on stdout, errors on stderr.
log() { printf '▶ %s\n' "$*"; }
error() { printf '✗ %s\n' "$*" >&2; }
success() { printf '✓ %s\n' "$*"; }
install_tools() {
  # Install GPU/ML pip packages, nvtop (via conda), and static DuckDB/jq
  # binaries into ~/.local/bin. Best-effort: each step has its own timeout
  # and a failure is logged without aborting the rest of the setup.
  log "Installing essential tools..."
  mkdir -p "$HOME/.local/bin" "$HOME/bin"

  if command -v pip3 >/dev/null 2>&1; then
    log "Installing GPU and ML tools..."
    if timeout 30 pip3 install --user --quiet --upgrade gpustat nvidia-ml-py >/dev/null 2>&1; then
      success "GPU tools installed"
    else
      log "GPU tools install failed"
    fi
  fi

  # Install conda tools if conda is available
  if command -v conda >/dev/null 2>&1; then
    log "Installing nvtop via conda..."
    if timeout 60 conda install -y -c conda-forge nvtop >/dev/null 2>&1; then
      success "nvtop installed"
    else
      log "nvtop install failed"
    fi
  fi

  # Install DuckDB for data analysis.
  # NOTE: curl -f makes HTTP errors (404 etc.) count as failures. Without it
  # a failed download "succeeded", unzip then choked on the HTML error page,
  # and that unchecked unzip aborted the whole script under `set -e`.
  log "Installing DuckDB..."
  rm -f "$HOME/.local/bin/duckdb"
  if timeout 30 curl -fsL "https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip" -o /tmp/duckdb.zip \
      && unzip -q -o /tmp/duckdb.zip -d "$HOME/.local/bin/" 2>/dev/null \
      && chmod +x "$HOME/.local/bin/duckdb"; then
    success "DuckDB installed"
  else
    log "DuckDB install failed"
  fi
  rm -f /tmp/duckdb.zip

  # jq moved to the jqlang org, and starting with 1.7 the Linux asset is
  # named jq-linux-amd64 (the old jq-linux64 path 404s for this release).
  log "Installing jq..."
  rm -f "$HOME/.local/bin/jq"
  if timeout 30 curl -fsL "https://github.com/jqlang/jq/releases/download/jq-1.7/jq-linux-amd64" -o "$HOME/.local/bin/jq" \
      && chmod +x "$HOME/.local/bin/jq"; then
    success "jq installed"
  else
    log "jq install failed"
  fi
}
create_utilities() {
  # Write small helper executables into ~/bin. Existing copies are removed
  # first so reruns always install the current versions. Every heredoc uses
  # a quoted delimiter ('EOF'), so its contents are written to disk literally
  # with no shell expansion here.
  log "Creating utility scripts..."
  rm -f "$HOME/bin/gpumem" "$HOME/bin/gpumon" "$HOME/bin/pq" "$HOME/bin/ckpt" "$HOME/bin/slice-pq"

  # gpumem: one-shot per-GPU memory summary (used/total/free) via nvidia-smi.
  cat > "$HOME/bin/gpumem" << 'EOF'
#!/usr/bin/env bash
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=index,memory.used,memory.total,memory.free --format=csv,noheader | \
    awk -F', ' '{printf "GPU %s: %s / %s (Free: %s)\n", $1, $2, $3, $4}'
else
  echo "nvidia-smi not available"
fi
EOF
  chmod +x "$HOME/bin/gpumem"

  # gpumon: live GPU utilization/memory/temperature table, refreshed twice a second.
  cat > "$HOME/bin/gpumon" << 'EOF'
#!/usr/bin/env bash
# Real-time GPU monitoring
watch -n 0.5 'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits | column -t -s ","'
EOF
  chmod +x "$HOME/bin/gpumon"

  # pq: print a parquet file's metadata, schema, and the first N rows.
  # NOTE(review): reads the entire file with pd.read_parquet even when only a
  # few rows are shown — fine for small files, slow for very large ones.
  cat > "$HOME/bin/pq" << 'EOF'
#!/usr/bin/env python3
import sys
try:
    import pandas as pd
    import pyarrow.parquet as pq
    from pathlib import Path
    if len(sys.argv) < 2:
        print("Usage: pq <file.parquet> [nrows]")
        sys.exit(1)
    file = sys.argv[1]
    nrows = int(sys.argv[2]) if len(sys.argv) > 2 else 5
    meta = pq.read_metadata(file)
    schema = pq.read_schema(file)
    print(f"File: {Path(file).name}")
    print(f"Rows: {meta.num_rows:,}")
    print(f"Columns: {len(schema.names)}")
    print("\nSchema:")
    for field in schema:
        print(f" {field.name:30} {str(field.type):20}")
    print(f"\nFirst {nrows} rows:")
    df = pd.read_parquet(file)
    # Truncate binary columns to avoid pages of output
    pd.set_option('display.max_colwidth', 50)
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].notna().any():
                sample = str(df[col].dropna().iloc[0]) if not df[col].dropna().empty else ""
                if len(sample) > 100 or any(ord(c) < 32 or ord(c) > 126 for c in sample[:100]):
                    df[col] = df[col].astype(str).str[:20] + '...'
    print(df.head(nrows).to_string(max_colwidth=50))
except ImportError:
    print("Error: pandas/pyarrow not available. Install with: pip install pandas pyarrow")
    sys.exit(1)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
EOF
  chmod +x "$HOME/bin/pq"

  # ckpt: summarize a model checkpoint — SafeTensors via safe_open, anything
  # else via torch.load. NOTE(review): torch.load on an untrusted .pt file can
  # execute arbitrary code (pickle); only inspect checkpoints you trust.
  cat > "$HOME/bin/ckpt" << 'EOF'
#!/usr/bin/env python3
import sys
from pathlib import Path
def inspect_checkpoint(path):
    path = Path(path)
    try:
        if path.suffix == '.safetensors' or 'safetensors' in path.name:
            from safetensors import safe_open
            with safe_open(path, framework="pt") as f:
                keys = f.keys()
                print(f"SafeTensors checkpoint: {len(keys)} tensors")
                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")
                # Sample some keys
                sample = list(keys)[:10]
                for k in sample:
                    tensor = f.get_tensor(k)
                    print(f" {k}: {list(tensor.shape)} ({tensor.dtype})")
                if len(keys) > 10:
                    print(f" ... and {len(keys) - 10} more")
        else:
            import torch
            ckpt = torch.load(path, map_location='cpu')
            if isinstance(ckpt, dict):
                print(f"PyTorch checkpoint: {len(ckpt)} keys")
                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")
                for i, (k, v) in enumerate(ckpt.items()):
                    if i >= 10:
                        print(f" ... and {len(ckpt) - 10} more")
                        break
                    if hasattr(v, 'shape'):
                        print(f" {k}: {list(v.shape)} ({v.dtype})")
                    else:
                        print(f" {k}: {type(v).__name__}")
    except ImportError as e:
        print(f"Error: Required package not available - {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: ckpt <checkpoint_file>")
        sys.exit(1)
    inspect_checkpoint(sys.argv[1])
EOF
  chmod +x "$HOME/bin/ckpt"

  # slice-pq: extract a row range from a parquet file into a new file named
  # <stem>_<range>.parquet; with one argument it just reports the row count.
  cat > "$HOME/bin/slice-pq" << 'EOF'
#!/usr/bin/env python
import sys
import pandas as pd
from pathlib import Path
if len(sys.argv) == 2:
    # Just show row count
    input_file = sys.argv[1]
    if not Path(input_file).exists():
        print(f"Error: {input_file} not found")
        sys.exit(1)
    df = pd.read_parquet(input_file)
    print(f"{input_file}: {len(df):,} rows")
    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
    print("Examples:")
    print(" slice-pq file.parquet 1000 # first 1000 rows")
    print(" slice-pq file.parquet 500:1500 # rows 500-1499")
    print(" slice-pq file.parquet 1000: # from row 1000 to end")
    print(" slice-pq file.parquet :500 # first 500 rows")
    sys.exit(0)
if len(sys.argv) != 3:
    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
    sys.exit(1)
input_file = sys.argv[1]
slice_arg = sys.argv[2]
# Parse slice argument (supports N or start:end format)
if ':' in slice_arg:
    start_str, end_str = slice_arg.split(':', 1)
    start = int(start_str) if start_str else None
    end = int(end_str) if end_str else None
else:
    start, end = None, int(slice_arg)
if not Path(input_file).exists():
    print(f"Error: {input_file} not found")
    sys.exit(1)
# Generate output filename
input_path = Path(input_file)
if start is None:
    suffix = f"_{end}"
elif end is None:
    suffix = f"_{start}_end"
else:
    suffix = f"_{start}_{end}"
output_file = input_path.stem + suffix + input_path.suffix
print(f"Reading {input_file}...")
df = pd.read_parquet(input_file)
print(f"Original: {len(df):,} rows")
# Apply slice
df_slice = df.iloc[start:end] if start is not None else df.head(end)
if start is None:
    slice_desc = f"first {end}"
elif end is None:
    slice_desc = f"rows {start} to end"
else:
    slice_desc = f"rows {start} to {end-1}"
print(f"Slicing to: {len(df_slice):,} rows ({slice_desc})")
df_slice.to_parquet(output_file)
print(f"Saved: {output_file}")
EOF
  chmod +x "$HOME/bin/slice-pq"
  success "Utility scripts created"
}
setup_shell() {
  # Idempotently append a managed configuration block to ~/.bashrc between
  # START_MARK and END_MARK. A .bak backup is taken first and any previous
  # managed block is stripped before the new one is appended.
  log "Setting up shell configuration..."
  local bashrc="$HOME/.bashrc"

  # Plain `if`: the original `[[ -f ]] && cp` returned 1 when ~/.bashrc was
  # missing, which aborted the whole script under `set -e`.
  if [[ -f "$bashrc" ]]; then
    cp "$bashrc" "$bashrc.bak"
  fi

  # Strip every previously-installed block (loop guards against duplicates
  # left behind by older runs).
  if [[ -f "$bashrc" ]] && grep -q "$START_MARK" "$bashrc"; then
    log "Removing existing configuration..."
    while grep -q "$START_MARK" "$bashrc"; do
      sed -i "/$START_MARK/,/$END_MARK/d" "$bashrc"
    done
  fi

  # Quoted delimiter: everything below is written to ~/.bashrc literally.
  cat >> "$bashrc" << 'EOF'
# >>> sagemaker-setup >>>
# Only run in interactive shells
case $- in
  *i*) ;;
  *) return ;;
esac
export PATH="$HOME/bin:$HOME/.local/bin:$PATH"
__git_branch() {
  if command -v git >/dev/null 2>&1; then
    local branch
    branch=$(git branch 2>/dev/null | grep '^\*' | cut -d' ' -f2- 2>/dev/null || echo "")
    [[ -n "$branch" ]] && echo " ($branch)"
  fi
}
if [[ -t 1 ]] && command -v tput >/dev/null 2>&1 && tput colors >/dev/null 2>&1 && [[ $(tput colors) -ge 8 ]]; then
  PS1='\[\033[32m\]\u@\h\[\033[0m\]:\[\033[34m\]\w\[\033[33m\]$(__git_branch)\[\033[0m\]\$ '
else
  PS1='\u@\h:\w$(__git_branch)\$ '
fi
alias ll='ls -lhF'
alias la='ls -lhAF'
alias lt='ls -lhFtr'
alias ..='cd ..'
alias ...='cd ../..'
alias -- -='cd -'
alias rm='rm -i'
alias mv='mv -i'
alias cp='cp -i'
alias grep='grep --color=auto'
alias df='df -h'
alias du='du -h'
alias free='free -h'
alias gpu='nvidia-smi'
alias gpuw='watch -n1 nvidia-smi'
alias py='python3'
alias ipy='ipython'
HISTSIZE=100000
HISTFILESIZE=100000
HISTCONTROL=ignoreboth:erasedups
HISTTIMEFORMAT='%F %T '
HISTIGNORE='ls:ll:cd:pwd:exit:date:* --help:gpu:gpuw'
# Enable these unconditionally: `shopt -q histappend` only *tests* whether the
# flag is set, so the old guard never actually turned history appending on.
shopt -s histappend 2>/dev/null
shopt -s cmdhist 2>/dev/null
PROMPT_COMMAND="${PROMPT_COMMAND:+$PROMPT_COMMAND; }history -a; history -n"
sql() {
  local query="${1:?Usage: sql 'SELECT ... FROM file.ext ...'}"
  duckdb -c "$query"
}
data_files() {
  local dir="${1:-.}"
  local show_all=false
  if [[ "$1" == "-a" ]]; then
    show_all=true
    dir="${2:-.}"
  fi
  local find_cmd="find \"$dir\" -type f"
  if [[ "$show_all" == false ]]; then
    find_cmd="$find_cmd -not -path '*/.conda/*' -not -path '*/.local/*' -not -path '*/.cache/*'"
  fi
  find_cmd="$find_cmd \( -name \"*.parquet\" -o -name \"*.csv\" -o -name \"*.json\" -o -name \"*.jsonl\" \) -exec ls -lh {} \; 2>/dev/null | awk '{print \$NF, \$5}'"
  eval "$find_cmd"
}
model_files() {
  local dir="${1:-.}"
  find "$dir" -type f \( \
    -name "*.pt" -o \
    -name "*.pth" -o \
    -name "*.bin" -o \
    -name "*.safetensors" -o \
    -name "*.ckpt" \
    \) -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
}
killpy() {
  local procs=$(pgrep -f python | wc -l)
  if [[ $procs -gt 0 ]]; then
    echo "Killing $procs Python processes..."
    pkill -9 -f python
  else
    echo "No Python processes running"
  fi
}
# SageMaker URL generator
url() {
  local space_name="${1:-$SAGEMAKER_SPACE_NAME}"
  if [[ -z "$space_name" ]]; then
    echo "Usage: url <space-name>"
    echo "Set SAGEMAKER_SPACE_NAME env var or pass space name as argument"
    return 1
  fi
  if ! command -v aws >/dev/null 2>&1; then
    echo "AWS CLI not available"
    return 1
  fi
  echo "Looking up space info for: $space_name"
  local space_info=$(timeout 10 aws sagemaker list-spaces --query "Spaces[?SpaceName=='$space_name'].[DomainId,OwnershipSettingsSummary.OwnerUserProfileName]" --output text 2>/dev/null)
  if [[ -z "$space_info" ]]; then
    echo "Error: Could not find space '$space_name'"
    return 1
  fi
  local domain_id=$(echo "$space_info" | cut -f1)
  local profile=$(echo "$space_info" | cut -f2)
  echo "Generating URL for space: $space_name, profile: $profile, domain: $domain_id"
  timeout 10 aws sagemaker create-presigned-domain-url \
    --domain-id "$domain_id" \
    --user-profile-name "$profile" \
    --space-name "$space_name" \
    --query 'AuthorizedUrl' \
    --output text 2>/dev/null || echo "Failed to generate URL (check AWS CLI setup)"
}
clear_gpu() {
  if command -v pkill >/dev/null 2>&1; then
    pkill -f python 2>/dev/null || true
    echo "GPU processes cleared"
  else
    echo "pkill not available"
  fi
}
echo "🚀 SageMaker tools loaded:"
echo " url [space] - Generate SageMaker URL"
echo " gpumem - Show GPU memory usage"
echo " gpumon - Real-time GPU monitoring"
echo " ckpt file.pt - Inspect PyTorch/SafeTensors checkpoints"
echo " pq data.parquet - View parquet files"
echo " slice-pq file.parquet [N|start:end] - Extract parquet rows"
echo " sql 'SELECT...' - Query data files with SQL"
echo " data_files [-a] - Find data files (csv, parquet, json)"
echo " model_files - Find model files (pt, safetensors, etc)"
echo " killpy - Kill Python processes"
echo " gpu/gpuw - nvidia-smi (watch mode)"
# <<< sagemaker-setup <<<
EOF
  success "Shell configuration added to ~/.bashrc"
}
setup_bash_profile() {
  # Ensure login shells load ~/.bashrc: append a source line to
  # ~/.bash_profile unless one is already present.
  local profile="$HOME/.bash_profile"
  if [[ -f "$profile" ]] && grep -q "bashrc" "$profile"; then
    return 0
  fi
  # shellcheck disable=SC2016
  echo 'if [[ -f "$HOME/.bashrc" ]]; then source "$HOME/.bashrc"; fi' >> "$profile"
  log "Updated ~/.bash_profile to source ~/.bashrc"
}
verify_setup() {
  # Sanity-check the key artifacts produced by the setup.
  # Returns 0 when everything is in place, 1 otherwise.
  log "Verifying setup..."
  local issues=0

  # Plain `if` + plain arithmetic: the original `[[ … ]] && { …; ((issues++)); }`
  # form returned 1 on the success path, and `((issues++))` evaluates to 0 on
  # the first failure — either of which kills the script under `set -e` when
  # this function is called outside an `if`.
  if [[ ! -x "$HOME/bin/gpumem" ]]; then
    error "gpumem script missing"
    issues=$((issues + 1))
  fi
  if [[ ! -f "$HOME/.bashrc" ]] || ! grep -q "$START_MARK" "$HOME/.bashrc"; then
    error "bashrc configuration missing"
    issues=$((issues + 1))
  fi

  if [[ $issues -eq 0 ]]; then
    success "Setup verification passed"
    return 0
  else
    error "Setup verification found $issues issues"
    return 1
  fi
}
main() {
  # Run every setup stage in order, then verify and report the result.
  echo "Starting SageMaker setup..."

  install_tools
  create_utilities
  setup_shell
  setup_bash_profile

  # Make sure the history file exists for the history settings added above.
  touch "$HOME/.bash_history" 2>/dev/null || true

  if ! verify_setup; then
    echo
    echo "❌ Setup completed with issues"
    echo "Check the error messages above"
    echo
    return 1
  fi

  echo
  echo "✅ Setup completed successfully!"
  echo
  echo "To activate the new configuration:"
  echo " source ~/.bashrc"
  echo
  echo "Or start a new shell session"
  echo
}
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment