swhitt · September 9, 2025 01:19
diff --git a/setup.sh b/setup.sh
 #!/usr/bin/env bash
 # SageMaker setup script
 # Run with: curl -sSL https://gist.githubusercontent.com/swhitt/3eece69e8bc096427c3c684548f508ed/raw/setup.sh | bash

 echo "🚀 SageMaker Setup Script"

 # DEBUG=1 in the environment switches on command tracing.
 [[ "${DEBUG:-0}" != "1" ]] || set -x

 # Stop at the first unhandled command failure.
 set -e

 # Delimiters of the managed block inside ~/.bashrc; frozen once assigned.
 START_MARK="# >>> sagemaker-setup >>>"
 END_MARK="# <<< sagemaker-setup <<<"
 readonly START_MARK END_MARK

 # Progress message on stdout.
 log() {
     printf '▶ %s\n' "$*"
 }

 # Failure message on stderr.
 error() {
     printf '✗ %s\n' "$*" >&2
 }

 # Completion message on stdout.
 success() {
     printf '✓ %s\n' "$*"
 }

 # Best-effort install of optional tooling into the user's home directory.
 #
 # Installs gpustat/nvidia-ml-py through pip3 and nvtop through conda when
 # those package managers are present, then fetches pinned DuckDB and jq
 # release binaries into ~/.local/bin. Each step only logs on failure, and an
 # existing duckdb/jq binary is replaced only after its replacement has been
 # downloaded successfully.
 #
 # Globals:  HOME (read)
 # Outputs:  progress messages via log/success
 # Returns:  0
 install_tools() {
     log "Installing essential tools..."

     local bin_dir="$HOME/.local/bin"
     mkdir -p "$bin_dir" "$HOME/bin"

     if command -v pip3 >/dev/null 2>&1; then
         log "Installing GPU and ML tools..."
         if timeout 30 pip3 install --user --quiet --upgrade gpustat nvidia-ml-py >/dev/null 2>&1; then
             success "GPU tools installed"
         else
             log "GPU tools install failed"
         fi
     fi

     # Install conda tools if conda is available
     if command -v conda >/dev/null 2>&1; then
         log "Installing nvtop via conda..."
         if timeout 60 conda install -y -c conda-forge nvtop >/dev/null 2>&1; then
             success "nvtop installed"
         else
             log "nvtop install failed"
         fi
     fi

     # Downloads are staged in a private temp dir so that a failed or partial
     # transfer never clobbers a binary that is already installed.
     local stage_dir
     if ! stage_dir=$(mktemp -d 2>/dev/null); then
         log "DuckDB install failed"
         log "jq install failed"
         return 0
     fi

     # Install DuckDB for data analysis
     log "Installing DuckDB..."
     # curl -f makes HTTP errors (e.g. 404) a non-zero exit instead of saving
     # the error page as the download. Every step is part of the condition, so
     # a missing or failing unzip is logged rather than aborting under set -e.
     if timeout 30 curl -fsL "https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip" -o "$stage_dir/duckdb.zip" >/dev/null 2>&1 \
         && unzip -q -o "$stage_dir/duckdb.zip" -d "$stage_dir" 2>/dev/null \
         && [[ -f "$stage_dir/duckdb" ]] \
         && chmod +x "$stage_dir/duckdb" \
         && mv -f "$stage_dir/duckdb" "$bin_dir/duckdb"; then
         success "DuckDB installed"
     else
         log "DuckDB install failed"
     fi

     log "Installing jq..."
     if timeout 30 curl -fsL "https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64" -o "$stage_dir/jq" >/dev/null 2>&1 \
         && [[ -s "$stage_dir/jq" ]] \
         && chmod +x "$stage_dir/jq" \
         && mv -f "$stage_dir/jq" "$bin_dir/jq"; then
         success "jq installed"
     else
         log "jq install failed"
     fi

     rm -rf -- "$stage_dir"
 }

 # Writes the helper commands gpumem, gpumon, pq, ckpt and slice-pq into ~/bin
 # and marks them executable, replacing any copies from an earlier run.
 #
 # Globals:  HOME (read)
 # Outputs:  progress messages via log/success
 create_utilities() {
     log "Creating utility scripts..."

     local bin_dir="$HOME/bin"
     # Normally created by install_tools; ensure it exists when called alone.
     mkdir -p "$bin_dir"

     rm -f "$bin_dir/gpumem" "$bin_dir/gpumon" "$bin_dir/pq" "$bin_dir/ckpt" "$bin_dir/slice-pq"

     # gpumem: one line of used/total/free memory per GPU.
     cat > "$bin_dir/gpumem" << 'EOF'
 #!/usr/bin/env bash
 if command -v nvidia-smi >/dev/null 2>&1; then
     nvidia-smi --query-gpu=index,memory.used,memory.total,memory.free --format=csv,noheader | \
         awk -F', ' '{printf "GPU %s: %s / %s (Free: %s)\n", $1, $2, $3, $4}'
 else
     echo "nvidia-smi not available"
 fi
 EOF

     # gpumon: refreshing table of utilisation, memory and temperature.
     cat > "$bin_dir/gpumon" << 'EOF'
 #!/usr/bin/env bash
 # Real-time GPU monitoring
 watch -n 0.5 'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits | column -t -s ","'
 EOF

     # pq: schema plus the first rows of a parquet file.
     cat > "$bin_dir/pq" << 'EOF'
 #!/usr/bin/env python3
 import sys
 try:
     import pandas as pd
     import pyarrow.parquet as pq
     from pathlib import Path

     if len(sys.argv) < 2:
         print("Usage: pq <file.parquet> [nrows]")
         sys.exit(1)

     file = sys.argv[1]
     nrows = int(sys.argv[2]) if len(sys.argv) > 2 else 5

     meta = pq.read_metadata(file)
     schema = pq.read_schema(file)

     print(f"File: {Path(file).name}")
     print(f"Rows: {meta.num_rows:,}")
     print(f"Columns: {len(schema.names)}")
     print("\nSchema:")
     for field in schema:
         print(f"  {field.name:30} {str(field.type):20}")

     print(f"\nFirst {nrows} rows:")
     df = pd.read_parquet(file)

     # Truncate binary columns to avoid pages of output
     pd.set_option('display.max_colwidth', 50)
     for col in df.columns:
         if df[col].dtype == 'object':
             if df[col].notna().any():
                 sample = str(df[col].dropna().iloc[0]) if not df[col].dropna().empty else ""
                 if len(sample) > 100 or any(ord(c) < 32 or ord(c) > 126 for c in sample[:100]):
                     df[col] = df[col].astype(str).str[:20] + '...'

     print(df.head(nrows).to_string(max_colwidth=50))
 except ImportError:
     print("Error: pandas/pyarrow not available. Install with: pip install pandas pyarrow")
     sys.exit(1)
 except Exception as e:
     print(f"Error: {e}")
     sys.exit(1)
 EOF

     # ckpt: summary of the tensors in a SafeTensors or PyTorch checkpoint.
     cat > "$bin_dir/ckpt" << 'EOF'
 #!/usr/bin/env python3
 import sys
 from pathlib import Path

 def inspect_checkpoint(path):
     path = Path(path)

     try:
         if path.suffix == '.safetensors' or 'safetensors' in path.name:
             from safetensors import safe_open
             with safe_open(path, framework="pt") as f:
                 keys = list(f.keys())
                 print(f"SafeTensors checkpoint: {len(keys)} tensors")
                 print(f"Size: {path.stat().st_size / 1e9:.2f}GB")

                 # Sample some keys
                 for k in keys[:10]:
                     tensor = f.get_tensor(k)
                     print(f"  {k}: {list(tensor.shape)} ({tensor.dtype})")
                 if len(keys) > 10:
                     print(f"  ... and {len(keys) - 10} more")
         else:
             import torch
             # NOTE: torch.load unpickles the file, which can execute arbitrary
             # code; only inspect checkpoints from sources you trust.
             ckpt = torch.load(path, map_location='cpu')

             if isinstance(ckpt, dict):
                 print(f"PyTorch checkpoint: {len(ckpt)} keys")
                 print(f"Size: {path.stat().st_size / 1e9:.2f}GB")

                 for i, (k, v) in enumerate(ckpt.items()):
                     if i >= 10:
                         print(f"  ... and {len(ckpt) - 10} more")
                         break
                     if hasattr(v, 'shape'):
                         print(f"  {k}: {list(v.shape)} ({v.dtype})")
                     else:
                         print(f"  {k}: {type(v).__name__}")
             else:
                 # A pickled object that is not a dict still gets a summary.
                 print(f"PyTorch checkpoint: {type(ckpt).__name__} object")
                 print(f"Size: {path.stat().st_size / 1e9:.2f}GB")
     except ImportError as e:
         print(f"Error: Required package not available - {e}")
         sys.exit(1)
     except Exception as e:
         print(f"Error: {e}")
         sys.exit(1)

 if __name__ == '__main__':
     if len(sys.argv) < 2:
         print("Usage: ckpt <checkpoint_file>")
         sys.exit(1)

     inspect_checkpoint(sys.argv[1])
 EOF

     # slice-pq: copy a row range of a parquet file into a new file in the cwd.
     cat > "$bin_dir/slice-pq" << 'EOF'
 #!/usr/bin/env python3

 import sys
 import pandas as pd
 from pathlib import Path

 if len(sys.argv) == 2:
     # Just show row count
     input_file = sys.argv[1]
     if not Path(input_file).exists():
         print(f"Error: {input_file} not found")
         sys.exit(1)

     df = pd.read_parquet(input_file)
     print(f"{input_file}: {len(df):,} rows")
     print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
     print("Examples:")
     print("  slice-pq file.parquet 1000      # first 1000 rows")
     print("  slice-pq file.parquet 500:1500  # rows 500-1499")
     print("  slice-pq file.parquet 1000:     # from row 1000 to end")
     print("  slice-pq file.parquet :500      # first 500 rows")
     sys.exit(0)

 if len(sys.argv) != 3:
     print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
     sys.exit(1)

 input_file = sys.argv[1]
 slice_arg = sys.argv[2]

 # Parse slice argument (supports N or start:end format)
 try:
     if ':' in slice_arg:
         start_str, end_str = slice_arg.split(':', 1)
         start = int(start_str) if start_str else None
         end = int(end_str) if end_str else None
     else:
         start, end = None, int(slice_arg)
 except ValueError:
     print(f"Error: invalid row count or range: {slice_arg!r}")
     sys.exit(1)

 # A bare ':' selects nothing in particular; ask for a real bound instead.
 if start is None and end is None:
     print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
     sys.exit(1)

 if not Path(input_file).exists():
     print(f"Error: {input_file} not found")
     sys.exit(1)

 # Generate output filename
 input_path = Path(input_file)
 if start is None:
     suffix = f"_{end}"
 elif end is None:
     suffix = f"_{start}_end"
 else:
     suffix = f"_{start}_{end}"

 output_file = input_path.stem + suffix + input_path.suffix

 print(f"Reading {input_file}...")
 df = pd.read_parquet(input_file)
 print(f"Original: {len(df):,} rows")

 # Apply slice
 df_slice = df.iloc[start:end] if start is not None else df.head(end)
 if start is None:
     slice_desc = f"first {end}"
 elif end is None:
     slice_desc = f"rows {start} to end"
 else:
     slice_desc = f"rows {start} to {end-1}"

 print(f"Slicing to: {len(df_slice):,} rows ({slice_desc})")

 df_slice.to_parquet(output_file)
 print(f"Saved: {output_file}")
 EOF

     chmod +x "$bin_dir/gpumem" "$bin_dir/gpumon" "$bin_dir/pq" "$bin_dir/ckpt" "$bin_dir/slice-pq"

     success "Utility scripts created"
 }

 # Installs the managed configuration block at the end of ~/.bashrc.
 #
 # An existing ~/.bashrc is first copied to ~/.bashrc.bak, and any block left
 # by an earlier run (delimited by START_MARK / END_MARK) is removed, so that
 # re-running the script replaces the configuration instead of duplicating it.
 #
 # Globals:  HOME, START_MARK, END_MARK (read)
 # Outputs:  progress messages via log/success
 setup_shell() {
     log "Setting up shell configuration..."

     local bashrc="$HOME/.bashrc"

     if [[ -f "$bashrc" ]]; then
         cp "$bashrc" "$bashrc.bak"

         # -F: the markers are literal text, not regular expressions.
         if grep -qF -- "$START_MARK" "$bashrc"; then
             log "Removing existing configuration..."
             while grep -qF -- "$START_MARK" "$bashrc"; do
                 sed -i "/$START_MARK/,/$END_MARK/d" "$bashrc"
             done
         fi
     fi

     cat >> "$bashrc" << 'EOF'

 # >>> sagemaker-setup >>>
 # Only run in interactive shells
 case $- in
     *i*) ;;
     *) return ;;
 esac

 # Prepend the tool directories once, even when this file is sourced again.
 case ":$PATH:" in
     *":$HOME/bin:$HOME/.local/bin:"*) ;;
     *) export PATH="$HOME/bin:$HOME/.local/bin:$PATH" ;;
 esac

 __git_branch() {
     if command -v git >/dev/null 2>&1; then
         local branch
         branch=$(git branch 2>/dev/null | grep '^\*' | cut -d' ' -f2- 2>/dev/null || echo "")
         [[ -n "$branch" ]] && echo " ($branch)"
     fi
 }

 if [[ -t 1 ]] && command -v tput >/dev/null 2>&1 && tput colors >/dev/null 2>&1 && [[ $(tput colors) -ge 8 ]]; then
     PS1='\[\033[32m\]\u@\h\[\033[0m\]:\[\033[34m\]\w\[\033[33m\]$(__git_branch)\[\033[0m\]\$ '
 else
     PS1='\u@\h:\w$(__git_branch)\$ '
 fi

 alias ll='ls -lhF'
 alias la='ls -lhAF'
 alias lt='ls -lhFtr'
 alias ..='cd ..'
 alias ...='cd ../..'
 alias -- -='cd -'

 alias rm='rm -i'
 alias mv='mv -i'
 alias cp='cp -i'

 alias grep='grep --color=auto'
 alias df='df -h'
 alias du='du -h'
 alias free='free -h'

 alias gpu='nvidia-smi'
 alias gpuw='watch -n1 nvidia-smi'

 alias py='python3'
 alias ipy='ipython'

 HISTSIZE=100000
 HISTFILESIZE=100000
 HISTCONTROL=ignoreboth:erasedups
 HISTTIMEFORMAT='%F %T '
 HISTIGNORE='ls:ll:cd:pwd:exit:date:* --help:gpu:gpuw'

 # Turn the options on unconditionally; "shopt -q" would only report whether
 # they were already enabled.
 shopt -s histappend cmdhist 2>/dev/null || true

 # Register the history sync hook once, even when this file is sourced again.
 case "${PROMPT_COMMAND:-}" in
     *"history -a; history -n"*) ;;
     *) PROMPT_COMMAND="${PROMPT_COMMAND:+$PROMPT_COMMAND; }history -a; history -n" ;;
 esac

 sql() {
     local query="${1:?Usage: sql 'SELECT ... FROM file.ext ...'}"
     duckdb -c "$query"
 }

 data_files() {
     local show_all=false
     if [[ "${1:-}" == "-a" ]]; then
         show_all=true
         shift
     fi
     local dir="${1:-.}"

     # Arguments are kept in an array so that directory names containing
     # spaces or shell metacharacters reach find unchanged.
     local -a exclude=()
     if [[ "$show_all" == false ]]; then
         exclude=(-not -path '*/.conda/*' -not -path '*/.local/*' -not -path '*/.cache/*')
     fi

     find "$dir" -type f "${exclude[@]}" \
         \( -name '*.parquet' -o -name '*.csv' -o -name '*.json' -o -name '*.jsonl' \) \
         -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
 }

 model_files() {
     local dir="${1:-.}"
     find "$dir" -type f \( \
         -name "*.pt" -o \
         -name "*.pth" -o \
         -name "*.bin" -o \
         -name "*.safetensors" -o \
         -name "*.ckpt" \
     \) -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
 }

 killpy() {
     local procs
     procs=$(pgrep -f python | wc -l)
     if [[ $procs -gt 0 ]]; then
         echo "Killing $procs Python processes..."
         pkill -9 -f python
     else
         echo "No Python processes running"
     fi
 }

 # SageMaker URL generator
 url() {
     local space_name="${1:-$SAGEMAKER_SPACE_NAME}"

     if [[ -z "$space_name" ]]; then
         echo "Usage: url <space-name>"
         echo "Set SAGEMAKER_SPACE_NAME env var or pass space name as argument"
         return 1
     fi

     if ! command -v aws >/dev/null 2>&1; then
         echo "AWS CLI not available"
         return 1
     fi

     echo "Looking up space info for: $space_name"
     local space_info
     space_info=$(timeout 10 aws sagemaker list-spaces --query "Spaces[?SpaceName=='$space_name'].[DomainId,OwnershipSettingsSummary.OwnerUserProfileName]" --output text 2>/dev/null)

     if [[ -z "$space_info" ]]; then
         echo "Error: Could not find space '$space_name'"
         return 1
     fi

     local domain_id profile
     domain_id=$(echo "$space_info" | cut -f1)
     profile=$(echo "$space_info" | cut -f2)

     echo "Generating URL for space: $space_name, profile: $profile, domain: $domain_id"
     timeout 10 aws sagemaker create-presigned-domain-url \
         --domain-id "$domain_id" \
         --user-profile-name "$profile" \
         --space-name "$space_name" \
         --query 'AuthorizedUrl' \
         --output text 2>/dev/null || echo "Failed to generate URL (check AWS CLI setup)"
 }

 clear_gpu() {
     if command -v pkill >/dev/null 2>&1; then
         pkill -f python 2>/dev/null || true
         echo "GPU processes cleared"
     else
         echo "pkill not available"
     fi
 }

 echo "🚀 SageMaker tools loaded:"
 echo "  url [space]     - Generate SageMaker URL"
 echo "  gpumem          - Show GPU memory usage"
 echo "  gpumon          - Real-time GPU monitoring"
 echo "  ckpt file.pt    - Inspect PyTorch/SafeTensors checkpoints"
 echo "  pq data.parquet - View parquet files"
 echo "  slice-pq file.parquet [N|start:end] - Extract parquet rows"
 echo "  sql 'SELECT...' - Query data files with SQL"
 echo "  data_files [-a] - Find data files (csv, parquet, json)"
 echo "  model_files     - Find model files (pt, safetensors, etc)"
 echo "  killpy          - Kill Python processes"
 echo "  gpu/gpuw        - nvidia-smi (watch mode)"

 # <<< sagemaker-setup <<<
 EOF

     success "Shell configuration added to ~/.bashrc"
 }

 # Makes login shells load ~/.bashrc by appending a source line to
 # ~/.bash_profile, unless that file already mentions "bashrc".
 #
 # Globals:  HOME (read)
 # Outputs:  a log message when the profile is modified
 setup_bash_profile() {
     local profile_path="$HOME/.bash_profile"

     # Nothing to do when the profile already refers to bashrc.
     if [[ -f "$profile_path" ]] && grep -q "bashrc" "$profile_path"; then
         return 0
     fi

     # shellcheck disable=SC2016
     echo 'if [[ -f "$HOME/.bashrc" ]]; then source "$HOME/.bashrc"; fi' >> "$profile_path"
     log "Updated ~/.bash_profile to source ~/.bashrc"
 }

 # Checks that the key artifacts of the setup exist.
 #
 # Verifies that ~/bin/gpumem is executable and that ~/.bashrc contains the
 # managed block marker. The issue counter uses plain arithmetic expansion
 # because ((issues++)) returns status 1 when the old value is 0, which would
 # abort the script under "set -e" if this function were called outside an
 # "if" condition.
 #
 # Globals:  HOME, START_MARK (read)
 # Outputs:  one error line per problem on stderr, plus a summary line
 # Returns:  0 when no issue was found, 1 otherwise
 verify_setup() {
     log "Verifying setup..."

     local issues=0

     if [[ ! -x "$HOME/bin/gpumem" ]]; then
         error "gpumem script missing"
         issues=$((issues + 1))
     fi

     # -F: the marker is literal text, not a regular expression.
     if [[ ! -f "$HOME/.bashrc" ]] || ! grep -qF -- "$START_MARK" "$HOME/.bashrc"; then
         error "bashrc configuration missing"
         issues=$((issues + 1))
     fi

     if [[ $issues -eq 0 ]]; then
         success "Setup verification passed"
         return 0
     fi

     error "Setup verification found $issues issues"
     return 1
 }

 # Runs every setup stage in order, then reports the verification result.
 #
 # Outputs:  progress and summary text on stdout
 # Returns:  0 when verify_setup succeeds, 1 otherwise
 main() {
     echo "Starting SageMaker setup..."

     install_tools
     create_utilities
     setup_shell
     setup_bash_profile

     # Failure to create the history file is ignored.
     touch "$HOME/.bash_history" 2>/dev/null || true

     if ! verify_setup; then
         printf '%s\n' \
             '' \
             '❌ Setup completed with issues' \
             'Check the error messages above' \
             ''
         return 1
     fi

     printf '%s\n' \
         '' \
         '✅ Setup completed successfully!' \
         '' \
         'To activate the new configuration:' \
         '  source ~/.bashrc' \
         '' \
         'Or start a new shell session' \
         ''
 }

 # Entry point: run main with the arguments given to the script.
 main "$@"
	#!/usr/bin/env bash
	# SageMaker setup script
	# Run with: curl -sSL https://gist.githubusercontent.com/swhitt/3eece69e8bc096427c3c684548f508ed/raw/setup.sh \| bash

	echo "🚀 SageMaker Setup Script"

	# DEBUG=1 in the environment switches on command tracing.
	[[ "${DEBUG:-0}" != "1" ]] || set -x

	# Stop at the first unhandled command failure.
	set -e

	# Delimiters of the managed block inside ~/.bashrc; frozen once assigned.
	START_MARK="# >>> sagemaker-setup >>>"
	END_MARK="# <<< sagemaker-setup <<<"
	readonly START_MARK END_MARK

	# Progress message on stdout.
	log() {
	    printf '▶ %s\n' "$*"
	}

	# Failure message on stderr.
	error() {
	    printf '✗ %s\n' "$*" >&2
	}

	# Completion message on stdout.
	success() {
	    printf '✓ %s\n' "$*"
	}

	# Best-effort install of optional tooling into the user's home directory.
	#
	# Installs gpustat/nvidia-ml-py through pip3 and nvtop through conda when
	# those package managers are present, then fetches pinned DuckDB and jq
	# release binaries into ~/.local/bin. Each step only logs on failure, and an
	# existing duckdb/jq binary is replaced only after its replacement has been
	# downloaded successfully.
	#
	# Globals:  HOME (read)
	# Outputs:  progress messages via log/success
	# Returns:  0
	install_tools() {
	    log "Installing essential tools..."

	    local bin_dir="$HOME/.local/bin"
	    mkdir -p "$bin_dir" "$HOME/bin"

	    if command -v pip3 >/dev/null 2>&1; then
	        log "Installing GPU and ML tools..."
	        if timeout 30 pip3 install --user --quiet --upgrade gpustat nvidia-ml-py >/dev/null 2>&1; then
	            success "GPU tools installed"
	        else
	            log "GPU tools install failed"
	        fi
	    fi

	    # Install conda tools if conda is available
	    if command -v conda >/dev/null 2>&1; then
	        log "Installing nvtop via conda..."
	        if timeout 60 conda install -y -c conda-forge nvtop >/dev/null 2>&1; then
	            success "nvtop installed"
	        else
	            log "nvtop install failed"
	        fi
	    fi

	    # Downloads are staged in a private temp dir so that a failed or partial
	    # transfer never clobbers a binary that is already installed.
	    local stage_dir
	    if ! stage_dir=$(mktemp -d 2>/dev/null); then
	        log "DuckDB install failed"
	        log "jq install failed"
	        return 0
	    fi

	    # Install DuckDB for data analysis
	    log "Installing DuckDB..."
	    # curl -f makes HTTP errors (e.g. 404) a non-zero exit instead of saving
	    # the error page as the download. Every step is part of the condition, so
	    # a missing or failing unzip is logged rather than aborting under set -e.
	    if timeout 30 curl -fsL "https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip" -o "$stage_dir/duckdb.zip" >/dev/null 2>&1 \
	        && unzip -q -o "$stage_dir/duckdb.zip" -d "$stage_dir" 2>/dev/null \
	        && [[ -f "$stage_dir/duckdb" ]] \
	        && chmod +x "$stage_dir/duckdb" \
	        && mv -f "$stage_dir/duckdb" "$bin_dir/duckdb"; then
	        success "DuckDB installed"
	    else
	        log "DuckDB install failed"
	    fi

	    log "Installing jq..."
	    if timeout 30 curl -fsL "https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64" -o "$stage_dir/jq" >/dev/null 2>&1 \
	        && [[ -s "$stage_dir/jq" ]] \
	        && chmod +x "$stage_dir/jq" \
	        && mv -f "$stage_dir/jq" "$bin_dir/jq"; then
	        success "jq installed"
	    else
	        log "jq install failed"
	    fi

	    rm -rf -- "$stage_dir"
	}

	# Writes the helper commands gpumem, gpumon, pq, ckpt and slice-pq into ~/bin
	# and marks them executable, replacing any copies from an earlier run.
	#
	# The here-documents use "<<-" so that the tab in front of every line,
	# including the EOF terminators, is stripped before the text is written.
	#
	# Globals:  HOME (read)
	# Outputs:  progress messages via log/success
	create_utilities() {
	    log "Creating utility scripts..."

	    local bin_dir="$HOME/bin"
	    # Normally created by install_tools; ensure it exists when called alone.
	    mkdir -p "$bin_dir"

	    rm -f "$bin_dir/gpumem" "$bin_dir/gpumon" "$bin_dir/pq" "$bin_dir/ckpt" "$bin_dir/slice-pq"

	    # gpumem: one line of used/total/free memory per GPU.
	    cat > "$bin_dir/gpumem" <<-'EOF'
	#!/usr/bin/env bash
	if command -v nvidia-smi >/dev/null 2>&1; then
	    nvidia-smi --query-gpu=index,memory.used,memory.total,memory.free --format=csv,noheader | \
	        awk -F', ' '{printf "GPU %s: %s / %s (Free: %s)\n", $1, $2, $3, $4}'
	else
	    echo "nvidia-smi not available"
	fi
	EOF

	    # gpumon: refreshing table of utilisation, memory and temperature.
	    cat > "$bin_dir/gpumon" <<-'EOF'
	#!/usr/bin/env bash
	# Real-time GPU monitoring
	watch -n 0.5 'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits | column -t -s ","'
	EOF

	    # pq: schema plus the first rows of a parquet file.
	    cat > "$bin_dir/pq" <<-'EOF'
	#!/usr/bin/env python3
	import sys
	try:
	    import pandas as pd
	    import pyarrow.parquet as pq
	    from pathlib import Path

	    if len(sys.argv) < 2:
	        print("Usage: pq <file.parquet> [nrows]")
	        sys.exit(1)

	    file = sys.argv[1]
	    nrows = int(sys.argv[2]) if len(sys.argv) > 2 else 5

	    meta = pq.read_metadata(file)
	    schema = pq.read_schema(file)

	    print(f"File: {Path(file).name}")
	    print(f"Rows: {meta.num_rows:,}")
	    print(f"Columns: {len(schema.names)}")
	    print("\nSchema:")
	    for field in schema:
	        print(f"  {field.name:30} {str(field.type):20}")

	    print(f"\nFirst {nrows} rows:")
	    df = pd.read_parquet(file)

	    # Truncate binary columns to avoid pages of output
	    pd.set_option('display.max_colwidth', 50)
	    for col in df.columns:
	        if df[col].dtype == 'object':
	            if df[col].notna().any():
	                sample = str(df[col].dropna().iloc[0]) if not df[col].dropna().empty else ""
	                if len(sample) > 100 or any(ord(c) < 32 or ord(c) > 126 for c in sample[:100]):
	                    df[col] = df[col].astype(str).str[:20] + '...'

	    print(df.head(nrows).to_string(max_colwidth=50))
	except ImportError:
	    print("Error: pandas/pyarrow not available. Install with: pip install pandas pyarrow")
	    sys.exit(1)
	except Exception as e:
	    print(f"Error: {e}")
	    sys.exit(1)
	EOF

	    # ckpt: summary of the tensors in a SafeTensors or PyTorch checkpoint.
	    cat > "$bin_dir/ckpt" <<-'EOF'
	#!/usr/bin/env python3
	import sys
	from pathlib import Path

	def inspect_checkpoint(path):
	    path = Path(path)

	    try:
	        if path.suffix == '.safetensors' or 'safetensors' in path.name:
	            from safetensors import safe_open
	            with safe_open(path, framework="pt") as f:
	                keys = list(f.keys())
	                print(f"SafeTensors checkpoint: {len(keys)} tensors")
	                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")

	                # Sample some keys
	                for k in keys[:10]:
	                    tensor = f.get_tensor(k)
	                    print(f"  {k}: {list(tensor.shape)} ({tensor.dtype})")
	                if len(keys) > 10:
	                    print(f"  ... and {len(keys) - 10} more")
	        else:
	            import torch
	            # NOTE: torch.load unpickles the file, which can execute arbitrary
	            # code; only inspect checkpoints from sources you trust.
	            ckpt = torch.load(path, map_location='cpu')

	            if isinstance(ckpt, dict):
	                print(f"PyTorch checkpoint: {len(ckpt)} keys")
	                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")

	                for i, (k, v) in enumerate(ckpt.items()):
	                    if i >= 10:
	                        print(f"  ... and {len(ckpt) - 10} more")
	                        break
	                    if hasattr(v, 'shape'):
	                        print(f"  {k}: {list(v.shape)} ({v.dtype})")
	                    else:
	                        print(f"  {k}: {type(v).__name__}")
	            else:
	                # A pickled object that is not a dict still gets a summary.
	                print(f"PyTorch checkpoint: {type(ckpt).__name__} object")
	                print(f"Size: {path.stat().st_size / 1e9:.2f}GB")
	    except ImportError as e:
	        print(f"Error: Required package not available - {e}")
	        sys.exit(1)
	    except Exception as e:
	        print(f"Error: {e}")
	        sys.exit(1)

	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        print("Usage: ckpt <checkpoint_file>")
	        sys.exit(1)

	    inspect_checkpoint(sys.argv[1])
	EOF

	    # slice-pq: copy a row range of a parquet file into a new file in the cwd.
	    cat > "$bin_dir/slice-pq" <<-'EOF'
	#!/usr/bin/env python3

	import sys
	import pandas as pd
	from pathlib import Path

	if len(sys.argv) == 2:
	    # Just show row count
	    input_file = sys.argv[1]
	    if not Path(input_file).exists():
	        print(f"Error: {input_file} not found")
	        sys.exit(1)

	    df = pd.read_parquet(input_file)
	    print(f"{input_file}: {len(df):,} rows")
	    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
	    print("Examples:")
	    print("  slice-pq file.parquet 1000      # first 1000 rows")
	    print("  slice-pq file.parquet 500:1500  # rows 500-1499")
	    print("  slice-pq file.parquet 1000:     # from row 1000 to end")
	    print("  slice-pq file.parquet :500      # first 500 rows")
	    sys.exit(0)

	if len(sys.argv) != 3:
	    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
	    sys.exit(1)

	input_file = sys.argv[1]
	slice_arg = sys.argv[2]

	# Parse slice argument (supports N or start:end format)
	try:
	    if ':' in slice_arg:
	        start_str, end_str = slice_arg.split(':', 1)
	        start = int(start_str) if start_str else None
	        end = int(end_str) if end_str else None
	    else:
	        start, end = None, int(slice_arg)
	except ValueError:
	    print(f"Error: invalid row count or range: {slice_arg!r}")
	    sys.exit(1)

	# A bare ':' selects nothing in particular; ask for a real bound instead.
	if start is None and end is None:
	    print("Usage: slice-pq <input.parquet> <num_rows_or_range>")
	    sys.exit(1)

	if not Path(input_file).exists():
	    print(f"Error: {input_file} not found")
	    sys.exit(1)

	# Generate output filename
	input_path = Path(input_file)
	if start is None:
	    suffix = f"_{end}"
	elif end is None:
	    suffix = f"_{start}_end"
	else:
	    suffix = f"_{start}_{end}"

	output_file = input_path.stem + suffix + input_path.suffix

	print(f"Reading {input_file}...")
	df = pd.read_parquet(input_file)
	print(f"Original: {len(df):,} rows")

	# Apply slice
	df_slice = df.iloc[start:end] if start is not None else df.head(end)
	if start is None:
	    slice_desc = f"first {end}"
	elif end is None:
	    slice_desc = f"rows {start} to end"
	else:
	    slice_desc = f"rows {start} to {end-1}"

	print(f"Slicing to: {len(df_slice):,} rows ({slice_desc})")

	df_slice.to_parquet(output_file)
	print(f"Saved: {output_file}")
	EOF

	    chmod +x "$bin_dir/gpumem" "$bin_dir/gpumon" "$bin_dir/pq" "$bin_dir/ckpt" "$bin_dir/slice-pq"

	    success "Utility scripts created"
	}

	# Installs the managed configuration block at the end of ~/.bashrc.
	#
	# An existing ~/.bashrc is first copied to ~/.bashrc.bak, and any block left
	# by an earlier run (delimited by START_MARK / END_MARK) is removed, so that
	# re-running the script replaces the configuration instead of duplicating it.
	# The here-document uses "<<-" so that the tab in front of every line,
	# including the EOF terminator, is stripped before the text is written.
	#
	# Globals:  HOME, START_MARK, END_MARK (read)
	# Outputs:  progress messages via log/success
	setup_shell() {
	    log "Setting up shell configuration..."

	    local bashrc="$HOME/.bashrc"

	    if [[ -f "$bashrc" ]]; then
	        cp "$bashrc" "$bashrc.bak"

	        # -F: the markers are literal text, not regular expressions.
	        if grep -qF -- "$START_MARK" "$bashrc"; then
	            log "Removing existing configuration..."
	            while grep -qF -- "$START_MARK" "$bashrc"; do
	                sed -i "/$START_MARK/,/$END_MARK/d" "$bashrc"
	            done
	        fi
	    fi

	    cat >> "$bashrc" <<-'EOF'

	# >>> sagemaker-setup >>>
	# Only run in interactive shells
	case $- in
	    *i*) ;;
	    *) return ;;
	esac

	# Prepend the tool directories once, even when this file is sourced again.
	case ":$PATH:" in
	    *":$HOME/bin:$HOME/.local/bin:"*) ;;
	    *) export PATH="$HOME/bin:$HOME/.local/bin:$PATH" ;;
	esac

	__git_branch() {
	    if command -v git >/dev/null 2>&1; then
	        local branch
	        branch=$(git branch 2>/dev/null | grep '^\*' | cut -d' ' -f2- 2>/dev/null || echo "")
	        [[ -n "$branch" ]] && echo " ($branch)"
	    fi
	}

	if [[ -t 1 ]] && command -v tput >/dev/null 2>&1 && tput colors >/dev/null 2>&1 && [[ $(tput colors) -ge 8 ]]; then
	    PS1='\[\033[32m\]\u@\h\[\033[0m\]:\[\033[34m\]\w\[\033[33m\]$(__git_branch)\[\033[0m\]\$ '
	else
	    PS1='\u@\h:\w$(__git_branch)\$ '
	fi

	alias ll='ls -lhF'
	alias la='ls -lhAF'
	alias lt='ls -lhFtr'
	alias ..='cd ..'
	alias ...='cd ../..'
	alias -- -='cd -'

	alias rm='rm -i'
	alias mv='mv -i'
	alias cp='cp -i'

	alias grep='grep --color=auto'
	alias df='df -h'
	alias du='du -h'
	alias free='free -h'

	alias gpu='nvidia-smi'
	alias gpuw='watch -n1 nvidia-smi'

	alias py='python3'
	alias ipy='ipython'

	HISTSIZE=100000
	HISTFILESIZE=100000
	HISTCONTROL=ignoreboth:erasedups
	HISTTIMEFORMAT='%F %T '
	HISTIGNORE='ls:ll:cd:pwd:exit:date:* --help:gpu:gpuw'

	# Turn the options on unconditionally; "shopt -q" would only report whether
	# they were already enabled.
	shopt -s histappend cmdhist 2>/dev/null || true

	# Register the history sync hook once, even when this file is sourced again.
	case "${PROMPT_COMMAND:-}" in
	    *"history -a; history -n"*) ;;
	    *) PROMPT_COMMAND="${PROMPT_COMMAND:+$PROMPT_COMMAND; }history -a; history -n" ;;
	esac

	sql() {
	    local query="${1:?Usage: sql 'SELECT ... FROM file.ext ...'}"
	    duckdb -c "$query"
	}

	data_files() {
	    local show_all=false
	    if [[ "${1:-}" == "-a" ]]; then
	        show_all=true
	        shift
	    fi
	    local dir="${1:-.}"

	    # Arguments are kept in an array so that directory names containing
	    # spaces or shell metacharacters reach find unchanged.
	    local -a exclude=()
	    if [[ "$show_all" == false ]]; then
	        exclude=(-not -path '*/.conda/*' -not -path '*/.local/*' -not -path '*/.cache/*')
	    fi

	    find "$dir" -type f "${exclude[@]}" \
	        \( -name '*.parquet' -o -name '*.csv' -o -name '*.json' -o -name '*.jsonl' \) \
	        -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
	}

	model_files() {
	    local dir="${1:-.}"
	    find "$dir" -type f \( \
	        -name "*.pt" -o \
	        -name "*.pth" -o \
	        -name "*.bin" -o \
	        -name "*.safetensors" -o \
	        -name "*.ckpt" \
	    \) -exec ls -lh {} \; 2>/dev/null | awk '{print $NF, $5}'
	}

	killpy() {
	    local procs
	    procs=$(pgrep -f python | wc -l)
	    if [[ $procs -gt 0 ]]; then
	        echo "Killing $procs Python processes..."
	        pkill -9 -f python
	    else
	        echo "No Python processes running"
	    fi
	}

	# SageMaker URL generator
	url() {
	    local space_name="${1:-$SAGEMAKER_SPACE_NAME}"

	    if [[ -z "$space_name" ]]; then
	        echo "Usage: url <space-name>"
	        echo "Set SAGEMAKER_SPACE_NAME env var or pass space name as argument"
	        return 1
	    fi

	    if ! command -v aws >/dev/null 2>&1; then
	        echo "AWS CLI not available"
	        return 1
	    fi

	    echo "Looking up space info for: $space_name"
	    local space_info
	    space_info=$(timeout 10 aws sagemaker list-spaces --query "Spaces[?SpaceName=='$space_name'].[DomainId,OwnershipSettingsSummary.OwnerUserProfileName]" --output text 2>/dev/null)

	    if [[ -z "$space_info" ]]; then
	        echo "Error: Could not find space '$space_name'"
	        return 1
	    fi

	    local domain_id profile
	    domain_id=$(echo "$space_info" | cut -f1)
	    profile=$(echo "$space_info" | cut -f2)

	    echo "Generating URL for space: $space_name, profile: $profile, domain: $domain_id"
	    timeout 10 aws sagemaker create-presigned-domain-url \
	        --domain-id "$domain_id" \
	        --user-profile-name "$profile" \
	        --space-name "$space_name" \
	        --query 'AuthorizedUrl' \
	        --output text 2>/dev/null || echo "Failed to generate URL (check AWS CLI setup)"
	}

	clear_gpu() {
	    if command -v pkill >/dev/null 2>&1; then
	        pkill -f python 2>/dev/null || true
	        echo "GPU processes cleared"
	    else
	        echo "pkill not available"
	    fi
	}

	echo "🚀 SageMaker tools loaded:"
	echo "  url [space]     - Generate SageMaker URL"
	echo "  gpumem          - Show GPU memory usage"
	echo "  gpumon          - Real-time GPU monitoring"
	echo "  ckpt file.pt    - Inspect PyTorch/SafeTensors checkpoints"
	echo "  pq data.parquet - View parquet files"
	echo "  slice-pq file.parquet [N|start:end] - Extract parquet rows"
	echo "  sql 'SELECT...' - Query data files with SQL"
	echo "  data_files [-a] - Find data files (csv, parquet, json)"
	echo "  model_files     - Find model files (pt, safetensors, etc)"
	echo "  killpy          - Kill Python processes"
	echo "  gpu/gpuw        - nvidia-smi (watch mode)"

	# <<< sagemaker-setup <<<
	EOF

	    success "Shell configuration added to ~/.bashrc"
	}

	# Makes login shells load ~/.bashrc by appending a source line to
	# ~/.bash_profile, unless that file already mentions "bashrc".
	#
	# Globals:  HOME (read)
	# Outputs:  a log message when the profile is modified
	setup_bash_profile() {
	    local bash_profile="$HOME/.bash_profile"

	    # A real "||" here: the escaped "\|\|" form is a syntax error.
	    if [[ ! -f "$bash_profile" ]] || ! grep -q "bashrc" "$bash_profile"; then
	        # shellcheck disable=SC2016
	        echo 'if [[ -f "$HOME/.bashrc" ]]; then source "$HOME/.bashrc"; fi' >> "$bash_profile"
	        log "Updated ~/.bash_profile to source ~/.bashrc"
	    fi
	}

	# Checks that the key artifacts of the setup exist.
	#
	# Verifies that ~/bin/gpumem is executable and that ~/.bashrc contains the
	# managed block marker. The issue counter uses plain arithmetic expansion
	# because ((issues++)) returns status 1 when the old value is 0, which would
	# abort the script under "set -e" if this function were called outside an
	# "if" condition.
	#
	# Globals:  HOME, START_MARK (read)
	# Outputs:  one error line per problem on stderr, plus a summary line
	# Returns:  0 when no issue was found, 1 otherwise
	verify_setup() {
	    log "Verifying setup..."

	    local issues=0

	    if [[ ! -x "$HOME/bin/gpumem" ]]; then
	        error "gpumem script missing"
	        issues=$((issues + 1))
	    fi

	    # -F: the marker is literal text, not a regular expression.
	    if [[ ! -f "$HOME/.bashrc" ]] || ! grep -qF -- "$START_MARK" "$HOME/.bashrc"; then
	        error "bashrc configuration missing"
	        issues=$((issues + 1))
	    fi

	    if [[ $issues -eq 0 ]]; then
	        success "Setup verification passed"
	        return 0
	    fi

	    error "Setup verification found $issues issues"
	    return 1
	}

	# Runs every setup stage in order, then reports the verification result.
	#
	# Outputs:  progress and summary text on stdout
	# Returns:  0 when verify_setup succeeds, 1 otherwise
	main() {
	    echo "Starting SageMaker setup..."

	    install_tools
	    create_utilities
	    setup_shell
	    setup_bash_profile

	    # Failure to create the history file is ignored. A real "||" is needed:
	    # the escaped "\|\|" form hands a literal "||" to touch as a file name.
	    touch "$HOME/.bash_history" 2>/dev/null || true

	    if verify_setup; then
	        echo
	        echo "✅ Setup completed successfully!"
	        echo
	        echo "To activate the new configuration:"
	        echo "  source ~/.bashrc"
	        echo
	        echo "Or start a new shell session"
	        echo
	    else
	        echo
	        echo "❌ Setup completed with issues"
	        echo "Check the error messages above"
	        echo
	        return 1
	    fi
	}

	# Entry point: run main with the arguments given to the script.
	main "$@"
No results found