Burrito - Ollama LLM Benchmark
Note: Click the "Raw" button for a wider view.
Sorted by Tok/sec (descending).

🚀 Ollama LLM Benchmark Results
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━┓
┃ Model                                            ┃      TTFT (s) ┃       TPOT (s) ┃ Tok/sec ┃ Params (B) ┃ Size (GB) ┃ VRAM (GB) ┃ Runs ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━┩
│ tinyllama:1.1b                                   │  0.052 ±0.008 │ 0.0038 ±0.0001 │   266.2 │        1.1 │      0.59 │      0.71 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.2:3b                                      │  0.116 ±0.006 │ 0.0061 ±0.0001 │   162.7 │        3.0 │      1.88 │      2.26 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ phi3:mini                                        │  0.048 ±0.003 │ 0.0070 ±0.0001 │   143.2 │        N/A │      2.03 │      2.43 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ interstellarninja/hermes-2-pro-llama-3-8b:latest │  0.165 ±0.088 │ 0.0102 ±0.0001 │    97.9 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.1:8b                                      │  0.435 ±0.544 │ 0.0105 ±0.0001 │    95.5 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ nollama/mythomax-l2-13b:Q4_K_M                   │  0.066 ±0.012 │ 0.0174 ±0.0021 │    57.4 │       13.0 │      7.33 │      8.79 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ gpt-oss:20b                                      │ -1.000 ±0.000 │ 0.0329 ±0.0004 │    30.4 │       20.0 │     12.85 │     15.42 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen3:8b                                         │ -1.000 ±0.000 │ 0.0330 ±0.0001 │    30.3 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ deepseek-r1:8b                                   │ -1.000 ±0.000 │ 0.0334 ±0.0003 │    30.0 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.3:70b                                     │  0.173 ±0.005 │ 0.0719 ±0.0002 │    13.9 │       70.0 │     39.60 │     47.52 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen2.5:72b                                      │  0.186 ±0.003 │ 0.0776 ±0.0001 │    12.9 │       72.0 │     44.16 │     52.99 │    3 │
└──────────────────────────────────────────────────┴───────────────┴────────────────┴─────────┴────────────┴───────────┴───────────┴──────┘
📊 Columns:
  • TTFT (s)  : Time to First Token - latency before generation starts (responsiveness)
  • TPOT (s)  : Time Per Output Token - average time per token during generation
  • Tok/sec   : Tokens per second - sustained generation speed (inverse of TPOT)
  • Params (B): Model size in billions of parameters
  • Size (GB) : Disk space used by the model file
  • VRAM (GB) : Estimated memory usage (file size × 1.2, i.e. 20% headroom for KV cache; see the worked example below)
  • Runs      : Number of test iterations averaged
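The derived columns follow directly from the measured ones. A minimal sketch in plain Python, using the tinyllama:1.1b row above (the 1.2 factor matches VRAM_OVERHEAD_FACTOR in the script below):

# Recompute the derived columns for the tinyllama:1.1b row.
tpot = 0.0038                # mean seconds per output token (TPOT)
size_gb = 0.59               # model file size on disk

tok_per_sec = 1.0 / tpot     # 263.2 -- the table shows 266.2 because it
                             # divides by the unrounded TPOT mean
vram_gb = size_gb * 1.2      # 0.71 GB: file size plus 20% KV-cache headroom
print(f"{tok_per_sec:.1f} tok/s, {vram_gb:.2f} GB VRAM")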
#!/usr/bin/env python3
DEFAULT_PROMPT = "What is a burrito? Give a one-sentence answer."
ALWAYS_WARMUP = True  # forces a warmup run regardless of the --warmup flag

# CLI options (flag | default | description):
# --host   | OLLAMA_HOST    | Ollama server URL
# --prompt | DEFAULT_PROMPT | Benchmark prompt
# --runs   | 3              | Number of benchmark runs per model
# --warmup | true           | Run a warmup iteration before benchmarking
# --models | all installed  | Specific models to benchmark
# --sort   | speed          | ttft, tpot, size - Sort results by metric
# --output | none           | Output file path for results
# --format | json           | csv - Output format
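# Illustrative invocations (the script name matches the transcript below;
# flags are defined in main()):
#   python burrito.py                                 # benchmark every installed model
#   python burrito.py --models llama3.2:3b --runs 5   # five runs of a single model
#   python burrito.py --output results.json           # also export results as JSON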
import ollama
import re
import time
import json
import csv
import argparse
import statistics
from pathlib import Path
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass, asdict

# Optional: rich library for better output (falls back to basic if not available)
try:
    from rich.console import Console
    from rich.table import Table
    RICH_AVAILABLE = True
    console = Console()
except ImportError:
    RICH_AVAILABLE = False
    console = None

# --- Configuration ---
OLLAMA_HOST = 'http://localhost:11434'
VRAM_OVERHEAD_FACTOR = 1.2  # 20% headroom over file size for KV cache etc.
@dataclass
class BenchmarkResult:
    """Structured benchmark result data."""
    model_name: str
    ttft_mean: float
    ttft_std: float
    tpot_mean: float
    tpot_std: float
    tokens_per_sec: float
    param_count_billion: float
    file_size_gb: float
    vram_estimate_gb: float
    output_tokens: int
    runs: int
    load_time_ms: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for export."""
        return asdict(self)


# --- Helper Functions ---
def bytes_to_gb(b: int) -> float:
    """Converts bytes to gigabytes."""
    return b / (1024 ** 3)
def extract_param_count(model_name: str, details: Dict[str, Any]) -> float:
    """
    Extracts the parameter count from the model name or details,
    trying several strategies in order.
    """
    # Strategy 1: Parse from model name (e.g., "llama3.2:3b", "mistral:7b-instruct")
    model_lower = model_name.lower()
    # Look for patterns like "7b", "3.2b", "70b", etc.
    patterns = [
        r'(\d+\.?\d*)b',   # Matches 7b, 3.2b, 70b
        r':(\d+\.?\d*)b',  # Matches :7b
    ]
    for pattern in patterns:
        match = re.search(pattern, model_lower)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                pass

    # Strategy 2: Check the modelfile for parameter info (placeholder --
    # modelfiles rarely state a count, so nothing is parsed here yet)
    modelfile = (
        getattr(details, 'modelfile', '') if hasattr(details, 'modelfile')
        else (
            details.get('modelfile', '') if isinstance(details, dict)
            else ''
        )
    )
    if 'parameters' in modelfile.lower():
        pass

    # Strategy 3: Estimate from file size (very rough)
    if hasattr(details, 'size'):
        file_size_gb = bytes_to_gb(details.size)
    elif isinstance(details, dict):
        file_size_gb = bytes_to_gb(details.get('size', 0))
    else:
        file_size_gb = 0.0
    if file_size_gb > 0:
        # Rough fallback estimate: Q4 quantization uses ~0.5-0.6 GB
        # per billion parameters
        estimated = file_size_gb / 0.55
        if estimated < 100:  # Sanity check
            return round(estimated, 1)
    return 0.0
def get_model_details(client: ollama.Client, model_name: str, file_size_bytes: int = 0) -> Dict[str, Any]:
    """Fetches detailed information about a model using /api/show."""
    try:
        details = client.show(model_name)
        param_count = extract_param_count(model_name, details)
        vram_estimate_gb = bytes_to_gb(file_size_bytes) * VRAM_OVERHEAD_FACTOR
        return {
            "file_size_gb": bytes_to_gb(file_size_bytes),
            "param_count_billion": param_count,
            "vram_estimate_gb": vram_estimate_gb,
            "details": details
        }
    except Exception as e:
        print(f"Warning: Error fetching details for {model_name}: {e}")
        return {
            "file_size_gb": 0.0,
            "param_count_billion": 0.0,
            "vram_estimate_gb": 0.0,
            "details": {}
        }
def measure_ttft(client: ollama.Client, model_name: str, prompt: str) -> Tuple[float, float, int, float]:
    """
    Measures Time-to-First-Token (TTFT), TPOT, and load time.
    Returns: (ttft, tpot, eval_count, load_time_ms)
    """
    try:
        start_time = time.time()
        ttft = -1.0          # sentinel: stays negative if no visible token arrives
        total_duration = 0.0
        load_duration_ns = 0
        eval_count = 0
        response_stream = client.generate(
            model=model_name,
            prompt=prompt,
            stream=True,
            options={
                "num_predict": 50,   # Limit output for consistency
                "temperature": 0.0,  # Deterministic for benchmarking
            }
        )
        for chunk in response_stream:
            # Reasoning models (e.g. qwen3, deepseek-r1, gpt-oss) can spend
            # the whole token budget on hidden "thinking" output, so
            # 'response' never fills and TTFT stays at the -1.0 sentinel --
            # which is what the -1.000 entries in the results table show.
            # Newer ollama-python releases also expose a 'thinking' field;
            # checking it as well would give those models a real TTFT.
            if ttft < 0 and chunk.get('response'):
                ttft = time.time() - start_time
            if chunk.get('done'):
                total_duration_ns = chunk.get('total_duration', 0)
                load_duration_ns = chunk.get('load_duration', 0)
                total_duration = total_duration_ns / 1_000_000_000
                eval_count = chunk.get('eval_count', 0)
                break
        # total_duration also covers load and prompt evaluation, so TPOT is a
        # slight overestimate; clamp the sentinel so a missed TTFT does not
        # inflate it further.
        tpot = (total_duration - max(ttft, 0.0)) / eval_count if eval_count > 0 else 0.0
        load_time_ms = load_duration_ns / 1_000_000
        return ttft, tpot, eval_count, load_time_ms
    except Exception as e:
        print(f"Error benchmarking {model_name}: {e}")
        return float('inf'), float('inf'), 0, 0.0
def run_multiple_benchmarks(client: ollama.Client, model_name: str, prompt: str,
                            runs: int, warmup: bool = True) -> Dict[str, Any]:
    """
    Run multiple benchmark iterations and calculate statistics.
    """
    if warmup:
        if RICH_AVAILABLE:
            console.print("  [dim]Running warmup...[/dim]")
        else:
            print("  Running warmup...")
        measure_ttft(client, model_name, prompt)
        time.sleep(1)

    ttfts = []
    tpots = []
    eval_counts = []
    load_times = []
    for i in range(runs):
        if RICH_AVAILABLE:
            console.print(f"  [dim]Run {i+1}/{runs}...[/dim]")
        else:
            print(f"  Run {i+1}/{runs}...")
        ttft, tpot, eval_count, load_time = measure_ttft(client, model_name, prompt)
        if ttft != float('inf'):
            ttfts.append(ttft)
            tpots.append(tpot)
            eval_counts.append(eval_count)
            load_times.append(load_time)
        time.sleep(0.5)  # Brief pause between runs

    if not ttfts:
        return {
            "ttft_mean": float('inf'),
            "ttft_std": 0.0,
            "tpot_mean": float('inf'),
            "tpot_std": 0.0,
            "tokens_per_sec": 0.0,
            "output_tokens": 0,
            "load_time_ms": 0.0
        }

    ttft_mean = statistics.mean(ttfts)
    tpot_mean = statistics.mean(tpots)
    tokens_per_sec = 1.0 / tpot_mean if tpot_mean > 0 else 0.0
    return {
        "ttft_mean": ttft_mean,
        "ttft_std": statistics.stdev(ttfts) if len(ttfts) > 1 else 0.0,
        "tpot_mean": tpot_mean,
        "tpot_std": statistics.stdev(tpots) if len(tpots) > 1 else 0.0,
        "tokens_per_sec": tokens_per_sec,
        "output_tokens": int(statistics.mean(eval_counts)),
        "load_time_ms": statistics.mean(load_times)
    }
def print_results_table(results: List[BenchmarkResult], sort_by: str = 'ttft'):
    """Print results in a formatted table."""
    if not results:
        print("No results to display.")
        return

    # Sort results
    sort_key_map = {
        'ttft': lambda x: x.ttft_mean,
        'tpot': lambda x: x.tpot_mean,
        'speed': lambda x: -x.tokens_per_sec,  # Negative for descending
        'size': lambda x: x.file_size_gb
    }
    results = sorted(results, key=sort_key_map.get(sort_by, sort_key_map['ttft']))

    if RICH_AVAILABLE:
        table = Table(title="🚀 Ollama LLM Benchmark Results", show_lines=True)
        table.add_column("Model", style="cyan", no_wrap=True)
        table.add_column("TTFT (s)", justify="right", style="green")
        table.add_column("TPOT (s)", justify="right", style="yellow")
        table.add_column("Tok/sec", justify="right", style="magenta")
        table.add_column("Params (B)", justify="right")
        table.add_column("Size (GB)", justify="right")
        table.add_column("VRAM (GB)", justify="right", style="red")
        table.add_column("Runs", justify="right", style="dim")
        for result in results:
            ttft_str = f"{result.ttft_mean:.3f} ±{result.ttft_std:.3f}" if result.ttft_mean != float('inf') else "Error"
            tpot_str = f"{result.tpot_mean:.4f} ±{result.tpot_std:.4f}" if result.tpot_mean != float('inf') else "Error"
            table.add_row(
                result.model_name,
                ttft_str,
                tpot_str,
                f"{result.tokens_per_sec:.1f}",
                f"{result.param_count_billion:.1f}" if result.param_count_billion > 0 else "N/A",
                f"{result.file_size_gb:.2f}",
                f"{result.vram_estimate_gb:.2f}",
                str(result.runs)
            )
        console.print(table)
    else:
        # Fallback to basic table
        print("\n" + "=" * 120)
        print("OLLAMA LLM BENCHMARK RESULTS")
        print("=" * 120)
        header = "{:<30} | {:>12} | {:>12} | {:>10} | {:>10} | {:>10} | {:>10} | {:>5}".format(
            "Model", "TTFT (s)", "TPOT (s)", "Tok/sec", "Params (B)", "Size (GB)", "VRAM (GB)", "Runs"
        )
        print(header)
        print("-" * 120)
        for result in results:
            ttft_str = f"{result.ttft_mean:.3f}±{result.ttft_std:.2f}" if result.ttft_mean != float('inf') else "Error"
            tpot_str = f"{result.tpot_mean:.4f}±{result.tpot_std:.3f}" if result.tpot_mean != float('inf') else "Error"
            row = "{:<30} | {:>12} | {:>12} | {:>10.1f} | {:>10} | {:>10.2f} | {:>10.2f} | {:>5}".format(
                result.model_name[:30],
                ttft_str,
                tpot_str,
                result.tokens_per_sec,
                f"{result.param_count_billion:.1f}" if result.param_count_billion > 0 else "N/A",
                result.file_size_gb,
                result.vram_estimate_gb,
                result.runs
            )
            print(row)
        print("=" * 120)
def export_results(results: List[BenchmarkResult], fmt: str, output_path: Path):
    """Export results to CSV or JSON."""
    if fmt == 'csv':
        with open(output_path, 'w', newline='') as f:
            if results:
                writer = csv.DictWriter(f, fieldnames=results[0].to_dict().keys())
                writer.writeheader()
                for result in results:
                    writer.writerow(result.to_dict())
        print(f"\n✓ Results exported to {output_path}")
    elif fmt == 'json':
        with open(output_path, 'w') as f:
            json.dump([r.to_dict() for r in results], f, indent=2)
        print(f"\n✓ Results exported to {output_path}")
def run_benchmark(args):
    """Main benchmark function."""
    client = ollama.Client(host=args.host)
    if RICH_AVAILABLE:
        console.print(f"[bold green]Connecting to Ollama at {args.host}...[/bold green]")
    else:
        print(f"Connecting to Ollama at {args.host}...")

    try:
        models = client.list().get('models', [])
    except Exception as e:
        print(f"\n[ERROR] Could not connect to Ollama server at {args.host}")
        print(f"Details: {e}")
        print("\nMake sure Ollama is running: ollama serve")
        return

    if not models:
        print("\n[INFO] No models found. Install models with: ollama pull <model_name>")
        return

    def entry_name(entry) -> str:
        """Model name across ollama-python versions (attribute- or dict-based)."""
        if isinstance(entry, dict):
            return entry.get('name') or entry.get('model', 'unknown')
        return getattr(entry, 'model', None) or getattr(entry, 'name', str(entry))

    # Filter models if specified
    if args.models:
        model_names = set(args.models)
        models = [m for m in models if entry_name(m) in model_names]
        if not models:
            print(f"[ERROR] None of the specified models found: {args.models}")
            return

    print(f"\nFound {len(models)} model(s) to benchmark")
    print(f"Prompt: '{args.prompt}'")
    print(f"Runs per model: {args.runs}" + (" (with warmup)" if args.warmup else ""))
    print("-" * 80 + "\n")

    benchmark_results: List[BenchmarkResult] = []
    for model_entry in models:
        model_name = entry_name(model_entry)
        if isinstance(model_entry, dict):
            model_size = model_entry.get('size', 0)
        else:
            model_size = getattr(model_entry, 'size', 0)

        if RICH_AVAILABLE:
            console.print(f"\n[bold cyan]Benchmarking: {model_name}[/bold cyan]")
        else:
            print(f"\nBenchmarking: {model_name}")

        details = get_model_details(client, model_name, model_size)

        # Run benchmark
        bench_stats = run_multiple_benchmarks(
            client, model_name, args.prompt,
            runs=args.runs, warmup=args.warmup
        )

        # Create result
        result = BenchmarkResult(
            model_name=model_name,
            ttft_mean=bench_stats['ttft_mean'],
            ttft_std=bench_stats['ttft_std'],
            tpot_mean=bench_stats['tpot_mean'],
            tpot_std=bench_stats['tpot_std'],
            tokens_per_sec=bench_stats['tokens_per_sec'],
            param_count_billion=details['param_count_billion'],
            file_size_gb=details['file_size_gb'],
            vram_estimate_gb=details['vram_estimate_gb'],
            output_tokens=bench_stats['output_tokens'],
            runs=args.runs,
            load_time_ms=bench_stats['load_time_ms']
        )
        benchmark_results.append(result)

    # Display results
    print("\n")
    print_results_table(benchmark_results, sort_by=args.sort)

    # Print summary stats
    print("\n📊 Summary Statistics:")
    print("  • TTFT = Time to First Token (lower is better)")
    print("  • TPOT = Time Per Output Token (lower is better)")
    print("  • Tok/sec = Tokens per second (higher is better)")
    # round() avoids the float truncation that made int() report "19%"
    # for an overhead factor of 1.2
    print(f"  • VRAM estimate includes {round((VRAM_OVERHEAD_FACTOR - 1) * 100)}% overhead for KV cache")

    # Export if requested
    if args.output:
        export_results(benchmark_results, args.format, Path(args.output))
def main():
    parser = argparse.ArgumentParser(
        description="Comprehensive Ollama LLM Benchmark Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                                       # Benchmark all models
  %(prog)s --models llama3.2:3b mistral:7b      # Benchmark specific models
  %(prog)s --runs 5 --warmup                    # 5 runs with warmup
  %(prog)s --output results.json --format json  # Export to JSON
  %(prog)s --sort speed                         # Sort by generation speed
"""
    )
    parser.add_argument(
        '--host',
        default=OLLAMA_HOST,
        help=f'Ollama server URL (default: {OLLAMA_HOST})'
    )
    parser.add_argument(
        '--prompt',
        default=DEFAULT_PROMPT,
        help=f'Benchmark prompt (default: "{DEFAULT_PROMPT}")'
    )
    parser.add_argument(
        '--runs',
        type=int,
        default=3,
        help='Number of benchmark runs per model (default: 3)'
    )
    parser.add_argument(
        '--warmup',
        action='store_true',
        help='Run a warmup iteration before benchmarking'
    )
    parser.add_argument(
        '--models',
        nargs='+',
        help='Specific models to benchmark (default: all installed)'
    )
    parser.add_argument(
        '--sort',
        choices=['ttft', 'tpot', 'speed', 'size'],
        default='speed',
        help='Sort results by metric (default: speed)'
    )
    parser.add_argument(
        '--output',
        help='Output file path for results'
    )
    parser.add_argument(
        '--format',
        choices=['csv', 'json'],
        default='json',
        help='Output format (default: json)'
    )
    args = parser.parse_args()
    if ALWAYS_WARMUP:
        # Warmup is forced on every run; the --warmup flag is kept only for
        # symmetry with the option table at the top of the file.
        args.warmup = True
    run_benchmark(args)


if __name__ == "__main__":
    main()
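The exported JSON is a list of BenchmarkResult dictionaries, so it is easy to post-process. A minimal sketch, assuming a prior run with --output results.json (field names come from the dataclass above):

#!/usr/bin/env python3
# Rank an exported results.json by sustained generation speed.
import json
from pathlib import Path

results = json.loads(Path("results.json").read_text())
for r in sorted(results, key=lambda row: row["tokens_per_sec"], reverse=True):
    print(f'{r["model_name"]:<50} {r["tokens_per_sec"]:>7.1f} tok/s '
          f'({r["vram_estimate_gb"]:.2f} GB VRAM est.)')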
(net) BriansMacStudio sftp % python burrito.py
Connecting to Ollama at http://localhost:11434...

Found 11 model(s) to benchmark
Prompt: 'What is a burrito? Give a one-sentence answer.'
Runs per model: 3 (with warmup)
--------------------------------------------------------------------------------


Benchmarking: gpt-oss:20b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: phi3:mini
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: tinyllama:1.1b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: llama3.2:3b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: llama3.3:70b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: qwen2.5:72b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: nollama/mythomax-l2-13b:Q4_K_M
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: interstellarninja/hermes-2-pro-llama-3-8b:latest
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: llama3.1:8b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: deepseek-r1:8b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: qwen3:8b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...
🚀 Ollama LLM Benchmark Results
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━┓
┃ Model                                            ┃      TTFT (s) ┃       TPOT (s) ┃ Tok/sec ┃ Params (B) ┃ Size (GB) ┃ VRAM (GB) ┃ Runs ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━┩
│ tinyllama:1.1b                                   │  0.052 ±0.008 │ 0.0038 ±0.0001 │   266.2 │        1.1 │      0.59 │      0.71 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.2:3b                                      │  0.116 ±0.006 │ 0.0061 ±0.0001 │   162.7 │        3.0 │      1.88 │      2.26 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ phi3:mini                                        │  0.048 ±0.003 │ 0.0070 ±0.0001 │   143.2 │        N/A │      2.03 │      2.43 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ interstellarninja/hermes-2-pro-llama-3-8b:latest │  0.165 ±0.088 │ 0.0102 ±0.0001 │    97.9 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.1:8b                                      │  0.435 ±0.544 │ 0.0105 ±0.0001 │    95.5 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ nollama/mythomax-l2-13b:Q4_K_M                   │  0.066 ±0.012 │ 0.0174 ±0.0021 │    57.4 │       13.0 │      7.33 │      8.79 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ gpt-oss:20b                                      │ -1.000 ±0.000 │ 0.0329 ±0.0004 │    30.4 │       20.0 │     12.85 │     15.42 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen3:8b                                         │ -1.000 ±0.000 │ 0.0330 ±0.0001 │    30.3 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ deepseek-r1:8b                                   │ -1.000 ±0.000 │ 0.0334 ±0.0003 │    30.0 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.3:70b                                     │  0.173 ±0.005 │ 0.0719 ±0.0002 │    13.9 │       70.0 │     39.60 │     47.52 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen2.5:72b                                      │  0.186 ±0.003 │ 0.0776 ±0.0001 │    12.9 │       72.0 │     44.16 │     52.99 │    3 │
└──────────────────────────────────────────────────┴───────────────┴────────────────┴─────────┴────────────┴───────────┴───────────┴──────┘

📊 Summary Statistics:
  • TTFT = Time to First Token (lower is better)
  • TPOT = Time Per Output Token (lower is better)
  • Tok/sec = Tokens per second (higher is better)
  • VRAM estimate includes 19% overhead for KV cache
(net) BriansMacStudio sftp %
+ system_profiler SPHardwareDataType SPDisplaysDataType SPMemoryDataType SPStorageDataType
Hardware:

    Hardware Overview:

      Model Name: Mac Studio
      Model Identifier: Mac14,14
      Model Number: Z180000E6LL/A
      Chip: Apple M2 Ultra
      Total Number of Cores: 24 (16 performance and 8 efficiency)
      Memory: 192 GB
      System Firmware Version: 10151.121.1
      OS Loader Version: 10151.121.1
      Serial Number (system): QK45L9J7DL
      Hardware UUID: 112DF8F6-1437-51F5-BA05-1FD6202A91EB
      Provisioning UDID: 00006022-000A11AA0202201E
      Activation Lock Status: Enabled

Graphics/Displays:

    Apple M2 Ultra:

      Chipset Model: Apple M2 Ultra
      Type: GPU
      Bus: Built-In
      Total Number of Cores: 76
      Vendor: Apple (0x106b)
      Metal Support: Metal 3
      Displays:
        HDMI2K:
          Resolution: 1920 x 1080 (1080p FHD - Full High Definition)
          UI Looks like: 1920 x 1080 @ 60.00Hz
          Main Display: Yes
          Mirror: Off
          Online: Yes
          Rotation: Supported

Memory:

      Memory: 192 GB
      Type: LPDDR5
      Manufacturer: Hynix

Storage:

    Data:

      Free: 5.75 TB (5,748,488,724,480 bytes)
      Capacity: 8 TB (7,998,551,654,400 bytes)
      Mount Point: /System/Volumes/Data
      File System: APFS
      Writable: Yes
      Ignore Ownership: No
      BSD Name: disk3s5
      Volume UUID: AA8B5B1F-34C4-4FCB-AB95-306048694283
      Physical Drive:
          Device Name: APPLE SSD AP8192Z
          Media Name: AppleAPFSMedia
          Medium Type: SSD
          Protocol: Apple Fabric
          Internal: Yes
          Partition Map Type: Unknown
          S.M.A.R.T. Status: Verified

    Macintosh HD:

      Free: 5.75 TB (5,748,488,724,480 bytes)
      Capacity: 8 TB (7,998,551,654,400 bytes)
      Mount Point: /
      File System: APFS
      Writable: No
      Ignore Ownership: No
      BSD Name: disk3s1s1
      Volume UUID: E28F0643-33B3-4F17-974B-EF0D1061EC56
      Physical Drive:
          Device Name: APPLE SSD AP8192Z
          Media Name: AppleAPFSMedia
          Medium Type: SSD
          Protocol: Apple Fabric
          Internal: Yes
          Partition Map Type: Unknown
          S.M.A.R.T. Status: Verified

    Docker:

      Free: 178.7 MB (178,708,480 bytes)
      Capacity: 2.28 GB (2,282,708,992 bytes)
      Mount Point: /private/var/folders/r4/xjmr3wrx7tq_6rlf4hltw1bc0000gn/T/DockerDesktop-207573627911685
      File System: HFS+
      Writable: No
      Ignore Ownership: Yes
      BSD Name: disk4s2
      Volume UUID: 275A1A9D-EE6D-3A66-B97A-4F45F8357D0F
      Physical Drive:
          Device Name: Disk Image
          Media Name: Apple UDIF read-only compressed (lzma) Media
          Protocol: Disk Image
          Internal: No
          Partition Map Type: GPT (GUID Partition Table)

+ vm_stat
Mach Virtual Memory Statistics: (page size of 16384 bytes)
Pages free: 3836436.
Pages active: 4177765.
Pages inactive: 990159.
Pages speculative: 3186264.
Pages throttled: 0.
Pages wired down: 174294.
Pages purgeable: 2269.
"Translation faults": 106134237803.
Pages copy-on-write: 57688512.
Pages zero filled: 98188086464.
Pages reactivated: 2098499.
Pages purged: 2441458.
File-backed pages: 5886992.
Anonymous pages: 2467196.
Pages stored in compressor: 252994.
Pages occupied by compressor: 99879.
Decompressions: 2983984.
Compressions: 6461143.
Pageins: 20860802.
Pageouts: 13419.
Swapins: 143.
Swapouts: 1412.