Burrito - Ollama LLM Benchmark
Note: Click the "Raw" button for a wider view.
Sorted by Tok/sec (descending).

🚀 Ollama LLM Benchmark Results
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━┓
┃ Model                                            ┃      TTFT (s) ┃       TPOT (s) ┃ Tok/sec ┃ Params (B) ┃ Size (GB) ┃ VRAM (GB) ┃ Runs ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━┩
│ tinyllama:1.1b                                   │  0.052 ±0.008 │ 0.0038 ±0.0001 │   266.2 │        1.1 │      0.59 │      0.71 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.2:3b                                      │  0.116 ±0.006 │ 0.0061 ±0.0001 │   162.7 │        3.0 │      1.88 │      2.26 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ phi3:mini                                        │  0.048 ±0.003 │ 0.0070 ±0.0001 │   143.2 │        N/A │      2.03 │      2.43 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ interstellarninja/hermes-2-pro-llama-3-8b:latest │  0.165 ±0.088 │ 0.0102 ±0.0001 │    97.9 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.1:8b                                      │  0.435 ±0.544 │ 0.0105 ±0.0001 │    95.5 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ nollama/mythomax-l2-13b:Q4_K_M                   │  0.066 ±0.012 │ 0.0174 ±0.0021 │    57.4 │       13.0 │      7.33 │      8.79 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ gpt-oss:20b                                      │ -1.000 ±0.000 │ 0.0329 ±0.0004 │    30.4 │       20.0 │     12.85 │     15.42 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen3:8b                                         │ -1.000 ±0.000 │ 0.0330 ±0.0001 │    30.3 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ deepseek-r1:8b                                   │ -1.000 ±0.000 │ 0.0334 ±0.0003 │    30.0 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.3:70b                                     │  0.173 ±0.005 │ 0.0719 ±0.0002 │    13.9 │       70.0 │     39.60 │     47.52 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen2.5:72b                                      │  0.186 ±0.003 │ 0.0776 ±0.0001 │    12.9 │       72.0 │     44.16 │     52.99 │    3 │
└──────────────────────────────────────────────────┴───────────────┴────────────────┴─────────┴────────────┴───────────┴───────────┴──────┘
📊 Columns:
  • TTFT (s)  : Time to First Token - latency before generation starts (responsiveness)
  • TPOT (s)  : Time Per Output Token - average time per token during generation
  • Tok/sec   : Tokens per second - sustained generation speed (inverse of TPOT)
  • Params (B): Model size in billions of parameters
  • Size (GB) : Disk space used by the model file
  • VRAM (GB) : Estimated memory usage (file size × 1.2, i.e. 20% headroom for KV cache; see the worked example below)
  • Runs      : Number of test iterations averaged
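The derived columns follow directly from the measured ones. A minimal sketch in plain Python, using the tinyllama:1.1b row above (the 1.2 factor matches VRAM_OVERHEAD_FACTOR in the script below):

# Recompute the derived columns for the tinyllama:1.1b row.
tpot = 0.0038                # mean seconds per output token (TPOT)
size_gb = 0.59               # model file size on disk

tok_per_sec = 1.0 / tpot     # 263.2 -- the table shows 266.2 because it
                             # divides by the unrounded TPOT mean
vram_gb = size_gb * 1.2      # 0.71 GB: file size plus 20% KV-cache headroom
print(f"{tok_per_sec:.1f} tok/s, {vram_gb:.2f} GB VRAM")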
#!/usr/bin/env python3
DEFAULT_PROMPT = "What is a burrito? Give a one-sentence answer."
ALWAYS_WARMUP = True  # forces a warmup run regardless of the --warmup flag

# CLI options (flag | default | description):
# --host   | OLLAMA_HOST    | Ollama server URL
# --prompt | DEFAULT_PROMPT | Benchmark prompt
# --runs   | 3              | Number of benchmark runs per model
# --warmup | true           | Run a warmup iteration before benchmarking
# --models | all installed  | Specific models to benchmark
# --sort   | speed          | ttft, tpot, size - Sort results by metric
# --output | none           | Output file path for results
# --format | json           | csv - Output format
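# Illustrative invocations (the script name matches the transcript below;
# flags are defined in main()):
#   python burrito.py                                 # benchmark every installed model
#   python burrito.py --models llama3.2:3b --runs 5   # five runs of a single model
#   python burrito.py --output results.json           # also export results as JSON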
import ollama
import re
import time
import json
import csv
import argparse
import statistics
from pathlib import Path
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass, asdict

# Optional: rich library for better output (falls back to basic if not available)
try:
    from rich.console import Console
    from rich.table import Table
    RICH_AVAILABLE = True
    console = Console()
except ImportError:
    RICH_AVAILABLE = False
    console = None

# --- Configuration ---
OLLAMA_HOST = 'http://localhost:11434'
VRAM_OVERHEAD_FACTOR = 1.2  # 20% headroom over file size for KV cache etc.
@dataclass
class BenchmarkResult:
    """Structured benchmark result data."""
    model_name: str
    ttft_mean: float
    ttft_std: float
    tpot_mean: float
    tpot_std: float
    tokens_per_sec: float
    param_count_billion: float
    file_size_gb: float
    vram_estimate_gb: float
    output_tokens: int
    runs: int
    load_time_ms: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for export."""
        return asdict(self)


# --- Helper Functions ---
def bytes_to_gb(b: int) -> float:
    """Converts bytes to gigabytes."""
    return b / (1024 ** 3)
def extract_param_count(model_name: str, details: Dict[str, Any]) -> float:
    """
    Extracts the parameter count from the model name or details,
    trying several strategies in order.
    """
    # Strategy 1: Parse from model name (e.g., "llama3.2:3b", "mistral:7b-instruct")
    model_lower = model_name.lower()
    # Look for patterns like "7b", "3.2b", "70b", etc.
    patterns = [
        r'(\d+\.?\d*)b',   # Matches 7b, 3.2b, 70b
        r':(\d+\.?\d*)b',  # Matches :7b
    ]
    for pattern in patterns:
        match = re.search(pattern, model_lower)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                pass

    # Strategy 2: Check the modelfile for parameter info (placeholder --
    # modelfiles rarely state a count, so nothing is parsed here yet)
    modelfile = (
        getattr(details, 'modelfile', '') if hasattr(details, 'modelfile')
        else (
            details.get('modelfile', '') if isinstance(details, dict)
            else ''
        )
    )
    if 'parameters' in modelfile.lower():
        pass

    # Strategy 3: Estimate from file size (very rough)
    if hasattr(details, 'size'):
        file_size_gb = bytes_to_gb(details.size)
    elif isinstance(details, dict):
        file_size_gb = bytes_to_gb(details.get('size', 0))
    else:
        file_size_gb = 0.0
    if file_size_gb > 0:
        # Rough fallback estimate: Q4 quantization uses ~0.5-0.6 GB
        # per billion parameters
        estimated = file_size_gb / 0.55
        if estimated < 100:  # Sanity check
            return round(estimated, 1)
    return 0.0
def get_model_details(client: ollama.Client, model_name: str, file_size_bytes: int = 0) -> Dict[str, Any]:
    """Fetches detailed information about a model using /api/show."""
    try:
        details = client.show(model_name)
        param_count = extract_param_count(model_name, details)
        vram_estimate_gb = bytes_to_gb(file_size_bytes) * VRAM_OVERHEAD_FACTOR
        return {
            "file_size_gb": bytes_to_gb(file_size_bytes),
            "param_count_billion": param_count,
            "vram_estimate_gb": vram_estimate_gb,
            "details": details
        }
    except Exception as e:
        print(f"Warning: Error fetching details for {model_name}: {e}")
        return {
            "file_size_gb": 0.0,
            "param_count_billion": 0.0,
            "vram_estimate_gb": 0.0,
            "details": {}
        }
def measure_ttft(client: ollama.Client, model_name: str, prompt: str) -> Tuple[float, float, int, float]:
    """
    Measures Time-to-First-Token (TTFT), TPOT, and load time.
    Returns: (ttft, tpot, eval_count, load_time_ms)
    """
    try:
        start_time = time.time()
        ttft = -1.0          # sentinel: stays negative if no visible token arrives
        total_duration = 0.0
        load_duration_ns = 0
        eval_count = 0
        response_stream = client.generate(
            model=model_name,
            prompt=prompt,
            stream=True,
            options={
                "num_predict": 50,   # Limit output for consistency
                "temperature": 0.0,  # Deterministic for benchmarking
            }
        )
        for chunk in response_stream:
            # Reasoning models (e.g. qwen3, deepseek-r1, gpt-oss) can spend
            # the whole token budget on hidden "thinking" output, so
            # 'response' never fills and TTFT stays at the -1.0 sentinel --
            # which is what the -1.000 entries in the results table show.
            # Newer ollama-python releases also expose a 'thinking' field;
            # checking it as well would give those models a real TTFT.
            if ttft < 0 and chunk.get('response'):
                ttft = time.time() - start_time
            if chunk.get('done'):
                total_duration_ns = chunk.get('total_duration', 0)
                load_duration_ns = chunk.get('load_duration', 0)
                total_duration = total_duration_ns / 1_000_000_000
                eval_count = chunk.get('eval_count', 0)
                break
        # total_duration also covers load and prompt evaluation, so TPOT is a
        # slight overestimate; clamp the sentinel so a missed TTFT does not
        # inflate it further.
        tpot = (total_duration - max(ttft, 0.0)) / eval_count if eval_count > 0 else 0.0
        load_time_ms = load_duration_ns / 1_000_000
        return ttft, tpot, eval_count, load_time_ms
    except Exception as e:
        print(f"Error benchmarking {model_name}: {e}")
        return float('inf'), float('inf'), 0, 0.0
def run_multiple_benchmarks(client: ollama.Client, model_name: str, prompt: str,
                            runs: int, warmup: bool = True) -> Dict[str, Any]:
    """
    Run multiple benchmark iterations and calculate statistics.
    """
    if warmup:
        if RICH_AVAILABLE:
            console.print("  [dim]Running warmup...[/dim]")
        else:
            print("  Running warmup...")
        measure_ttft(client, model_name, prompt)
        time.sleep(1)

    ttfts = []
    tpots = []
    eval_counts = []
    load_times = []
    for i in range(runs):
        if RICH_AVAILABLE:
            console.print(f"  [dim]Run {i+1}/{runs}...[/dim]")
        else:
            print(f"  Run {i+1}/{runs}...")
        ttft, tpot, eval_count, load_time = measure_ttft(client, model_name, prompt)
        if ttft != float('inf'):
            ttfts.append(ttft)
            tpots.append(tpot)
            eval_counts.append(eval_count)
            load_times.append(load_time)
        time.sleep(0.5)  # Brief pause between runs

    if not ttfts:
        return {
            "ttft_mean": float('inf'),
            "ttft_std": 0.0,
            "tpot_mean": float('inf'),
            "tpot_std": 0.0,
            "tokens_per_sec": 0.0,
            "output_tokens": 0,
            "load_time_ms": 0.0
        }

    ttft_mean = statistics.mean(ttfts)
    tpot_mean = statistics.mean(tpots)
    tokens_per_sec = 1.0 / tpot_mean if tpot_mean > 0 else 0.0
    return {
        "ttft_mean": ttft_mean,
        "ttft_std": statistics.stdev(ttfts) if len(ttfts) > 1 else 0.0,
        "tpot_mean": tpot_mean,
        "tpot_std": statistics.stdev(tpots) if len(tpots) > 1 else 0.0,
        "tokens_per_sec": tokens_per_sec,
        "output_tokens": int(statistics.mean(eval_counts)),
        "load_time_ms": statistics.mean(load_times)
    }
def print_results_table(results: List[BenchmarkResult], sort_by: str = 'ttft'):
    """Print results in a formatted table."""
    if not results:
        print("No results to display.")
        return

    # Sort results
    sort_key_map = {
        'ttft': lambda x: x.ttft_mean,
        'tpot': lambda x: x.tpot_mean,
        'speed': lambda x: -x.tokens_per_sec,  # Negative for descending
        'size': lambda x: x.file_size_gb
    }
    results = sorted(results, key=sort_key_map.get(sort_by, sort_key_map['ttft']))

    if RICH_AVAILABLE:
        table = Table(title="🚀 Ollama LLM Benchmark Results", show_lines=True)
        table.add_column("Model", style="cyan", no_wrap=True)
        table.add_column("TTFT (s)", justify="right", style="green")
        table.add_column("TPOT (s)", justify="right", style="yellow")
        table.add_column("Tok/sec", justify="right", style="magenta")
        table.add_column("Params (B)", justify="right")
        table.add_column("Size (GB)", justify="right")
        table.add_column("VRAM (GB)", justify="right", style="red")
        table.add_column("Runs", justify="right", style="dim")
        for result in results:
            ttft_str = f"{result.ttft_mean:.3f} ±{result.ttft_std:.3f}" if result.ttft_mean != float('inf') else "Error"
            tpot_str = f"{result.tpot_mean:.4f} ±{result.tpot_std:.4f}" if result.tpot_mean != float('inf') else "Error"
            table.add_row(
                result.model_name,
                ttft_str,
                tpot_str,
                f"{result.tokens_per_sec:.1f}",
                f"{result.param_count_billion:.1f}" if result.param_count_billion > 0 else "N/A",
                f"{result.file_size_gb:.2f}",
                f"{result.vram_estimate_gb:.2f}",
                str(result.runs)
            )
        console.print(table)
    else:
        # Fallback to basic table
        print("\n" + "=" * 120)
        print("OLLAMA LLM BENCHMARK RESULTS")
        print("=" * 120)
        header = "{:<30} | {:>12} | {:>12} | {:>10} | {:>10} | {:>10} | {:>10} | {:>5}".format(
            "Model", "TTFT (s)", "TPOT (s)", "Tok/sec", "Params (B)", "Size (GB)", "VRAM (GB)", "Runs"
        )
        print(header)
        print("-" * 120)
        for result in results:
            ttft_str = f"{result.ttft_mean:.3f}±{result.ttft_std:.2f}" if result.ttft_mean != float('inf') else "Error"
            tpot_str = f"{result.tpot_mean:.4f}±{result.tpot_std:.3f}" if result.tpot_mean != float('inf') else "Error"
            row = "{:<30} | {:>12} | {:>12} | {:>10.1f} | {:>10} | {:>10.2f} | {:>10.2f} | {:>5}".format(
                result.model_name[:30],
                ttft_str,
                tpot_str,
                result.tokens_per_sec,
                f"{result.param_count_billion:.1f}" if result.param_count_billion > 0 else "N/A",
                result.file_size_gb,
                result.vram_estimate_gb,
                result.runs
            )
            print(row)
        print("=" * 120)
def export_results(results: List[BenchmarkResult], fmt: str, output_path: Path):
    """Export results to CSV or JSON."""
    if fmt == 'csv':
        with open(output_path, 'w', newline='') as f:
            if results:
                writer = csv.DictWriter(f, fieldnames=results[0].to_dict().keys())
                writer.writeheader()
                for result in results:
                    writer.writerow(result.to_dict())
        print(f"\n✓ Results exported to {output_path}")
    elif fmt == 'json':
        with open(output_path, 'w') as f:
            json.dump([r.to_dict() for r in results], f, indent=2)
        print(f"\n✓ Results exported to {output_path}")
def run_benchmark(args):
    """Main benchmark function."""
    client = ollama.Client(host=args.host)
    if RICH_AVAILABLE:
        console.print(f"[bold green]Connecting to Ollama at {args.host}...[/bold green]")
    else:
        print(f"Connecting to Ollama at {args.host}...")

    try:
        models = client.list().get('models', [])
    except Exception as e:
        print(f"\n[ERROR] Could not connect to Ollama server at {args.host}")
        print(f"Details: {e}")
        print("\nMake sure Ollama is running: ollama serve")
        return

    if not models:
        print("\n[INFO] No models found. Install models with: ollama pull <model_name>")
        return

    def entry_name(entry) -> str:
        """Model name across ollama-python versions (attribute- or dict-based)."""
        if isinstance(entry, dict):
            return entry.get('name') or entry.get('model', 'unknown')
        return getattr(entry, 'model', None) or getattr(entry, 'name', str(entry))

    # Filter models if specified
    if args.models:
        model_names = set(args.models)
        models = [m for m in models if entry_name(m) in model_names]
        if not models:
            print(f"[ERROR] None of the specified models found: {args.models}")
            return

    print(f"\nFound {len(models)} model(s) to benchmark")
    print(f"Prompt: '{args.prompt}'")
    print(f"Runs per model: {args.runs}" + (" (with warmup)" if args.warmup else ""))
    print("-" * 80 + "\n")

    benchmark_results: List[BenchmarkResult] = []
    for model_entry in models:
        model_name = entry_name(model_entry)
        if isinstance(model_entry, dict):
            model_size = model_entry.get('size', 0)
        else:
            model_size = getattr(model_entry, 'size', 0)

        if RICH_AVAILABLE:
            console.print(f"\n[bold cyan]Benchmarking: {model_name}[/bold cyan]")
        else:
            print(f"\nBenchmarking: {model_name}")

        details = get_model_details(client, model_name, model_size)

        # Run benchmark
        bench_stats = run_multiple_benchmarks(
            client, model_name, args.prompt,
            runs=args.runs, warmup=args.warmup
        )

        # Create result
        result = BenchmarkResult(
            model_name=model_name,
            ttft_mean=bench_stats['ttft_mean'],
            ttft_std=bench_stats['ttft_std'],
            tpot_mean=bench_stats['tpot_mean'],
            tpot_std=bench_stats['tpot_std'],
            tokens_per_sec=bench_stats['tokens_per_sec'],
            param_count_billion=details['param_count_billion'],
            file_size_gb=details['file_size_gb'],
            vram_estimate_gb=details['vram_estimate_gb'],
            output_tokens=bench_stats['output_tokens'],
            runs=args.runs,
            load_time_ms=bench_stats['load_time_ms']
        )
        benchmark_results.append(result)

    # Display results
    print("\n")
    print_results_table(benchmark_results, sort_by=args.sort)

    # Print summary stats
    print("\n📊 Summary Statistics:")
    print("  • TTFT = Time to First Token (lower is better)")
    print("  • TPOT = Time Per Output Token (lower is better)")
    print("  • Tok/sec = Tokens per second (higher is better)")
    # round() avoids the float truncation that made int() report "19%"
    # for an overhead factor of 1.2
    print(f"  • VRAM estimate includes {round((VRAM_OVERHEAD_FACTOR - 1) * 100)}% overhead for KV cache")

    # Export if requested
    if args.output:
        export_results(benchmark_results, args.format, Path(args.output))
def main():
    parser = argparse.ArgumentParser(
        description="Comprehensive Ollama LLM Benchmark Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                                       # Benchmark all models
  %(prog)s --models llama3.2:3b mistral:7b      # Benchmark specific models
  %(prog)s --runs 5 --warmup                    # 5 runs with warmup
  %(prog)s --output results.json --format json  # Export to JSON
  %(prog)s --sort speed                         # Sort by generation speed
"""
    )
    parser.add_argument(
        '--host',
        default=OLLAMA_HOST,
        help=f'Ollama server URL (default: {OLLAMA_HOST})'
    )
    parser.add_argument(
        '--prompt',
        default=DEFAULT_PROMPT,
        help=f'Benchmark prompt (default: "{DEFAULT_PROMPT}")'
    )
    parser.add_argument(
        '--runs',
        type=int,
        default=3,
        help='Number of benchmark runs per model (default: 3)'
    )
    parser.add_argument(
        '--warmup',
        action='store_true',
        help='Run a warmup iteration before benchmarking'
    )
    parser.add_argument(
        '--models',
        nargs='+',
        help='Specific models to benchmark (default: all installed)'
    )
    parser.add_argument(
        '--sort',
        choices=['ttft', 'tpot', 'speed', 'size'],
        default='speed',
        help='Sort results by metric (default: speed)'
    )
    parser.add_argument(
        '--output',
        help='Output file path for results'
    )
    parser.add_argument(
        '--format',
        choices=['csv', 'json'],
        default='json',
        help='Output format (default: json)'
    )
    args = parser.parse_args()
    if ALWAYS_WARMUP:
        # Warmup is forced on every run; the --warmup flag is kept only for
        # symmetry with the option table at the top of the file.
        args.warmup = True
    run_benchmark(args)


if __name__ == "__main__":
    main()
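The exported JSON is a list of BenchmarkResult dictionaries, so it is easy to post-process. A minimal sketch, assuming a prior run with --output results.json (field names come from the dataclass above):

#!/usr/bin/env python3
# Rank an exported results.json by sustained generation speed.
import json
from pathlib import Path

results = json.loads(Path("results.json").read_text())
for r in sorted(results, key=lambda row: row["tokens_per_sec"], reverse=True):
    print(f'{r["model_name"]:<50} {r["tokens_per_sec"]:>7.1f} tok/s '
          f'({r["vram_estimate_gb"]:.2f} GB VRAM est.)')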
(net) BriansMacStudio sftp % python burrito.py
Connecting to Ollama at http://localhost:11434...

Found 11 model(s) to benchmark
Prompt: 'What is a burrito? Give a one-sentence answer.'
Runs per model: 3 (with warmup)
--------------------------------------------------------------------------------


Benchmarking: gpt-oss:20b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: phi3:mini
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: tinyllama:1.1b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: llama3.2:3b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: llama3.3:70b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: qwen2.5:72b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: nollama/mythomax-l2-13b:Q4_K_M
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: interstellarninja/hermes-2-pro-llama-3-8b:latest
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: llama3.1:8b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: deepseek-r1:8b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...

Benchmarking: qwen3:8b
  Running warmup...
  Run 1/3...
  Run 2/3...
  Run 3/3...
🚀 Ollama LLM Benchmark Results
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━┓
┃ Model                                            ┃      TTFT (s) ┃       TPOT (s) ┃ Tok/sec ┃ Params (B) ┃ Size (GB) ┃ VRAM (GB) ┃ Runs ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━┩
│ tinyllama:1.1b                                   │  0.052 ±0.008 │ 0.0038 ±0.0001 │   266.2 │        1.1 │      0.59 │      0.71 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.2:3b                                      │  0.116 ±0.006 │ 0.0061 ±0.0001 │   162.7 │        3.0 │      1.88 │      2.26 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ phi3:mini                                        │  0.048 ±0.003 │ 0.0070 ±0.0001 │   143.2 │        N/A │      2.03 │      2.43 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ interstellarninja/hermes-2-pro-llama-3-8b:latest │  0.165 ±0.088 │ 0.0102 ±0.0001 │    97.9 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.1:8b                                      │  0.435 ±0.544 │ 0.0105 ±0.0001 │    95.5 │        8.0 │      4.58 │      5.50 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ nollama/mythomax-l2-13b:Q4_K_M                   │  0.066 ±0.012 │ 0.0174 ±0.0021 │    57.4 │       13.0 │      7.33 │      8.79 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ gpt-oss:20b                                      │ -1.000 ±0.000 │ 0.0329 ±0.0004 │    30.4 │       20.0 │     12.85 │     15.42 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen3:8b                                         │ -1.000 ±0.000 │ 0.0330 ±0.0001 │    30.3 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ deepseek-r1:8b                                   │ -1.000 ±0.000 │ 0.0334 ±0.0003 │    30.0 │        8.0 │      4.87 │      5.84 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ llama3.3:70b                                     │  0.173 ±0.005 │ 0.0719 ±0.0002 │    13.9 │       70.0 │     39.60 │     47.52 │    3 │
├──────────────────────────────────────────────────┼───────────────┼────────────────┼─────────┼────────────┼───────────┼───────────┼──────┤
│ qwen2.5:72b                                      │  0.186 ±0.003 │ 0.0776 ±0.0001 │    12.9 │       72.0 │     44.16 │     52.99 │    3 │
└──────────────────────────────────────────────────┴───────────────┴────────────────┴─────────┴────────────┴───────────┴───────────┴──────┘

📊 Summary Statistics:
  • TTFT = Time to First Token (lower is better)
  • TPOT = Time Per Output Token (lower is better)
  • Tok/sec = Tokens per second (higher is better)
  • VRAM estimate includes 19% overhead for KV cache
(net) BriansMacStudio sftp %
+ system_profiler SPHardwareDataType SPDisplaysDataType SPMemoryDataType SPStorageDataType
Hardware:

    Hardware Overview:

      Model Name: Mac Studio
      Model Identifier: Mac14,14
      Model Number: Z180000E6LL/A
      Chip: Apple M2 Ultra
      Total Number of Cores: 24 (16 performance and 8 efficiency)
      Memory: 192 GB
      System Firmware Version: 10151.121.1
      OS Loader Version: 10151.121.1
      Serial Number (system): QK45L9J7DL
      Hardware UUID: 112DF8F6-1437-51F5-BA05-1FD6202A91EB
      Provisioning UDID: 00006022-000A11AA0202201E
      Activation Lock Status: Enabled

Graphics/Displays:

    Apple M2 Ultra:

      Chipset Model: Apple M2 Ultra
      Type: GPU
      Bus: Built-In
      Total Number of Cores: 76
      Vendor: Apple (0x106b)
      Metal Support: Metal 3
      Displays:
        HDMI2K:
          Resolution: 1920 x 1080 (1080p FHD - Full High Definition)
          UI Looks like: 1920 x 1080 @ 60.00Hz
          Main Display: Yes
          Mirror: Off
          Online: Yes
          Rotation: Supported

Memory:

      Memory: 192 GB
      Type: LPDDR5
      Manufacturer: Hynix

Storage:

    Data:

      Free: 5.75 TB (5,748,488,724,480 bytes)
      Capacity: 8 TB (7,998,551,654,400 bytes)
      Mount Point: /System/Volumes/Data
      File System: APFS
      Writable: Yes
      Ignore Ownership: No
      BSD Name: disk3s5
      Volume UUID: AA8B5B1F-34C4-4FCB-AB95-306048694283
      Physical Drive:
          Device Name: APPLE SSD AP8192Z
          Media Name: AppleAPFSMedia
          Medium Type: SSD
          Protocol: Apple Fabric
          Internal: Yes
          Partition Map Type: Unknown
          S.M.A.R.T. Status: Verified

    Macintosh HD:

      Free: 5.75 TB (5,748,488,724,480 bytes)
      Capacity: 8 TB (7,998,551,654,400 bytes)
      Mount Point: /
      File System: APFS
      Writable: No
      Ignore Ownership: No
      BSD Name: disk3s1s1
      Volume UUID: E28F0643-33B3-4F17-974B-EF0D1061EC56
      Physical Drive:
          Device Name: APPLE SSD AP8192Z
          Media Name: AppleAPFSMedia
          Medium Type: SSD
          Protocol: Apple Fabric
          Internal: Yes
          Partition Map Type: Unknown
          S.M.A.R.T. Status: Verified

    Docker:

      Free: 178.7 MB (178,708,480 bytes)
      Capacity: 2.28 GB (2,282,708,992 bytes)
      Mount Point: /private/var/folders/r4/xjmr3wrx7tq_6rlf4hltw1bc0000gn/T/DockerDesktop-207573627911685
      File System: HFS+
      Writable: No
      Ignore Ownership: Yes
      BSD Name: disk4s2
      Volume UUID: 275A1A9D-EE6D-3A66-B97A-4F45F8357D0F
      Physical Drive:
          Device Name: Disk Image
          Media Name: Apple UDIF read-only compressed (lzma) Media
          Protocol: Disk Image
          Internal: No
          Partition Map Type: GPT (GUID Partition Table)

+ vm_stat
Mach Virtual Memory Statistics: (page size of 16384 bytes)
Pages free: 3836436.
Pages active: 4177765.
Pages inactive: 990159.
Pages speculative: 3186264.
Pages throttled: 0.
Pages wired down: 174294.
Pages purgeable: 2269.
"Translation faults": 106134237803.
Pages copy-on-write: 57688512.
Pages zero filled: 98188086464.
Pages reactivated: 2098499.
Pages purged: 2441458.
File-backed pages: 5886992.
Anonymous pages: 2467196.
Pages stored in compressor: 252994.
Pages occupied by compressor: 99879.
Decompressions: 2983984.
Compressions: 6461143.
Pageins: 20860802.
Pageouts: 13419.
Swapins: 143.
Swapouts: 1412.