burrito

Warm response times per model (model already loaded, fastest first):

codestral:22b          0.20 seconds
qwen3-coder:30b        0.21 seconds
gemma2:27b             0.29 seconds
command-r-plus:104b    0.56 seconds
llama3.1:70b           0.60 seconds
gpt-oss:120b           1.35 seconds
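These are the later pings from the transcripts below, once each model is already resident. Loading and unloading go through Ollama's /api/generate endpoint: a request with no prompt loads the model, keep_alive=-1 pins it indefinitely, and keep_alive=0 evicts it. A minimal sketch of that call, assuming the same studio host the script below uses:

# Minimal sketch of the residency call the script wraps.
# Assumes an Ollama server at http://studio:11434.
import requests

def set_residency(model: str, keep_alive: int) -> None:
    # POST /api/generate with no prompt only loads or unloads the model:
    # keep_alive=-1 keeps it resident indefinitely, keep_alive=0 evicts it.
    r = requests.post(
        "http://studio:11434/api/generate",
        json={"model": model, "keep_alive": keep_alive},
    )
    r.raise_for_status()

set_residency("gpt-oss:120b", -1)  # load and pin
set_residency("gpt-oss:120b", 0)   # unload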
$ ~/bin/ollama-load.py
Loading gpt-oss:120b
8.94 seconds
What is a burrito? In three words.
Stuffed tortilla wrap.
4.17 seconds
What is a burrito? In three words.
Stuffed tortilla wrap.
1.32 seconds
What is a burrito? In three words.
Tortilla‑filled wrap.
1.35 seconds
$ ~/bin/ollama-load.py 0
Unloading gpt-oss:120b
0.11 seconds
---
$ ~/bin/ollama-load.py llama3.1:70b
Loading llama3.1:70b
4.19 seconds
What is a burrito? In three words.
Large Mexican wrap.
12.39 seconds
What is a burrito? In three words.
Tortilla wrapped meal.
0.60 seconds
What is a burrito? In three words.
Tortilla wrapped meal.
0.60 seconds
$ ~/bin/ollama-load.py llama3.1:70b 0
Unloading llama3.1:70b
0.03 seconds
---
$ ~/bin/ollama-load.py gemma2:27b
Loading gemma2:27b
0.98 seconds
What is a burrito? In three words.
Flour tortilla wrap.
22.61 seconds
What is a burrito? In three words.
Flour tortilla wrap.
0.29 seconds
What is a burrito? In three words.
Mexican stuffed tortilla.
0.29 seconds
$ ~/bin/ollama-load.py gemma2:27b 0
Unloading gemma2:27b
0.18 seconds
---
$ ~/bin/ollama-load.py qwen3-coder:30b
Loading qwen3-coder:30b
5.37 seconds
What is a burrito? In three words.
Mexican food wrap.
1.71 seconds
What is a burrito? In three words.
Mexican food wrap.
0.19 seconds
What is a burrito? In three words.
Mexican food wrap.
0.21 seconds
$ ~/bin/ollama-load.py qwen3-coder:30b 0
Unloading qwen3-coder:30b
0.21 seconds
---
$ ~/bin/ollama-load.py codestral:22b
Loading codestral:22b
1.22 seconds
What is a burrito? In three words.
Mexican Food Delight: A Wrap of Joy!
18.48 seconds
What is a burrito? In three words.
Mexican wrap, filled delight.
0.20 seconds
What is a burrito? In three words.
Mexican, filled, wrap.
0.20 seconds
$ ~/bin/ollama-load.py codestral:22b 0
Unloading codestral:22b
0.03 seconds
---
$ ~/bin/ollama-load.py command-r-plus:104b
Loading command-r-plus:104b
3.62 seconds
What is a burrito? In three words.
A Mexican food wrap.
89.33 seconds
What is a burrito? In three words.
A Mexican food.
0.56 seconds
What is a burrito? In three words.
A filled tortilla.
0.55 seconds
$ ~/bin/ollama-load.py command-r-plus:104b 0
Unloading command-r-plus:104b
0.13 seconds
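To verify what is actually resident between runs, Ollama also exposes GET /api/ps, which lists the loaded models with their memory footprint and keep-alive expiry. A quick check, again assuming the studio host:

# List resident models via GET /api/ps (assumes the same studio host).
import requests

resp = requests.get("http://studio:11434/api/ps")
resp.raise_for_status()
for m in resp.json().get("models", []):
    # size_vram is the portion of the model held in GPU memory;
    # expires_at reflects the keep_alive setting.
    print(m["name"], m.get("size_vram"), m.get("expires_at"))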
#!/usr/bin/env python3
from datetime import datetime
import json
import sys
import re
import requests

# --- Configuration (Defaults) ---
OLLAMA_HOST = "http://studio:11434"
DEFAULT_MODEL = "gpt-oss:120b"
DEFAULT_KEEP_ALIVE = -1
PROMPT = "What is a burrito? In three words."

# Regex to check if a string is an integer (positive or negative)
NUMERIC_REGEX = re.compile(r"^[+-]?[0-9]+$")

n_pings = 3

# --- Argument Parsing ---
def parse_args():
    """Parses arguments to determine model and keep_alive values."""
    # Get raw arguments (excluding the script name)
    args = sys.argv[1:]
    model = DEFAULT_MODEL
    keep_alive = DEFAULT_KEEP_ALIVE

    if not args:
        # No arguments: use all defaults
        pass
    elif len(args) == 1:
        arg1 = args[0]
        if NUMERIC_REGEX.match(arg1):
            # Case 1: the sole argument is a number, so it's the keep_alive;
            # the model remains DEFAULT_MODEL
            keep_alive = int(arg1)
        else:
            # Case 2: the sole argument is a model name;
            # keep_alive remains DEFAULT_KEEP_ALIVE
            model = arg1
    elif len(args) >= 2:
        # Case 3: both model and keep_alive are provided
        model = args[0]
        try:
            keep_alive = int(args[1])
        except ValueError:
            # The Ollama API also accepts duration strings such as "10m",
            # but this script sticks to integers for simplicity.
            sys.stderr.write(
                f"Warning: Argument 2 ('{args[1]}') is not a valid integer "
                f"for keep_alive. Using default ({DEFAULT_KEEP_ALIVE}).\n"
            )

    return model, keep_alive

# --- API Functions ---
def handle_model_action(host, model, keep_alive):
    """Sends the initial model action request (load, keep-alive, or unload)."""
    url = f"{host}/api/generate"
    # A generate request with no prompt only loads the model
    # (or unloads it, when keep_alive is 0)
    model_action_payload = {
        "model": model,
        "keep_alive": keep_alive,
    }
    action_type = "Loading" if keep_alive != 0 else "Unloading"
    print(f"{action_type} {model}")

    try:
        response = requests.post(url, json=model_action_payload, stream=False)
        response.raise_for_status()
        response.json()  # single JSON object; nothing to stream
    except requests.exceptions.RequestException as e:
        sys.stderr.write(f"Error during model action {action_type} for {model}: {e}\n")
        sys.exit(1)

def generate_response(host, model, prompt):
    """Sends the prompt and collects the streaming response, printing only the clean result."""
    full_response_text = ""
    try:
        url = f"{host}/api/generate"
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": True,
        }
        print(prompt)
        response = requests.post(url, json=payload, stream=True)
        response.raise_for_status()

        # Process the stream, accumulating non-empty tokens
        for line in response.iter_lines(decode_unicode=True):
            if line:
                try:
                    data = json.loads(line)
                    token = data.get("response", "")
                    if token:
                        full_response_text += token
                except json.JSONDecodeError:
                    sys.stderr.write(f"Warning: Could not decode JSON line: {line}\n")
    except requests.exceptions.RequestException as e:
        sys.stderr.write(f"Error generating response: {e}\n")
        sys.exit(1)

    return full_response_text.strip()

# --- Main Execution ---
if __name__ == "__main__":
    model_name, keep_alive_value = parse_args()

    # 1. Execute the initial model action
    start = datetime.now()
    handle_model_action(OLLAMA_HOST, model_name, keep_alive_value)
    print(f"{(datetime.now() - start).total_seconds():.2f} seconds")

    # 2. Ping the model unless this was an unload, matching the shell
    #    version's `if [ "$keep_alive" -ne 0 ]`
    if keep_alive_value != 0:
        for _ in range(n_pings):
            start = datetime.now()
            final_output = generate_response(OLLAMA_HOST, model_name, PROMPT)
            print(final_output)
            print(f"{(datetime.now() - start).total_seconds():.2f} seconds")
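One note on the integer-only argument parsing: the Ollama API itself also accepts Go-style duration strings such as "10m" for keep_alive, which this script deliberately rejects. If that were wanted, the raw argument could be passed straight through, as in this sketch:

# Sketch: pass a duration string as keep_alive instead of an integer.
import requests

requests.post(
    "http://studio:11434/api/generate",
    json={"model": "gpt-oss:120b", "keep_alive": "10m"},  # resident for 10 minutes
).raise_for_status()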