burrito

Warm response times per model (model already loaded, fastest first):

codestral:22b          0.20 seconds
qwen3-coder:30b        0.21 seconds
gemma2:27b             0.29 seconds
command-r-plus:104b    0.56 seconds
llama3.1:70b           0.60 seconds
gpt-oss:120b           1.35 seconds
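These are the later pings from the transcripts below, once each model is already resident. Loading and unloading go through Ollama's /api/generate endpoint: a request with no prompt loads the model, keep_alive=-1 pins it indefinitely, and keep_alive=0 evicts it. A minimal sketch of that call, assuming the same studio host the script below uses:

# Minimal sketch of the residency call the script wraps.
# Assumes an Ollama server at http://studio:11434.
import requests

def set_residency(model: str, keep_alive: int) -> None:
    # POST /api/generate with no prompt only loads or unloads the model:
    # keep_alive=-1 keeps it resident indefinitely, keep_alive=0 evicts it.
    r = requests.post(
        "http://studio:11434/api/generate",
        json={"model": model, "keep_alive": keep_alive},
    )
    r.raise_for_status()

set_residency("gpt-oss:120b", -1)  # load and pin
set_residency("gpt-oss:120b", 0)   # unload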
$ ~/bin/ollama-load.py
Loading gpt-oss:120b
8.94 seconds
What is a burrito? In three words.
Stuffed tortilla wrap.
4.17 seconds
What is a burrito? In three words.
Stuffed tortilla wrap.
1.32 seconds
What is a burrito? In three words.
Tortilla‑filled wrap.
1.35 seconds
$ ~/bin/ollama-load.py 0
Unloading gpt-oss:120b
0.11 seconds
---
$ ~/bin/ollama-load.py llama3.1:70b
Loading llama3.1:70b
4.19 seconds
What is a burrito? In three words.
Large Mexican wrap.
12.39 seconds
What is a burrito? In three words.
Tortilla wrapped meal.
0.60 seconds
What is a burrito? In three words.
Tortilla wrapped meal.
0.60 seconds
$ ~/bin/ollama-load.py llama3.1:70b 0
Unloading llama3.1:70b
0.03 seconds
---
$ ~/bin/ollama-load.py gemma2:27b
Loading gemma2:27b
0.98 seconds
What is a burrito? In three words.
Flour tortilla wrap.
22.61 seconds
What is a burrito? In three words.
Flour tortilla wrap.
0.29 seconds
What is a burrito? In three words.
Mexican stuffed tortilla.
0.29 seconds
$ ~/bin/ollama-load.py gemma2:27b 0
Unloading gemma2:27b
0.18 seconds
---
$ ~/bin/ollama-load.py qwen3-coder:30b
Loading qwen3-coder:30b
5.37 seconds
What is a burrito? In three words.
Mexican food wrap.
1.71 seconds
What is a burrito? In three words.
Mexican food wrap.
0.19 seconds
What is a burrito? In three words.
Mexican food wrap.
0.21 seconds
$ ~/bin/ollama-load.py qwen3-coder:30b 0
Unloading qwen3-coder:30b
0.21 seconds
---
$ ~/bin/ollama-load.py codestral:22b
Loading codestral:22b
1.22 seconds
What is a burrito? In three words.
Mexican Food Delight: A Wrap of Joy!
18.48 seconds
What is a burrito? In three words.
Mexican wrap, filled delight.
0.20 seconds
What is a burrito? In three words.
Mexican, filled, wrap.
0.20 seconds
$ ~/bin/ollama-load.py codestral:22b 0
Unloading codestral:22b
0.03 seconds
---
$ ~/bin/ollama-load.py command-r-plus:104b
Loading command-r-plus:104b
3.62 seconds
What is a burrito? In three words.
A Mexican food wrap.
89.33 seconds
What is a burrito? In three words.
A Mexican food.
0.56 seconds
What is a burrito? In three words.
A filled tortilla.
0.55 seconds
$ ~/bin/ollama-load.py command-r-plus:104b 0
Unloading command-r-plus:104b
0.13 seconds
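To verify what is actually resident between runs, Ollama also exposes GET /api/ps, which lists the loaded models with their memory footprint and keep-alive expiry. A quick check, again assuming the studio host:

# List resident models via GET /api/ps (assumes the same studio host).
import requests

resp = requests.get("http://studio:11434/api/ps")
resp.raise_for_status()
for m in resp.json().get("models", []):
    # size_vram is the portion of the model held in GPU memory;
    # expires_at reflects the keep_alive setting.
    print(m["name"], m.get("size_vram"), m.get("expires_at"))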
#!/usr/bin/env python3
from datetime import datetime
import json
import sys
import re
import requests

# --- Configuration (Defaults) ---
OLLAMA_HOST = "http://studio:11434"
DEFAULT_MODEL = "gpt-oss:120b"
DEFAULT_KEEP_ALIVE = -1
PROMPT = "What is a burrito? In three words."

# Regex to check if a string is an integer (positive or negative)
NUMERIC_REGEX = re.compile(r"^[+-]?[0-9]+$")

n_pings = 3

# --- Argument Parsing ---
def parse_args():
    """Parses arguments to determine model and keep_alive values."""
    # Get raw arguments (excluding the script name)
    args = sys.argv[1:]
    model = DEFAULT_MODEL
    keep_alive = DEFAULT_KEEP_ALIVE

    if not args:
        # No arguments: use all defaults
        pass
    elif len(args) == 1:
        arg1 = args[0]
        if NUMERIC_REGEX.match(arg1):
            # Case 1: the sole argument is a number, so it's the keep_alive;
            # the model remains DEFAULT_MODEL
            keep_alive = int(arg1)
        else:
            # Case 2: the sole argument is a model name;
            # keep_alive remains DEFAULT_KEEP_ALIVE
            model = arg1
    elif len(args) >= 2:
        # Case 3: both model and keep_alive are provided
        model = args[0]
        try:
            keep_alive = int(args[1])
        except ValueError:
            # The Ollama API also accepts duration strings such as "10m",
            # but this script sticks to integers for simplicity.
            sys.stderr.write(
                f"Warning: Argument 2 ('{args[1]}') is not a valid integer "
                f"for keep_alive. Using default ({DEFAULT_KEEP_ALIVE}).\n"
            )

    return model, keep_alive

# --- API Functions ---
def handle_model_action(host, model, keep_alive):
    """Sends the initial model action request (load, keep-alive, or unload)."""
    url = f"{host}/api/generate"
    # A generate request with no prompt only loads the model
    # (or unloads it, when keep_alive is 0)
    model_action_payload = {
        "model": model,
        "keep_alive": keep_alive,
    }
    action_type = "Loading" if keep_alive != 0 else "Unloading"
    print(f"{action_type} {model}")

    try:
        response = requests.post(url, json=model_action_payload, stream=False)
        response.raise_for_status()
        response.json()  # single JSON object; nothing to stream
    except requests.exceptions.RequestException as e:
        sys.stderr.write(f"Error during model action {action_type} for {model}: {e}\n")
        sys.exit(1)

def generate_response(host, model, prompt):
    """Sends the prompt and collects the streaming response, printing only the clean result."""
    full_response_text = ""
    try:
        url = f"{host}/api/generate"
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": True,
        }
        print(prompt)
        response = requests.post(url, json=payload, stream=True)
        response.raise_for_status()

        # Process the stream, accumulating non-empty tokens
        for line in response.iter_lines(decode_unicode=True):
            if line:
                try:
                    data = json.loads(line)
                    token = data.get("response", "")
                    if token:
                        full_response_text += token
                except json.JSONDecodeError:
                    sys.stderr.write(f"Warning: Could not decode JSON line: {line}\n")
    except requests.exceptions.RequestException as e:
        sys.stderr.write(f"Error generating response: {e}\n")
        sys.exit(1)

    return full_response_text.strip()

# --- Main Execution ---
if __name__ == "__main__":
    model_name, keep_alive_value = parse_args()

    # 1. Execute the initial model action
    start = datetime.now()
    handle_model_action(OLLAMA_HOST, model_name, keep_alive_value)
    print(f"{(datetime.now() - start).total_seconds():.2f} seconds")

    # 2. Ping the model unless this was an unload, matching the shell
    #    version's `if [ "$keep_alive" -ne 0 ]`
    if keep_alive_value != 0:
        for _ in range(n_pings):
            start = datetime.now()
            final_output = generate_response(OLLAMA_HOST, model_name, PROMPT)
            print(final_output)
            print(f"{(datetime.now() - start).total_seconds():.2f} seconds")
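One note on the integer-only argument parsing: the Ollama API itself also accepts Go-style duration strings such as "10m" for keep_alive, which this script deliberately rejects. If that were wanted, the raw argument could be passed straight through, as in this sketch:

# Sketch: pass a duration string as keep_alive instead of an integer.
import requests

requests.post(
    "http://studio:11434/api/generate",
    json={"model": "gpt-oss:120b", "keep_alive": "10m"},  # resident for 10 minutes
).raise_for_status()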