@pguso
Created September 22, 2025 16:26
LLMs on Modal: From Idle GPUs to Pay-Per-Token
from typing import Optional

import modal

app = modal.App("llms-llama-cpp")

MODEL = "Qwen3-Coder-30B-A3B-Instruct-Q2_K.gguf"
GPU_CONFIG = "A10"
LLAMA_CPP_RELEASE = "b4568"
MINUTES = 60

# Fallback prompt used when the caller does not pass one.
DEFAULT_PROMPT = "Write a Python function to implement quicksort:"

cuda_version = "12.4.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    .apt_install("git", "build-essential", "cmake", "curl", "libcurl4-openssl-dev")
    .run_commands("git clone https://github.com/ggerganov/llama.cpp")
    .run_commands(
        "cmake llama.cpp -B llama.cpp/build "
        "-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON "
    )
    .run_commands(  # this one takes a few minutes!
        "cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli"
    )
    .run_commands("cp llama.cpp/build/bin/llama-* llama.cpp")
    .entrypoint([])  # remove NVIDIA base container entrypoint
)
# ## Storing models on Modal
# To make the model weights available on Modal,
# we download them from Hugging Face.
# Modal is serverless, so disks are by default ephemeral.
# To make sure our weights don't disappear between runs,
# which would trigger a long download, we store them in a
# Modal [Volume](https://modal.com/docs/guide/volumes).
# For more on how to use Modal Volumes to store model weights,
# see [this guide](https://modal.com/docs/guide/model-weights).
model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)
cache_dir = "/root/.cache/llama.cpp"
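# To check what is already cached, you can list the Volume's contents from your
# terminal with the Modal CLI, e.g. `modal volume ls llamacpp-cache`.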
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface_hub[hf_transfer]==0.26.2")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=1 * MINUTES,
)
def download_model(repo_id, allow_patterns, revision: Optional[str] = None):
    from huggingface_hub import snapshot_download

    print(f"🦙 downloading model from {repo_id} if not present")
    snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=cache_dir,
        allow_patterns=allow_patterns,
    )
    model_cache.commit()  # ensure other Modal Functions can see our writes before we quit
    print("🦙 model loaded")
# ## Running llama.cpp as a Modal Function
# Now, let's put it all together.
# At the top of each inference function below, we add an `app.function` decorator
# to attach all of our infrastructure:
# - the `image` with the dependencies
# - the `volumes` with the model weights
# - the `gpu` we want, if any
# We also specify a `timeout` after which to cancel the run.
# Inside the function, we call the `llama.cpp` CLI with `subprocess`.
# The streaming variant needs a bit of extra ceremony because we want to both
# show the output as it is produced and return it to the local caller.
# Alternatively, you might set up an OpenAI-compatible server
# using base `llama.cpp` or its [Python wrapper library](https://github.com/abetlen/llama-cpp-python)
# along with one of [Modal's decorators for web hosting](https://modal.com/docs/guide/webhooks).
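# As a rough sketch of that alternative (assumptions: `llama-server` is added to the
# cmake `--target` list in the image build above, and the port, flags, and timeout
# below are illustrative rather than tuned):
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
)
@modal.web_server(8080, startup_timeout=10 * MINUTES)
def llama_cpp_server():
    import subprocess

    # llama-server exposes OpenAI-compatible routes such as /v1/chat/completions.
    subprocess.Popen(
        [
            "/llama.cpp/llama-server",
            "--model", f"{cache_dir}/{MODEL}",
            "--host", "0.0.0.0",
            "--port", "8080",
            "--n-gpu-layers", "64",
        ]
    )
# Back to the subprocess-based approach: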
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=1 * MINUTES,
)
def llama_cpp_stream(
    prompt: Optional[str] = None,
    model: str = MODEL,
    n_predict: int = -1,
):
    import subprocess

    model_entrypoint_file = model
    if prompt is None:
        prompt = DEFAULT_PROMPT
    args = ["--threads", "8"]
    n_gpu_layers = 64
    command = [
        "/llama.cpp/llama-cli",
        "--model", f"{cache_dir}/{model_entrypoint_file}",
        "--n-gpu-layers", str(n_gpu_layers),
        "--prompt", prompt,
        "--n-predict", str(n_predict),
    ] + args
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,  # line-buffered, so output can be streamed as it is produced
    )
    for line in process.stdout:
        yield line  # yield each line as it appears
    process.wait()
    if process.returncode != 0:
        stderr = process.stderr.read()
        raise RuntimeError(f"llama.cpp failed with exit code {process.returncode}: {stderr}")
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=30 * MINUTES,
    enable_memory_snapshot=True,
    experimental_options={"enable_gpu_snapshot": True},
)
def llama_cpp_inference(
    prompt: Optional[str] = None,
    n_predict: int = -1,
):
    import subprocess

    model_entrypoint_file = MODEL
    if prompt is None:
        prompt = DEFAULT_PROMPT  # see top of file
    args = ["--threads", "8"]
    # set layers to "off-load to", aka run on, GPU
    n_gpu_layers = 64
    command = [
        "/llama.cpp/llama-cli",
        "--model",
        f"{cache_dir}/{model_entrypoint_file}",
        "--n-gpu-layers",
        str(n_gpu_layers),
        "--prompt",
        prompt,
        "--n-predict",
        str(n_predict),
    ] + args
    print("🦙 running command:", command, sep="\n\t")
    # run the process to completion and capture its output
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,  # capture output as strings
    )
    print("STDOUT:", result.stdout)
    print("STDERR:", result.stderr)
    if result.returncode != 0:
        print("🦙 llama.cpp error:", result.stderr)
        raise RuntimeError(f"llama.cpp failed with exit code {result.returncode}")
    return result.stdout  # return the generated text to the caller
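# For ad-hoc runs with `modal run` (as opposed to the deployed-app client below),
# a local entrypoint along these lines ties the pieces together. The repo and file
# pattern mirror the client example at the end of this file and should be kept in
# sync with the MODEL constant above:
@app.local_entrypoint()
def main(prompt: str = DEFAULT_PROMPT, n_predict: int = 200):
    # make sure the weights are in the Volume, then run a single completion
    download_model.remote(
        repo_id="unsloth/gpt-oss-20b-GGUF",
        allow_patterns=["*Q6_K.gguf"],
    )
    print(llama_cpp_inference.remote(prompt, n_predict=n_predict))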
# Client-side usage: look up the Functions on the deployed "llms-llama-cpp" app by name.
# (The app above must already be deployed, e.g. via `modal deploy`, for these lookups to work.)
import modal

download_model = modal.Function.from_name("llms-llama-cpp", "download_model")
llama_cpp_stream = modal.Function.from_name("llms-llama-cpp", "llama_cpp_stream")
llama_cpp_inference = modal.Function.from_name("llms-llama-cpp", "llama_cpp_inference")

try:
    # First, ensure the model is downloaded.
    # Note: the repo and file pattern here should match the MODEL filename configured above.
    download_model.remote(
        repo_id="unsloth/gpt-oss-20b-GGUF",
        allow_patterns=["*Q6_K.gguf"],
    )
    print("✅ Download completed successfully")
except Exception as e:
    if "timeout" in str(e).lower():
        print("⚠️ Download timed out, but the model may still be cached")
        print("   Proceeding with inference...")
    else:
        print(f"❌ Download failed: {e}")
        raise
# Example 1: Streaming inference
print("🚀 Starting streaming inference...")
prompt = "Write a Python function to implement quicksort:"
for chunk in llama_cpp_stream.remote_gen(prompt, n_predict=200):
    print(chunk, end="", flush=True)
print("\n" + "=" * 50)

# Example 2: Batch inference
print("🚀 Starting batch inference...")
prompts = [
    "Explain machine learning in one paragraph:",
    "Write a haiku about programming:",
    "What are the benefits of serverless computing?",
]
# Run multiple prompts in parallel. Use the non-streaming Function here: it returns
# each completion as a single string, and its second argument is n_predict.
results = list(llama_cpp_inference.map(prompts, [100] * len(prompts)))
for i, result in enumerate(results):
    print(f"\nPrompt {i + 1} Result:")
    print(result[:200] + "..." if len(result) > 200 else result)