@pguso
Created September 22, 2025 16:26
LLMs on Modal: From Idle GPUs to Pay-Per-Token
from typing import Optional

import modal

app = modal.App("llms-llama-cpp")

MODEL = "Qwen3-Coder-30B-A3B-Instruct-Q2_K.gguf"
GPU_CONFIG = "A10"
LLAMA_CPP_RELEASE = "b4568"
MINUTES = 60

# Fallback prompt used when the caller does not pass one.
DEFAULT_PROMPT = "Write a Python function to implement quicksort:"

cuda_version = "12.4.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    .apt_install("git", "build-essential", "cmake", "curl", "libcurl4-openssl-dev")
    .run_commands("git clone https://github.com/ggerganov/llama.cpp")
    .run_commands(
        "cmake llama.cpp -B llama.cpp/build "
        "-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON "
    )
    .run_commands(  # this one takes a few minutes!
        "cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli"
    )
    .run_commands("cp llama.cpp/build/bin/llama-* llama.cpp")
    .entrypoint([])  # remove NVIDIA base container entrypoint
)
# ## Storing models on Modal
# To make the model weights available on Modal,
# we download them from Hugging Face.
# Modal is serverless, so disks are by default ephemeral.
# To make sure our weights don't disappear between runs,
# which would trigger a long download, we store them in a
# Modal [Volume](https://modal.com/docs/guide/volumes).
# For more on how to use Modal Volumes to store model weights,
# see [this guide](https://modal.com/docs/guide/model-weights).
model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)
cache_dir = "/root/.cache/llama.cpp"
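# To check what is already cached, you can list the Volume's contents from your
# terminal with the Modal CLI, e.g. `modal volume ls llamacpp-cache`.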
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface_hub[hf_transfer]==0.26.2")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=1 * MINUTES,
)
def download_model(repo_id, allow_patterns, revision: Optional[str] = None):
    from huggingface_hub import snapshot_download

    print(f"🦙 downloading model from {repo_id} if not present")
    snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=cache_dir,
        allow_patterns=allow_patterns,
    )
    model_cache.commit()  # ensure other Modal Functions can see our writes before we quit
    print("🦙 model loaded")
# ## Running llama.cpp as a Modal Function
# Now, let's put it all together.
# At the top of each inference function below, we add an `app.function` decorator
# to attach all of our infrastructure:
# - the `image` with the dependencies
# - the `volumes` with the model weights
# - the `gpu` we want, if any
# We also specify a `timeout` after which to cancel the run.
# Inside the function, we call the `llama.cpp` CLI with `subprocess`.
# The streaming variant needs a bit of extra ceremony because we want to both
# show the output as it is produced and return it to the local caller.
# Alternatively, you might set up an OpenAI-compatible server
# using base `llama.cpp` or its [Python wrapper library](https://github.com/abetlen/llama-cpp-python)
# along with one of [Modal's decorators for web hosting](https://modal.com/docs/guide/webhooks).
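# As a rough sketch of that alternative (assumptions: `llama-server` is added to the
# cmake `--target` list in the image build above, and the port, flags, and timeout
# below are illustrative rather than tuned):
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
)
@modal.web_server(8080, startup_timeout=10 * MINUTES)
def llama_cpp_server():
    import subprocess

    # llama-server exposes OpenAI-compatible routes such as /v1/chat/completions.
    subprocess.Popen(
        [
            "/llama.cpp/llama-server",
            "--model", f"{cache_dir}/{MODEL}",
            "--host", "0.0.0.0",
            "--port", "8080",
            "--n-gpu-layers", "64",
        ]
    )
# Back to the subprocess-based approach: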
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=1 * MINUTES,
)
def llama_cpp_stream(
    prompt: Optional[str] = None,
    model: str = MODEL,
    n_predict: int = -1,
):
    import subprocess

    model_entrypoint_file = model
    if prompt is None:
        prompt = DEFAULT_PROMPT
    args = ["--threads", "8"]
    n_gpu_layers = 64
    command = [
        "/llama.cpp/llama-cli",
        "--model", f"{cache_dir}/{model_entrypoint_file}",
        "--n-gpu-layers", str(n_gpu_layers),
        "--prompt", prompt,
        "--n-predict", str(n_predict),
    ] + args
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,  # line-buffered, so output can be streamed as it is produced
    )
    for line in process.stdout:
        yield line  # yield each line as it appears
    process.wait()
    if process.returncode != 0:
        stderr = process.stderr.read()
        raise RuntimeError(f"llama.cpp failed with exit code {process.returncode}: {stderr}")
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=30 * MINUTES,
    enable_memory_snapshot=True,
    experimental_options={"enable_gpu_snapshot": True},
)
def llama_cpp_inference(
    prompt: Optional[str] = None,
    n_predict: int = -1,
):
    import subprocess

    model_entrypoint_file = MODEL
    if prompt is None:
        prompt = DEFAULT_PROMPT  # see top of file
    args = ["--threads", "8"]
    # set layers to "off-load to", aka run on, GPU
    n_gpu_layers = 64
    command = [
        "/llama.cpp/llama-cli",
        "--model",
        f"{cache_dir}/{model_entrypoint_file}",
        "--n-gpu-layers",
        str(n_gpu_layers),
        "--prompt",
        prompt,
        "--n-predict",
        str(n_predict),
    ] + args
    print("🦙 running command:", command, sep="\n\t")
    # run the process to completion and capture its output
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,  # capture output as strings
    )
    print("STDOUT:", result.stdout)
    print("STDERR:", result.stderr)
    if result.returncode != 0:
        print("🦙 llama.cpp error:", result.stderr)
        raise RuntimeError(f"llama.cpp failed with exit code {result.returncode}")
    return result.stdout  # return the generated text to the caller
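# For ad-hoc runs with `modal run` (as opposed to the deployed-app client below),
# a local entrypoint along these lines ties the pieces together. The repo and file
# pattern mirror the client example at the end of this file and should be kept in
# sync with the MODEL constant above:
@app.local_entrypoint()
def main(prompt: str = DEFAULT_PROMPT, n_predict: int = 200):
    # make sure the weights are in the Volume, then run a single completion
    download_model.remote(
        repo_id="unsloth/gpt-oss-20b-GGUF",
        allow_patterns=["*Q6_K.gguf"],
    )
    print(llama_cpp_inference.remote(prompt, n_predict=n_predict))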
# Client-side usage: look up the Functions on the deployed "llms-llama-cpp" app by name.
# (The app above must already be deployed, e.g. via `modal deploy`, for these lookups to work.)
import modal

download_model = modal.Function.from_name("llms-llama-cpp", "download_model")
llama_cpp_stream = modal.Function.from_name("llms-llama-cpp", "llama_cpp_stream")
llama_cpp_inference = modal.Function.from_name("llms-llama-cpp", "llama_cpp_inference")

try:
    # First, ensure the model is downloaded.
    # Note: the repo and file pattern here should match the MODEL filename configured above.
    download_model.remote(
        repo_id="unsloth/gpt-oss-20b-GGUF",
        allow_patterns=["*Q6_K.gguf"],
    )
    print("✅ Download completed successfully")
except Exception as e:
    if "timeout" in str(e).lower():
        print("⚠️ Download timed out, but the model may still be cached")
        print("   Proceeding with inference...")
    else:
        print(f"❌ Download failed: {e}")
        raise
# Example 1: Streaming inference
print("🚀 Starting streaming inference...")
prompt = "Write a Python function to implement quicksort:"
for chunk in llama_cpp_stream.remote_gen(prompt, n_predict=200):
    print(chunk, end="", flush=True)
print("\n" + "=" * 50)

# Example 2: Batch inference
print("🚀 Starting batch inference...")
prompts = [
    "Explain machine learning in one paragraph:",
    "Write a haiku about programming:",
    "What are the benefits of serverless computing?",
]
# Run multiple prompts in parallel. Use the non-streaming Function here: it returns
# each completion as a single string, and its second argument is n_predict.
results = list(llama_cpp_inference.map(prompts, [100] * len(prompts)))
for i, result in enumerate(results):
    print(f"\nPrompt {i + 1} Result:")
    print(result[:200] + "..." if len(result) > 200 else result)