LLMs on Modal: From Idle GPUs to Pay-Per-Token
from typing import Optional

import modal

app = modal.App("llms-llama-cpp")

MODEL = "Qwen3-Coder-30B-A3B-Instruct-Q2_K.gguf"
GPU_CONFIG = "A10"
LLAMA_CPP_RELEASE = "b4568"
MINUTES = 60

cuda_version = "12.4.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    .apt_install("git", "build-essential", "cmake", "curl", "libcurl4-openssl-dev")
| .run_commands("git clone https://github.com/ggerganov/llama.cpp") | |
    .run_commands(
        "cmake llama.cpp -B llama.cpp/build "
        "-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON "
    )
    .run_commands(  # this one takes a few minutes!
        "cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli"
    )
    .run_commands("cp llama.cpp/build/bin/llama-* llama.cpp")
    .entrypoint([])  # remove NVIDIA base container entrypoint
)
# ## Storing models on Modal
# To make the model weights available on Modal,
# we download them from Hugging Face.
# Modal is serverless, so disks are by default ephemeral.
# To make sure our weights don't disappear between runs,
# which would trigger a long download, we store them in a
# Modal [Volume](https://modal.com/docs/guide/volumes).
# For more on how to use Modal Volumes to store model weights,
# see [this guide](https://modal.com/docs/guide/model-weights).

model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)
cache_dir = "/root/.cache/llama.cpp"
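
# ## Peeking at the cache (sketch)
# A quick, GPU-free way to see which GGUF files are already in the Volume.
# This helper is illustrative rather than part of the inference pipeline:
# the `list_cached_models` name is arbitrary, and it assumes `Volume.listdir`
# plus invocation via `modal run`.
@app.local_entrypoint()
def list_cached_models():
    # print every file currently stored in the llama.cpp cache Volume
    for entry in model_cache.listdir("/"):
        print(entry.path)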
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface_hub[hf_transfer]==0.26.2")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=20 * MINUTES,  # multi-gigabyte GGUF downloads need more than a minute
)
def download_model(repo_id, allow_patterns, revision: Optional[str] = None):
    from huggingface_hub import snapshot_download

    print(f"🦙 downloading model from {repo_id} if not present")
    snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=cache_dir,
        allow_patterns=allow_patterns,
    )
    model_cache.commit()  # ensure other Modal Functions can see our writes before we quit
    print("🦙 model loaded")
# ## Running llama.cpp as a Modal Function
# Now, let's put it all together.
# At the top of each inference function,
# we add an `@app.function` decorator to attach all of our infrastructure:
# - the `image` with the dependencies
# - the `volumes` with the weights and a place to put outputs
# - the `gpu` we want, if any
# We also specify a `timeout` after which to cancel the run.
# Inside the functions, we call the `llama.cpp` CLI with `subprocess`:
# one function streams the output line by line as it is generated,
# and another captures the full output to return to the local caller.
# Alternatively, you might set up an OpenAI-compatible server
# using base `llama.cpp` or its [Python wrapper library](https://github.com/abetlen/llama-cpp-python)
# along with one of [Modal's decorators for web hosting](https://modal.com/docs/guide/webhooks);
# a sketch of that approach follows the streaming function below.
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=10 * MINUTES,  # model load plus generation can exceed a single minute
)
def llama_cpp_stream(
    prompt: Optional[str] = None,
    model: str = MODEL,
    n_predict: int = -1,
):
    import subprocess

    model_entrypoint_file = model
    if prompt is None:
        prompt = DEFAULT_PROMPT
    args = ["--threads", "8"]
    n_gpu_layers = 64
    command = [
        "/llama.cpp/llama-cli",
        "--model", f"{cache_dir}/{model_entrypoint_file}",
        "--n-gpu-layers", str(n_gpu_layers),
        "--prompt", prompt,
        "--n-predict", str(n_predict),
    ] + args
    import threading

    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )

    # drain stderr on a background thread so llama.cpp's verbose logging
    # cannot fill the pipe buffer and stall the process
    stderr_lines = []

    def _drain_stderr():
        for err_line in process.stderr:
            stderr_lines.append(err_line)

    threading.Thread(target=_drain_stderr, daemon=True).start()

    for line in process.stdout:
        yield line  # yield each line of generated text as it appears

    process.wait()
    if process.returncode != 0:
        raise RuntimeError(
            f"llama.cpp failed with exit code {process.returncode}: {''.join(stderr_lines)}"
        )
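
# ## Optional: an OpenAI-compatible endpoint (sketch)
# The alternative mentioned above is to serve llama.cpp behind one of Modal's
# web decorators instead of shelling out to `llama-cli` per call.
# The sketch below is illustrative only: the `llama_cpp_server` name and port
# are arbitrary, it assumes the image is extended to also build the
# `llama-server` target (add it to the `--target` list in the cmake build step
# above), and it assumes the default web server startup window is long enough
# for the model to load on your GPU.
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=30 * MINUTES,
)
@modal.web_server(8080)
def llama_cpp_server():
    import subprocess

    # llama-server exposes /v1/chat/completions and /v1/completions on this port
    subprocess.Popen(
        [
            "/llama.cpp/llama-server",
            "--model", f"{cache_dir}/{MODEL}",
            "--n-gpu-layers", "64",
            "--host", "0.0.0.0",
            "--port", "8080",
        ]
    )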
@app.function(
    image=image,
    volumes={cache_dir: model_cache},
    gpu=GPU_CONFIG,
    timeout=30 * MINUTES,
    # snapshot container (and, experimentally, GPU) memory to speed up cold starts
    enable_memory_snapshot=True,
    experimental_options={"enable_gpu_snapshot": True},
)
def llama_cpp_inference(
    prompt: Optional[str] = None,
    n_predict: int = -1,
):
    import subprocess

    model_entrypoint_file = MODEL
    if prompt is None:
        prompt = DEFAULT_PROMPT  # see end of file
    args = ["--threads", "8"]
    # set layers to "off-load to", aka run on, GPU
    n_gpu_layers = 64
    command = [
        "/llama.cpp/llama-cli",
        "--model",
        f"{cache_dir}/{model_entrypoint_file}",
        "--n-gpu-layers",
        str(n_gpu_layers),
        "--prompt",
        prompt,
        "--n-predict",
        str(n_predict),
    ] + args
| print("π¦ running command:", command, sep="\n\t") | |
| # run process and capture output | |
| result = subprocess.run( | |
| command, | |
| stdout=subprocess.PIPE, | |
| stderr=subprocess.PIPE, | |
| text=True # capture as string | |
| ) | |
| print("STDOUT:", result.stdout) | |
| print("STDERR:", result.stderr) | |
| if result.returncode != 0: | |
| print("π¦ llama.cpp error:", result.stderr) | |
| raise RuntimeError(f"llama.cpp failed with exit code {result.returncode}") | |
| return result.stdout # return the actual string output |
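
# Fallback prompt used when callers don't supply one.
# The exact wording is a placeholder; substitute whatever default suits your use case.
DEFAULT_PROMPT = (
    "Write a Python function that checks whether a string is a palindrome."
)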
Client usage (second file in the gist): deploy the app above, then call its Functions from a local Python script.
import modal

# The app defined above must be deployed first (`modal deploy` the app file)
# so these lookups can resolve the Functions by name.
download_model = modal.Function.from_name("llms-llama-cpp", "download_model")
llama_cpp_stream = modal.Function.from_name("llms-llama-cpp", "llama_cpp_stream")
llama_cpp_inference = modal.Function.from_name("llms-llama-cpp", "llama_cpp_inference")
try:
    # First, ensure the model is downloaded.
    # The repo and pattern should yield the GGUF file named by MODEL in the app;
    # this unsloth repo id is an assumption, so adjust it to wherever your quant lives.
    download_model.remote(
        repo_id="unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF",
        allow_patterns=["*Q2_K.gguf"],
    )
    print("✅ Download completed successfully")
except Exception as e:
    if "timeout" in str(e).lower():
        print("⚠️ Download timed out, but model may still be cached")
        print("   Proceeding with inference...")
    else:
        print(f"❌ Download failed: {e}")
        raise
# Example 1: Streaming inference
print("🚀 Starting streaming inference...")
prompt = "Write a Python function to implement quicksort:"

for chunk in llama_cpp_stream.remote_gen(prompt, n_predict=200):
    print(chunk, end="", flush=True)

print("\n" + "=" * 50)
# Example 2: Batch inference
print("🚀 Starting batch inference...")
prompts = [
    "Explain machine learning in one paragraph:",
    "Write a haiku about programming:",
    "What are the benefits of serverless computing?",
]

# Run multiple prompts in parallel; the non-streaming function returns each
# result as a plain string, so it maps cleanly over (prompt, n_predict) pairs.
results = list(llama_cpp_inference.map(prompts, [100] * len(prompts)))

for i, result in enumerate(results):
    print(f"\nPrompt {i + 1} Result:")
    print(result[:200] + "..." if len(result) > 200 else result)