- NVIDIA GPU (>=24 GB VRAM recommended)
- Linux OS (WSL also works)
- Installed: `pixi` from prefix.dev
```bash
mkdir nemotron-vllm && cd nemotron-vllm
pixi init
pixi add python=3.10
pixi add --pypi vllm transformers accelerate pillow opencv-python numpy requests openai gradio
```

Install a Torch build that matches your CUDA toolkit. Example for CUDA 12.1:
```bash
pixi run pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
```
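To verify the wheel matches your driver before going further, a quick check (run it with `pixi run python`; the exact version strings will vary with your install):

```python
import torch

# Torch version, the CUDA version it was built against, and GPU visibility.
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())  # should print True
```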
Edit your `pixi.toml` to include:

```toml
[project]
name = "nemotron-vllm"
version = "0.1.0"
channels = ["conda-forge"]
platforms = ["linux-64"]

[dependencies]
python = "3.10.*"

[pypi-dependencies]
vllm = "*"
transformers = ">=4.44"
accelerate = "*"
pillow = "*"
opencv-python = "*"
numpy = "*"
requests = "*"
openai = "*"
gradio = "*"

[tasks]
serve = """
vllm serve nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16 \
  --trust-remote-code \
  --dtype bfloat16 \
  --max-model-len 32768 \
  --gpu-memory-utilization 0.90
"""
app = "python app.py"
```

Run the model server:
```bash
pixi run serve
```

Test text-only inference:
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Authorization: Bearer EMPTY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
    "messages": [{"role": "user", "content": "Say hello."}],
    "temperature": 0
  }'
```
Create `app.py`:

```python
import base64
from io import BytesIO

import gradio as gr
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
MODEL = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"


def img_to_data_url(img):
    """Encode a PIL image as a base64 JPEG data URL."""
    buf = BytesIO()
    img.convert("RGB").save(buf, format="JPEG")  # JPEG cannot store alpha
    b64 = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/jpeg;base64,{b64}"


def infer(prompt, images, thinking, temperature, max_tokens):
    # Chat Completions multimodal format uses "text" and "image_url" parts.
    content = []
    if prompt:
        content.append({"type": "text", "text": prompt})
    for im in images or []:
        content.append({"type": "image_url", "image_url": {"url": img_to_data_url(im)}})
    sys_prompt = "/no_think" if thinking == "No think" else "/think"
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": content},
    ]
    resp = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content


with gr.Blocks(title="Nemotron VL (vLLM)") as demo:
    gr.Markdown("# Nemotron-Nano-12B-v2-VL (vLLM)\nUpload up to 4 images and ask a question.")
    prompt = gr.Textbox(label="Prompt")
    gallery = gr.Gallery(label="Images", columns=[4], height=180)
    img_uploader = gr.Image(type="pil", label="Add image")
    add_btn = gr.Button("Add to gallery")
    clear_btn = gr.Button("Clear gallery")
    thinking = gr.Radio(["No think", "Think"], value="No think", label="Reasoning style")
    temperature = gr.Slider(0, 1, value=0, step=0.1, label="Temperature")
    max_tokens = gr.Slider(64, 2048, value=512, step=64, label="Max tokens")
    output = gr.Textbox(label="Output")
    state_images = gr.State([])

    def add_image(current, new):
        imgs = list(current or [])
        if new is not None:
            imgs.append(new)
        return imgs, imgs

    def clear_images():
        return [], []

    add_btn.click(add_image, [state_images, img_uploader], [state_images, gallery])
    clear_btn.click(clear_images, None, [state_images, gallery])
    gr.Button("Run").click(infer, [prompt, state_images, thinking, temperature, max_tokens], output)

demo.launch(server_name="0.0.0.0", server_port=7860)
```

Launch the interface:
```bash
pixi run app
```

Visit http://localhost:7860 to upload images, enter prompts, and generate answers.
To ask about a video, extract a few frames first. For example, with ffmpeg (one frame every two seconds, at most four frames):

```bash
ffmpeg -i video.mp4 -vf "fps=1/2" -frames:v 4 frame_%02d.jpg
```

Then upload the frames via the web interface.
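If `ffmpeg` is not installed, the `opencv-python` package added earlier can extract frames instead; a minimal sketch mirroring the command above (the filenames, two-second interval, and four-frame cap are assumptions copied from it):

```python
import cv2

cap = cv2.VideoCapture("video.mp4")
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if FPS metadata is missing
step = int(fps * 2)                      # one frame every two seconds
saved = idx = 0
while saved < 4:
    ok, frame = cap.read()
    if not ok:  # end of video
        break
    if idx % step == 0:
        saved += 1
        cv2.imwrite(f"frame_{saved:02d}.jpg", frame)
    idx += 1
cap.release()
```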
Troubleshooting:

- CUDA OOM: Reduce `--max-model-len` or the image resolution.
- Torch mismatch: Reinstall Torch for your CUDA version.
- API errors: Ensure `messages[].content` uses the correct multimodal format (see the sketch after this list).
- First run slow: The model warms up; later calls are faster.
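For reference, a well-formed multimodal user message for the Chat Completions endpoint looks like this (the data URL placeholder stands in for the output of `img_to_data_url` in `app.py`):

```python
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
    ],
}
```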
Quick checklist:

- Run `pixi run serve` → server starts ✅
- Test with `curl` → returns text ✅
- Open `localhost:7860` → image + text works ✅
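If you want to automate these checks, a small script will do (a sketch that assumes vLLM's `/health` endpoint and the default ports used above):

```python
import requests

BASE = "http://localhost:8000"

# 1. Server is up.
assert requests.get(f"{BASE}/health", timeout=5).status_code == 200

# 2. Text-only inference returns content.
r = requests.post(
    f"{BASE}/v1/chat/completions",
    headers={"Authorization": "Bearer EMPTY"},
    json={
        "model": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
        "messages": [{"role": "user", "content": "Say hello."}],
        "temperature": 0,
    },
    timeout=120,
)
assert r.ok and r.json()["choices"][0]["message"]["content"]

# 3. Gradio UI responds.
assert requests.get("http://localhost:7860", timeout=5).ok
print("All checks passed ✅")
```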