I do all this SSH'd into the DGX Spark from another machine, so everything is terminal commands.
sudo apt install python3-dev
curl -LsSf https://astral.sh/uv/install.sh | sh
diff --git a/csrc/ops.h b/csrc/ops.h
index f8bdc61aa..933c64db0 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -218,6 +218,7 @@ bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability);
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
 bool cutlass_group_gemm_supported(int64_t cuda_device_capability);
+bool cutlass_moe_mm_supports_fp4(int64_t cuda_device_capability);
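For context (my addition, not part of the patch): the cuda_device_capability these predicates receive is, as far as I can tell, the compute capability packed as major*10+minor. You can check what your GPU reports with a quick PyTorch snippet; on the DGX Spark's GB10 I'd expect (12, 1), i.e. 121:

# Hedged sanity check (assumes vLLM packs capability as major*10+minor):
# print what this GPU reports; on a DGX Spark GB10 I'd expect 12 1 121.
import torch

major, minor = torch.cuda.get_device_capability()
print(major, minor, major * 10 + minor)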
# A crude copy of vLLM's normal Dockerfile that installs
# a released version on DGX Spark
ARG CUDA_VERSION=13.0.2
ARG PYTHON_VERSION=3.12
ARG VLLM_VERSION=0.11.2
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
import json

from openai import OpenAI


def hermes_grammar_from_tools(tools: list[dict]) -> str:
    # Build an alternation of one grammar rule name per tool, e.g.
    # "fun_get_weather | fun_get_time", from an OpenAI-style tools list.
    tool_funcs = ""
    for tool in tools:
        tool_funcs += " | " if tool_funcs else ""
        tool_funcs += f"fun_{tool['function']['name']}"
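To see what that loop builds, here's a standalone run with a hypothetical tools list (the tool names are mine, purely for illustration):

# Hypothetical tools list, only to show the alternation the loop produces.
tools = [
    {"type": "function", "function": {"name": "get_weather"}},
    {"type": "function", "function": {"name": "get_time"}},
]
tool_funcs = ""
for tool in tools:
    tool_funcs += " | " if tool_funcs else ""
    tool_funcs += f"fun_{tool['function']['name']}"
print(tool_funcs)  # fun_get_weather | fun_get_time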
# Dependencies:
# pip install openai pydantic-ai
# This example uses the web_search builtin tool, so it assumes you
# have a valid TAVILY_API_KEY environment variable set before starting
# your Llama Stack server.
# Usage:
#
# ollama run llama3.2:3b
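To make that setup concrete, a client for a local Llama Stack server might start out like the sketch below. The port and base path are my assumptions from Llama Stack's defaults (8321 and its OpenAI-compatible route), not something taken from the original script:

# Minimal sketch; base_url assumes Llama Stack's default port (8321)
# and its OpenAI-compatible path -- adjust for your deployment.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",
    api_key="none",  # a local Llama Stack server doesn't check the key
)
resp = client.chat.completions.create(
    model="llama3.2:3b",
    messages=[{"role": "user", "content": "What's the weather in Boston?"}],
)
print(resp.choices[0].message.content)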
@@grammar::Llama4

start
    =
    expression $
    ;

expression
    =
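The @@grammar:: directive is TatSu syntax, so a grammar in this form can be sanity-checked locally with the tatsu package (pip install tatsu). A hedged sketch using a toy grammar of my own, not the real Llama4 one, just to show the compile/parse workflow:

# Toy TatSu grammar (mine, for illustration only) demonstrating the
# compile/parse workflow you'd use on the real grammar file.
import tatsu

GRAMMAR = r"""
@@grammar::Demo

start = call $ ;
call = name '(' ')' ;
name = /\w+/ ;
"""

model = tatsu.compile(GRAMMAR)
print(model.parse("get_weather()"))  # prints the parsed AST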
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 8bc733fd..eaea63f8 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -161,45 +161,52 @@ def _convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
 async def _process_vllm_chat_completion_stream_response(
     stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
 ) -> AsyncGenerator:
-    event_type = ChatCompletionResponseEventType.start
-    tool_call_buf = UnparseableToolCall()
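The underlying problem here is general to OpenAI-style streaming: tool calls arrive as partial deltas spread across chunks, and they have to be buffered until the stream ends. A standalone sketch of that pattern (my own, not Llama Stack's actual implementation):

# Hedged sketch of the buffering pattern, not Llama Stack's real code:
# accumulate tool-call deltas by index until the stream is exhausted.
async def collect_tool_calls(stream):
    calls = {}  # tool-call index -> accumulated name/arguments
    async for chunk in stream:
        delta = chunk.choices[0].delta
        for tc in delta.tool_calls or []:
            buf = calls.setdefault(tc.index, {"name": "", "arguments": ""})
            if tc.function.name:
                buf["name"] += tc.function.name
            if tc.function.arguments:
                buf["arguments"] += tc.function.arguments
    return calls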
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index fbbbc1fb2..5d232f44a 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -52,6 +52,16 @@ ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
     name="get_weather",
     arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
 )
+PYTHON_TAGS_FUNCTION_OUTPUT="<|python_start|>[get_weather(city='San Francisco', metric='celsius')]<|python_end|>"
+PYTHON_TAGS_FUNCTION_CALL = FunctionCall(
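The first added line already shows what the new test cases cover: the pythonic parser accepting a call list wrapped in Llama's <|python_start|>/<|python_end|> tags. A hedged illustration of the tag handling (mine, not vLLM's implementation):

# Hedged illustration, not vLLM's code: tolerate the tag wrapper by
# stripping it before parsing the pythonic call list.
raw = "<|python_start|>[get_weather(city='San Francisco', metric='celsius')]<|python_end|>"
calls = raw.removeprefix("<|python_start|>").removesuffix("<|python_end|>")
print(calls)  # [get_weather(city='San Francisco', metric='celsius')]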