Ugo Varetto ugovaretto

llama.cpp on multiple nodes

After compiling llama.cpp with RPC support enabled (CMake option `-DGGML_RPC=ON`):

Run rpc-server on the remote nodes:

rpc-server --port 5001 --host 169.254.51.65

	# uv venv
	# uv pip install gguf
	# uv run gguf-jinja-dump.py <gguf ..0001 file>

	from gguf import GGUFReader
	import sys

	def get_jinja_template(file_path):
	reader = GGUFReader(file_path)
	# The key is standard for tokenizer chat templates

	import asyncio
	import time
	from concurrent.futures import ThreadPoolExecutor, as_completed
	import sys

	async def do_async_work(task_id: int, duration: float = 0.1) -> str:
	    """Sleep asynchronously for ``duration`` seconds, then report completion.

	    Args:
	        task_id: Identifier embedded in the returned message.
	        duration: Time in seconds handed to ``asyncio.sleep``.

	    Returns:
	        The string ``"Task <task_id> completed"``.
	    """
	    # The body must be indented under the ``async def`` line; without the
	    # indentation the function does not parse.
	    await asyncio.sleep(duration)
	    return f"Task {task_id} completed"

	async def run_asyncio(tasks: int=5):

	{
	"$schema": "https://charm.land/crush.json",
	"mcp": {
	"websearch-fetch-mcp": {
	"type": "stdio",
	"command": "uv",
	"args": [
	"run",
	"--with", "fastmcp",
	"--with", "requests",

	#!/usr/bin/env python3
	import os
	import sys
	import requests
	import json
	import argparse

	def generate_bash_command(prompt):
	api_key = os.getenv('BASH_AI_KEY')
	api_url = os.getenv('BASH_URL_KEY', "http://localhost:11434/v1/chat/completions")

	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.service import Service
	from webdriver_manager.chrome import ChromeDriverManager

	# 1. Setup the WebDriver (using webdriver_manager for simplicity)
	# This automatically downloads and manages the correct ChromeDriver version
	service = Service(ChromeDriverManager().install())
	driver = webdriver.Chrome(service=service)

	# llama-swap configuration
	models:
	VibeThinker-1.5B:
	cmd: llama-server --port ${PORT} -c 0 --model /home/ugo/.cache/llama.cpp/VibeThinker-1.5B.f16.gguf -ngl 99
	Aquif-3.5-Max-42B-A3B:
	cmd: >
	llama-server --port ${PORT}
	--model /home/ugo/.cache/llama.cpp/unsloth-aquif-3.5-Max-42B-A3B-GGUF/aquif-3.5-Max-42B-A3B-UD-Q6_K_XL.gguf -ngl 99 -fa on
	Aquif-3.5-Max-42B-A3B-Coding-Q6_K_XL-KVQ8:
	cmd: >

	#!/usr/bin/env bash
	# Download GGUF files from a Hugging Face repository with `hf download`.
	# $1: repository id passed to `hf download`
	# $2: file-name suffix; only files matching "*$2.gguf" are included, and they
	#     are stored under "$HOME/.cache/llama.cpp/$1/$2"
	# "$1" is quoted so the repository id is never word-split or glob-expanded.
	hf download "$1" --local-dir "$HOME/.cache/llama.cpp/$1/$2" --include="*$2.gguf"

	#!/usr/bin/env bash
	# $1: model file name
	# $2: port
	# $3: context size
	# $4: alias (model name sent to the client)

	# Define the help text as a function
	show_help() {
	echo "Usage: $0 <model file> <port> \\"
	echo " <context length, 0 for default> \\"

	GGML_BLAS_VENDOR Intel10_64_dyn
	Vulkan_GLSLANG_VALIDATOR_EXECUTABLE ~/.local/vulkan/1.4.321.1/x86_64/bin/glslangValidator
	Vulkan_GLSLC_EXECUTABLE ~/.local/vulkan/1.4.321.1/x86_64/bin/glslc
	Vulkan_INCLUDE_DIR ~/.local/vulkan/1.4.321.1/x86_64/include
	Vulkan_LIBRARY ~/.local/vulkan/1.4.321.1/x86_64/lib/libvulkan.so