llama-launcher
#!/bin/bash
set -euo pipefail
MODELS="aquif, gemma3, gpt, granite4, granite4-moe, lfm2, lfm2-moe, lfm2-vl, ministral3-{3b,3br,8b,8br}, nemotron, qwen3{i,r}, voxtral"
# Defaults; set before any usage() call, since `set -u` would otherwise
# abort on the unbound $temp referenced in the help text.
command=""
dry=false
temp=0

function usage() {
  echo "Usage: $0 [options] <model>"
  echo "Models: $MODELS."
  echo "Options:"
  echo "  --dry           print the llama-server invocation without running it"
  echo "  --temp <value>  set the temperature for the granite models (default: $temp);"
  echo "                  must be given before the model name"
}

if [ $# -eq 0 ]; then
  usage
  exit 1
fi
while [ $# -gt 0 ]; do
  case $1 in
    "aquif")
      command="llama-server -hf mradermacher/aquif-3.6-8B-GGUF:Q4_K_M --temp 0.7 -ngl 0 --threads 4 --jinja --cache-reuse 256 -c 8192"
      shift ;;
    "granite4")
      # One can also use ibm-granite/granite-4.0-h-micro-GGUF:Q4_K_M and pick
      # parameters at will; IBM says they are all good, depending on your needs.
      command="llama-server -hf unsloth/granite-4.0-h-micro-GGUF:UD-Q4_K_XL --top-k 20 --top-p 0.95 --min-p 0.0 --temp $temp -ngl 0 --threads 4 --jinja --cache-reuse 256 -c 16384"
      shift ;;
    "granite4-moe")
      # One can also use ibm-granite/granite-4.0-h-tiny-GGUF:Q4_K_M and pick
      # parameters at will; IBM says they are all good, depending on your needs.
      command="llama-server -hf unsloth/granite-4.0-h-tiny-GGUF:UD-Q4_K_XL --top-k 20 --top-p 0.95 --min-p 0.0 --temp $temp -ngl 0 --threads 4 --jinja --cache-reuse 256 -c 16384"
      shift ;;
    "gemma3")
      command="llama-server -hf stduhpf/google-gemma-3-4b-it-qat-q4_0-gguf-small --top-k 64 --top-p 0.95 --min-p 0.0 --repeat-penalty 1.0 --temp 1.0 -ngl 0 --threads 4 --jinja --cache-reuse 256 -c 16384"
      shift ;;
    "gpt")
      command="llama-server -m /Users/mseri/Downloads/GPT-OSS-20B-Pruned-Q5_0.gguf --temp 1.0 --top-p 1.0 --top-k 0 --min-p 0.01 --threads 4 -ngl 0 -ub 2048 -b 2048 --jinja -c 16384 --chat-template-kwargs '{\"reasoning_effort\": \"medium\"}'"
      shift ;;
    "lfm2")
      command="llama-server -hf LiquidAI/LFM2-2.6B-GGUF:Q8_0 --temp 0.3 --min-p 0.15 --repeat-penalty 1.05 --sampling-seq edskypmxt --threads 4 -ngl 0 --jinja -c 16384"
      shift ;;
    "lfm2-moe")
      command="llama-server -hf unsloth/LFM2-8B-A1B-GGUF:Q4_K_XL --temp 0.3 --min-p 0.15 --repeat-penalty 1.05 --threads 4 -ngl 0 --jinja -c 16384"
      shift ;;
    "lfm2-vl")
      command="llama-server -hf bartowski/LiquidAI_LFM2-VL-1.6B-GGUF:Q6_K --temp 0.1 --min-p 0.15 --top-p 1.0 --top-k 50 --repeat-penalty 1.05 -ngl 0 --threads 4 --jinja -c 16384"
      shift ;;
    "ministral3-3b")
      command="llama-server -hf mistralai/Ministral-3-3B-Instruct-2512-GGUF:Q4_K_M --temp 0.15 -ngl 0 --threads 4 --jinja -c 16384"
      shift ;;
    "ministral3-3br")
      command="llama-server -hf mistralai/Ministral-3-3B-Reasoning-2512-GGUF:Q4_K_M --top-p 0.95 --temp 0.7 -ngl 0 --threads 4 --jinja -c 16384"
      shift ;;
    "ministral3-8b")
      command="llama-server -hf mistralai/Ministral-3-8B-Instruct-2512-GGUF:Q4_K_M --temp 0.15 -ngl 0 --threads 4 --jinja -c 16384"
      shift ;;
    "ministral3-8br")
      command="llama-server -hf mistralai/Ministral-3-8B-Reasoning-2512-GGUF:Q4_K_M --top-p 0.95 --temp 0.7 -ngl 0 --threads 4 --jinja -c 16384"
      shift ;;
    "nemotron")
      command="llama-server -hf bartowski/nvidia_NVIDIA-Nemotron-Nano-9B-v2-GGUF:Q4_K_M --temp 0.6 --top-p 0.95 -ngl 0 --threads 4 --jinja -c 16384"
      shift ;;
    "qwen3i")
      command="llama-server -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_XL --top-k 20 --top-p 0.8 --min-p 0.0 --temp 0.7 -ngl 0 --threads 4 --jinja --cache-reuse 256 -c 8192"
      shift ;;
    "qwen3r")
      command="llama-server -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_XL --top-k 20 --top-p 0.95 --min-p 0.0 --temp 0.6 -ngl 0 --threads 4 --jinja --cache-reuse 256 -c 8192"
      shift ;;
    "voxtral")
      command="llama-server -hf bartowski/mistralai_Voxtral-Mini-3B-2507-GGUF:Q4_K_M --top-p 0.95 --temp 0.2 -ngl 0 -c 16384 --cache-reuse 256 --threads 4 -ctk q8_0 -ctv q8_0"
      shift ;;
    "--dry")
      dry=true
      shift ;;
    "--temp")
      if [ $# -lt 2 ]; then
        echo "Error: --temp requires a value."
        exit 1
      fi
      temp="$2"
      shift 2 ;;
    "--help"|"-h")
      usage
      exit 0 ;;
    *)
      echo "Error: Unknown argument: $1"
      usage
      exit 1 ;;
  esac
done

# Guard against option-only invocations (e.g. a bare --dry), which would
# otherwise eval an empty string and exit silently.
if [ -z "$command" ]; then
  echo "Error: no model selected."
  usage
  exit 1
fi

if $dry; then
  echo "$command"
  exit 0
else
  eval "$command"
fi
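
Example invocations (a sketch; it assumes the script is saved as an executable `llama-launcher`, that `llama-server` from llama.cpp is on your PATH, and that the `-hf` models can be fetched on first use):

./llama-launcher --dry qwen3i          # print the llama-server command only
./llama-launcher --temp 0.2 granite4   # --temp must precede the model name
./llama-launcher gemma3                # download (first run) and serve the model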