Angel Anton (monday8am)
@monday8am
monday8am / chart_notations.md
Last active December 9, 2025 17:20
Koog framework using a local inference engine
| Layer | Component | Role |
| --- | --- | --- |
| Koog | LLMClient | Interface Koog expects |
| Bridge | LocalInferenceLLMClient | Adapter implementing Koog's interface |
| Bridge | PromptExecuter: (Str) → Str? | Function passed at construction |
| Bridge | LocalInferenceEngine | Platform-agnostic interface |
| Platform | LiteRTLmInferenceEngineImpl | Android-specific implementation |
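To make the wiring concrete, here is a minimal Kotlin sketch of these layers. The LLMClient shape is deliberately simplified (Koog's real interface also carries model selection, tools, and streaming), and apart from the component names in the table every identifier below is an illustrative assumption.

// Minimal sketch of the bridge; only the component names come from the table above.

// Platform-agnostic engine interface (Bridge layer).
interface LocalInferenceEngine {
    suspend fun generate(prompt: String): String?
}

// Function type passed at construction (the PromptExecuter row).
typealias PromptExecuter = suspend (String) -> String?

// Simplified stand-in for the interface Koog expects.
interface LLMClient {
    suspend fun execute(prompt: String): String
}

// Adapter implementing Koog's interface on top of the local engine.
class LocalInferenceLLMClient(
    private val executePrompt: PromptExecuter,
) : LLMClient {
    override suspend fun execute(prompt: String): String =
        executePrompt(prompt) ?: error("Local inference produced no output")
}

// Android-specific implementation backed by LiteRT-LM.
class LiteRTLmInferenceEngineImpl : LocalInferenceEngine {
    override suspend fun generate(prompt: String): String? =
        TODO("Delegate to the LiteRT-LM engine")
}

A LocalInferenceLLMClient would then be constructed with the engine's generate function as its PromptExecuter.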
| Model | Inference Library | Backend | Tool Calling Support | Key Issues |
| --- | --- | --- | --- | --- |
| Qwen3-0.6B | LiteRT-LM | ❌ CPU (slow) | ❌ Not supported | Multiturn conversation broken by crashes with thinking models; unable to deactivate thinking mode; google-ai-edge/LiteRT-LM#1027 |
| Hammer2.1-0.5B | LiteRT-LM | ❌ CPU (slow) | ❌ Not supported | Multiturn conversation broken due to an internal API failure |
| Hammer2.1-0.5B | MediaPipe | ❌ CPU (slow) | ❌ Not supported | Internal failure when loading the model; solution apparently abandoned by Google |
| Gemma3-1B | LiteRT-LM | ✅ GPU (fast) | ❌ Not supported | No tool-calling training; unable to identify tools or maintain the input/output schema |
@monday8am
monday8am / mediapipe_init.kt
Created December 9, 2025 11:29
Initialization code for the MediaPipe local inference library
suspend fun initialize(
    modelConfig: ModelConfiguration,
    modelPath: String,
): Result<Unit> =
    withContext(dispatcher) {
        runCatching {
            // Create the MediaPipe LlmInference instance from the model file on disk.
            val llmInference = createLlmInference(context, modelPath, modelConfig)
            // Pair the inference instance with the Hammer prompt formatter.
            val backend = LlmInferenceBackend(llmInference, HammerFormatter())
            val systemInstruction = createSystemInstruction()
            // ...
        }
    }
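For context, a hedged example of how this initialize() might be called; the wrapper type, function name, and model path below are illustrative assumptions rather than part of the gist.

// Hypothetical call site; MediaPipeInferenceEngine and the path are illustrative only.
suspend fun warmUp(engine: MediaPipeInferenceEngine, config: ModelConfiguration) {
    engine.initialize(modelConfig = config, modelPath = "/data/local/tmp/hammer2.1.task")
        .onSuccess { println("MediaPipe LLM engine ready") }
        .onFailure { e -> println("MediaPipe init failed: ${e.message}") }
}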
@monday8am
monday8am / conversation_api.kt
Created December 9, 2025 10:21
Simplified code for instantiating and using the Conversation API
override suspend fun initialize(
    modelConfig: ModelConfiguration,
    modelPath: String,
): Result<Unit> =
    withContext(dispatcher) {
        val engineConfig =
            EngineConfig(
                modelPath = modelPath,
                // Use the GPU backend only when the model supports hardware acceleration.
                backend = if (modelConfig.hardwareAcceleration == HardwareBackend.GPU_SUPPORTED)
                    Backend.GPU else Backend.CPU,
            )
        // ...
    }
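The "use" half could then look roughly like the sketch below; Conversation, createConversation, and sendMessage are assumed names for illustration and should be checked against the actual LiteRT-LM Kotlin surface.

// Assumed shape of the conversation side; none of these names are confirmed LiteRT-LM API.
interface Conversation {
    suspend fun sendMessage(text: String): String
}

interface ConversationEngine {
    suspend fun initialize(modelConfig: ModelConfiguration, modelPath: String): Result<Unit>
    fun createConversation(): Conversation
}

// Initialize once, then run a single-turn exchange.
suspend fun askLocalModel(
    engine: ConversationEngine,
    config: ModelConfiguration,
    modelPath: String,
    question: String,
): String? =
    engine.initialize(config, modelPath)
        .map { engine.createConversation().sendMessage(question) }
        .getOrNull()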
# STEP 6.5 — Build .task bundle using MediaPipe
import os
import mediapipe as mp
from mediapipe.tasks.python.genai import bundler

# Configure the bundle.
# Hammer2.1 uses Qwen2.5's tokenizer with the ChatML format.
task_config = bundler.BundleConfig(
    tflite_model=OUTPUT_TFLITE,
    tokenizer_model=OUTPUT_SPM,
    # The remaining fields are assumed from the ChatML convention; verify the
    # control tokens against the model's tokenizer config. OUTPUT_TASK is
    # assumed to be defined earlier in the notebook.
    start_token="<|im_start|>",
    stop_tokens=["<|im_end|>"],
    output_filename=OUTPUT_TASK,
)
bundler.create_bundle(task_config)
# STEP 6 — Build .litertlm using high-level API
%cd /content/ai-edge-torch/
import os
import json
import tempfile
from ai_edge_torch.generative.utilities import litertlm_builder
print(f"🏗️ Building .litertlm bundle with {MODEL_NAME} metadata...")
@monday8am
monday8am / llmconversion.ipynb
Last active December 8, 2025 14:32
LLMConversion.ipynb
@monday8am
monday8am / tflite-conversion.py
Created December 8, 2025 14:08
Conversion script to .tflite format
# STEP 4 — Convert to TFLite with quantization
hammer_converter = "/content/ai-edge-torch/ai_edge_torch/generative/examples/hammer/convert_to_tflite.py"
qwen3_converter = "/content/ai-edge-torch/ai_edge_torch/generative/examples/qwen/convert_v3_to_tflite.py"

if MODEL_NAME == HAMMER2P1:
    print("✅ Using Hammer-specific converter")
    converter_script = hammer_converter
else:
    print("✅ Using Qwen3-specific converter")
    converter_script = qwen3_converter
@monday8am
monday8am / BFCL-Model-Comparison.md
Last active December 9, 2025 12:13
BFCL Local Inference Model Comparison
| Model | Overall Acc | Single Turn (Non-live) | Hall. Irrelevance | Latency (sec) | Est. Memory 8-bit (GB) |
| --- | --- | --- | --- | --- | --- |
| Claude Sonnet4.5 (FC) | 68.68 | 88.56 | 86.32 | 4.1 | N/A (cloud) |
| Qwen3-0.6B (FC) | 22.59 | 68.56 | 81.79 | 7.02 | ✅ ~0.6 |
| Phi4 mini (FC) | 21.7 | 73.21 | 80.68 | 6.76 | ~3.8 |
| Hammer2.1-0.5b (FC) | 21.11 | 68.62 | 74.27 | 0.91 | ✅ ~0.5 |
| Llama-3.2-1B (FC) | 10.85 | 37.9 | 53.03 | 1.18 | ~1.0 |
| Gemma-3-1B (Prompt) | 6.82 | 2.23 | 58.28 | 15.13 | ~1.0 |
@monday8am
monday8am / model_comparison.CSV
Last active December 8, 2025 15:23
BFCL Model comparison
Model,Overall-Acc,Single-Turn(Non-live),Hall-Irrelevance,Latency(sec),Est-Memory-8bit(GB)
Claude-Sonnet-4-5-20250929 (FC),68.68,88.56,86.32,4.1,N/A (cloud)
Qwen3-0.6B (FC),22.59,68.56,81.79,7.02,~0.6
Phi-4-mini-instruct (FC),21.7,73.21,80.68,6.76,~3.8
Hammer2.1-0.5b (FC),21.11,68.62,74.27,0.91,~0.5
Llama-3.2-1B-Instruct (FC),10.85,37.9,53.03,1.18,~1.0
Gemma-3-1b-it (Prompt),6.82,2.23,58.28,15.13,~1.0