The issue manifests when calling `model.respond(to: prompt)` from the LLM.swift package:
```swift
// File: LocalAIService.swift
// Location: sendSingle() method
@MainActor
private func sendSingle(prompt: String, model: LLM) async {
    isProcessing = true
    currentChunk = 1
    totalChunks = 1
    self.response = ""
    do {
        // ⚠️ ISSUE: This call can throw KV cache errors
        await model.respond(to: prompt)
        self.response = model.output

        // Sometimes the error appears in the output instead of throwing.
        // Check for KV cache errors in the response.
        if await checkForKVCacheError(response: model.output) {
            print("🚨 KV cache error detected, attempting automatic recovery...")
            await handleKVCacheError(originalPrompt: prompt)
            return
        }
    } catch {
        print("❌ Error during AI processing: \(error)")
        // Check if it's a KV cache related error
        if await isKVCacheError(error) {
            print("🚨 KV cache error detected in exception, attempting automatic recovery...")
            await handleKVCacheError(originalPrompt: prompt)
            return
        }
        self.response = "Error: \(error.localizedDescription)"
    }
    isProcessing = false
}
```

The KV cache errors appear in two forms:
"failed to find kv cache slot""kv cache""llama_decode: failed to decode""ubatch""cache slot""decode failed"
"failed to find kv cache slot""kv cache slot""llama_decode: failed to decode""decode: failed to find""ubatch of size""ret = 1""..."(truncated responses)
```swift
// MARK: - KV Cache Error Detection and Handling

/// Check if an error is related to KV cache issues
private func isKVCacheError(_ error: Error) async -> Bool {
    let errorDescription = error.localizedDescription.lowercased()
    let kvCacheErrorPatterns = [
        "failed to find kv cache slot",
        "kv cache",
        "llama_decode: failed to decode",
        "ubatch",
        "cache slot",
        "decode failed"
    ]
    return kvCacheErrorPatterns.contains { pattern in
        errorDescription.contains(pattern)
    }
}

/// Check if the response contains KV cache error messages
private func checkForKVCacheError(response: String) async -> Bool {
    let responseLower = response.lowercased()
    let kvCacheErrorPatterns = [
        "failed to find kv cache slot",
        "kv cache slot",
        "llama_decode: failed to decode",
        "decode: failed to find",
        "ubatch of size",
        "ret = 1",
        "..."
    ]
    return kvCacheErrorPatterns.contains { pattern in
        responseLower.contains(pattern)
    }
}
```

Our workaround involves:
- Detecting KV cache errors (both in exceptions and response content)
- Clearing the model instance
- Reinitializing the model
- Retrying the original prompt
```swift
/// Handle KV cache errors with automatic recovery
@MainActor
private func handleKVCacheError(originalPrompt: String) async {
    print("🔧 Starting automatic KV cache error recovery...")

    // Update UI to show recovery in progress
    self.response = "🔧 KV cache error detected. Automatically clearing cache and retrying..."

    // Perform full cache reset
    await clearCacheAndReinitialize()

    // Wait a moment for the model to fully initialize
    try? await Task.sleep(nanoseconds: 1_000_000_000) // 1 second

    // Retry the original prompt with the fresh model
    if model != nil {
        print("🔄 Retrying original prompt after cache reset...")
        self.response = "🔄 Retrying after cache reset..."
        await model!.respond(to: originalPrompt)
        self.response = model!.output
        print("✅ Recovery completed successfully")
    } else {
        self.response = "❌ Failed to recover from KV cache error. Please try again or restart the application."
        print("❌ Recovery failed - model could not be reinitialized")
    }
    isProcessing = false
}

/// Clear the model cache and reinitialize with fresh memory settings
@MainActor
func clearCacheAndReinitialize() async {
    print("🧹 Clearing AI cache and reinitializing...")

    // Stop any current processing
    stop()

    // Clear the current model to free memory
    model = nil
    response = ""
    currentChunk = 0
    totalChunks = 0

    // Force garbage collection
    autoreleasepool {
        // This helps ensure memory is actually freed
    }

    // Refresh system specs to get current memory state
    refreshSystemSpecs()

    // Reinitialize the model with fresh settings
    await initializeModel()

    print("✅ Cache cleared and model reinitialized")
}
```
With the detection and recovery helpers in place, the patched `sendSingle()` looks like this:

```swift
@MainActor
private func sendSingle(prompt: String, model: LLM) async {
    isProcessing = true
    currentChunk = 1
    totalChunks = 1
    self.response = ""
    do {
        await model.respond(to: prompt)
        self.response = model.output

        // ✅ FIX: Check for KV cache errors in the response
        if await checkForKVCacheError(response: model.output) {
            print("🚨 KV cache error detected, attempting automatic recovery...")
            await handleKVCacheError(originalPrompt: prompt)
            return
        }
    } catch {
        print("❌ Error during AI processing: \(error)")
        // ✅ FIX: Check if it's a KV cache related error
        if await isKVCacheError(error) {
            print("🚨 KV cache error detected in exception, attempting automatic recovery...")
            await handleKVCacheError(originalPrompt: prompt)
            return
        }
        self.response = "Error: \(error.localizedDescription)"
    }
    isProcessing = false
}
```

We recommend that the LLM.swift package:
- Internally handle KV cache exhaustion by:
  - Automatically clearing and reallocating the KV cache when it becomes full
  - Providing a method to manually clear the KV cache without reinitializing the entire model
  - Throwing more specific error types that can be caught and handled
- Add KV cache management methods (a caller-side usage sketch follows this list), for example:

  ```swift
  // Proposed API additions
  extension LLM {
      func clearKVCache() async throws
      var kvCacheStatus: KVCacheStatus { get }
      func resetContext() async throws
  }
  ```

- Improve error reporting:
  - Throw `KVCacheError` instead of generic errors
  - Include cache capacity information in error messages
  - Provide guidance on how to resolve the issue
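To illustrate how callers could use such an API, here is a hypothetical sketch. `clearKVCache()` and `KVCacheError` are only the proposed names from above and do not exist in the current package, and the retry wrapper is purely illustrative:

```swift
// Hypothetical caller-side sketch: clearKVCache() and KVCacheError are the
// proposed additions above, not existing LLM.swift API. Under the proposal,
// respond(to:) would throw KVCacheError on cache exhaustion, so a full model
// reinitialization would no longer be needed.
func respondWithCacheRecovery(using model: LLM, to prompt: String) async throws -> String {
    do {
        try await model.respond(to: prompt)
    } catch is KVCacheError {
        // Clear just the KV cache and retry, instead of tearing down the model
        try await model.clearKVCache()
        try await model.respond(to: prompt)
    }
    return model.output
}
```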
Environment:

- Package: LLM.swift
- Model: bartowski/gemma-2-2b-it-GGUF
- Template: Gemma
- Max Token Count: 16384
- Platform: macOS
- Swift Version: 5.9+
Steps to reproduce (a minimal sketch follows the list):

1. Initialize a model with `LLM(from: huggingFaceModel, maxTokenCount: 16384)`
2. Send multiple prompts in sequence using `model.respond(to: prompt)`
3. After several prompts (typically 3-5), the KV cache error occurs
4. The error either:
   - Throws an exception with KV cache related messages, OR
   - Returns a response containing KV cache error strings
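A minimal sketch of that sequence is shown below. Only `LLM(from:maxTokenCount:)`, `respond(to:)`, and `output` are taken from the report; the `huggingFaceModel` descriptor and the surrounding function are assumptions, and the exact initializer signature may differ between LLM.swift versions.

```swift
// Minimal reproduction sketch, assuming a `huggingFaceModel` descriptor already
// configured for bartowski/gemma-2-2b-it-GGUF with the Gemma template
// (construction omitted; see the environment details above).
func reproduceKVCacheError(with huggingFaceModel: HuggingFaceModel) async throws {
    let model = try await LLM(from: huggingFaceModel, maxTokenCount: 16384)
    let prompts = ["Prompt 1", "Prompt 2", "Prompt 3", "Prompt 4", "Prompt 5"]
    for prompt in prompts {
        await model.respond(to: prompt)
        // After roughly 3-5 prompts, the call either fails or model.output
        // contains strings like "failed to find kv cache slot".
        print(model.output)
    }
}
```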