Skip to content

Instantly share code, notes, and snippets.

@yiwenlu66
Last active December 3, 2025 16:21
Show Gist options
  • Select an option

  • Save yiwenlu66/cf497d7a940a1fc8d6b29d98a976c040 to your computer and use it in GitHub Desktop.

Select an option

Save yiwenlu66/cf497d7a940a1fc8d6b29d98a976c040 to your computer and use it in GitHub Desktop.
Whisper dictation: F2 to transcribe, F3 to transcribe + cleanup with Ollama (Qwen 2.5). ~160 lines, GPU-accelerated.
#!/usr/bin/env python3
"""Whisper dictation: F2 to transcribe, F3 to transcribe + cleanup with Ollama."""
import json
import re
import subprocess
import threading
import time
import urllib.request
import numpy as np
import sounddevice as sd
import whisper
from pynput import keyboard
MODEL = "turbo"  # Whisper checkpoint to load (see whisper.load_model)
SAMPLE_RATE = 16000  # Hz; matches Whisper's expected 16 kHz mono input
MIN_DURATION = 0.5 # Ignore recordings shorter than this (seconds)
PROMPT = None # Optional context for transcription
# Ollama configuration for text cleanup (F3)
OLLAMA_HOST = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:3b"
# Instruction template sent to Ollama; {text} is filled with the raw transcript.
CLEANUP_PROMPT = """Clean up the following transcribed speech:
- Add proper punctuation and capitalization
- Remove spoken disfluencies (um, uh, er, like, you know, repeated words, false starts)
- Fix minor grammar issues
- Keep the original language, do NOT translate
- Preserve the original wording, meaning, and tone; only make minimal edits for readability
Output ONLY the cleaned text, nothing else.
Text: {text}
Cleaned:"""
# Post-processing replacements (case-insensitive)
REPLACEMENTS = {
    "claude.md": "CLAUDE.md",
}


def postprocess(text):
    """Apply the case-insensitive literal substitutions in REPLACEMENTS.

    Both sides are treated as literal text: the pattern is escaped with
    re.escape, and the replacement is supplied as a callable so that
    backslashes or group references (e.g. "\\g<1>") in a future
    replacement value are never interpreted by re.sub.

    Parameters:
        text: the transcript string to normalize.
    Returns:
        The text with every REPLACEMENTS key rewritten to its value.
    """
    for wrong, right in REPLACEMENTS.items():
        # Bind `right` via a default arg to avoid the late-binding closure pitfall.
        text = re.sub(re.escape(wrong), lambda _m, r=right: r, text, flags=re.IGNORECASE)
    return text
def ensure_ollama_running():
    """Start the Ollama server if it is not already reachable.

    Probes GET /api/tags; on failure, spawns ``ollama serve`` with its
    output discarded and polls the same endpoint for up to ~3 seconds
    (30 attempts x 0.1 s).

    Returns:
        True once the server answers, False if it never comes up.
    """
    req = urllib.request.Request(f"{OLLAMA_HOST}/api/tags")
    try:
        # Close the response explicitly; the original leaked the socket.
        with urllib.request.urlopen(req, timeout=2):
            return True
    except Exception:
        pass
    print("Starting Ollama server...", end="", flush=True)
    subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    for _ in range(30):
        time.sleep(0.1)
        try:
            with urllib.request.urlopen(req, timeout=1):
                print(" ready.")
                return True
        except Exception:
            pass
    print(" failed to start.")
    return False
def cleanup_text(text):
    """Ask Ollama (Qwen) to fix punctuation and grammar in *text*.

    Falls back to returning the raw input whenever the request fails
    or the model's output looks implausible.
    """
    try:
        body = {
            "model": OLLAMA_MODEL,
            "prompt": CLEANUP_PROMPT.format(text=text),
            "stream": False,
        }
        req = urllib.request.Request(
            f"{OLLAMA_HOST}/api/generate",
            data=json.dumps(body).encode(),
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=30) as resp:
            reply = json.loads(resp.read().decode())
        cleaned = reply.get("response", "").strip()
        # Sanity check: an empty or wildly longer "cleanup" means the
        # model went off-script, so keep the raw transcript instead.
        if cleaned and len(cleaned) < len(text) * 3:
            return cleaned
        print(" (cleanup produced invalid output, using raw)", end="")
        return text
    except Exception as e:
        print(f" (cleanup failed: {e}, using raw)", end="")
        return text
print(f"Loading {MODEL} model...")
model = whisper.load_model(MODEL)  # loaded once at startup; this is the slow step
ensure_ollama_running()
print("Ready. F2=transcribe, F3=transcribe+cleanup")
# Shared state between the keyboard-listener thread and the audio callback;
# all reads/writes of these are guarded by `lock` (except the cheap pre-check
# in audio_callback).
recording = False
audio_chunks = []
pending_cleanup = False
lock = threading.Lock()
def on_press(key):
    """Hotkey press handler: start buffering audio on F2/F3.

    F3 additionally flags the take for Ollama cleanup after transcription.
    """
    global recording, audio_chunks, pending_cleanup
    if key not in (keyboard.Key.f2, keyboard.Key.f3) or recording:
        return
    with lock:
        recording = True
        audio_chunks = []
        pending_cleanup = key == keyboard.Key.f3
    label = "Recording+cleanup" if pending_cleanup else "Recording"
    print(f"{label}...", end="", flush=True)
def on_release(key):
    """Hotkey release handler: stop buffering and hand off to a worker.

    Snapshots the captured chunks under the lock, then transcribes on a
    daemon thread so the listener stays responsive.
    """
    global recording, audio_chunks, pending_cleanup
    if key not in (keyboard.Key.f2, keyboard.Key.f3) or not recording:
        return
    with lock:
        recording = False
        chunks = audio_chunks.copy()
        audio_chunks = []
        should_cleanup = pending_cleanup
        pending_cleanup = False
    if not chunks:
        print(" (no audio)")
        return
    worker = threading.Thread(
        target=transcribe_and_paste, args=(chunks, should_cleanup), daemon=True
    )
    worker.start()
def audio_callback(indata, frames, time_info, status):
    """sounddevice InputStream callback: buffer audio while recording.

    NOTE: the third positional argument was previously named ``time``,
    which shadowed the stdlib ``time`` module imported by this file;
    renamed to ``time_info``. sounddevice invokes the callback
    positionally, so the rename is backward-compatible.
    """
    # Cheap unlocked pre-check keeps the audio thread fast when idle.
    if recording:
        with lock:
            # Re-check under the lock: on_release may have just cleared it.
            if recording:
                audio_chunks.append(indata.copy())
def transcribe_and_paste(chunks, cleanup=False):
    """Worker thread: run Whisper on the buffered audio and type the result.

    chunks  -- list of float32 numpy arrays captured by audio_callback
    cleanup -- when True, pass the transcript through Ollama first
    """
    print(" transcribing...", end="", flush=True)
    samples = np.concatenate(chunks).flatten().astype(np.float32)
    seconds = len(samples) / SAMPLE_RATE
    if seconds < MIN_DURATION:
        print(f" (too short: {seconds:.1f}s)")
        return
    whisper_out = model.transcribe(samples, fp16=True, task="transcribe", initial_prompt=PROMPT)
    transcript = whisper_out["text"].strip()
    if cleanup and transcript:
        print(" cleaning...", end="", flush=True)
        transcript = cleanup_text(transcript)
    transcript = postprocess(transcript)
    if not transcript:
        print(" (empty)")
        return
    # "--" stops xdotool from treating a leading dash in the text as an option.
    subprocess.run(["xdotool", "type", "--clearmodifiers", "--", transcript], check=True)
    print(f" [{transcript}]")
# Open the microphone stream (mono float32 at Whisper's sample rate) and
# block on the global hotkey listener; the process runs until killed.
stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype=np.float32, callback=audio_callback)
stream.start()
with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    listener.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment