syabro · November 25, 2025 05:23
diff --git a/mumble.sh b/mumble.sh
 #!/bin/bash

 # Speech-to-Text Recording and Transcription Script
 #
 # Usage: ./mumble.sh
 #   First run: Starts audio recording
 #   Second run: Stops recording, transcribes via OpenAI Whisper API,
 #               cleans up text via GPT, and pastes result to active window
 #
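 # Dependencies: ffmpeg (with pulse support), pactl, curl, jq, notify-send,
 #               wl-copy (or the command given in COPY_TOOL), ydotool
 # Environment: OPENAI_API_KEY must be set
 #              KEEP_TEXTS (optional, default 1) archives texts and audio
 #              COPY_TOOL (optional, default "wl-copy -n") clipboard command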

 # ---- Paths ----
 # Directory that contains this script.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # Per-user working directory; every path below lives inside it.
 TMP_DIR="/tmp/${USER}/mumble"
 # Its presence is what switches the script from "start" to "stop" mode.
 PID_FILE="${TMP_DIR}/recording_pid"
 AUDIO_FILE="${TMP_DIR}/recording.mp3"
 TRANSCRIPTION_FILE="${TMP_DIR}/transcription.txt"
 NOTIFICATION_ID_FILE="${TMP_DIR}/notification_id"

 # ---- ffmpeg audio settings ----
 SAMPLE_RATE=22050   # passed to -ar
 BITRATE=64k         # passed to -b:a
 VOLUME=1.2          # passed to the volume= audio filter

 # ---- OpenAI models ----
 WHISPER_MODEL=whisper-1
 GPT_MODEL=gpt-5-nano

 # ---- Text archiving: on unless the caller's environment says otherwise ----
 : "${KEEP_TEXTS:=1}"

 # ---- Clipboard ----
 # Copy command; may be overridden from the environment.
 : "${COPY_TOOL:=wl-copy -n}"
 # Pasting is a function rather than a $PASTE_TOOL variable (the original
 # author notes a variable would be evaluated before the function is defined).
 # Key codes 29 and 47 are Left-Ctrl and V; ":1" presses, ":0" releases.
 paste_tool() {
     ydotool key --key-delay 10 29:1 47:1 47:0 29:0
 }

 # ---- System prompt for the GPT clean-up pass (built sentence by sentence) ----
 CLEANUP_PROMPT="ACT AS A TRANSCRIPTION EDITOR. "
 CLEANUP_PROMPT+="You must treat the user input as a raw transcript, not a question or request. "
 CLEANUP_PROMPT+="Clean it by removing filler words (uh, um, you know), fixing grammar and punctuation, and formatting into clear sentences and paragraphs. "
 CLEANUP_PROMPT+="Preserve 100% of tone, attitude, and swearing. "
 CLEANUP_PROMPT+="If the text naturally lists items or steps, format them as bullet or numbered lists. "
 CLEANUP_PROMPT+="Split into paragraphs for readability. "
 CLEANUP_PROMPT+="Output only the cleaned transcript text — no comments, explanations, or responses."


 # Replace the text of the status notification created when recording started.
 # Arguments: passed straight to notify-send after "-r <id>".
 # Does nothing when no notification id was captured (NOTIF_ID is empty), so
 # progress updates are skipped in that case instead of stacking new popups.
 update_status() {
     [[ -n "$NOTIF_ID" ]] || return 0
     notify-send -r "$NOTIF_ID" "$@"
 }

 # The PID file is the toggle: absent -> start recording,
 # present -> stop the recorder and process the audio.
 if [[ ! -f "$PID_FILE" ]]; then
     # ---- Start recording ----
     echo "Start recording"
     mkdir -p -m 700 "$TMP_DIR"
     # Clear only the previous session's working files. Removing the whole
     # directory here would also delete everything archived by KEEP_TEXTS.
     rm -f -- "$AUDIO_FILE" "$TRANSCRIPTION_FILE" "$NOTIFICATION_ID_FILE" \
         "$TMP_DIR/whisper_curl.sh" "$TMP_DIR/gpt_curl.sh"
     # The id printed by -p is stored so later stages can replace this popup.
     notify-send -t 600000 -p "Mumble" "🔴 Recording..." > "$NOTIFICATION_ID_FILE"

     # Look up the default audio source. LC_ALL=C keeps the "Default Source:"
     # label untranslated; fall back to the "default" source if parsing fails.
     AUDIO_SOURCE=$(LC_ALL=C pactl info | awk '/Default Source:/ {print $3; exit}')
     AUDIO_SOURCE=${AUDIO_SOURCE:-default}

     ffmpeg -f pulse -i "$AUDIO_SOURCE" -ar "$SAMPLE_RATE" -ac 1 -af "volume=$VOLUME" -b:a "$BITRATE" -fflags +flush_packets "$AUDIO_FILE" > /dev/null 2>&1 &
     printf '%s\n' "$!" > "$PID_FILE"
 else
     # ---- Stop and process recording ----

     # Without an API key nothing below can work; the recording keeps running
     # so the user can fix the environment and invoke the script again.
     if [[ -z "${OPENAI_API_KEY:-}" ]]; then
         notify-send "Error" "OPENAI_API_KEY environment variable not set"
         exit 1
     fi

     echo "Stopping recording..."
     NOTIF_ID=$(cat "$NOTIFICATION_ID_FILE" 2>/dev/null)
     echo "NOTIF_ID: $NOTIF_ID"
     if [[ -n "$NOTIF_ID" ]]; then
         update_status -t 600000 "Mumble" "⏹️ Stopped. Processing..."
     else
         notify-send -t 600000 "Mumble" "⏹️ Stopped. Processing..."
     fi

     # The PID file only means "a recording is in progress". Drop it as soon as
     # the recorder is stopped so that a failure further down cannot leave the
     # script stuck in stop mode, signalling a stale (possibly reused) PID.
     recorder_pid=$(cat "$PID_FILE" 2>/dev/null)
     rm -f -- "$PID_FILE"
     if [[ "$recorder_pid" =~ ^[0-9]+$ ]]; then
         kill "$recorder_pid" 2>/dev/null
         # Wait (up to ~5s) until ffmpeg has really exited and finished
         # writing the file, instead of hoping a fixed sleep is long enough.
         for _ in {1..50}; do
             kill -0 "$recorder_pid" 2>/dev/null || break
             sleep 0.1
         done
     fi

     if [[ ! -s "$AUDIO_FILE" ]]; then
         notify-send "Transcription Failed" "Error: no audio was recorded"
         exit 1
     fi

     # ---- Transcribe with the Whisper API ----
     echo "Transcribing audio..."
     update_status -t 600000 "Mumble" "🎤 Transcribing..."

     # Debug aid: a copy of the request as a shell snippet. Variables are left
     # unexpanded on purpose so the API key is never written to disk. printf is
     # used instead of a here-document so indentation cannot break the file.
     printf '%s\n' \
         'curl -s https://api.openai.com/v1/audio/transcriptions \' \
         '  -H "Authorization: Bearer $OPENAI_API_KEY" \' \
         '  -H "Content-Type: multipart/form-data" \' \
         '  -F file="@$AUDIO_FILE" \' \
         '  -F model="$WHISPER_MODEL"' \
         > "$TMP_DIR/whisper_curl.sh"

     response=$(curl -s https://api.openai.com/v1/audio/transcriptions \
         -H "Authorization: Bearer $OPENAI_API_KEY" \
         -H "Content-Type: multipart/form-data" \
         -F file="@$AUDIO_FILE" \
         -F model="$WHISPER_MODEL")

     echo "Transcription response:"
     printf '%s\n' "$response"
     echo ""
     # "// empty" turns a missing/null .text into an empty string.
     transcription=$(printf '%s' "$response" | jq -r '.text // empty')

     if [[ -z "$transcription" || "$transcription" == "null" ]]; then
         notify-send "Transcription Failed" "Error: $response"
         exit 1
     fi

     printf '%s\n' "$transcription" > "$TRANSCRIPTION_FILE"

     echo "Original transcription:"
     printf '%s\n' "$transcription"
     echo ""

     # ---- Clean up the transcript with GPT ----
     echo "Cleaning up text with GPT..."
     update_status -t 600000 "Mumble" "✨ Cleaning up... $transcription"

     # jq builds the request body so the transcript is JSON-escaped correctly.
     jq_output=$(jq -n \
         --arg model "$GPT_MODEL" \
         --arg system "SYSTEM DIRECTIVE: $CLEANUP_PROMPT" \
         --arg user "$transcription" \
         '{model: $model, messages: [{role: "system", content: $system}, {role: "user", content: $user}]}')

     # Debug aid: %q shell-quotes the payload, so the snippet stays valid even
     # when the transcript contains apostrophes or newlines.
     {
         printf '%s\n' 'curl -s https://api.openai.com/v1/chat/completions \'
         printf '%s\n' '  -H "Authorization: Bearer $OPENAI_API_KEY" \'
         printf '%s\n' '  -H "Content-Type: application/json" \'
         printf '  -d %q\n' "$jq_output"
     } > "$TMP_DIR/gpt_curl.sh"

     cleaned_response=$(curl -s https://api.openai.com/v1/chat/completions \
         -H "Authorization: Bearer $OPENAI_API_KEY" \
         -H "Content-Type: application/json" \
         -d "$jq_output")

     cleaned_text=$(printf '%s' "$cleaned_response" | jq -r '.choices[0].message.content // empty')

     # If the clean-up call failed, log it and fall back to the raw transcript.
     if [[ -z "$cleaned_text" || "$cleaned_text" == "null" ]]; then
         error_msg="GPT cleanup failed: $cleaned_response"
         printf '%s\n' "$error_msg"
         printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$error_msg" >> "$TMP_DIR/error.log"
         notify-send "GPT Cleanup Failed" "Using original transcription. Check $TMP_DIR/error.log"
         cleaned_text="$transcription"
     fi

     file_size=$(du -h "$AUDIO_FILE" | cut -f1)
     # Character count of the text itself (wc -c counted bytes plus a newline).
     text_length=${#cleaned_text}

     echo ""
     echo "Cleaned text:"
     printf '%s\n' "$cleaned_text"
     echo ""
     echo "File size: $file_size, Text length: $text_length chars"
     echo "Audio file: $AUDIO_FILE"

     # COPY_TOOL is "command plus options"; split it into words explicitly
     # rather than relying on an unquoted expansion (which would also glob).
     read -r -a copy_cmd <<< "$COPY_TOOL"
     printf '%s' "$cleaned_text" | "${copy_cmd[@]}"

     update_status "Mumble" "✅ Complete! $file_size | $text_length chars"

     # Give the clipboard a moment to settle before sending the paste keystroke.
     sleep 0.3
     paste_tool

     # Archive this session's texts and audio under a timestamped name.
     if [[ "$KEEP_TEXTS" == "1" ]]; then
         timestamp=$(date '+%Y-%m-%d %H%M%S')
         printf '%s\n' "$transcription" > "$TMP_DIR/$timestamp-original.txt"
         printf '%s\n' "$cleaned_text" > "$TMP_DIR/$timestamp.txt"
         mv -- "$AUDIO_FILE" "$TMP_DIR/$timestamp.mp3"
         rm -f -- "$TRANSCRIPTION_FILE"
     fi
 fi
	#!/bin/bash

	# Speech-to-Text Recording and Transcription Script
	#
	# Usage: ./mumble.sh
	# First run: Starts audio recording
	# Second run: Stops recording, transcribes via OpenAI Whisper API,
	# cleans up text via GPT, and pastes result to active window
	#
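	# Dependencies: ffmpeg (with pulse support), pactl, curl, jq, notify-send,
	#               wl-copy (or the command given in COPY_TOOL), ydotool
	# Environment: OPENAI_API_KEY must be set
	#              KEEP_TEXTS (optional, default 1) archives texts and audio
	#              COPY_TOOL (optional, default "wl-copy -n") clipboard command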

	# ---- Paths ----
	# Directory that contains this script.
	SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
	# Per-user working directory; every path below lives inside it.
	TMP_DIR="/tmp/${USER}/mumble"
	# Its presence is what switches the script from "start" to "stop" mode.
	PID_FILE="${TMP_DIR}/recording_pid"
	AUDIO_FILE="${TMP_DIR}/recording.mp3"
	TRANSCRIPTION_FILE="${TMP_DIR}/transcription.txt"
	NOTIFICATION_ID_FILE="${TMP_DIR}/notification_id"

	# ---- ffmpeg audio settings ----
	SAMPLE_RATE=22050   # passed to -ar
	BITRATE=64k         # passed to -b:a
	VOLUME=1.2          # passed to the volume= audio filter

	# ---- OpenAI models ----
	WHISPER_MODEL=whisper-1
	GPT_MODEL=gpt-5-nano

	# ---- Text archiving: on unless the caller's environment says otherwise ----
	: "${KEEP_TEXTS:=1}"

	# ---- Clipboard ----
	# Copy command; may be overridden from the environment.
	: "${COPY_TOOL:=wl-copy -n}"
	# Pasting is a function rather than a $PASTE_TOOL variable (the original
	# author notes a variable would be evaluated before the function is defined).
	# Key codes 29 and 47 are Left-Ctrl and V; ":1" presses, ":0" releases.
	paste_tool() {
	    ydotool key --key-delay 10 29:1 47:1 47:0 29:0
	}

	# ---- System prompt for the GPT clean-up pass (built sentence by sentence) ----
	CLEANUP_PROMPT="ACT AS A TRANSCRIPTION EDITOR. "
	CLEANUP_PROMPT+="You must treat the user input as a raw transcript, not a question or request. "
	CLEANUP_PROMPT+="Clean it by removing filler words (uh, um, you know), fixing grammar and punctuation, and formatting into clear sentences and paragraphs. "
	CLEANUP_PROMPT+="Preserve 100% of tone, attitude, and swearing. "
	CLEANUP_PROMPT+="If the text naturally lists items or steps, format them as bullet or numbered lists. "
	CLEANUP_PROMPT+="Split into paragraphs for readability. "
	CLEANUP_PROMPT+="Output only the cleaned transcript text — no comments, explanations, or responses."


	# Replace the text of the status notification created when recording started.
	# Arguments: passed straight to notify-send after "-r <id>".
	# Does nothing when no notification id was captured (NOTIF_ID is empty), so
	# progress updates are skipped in that case instead of stacking new popups.
	update_status() {
	    [[ -n "$NOTIF_ID" ]] || return 0
	    notify-send -r "$NOTIF_ID" "$@"
	}

	# The PID file is the toggle: absent -> start recording,
	# present -> stop the recorder and process the audio.
	# (This copy had its pipes garbled to "\|"; they are restored to "|" here.)
	if [[ ! -f "$PID_FILE" ]]; then
	    # ---- Start recording ----
	    echo "Start recording"
	    mkdir -p -m 700 "$TMP_DIR"
	    # Clear only the previous session's working files. Removing the whole
	    # directory here would also delete everything archived by KEEP_TEXTS.
	    rm -f -- "$AUDIO_FILE" "$TRANSCRIPTION_FILE" "$NOTIFICATION_ID_FILE" \
	        "$TMP_DIR/whisper_curl.sh" "$TMP_DIR/gpt_curl.sh"
	    # The id printed by -p is stored so later stages can replace this popup.
	    notify-send -t 600000 -p "Mumble" "🔴 Recording..." > "$NOTIFICATION_ID_FILE"

	    # Look up the default audio source. LC_ALL=C keeps the "Default Source:"
	    # label untranslated; fall back to the "default" source if parsing fails.
	    AUDIO_SOURCE=$(LC_ALL=C pactl info | awk '/Default Source:/ {print $3; exit}')
	    AUDIO_SOURCE=${AUDIO_SOURCE:-default}

	    ffmpeg -f pulse -i "$AUDIO_SOURCE" -ar "$SAMPLE_RATE" -ac 1 -af "volume=$VOLUME" -b:a "$BITRATE" -fflags +flush_packets "$AUDIO_FILE" > /dev/null 2>&1 &
	    printf '%s\n' "$!" > "$PID_FILE"
	else
	    # ---- Stop and process recording ----

	    # Without an API key nothing below can work; the recording keeps running
	    # so the user can fix the environment and invoke the script again.
	    if [[ -z "${OPENAI_API_KEY:-}" ]]; then
	        notify-send "Error" "OPENAI_API_KEY environment variable not set"
	        exit 1
	    fi

	    echo "Stopping recording..."
	    NOTIF_ID=$(cat "$NOTIFICATION_ID_FILE" 2>/dev/null)
	    echo "NOTIF_ID: $NOTIF_ID"
	    if [[ -n "$NOTIF_ID" ]]; then
	        update_status -t 600000 "Mumble" "⏹️ Stopped. Processing..."
	    else
	        notify-send -t 600000 "Mumble" "⏹️ Stopped. Processing..."
	    fi

	    # The PID file only means "a recording is in progress". Drop it as soon as
	    # the recorder is stopped so that a failure further down cannot leave the
	    # script stuck in stop mode, signalling a stale (possibly reused) PID.
	    recorder_pid=$(cat "$PID_FILE" 2>/dev/null)
	    rm -f -- "$PID_FILE"
	    if [[ "$recorder_pid" =~ ^[0-9]+$ ]]; then
	        kill "$recorder_pid" 2>/dev/null
	        # Wait (up to ~5s) until ffmpeg has really exited and finished
	        # writing the file, instead of hoping a fixed sleep is long enough.
	        for _ in {1..50}; do
	            kill -0 "$recorder_pid" 2>/dev/null || break
	            sleep 0.1
	        done
	    fi

	    if [[ ! -s "$AUDIO_FILE" ]]; then
	        notify-send "Transcription Failed" "Error: no audio was recorded"
	        exit 1
	    fi

	    # ---- Transcribe with the Whisper API ----
	    echo "Transcribing audio..."
	    update_status -t 600000 "Mumble" "🎤 Transcribing..."

	    # Debug aid: a copy of the request as a shell snippet. Variables are left
	    # unexpanded on purpose so the API key is never written to disk. printf is
	    # used instead of a here-document so indentation cannot break the file.
	    printf '%s\n' \
	        'curl -s https://api.openai.com/v1/audio/transcriptions \' \
	        '  -H "Authorization: Bearer $OPENAI_API_KEY" \' \
	        '  -H "Content-Type: multipart/form-data" \' \
	        '  -F file="@$AUDIO_FILE" \' \
	        '  -F model="$WHISPER_MODEL"' \
	        > "$TMP_DIR/whisper_curl.sh"

	    response=$(curl -s https://api.openai.com/v1/audio/transcriptions \
	        -H "Authorization: Bearer $OPENAI_API_KEY" \
	        -H "Content-Type: multipart/form-data" \
	        -F file="@$AUDIO_FILE" \
	        -F model="$WHISPER_MODEL")

	    echo "Transcription response:"
	    printf '%s\n' "$response"
	    echo ""
	    # "// empty" turns a missing/null .text into an empty string.
	    transcription=$(printf '%s' "$response" | jq -r '.text // empty')

	    if [[ -z "$transcription" || "$transcription" == "null" ]]; then
	        notify-send "Transcription Failed" "Error: $response"
	        exit 1
	    fi

	    printf '%s\n' "$transcription" > "$TRANSCRIPTION_FILE"

	    echo "Original transcription:"
	    printf '%s\n' "$transcription"
	    echo ""

	    # ---- Clean up the transcript with GPT ----
	    echo "Cleaning up text with GPT..."
	    update_status -t 600000 "Mumble" "✨ Cleaning up... $transcription"

	    # jq builds the request body so the transcript is JSON-escaped correctly.
	    jq_output=$(jq -n \
	        --arg model "$GPT_MODEL" \
	        --arg system "SYSTEM DIRECTIVE: $CLEANUP_PROMPT" \
	        --arg user "$transcription" \
	        '{model: $model, messages: [{role: "system", content: $system}, {role: "user", content: $user}]}')

	    # Debug aid: %q shell-quotes the payload, so the snippet stays valid even
	    # when the transcript contains apostrophes or newlines.
	    {
	        printf '%s\n' 'curl -s https://api.openai.com/v1/chat/completions \'
	        printf '%s\n' '  -H "Authorization: Bearer $OPENAI_API_KEY" \'
	        printf '%s\n' '  -H "Content-Type: application/json" \'
	        printf '  -d %q\n' "$jq_output"
	    } > "$TMP_DIR/gpt_curl.sh"

	    cleaned_response=$(curl -s https://api.openai.com/v1/chat/completions \
	        -H "Authorization: Bearer $OPENAI_API_KEY" \
	        -H "Content-Type: application/json" \
	        -d "$jq_output")

	    cleaned_text=$(printf '%s' "$cleaned_response" | jq -r '.choices[0].message.content // empty')

	    # If the clean-up call failed, log it and fall back to the raw transcript.
	    if [[ -z "$cleaned_text" || "$cleaned_text" == "null" ]]; then
	        error_msg="GPT cleanup failed: $cleaned_response"
	        printf '%s\n' "$error_msg"
	        printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$error_msg" >> "$TMP_DIR/error.log"
	        notify-send "GPT Cleanup Failed" "Using original transcription. Check $TMP_DIR/error.log"
	        cleaned_text="$transcription"
	    fi

	    file_size=$(du -h "$AUDIO_FILE" | cut -f1)
	    # Character count of the text itself (wc -c counted bytes plus a newline).
	    text_length=${#cleaned_text}

	    echo ""
	    echo "Cleaned text:"
	    printf '%s\n' "$cleaned_text"
	    echo ""
	    echo "File size: $file_size, Text length: $text_length chars"
	    echo "Audio file: $AUDIO_FILE"

	    # COPY_TOOL is "command plus options"; split it into words explicitly
	    # rather than relying on an unquoted expansion (which would also glob).
	    read -r -a copy_cmd <<< "$COPY_TOOL"
	    printf '%s' "$cleaned_text" | "${copy_cmd[@]}"

	    update_status "Mumble" "✅ Complete! $file_size | $text_length chars"

	    # Give the clipboard a moment to settle before sending the paste keystroke.
	    sleep 0.3
	    paste_tool

	    # Archive this session's texts and audio under a timestamped name.
	    if [[ "$KEEP_TEXTS" == "1" ]]; then
	        timestamp=$(date '+%Y-%m-%d %H%M%S')
	        printf '%s\n' "$transcription" > "$TMP_DIR/$timestamp-original.txt"
	        printf '%s\n' "$cleaned_text" > "$TMP_DIR/$timestamp.txt"
	        mv -- "$AUDIO_FILE" "$TMP_DIR/$timestamp.mp3"
	        rm -f -- "$TRANSCRIPTION_FILE"
	    fi
	fi
No results found