rmtuckerphx · November 20, 2025 20:36
diff --git a/gistfile1.txt b/gistfile1.txt

 import os
 import sys
 import asyncio
 import base64
 import argparse
 import signal
 import threading
 import queue
 from azure.ai.voicelive.models import ServerEventType
 from typing import Union, Optional, TYPE_CHECKING, cast
 from concurrent.futures import ThreadPoolExecutor
 import logging

 # Audio processing imports
 try:
    import pyaudio
 except ImportError:
    print("This sample requires pyaudio. Install with: pip install pyaudio")
    sys.exit(1)

 # Environment variable loading
 try:
    from dotenv import load_dotenv

    load_dotenv()
 except ImportError:
    print("Note: python-dotenv not installed. Using existing environment variables.")

 # Azure VoiceLive SDK imports
 from azure.core.credentials import AzureKeyCredential, TokenCredential
 from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

 from azure.ai.voicelive.aio import connect

 if TYPE_CHECKING:
    # Only needed for type checking; avoids runtime import issues
    from azure.ai.voicelive.aio import VoiceLiveConnection

 from azure.ai.voicelive.models import (
    RequestSession,
    ServerVad,
    AzureStandardVoice,
    Modality,
    AudioFormat,
 )

 # Set up logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)


 class AudioProcessor:
    """
    Handles real-time audio capture and playback for the voice assistant.

    Audio is PCM16, 24kHz, mono, moved in 1024-frame chunks. Captured chunks
    are base64-encoded and appended to the connection's input audio buffer;
    bytes handed to ``queue_audio`` are written to the output stream.

    Threading Architecture:
    - Main thread: Event loop and UI
    - Capture thread: PyAudio input stream reading
    - Send thread: Async audio data transmission to VoiceLive
    - Playback thread: PyAudio output stream writing
    """

    def __init__(self, connection):
        """Create the PyAudio instance and the idle capture/playback state.

        Args:
            connection: Object whose ``input_audio_buffer.append(audio=...)``
                coroutine receives the captured, base64-encoded audio.
        """
        self.connection = connection
        self.audio = pyaudio.PyAudio()

        # Audio configuration - PCM16, 24kHz, mono as specified
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 24000
        self.chunk_size = 1024

        # Capture and playback state
        self.is_capturing = False
        self.is_playing = False
        self.input_stream = None
        self.output_stream = None

        # Audio queues and threading
        self.audio_queue: "queue.Queue[bytes]" = queue.Queue()  # audio waiting to be played
        self.audio_send_queue: "queue.Queue[str]" = queue.Queue()  # base64 audio to send
        self.executor = ThreadPoolExecutor(max_workers=3)
        self.capture_thread: Optional[threading.Thread] = None
        self.playback_thread: Optional[threading.Thread] = None
        self.send_thread: Optional[threading.Thread] = None
        self.loop: Optional[asyncio.AbstractEventLoop] = None  # Store the event loop

        logger.info("AudioProcessor initialized with 24kHz PCM16 mono audio")

    async def start_capture(self):
        """Open the microphone stream and start the capture and send threads.

        Does nothing when capture is already running.

        Raises:
            Exception: Whatever opening the stream or starting the threads
                raises; ``is_capturing`` is reset before re-raising.
        """
        if self.is_capturing:
            return

        # The worker threads schedule coroutines on this loop. We are inside a
        # coroutine here, so the running loop is the one to hand them.
        self.loop = asyncio.get_running_loop()

        self.is_capturing = True

        try:
            self.input_stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                input=True,
                frames_per_buffer=self.chunk_size,
                stream_callback=None,
            )

            self.input_stream.start_stream()

            # Start capture thread
            self.capture_thread = threading.Thread(target=self._capture_audio_thread)
            self.capture_thread.daemon = True
            self.capture_thread.start()

            # Start audio send thread
            self.send_thread = threading.Thread(target=self._send_audio_thread)
            self.send_thread.daemon = True
            self.send_thread.start()

            logger.info("Started audio capture")

        except Exception as e:
            logger.error(f"Failed to start audio capture: {e}")
            self.is_capturing = False
            raise

    def _capture_audio_thread(self):
        """Capture thread body: read chunks and queue them, base64-encoded, for sending."""
        while self.is_capturing and self.input_stream:
            try:
                # Read audio data
                audio_data = self.input_stream.read(self.chunk_size, exception_on_overflow=False)

                if audio_data and self.is_capturing:
                    # Convert to base64 and queue for sending
                    audio_base64 = base64.b64encode(audio_data).decode("utf-8")
                    self.audio_send_queue.put(audio_base64)

            except Exception as e:
                # Errors raised while shutting down are expected; stay quiet then.
                if self.is_capturing:
                    logger.error(f"Error in audio capture: {e}")
                break

    def _send_audio_thread(self):
        """Send thread body: hand queued chunks to the connection on the event loop."""
        while self.is_capturing:
            try:
                # Get audio data from queue (blocking with timeout) so the
                # loop re-checks is_capturing at least every 100 ms.
                audio_base64 = self.audio_send_queue.get(timeout=0.1)

                if audio_base64 and self.is_capturing and self.loop:
                    # Schedule the async send operation in the main event loop
                    future = asyncio.run_coroutine_threadsafe(
                        self.connection.input_audio_buffer.append(audio=audio_base64), self.loop
                    )
                    # Don't wait for completion (that would stall sending), but
                    # report a failed send instead of dropping it with the future.
                    future.add_done_callback(self._log_send_failure)

            except queue.Empty:
                continue
            except Exception as e:
                if self.is_capturing:
                    logger.error(f"Error sending audio: {e}")
                break

    def _log_send_failure(self, future):
        """Log the exception carried by a finished send future, if any."""
        if future.cancelled():
            return
        error = future.exception()
        if error is not None and self.is_capturing:
            logger.error(f"Error sending audio: {error}")

    async def stop_capture(self):
        """Stop the capture and send threads, close the microphone stream, drop unsent audio.

        Does nothing when capture is not running.
        """
        if not self.is_capturing:
            return

        self.is_capturing = False

        # Give both workers the chance to see the cleared flag and exit before
        # the stream goes away, so read() is not racing with close().
        if self.capture_thread:
            self.capture_thread.join(timeout=1.0)

        if self.send_thread:
            self.send_thread.join(timeout=1.0)

        if self.input_stream:
            self.input_stream.stop_stream()
            self.input_stream.close()
            self.input_stream = None

        # Clear the send queue
        while not self.audio_send_queue.empty():
            try:
                self.audio_send_queue.get_nowait()
            except queue.Empty:
                break

        logger.info("Stopped audio capture")

    async def start_playback(self):
        """Open the output stream and start the playback thread.

        Does nothing when playback is already running.

        Raises:
            Exception: Whatever opening the stream or starting the thread
                raises; ``is_playing`` is reset before re-raising.
        """
        if self.is_playing:
            return

        self.is_playing = True

        try:
            self.output_stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                output=True,
                frames_per_buffer=self.chunk_size,
            )

            # Start playback thread
            self.playback_thread = threading.Thread(target=self._playback_audio_thread)
            self.playback_thread.daemon = True
            self.playback_thread.start()

            logger.info("Audio playback system ready")

        except Exception as e:
            logger.error(f"Failed to initialize audio playback: {e}")
            self.is_playing = False
            raise

    def _playback_audio_thread(self):
        """Playback thread body: write queued audio to the output stream."""
        while self.is_playing:
            try:
                # Get audio data from queue (blocking with timeout) so the
                # loop re-checks is_playing at least every 100 ms.
                audio_data = self.audio_queue.get(timeout=0.1)

                if audio_data and self.output_stream and self.is_playing:
                    self.output_stream.write(audio_data)

            except queue.Empty:
                continue
            except Exception as e:
                if self.is_playing:
                    logger.error(f"Error in audio playback: {e}")
                break

    async def queue_audio(self, audio_data: bytes):
        """Queue audio data for playback; ignored while playback is stopped."""
        if self.is_playing:
            self.audio_queue.put(audio_data)

    async def stop_playback(self):
        """Stop the playback thread, discard queued audio and close the output stream.

        Does nothing when playback is not running.
        """
        if not self.is_playing:
            return

        self.is_playing = False

        # Clear the queue
        while not self.audio_queue.empty():
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                break

        # Wait for the worker to leave its loop before closing the stream, so
        # write() is not racing with close().
        if self.playback_thread:
            self.playback_thread.join(timeout=1.0)

        if self.output_stream:
            self.output_stream.stop_stream()
            self.output_stream.close()
            self.output_stream = None

        logger.info("Stopped audio playback")

    async def cleanup(self):
        """Stop capture and playback, then release PyAudio and the executor."""
        await self.stop_capture()
        await self.stop_playback()

        if self.audio:
            self.audio.terminate()

        self.executor.shutdown(wait=True)
        logger.info("Audio processor cleaned up")


 class BasicVoiceAssistant:
    """Basic voice assistant implementing the VoiceLive SDK patterns.

    Connects to a VoiceLive agent, configures the session for audio, and
    reacts to server events by starting/stopping microphone capture and
    speaker playback through an ``AudioProcessor``.
    """

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, TokenCredential],
        model: str,
        voice: str,
        instructions: str,
        agent_id: str,
        foundry_project_name: str,
        agent_access_token: str,
    ):
        """Store the connection settings; nothing is opened until ``start()``.

        Args:
            endpoint: VoiceLive endpoint passed to ``connect``.
            credential: API-key or token credential passed to ``connect``.
            model: Model name; stored on the instance.
            voice: Voice name used when building the session configuration.
            instructions: System instructions sent with the session configuration.
            agent_id: Sent as the ``agent-id`` query parameter.
            foundry_project_name: Sent as the ``agent-project-name`` query parameter.
            agent_access_token: Sent as the ``agent-access-token`` query parameter.
        """
        self.endpoint = endpoint
        self.credential = credential
        self.model = model
        # Plain assignments: a trailing comma here would store 1-tuples and
        # corrupt the query parameters built in start().
        self.agent_id = agent_id
        self.foundry_project_name = foundry_project_name
        self.agent_access_token = agent_access_token
        self.voice = voice
        self.instructions = instructions
        self.connection: Optional["VoiceLiveConnection"] = None
        self.audio_processor: Optional[AudioProcessor] = None
        self.session_ready = False
        self.conversation_started = False

    async def start(self):
        """Start the voice assistant session and run until the event stream ends.

        A ``KeyboardInterrupt`` is logged and swallowed; any other exception is
        logged and re-raised. Audio resources are cleaned up in every case.
        """
        try:
            logger.info(f"Connecting to VoiceLive API with agent {self.agent_id}")

            # Connect to VoiceLive WebSocket API
            async with connect(
                endpoint=self.endpoint,
                credential=self.credential,
                query={
                    "agent-id": self.agent_id,
                    "agent-project-name": self.foundry_project_name,
                    "agent-access-token": self.agent_access_token
                },
                connection_options={
                    "max_msg_size": 10 * 1024 * 1024,
                    "heartbeat": 20,
                    "timeout": 20,
                },
            ) as connection:
                conn = connection
                self.connection = conn

                # Initialize audio processor
                ap = AudioProcessor(conn)
                self.audio_processor = ap

                # Configure session for voice conversation
                await self._setup_session()

                # Start audio systems (capture starts later, on SESSION_UPDATED)
                await ap.start_playback()

                logger.info("Voice assistant ready! Start speaking...")
                print("\n" + "=" * 60)
                print("🎤 VOICE ASSISTANT READY")
                print("Start speaking to begin conversation")
                print("Press Ctrl+C to exit")
                print("=" * 60 + "\n")

                # Process events
                await self._process_events()

        except KeyboardInterrupt:
            logger.info("Received interrupt signal, shutting down...")

        except Exception as e:
            logger.error(f"Connection error: {e}")
            raise

        finally:
            # Cleanup runs on the error path too, so streams and PyAudio are
            # released even when the exception is re-raised above.
            if self.audio_processor:
                await self.audio_processor.cleanup()

    async def _setup_session(self):
        """Configure the VoiceLive session for audio conversation."""
        logger.info("Setting up voice conversation session...")

        # Create strongly typed voice configuration
        voice_config: Union[AzureStandardVoice, str]
        if "-" in self.voice:
            # Azure voice: any name containing a hyphen (e.g. "en-US-AvaNeural")
            voice_config = AzureStandardVoice(name=self.voice, type="azure-standard")
        else:
            # OpenAI voice (alloy, echo, fable, onyx, nova, shimmer)
            voice_config = self.voice

        # Create strongly typed turn detection configuration
        turn_detection_config = ServerVad(threshold=0.5, prefix_padding_ms=300, silence_duration_ms=500)

        # Create strongly typed session configuration
        session_config = RequestSession(
            modalities=[Modality.TEXT, Modality.AUDIO],
            instructions=self.instructions,
            voice=voice_config,
            input_audio_format=AudioFormat.PCM16,
            output_audio_format=AudioFormat.PCM16,
            turn_detection=turn_detection_config,
        )

        conn = self.connection
        assert conn is not None, "Connection must be established before setting up session"
        await conn.session.update(session=session_config)

        logger.info("Session configuration sent")

    async def _process_events(self):
        """Dispatch every event from the connection to ``_handle_event``.

        Raises:
            Exception: Any error from the event stream or a handler, after logging it.
        """
        try:
            conn = self.connection
            assert conn is not None, "Connection must be established before processing events"
            async for event in conn:
                await self._handle_event(event)

        except KeyboardInterrupt:
            logger.info("Event processing interrupted")
        except Exception as e:
            logger.error(f"Error processing events: {e}")
            raise

    async def _handle_event(self, event):
        """Handle one server event, driving capture and playback as needed.

        Args:
            event: Server event; ``event.type`` is compared against ``ServerEventType``.
        """
        logger.debug(f"Received event: {event.type}")
        ap = self.audio_processor
        conn = self.connection
        assert ap is not None, "AudioProcessor must be initialized"
        assert conn is not None, "Connection must be established"

        if event.type == ServerEventType.SESSION_UPDATED:
            logger.info(f"Session ready: {event.session.id}")
            self.session_ready = True

            # Start audio capture once session is ready
            await ap.start_capture()

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
            logger.info("🎤 User started speaking - stopping playback")
            print("🎤 Listening...")

            # Stop current assistant audio playback (interruption handling)
            await ap.stop_playback()

            # Cancel any ongoing response; failure is only logged at debug level
            try:
                await conn.response.cancel()
            except Exception as e:
                logger.debug(f"No response to cancel: {e}")

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
            logger.info("🎤 User stopped speaking")
            print("🤔 Processing...")

            # Restart playback system for response
            await ap.start_playback()

        elif event.type == ServerEventType.RESPONSE_CREATED:
            logger.info("🤖 Assistant response created")

        elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
            # Stream audio response to speakers
            logger.debug("Received audio delta")
            await ap.queue_audio(event.delta)

        elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
            logger.info("🤖 Assistant finished speaking")
            print("🎤 Ready for next input...")

        elif event.type == ServerEventType.RESPONSE_DONE:
            logger.info("✅ Response complete")

        elif event.type == ServerEventType.ERROR:
            logger.error(f"❌ VoiceLive error: {event.error.message}")
            print(f"Error: {event.error.message}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED:
            logger.debug(f"Conversation item created: {event.item.id}")

        else:
            logger.debug(f"Unhandled event type: {event.type}")


 def parse_arguments(argv: Optional[list] = None):
    """Parse command line arguments.

    Option defaults are read from environment variables at call time.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        argparse.Namespace with ``api_key``, ``endpoint``, ``model``, ``voice``,
        ``instructions``, ``use_token_credential`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        description="Basic Voice Assistant using Azure VoiceLive SDK",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--api-key",
        help="Azure VoiceLive API key. If not provided, will use AZURE_VOICELIVE_API_KEY environment variable.",
        type=str,
        default=os.environ.get("AZURE_VOICELIVE_API_KEY"),
    )

    parser.add_argument(
        "--endpoint",
        help="Azure VoiceLive endpoint",
        type=str,
        default=os.environ.get("AZURE_VOICELIVE_ENDPOINT", "wss://api.voicelive.com/v1"),
    )

    parser.add_argument(
        "--model",
        help="VoiceLive model to use",
        type=str,
        default=os.environ.get("VOICELIVE_MODEL", "gpt-4o-realtime-preview"),
    )

    parser.add_argument(
        "--voice",
        help="Voice to use for the assistant",
        type=str,
        default=os.environ.get("VOICELIVE_VOICE", "en-US-AvaNeural"),
        choices=[
            "alloy",
            "echo",
            "fable",
            "onyx",
            "nova",
            "shimmer",
            "en-US-AvaNeural",
            "en-US-JennyNeural",
            "en-US-GuyNeural",
        ],
    )

    parser.add_argument(
        "--instructions",
        help="System instructions for the AI assistant",
        type=str,
        default=os.environ.get(
            "VOICELIVE_INSTRUCTIONS",
            "You are a helpful AI assistant. Respond naturally and conversationally. "
            "Keep your responses concise but engaging.",
        ),
    )

    parser.add_argument(
        "--use-token-credential", help="Use Azure token credential instead of API key", action="store_true"
    )

    parser.add_argument("--verbose", help="Enable verbose logging", action="store_true")

    return parser.parse_args(argv)


 async def main():
    """Parse arguments, build the credential and run the assistant until shutdown.

    Exits the process with status 1 when no authentication is configured or
    when the assistant fails with an unexpected error.
    """
    args = parse_arguments()

    # Set logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Validate credentials
    if not args.api_key and not args.use_token_credential:
        print("❌ Error: No authentication provided")
        print("Please provide an API key using --api-key or set AZURE_VOICELIVE_API_KEY environment variable,")
        print("or use --use-token-credential for Azure authentication.")
        sys.exit(1)

    try:
        # Create client with appropriate credential
        credential: Union[AzureKeyCredential, TokenCredential]
        if args.use_token_credential:
            credential = InteractiveBrowserCredential()  # or DefaultAzureCredential() if needed
            logger.info("Using Azure token credential")
        else:
            credential = AzureKeyCredential(args.api_key)
            logger.info("Using API key credential")

        # Create and start voice assistant. `model` is a required constructor
        # argument; omitting it raises TypeError before anything starts.
        assistant = BasicVoiceAssistant(
            endpoint=args.endpoint,
            credential=credential,
            model=args.model,
            voice=args.voice,
            instructions=args.instructions,
            agent_id="<agent_id>",
            foundry_project_name="<foundry_project_name>",
            agent_access_token="<agent_access_token>",
        )

        # Setup signal handlers for graceful shutdown: both signals are turned
        # into KeyboardInterrupt so one except clause handles them.
        def signal_handler(sig, frame):
            logger.info("Received shutdown signal")
            raise KeyboardInterrupt()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

        # Start the assistant
        await assistant.start()

    except KeyboardInterrupt:
        print("\n👋 Voice assistant shut down. Goodbye!")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        print(f"❌ Error: {e}")
        sys.exit(1)


 if __name__ == "__main__":
    # Check for required dependencies
    dependencies = {
        "pyaudio": "Audio processing",
        "azure.ai.voicelive": "Azure VoiceLive SDK",
        "azure.core": "Azure Core libraries",
    }

    missing_deps = []
    for dep, description in dependencies.items():
        try:
            __import__(dep.replace("-", "_"))
        except ImportError:
            missing_deps.append(f"{dep} ({description})")

    if missing_deps:
        print("❌ Missing required dependencies:")
        for dep in missing_deps:
            print(f"  - {dep}")
        print("\nInstall with: pip install azure-ai-voicelive pyaudio python-dotenv")
        sys.exit(1)

    # Check audio system
    try:
        p = pyaudio.PyAudio()
        try:
            # Query each device once; both checks below reuse the result.
            device_infos = [p.get_device_info_by_index(i) for i in range(p.get_device_count())]
        finally:
            # Release PyAudio even when enumeration fails.
            p.terminate()

        # Check for input devices
        input_devices = [
            i
            for i, info in enumerate(device_infos)
            if cast(Union[int, float], info.get("maxInputChannels", 0) or 0) > 0
        ]
        # Check for output devices
        output_devices = [
            i
            for i, info in enumerate(device_infos)
            if cast(Union[int, float], info.get("maxOutputChannels", 0) or 0) > 0
        ]

        if not input_devices:
            print("❌ No audio input devices found. Please check your microphone.")
            sys.exit(1)
        if not output_devices:
            print("❌ No audio output devices found. Please check your speakers.")
            sys.exit(1)

    except Exception as e:
        print(f"❌ Audio system check failed: {e}")
        sys.exit(1)

    print("🎙️  Basic Voice Assistant with Azure VoiceLive SDK")
    print("=" * 50)

    # Run the assistant
    asyncio.run(main())

	import os
	import sys
	import asyncio
	import base64
	import argparse
	import signal
	import threading
	import queue
	from azure.ai.voicelive.models import ServerEventType
	from typing import Union, Optional, TYPE_CHECKING, cast
	from concurrent.futures import ThreadPoolExecutor
	import logging

	# Audio processing imports
	try:
	import pyaudio
	except ImportError:
	print("This sample requires pyaudio. Install with: pip install pyaudio")
	sys.exit(1)

	# Environment variable loading
	try:
	from dotenv import load_dotenv

	load_dotenv()
	except ImportError:
	print("Note: python-dotenv not installed. Using existing environment variables.")

	# Azure VoiceLive SDK imports
	from azure.core.credentials import AzureKeyCredential, TokenCredential
	from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

	from azure.ai.voicelive.aio import connect

	if TYPE_CHECKING:
	# Only needed for type checking; avoids runtime import issues
	from azure.ai.voicelive.aio import VoiceLiveConnection

	from azure.ai.voicelive.models import (
	RequestSession,
	ServerVad,
	AzureStandardVoice,
	Modality,
	AudioFormat,
	)

	# Set up logging
	logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
	logger = logging.getLogger(__name__)


	class AudioProcessor:
	    """
	    Handles real-time audio capture and playback for the voice assistant.

	    Audio is PCM16, 24kHz, mono, moved in 1024-frame chunks. Captured chunks
	    are base64-encoded and appended to the connection's input audio buffer;
	    bytes handed to ``queue_audio`` are written to the output stream.

	    Threading Architecture:
	    - Main thread: Event loop and UI
	    - Capture thread: PyAudio input stream reading
	    - Send thread: Async audio data transmission to VoiceLive
	    - Playback thread: PyAudio output stream writing
	    """

	    def __init__(self, connection):
	        """Create the PyAudio instance and the idle capture/playback state.

	        Args:
	            connection: Object whose ``input_audio_buffer.append(audio=...)``
	                coroutine receives the captured, base64-encoded audio.
	        """
	        self.connection = connection
	        self.audio = pyaudio.PyAudio()

	        # Audio configuration - PCM16, 24kHz, mono as specified
	        self.format = pyaudio.paInt16
	        self.channels = 1
	        self.rate = 24000
	        self.chunk_size = 1024

	        # Capture and playback state
	        self.is_capturing = False
	        self.is_playing = False
	        self.input_stream = None
	        self.output_stream = None

	        # Audio queues and threading
	        self.audio_queue: "queue.Queue[bytes]" = queue.Queue()  # audio waiting to be played
	        self.audio_send_queue: "queue.Queue[str]" = queue.Queue()  # base64 audio to send
	        self.executor = ThreadPoolExecutor(max_workers=3)
	        self.capture_thread: Optional[threading.Thread] = None
	        self.playback_thread: Optional[threading.Thread] = None
	        self.send_thread: Optional[threading.Thread] = None
	        self.loop: Optional[asyncio.AbstractEventLoop] = None  # Store the event loop

	        logger.info("AudioProcessor initialized with 24kHz PCM16 mono audio")

	    async def start_capture(self):
	        """Open the microphone stream and start the capture and send threads.

	        Does nothing when capture is already running.

	        Raises:
	            Exception: Whatever opening the stream or starting the threads
	                raises; ``is_capturing`` is reset before re-raising.
	        """
	        if self.is_capturing:
	            return

	        # The worker threads schedule coroutines on this loop. We are inside a
	        # coroutine here, so the running loop is the one to hand them.
	        self.loop = asyncio.get_running_loop()

	        self.is_capturing = True

	        try:
	            self.input_stream = self.audio.open(
	                format=self.format,
	                channels=self.channels,
	                rate=self.rate,
	                input=True,
	                frames_per_buffer=self.chunk_size,
	                stream_callback=None,
	            )

	            self.input_stream.start_stream()

	            # Start capture thread
	            self.capture_thread = threading.Thread(target=self._capture_audio_thread)
	            self.capture_thread.daemon = True
	            self.capture_thread.start()

	            # Start audio send thread
	            self.send_thread = threading.Thread(target=self._send_audio_thread)
	            self.send_thread.daemon = True
	            self.send_thread.start()

	            logger.info("Started audio capture")

	        except Exception as e:
	            logger.error(f"Failed to start audio capture: {e}")
	            self.is_capturing = False
	            raise

	    def _capture_audio_thread(self):
	        """Capture thread body: read chunks and queue them, base64-encoded, for sending."""
	        while self.is_capturing and self.input_stream:
	            try:
	                # Read audio data
	                audio_data = self.input_stream.read(self.chunk_size, exception_on_overflow=False)

	                if audio_data and self.is_capturing:
	                    # Convert to base64 and queue for sending
	                    audio_base64 = base64.b64encode(audio_data).decode("utf-8")
	                    self.audio_send_queue.put(audio_base64)

	            except Exception as e:
	                # Errors raised while shutting down are expected; stay quiet then.
	                if self.is_capturing:
	                    logger.error(f"Error in audio capture: {e}")
	                break

	    def _send_audio_thread(self):
	        """Send thread body: hand queued chunks to the connection on the event loop."""
	        while self.is_capturing:
	            try:
	                # Get audio data from queue (blocking with timeout) so the
	                # loop re-checks is_capturing at least every 100 ms.
	                audio_base64 = self.audio_send_queue.get(timeout=0.1)

	                if audio_base64 and self.is_capturing and self.loop:
	                    # Schedule the async send operation in the main event loop
	                    future = asyncio.run_coroutine_threadsafe(
	                        self.connection.input_audio_buffer.append(audio=audio_base64), self.loop
	                    )
	                    # Don't wait for completion (that would stall sending), but
	                    # report a failed send instead of dropping it with the future.
	                    future.add_done_callback(self._log_send_failure)

	            except queue.Empty:
	                continue
	            except Exception as e:
	                if self.is_capturing:
	                    logger.error(f"Error sending audio: {e}")
	                break

	    def _log_send_failure(self, future):
	        """Log the exception carried by a finished send future, if any."""
	        if future.cancelled():
	            return
	        error = future.exception()
	        if error is not None and self.is_capturing:
	            logger.error(f"Error sending audio: {error}")

	    async def stop_capture(self):
	        """Stop the capture and send threads, close the microphone stream, drop unsent audio.

	        Does nothing when capture is not running.
	        """
	        if not self.is_capturing:
	            return

	        self.is_capturing = False

	        # Give both workers the chance to see the cleared flag and exit before
	        # the stream goes away, so read() is not racing with close().
	        if self.capture_thread:
	            self.capture_thread.join(timeout=1.0)

	        if self.send_thread:
	            self.send_thread.join(timeout=1.0)

	        if self.input_stream:
	            self.input_stream.stop_stream()
	            self.input_stream.close()
	            self.input_stream = None

	        # Clear the send queue
	        while not self.audio_send_queue.empty():
	            try:
	                self.audio_send_queue.get_nowait()
	            except queue.Empty:
	                break

	        logger.info("Stopped audio capture")

	    async def start_playback(self):
	        """Open the output stream and start the playback thread.

	        Does nothing when playback is already running.

	        Raises:
	            Exception: Whatever opening the stream or starting the thread
	                raises; ``is_playing`` is reset before re-raising.
	        """
	        if self.is_playing:
	            return

	        self.is_playing = True

	        try:
	            self.output_stream = self.audio.open(
	                format=self.format,
	                channels=self.channels,
	                rate=self.rate,
	                output=True,
	                frames_per_buffer=self.chunk_size,
	            )

	            # Start playback thread
	            self.playback_thread = threading.Thread(target=self._playback_audio_thread)
	            self.playback_thread.daemon = True
	            self.playback_thread.start()

	            logger.info("Audio playback system ready")

	        except Exception as e:
	            logger.error(f"Failed to initialize audio playback: {e}")
	            self.is_playing = False
	            raise

	    def _playback_audio_thread(self):
	        """Playback thread body: write queued audio to the output stream."""
	        while self.is_playing:
	            try:
	                # Get audio data from queue (blocking with timeout) so the
	                # loop re-checks is_playing at least every 100 ms.
	                audio_data = self.audio_queue.get(timeout=0.1)

	                if audio_data and self.output_stream and self.is_playing:
	                    self.output_stream.write(audio_data)

	            except queue.Empty:
	                continue
	            except Exception as e:
	                if self.is_playing:
	                    logger.error(f"Error in audio playback: {e}")
	                break

	    async def queue_audio(self, audio_data: bytes):
	        """Queue audio data for playback; ignored while playback is stopped."""
	        if self.is_playing:
	            self.audio_queue.put(audio_data)

	    async def stop_playback(self):
	        """Stop the playback thread, discard queued audio and close the output stream.

	        Does nothing when playback is not running.
	        """
	        if not self.is_playing:
	            return

	        self.is_playing = False

	        # Clear the queue
	        while not self.audio_queue.empty():
	            try:
	                self.audio_queue.get_nowait()
	            except queue.Empty:
	                break

	        # Wait for the worker to leave its loop before closing the stream, so
	        # write() is not racing with close().
	        if self.playback_thread:
	            self.playback_thread.join(timeout=1.0)

	        if self.output_stream:
	            self.output_stream.stop_stream()
	            self.output_stream.close()
	            self.output_stream = None

	        logger.info("Stopped audio playback")

	    async def cleanup(self):
	        """Stop capture and playback, then release PyAudio and the executor."""
	        await self.stop_capture()
	        await self.stop_playback()

	        if self.audio:
	            self.audio.terminate()

	        self.executor.shutdown(wait=True)
	        logger.info("Audio processor cleaned up")


	class BasicVoiceAssistant:
	"""Basic voice assistant implementing the VoiceLive SDK patterns."""

	def __init__(
	self,
	endpoint: str,
	credential: Union[AzureKeyCredential, TokenCredential],
	model: str,
	voice: str,
	instructions: str,
	agent_id: str,
	foundry_project_name: str,
	agent_access_token: str,
	):

	self.endpoint = endpoint
	self.credential = credential
	self.model = model
	self.agent_id = agent_id,
	self.foundry_project_name = foundry_project_name,
	self.agent_access_token = agent_access_token,
	self.voice = voice
	self.instructions = instructions
	self.connection: Optional["VoiceLiveConnection"] = None
	self.audio_processor: Optional[AudioProcessor] = None
	self.session_ready = False
	self.conversation_started = False

	async def start(self):
	"""Start the voice assistant session."""
	try:
	logger.info(f"Connecting to VoiceLive API with agent {self.agent_id}")

	# Connect to VoiceLive WebSocket API
	async with connect(
	endpoint=self.endpoint,
	credential=self.credential,
	query={
	"agent-id": self.agent_id,
	"agent-project-name": self.foundry_project_name,
	"agent-access-token": self.agent_access_token
	},
	connection_options={
	"max_msg_size": 10 * 1024 * 1024,
	"heartbeat": 20,
	"timeout": 20,
	},
	) as connection:
	conn = connection
	self.connection = conn

	# Initialize audio processor
	ap = AudioProcessor(conn)
	self.audio_processor = ap

	# Configure session for voice conversation
	await self._setup_session()

	# Start audio systems
	await ap.start_playback()

	logger.info("Voice assistant ready! Start speaking...")
	print("
	" + "=" * 60)
	print("🎤 VOICE ASSISTANT READY")
	print("Start speaking to begin conversation")
	print("Press Ctrl+C to exit")
	print("=" * 60 + "
	")

	# Process events
	await self._process_events()

	except KeyboardInterrupt:
	logger.info("Received interrupt signal, shutting down...")

	except Exception as e:
	logger.error(f"Connection error: {e}")
	raise

	# Cleanup
	if self.audio_processor:
	await self.audio_processor.cleanup()

	async def _setup_session(self):
	"""Configure the VoiceLive session for audio conversation."""
	logger.info("Setting up voice conversation session...")

	# Create strongly typed voice configuration
	voice_config: Union[AzureStandardVoice, str]
	if self.voice.startswith("en-US-") or self.voice.startswith("en-CA-") or "-" in self.voice:
	# Azure voice
	voice_config = AzureStandardVoice(name=self.voice, type="azure-standard")
	else:
	# OpenAI voice (alloy, echo, fable, onyx, nova, shimmer)
	voice_config = self.voice

	# Create strongly typed turn detection configuration
	turn_detection_config = ServerVad(threshold=0.5, prefix_padding_ms=300, silence_duration_ms=500)

	# Create strongly typed session configuration
	session_config = RequestSession(
	modalities=[Modality.TEXT, Modality.AUDIO],
	instructions=self.instructions,
	voice=voice_config,
	input_audio_format=AudioFormat.PCM16,
	output_audio_format=AudioFormat.PCM16,
	turn_detection=turn_detection_config,
	)

	conn = self.connection
	assert conn is not None, "Connection must be established before setting up session"
	await conn.session.update(session=session_config)

	logger.info("Session configuration sent")

	async def _process_events(self):
	    """Iterate over the connection and hand every event to ``_handle_event``.

	    A ``KeyboardInterrupt`` raised while iterating is logged and swallowed.
	    Any other exception (including the ``AssertionError`` raised when
	    ``self.connection`` is ``None``) is logged and re-raised to the caller.
	    """
	    try:
	        connection = self.connection
	        assert connection is not None, "Connection must be established before processing events"
	        async for server_event in connection:
	            await self._handle_event(server_event)
	    except KeyboardInterrupt:
	        logger.info("Event processing interrupted")
	    except Exception as exc:
	        # Log here for context, then let the caller decide what to do
	        logger.error(f"Error processing events: {exc}")
	        raise

	async def _handle_event(self, event):
	    """React to a single server event received from VoiceLive.

	    Reactions by ``event.type``:
	    - SESSION_UPDATED: set ``self.session_ready`` and start audio capture.
	    - INPUT_AUDIO_BUFFER_SPEECH_STARTED: stop playback and try to cancel
	      the current response (a failed cancel is only logged at debug level).
	    - INPUT_AUDIO_BUFFER_SPEECH_STOPPED: start playback again.
	    - RESPONSE_AUDIO_DELTA: queue ``event.delta`` for playback.
	    - RESPONSE_CREATED, RESPONSE_AUDIO_DONE, RESPONSE_DONE, ERROR,
	      CONVERSATION_ITEM_CREATED: log (and in some cases print) only.
	    - Anything else: logged at debug level as unhandled.

	    Raises:
	        AssertionError: If ``self.audio_processor`` or ``self.connection``
	            is ``None``.
	    """
	    logger.debug(f"Received event: {event.type}")
	    processor = self.audio_processor
	    connection = self.connection
	    assert processor is not None, "AudioProcessor must be initialized"
	    assert connection is not None, "Connection must be established"

	    kind = event.type

	    if kind == ServerEventType.SESSION_UPDATED:
	        logger.info(f"Session ready: {event.session.id}")
	        self.session_ready = True
	        # Capture is only started once the session update is acknowledged
	        await processor.start_capture()
	        return

	    if kind == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
	        logger.info("🎤 User started speaking - stopping playback")
	        print("🎤 Listening...")
	        # Interruption handling: silence the assistant first ...
	        await processor.stop_playback()
	        # ... then ask the service to drop whatever response is in flight
	        try:
	            await connection.response.cancel()
	        except Exception as cancel_error:
	            logger.debug(f"No response to cancel: {cancel_error}")
	        return

	    if kind == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
	        logger.info("🎤 User stopped speaking")
	        print("🤔 Processing...")
	        # Playback was stopped on speech start; bring it back for the reply
	        await processor.start_playback()
	        return

	    if kind == ServerEventType.RESPONSE_CREATED:
	        logger.info("🤖 Assistant response created")
	        return

	    if kind == ServerEventType.RESPONSE_AUDIO_DELTA:
	        # Forward the audio chunk to the playback queue
	        logger.debug("Received audio delta")
	        await processor.queue_audio(event.delta)
	        return

	    if kind == ServerEventType.RESPONSE_AUDIO_DONE:
	        logger.info("🤖 Assistant finished speaking")
	        print("🎤 Ready for next input...")
	        return

	    if kind == ServerEventType.RESPONSE_DONE:
	        logger.info("✅ Response complete")
	        return

	    if kind == ServerEventType.ERROR:
	        logger.error(f"❌ VoiceLive error: {event.error.message}")
	        print(f"Error: {event.error.message}")
	        return

	    if kind == ServerEventType.CONVERSATION_ITEM_CREATED:
	        logger.debug(f"Conversation item created: {event.item.id}")
	        return

	    logger.debug(f"Unhandled event type: {kind}")


	def parse_arguments(argv: Optional[list] = None) -> argparse.Namespace:
	    """Parse command line arguments.

	    Args:
	        argv: Optional list of argument strings to parse. When ``None``
	            (the default) argparse reads the process command line, which is
	            the behaviour existing callers rely on. Passing an explicit list
	            allows programmatic use and testing.

	    Returns:
	        The parsed namespace with the attributes ``api_key``, ``endpoint``,
	        ``model``, ``voice``, ``instructions``, ``use_token_credential`` and
	        ``verbose``.

	    Note:
	        Defaults for the string options are read from environment variables
	        at call time (``AZURE_VOICELIVE_API_KEY``, ``AZURE_VOICELIVE_ENDPOINT``,
	        ``VOICELIVE_MODEL``, ``VOICELIVE_VOICE``, ``VOICELIVE_INSTRUCTIONS``).
	    """
	    parser = argparse.ArgumentParser(
	        description="Basic Voice Assistant using Azure VoiceLive SDK",
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	    )

	    parser.add_argument(
	        "--api-key",
	        help="Azure VoiceLive API key. If not provided, will use AZURE_VOICELIVE_API_KEY environment variable.",
	        type=str,
	        default=os.environ.get("AZURE_VOICELIVE_API_KEY"),
	    )

	    parser.add_argument(
	        "--endpoint",
	        help="Azure VoiceLive endpoint",
	        type=str,
	        default=os.environ.get("AZURE_VOICELIVE_ENDPOINT", "wss://api.voicelive.com/v1"),
	    )

	    parser.add_argument(
	        "--model",
	        help="VoiceLive model to use",
	        type=str,
	        default=os.environ.get("VOICELIVE_MODEL", "gpt-4o-realtime-preview"),
	    )

	    parser.add_argument(
	        "--voice",
	        help="Voice to use for the assistant",
	        type=str,
	        default=os.environ.get("VOICELIVE_VOICE", "en-US-AvaNeural"),
	        choices=[
	            "alloy",
	            "echo",
	            "fable",
	            "onyx",
	            "nova",
	            "shimmer",
	            "en-US-AvaNeural",
	            "en-US-JennyNeural",
	            "en-US-GuyNeural",
	        ],
	    )

	    parser.add_argument(
	        "--instructions",
	        help="System instructions for the AI assistant",
	        type=str,
	        default=os.environ.get(
	            "VOICELIVE_INSTRUCTIONS",
	            "You are a helpful AI assistant. Respond naturally and conversationally. "
	            "Keep your responses concise but engaging.",
	        ),
	    )

	    parser.add_argument(
	        "--use-token-credential", help="Use Azure token credential instead of API key", action="store_true"
	    )

	    parser.add_argument("--verbose", help="Enable verbose logging", action="store_true")

	    # argv=None makes argparse fall back to sys.argv[1:]
	    return parser.parse_args(argv)


	async def main():
	    """Main function.

	    Parses the command line, optionally raises the log level, checks that
	    some form of authentication was supplied, builds the credential and the
	    ``BasicVoiceAssistant``, installs SIGINT/SIGTERM handlers and runs the
	    assistant until it stops.

	    Exits the process with status 1 when no authentication is provided or
	    when an unexpected exception escapes the assistant.
	    """
	    args = parse_arguments()

	    # Set logging level
	    if args.verbose:
	        logging.getLogger().setLevel(logging.DEBUG)

	    # Validate credentials
	    if not args.api_key and not args.use_token_credential:
	        print("❌ Error: No authentication provided")
	        print("Please provide an API key using --api-key or set AZURE_VOICELIVE_API_KEY environment variable,")
	        print("or use --use-token-credential for Azure authentication.")
	        sys.exit(1)

	    try:
	        # Create client with appropriate credential
	        credential: Union[AzureKeyCredential, TokenCredential]
	        if args.use_token_credential:
	            credential = InteractiveBrowserCredential()  # or DefaultAzureCredential() if needed
	            logger.info("Using Azure token credential")
	        else:
	            credential = AzureKeyCredential(args.api_key)
	            logger.info("Using API key credential")

	        # Create and start voice assistant
	        # NOTE(review): the three agent/project values below look like
	        # placeholders that must be replaced with real values - confirm.
	        assistant = BasicVoiceAssistant(
	            endpoint=args.endpoint,
	            credential=credential,
	            voice=args.voice,
	            instructions=args.instructions,
	            agent_id="<agent_id>",
	            foundry_project_name="<foundry_project_name>",
	            agent_access_token="<agent_access_token>",
	        )

	        # Setup signal handlers for graceful shutdown: both signals are
	        # converted into a KeyboardInterrupt.
	        def signal_handler(sig, frame):
	            logger.info("Received shutdown signal")
	            raise KeyboardInterrupt()

	        signal.signal(signal.SIGINT, signal_handler)
	        signal.signal(signal.SIGTERM, signal_handler)

	        # Start the assistant
	        await assistant.start()

	    except KeyboardInterrupt:
	        # The literal was split across two physical lines in the source;
	        # restored as an escaped newline so the string is valid again.
	        print("\n👋 Voice assistant shut down. Goodbye!")
	    except Exception as e:
	        logger.error(f"Fatal error: {e}")
	        print(f"❌ Error: {e}")
	        sys.exit(1)


	if __name__ == "__main__":
	    # Script entry point: verify dependencies and audio devices, then run
	    # the assistant. Every failed check prints a message and exits with 1.

	    # Check for required dependencies
	    dependencies = {
	        "pyaudio": "Audio processing",
	        "azure.ai.voicelive": "Azure VoiceLive SDK",
	        "azure.core": "Azure Core libraries",
	    }

	    missing_deps = []
	    for dep, description in dependencies.items():
	        try:
	            __import__(dep.replace("-", "_"))
	        except ImportError:
	            missing_deps.append(f"{dep} ({description})")

	    if missing_deps:
	        print("❌ Missing required dependencies:")
	        for dep in missing_deps:
	            print(f" - {dep}")
	        # The literal was split across two physical lines in the source;
	        # restored as an escaped newline so the string is valid again.
	        print("\nInstall with: pip install azure-ai-voicelive pyaudio python-dotenv")
	        sys.exit(1)

	    # Check audio system
	    try:
	        p = pyaudio.PyAudio()
	        try:
	            # Check for input devices
	            input_devices = [
	                i
	                for i in range(p.get_device_count())
	                if cast(Union[int, float], p.get_device_info_by_index(i).get("maxInputChannels", 0) or 0) > 0
	            ]
	            # Check for output devices
	            output_devices = [
	                i
	                for i in range(p.get_device_count())
	                if cast(Union[int, float], p.get_device_info_by_index(i).get("maxOutputChannels", 0) or 0) > 0
	            ]
	        finally:
	            # Terminate the PyAudio instance even when a device query raises
	            p.terminate()

	        # sys.exit raises SystemExit, which the `except Exception` below
	        # does not intercept, so these exits are not reported as failures
	        # of the audio check itself.
	        if not input_devices:
	            print("❌ No audio input devices found. Please check your microphone.")
	            sys.exit(1)
	        if not output_devices:
	            print("❌ No audio output devices found. Please check your speakers.")
	            sys.exit(1)

	    except Exception as e:
	        print(f"❌ Audio system check failed: {e}")
	        sys.exit(1)

	    print("🎙️ Basic Voice Assistant with Azure VoiceLive SDK")
	    print("=" * 50)

	    # Run the assistant
	    asyncio.run(main())
No results found