Improve audio capture and LLM integration

- Implement Silero VAD for dynamic audio chunking
- Add support for Ollama and vLLM backends
- Harden extraction prompts for strict JSON output
- Refactor TUI worker to handle proposals asynchronously
2026-05-26 19:51:48 -07:00
parent 60e170e777
commit 58bab75bb5
11 changed files with 290 additions and 78 deletions
@@ -3,6 +3,7 @@ import logging

 import numpy as np
 import sounddevice as sd
+import torch

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -11,16 +12,56 @@ logger = logging.getLogger(__name__)
 class AudioListener:
    """
    Captures audio from the microphone in chunks and puts them into an asyncio queue.
+    Uses Silero VAD for dynamic chunking based on speech detection.
    """

-    def __init__(self, sample_rate=16000, chunk_duration=3, device=None, loop=None):
+    def __init__(
+        self,
+        sample_rate=16000,
+        device=None,
+        loop=None,
+        vad_threshold=0.5,
+        silence_duration=0.5,
+        max_chunk_size=30,
+    ):
        self.sample_rate = sample_rate
-        self.chunk_duration = chunk_duration
        self.device = device
        self.loop = loop
+
+        # VAD Configuration
+        self.vad_threshold = vad_threshold
+        self.silence_duration = silence_duration
+        self.max_chunk_size = max_chunk_size
+
        self.audio_queue = asyncio.Queue()
        self.is_listening = False

+        # Load Silero VAD model
+        try:
+            self.model, utils = torch.hub.load(
+                repo_or_dir="snakers4/silero-vad", model="silero_vad"
+            )
+            self.model.eval()
+            if torch.cuda.is_available():
+                self.model = self.model.cuda()
+            logger.info("Silero VAD model loaded successfully.")
+        except Exception as e:
+            logger.error(f"Failed to load Silero VAD model: {e}")
+            raise
+
+        # VAD state
+        self.is_collecting = False
+        self.silence_samples = 0
+        self.max_silence_samples = int(self.silence_duration * self.sample_rate)
+        self.max_chunk_samples = int(self.max_chunk_size * self.sample_rate)
+
+        # Pre-padding buffer (e.g., 200ms)
+        self.pre_padding_samples = int(0.2 * self.sample_rate)
+        self._ring_buffer = np.zeros(self.pre_padding_samples, dtype=np.float32)
+        self._ring_buffer_idx = 0
+
+        self._collection_buffer = []
+
    def _audio_callback(self, indata, frames, time, status):
        """
        This callback is called by sounddevice for every block of audio captured.
@@ -28,25 +69,77 @@ class AudioListener:
        if status:
            logger.warning(f"SoundDevice status: {status}")

-        # We capture audio in chunks. sounddevice provides blocks.
-        # We append these blocks to a buffer until we reach chunk_duration.
-        self._buffer.append(indata.copy())
+        # Ensure data is float32 and 1D
+        audio_data = indata.flatten().astype(np.float32)

-        # Check if we have enough data for a full chunk
-        current_duration = len(self._buffer) * frames / self.sample_rate
-        if current_duration >= self.chunk_duration:
-            # Concatenate all buffers into one chunk
-            chunk = np.concatenate(self._buffer, axis=0)
-            # Trim to exactly chunk_duration to maintain consistency
-            target_samples = int(self.sample_rate * self.chunk_duration)
-            chunk = chunk[:target_samples]
+        # 1. Update ring buffer for pre-padding
+        # We overwrite the oldest data in the ring buffer
+        num_samples = len(audio_data)
+        for i in range(num_samples):
+            self._ring_buffer[self._ring_buffer_idx] = audio_data[i]
+            self._ring_buffer_idx = (
+                self._ring_buffer_idx + 1
+            ) % self.pre_padding_samples

-            # Flatten to 1D array (samples,) as expected by faster-whisper
-            chunk = chunk.flatten()
+        # 2. Run VAD
+        # Convert to torch tensor
+        tensor_input = torch.from_numpy(audio_data)
+        if torch.cuda.is_available():
+            tensor_input = tensor_input.cuda()

-            # Use call_soon_threadsafe to put the chunk into the asyncio queue from the callback thread
-            self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
-            self._buffer = []
+        with torch.no_grad():
+            # The model expects (batch, samples)
+            # Silero VAD expects frames of 512, 1024, or 1536 for 16kHz
+            # Since we use block_size=512, we are good.
+            probability = self.model(tensor_input.unsqueeze(0), self.sample_rate).item()
+
+        # 3. State-based Chunking Logic
+        if probability > self.vad_threshold:
+            if not self.is_collecting:
+                # Start Detection: Transition to COLLECTING
+                logger.debug("Speech detected. Starting collection.")
+                self.is_collecting = True
+
+                # Pre-padding: Append the ring buffer in the correct order
+                padding = np.roll(self._ring_buffer, -self._ring_buffer_idx)
+                self._collection_buffer.append(padding)
+
+            # Reset silence counter
+            self.silence_samples = 0
+            self._collection_buffer.append(audio_data)
+
+        elif self.is_collecting:
+            # We are in COLLECTING state but current frame is silence
+            self._collection_buffer.append(audio_data)
+            self.silence_samples += num_samples
+
+            # End Detection: Silence lasted longer than threshold
+            if self.silence_samples >= self.max_silence_samples:
+                logger.debug("Silence detected. Flushing chunk.")
+                self._flush_buffer()
+            # Max Chunk Size: Force flush
+            elif sum(len(b) for b in self._collection_buffer) >= self.max_chunk_samples:
+                logger.debug("Max chunk size reached. Force flushing.")
+                self._flush_buffer()
+
+        else:
+            # IDLE state, just waiting for speech
+            pass
+
+    def _flush_buffer(self):
+        """
+        Concatenates the collection buffer and puts it into the asyncio queue.
+        """
+        if not self._collection_buffer:
+            return
+
+        chunk = np.concatenate(self._collection_buffer).flatten()
+        self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
+
+        # Reset state
+        self._collection_buffer = []
+        self.is_collecting = False
+        self.silence_samples = 0

    def start(self):
        """
@@ -56,11 +149,11 @@ class AudioListener:
            raise RuntimeError("Event loop must be provided to AudioListener")

        self.is_listening = True
-        self._buffer = []
+        self._collection_buffer = []

-        # Define the block size for the callback
-        # We'll use a smaller block size (e.g. 0.1s) to keep the callback responsive
-        block_size = int(self.sample_rate * 0.1)
+        # Define the block size for the callback.
+        # Silero VAD v4 recommends 512 samples for 16kHz.
+        block_size = 512

        try:
            self.stream = sd.InputStream(
@@ -71,7 +164,7 @@ class AudioListener:
                callback=self._audio_callback,
            )
            self.stream.start()
-            logger.info("Audio listener started.")
+            logger.info("Audio listener started with VAD-based chunking.")
        except Exception as e:
            logger.error(f"Failed to start audio listener: {e}")
            self.is_listening = False