Improve audio capture and LLM integration

- Implement Silero VAD for dynamic audio chunking
- Add support for Ollama and vLLM backends
- Harden extraction prompts for strict JSON output
- Refactor TUI worker to handle proposals asynchronously
This commit is contained in:
2026-05-26 19:51:48 -07:00
parent 60e170e777
commit 58bab75bb5
11 changed files with 290 additions and 78 deletions
Binary file not shown.
+116 -23
View File
@@ -3,6 +3,7 @@ import logging
import numpy as np
import sounddevice as sd
import torch
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -11,16 +12,56 @@ logger = logging.getLogger(__name__)
class AudioListener:
"""
Captures audio from the microphone in chunks and puts them into an asyncio queue.
Uses Silero VAD for dynamic chunking based on speech detection.
"""
def __init__(self, sample_rate=16000, chunk_duration=3, device=None, loop=None):
def __init__(
self,
sample_rate=16000,
device=None,
loop=None,
vad_threshold=0.5,
silence_duration=0.5,
max_chunk_size=30,
):
self.sample_rate = sample_rate
self.chunk_duration = chunk_duration
self.device = device
self.loop = loop
# VAD Configuration
self.vad_threshold = vad_threshold
self.silence_duration = silence_duration
self.max_chunk_size = max_chunk_size
self.audio_queue = asyncio.Queue()
self.is_listening = False
# Load Silero VAD model
try:
self.model, utils = torch.hub.load(
repo_or_dir="snakers4/silero-vad", model="silero_vad"
)
self.model.eval()
if torch.cuda.is_available():
self.model = self.model.cuda()
logger.info("Silero VAD model loaded successfully.")
except Exception as e:
logger.error(f"Failed to load Silero VAD model: {e}")
raise
# VAD state
self.is_collecting = False
self.silence_samples = 0
self.max_silence_samples = int(self.silence_duration * self.sample_rate)
self.max_chunk_samples = int(self.max_chunk_size * self.sample_rate)
# Pre-padding ring buffer: holds the most recent 200 ms of audio (0.2 s * sample_rate samples)
self.pre_padding_samples = int(0.2 * self.sample_rate)
self._ring_buffer = np.zeros(self.pre_padding_samples, dtype=np.float32)
self._ring_buffer_idx = 0
self._collection_buffer = []
def _audio_callback(self, indata, frames, time, status):
"""
This callback is called by sounddevice for every block of audio captured.
@@ -28,25 +69,77 @@ class AudioListener:
if status:
logger.warning(f"SoundDevice status: {status}")
# We capture audio in chunks. sounddevice provides blocks.
# We append these blocks to a buffer until we reach chunk_duration.
self._buffer.append(indata.copy())
# Ensure data is float32 and 1D
audio_data = indata.flatten().astype(np.float32)
# Check if we have enough data for a full chunk
current_duration = len(self._buffer) * frames / self.sample_rate
if current_duration >= self.chunk_duration:
# Concatenate all buffers into one chunk
chunk = np.concatenate(self._buffer, axis=0)
# Trim to exactly chunk_duration to maintain consistency
target_samples = int(self.sample_rate * self.chunk_duration)
chunk = chunk[:target_samples]
# 1. Update ring buffer for pre-padding
# We overwrite the oldest data in the ring buffer
num_samples = len(audio_data)
for i in range(num_samples):
self._ring_buffer[self._ring_buffer_idx] = audio_data[i]
self._ring_buffer_idx = (
self._ring_buffer_idx + 1
) % self.pre_padding_samples
# Flatten to 1D array (samples,) as expected by faster-whisper
chunk = chunk.flatten()
# 2. Run VAD
# Convert to torch tensor
tensor_input = torch.from_numpy(audio_data)
if torch.cuda.is_available():
tensor_input = tensor_input.cuda()
# Use call_soon_threadsafe to put the chunk into the asyncio queue from the callback thread
self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
self._buffer = []
with torch.no_grad():
# The model expects (batch, samples)
# Silero VAD expects frames of 512, 1024, or 1536 for 16kHz
# Since we use block_size=512, we are good.
probability = self.model(tensor_input.unsqueeze(0), self.sample_rate).item()
# 3. State-based Chunking Logic
if probability > self.vad_threshold:
if not self.is_collecting:
# Start Detection: Transition to COLLECTING
logger.debug("Speech detected. Starting collection.")
self.is_collecting = True
# Pre-padding: Append the ring buffer in the correct order
padding = np.roll(self._ring_buffer, -self._ring_buffer_idx)
self._collection_buffer.append(padding)
# Reset silence counter
self.silence_samples = 0
self._collection_buffer.append(audio_data)
elif self.is_collecting:
# We are in COLLECTING state but current frame is silence
self._collection_buffer.append(audio_data)
self.silence_samples += num_samples
# End Detection: Silence lasted longer than threshold
if self.silence_samples >= self.max_silence_samples:
logger.debug("Silence detected. Flushing chunk.")
self._flush_buffer()
# Max Chunk Size: Force flush
elif sum(len(b) for b in self._collection_buffer) >= self.max_chunk_samples:
logger.debug("Max chunk size reached. Force flushing.")
self._flush_buffer()
else:
# IDLE state, just waiting for speech
pass
def _flush_buffer(self):
"""
Concatenates the collection buffer and puts it into the asyncio queue.
"""
if not self._collection_buffer:
return
chunk = np.concatenate(self._collection_buffer).flatten()
self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
# Reset state
self._collection_buffer = []
self.is_collecting = False
self.silence_samples = 0
def start(self):
"""
@@ -56,11 +149,11 @@ class AudioListener:
raise RuntimeError("Event loop must be provided to AudioListener")
self.is_listening = True
self._buffer = []
self._collection_buffer = []
# Define the block size for the callback
# We'll use a smaller block size (e.g. 0.1s) to keep the callback responsive
block_size = int(self.sample_rate * 0.1)
# Define the block size for the callback.
# Silero VAD v4 recommends 512 samples for 16kHz.
block_size = 512
try:
self.stream = sd.InputStream(
@@ -71,7 +164,7 @@ class AudioListener:
callback=self._audio_callback,
)
self.stream.start()
logger.info("Audio listener started.")
logger.info("Audio listener started with VAD-based chunking.")
except Exception as e:
logger.error(f"Failed to start audio listener: {e}")
self.is_listening = False