Improve audio capture and LLM integration
- Implement Silero VAD for dynamic audio chunking
- Add support for Ollama and vLLM backends
- Harden extraction prompts for strict JSON output
- Refactor TUI worker to handle proposals asynchronously
This commit is contained in:
Binary file not shown.
+116
-23
@@ -3,6 +3,7 @@ import logging
|
||||
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
import torch
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -11,16 +12,56 @@ logger = logging.getLogger(__name__)
|
||||
class AudioListener:
|
||||
"""
|
||||
Captures audio from the microphone in chunks and puts them into an asyncio queue.
|
||||
Uses Silero VAD for dynamic chunking based on speech detection.
|
||||
"""
|
||||
|
||||
def __init__(self, sample_rate=16000, chunk_duration=3, device=None, loop=None):
|
||||
def __init__(
|
||||
self,
|
||||
sample_rate=16000,
|
||||
device=None,
|
||||
loop=None,
|
||||
vad_threshold=0.5,
|
||||
silence_duration=0.5,
|
||||
max_chunk_size=30,
|
||||
):
|
||||
self.sample_rate = sample_rate
|
||||
self.chunk_duration = chunk_duration
|
||||
self.device = device
|
||||
self.loop = loop
|
||||
|
||||
# VAD Configuration
|
||||
self.vad_threshold = vad_threshold
|
||||
self.silence_duration = silence_duration
|
||||
self.max_chunk_size = max_chunk_size
|
||||
|
||||
self.audio_queue = asyncio.Queue()
|
||||
self.is_listening = False
|
||||
|
||||
# Load Silero VAD model
|
||||
try:
|
||||
self.model, utils = torch.hub.load(
|
||||
repo_or_dir="snakers4/silero-vad", model="silero_vad"
|
||||
)
|
||||
self.model.eval()
|
||||
if torch.cuda.is_available():
|
||||
self.model = self.model.cuda()
|
||||
logger.info("Silero VAD model loaded successfully.")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load Silero VAD model: {e}")
|
||||
raise
|
||||
|
||||
# VAD state
|
||||
self.is_collecting = False
|
||||
self.silence_samples = 0
|
||||
self.max_silence_samples = int(self.silence_duration * self.sample_rate)
|
||||
self.max_chunk_samples = int(self.max_chunk_size * self.sample_rate)
|
||||
|
||||
# Pre-padding buffer (e.g., 200ms)
|
||||
self.pre_padding_samples = int(0.2 * self.sample_rate)
|
||||
self._ring_buffer = np.zeros(self.pre_padding_samples, dtype=np.float32)
|
||||
self._ring_buffer_idx = 0
|
||||
|
||||
self._collection_buffer = []
|
||||
|
||||
def _audio_callback(self, indata, frames, time, status):
|
||||
"""
|
||||
This callback is called by sounddevice for every block of audio captured.
|
||||
@@ -28,25 +69,77 @@ class AudioListener:
|
||||
if status:
|
||||
logger.warning(f"SoundDevice status: {status}")
|
||||
|
||||
# We capture audio in chunks. sounddevice provides blocks.
|
||||
# We append these blocks to a buffer until we reach chunk_duration.
|
||||
self._buffer.append(indata.copy())
|
||||
# Ensure data is float32 and 1D
|
||||
audio_data = indata.flatten().astype(np.float32)
|
||||
|
||||
# Check if we have enough data for a full chunk
|
||||
current_duration = len(self._buffer) * frames / self.sample_rate
|
||||
if current_duration >= self.chunk_duration:
|
||||
# Concatenate all buffers into one chunk
|
||||
chunk = np.concatenate(self._buffer, axis=0)
|
||||
# Trim to exactly chunk_duration to maintain consistency
|
||||
target_samples = int(self.sample_rate * self.chunk_duration)
|
||||
chunk = chunk[:target_samples]
|
||||
# 1. Update ring buffer for pre-padding
|
||||
# We overwrite the oldest data in the ring buffer
|
||||
num_samples = len(audio_data)
|
||||
for i in range(num_samples):
|
||||
self._ring_buffer[self._ring_buffer_idx] = audio_data[i]
|
||||
self._ring_buffer_idx = (
|
||||
self._ring_buffer_idx + 1
|
||||
) % self.pre_padding_samples
|
||||
|
||||
# Flatten to 1D array (samples,) as expected by faster-whisper
|
||||
chunk = chunk.flatten()
|
||||
# 2. Run VAD
|
||||
# Convert to torch tensor
|
||||
tensor_input = torch.from_numpy(audio_data)
|
||||
if torch.cuda.is_available():
|
||||
tensor_input = tensor_input.cuda()
|
||||
|
||||
# Use call_soon_threadsafe to put the chunk into the asyncio queue from the callback thread
|
||||
self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
|
||||
self._buffer = []
|
||||
with torch.no_grad():
|
||||
# The model expects (batch, samples)
|
||||
# Silero VAD expects frames of 512, 1024, or 1536 for 16kHz
|
||||
# Since we use block_size=512, we are good.
|
||||
probability = self.model(tensor_input.unsqueeze(0), self.sample_rate).item()
|
||||
|
||||
# 3. State-based Chunking Logic
|
||||
if probability > self.vad_threshold:
|
||||
if not self.is_collecting:
|
||||
# Start Detection: Transition to COLLECTING
|
||||
logger.debug("Speech detected. Starting collection.")
|
||||
self.is_collecting = True
|
||||
|
||||
# Pre-padding: Append the ring buffer in the correct order
|
||||
padding = np.roll(self._ring_buffer, -self._ring_buffer_idx)
|
||||
self._collection_buffer.append(padding)
|
||||
|
||||
# Reset silence counter
|
||||
self.silence_samples = 0
|
||||
self._collection_buffer.append(audio_data)
|
||||
|
||||
elif self.is_collecting:
|
||||
# We are in COLLECTING state but current frame is silence
|
||||
self._collection_buffer.append(audio_data)
|
||||
self.silence_samples += num_samples
|
||||
|
||||
# End Detection: Silence lasted longer than threshold
|
||||
if self.silence_samples >= self.max_silence_samples:
|
||||
logger.debug("Silence detected. Flushing chunk.")
|
||||
self._flush_buffer()
|
||||
# Max Chunk Size: Force flush
|
||||
elif sum(len(b) for b in self._collection_buffer) >= self.max_chunk_samples:
|
||||
logger.debug("Max chunk size reached. Force flushing.")
|
||||
self._flush_buffer()
|
||||
|
||||
else:
|
||||
# IDLE state, just waiting for speech
|
||||
pass
|
||||
|
||||
def _flush_buffer(self):
|
||||
"""
|
||||
Concatenates the collection buffer and puts it into the asyncio queue.
|
||||
"""
|
||||
if not self._collection_buffer:
|
||||
return
|
||||
|
||||
chunk = np.concatenate(self._collection_buffer).flatten()
|
||||
self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
|
||||
|
||||
# Reset state
|
||||
self._collection_buffer = []
|
||||
self.is_collecting = False
|
||||
self.silence_samples = 0
|
||||
|
||||
def start(self):
|
||||
"""
|
||||
@@ -56,11 +149,11 @@ class AudioListener:
|
||||
raise RuntimeError("Event loop must be provided to AudioListener")
|
||||
|
||||
self.is_listening = True
|
||||
self._buffer = []
|
||||
self._collection_buffer = []
|
||||
|
||||
# Define the block size for the callback
|
||||
# We'll use a smaller block size (e.g. 0.1s) to keep the callback responsive
|
||||
block_size = int(self.sample_rate * 0.1)
|
||||
# Define the block size for the callback.
|
||||
# Silero VAD v4 recommends 512 samples for 16kHz.
|
||||
block_size = 512
|
||||
|
||||
try:
|
||||
self.stream = sd.InputStream(
|
||||
@@ -71,7 +164,7 @@ class AudioListener:
|
||||
callback=self._audio_callback,
|
||||
)
|
||||
self.stream.start()
|
||||
logger.info("Audio listener started.")
|
||||
logger.info("Audio listener started with VAD-based chunking.")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start audio listener: {e}")
|
||||
self.is_listening = False
|
||||
|
||||
Reference in New Issue
Block a user