feat: implement core D&D helpers logic and system architecture

This commit is contained in:
2026-05-25 22:14:58 -07:00
parent 5bb483431f
commit 685586318f
36 changed files with 1137 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
# STT Module
Binary file not shown.
Binary file not shown.
Binary file not shown.
+91
View File
@@ -0,0 +1,91 @@
import asyncio
import logging
import numpy as np
import sounddevice as sd
# Configure the root logger at INFO level as a side effect of importing this module.
# NOTE(review): basicConfig at import time affects the whole process; consider
# moving it to the application entry point — confirm no caller relies on it.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class AudioListener:
    """
    Captures audio from the microphone in chunks and puts them into an asyncio queue.

    Audio arrives from sounddevice in small blocks on a non-asyncio callback
    thread. Blocks are accumulated until ``chunk_duration`` seconds of samples
    are available; each full chunk is then handed to ``audio_queue`` through
    the event loop. Samples beyond a chunk boundary are carried over into the
    next chunk instead of being dropped.
    """

    def __init__(self, sample_rate=16000, chunk_duration=3, device=None, loop=None):
        """
        Args:
            sample_rate: Capture sample rate in Hz.
            chunk_duration: Length in seconds of each chunk put on the queue.
            device: Input device passed through to ``sd.InputStream``.
            loop: Event loop that consumes ``audio_queue``; required by ``start()``.
        """
        self.sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        self.device = device
        self.loop = loop
        self.audio_queue = asyncio.Queue()
        self.is_listening = False
        self.stream = None
        # Initialised here (not only in start()) so the callback never hits a
        # missing attribute.
        self._buffer = []
        self._buffered_frames = 0

    def _chunk_samples(self):
        """Return the number of samples per emitted chunk (at least 1)."""
        return max(1, int(self.sample_rate * self.chunk_duration))

    def _audio_callback(self, indata, frames, time, status):
        """
        This callback is called by sounddevice for every block of audio captured.

        Appends the block to the internal buffer and, once enough samples are
        buffered, schedules one or more fixed-size chunks onto ``audio_queue``.
        """
        if status:
            logger.warning(f"SoundDevice status: {status}")

        # Copy: the block is only stored, and indata must not be relied on
        # after the callback returns.
        self._buffer.append(indata.copy())
        # Count real samples rather than assuming every block has `frames` rows.
        self._buffered_frames += len(indata)

        target_samples = self._chunk_samples()
        if self._buffered_frames < target_samples:
            return

        pending = np.concatenate(self._buffer, axis=0)
        while len(pending) >= target_samples:
            chunk = pending[:target_samples]
            # Hand the chunk to the asyncio queue from the callback thread.
            self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
            pending = pending[target_samples:]

        # Keep the leftover samples so no audio is lost between chunks.
        self._buffer = [pending] if len(pending) else []
        self._buffered_frames = len(pending)

    def start(self):
        """
        Starts the audio capture stream.

        Raises:
            RuntimeError: If no event loop was provided.
            Exception: Whatever ``sd.InputStream`` raises while opening or
                starting; it is logged and re-raised.
        """
        if self.loop is None:
            raise RuntimeError("Event loop must be provided to AudioListener")
        if self.is_listening and getattr(self, "stream", None) is not None:
            # A second start() would orphan the running stream.
            logger.warning("Audio listener already started; ignoring start().")
            return

        self.is_listening = True
        self._buffer = []
        self._buffered_frames = 0

        # Use a small block size (0.1s) to keep the callback responsive.
        block_size = int(self.sample_rate * 0.1)
        stream = None
        try:
            stream = sd.InputStream(
                device=self.device,
                channels=1,
                samplerate=self.sample_rate,
                blocksize=block_size,
                callback=self._audio_callback,
            )
            stream.start()
        except Exception as e:
            logger.error(f"Failed to start audio listener: {e}")
            self.is_listening = False
            if stream is not None:
                # The stream was opened but could not be started: release it.
                try:
                    stream.close()
                except Exception:
                    logger.warning("Failed to close audio stream after start error.")
            raise

        self.stream = stream
        logger.info("Audio listener started.")

    def stop(self):
        """
        Stops the audio capture stream.

        Safe to call more than once, or without a prior ``start()``.
        """
        stream = getattr(self, "stream", None)
        # Forget the stream first so a repeated stop() does not touch a closed stream.
        self.stream = None
        self.is_listening = False
        if stream is not None:
            try:
                stream.stop()
            finally:
                stream.close()
        logger.info("Audio listener stopped.")

    async def get_chunk(self):
        """
        Retrieves a chunk of audio from the queue asynchronously.

        Waits until a chunk is available.
        """
        return await self.audio_queue.get()
+69
View File
@@ -0,0 +1,69 @@
import logging
from faster_whisper import WhisperModel
# Configure the root logger at INFO level as a side effect of importing this module.
# NOTE(review): basicConfig at import time affects the whole process; consider
# moving it to the application entry point — confirm no caller relies on it.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class Transcriber:
    """
    Converts audio chunks (numpy arrays) into text using faster-whisper.
    """

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        """
        Initializes the faster-whisper model.

        Args:
            model_size (str): The size of the model to use (e.g., "tiny", "base", "small").
            device (str): The device to run the model on ("cpu" or "cuda").
            compute_type (str): The compute type to use (e.g., "int8", "float16").

        Raises:
            Exception: Whatever ``WhisperModel`` raises while loading; it is
                logged and re-raised.
        """
        logger.info(
            f"Loading faster-whisper model: {model_size} on {device} ({compute_type})..."
        )
        try:
            self.model = WhisperModel(
                model_size, device=device, compute_type=compute_type
            )
        except Exception as e:
            logger.error(f"Failed to load faster-whisper model: {e}")
            raise
        logger.info("Model loaded successfully.")

    def transcribe(self, audio_chunk):
        """
        Transcribes a single audio chunk.

        Args:
            audio_chunk (np.ndarray): The audio data as a numpy array, either
                1-D or with samples along axis 0 (e.g. ``(frames, channels)``).

        Returns:
            str: The transcribed text, or "" for missing/empty audio or when
            transcription fails (the error is logged, not raised).
        """
        if audio_chunk is None:
            return ""
        try:
            # faster-whisper expects audio in float32
            audio_data = audio_chunk.astype("float32")
            if audio_data.size == 0:
                # Nothing to transcribe; skip the model call.
                return ""
            if audio_data.ndim > 1:
                # Down-mix to a 1-D mono waveform. Assumes axis 0 is time, as
                # in (frames, channels) capture buffers.
                audio_data = audio_data.reshape(audio_data.shape[0], -1).mean(axis=1)

            segments, _info = self.model.transcribe(audio_data, beam_size=5)
            # Combine segments into a single string
            text = " ".join(segment.text.strip() for segment in segments)
            return text.strip()
        except Exception as e:
            # Best-effort: keep the pipeline running, but keep the traceback.
            logger.exception(f"Transcription error: {e}")
            return ""

    def close(self):
        """
        Explicitly release model resources if necessary.

        Currently a no-op, provided for interface consistency.
        """
        # faster-whisper's WhisperModel doesn't have a standard close(),
        # but we'll provide this for consistency.
        pass