feat: implement core D&D helpers logic and system architecture

2026-05-25 22:14:58 -07:00
parent 5bb483431f
commit 685586318f
36 changed files with 1137 additions and 0 deletions
@@ -0,0 +1 @@
+# STT Module
@@ -0,0 +1,91 @@
+import asyncio
+import logging
+
+import numpy as np
+import sounddevice as sd
+
+# Install a default root-logger configuration at INFO level when this module is
+# imported. logging.basicConfig() does nothing if the root logger already has
+# handlers, so an application that configured logging earlier is not overridden.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+class AudioListener:
+    """
+    Captures audio from the microphone in chunks and puts them into an asyncio queue.
+    """
+
+    def __init__(self, sample_rate=16000, chunk_duration=3, device=None, loop=None):
+        self.sample_rate = sample_rate
+        self.chunk_duration = chunk_duration
+        self.device = device
+        self.loop = loop
+        self.audio_queue = asyncio.Queue()
+        self.is_listening = False
+
+    def _audio_callback(self, indata, frames, time, status):
+        """
+        This callback is called by sounddevice for every block of audio captured.
+        """
+        if status:
+            logger.warning(f"SoundDevice status: {status}")
+
+        # We capture audio in chunks. sounddevice provides blocks.
+        # We append these blocks to a buffer until we reach chunk_duration.
+        self._buffer.append(indata.copy())
+
+        # Check if we have enough data for a full chunk
+        current_duration = len(self._buffer) * frames / self.sample_rate
+        if current_duration >= self.chunk_duration:
+            # Concatenate all buffers into one chunk
+            chunk = np.concatenate(self._buffer, axis=0)
+            # Trim to exactly chunk_duration to maintain consistency
+            target_samples = int(self.sample_rate * self.chunk_duration)
+            chunk = chunk[:target_samples]
+
+            # Use call_soon_threadsafe to put the chunk into the asyncio queue from the callback thread
+            self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)
+            self._buffer = []
+
+    def start(self):
+        """
+        Starts the audio capture stream.
+        """
+        if self.loop is None:
+            raise RuntimeError("Event loop must be provided to AudioListener")
+
+        self.is_listening = True
+        self._buffer = []
+
+        # Define the block size for the callback
+        # We'll use a smaller block size (e.g. 0.1s) to keep the callback responsive
+        block_size = int(self.sample_rate * 0.1)
+
+        try:
+            self.stream = sd.InputStream(
+                device=self.device,
+                channels=1,
+                samplerate=self.sample_rate,
+                blocksize=block_size,
+                callback=self._audio_callback,
+            )
+            self.stream.start()
+            logger.info("Audio listener started.")
+        except Exception as e:
+            logger.error(f"Failed to start audio listener: {e}")
+            self.is_listening = False
+            raise
+
+    def stop(self):
+        """
+        Stops the audio capture stream.
+        """
+        if hasattr(self, "stream"):
+            self.stream.stop()
+            self.stream.close()
+        self.is_listening = False
+        logger.info("Audio listener stopped.")
+
+    async def get_chunk(self):
+        """
+        Retrieves a chunk of audio from the queue asynchronously.
+        """
+        return await self.audio_queue.get()
@@ -0,0 +1,69 @@
+import logging
+
+from faster_whisper import WhisperModel
+
+# Install a default root-logger configuration at INFO level when this module is
+# imported. logging.basicConfig() does nothing if the root logger already has
+# handlers, so an application that configured logging earlier is not overridden.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+class Transcriber:
+    """
+    Converts audio chunks (numpy arrays) into text using faster-whisper.
+    """
+
+    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
+        """
+        Initializes the faster-whisper model.
+
+        Args:
+            model_size (str): The size of the model to use (e.g., "tiny", "base", "small").
+            device (str): The device to run the model on ("cpu" or "cuda").
+            compute_type (str): The compute type to use (e.g., "int8", "float16").
+        """
+        logger.info(
+            f"Loading faster-whisper model: {model_size} on {device} ({compute_type})..."
+        )
+        try:
+            self.model = WhisperModel(
+                model_size, device=device, compute_type=compute_type
+            )
+            logger.info("Model loaded successfully.")
+        except Exception as e:
+            logger.error(f"Failed to load faster-whisper model: {e}")
+            raise
+
+    def transcribe(self, audio_chunk):
+        """
+        Transcribes a single audio chunk.
+
+        Args:
+            audio_chunk (np.ndarray): The audio data as a numpy array.
+
+        Returns:
+            str: The transcribed text.
+        """
+        if audio_chunk is None:
+            return ""
+
+        try:
+            # faster-whisper expects audio in float32
+            audio_data = audio_chunk.astype("float32")
+
+            # Transcribe the audio
+            segments, info = self.model.transcribe(audio_data, beam_size=5)
+
+            # Combine segments into a single string
+            text = " ".join([segment.text.strip() for segment in segments])
+
+            return text.strip()
+        except Exception as e:
+            logger.error(f"Transcription error: {e}")
+            return ""
+
+    def close(self):
+        """
+        Explicitly release model resources if necessary.
+        """
+        # faster-whisper's WhisperModel doesn't have a standard close(),
+        # but we'll provide this for consistency.
+        pass