feat: implement core D&D helpers logic and system architecture
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# STT Module
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,91 @@
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
|
||||
# Configure the root logger at INFO level when this module is imported.
# NOTE(review): this is an import-time side effect in a library module;
# consider moving it to the application entry point.
logging.basicConfig(level=logging.INFO)
|
||||
# Module-level logger named after this module; used by AudioListener below.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AudioListener:
    """
    Captures audio from the microphone in chunks and puts them into an asyncio queue.

    sounddevice delivers audio in small blocks on its own callback thread.
    Blocks are accumulated until at least ``chunk_duration`` seconds are
    buffered; a chunk of exactly ``int(sample_rate * chunk_duration)`` samples
    is then handed to the event loop with ``call_soon_threadsafe``. Samples
    past the chunk boundary are kept and start the next chunk, so no audio is
    dropped between chunks.

    Args:
        sample_rate: Sampling rate in Hz requested from the input stream.
        chunk_duration: Length in seconds of each chunk placed on the queue.
        device: Input device passed through to ``sd.InputStream``.
        loop: The asyncio event loop that consumes ``audio_queue``. Required
            before calling ``start()``.
        block_duration: Length in seconds of each callback block requested
            from sounddevice. Smaller blocks keep the callback responsive.

    Raises:
        ValueError: If ``sample_rate``, ``chunk_duration`` or
            ``block_duration`` is not positive.
    """

    def __init__(
        self,
        sample_rate=16000,
        chunk_duration=3,
        device=None,
        loop=None,
        block_duration=0.1,
    ):
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive")
        if chunk_duration <= 0:
            raise ValueError("chunk_duration must be positive")
        if block_duration <= 0:
            raise ValueError("block_duration must be positive")

        self.sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        self.block_duration = block_duration
        self.device = device
        self.loop = loop
        self.audio_queue = asyncio.Queue()
        self.is_listening = False

        # Initialised here (not only in start()) so the callback and stop()
        # never hit a missing attribute.
        self.stream = None
        self._buffer = []
        self._buffered_samples = 0

    def _audio_callback(self, indata, frames, time, status):
        """
        This callback is called by sounddevice for every block of audio captured.

        The signature (indata, frames, time, status) is the one sounddevice
        calls positionally; ``frames`` and ``time`` are accepted but unused.
        """
        if status:
            logger.warning(f"SoundDevice status: {status}")

        # Copy the block: the callback's array is only borrowed.
        # NOTE(review): presumably shaped (frames, channels) per the
        # sounddevice docs - confirm.
        self._buffer.append(indata.copy())
        # Count real samples instead of assuming every block has `frames` rows.
        self._buffered_samples += len(indata)

        # max(1, ...) guards against a zero-length target, which would make
        # the loop below spin forever.
        target_samples = max(1, int(self.sample_rate * self.chunk_duration))
        if self._buffered_samples < target_samples:
            return

        pending = np.concatenate(self._buffer, axis=0)
        while len(pending) >= target_samples:
            chunk = pending[:target_samples]
            pending = pending[target_samples:]
            # Use call_soon_threadsafe to put the chunk into the asyncio queue
            # from the callback thread.
            self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk)

        # Carry the remainder over so audio past the chunk boundary is not lost.
        self._buffer = [pending] if len(pending) else []
        self._buffered_samples = len(pending)

    def start(self):
        """
        Starts the audio capture stream.

        Raises:
            RuntimeError: If no event loop was provided.
            Exception: Whatever ``sd.InputStream`` raises while opening or
                starting the stream is logged and re-raised.
        """
        if self.loop is None:
            raise RuntimeError("Event loop must be provided to AudioListener")

        if self.is_listening:
            # A second start() would replace self.stream and leak the open one.
            logger.warning("Audio listener already started.")
            return

        self.is_listening = True
        self._buffer = []
        self._buffered_samples = 0

        # Block size (in samples) requested for each callback invocation.
        block_size = int(self.sample_rate * self.block_duration)

        stream = None
        try:
            stream = sd.InputStream(
                device=self.device,
                channels=1,
                samplerate=self.sample_rate,
                blocksize=block_size,
                callback=self._audio_callback,
            )
            stream.start()
        except Exception as e:
            logger.error(f"Failed to start audio listener: {e}")
            self.is_listening = False
            if stream is not None:
                # The stream was opened but could not start: release it.
                try:
                    stream.close()
                except Exception as close_error:
                    logger.warning(f"Failed to close audio stream: {close_error}")
            raise

        self.stream = stream
        logger.info("Audio listener started.")

    def stop(self):
        """
        Stops the audio capture stream.

        Safe to call more than once and before ``start()``.
        """
        stream = getattr(self, "stream", None)
        self.stream = None
        self.is_listening = False

        if stream is not None:
            try:
                stream.stop()
            finally:
                # Always release the device, even if stop() raised.
                stream.close()

        logger.info("Audio listener stopped.")

    async def get_chunk(self):
        """
        Retrieves a chunk of audio from the queue asynchronously.

        Returns:
            The next chunk placed on ``audio_queue`` by the capture callback;
            waits until one is available.
        """
        return await self.audio_queue.get()
|
||||
@@ -0,0 +1,69 @@
|
||||
import logging
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
# Configure the root logger at INFO level when this module is imported.
# NOTE(review): this is an import-time side effect in a library module;
# consider moving it to the application entry point.
logging.basicConfig(level=logging.INFO)
|
||||
# Module-level logger named after this module; used by Transcriber below.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Transcriber:
    """
    Converts audio chunks (numpy arrays) into text using faster-whisper.
    """

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        """
        Initializes the faster-whisper model.

        Args:
            model_size (str): The size of the model to use (e.g., "tiny", "base", "small").
            device (str): The device to run the model on ("cpu" or "cuda").
            compute_type (str): The compute type to use (e.g., "int8", "float16").

        Raises:
            Exception: Any error raised while constructing ``WhisperModel`` is
                logged and re-raised.
        """
        logger.info(
            f"Loading faster-whisper model: {model_size} on {device} ({compute_type})..."
        )
        try:
            self.model = WhisperModel(
                model_size, device=device, compute_type=compute_type
            )
            logger.info("Model loaded successfully.")
        except Exception as e:
            logger.error(f"Failed to load faster-whisper model: {e}")
            raise

    def transcribe(self, audio_chunk):
        """
        Transcribes a single audio chunk.

        Two-dimensional input is reduced to a one-dimensional mono waveform
        before it is passed to the model: a single-channel column is
        flattened, several channels are averaged.

        Args:
            audio_chunk (np.ndarray): The audio data as a numpy array, either
                1-D or 2-D. NOTE(review): 2-D input is assumed to be laid out
                as (samples, channels) - confirm against the capture side.

        Returns:
            str: The transcribed text, or "" when the chunk is None, empty,
            or transcription fails (the error is logged).
        """
        if audio_chunk is None:
            return ""

        try:
            # faster-whisper expects audio in float32
            audio_data = audio_chunk.astype("float32")

            if audio_data.size == 0:
                # Nothing to decode; skip the model call entirely.
                return ""

            if audio_data.ndim == 2:
                if audio_data.shape[1] == 1:
                    # Single channel: drop the channel axis.
                    audio_data = audio_data.reshape(-1)
                else:
                    # Several channels: down-mix to mono by averaging.
                    audio_data = audio_data.mean(axis=1)

            # Transcribe the audio
            segments, _ = self.model.transcribe(audio_data, beam_size=5)

            # Combine segments into a single string. Iteration stays inside
            # the try block so errors raised while consuming the segments are
            # handled below as well.
            text = " ".join(segment.text.strip() for segment in segments)

            return text.strip()
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return ""

    def close(self):
        """
        Explicitly release model resources if necessary.

        Currently a no-op kept for interface consistency.
        """
        # faster-whisper's WhisperModel doesn't have a standard close(),
        # but we'll provide this for consistency.
        pass
|
||||
Reference in New Issue
Block a user