import logging import numpy as np import whisperx # Do not call basicConfig here, as it's called in the orchestrator logger = logging.getLogger(__name__) class Transcriber: """ Converts audio chunks (numpy arrays) into text using WhisperX. """ def __init__( self, model_size="base", device="cpu", compute_type="int8", language="en" ): """ Initializes the WhisperX model. Args: model_size (str): The size of the model to use (e.g., "tiny", "base", "small"). device (str): The device to run the model on ("cpu" or "cuda"). compute_type (str): The compute type to use (e.g., "int8", "float16"). language (str): The language code for alignment (e.g., "en"). """ self.device = device self.compute_type = compute_type self.language = language logger.info( f"Loading WhisperX model: {model_size} on {device} ({compute_type})..." ) try: # Load transcription model self.model = whisperx.load_model( model_size, device=device, compute_type=compute_type ) logger.info("WhisperX model loaded successfully.") except Exception as e: logger.error(f"Failed to load WhisperX models: {e}") raise def transcribe(self, audio_chunk): """ Transcribes an audio chunk. Args: audio_chunk (np.ndarray): The audio data as a numpy array. Returns: list: A list of tuples (speaker_id, text, start, end). """ if audio_chunk is None: return [] try: # WhisperX expects audio in float32 and 1D array audio = audio_chunk.astype("float32").flatten() # 1. Perform transcription # batch_size is set to 16 for efficiency; can be adjusted based on VRAM result = self.model.transcribe(audio, batch_size=16) # Extract ("Unknown", text, start, end) tuples from the transcription result output = [] for segment in result.get("segments", []): text = segment.get("text", "").strip() start = segment.get("start", 0.0) end = segment.get("end", 0.0) if text: output.append(("Unknown", text, start, end)) return output except Exception as e: logger.error(f"Transcription error: {e}") return [] def close(self): """ Explicitly release model resources if necessary. """ pass