Refactor STT pipeline and CLI documentation

Split the STT worker into a collector and a transcription worker to offload heavy processing to a background thread. Add the `--whisper-model` flag and implement LLM latency logging. Expand the README with comprehensive CLI usage instructions.
2026-05-31 15:04:41 -07:00
parent 71ecdb3468
commit da5ab1bb44
5 changed files with 136 additions and 36 deletions
@@ -28,8 +28,54 @@ Distill long sessions into concise highlights. Use LLMs to summarize recorded tr

 ## Interface & Usage

- **CLI**: The primary interface for confirming automated updates and querying current game state.
- **Text Editors**: Since data is stored in Markdown and JSON, you can use any editor (VS Code, Vim, Obsidian) to manually refine your campaign data.
+### CLI
+
+The primary interface for confirming automated updates and querying current game state.
+
+#### Command Line Arguments
+
+Use these flags to manage data ingestion and run the live capture pipeline.
+
+##### RAG Ingestion
+Use these flags to add external documents to the RAG (Retrieval-Augmented Generation) system.
+
+| Flag | Description |
+| :--- | :--- |
+| `--ingest-pdf <path>` | Path to a PDF file to ingest |
+| `--ingest-file <path>` | Path to a markdown file to ingest |
+| `--ingest-dir <path>` | Path to a directory of markdown files to ingest |
+
+##### LLM Configuration
+These flags allow you to override the environment variables for the LLM backend.
+
+| Flag | Description |
+| :--- | :--- |
+| `--llm-backend <backend>` | Backend to use (`openai`, `ollama`, or `vllm`) |
+| `--llm-model <model>` | The model name to use |
+| `--llm-api-key <key>` | API key for the LLM backend |
+| `--llm-base-url <url>` | Base URL for the LLM backend |
+
+##### Pipeline Execution
+| Flag | Description |
+| :--- | :--- |
+| `--run-pipeline` | Starts the main orchestration pipeline (TUI + STT + LLM) |
+
+##### Example Command
+
+To run the live orchestration pipeline using the configuration specified in your `env.sh`, you can use:
+
+```bash
+python main.py --run-pipeline \
+  --llm-backend vllm \
+  --llm-model google/gemma-4-26b-a4b-it \
+  --llm-api-key no-key-required \
+  --whisper-model medium \
+  --llm-base-url https://vllm.tipsy.codes/v1
+```
+
+### Text Editors
+
+Since data is stored in Markdown and JSON, you can use any editor (VS Code, Vim, Obsidian) to manually refine your campaign data.

 ## Technical Stack

@@ -53,6 +53,14 @@ def main():
        help="Base URL for the LLM backend",
    )

+    # STT Configuration Arguments
+    parser.add_argument(
+        "--whisper-model",
+        type=str,
+        default=os.environ.get("WHISPER_MODEL", "base"),
+        help="The Whisper model to use for STT",
+    )
+
    # Pipeline Execution Argument
    parser.add_argument(
        "--run-pipeline",
@@ -75,7 +83,7 @@ def main():
    if args.run_pipeline:
        async def run_pipeline():
            loop = asyncio.get_event_loop()
-            orchestrator = PipelineOrchestrator(loop, llm_config=llm_config)
+            orchestrator = PipelineOrchestrator(loop, llm_config=llm_config, whisper_model=args.whisper_model)
            try:
                await orchestrator.run()
            except KeyboardInterrupt:
@@ -1,5 +1,6 @@
 import logging
 import os
+import time
 from posix import system
 from this import s
 from typing import Any, Dict, Optional
@@ -109,12 +110,20 @@ class LLMProcessor:
        logger.debug("--- LLM CALL END ---")

        try:
+            start_time = time.perf_counter()
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                response_format=response_format,
-                extra_body={"enable_thinking": False},
+                extra_body={
+                    "chat_template_kwargs": {
+                        "enable_thinking": False
+                    }
+                },
            )
+            elapsed_time = time.perf_counter() - start_time
+            logger.info(f"LLM request completed in {elapsed_time:.2f}s")
+
            content = response.choices[0].message.content

            # Debugging: Dump outputs
@@ -39,13 +39,18 @@ logger = logging.getLogger(__name__)


 class PipelineOrchestrator:
-    def __init__(self, loop: asyncio.AbstractEventLoop, llm_config: Optional[dict] = None):
+    def __init__(
+        self,
+        loop: asyncio.AbstractEventLoop,
+        llm_config: Optional[dict] = None,
+        whisper_model: str = "base",
+    ):
        self.loop = loop
        self.llm_config = llm_config or {}

        # Modules
        self.listener = AudioListener(loop=self.loop)
-        self.transcriber = Transcriber(model_size="base", device="cuda")
+        self.transcriber = Transcriber(model_size=whisper_model, device="cuda")
        self.processor = LLMProcessor(**self.llm_config)
        self.rag_manager = RAGManager(llm_config=self.llm_config)

@@ -57,6 +62,9 @@ class PipelineOrchestrator:
        self.log_queue = asyncio.Queue()
        self.persistence_queue = asyncio.Queue()

+        # Synchronization
+        self.transcription_event = asyncio.Event()
+
        self.is_running = False

        # Conversation history for context
@@ -84,11 +92,12 @@ class PipelineOrchestrator:

        return f"Conversation History:\n{context_text}\n\n"

-    async def stt_worker(self):
+    async def stt_collector_worker(self):
        """
-        Worker that handles STT: Audio -> Text.
+        Worker that handles STT Collection: Audio -> Buffer.
+        This task is highly responsive and only manages the buffer.
        """
-        logger.info("STT Worker started.")
+        logger.info("STT Collector Worker started.")
        while self.is_running:
            try:
                # Get audio chunk from listener
@@ -105,33 +114,68 @@ class PipelineOrchestrator:
                    ):
                        self.audio_buffer.pop(0)

-                # Concatenate buffer for transcription
-                full_audio = np.concatenate(self.audio_buffer)
+                # Signal the transcription worker that new data is available
+                self.transcription_event.set()

-                # Transcribe (WhisperX now returns a list of (speaker, text, start, end))
+            except Exception as e:
+                logger.error(f"STT Collector Worker error: {e}")
+
+            # Small sleep to prevent tight loop
+            await asyncio.sleep(0.01)
+
+    async def stt_transcription_worker(self):
+        """
+        Worker that handles STT Transcription: Buffer -> Text.
+        This task handles the heavy lifting in a separate thread.
+        """
+        logger.info("STT Transcription Worker started.")
+        while self.is_running:
+            try:
+                # Wait for a signal that new data is available
+                await self.transcription_event.wait()
+                self.transcription_event.clear()
+
+                # 1. Take a snapshot of the current buffer to avoid race conditions
+                # while the collector is appending new chunks.
+                buffer_snapshot = list(self.audio_buffer)
+                if not buffer_snapshot:
+                    continue
+
+                # 2. Perform transcription in a separate thread.
+                # We pass the snapshot to the helper which handles concatenation and transcription.
                results = await asyncio.to_thread(
-                    self.transcriber.transcribe, full_audio
+                    self._transcribe_buffer_snapshot, buffer_snapshot
                )

-                # Filter for only new segments that start after the last processed segment
+                # 3. Filter for only new segments that start after the last processed segment
                new_segments = [
                    res for res in results if res[2] >= self.last_processed_end_time
                ]

                if new_segments:
                    for speaker, text, start, end in new_segments:
-                        logger.info(f"Transcribed: [{speaker}] {text}")
+                        logger.info(f"STT Raw Transcription: [{speaker}] {text}")
+                        # Push raw transcription to log queue for UI visibility
+                        await self.log_queue.put(f"[{speaker}] {text}")
                        await self.stt_to_clean_queue.put((speaker, text))
                        self.last_processed_end_time = max(
                            self.last_processed_end_time, end
                        )

            except Exception as e:
-                logger.error(f"STT Worker error: {e}")
+                logger.error(f"STT Transcription Worker error: {e}")

-            # Small sleep to prevent tight loop if get_chunk is fast
+            # Small sleep to prevent tight loop
            await asyncio.sleep(0.1)

+    def _transcribe_buffer_snapshot(self, buffer_snapshot):
+        """
+        Helper method to be run in a thread.
+        Concatenates the buffer snapshot and transcribes it.
+        """
+        full_audio = np.concatenate(buffer_snapshot)
+        return self.transcriber.transcribe(full_audio)
+
    async def clean_worker(self):
        """
        Worker that handles Text Cleaning: Raw STT -> Filtered Text.
@@ -204,6 +248,7 @@ class PipelineOrchestrator:

        while self.is_running:
            try:
+                logger.info("LLM Worker: Waiting for input...")
                speaker, text = await internal_queue.get()
                logger.info(f"LLM Worker: Processing text from {speaker}: {text}")

@@ -213,6 +258,9 @@ class PipelineOrchestrator:
                # Log the text sent to the LLM for UI affordance
                await self.log_queue.put(f"[{speaker}] {text}")

+                # Log the filtered message being sent to the LLM
+                logger.info(f"LLM Worker: Sending filtered message to LLM: {text}")
+
                # Structured extraction using the processor
                extraction_result = await asyncio.to_thread(
                    self.processor.extract_structured_data,
@@ -301,7 +349,8 @@ class PipelineOrchestrator:

        # Start workers as background tasks
        tasks = [
-            asyncio.create_task(self.stt_worker()),
+            asyncio.create_task(self.stt_collector_worker()),
+            asyncio.create_task(self.stt_transcription_worker()),
            asyncio.create_task(self.clean_worker()),
            asyncio.create_task(self.llm_worker()),
            asyncio.create_task(self.persistence_worker()),
@@ -76,32 +76,27 @@ class ConfirmationApp(App):
    }

    #pending-facts-table {
-        height: 40%;
+        height: 30%;
        border: solid white;
    }

    #llm-input-container {
        height: 10%;
        border: solid white;
-        padding: 1;
+        padding: 0;
    }

    #context-pane {
-        height: 50%;
+        height: 60%;
        border: solid white;
    }

    #log-pane {
-        height: 30%;
+        height: 100%;
        border: solid white;
        background: #111;
    }

-    #log-footer {
-        height: 70%;
-        border: solid white;
-    }
-
    #modal-container {
        width: 60%;
        height: auto;
@@ -163,18 +158,11 @@ class ConfirmationApp(App):
            Horizontal(
                Vertical(
                    DataTable(id="pending-facts-table"),
-                    Vertical(
-                        Input(placeholder="Message LLM...", id="llm-input"),
-                        id="llm-input-container",
-                    ),
+                    Input(placeholder="Message LLM...", id="llm-input"),
                    ListView(id="context-pane"),
                    id="left-pane",
                ),
-                Vertical(
-                    ListView(id="log-pane"),
-                    Static("LATEST LLM INPUTS", id="log-footer"),
-                    id="right-pane",
-                ),
+                ListView(id="log-pane"),
                id="content-wrapper",
            ),
            id="main-container",