Refactor STT pipeline and CLI documentation

Split the STT worker into a collector and a transcription worker
to offload heavy processing to a background thread. Add the
`--whisper-model` flag and implement LLM latency logging. Expand
the README with comprehensive CLI usage instructions.
This commit is contained in:
2026-05-31 15:04:41 -07:00
parent 71ecdb3468
commit da5ab1bb44
5 changed files with 136 additions and 36 deletions
+48 -2
View File
@@ -28,8 +28,54 @@ Distill long sessions into concise highlights. Use LLMs to summarize recorded tr
## Interface & Usage ## Interface & Usage
- **CLI**: The primary interface for confirming automated updates and querying current game state. ### CLI
- **Text Editors**: Since data is stored in Markdown and JSON, you can use any editor (VS Code, Vim, Obsidian) to manually refine your campaign data.
The primary interface for confirming automated updates and querying current game state.
#### Command Line Arguments
Use these flags to manage data ingestion and run the live capture pipeline.
##### RAG Ingestion
Use these flags to add external documents to the RAG (Retrieval-Augmented Generation) system.
| Flag | Description |
| :--- | :--- |
| `--ingest-pdf <path>` | Path to a PDF file to ingest |
| `--ingest-file <path>` | Path to a markdown file to ingest |
| `--ingest-dir <path>` | Path to a directory of markdown files to ingest |
##### LLM Configuration
These flags allow you to override the environment variables for the LLM backend.
| Flag | Description |
| :--- | :--- |
| `--llm-backend <backend>` | Backend to use (`openai`, `ollama`, or `vllm`) |
| `--llm-model <model>` | The model name to use |
| `--llm-api-key <key>` | API key for the LLM backend |
| `--llm-base-url <url>` | Base URL for the LLM backend |
##### Pipeline Execution
| Flag | Description |
| :--- | :--- |
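| `--run-pipeline` | Starts the main orchestration pipeline (TUI + STT + LLM) |
| `--whisper-model <model>` | Whisper model to use for STT (defaults to the `WHISPER_MODEL` environment variable, or `base` if unset) |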
##### Example Command
To run the live orchestration pipeline with the settings passed explicitly as flags (these override the corresponding environment variables, such as those set in your `env.sh`), you can use:
```bash
python main.py --run-pipeline \
--llm-backend vllm \
--llm-model google/gemma-4-26b-a4b-it \
--llm-api-key no-key-required \
--whisper-model medium \
--llm-base-url https://vllm.tipsy.codes/v1
```
### Text Editors
Since data is stored in Markdown and JSON, you can use any editor (VS Code, Vim, Obsidian) to manually refine your campaign data.
## Technical Stack ## Technical Stack
+9 -1
View File
@@ -53,6 +53,14 @@ def main():
help="Base URL for the LLM backend", help="Base URL for the LLM backend",
) )
# STT Configuration Arguments
parser.add_argument(
"--whisper-model",
type=str,
default=os.environ.get("WHISPER_MODEL", "base"),
help="The Whisper model to use for STT",
)
# Pipeline Execution Argument # Pipeline Execution Argument
parser.add_argument( parser.add_argument(
"--run-pipeline", "--run-pipeline",
@@ -75,7 +83,7 @@ def main():
if args.run_pipeline: if args.run_pipeline:
async def run_pipeline(): async def run_pipeline():
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
orchestrator = PipelineOrchestrator(loop, llm_config=llm_config) orchestrator = PipelineOrchestrator(loop, llm_config=llm_config, whisper_model=args.whisper_model)
try: try:
await orchestrator.run() await orchestrator.run()
except KeyboardInterrupt: except KeyboardInterrupt:
+10 -1
View File
@@ -1,5 +1,6 @@
import logging import logging
import os import os
import time
from posix import system from posix import system
from this import s from this import s
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
@@ -109,12 +110,20 @@ class LLMProcessor:
logger.debug("--- LLM CALL END ---") logger.debug("--- LLM CALL END ---")
try: try:
start_time = time.perf_counter()
response = self.client.chat.completions.create( response = self.client.chat.completions.create(
model=self.model, model=self.model,
messages=messages, messages=messages,
response_format=response_format, response_format=response_format,
extra_body={"enable_thinking": False}, extra_body={
"chat_template_kwargs": {
"enable_thinking": False
}
},
) )
elapsed_time = time.perf_counter() - start_time
logger.info(f"LLM request completed in {elapsed_time:.2f}s")
content = response.choices[0].message.content content = response.choices[0].message.content
# Debugging: Dump outputs # Debugging: Dump outputs
+63 -14
View File
@@ -39,13 +39,18 @@ logger = logging.getLogger(__name__)
class PipelineOrchestrator: class PipelineOrchestrator:
def __init__(self, loop: asyncio.AbstractEventLoop, llm_config: Optional[dict] = None): def __init__(
self,
loop: asyncio.AbstractEventLoop,
llm_config: Optional[dict] = None,
whisper_model: str = "base",
):
self.loop = loop self.loop = loop
self.llm_config = llm_config or {} self.llm_config = llm_config or {}
# Modules # Modules
self.listener = AudioListener(loop=self.loop) self.listener = AudioListener(loop=self.loop)
self.transcriber = Transcriber(model_size="base", device="cuda") self.transcriber = Transcriber(model_size=whisper_model, device="cuda")
self.processor = LLMProcessor(**self.llm_config) self.processor = LLMProcessor(**self.llm_config)
self.rag_manager = RAGManager(llm_config=self.llm_config) self.rag_manager = RAGManager(llm_config=self.llm_config)
@@ -57,6 +62,9 @@ class PipelineOrchestrator:
self.log_queue = asyncio.Queue() self.log_queue = asyncio.Queue()
self.persistence_queue = asyncio.Queue() self.persistence_queue = asyncio.Queue()
# Synchronization
self.transcription_event = asyncio.Event()
self.is_running = False self.is_running = False
# Conversation history for context # Conversation history for context
@@ -84,11 +92,12 @@ class PipelineOrchestrator:
return f"Conversation History:\n{context_text}\n\n" return f"Conversation History:\n{context_text}\n\n"
async def stt_worker(self): async def stt_collector_worker(self):
""" """
Worker that handles STT: Audio -> Text. Worker that handles STT Collection: Audio -> Buffer.
This task is highly responsive and only manages the buffer.
""" """
logger.info("STT Worker started.") logger.info("STT Collector Worker started.")
while self.is_running: while self.is_running:
try: try:
# Get audio chunk from listener # Get audio chunk from listener
@@ -105,33 +114,68 @@ class PipelineOrchestrator:
): ):
self.audio_buffer.pop(0) self.audio_buffer.pop(0)
# Concatenate buffer for transcription # Signal the transcription worker that new data is available
full_audio = np.concatenate(self.audio_buffer) self.transcription_event.set()
# Transcribe (WhisperX now returns a list of (speaker, text, start, end)) except Exception as e:
logger.error(f"STT Collector Worker error: {e}")
# Small sleep to prevent tight loop
await asyncio.sleep(0.01)
async def stt_transcription_worker(self):
"""
Worker that handles STT Transcription: Buffer -> Text.
This task handles the heavy lifting in a separate thread.
"""
logger.info("STT Transcription Worker started.")
while self.is_running:
try:
# Wait for a signal that new data is available
await self.transcription_event.wait()
self.transcription_event.clear()
# 1. Take a snapshot of the current buffer to avoid race conditions
# while the collector is appending new chunks.
buffer_snapshot = list(self.audio_buffer)
if not buffer_snapshot:
continue
# 2. Perform transcription in a separate thread.
# We pass the snapshot to the helper which handles concatenation and transcription.
results = await asyncio.to_thread( results = await asyncio.to_thread(
self.transcriber.transcribe, full_audio self._transcribe_buffer_snapshot, buffer_snapshot
) )
# Filter for only new segments that start after the last processed segment # 3. Filter for only new segments that start after the last processed segment
new_segments = [ new_segments = [
res for res in results if res[2] >= self.last_processed_end_time res for res in results if res[2] >= self.last_processed_end_time
] ]
if new_segments: if new_segments:
for speaker, text, start, end in new_segments: for speaker, text, start, end in new_segments:
logger.info(f"Transcribed: [{speaker}] {text}") logger.info(f"STT Raw Transcription: [{speaker}] {text}")
# Push raw transcription to log queue for UI visibility
await self.log_queue.put(f"[{speaker}] {text}")
await self.stt_to_clean_queue.put((speaker, text)) await self.stt_to_clean_queue.put((speaker, text))
self.last_processed_end_time = max( self.last_processed_end_time = max(
self.last_processed_end_time, end self.last_processed_end_time, end
) )
except Exception as e: except Exception as e:
logger.error(f"STT Worker error: {e}") logger.error(f"STT Transcription Worker error: {e}")
# Small sleep to prevent tight loop if get_chunk is fast # Small sleep to prevent tight loop
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
def _transcribe_buffer_snapshot(self, buffer_snapshot):
    """
    Join a snapshot of buffered audio chunks and transcribe the result.

    This is a plain blocking call; the transcription worker dispatches it
    through asyncio.to_thread so the event loop is not held up.

    buffer_snapshot: sequence of array chunks to pass to np.concatenate.
        The caller skips empty snapshots before calling this helper.
    Returns whatever self.transcriber.transcribe yields for the joined audio.
    """
    return self.transcriber.transcribe(np.concatenate(buffer_snapshot))
async def clean_worker(self): async def clean_worker(self):
""" """
Worker that handles Text Cleaning: Raw STT -> Filtered Text. Worker that handles Text Cleaning: Raw STT -> Filtered Text.
@@ -204,6 +248,7 @@ class PipelineOrchestrator:
while self.is_running: while self.is_running:
try: try:
logger.info("LLM Worker: Waiting for input...")
speaker, text = await internal_queue.get() speaker, text = await internal_queue.get()
logger.info(f"LLM Worker: Processing text from {speaker}: {text}") logger.info(f"LLM Worker: Processing text from {speaker}: {text}")
@@ -213,6 +258,9 @@ class PipelineOrchestrator:
# Log the text sent to the LLM for UI affordance # Log the text sent to the LLM for UI affordance
await self.log_queue.put(f"[{speaker}] {text}") await self.log_queue.put(f"[{speaker}] {text}")
# Log the filtered message being sent to the LLM
logger.info(f"LLM Worker: Sending filtered message to LLM: {text}")
# Structured extraction using the processor # Structured extraction using the processor
extraction_result = await asyncio.to_thread( extraction_result = await asyncio.to_thread(
self.processor.extract_structured_data, self.processor.extract_structured_data,
@@ -301,7 +349,8 @@ class PipelineOrchestrator:
# Start workers as background tasks # Start workers as background tasks
tasks = [ tasks = [
asyncio.create_task(self.stt_worker()), asyncio.create_task(self.stt_collector_worker()),
asyncio.create_task(self.stt_transcription_worker()),
asyncio.create_task(self.clean_worker()), asyncio.create_task(self.clean_worker()),
asyncio.create_task(self.llm_worker()), asyncio.create_task(self.llm_worker()),
asyncio.create_task(self.persistence_worker()), asyncio.create_task(self.persistence_worker()),
+4 -16
View File
@@ -76,32 +76,27 @@ class ConfirmationApp(App):
} }
#pending-facts-table { #pending-facts-table {
height: 40%; height: 30%;
border: solid white; border: solid white;
} }
#llm-input-container { #llm-input-container {
height: 10%; height: 10%;
border: solid white; border: solid white;
padding: 1; padding: 0;
} }
#context-pane { #context-pane {
height: 50%; height: 60%;
border: solid white; border: solid white;
} }
#log-pane { #log-pane {
height: 30%; height: 100%;
border: solid white; border: solid white;
background: #111; background: #111;
} }
#log-footer {
height: 70%;
border: solid white;
}
#modal-container { #modal-container {
width: 60%; width: 60%;
height: auto; height: auto;
@@ -163,18 +158,11 @@ class ConfirmationApp(App):
Horizontal( Horizontal(
Vertical( Vertical(
DataTable(id="pending-facts-table"), DataTable(id="pending-facts-table"),
Vertical(
Input(placeholder="Message LLM...", id="llm-input"), Input(placeholder="Message LLM...", id="llm-input"),
id="llm-input-container",
),
ListView(id="context-pane"), ListView(id="context-pane"),
id="left-pane", id="left-pane",
), ),
Vertical(
ListView(id="log-pane"), ListView(id="log-pane"),
Static("LATEST LLM INPUTS", id="log-footer"),
id="right-pane",
),
id="content-wrapper", id="content-wrapper",
), ),
id="main-container", id="main-container",