# Source code for pacing.core.transcription_interfaces

"""
Transcription interfaces for the PACING platform.

These interfaces define how audio is converted to text. Implementations can use
various speech-to-text services (Deepgram, Whisper, Google Speech, etc.) or
mock transcribers for testing.
"""

from abc import ABC, abstractmethod
from typing import AsyncIterator, Optional
import numpy as np

from pacing.models.data_models import TranscriptionResult


class ITranscriber(ABC):
    """
    Abstract interface for speech-to-text transcription.

    This interface allows the system to support multiple transcription
    backends without changing the core logic. The transcriber is responsible
    for:

    1. Converting audio chunks to text
    2. Providing confidence scores for transcriptions
    3. Handling partial (streaming) transcriptions
    4. Speaker diarization (if supported)

    Design Philosophy:
        - Transcribers should be stateless or manage their own state
        - They should handle their own buffering and context management
        - Confidence scores must be normalized to [0.0, 1.0]

    Subclasses must implement ``transcribe_chunk`` and
    ``supports_speaker_diarization``; ``transcribe_stream`` and
    ``get_model_info`` ship with overridable defaults.
    """

    @abstractmethod
    async def transcribe_chunk(
        self,
        audio_chunk: np.ndarray,
        sample_rate: int,
        is_final: bool = False
    ) -> TranscriptionResult:
        """
        Transcribe a single audio chunk.

        Args:
            audio_chunk: Audio samples (typically float32 or int16)
            sample_rate: Sample rate in Hz
            is_final: Whether this is the final chunk in a sequence

        Returns:
            TranscriptionResult: The transcription with confidence score

        Notes:
            - For streaming transcription, is_final=False produces partial
              results
            - Implementations should handle silence gracefully
            - Empty audio should return empty text with high confidence
        """
        pass

    async def transcribe_stream(
        self,
        audio_stream: AsyncIterator[np.ndarray],
        sample_rate: int
    ) -> AsyncIterator[TranscriptionResult]:
        """
        Transcribe a stream of audio chunks.

        This is a convenience method that processes an audio stream and
        yields transcription results. The default implementation calls
        transcribe_chunk() once per audio chunk, in arrival order.

        Args:
            audio_stream: Async iterator of audio chunks
            sample_rate: Sample rate in Hz

        Yields:
            TranscriptionResult: Transcriptions as they become available

        Notes:
            - Every chunk is forwarded with is_final=False; this default
              never marks a chunk as final. Override this method if the
              backend needs an explicit end-of-stream signal.
            - Results whose ``text`` is empty (falsy) are dropped rather
              than yielded.

        Example:
            async for result in transcriber.transcribe_stream(audio_stream, 16000):
                print(f"{result.text} (confidence: {result.confidence_score})")
        """
        async for chunk in audio_stream:
            result = await self.transcribe_chunk(chunk, sample_rate, is_final=False)
            # Only yield non-empty transcriptions
            if result.text:
                yield result

    @abstractmethod
    def supports_speaker_diarization(self) -> bool:
        """
        Check if this transcriber supports speaker diarization.

        Returns:
            bool: True if speaker_id will be populated in TranscriptionResult
        """
        pass

    def get_model_info(self) -> dict:
        """
        Get information about the transcription model.

        The default reports the concrete class name together with
        placeholder version and language values; subclasses may override
        it to describe the real backend.

        Returns:
            dict: Model metadata (name, version, language, etc.)
        """
        return {
            "name": self.__class__.__name__,
            "version": "unknown",
            "language": "en-US",
        }