Source code for wtf_transcript_converter.providers.rev_ai

"""
Rev.ai provider converter for WTF transcript format.

This module provides conversion between Rev.ai transcription format and WTF format.
"""

import re
from typing import Any, Dict, List

from wtf_transcript_converter.core.models import (
    WTFAudio,
    WTFDocument,
    WTFMetadata,
    WTFQuality,
    WTFSegment,
    WTFSpeaker,
    WTFTranscript,
    WTFWord,
)
from wtf_transcript_converter.providers.base import BaseProviderConverter
from wtf_transcript_converter.utils.confidence_utils import normalize_confidence
from wtf_transcript_converter.utils.language_utils import is_valid_bcp47
from wtf_transcript_converter.utils.time_utils import get_current_iso_timestamp



[docs]
class RevAIConverter(BaseProviderConverter):
    """Converter for Rev.ai JSON format to/from WTF format."""


[docs]
    def __init__(self) -> None:
        """Initialise the converter by passing the ``"rev_ai"`` key to the base class."""
        provider_key = "rev_ai"
        super().__init__(provider_key)


    # Provider identifier; convert_to_wtf writes it into WTFMetadata.provider.
    provider_name: str = "rev_ai"
    # Short text describing the upstream service (not read inside this class).
    description: str = "Rev.ai transcription service"
    # Implementation status label (not read inside this class).
    status: str = "Implemented"


[docs]
    def convert_to_wtf(self, rev_ai_data: Dict[str, Any]) -> WTFDocument:
        """
        Convert Rev.ai JSON data to WTF format.

        A single monologue is converted: the first entry of ``monologues``
        when that list is non-empty, otherwise the singular ``monologue``
        object. Every ``text`` element becomes a word, and every ``punct``
        element closes the segment that is currently being accumulated. All
        words and segments are attributed to that monologue's speaker.

        Args:
            rev_ai_data: Rev.ai JSON data structure

        Returns:
            WTF document
        """
        transcript = WTFTranscript(
            text=self._extract_full_transcript_text(rev_ai_data),
            language=self._extract_language(rev_ai_data),
            duration=rev_ai_data.get("duration_seconds", 0.0),
            confidence=self._calculate_overall_confidence(rev_ai_data),
        )

        # NOTE(review): any monologue after the first is ignored here (and by
        # the text/confidence helpers) - confirm whether multi-monologue
        # payloads need to be supported.
        monologues = rev_ai_data.get("monologues", [])
        monologue = monologues[0] if monologues else rev_ai_data.get("monologue", {})

        wtf_words: List[WTFWord] = []
        wtf_segments: List[WTFSegment] = []

        # State of the segment that is currently being accumulated.
        pending_words: List[WTFWord] = []
        pending_text = ""
        pending_start = None
        pending_end = None

        for element in monologue.get("elements", []):
            element_type = element.get("type", "")

            if element_type == "text":
                element_value = element.get("value", "")
                start_time = element.get("ts", 0.0)
                # Fall back to a nominal 0.1 s duration when no end is given.
                end_time = element.get("end_ts", start_time + 0.1)

                word = WTFWord(
                    id=len(wtf_words),
                    start=start_time,
                    end=end_time,
                    text=element_value,
                    confidence=normalize_confidence(element.get("confidence", 0.0), "rev_ai"),
                    speaker=None,
                    is_punctuation=self._detect_punctuation(element_value),
                )
                wtf_words.append(word)
                pending_words.append(word)

                if pending_start is None:
                    pending_start = start_time
                pending_end = end_time
                pending_text += element_value + " "

            elif element_type == "punct" and pending_words:
                # NOTE(review): every punct element ends the segment, so a
                # payload with whitespace-only punct between words yields one
                # segment per word - confirm this is intended.
                if pending_text.strip():
                    wtf_segments.append(
                        self._make_rev_ai_segment(
                            len(wtf_segments),
                            pending_words,
                            pending_text.strip(),
                            pending_start,
                            pending_end,
                        )
                    )
                pending_words = []
                pending_text = ""
                pending_start = None
                pending_end = None

        # Flush words that were not followed by a closing punct element.
        if pending_text.strip():
            wtf_segments.append(
                self._make_rev_ai_segment(
                    len(wtf_segments),
                    pending_words,
                    pending_text.strip(),
                    pending_start,
                    pending_end,
                )
            )

        # Words exist but none produced segment text (e.g. blank values):
        # fall back to one segment spanning everything.
        if not wtf_segments and wtf_words:
            wtf_segments.append(
                self._make_rev_ai_segment(
                    0,
                    wtf_words,
                    transcript.text,
                    wtf_words[0].start,
                    wtf_words[-1].end,
                )
            )

        # Speaker information; a null speaker is treated like a missing one.
        speaker_id = monologue.get("speaker", 0)
        if speaker_id is None:
            speaker_id = 0
        speaker_key = str(speaker_id)
        if isinstance(speaker_id, (int, float)):
            # Numeric ids are shown 1-based in the label.
            speaker_label = f"Speaker {speaker_id + 1}"
        else:
            # Non-numeric ids cannot be incremented; use them verbatim.
            speaker_label = f"Speaker {speaker_id}"

        speakers: Dict[str, WTFSpeaker] = {
            speaker_key: WTFSpeaker(
                id=speaker_key,
                label=speaker_label,
                segments=[seg.id for seg in wtf_segments],
                total_time=sum(seg.end - seg.start for seg in wtf_segments),
                confidence=transcript.confidence,
            )
        }
        for segment in wtf_segments:
            segment.speaker = speaker_key
        for word in wtf_words:
            word.speaker = speaker_key

        # Metadata
        current_time = get_current_iso_timestamp()

        audio_metadata = WTFAudio(
            duration=rev_ai_data.get("duration_seconds", 0.0),
            sample_rate=None,
            channels=None,
            format=None,
            bitrate=None,
        )

        # Job fields copied into metadata.options under the same key name.
        passthrough_keys = (
            "status",
            "language",
            "transcriber",
            "verbatim",
            "filter_profanity",
            "remove_disfluencies",
            "delete_after_seconds",
            "skip_diarization",
            "skip_punctuation",
            "skip_automatic_punctuation",
            "speaker_channels_count",
            "custom_vocabulary_id",
            "custom_vocabulary",
            "webhook_url",
            "webhook_auth_headers",
            "metadata",
            "priority",
            "callback_url",
            "media_url",
            "media_url_ttl",
            "failure",
            "failure_detail",
            "warnings",
        )
        options: Dict[str, Any] = {"job_id": rev_ai_data.get("id")}
        for key in passthrough_keys:
            options[key] = rev_ai_data.get(key)

        metadata = WTFMetadata(
            created_at=rev_ai_data.get("created_on", current_time),
            processed_at=current_time,
            provider=self.provider_name,
            model=self._extract_model(rev_ai_data),
            processing_time=rev_ai_data.get("processing_time_seconds"),
            audio=audio_metadata,
            options=options,
        )
        # Drop options that the payload did not provide.
        metadata.options = {k: v for k, v in metadata.options.items() if v is not None}

        quality = self._calculate_quality_metrics(rev_ai_data, wtf_words)

        # Keep the full raw response so convert_from_wtf can restore
        # Rev.ai-specific fields. The dict is stored by reference, not copied.
        extensions = {"rev_ai_raw_response": rev_ai_data}

        return WTFDocument(
            transcript=transcript,
            segments=wtf_segments,
            metadata=metadata,
            words=wtf_words if wtf_words else None,
            speakers=speakers,
            alternatives=None,
            enrichments=None,
            extensions=extensions,
            quality=quality,
            streaming=None,
        )

    def _make_rev_ai_segment(
        self,
        segment_id: int,
        words: List[WTFWord],
        text: str,
        start: Any,
        end: Any,
    ) -> WTFSegment:
        """Build one speaker-less segment whose confidence is the mean of its words.

        ``start`` and ``end`` may be ``None``, in which case 0.0 is used.
        """
        mean_confidence = sum(w.confidence for w in words) / len(words) if words else 0.0
        return WTFSegment(
            id=segment_id,
            start=float(start) if start is not None else 0.0,
            end=float(end) if end is not None else 0.0,
            text=text,
            confidence=mean_confidence,
            speaker=None,
            words=[w.id for w in words],
        )



[docs]
    def convert_from_wtf(self, wtf_doc: WTFDocument) -> Dict[str, Any]:
        """
        Convert WTF document to Rev.ai JSON format.

        Args:
            wtf_doc: WTF document

        Returns:
            Rev.ai JSON data structure
        """
        # Reconstruct Rev.ai structure
        rev_ai_data: Dict[str, Any] = {
            "id": wtf_doc.metadata.options.get("job_id", "wtf-converted-id"),
            "status": "transcribed",
            "created_on": wtf_doc.metadata.created_at,
            "duration_seconds": wtf_doc.transcript.duration,
            "language": (
                wtf_doc.transcript.language.split("-")[0]
                if "-" in wtf_doc.transcript.language
                else wtf_doc.transcript.language
            ),
            "monologue": {"speaker": 0, "elements": []},  # Default speaker
        }

        # Convert words to elements
        if wtf_doc.words:
            for word in wtf_doc.words:
                # Add text element
                rev_ai_data["monologue"]["elements"].append(
                    {
                        "type": "text",
                        "value": word.text,
                        "ts": word.start,
                        "end_ts": word.end,
                        "confidence": word.confidence,
                    }
                )

                # Add punctuation if it's punctuation
                if word.is_punctuation:
                    rev_ai_data["monologue"]["elements"].append(
                        {
                            "type": "punct",
                            "value": word.text,
                            "ts": word.end,
                            "end_ts": word.end + 0.1,
                        }
                    )

        # Merge extensions back if available
        if wtf_doc.extensions and "rev_ai_raw_response" in wtf_doc.extensions:
            original_raw = wtf_doc.extensions["rev_ai_raw_response"]
            # This is a simplistic merge; a real implementation might be more granular
            rev_ai_data.update(original_raw)
            # Ensure our converted data overrides the raw where appropriate
            rev_ai_data["duration_seconds"] = wtf_doc.transcript.duration
            rev_ai_data["language"] = (
                wtf_doc.transcript.language.split("-")[0]
                if "-" in wtf_doc.transcript.language
                else wtf_doc.transcript.language
            )
            if wtf_doc.words:
                rev_ai_data["monologue"]["elements"] = [
                    {
                        "type": "text",
                        "value": word.text,
                        "ts": word.start,
                        "end_ts": word.end,
                        "confidence": word.confidence,
                    }
                    for word in wtf_doc.words
                ]

        return rev_ai_data


    def _extract_full_transcript_text(self, rev_ai_data: Dict[str, Any]) -> str:
        """Extract full transcript text from Rev.ai elements."""
        # RevAI returns 'monologues' (plural) in the API response
        monologues = rev_ai_data.get("monologues", [])
        if monologues:
            monologue = monologues[0]  # Use first monologue
        else:
            monologue = rev_ai_data.get("monologue", {})  # Fallback for singular
        elements = monologue.get("elements", [])
        text_elements = [elem.get("value", "") for elem in elements if elem.get("type") == "text"]
        full_text = " ".join(text_elements)
        return full_text if full_text.strip() else "No transcription available"

    def _extract_language(self, rev_ai_data: Dict[str, Any]) -> str:
        """Extract and normalize the language code from Rev.ai data.

        A missing, null or empty ``language`` falls back to ``"en"``, so an
        explicit JSON ``null`` no longer becomes the literal string ``"none"``.
        The code is lower-cased; if ``is_valid_bcp47`` rejects it, ``"-us"``
        is appended to the code as given.
        """
        lang = str(rev_ai_data.get("language") or "en").lower()
        if is_valid_bcp47(lang):
            return lang
        # NOTE(review): the suffix is added to whatever code was supplied, so
        # a rejected non-English code also receives "-us" - confirm intent.
        return f"{lang}-us"

    def _extract_model(self, rev_ai_data: Dict[str, Any]) -> str:
        """Extract model name from Rev.ai data."""
        transcriber = str(rev_ai_data.get("transcriber", "default"))
        return f"rev-ai-{transcriber}"

    def _calculate_overall_confidence(self, rev_ai_data: Dict[str, Any]) -> float:
        """Calculate overall confidence from Rev.ai data."""
        # RevAI returns 'monologues' (plural) in the API response
        monologues = rev_ai_data.get("monologues", [])
        if monologues:
            monologue = monologues[0]  # Use first monologue
        else:
            monologue = rev_ai_data.get("monologue", {})  # Fallback for singular
        elements = monologue.get("elements", [])
        if not elements:
            return 0.0

        text_elements = [elem for elem in elements if elem.get("type") == "text"]
        if not text_elements:
            return 0.0

        confidences = [elem.get("confidence", 0.0) for elem in text_elements]
        return sum(confidences) / len(confidences) if confidences else 0.0

    def _detect_punctuation(self, word_text: str) -> bool:
        """Simple check to see if a word is primarily punctuation."""
        return bool(re.fullmatch(r"^\W+$", word_text))

    def _calculate_quality_metrics(
        self, rev_ai_data: Dict[str, Any], wtf_words: List[WTFWord]
    ) -> WTFQuality:
        """Summarise word confidences and provider warnings as a ``WTFQuality``.

        ``average_confidence`` is the mean word confidence (0.0 without words),
        ``low_confidence_words`` counts words whose confidence is below 0.5,
        and ``processing_warnings`` is the payload's ``warnings`` entry (an
        empty list when the key is absent). All audio-related fields are left
        as ``None``.
        """
        scores = [word.confidence for word in wtf_words]
        below_threshold = len([score for score in scores if score < 0.5])
        if scores:
            mean_score = sum(scores) / len(scores)
        else:
            mean_score = 0.0

        return WTFQuality(
            audio_quality=None,
            background_noise=None,
            multiple_speakers=None,
            overlapping_speech=None,
            silence_ratio=None,
            average_confidence=mean_score,
            low_confidence_words=below_threshold,
            processing_warnings=rev_ai_data.get("warnings", []),
        )