# Source code for wtf_transcript_converter.core.validator

"""
WTF document validation functions.

This module provides validation functions for WTF documents and their components.
"""

import re
from typing import List, Tuple

from .models import WTFDocument



# [docs]
def validate_wtf_document(doc: WTFDocument) -> Tuple[bool, List[str]]:
    """
    Run every document-level check and report the combined outcome.

    The document is first serialized with ``model_dump()``; a failure there is
    recorded as an error instead of being raised. The custom consistency
    checks are then run in a fixed order and their messages are appended.

    Args:
        doc: WTF document to validate

    Returns:
        Tuple of (is_valid, list_of_errors); ``is_valid`` is True only when
        the list of errors is empty.
    """
    problems: List[str] = []

    # Serialization acts as a smoke test on top of the model's own validation.
    try:
        doc.model_dump()
    except Exception as exc:
        problems.append(f"Document serialization error: {str(exc)}")

    # The order of the checks determines the order of the reported messages.
    checks = (
        _validate_transcript_consistency,
        _validate_timing_consistency,
        _validate_speaker_consistency,
        _validate_word_segment_consistency,
        _validate_confidence_scores,
    )
    for check in checks:
        problems.extend(check(doc))

    return not problems, problems



def _validate_transcript_consistency(doc: WTFDocument) -> List[str]:
    """
    Compare the transcript's text and duration against the segments.

    Returns:
        A list of error messages; empty when the transcript agrees with the
        segments.
    """
    problems = []
    segments = doc.segments
    transcript = doc.transcript

    # The transcript text must equal the segment texts joined by single
    # spaces, ignoring leading/trailing whitespace on both sides.
    joined_text = " ".join(segment.text for segment in segments)
    if transcript.text.strip() != joined_text.strip():
        problems.append("Transcript text does not match concatenated segment text")

    # Without segments there is no end time to compare the duration with.
    if not segments:
        return problems

    # Up to 5 seconds of drift is tolerated for providers like Deepgram that
    # may have silence at the end of the audio.
    latest_end = max(segment.end for segment in segments)
    drift = abs(transcript.duration - latest_end)
    if drift > 5.0:
        problems.append(
            f"Transcript duration ({transcript.duration}) does not match segment timing ({latest_end})"
        )

    return problems


def _validate_timing_consistency(doc: WTFDocument) -> List[str]:
    """
    Check start/end ordering of segments and words, and segment overlap.

    Returns:
        A list of error messages: first segments whose start is not before
        their end, then adjacent segment pairs that overlap, then words whose
        start is not before their end.
    """
    segments = doc.segments

    # Each segment must start strictly before it ends.
    problems = [
        f"Segment {idx}: start time ({seg.start}) must be before end time ({seg.end})"
        for idx, seg in enumerate(segments)
        if seg.start >= seg.end
    ]

    # Only neighbouring segments (in list order) are compared for overlap.
    for idx, (current, following) in enumerate(zip(segments, segments[1:])):
        if current.end > following.start:
            problems.append(f"Segments {idx} and {idx + 1} have overlapping times")

    # Word-level timing is checked only when words are present.
    for word in doc.words or ():
        if word.start >= word.end:
            problems.append(
                f"Word {word.id}: start time ({word.start}) must be before end time ({word.end})"
            )

    return problems


def _validate_speaker_consistency(doc: WTFDocument) -> List[str]:
    """
    Check that speakers referenced by segments and words are declared.

    Returns:
        A list of error messages, segment errors first and word errors after.
        The list is empty when the document has no speakers or no segments.
    """
    problems: List[str] = []

    if not (doc.speakers and doc.segments):
        return problems

    # Speaker references are compared as strings against the speakers' keys.
    known_ids = set(doc.speakers.keys())

    def _is_unknown(item) -> bool:
        """Return True when *item* names a speaker that is not declared."""
        return item.speaker is not None and str(item.speaker) not in known_ids

    problems.extend(
        f"Segment {seg.id} references invalid speaker {seg.speaker}"
        for seg in doc.segments
        if _is_unknown(seg)
    )

    problems.extend(
        f"Word {word.id} references invalid speaker {word.speaker}"
        for word in doc.words or ()
        if _is_unknown(word)
    )

    return problems


def _validate_word_segment_consistency(doc: WTFDocument) -> List[str]:
    """
    Validate that word references in segments resolve and fit the segment.

    For every word ID listed in a segment's ``words``, the word must exist in
    ``doc.words`` and its ``start``/``end`` must lie within the segment's
    ``start``/``end``.

    Args:
        doc: WTF document to validate

    Returns:
        A list of error messages; empty when the document has no words, no
        segments, or when every reference is consistent.
    """
    errors: List[str] = []

    if not doc.words or not doc.segments:
        return errors

    # Index words by ID once so each reference is a dictionary lookup rather
    # than a scan of the whole word list. setdefault keeps the first word for
    # a duplicated ID, matching a front-to-back search.
    # NOTE(review): assumes word IDs are hashable (e.g. int or str) - confirm
    # against the model definition.
    words_by_id: dict = {}
    for word in doc.words:
        words_by_id.setdefault(word.id, word)

    for segment in doc.segments:
        # Segments without a word list have nothing to cross-check.
        for word_id in segment.words or ():
            word = words_by_id.get(word_id)
            if word is None:
                errors.append(f"Segment {segment.id} references invalid word {word_id}")
            elif word.start < segment.start or word.end > segment.end:
                # The word must be fully contained in the segment's time span.
                errors.append(
                    f"Word {word_id} timing is outside segment {segment.id} timing"
                )

    return errors


def _validate_confidence_scores(doc: WTFDocument) -> List[str]:
    """
    Check that every confidence value in the document is within [0.0, 1.0].

    Returns:
        A list of error messages in this order: transcript, segments, words,
        speakers.
    """

    def _out_of_range(score) -> bool:
        """Return True when *score* is not within the closed range [0.0, 1.0]."""
        return not (0.0 <= score <= 1.0)

    problems = []

    # Transcript-level confidence.
    transcript = doc.transcript
    if _out_of_range(transcript.confidence):
        problems.append(
            f"Transcript confidence ({transcript.confidence}) must be between 0.0 and 1.0"
        )

    # Segment-level confidence.
    problems.extend(
        f"Segment {seg.id} confidence ({seg.confidence}) must be between 0.0 and 1.0"
        for seg in doc.segments
        if _out_of_range(seg.confidence)
    )

    # Word-level confidence, only when words are present.
    problems.extend(
        f"Word {word.id} confidence ({word.confidence}) must be between 0.0 and 1.0"
        for word in doc.words or ()
        if _out_of_range(word.confidence)
    )

    # Speaker-level confidence, only when speakers are present.
    if doc.speakers:
        problems.extend(
            f"Speaker {spk.id} confidence ({spk.confidence}) must be between 0.0 and 1.0"
            for spk in doc.speakers.values()
            if _out_of_range(spk.confidence)
        )

    return problems



# [docs]
def validate_confidence_score(confidence: float, context: str = "") -> bool:
    """
    Validate that a confidence score is in the valid range [0.0, 1.0].

    Values that cannot be compared with a float (for example ``None`` or a
    string) are reported as invalid instead of raising ``TypeError``. NaN is
    invalid because it fails both range comparisons.

    Args:
        confidence: Confidence score to validate
        context: Optional context for error messages (currently unused; kept
            so existing callers continue to work)

    Returns:
        True if valid, False otherwise
    """
    try:
        return 0.0 <= confidence <= 1.0
    except TypeError:
        # A value that cannot be ordered against a float is not a valid score.
        return False




# [docs]
def validate_timestamp(timestamp: str) -> bool:
    """
    Validate ISO 8601 timestamp format.

    The check delegates to ``datetime.fromisoformat`` after rewriting ``Z``
    as ``+00:00``. Non-string input is reported as invalid rather than
    raising.

    Args:
        timestamp: Timestamp string to validate

    Returns:
        True if valid, False otherwise
    """
    from datetime import datetime

    # Without this guard, None or a number would raise AttributeError on
    # .replace(), which the ValueError handler below does not catch.
    if not isinstance(timestamp, str):
        return False

    try:
        datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
    except ValueError:
        return False
    return True




# [docs]
def validate_language_code(language_code: str) -> bool:
    """
    Validate BCP-47 language code format.

    This is a shape check only: a 2-3 letter primary subtag followed by any
    number of hyphen-separated alphanumeric subtags of 1-8 characters. The
    comparison is case-insensitive because the input is lowercased first.
    Subtags are not checked against any registry.

    Args:
        language_code: Language code to validate

    Returns:
        True if valid, False otherwise
    """
    # Non-string input has no .lower() and cannot be a language code.
    if not isinstance(language_code, str):
        return False

    # fullmatch is used instead of match with "$" because "$" also matches
    # just before a trailing newline, which would let "en\n" through.
    pattern = r"[a-z]{2,3}(?:-[a-z0-9]{1,8})*"
    return re.fullmatch(pattern, language_code.lower()) is not None