"""
Cross-provider consistency testing.
This module tests the same audio content across multiple providers to ensure
WTF format consistency and validate standardization.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Union
from wtf_transcript_converter.core.models import WTFDocument
from wtf_transcript_converter.core.validator import validate_wtf_document
from wtf_transcript_converter.providers import (
AssemblyAIConverter,
CanaryConverter,
DeepgramConverter,
ParakeetConverter,
RevAIConverter,
WhisperConverter,
)
@dataclass
class ConsistencyResult:
    """Per-provider outcome of a cross-provider consistency run.

    One instance is produced for every converter that is exercised. When the
    conversion raises, the tester stores a placeholder instance with
    ``wtf_doc=None``, ``is_valid=False`` and every numeric field zeroed.
    """

    # Name under which the converter is registered (e.g. "whisper").
    provider: str
    # Converted document; None when the conversion step raised.
    wtf_doc: Optional["WTFDocument"]
    # Validity flag returned by the WTF validator (False on conversion failure).
    is_valid: bool
    # Validator error messages, or a single "Conversion failed: ..." entry.
    validation_errors: List[str]
    # Conversion time; the sample-data path does not measure it and stores 0.0.
    processing_time: float
    # Transcript-level confidence copied from the converted document.
    confidence_score: float
    # Number of words in the converted document (0 when it has none).
    word_count: int
    # Number of segments in the converted document.
    segment_count: int
    # Transcript duration copied from the converted document
    # (the report prints it with an "s" suffix).
    duration: float
class CrossProviderConsistencyTester:
    """Test consistency across multiple transcription providers.

    The same provider-format payload is passed to every registered converter.
    Each resulting WTF document is validated, and its headline metrics
    (confidence, word count, segment count, duration) are compared across
    providers using the sample standard deviation.
    """

    # A metric counts as consistent when its sample standard deviation across
    # the valid providers is strictly below the matching threshold.
    CONFIDENCE_STD_THRESHOLD = 0.1  # Less than 10% standard deviation
    WORD_COUNT_STD_THRESHOLD = 2  # Less than 2 words difference
    SEGMENT_COUNT_STD_THRESHOLD = 1  # Same segment count
    DURATION_STD_THRESHOLD = 1.0  # Less than 1 second difference

    def __init__(self, providers: Optional[Dict[str, Any]] = None) -> None:
        """
        Set up the converters to compare.
        Args:
            providers: Optional mapping of provider name to converter object
                (anything exposing ``convert(data)``). When omitted, the six
                built-in converters are instantiated.
        """
        if providers is not None:
            # Copy so later changes to the caller's mapping do not leak in.
            self.providers = dict(providers)
        else:
            self.providers = {
                "whisper": WhisperConverter(),
                "deepgram": DeepgramConverter(),
                "assemblyai": AssemblyAIConverter(),
                "rev-ai": RevAIConverter(),
                "canary": CanaryConverter(),
                "parakeet": ParakeetConverter(),
            }

    def test_consistency_with_sample_data(
        self, sample_data: Dict[str, Any]
    ) -> List["ConsistencyResult"]:
        """
        Test consistency across providers using sample JSON data.
        Args:
            sample_data: Sample transcription data in provider format
        Returns:
            List of consistency results for each provider, in registration
            order. A provider whose conversion or validation raises yields an
            invalid result carrying the error text instead of aborting the run.
        """
        results: List["ConsistencyResult"] = []
        for provider_name, converter in self.providers.items():
            try:
                # Convert to WTF, then validate the produced document.
                wtf_doc = converter.convert(sample_data)
                is_valid, validation_errors = validate_wtf_document(wtf_doc)
                result = ConsistencyResult(
                    provider=provider_name,
                    wtf_doc=wtf_doc,
                    is_valid=is_valid,
                    validation_errors=validation_errors,
                    processing_time=0.0,  # Not measured for sample data
                    confidence_score=wtf_doc.transcript.confidence,
                    word_count=len(wtf_doc.words) if wtf_doc.words else 0,
                    segment_count=len(wtf_doc.segments),
                    duration=wtf_doc.transcript.duration,
                )
            except Exception as e:
                # Deliberately broad: one failing provider must not prevent
                # the remaining providers from being exercised.
                result = ConsistencyResult(
                    provider=provider_name,
                    wtf_doc=None,
                    is_valid=False,
                    validation_errors=[f"Conversion failed: {str(e)}"],
                    processing_time=0.0,
                    confidence_score=0.0,
                    word_count=0,
                    segment_count=0,
                    duration=0.0,
                )
            results.append(result)
        return results

    def analyze_consistency(self, results: List["ConsistencyResult"]) -> Dict[str, Any]:
        """
        Analyze consistency across provider results.
        Args:
            results: List of consistency results
        Returns:
            Analysis report. ``status`` is ``"consistent"``, ``"inconsistent"``
            or ``"failed"`` (no valid result at all). ``metrics`` maps each
            metric name to its mean, sample standard deviation, consistency
            flag and raw values, and is empty when the status is ``"failed"``.
            ``provider_results`` is always present and covers every input
            result, valid or not.
        """
        # Built up front so the failure report can still show why each
        # provider was rejected.
        provider_results = {
            r.provider: {
                "valid": r.is_valid,
                "confidence": r.confidence_score,
                "word_count": r.word_count,
                "segment_count": r.segment_count,
                "duration": r.duration,
                "errors": r.validation_errors,
            }
            for r in results
        }

        valid_results = [r for r in results if r.is_valid]
        if not valid_results:
            return {
                "status": "failed",
                "message": "No valid results from any provider",
                "total_providers": len(results),
                "valid_providers": 0,
                "metrics": {},
                "provider_results": provider_results,
            }

        # metric name -> (values from the valid providers, std threshold)
        series = {
            "confidence": (
                [r.confidence_score for r in valid_results],
                self.CONFIDENCE_STD_THRESHOLD,
            ),
            "word_count": (
                [r.word_count for r in valid_results],
                self.WORD_COUNT_STD_THRESHOLD,
            ),
            "segment_count": (
                [r.segment_count for r in valid_results],
                self.SEGMENT_COUNT_STD_THRESHOLD,
            ),
            "duration": (
                [r.duration for r in valid_results],
                self.DURATION_STD_THRESHOLD,
            ),
        }

        metrics: Dict[str, Any] = {}
        for metric_name, (values, threshold) in series.items():
            std = self._calculate_std(values)
            metrics[metric_name] = {
                "mean": sum(values) / len(values),
                "std": std,
                "consistent": std < threshold,
                "values": values,
            }

        overall_consistent = all(m["consistent"] for m in metrics.values())
        return {
            "status": "consistent" if overall_consistent else "inconsistent",
            "total_providers": len(results),
            "valid_providers": len(valid_results),
            "metrics": metrics,
            "provider_results": provider_results,
        }

    def _calculate_std(self, values: Sequence[Union[int, float]]) -> float:
        """Calculate the sample (n - 1) standard deviation; 0.0 for fewer than two values."""
        if len(values) <= 1:
            return 0.0
        float_values = [float(x) for x in values]
        mean = sum(float_values) / len(float_values)
        variance = sum((x - mean) ** 2 for x in float_values) / (len(float_values) - 1)
        return float(variance**0.5)

    def generate_consistency_report(self, results: List["ConsistencyResult"]) -> str:
        """
        Generate a human-readable consistency report.
        Args:
            results: List of consistency results
        Returns:
            Multi-line report text. When no provider produced a valid result
            the report states the failure and still lists every provider's
            errors.
        """
        analysis = self.analyze_consistency(results)
        report = [
            "=" * 60,
            "CROSS-PROVIDER CONSISTENCY REPORT",
            "=" * 60,
            f"Status: {analysis['status'].upper()}",
            f"Valid Providers: {analysis['valid_providers']}/{analysis['total_providers']}",
        ]
        # Only the "failed" analysis carries a message.
        message = analysis.get("message")
        if message:
            report.append(f"Message: {message}")
        report.append("")

        # Metrics section
        report.append("METRICS ANALYSIS:")
        report.append("-" * 30)
        metrics = analysis.get("metrics", {})
        if not metrics:
            report.append(" No metrics available: no provider produced a valid result")
            report.append("")
        for metric_name, metric_data in metrics.items():
            report.append(f"{metric_name.replace('_', ' ').title()}:")
            report.append(f" Mean: {metric_data['mean']:.3f}")
            report.append(f" Std Dev: {metric_data['std']:.3f}")
            report.append(f" Consistent: {'✅' if metric_data['consistent'] else '❌'}")
            report.append(f" Values: {metric_data['values']}")
            report.append("")

        # Provider details
        report.append("PROVIDER DETAILS:")
        report.append("-" * 30)
        for provider, data in analysis.get("provider_results", {}).items():
            status = "✅" if data["valid"] else "❌"
            report.append(f"{provider.upper()}: {status}")
            if data["valid"]:
                report.append(f" Confidence: {data['confidence']:.3f}")
                report.append(f" Words: {data['word_count']}")
                report.append(f" Segments: {data['segment_count']}")
                report.append(f" Duration: {data['duration']:.2f}s")
            else:
                report.append(f" Errors: {', '.join(data['errors'])}")
            report.append("")
        return "\n".join(report)