Source code for wtf_transcript_converter.utils.language_utils

"""
Language utility functions for WTF transcript converter.

This module provides utilities for language code validation and normalization.
"""

import re


def is_valid_bcp47(language_code: str) -> bool:
    """
    Validate BCP-47 language code format.

    This is a simplified structural check, not a full BCP-47 parser: the
    code is lowercased and must consist of a 2-3 letter primary language
    subtag followed by zero or more hyphen-separated alphanumeric subtags
    of 1-8 characters each. Subtag order and registry membership are not
    checked.

    Args:
        language_code: Language code to validate

    Returns:
        True if valid BCP-47 format, False otherwise (including when
        ``language_code`` is not a string)
    """
    if not isinstance(language_code, str):
        return False

    # A single repeated subtag group accepts the same tags as separate
    # region/script/variant groups would once the input is lowercased, and
    # avoids the ambiguous backtracking of several overlapping groups.
    pattern = r"[a-z]{2,3}(?:-[a-z0-9]{1,8})*"

    # fullmatch, unlike match with a trailing "$", rejects a trailing newline.
    return re.fullmatch(pattern, language_code.lower()) is not None
# Full language names and bare language codes mapped to a default
# language-region tag. Keys are lowercase because lookups are done on the
# lowercased input.
_LANGUAGE_ALIASES = {
    # Full language names
    "english": "en-US",
    "spanish": "es-ES",
    "french": "fr-FR",
    "german": "de-DE",
    "italian": "it-IT",
    "portuguese": "pt-BR",
    "chinese": "zh-CN",
    "japanese": "ja-JP",
    "korean": "ko-KR",
    "russian": "ru-RU",
    # Bare language codes
    "en": "en-US",
    "es": "es-ES",
    "fr": "fr-FR",
    "de": "de-DE",
    "it": "it-IT",
    "pt": "pt-BR",
    "zh": "zh-CN",
    "ja": "ja-JP",
    "ko": "ko-KR",
    "ru": "ru-RU",
}


def _apply_bcp47_casing(tag: str) -> str:
    """Apply conventional BCP-47 casing to a lowercase, hyphen-separated tag."""
    subtags = tag.split("-")
    cased = [subtags[0]]
    # Subtags following a single-character subtag (extension / private use)
    # are conventionally left lowercase.
    after_singleton = len(subtags[0]) == 1
    for subtag in subtags[1:]:
        if after_singleton:
            cased.append(subtag)
        elif len(subtag) == 1:
            after_singleton = True
            cased.append(subtag)
        elif len(subtag) == 2 and subtag.isascii() and subtag.isalpha():
            # Two-letter subtag: region, written in uppercase (en-US).
            cased.append(subtag.upper())
        elif len(subtag) == 4 and subtag.isascii() and subtag.isalpha():
            # Four-letter subtag: script, written in title case (zh-Hans).
            cased.append(subtag.title())
        else:
            cased.append(subtag)
    return "-".join(cased)


def normalize_language_code(language_code: str) -> str:
    """
    Normalize language code to standard format.

    The input is lowercased and underscores are replaced with hyphens.
    Known full language names ("english") and bare language codes ("en")
    are mapped to a default language-region tag ("en-US"). Any other value
    is returned with conventional BCP-47 casing applied, so "en_us" and
    "EN-us" both become "en-US". Values that do not look like language
    tags (for example an unknown language name) are returned lowercased.

    Args:
        language_code: Language code to normalize

    Returns:
        Normalized language code
    """
    # Lowercase first so alias lookup is case-insensitive, and accept the
    # underscore separator form (en_us).
    normalized = language_code.lower().replace("_", "-")

    alias = _LANGUAGE_ALIASES.get(normalized)
    if alias is not None:
        return alias

    # Restore conventional casing so results are consistent with the alias
    # table values (en-US rather than en-us).
    return _apply_bcp47_casing(normalized)