Source code for services.whisper_service

"""
Service for audio transcription with Whisper medium
Stable version for production
"""

import os
import logging
import tempfile
import wave
from datetime import datetime
from typing import Dict, Any, Optional

# Create the module logger first so the import fallback below can report
# through it.
logger = logging.getLogger(__name__)

# openai-whisper is an optional dependency: when it is missing the module still
# imports, and WHISPER_AVAILABLE records that transcription is unavailable.
try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False
    # Log through the module logger rather than logging.warning(): the
    # module-level helper logs on the root logger and calls basicConfig() when
    # the root logger has no handler, configuring logging as an import side effect.
    logger.warning("OpenAI Whisper non disponibile - trascrizione disabilitata")



[docs]
class WhisperService:
    """
    Service for audio transcription with Whisper medium
    """
    

[docs]
    def __init__(self):
        """Initialise the service state and eagerly try to load the model."""
        # "medium" is used as a balance between quality and speed.
        self.model_name = "medium"
        # Remains None when Whisper is unavailable or the load attempt fails.
        self.model = None
        self._load_model()

    
    def _load_model(self):
        """
        Load the Whisper model named by ``self.model_name`` into ``self.model``.

        When Whisper could not be imported the method only logs an error and
        leaves ``self.model`` as it was. Any ``Exception`` raised while loading
        is logged and ``self.model`` is reset to ``None``.
        """
        if WHISPER_AVAILABLE:
            try:
                logger.info(f"Loading Whisper model {self.model_name}...")
                loaded_model = whisper.load_model(self.model_name)
                self.model = loaded_model
                logger.info(f"Whisper model {self.model_name} loaded successfully")
            except Exception as exc:
                # Broad catch: a failed load must not break service construction.
                logger.error(f"Error loading Whisper model: {str(exc)}")
                self.model = None
        else:
            logger.error("Whisper not available, cannot load model")
    

[docs]
    def transcribe_audio_file(self, audio_file_path: str, language: str = "it") -> Dict[str, Any]:
        """
        Transcribe an audio file with the loaded Whisper model.

        A missing model, a missing file, and any ``Exception`` raised while
        transcribing are reported through the returned dictionary rather than
        raised to the caller.

        :param audio_file_path: Path to the audio file
        :type audio_file_path: str
        :param language: Language for transcription (default: Italian)
        :type language: str
        :return: On success a dictionary with ``success``, ``transcript``,
            ``confidence``, ``language``, ``duration``, ``segments``, ``model``
            and ``timestamp``; on failure ``success`` is ``False`` and the
            dictionary carries ``error``, an empty ``transcript`` and a
            ``confidence`` of 0.0
        :rtype: Dict[str, Any]
        """
        # Local import: only needed here to build the UTC timestamp without the
        # deprecated datetime.utcnow().
        from datetime import timezone

        def failure(message: str) -> Dict[str, Any]:
            # Common shape of every unsuccessful result.
            return {
                'success': False,
                'error': message,
                'transcript': '',
                'confidence': 0.0
            }

        if not self.model:
            logger.error("Whisper model not loaded")
            return failure('Whisper model not available')

        if not os.path.exists(audio_file_path):
            logger.error(f"Audio file not found: {audio_file_path}")
            return failure('Audio file not found')

        try:
            logger.info(f"Starting transcription for file: {audio_file_path}")

            # NOTE(review): openai-whisper's transcribe() appears to drop
            # beam_size/patience whenever the temperature is > 0 and to sample
            # with best_of instead, so with temperature=0.1 beam search is
            # likely never used - confirm against the installed version
            # before tuning these options.
            result = self.model.transcribe(
                audio_file_path,
                language=language,
                task="transcribe",
                temperature=0.1,  # Low temperature for more stable output
                best_of=5,        # Best of 5 attempts
                beam_size=5,      # Beam search for better quality
                patience=1.0,     # Patience for beam search
                condition_on_previous_text=False  # Do not condition on previous text
            )

            transcript = result.get('text', '').strip()
            segments = result.get('segments') or []

            # Heuristic confidence: each segment's avg_logprob is shifted by +1
            # and clamped to [0, 1] (a linear mapping, not exp(logprob)), then
            # averaged over the segments that report a value.
            scores = [
                min(1.0, max(0.0, segment['avg_logprob'] + 1.0))
                for segment in segments
                if 'avg_logprob' in segment
            ]
            confidence = sum(scores) / len(scores) if scores else 0.0

            # Whisper's result normally has no 'duration' key, so fall back to
            # the end time of the last segment instead of always reporting 0.0.
            duration = result.get('duration')
            if duration is None:
                duration = segments[-1].get('end', 0.0) if segments else 0.0

            # Normalise whitespace, capitalisation and final punctuation.
            cleaned_transcript = self._clean_transcript(transcript)

            logger.info(f"Trascrizione completata: {len(cleaned_transcript)} caratteri")

            return {
                'success': True,
                'transcript': cleaned_transcript,
                'confidence': confidence,
                'language': language,
                'duration': duration,
                'segments': segments,
                'model': self.model_name,
                # Naive UTC ISO string, same format datetime.utcnow() produced.
                'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
            }

        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Error during transcription: {str(e)}")
            return failure(str(e))

    

[docs]
    def transcribe_audio_blob(self, audio_blob: bytes, format: str = "wav", language: str = "it") -> Dict[str, Any]:
        """
        Transcribe in-memory audio by writing it to a temporary file first.

        The bytes are written to a temporary file whose suffix is ``.<format>``,
        the file is passed to :meth:`transcribe_audio_file`, and the file is
        removed afterwards whether or not transcription succeeded.

        :param audio_blob: Audio data in bytes
        :type audio_blob: bytes
        :param format: Audio format (wav, mp3, etc.)
        :type format: str
        :param language: Language for transcription (default: Italian)
        :type language: str
        :return: The dictionary returned by :meth:`transcribe_audio_file`, or a
            failure dictionary (``success`` False, ``error``, empty
            ``transcript``, ``confidence`` 0.0) if handling the blob raised
        :rtype: Dict[str, Any]
        """
        temp_path = None
        try:
            # delete=False so the file survives the `with` block and can be
            # reopened by path for transcription once it has been closed.
            with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as temp_file:
                # Record the path before writing so a failed write is still cleaned up.
                temp_path = temp_file.name
                temp_file.write(audio_blob)

            return self.transcribe_audio_file(temp_path, language)

        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Errore processing audio blob: {str(e)}")
            return {
                'success': False,
                'error': str(e),
                'transcript': '',
                'confidence': 0.0
            }
        finally:
            # Best-effort cleanup on every path; a failed removal is logged
            # instead of being silently swallowed.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove temporary file {temp_path}: {cleanup_error}")

    
    def _clean_transcript(self, text: str) -> str:
        """
        Text cleaning for medical transcription use
        
        :param text: Raw transcript text
        :type text: str
        :returns: Cleaned text
        :rtype: str
        """
        if not text:
            return ""
        
        # Rimuovi spazi multipli
        text = ' '.join(text.split())
        
        # Capitalizza la prima lettera
        if text:
            text = text[0].upper() + text[1:]
        
        # Assicurati che finisca con punteggiatura
        if text and text[-1] not in '.!?':
            text += '.'
        
        return text
    

[docs]
    def test_transcription(self) -> Dict[str, Any]:
        """
        Test the transcription service
        
        :returns: Dictionary with test results
        :rtype: Dict[str, Any]
        """
        if not self.model:
            return {
                'success': False,
                'error': 'Modello non caricato',
                'model': self.model_name
            }
        
        return {
            'success': True,
            'model': self.model_name,
            'available': True,
            'test_passed': True
        }

    

[docs]
    def get_supported_formats(self) -> list:
        """
        Supported audio formats
        
        :returns: List of supported audio formats
        :rtype: list
        """
        return ['wav', 'mp3', 'flac', 'ogg', 'm4a', 'aac']




# Singleton instance of the service, created when this module is imported.
# Constructing it runs WhisperService.__init__, which attempts to load the model.
whisper_service = WhisperService()