# Source code for services.transcription

"""
Service for audio transcription using Whisper
"""

import os
import tempfile
import re
from typing import Tuple, Optional
from django.core.files.base import ContentFile
from core.models import AudioTranscript, Encounter
import logging

logger = logging.getLogger(__name__)


# [docs] (link marker carried over from the rendered documentation page)
def check_dependencies():
    """
    Verify that all dependencies required for transcription can be imported.

    Attempts to import numpy, torch, whisper and librosa, and collects the
    pip package name of every module whose import fails.

    :returns: True if all dependencies are available
    :rtype: bool
    :raises ImportError: If one or more dependencies are missing; the message
        lists them and includes a ``pip install`` command for them
    """
    # Function-scope import so this block does not rely on a new file-level import.
    import importlib

    # (importable module name, pip package name) -- the two differ for Whisper.
    required = (
        ("numpy", "numpy"),
        ("torch", "torch"),
        ("whisper", "openai-whisper"),
        ("librosa", "librosa"),
    )

    missing_deps = []
    for module_name, package_name in required:
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing_deps.append(package_name)

    if missing_deps:
        deps_str = ", ".join(missing_deps)
        # pip takes space-separated requirements; the comma-joined list is
        # fine for the human-readable part but is rejected by pip
        # ("torch," is not a valid requirement), so build the command separately.
        install_args = " ".join(missing_deps)
        raise ImportError(
            f"Dipendenze mancanti: {deps_str}. Installa con: pip install {install_args}"
        )

    return True



# [docs] (link marker carried over from the rendered documentation page)
class TranscriptionService:
    """
    Service for audio transcription using OpenAI's Whisper model.

    Handles the complete process of transcribing audio files to text,
    including preprocessing, transcribing, and post-processing of the result.

    :ivar model_size: Size name of the Whisper model to load
    :type model_size: str
    :ivar model: Instance of the loaded Whisper model (None until first use)
    :type model: Optional[Any]
    """

    # Ordered normalization rules applied by ``_clean_transcript_text``.
    # Compiled once at class creation instead of on every call; all rules are
    # case-insensitive. Order matters: unit phrases are rewritten before the
    # numeric "N over/su/slash M" ratios and the percent rules.
    _TEXT_REPLACEMENTS = tuple(
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r"\bmilligrams?\s+per\s+deciliter\b", "mg/dl"),
            # [io]? also accepts the Italian singular ("milligrammo").
            (r"\bmilligramm[io]?\s+per\s+decilitro\b", "mg/dl"),
            (r"\bmg?\s+per\s+decilitro\b", "mg/dl"),
            (r"\bmillimeters?\s+of\s+mercury\b", "mmHg"),
            # The former "merc[ur]io" was a character class and could never
            # match the real word "mercurio"; "ur" is now tried first while the
            # two single-letter spellings accepted before remain accepted.
            (r"\bmillimetr[io]?\s+di\s+merc(?:ur|[ur])io\b", "mmHg"),
            (r"\bmm?\s+di\s+merc(?:ur|[ur])io\b", "mmHg"),
            (r"\b(\d+)\s+over\s+(\d+)\b", r"\1/\2"),
            (r"\b(\d+)\s+su\s+(\d+)\b", r"\1/\2"),
            (r"\b(\d+)\s+slash\s+(\d+)\b", r"\1/\2"),
            (r"\bbeats?\s+per\s+minute\b", "bpm"),
            (r"\bbattit[io]?\s+al\s+minuto\b", "bpm"),
            (r"\bdegrees?\s+celsius\b", "°C"),
            (r"\bgrad[io]?\s+celsius\b", "°C"),
            (r"\b(\d+)\s+percent\b", r"\1%"),
            (r"\bper\s+cento\b", "%"),
        )
    )

    def __init__(self, model_size: str = "base"):
        """
        Initializes the transcription service with a Whisper model.

        The model itself is not loaded here; see ``_load_model``.

        :param model_size: Size of the Whisper model ("tiny", "base", "small", "medium", "large")
        :type model_size: str
        """
        self.model_size = model_size
        self.model = None

    def _load_model(self):
        """
        Loads the Whisper model only when necessary (lazy loading).

        Does nothing if a model is already loaded.

        :raises ImportError: If the required dependencies are not available
        :raises Exception: If an error occurs while loading the model
        """
        if self.model is not None:
            return

        try:
            # Verify every dependency first so the error lists all missing ones.
            check_dependencies()

            # Deferred import: Whisper is only needed once a model is loaded.
            import whisper

            self.model = whisper.load_model(self.model_size)
            logger.info(f"Modello Whisper '{self.model_size}' caricato con successo")
        except ImportError as e:
            logger.error(f"Dipendenza mancante: {e}")
            raise ImportError(f"Libreria richiesta non trovata: {e}") from e
        except Exception as e:
            logger.error(f"Errore nel caricamento del modello Whisper: {e}")
            raise

    def transcribe_audio_file(self, audio_file, encounter_id: str, language: str = "it") -> "AudioTranscript":
        """
        Transcribes an audio file and saves the result to the database.

        Creates an ``AudioTranscript`` in status ``'transcribing'``, stores the
        uploaded audio on it, runs Whisper on a temporary copy of the audio and
        finally stores the cleaned text, duration and confidence with status
        ``'completed'``. On failure after the record was created, the record is
        saved with status ``'error'`` and the error message.

        :param audio_file: Audio file to transcribe (must support ``read`` and ``seek``)
        :type audio_file: FileField
        :param encounter_id: Unique ID of the encounter associated with the transcription
        :type encounter_id: str
        :param language: Language code of the audio content (default: "it")
        :type language: str
        :returns: Transcription object saved in the database
        :rtype: AudioTranscript
        :raises ValueError: If no encounter with ``encounter_id`` exists
        :raises Exception: If an error occurs during transcription
        """
        # Initialized up front so the error handler can tell whether the
        # record was created, without inspecting locals().
        transcript = None
        try:
            # Fetch the encounter
            encounter = Encounter.objects.get(encounter_id=encounter_id)

            # Create the transcription record
            transcript = AudioTranscript.objects.create(
                encounter=encounter,
                status='transcribing',
                language=language
            )

            # Store the audio file on the record
            transcript.audio_file.save(
                f"audio_{transcript.transcript_id}.mp3",
                ContentFile(audio_file.read()),
                save=True
            )

            temp_file_path = None
            try:
                # delete=False: the file is reopened by path after this
                # handle is closed, so it is removed manually in ``finally``.
                with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
                    # Record the path first so a failed write is still cleaned up.
                    temp_file_path = temp_file.name
                    audio_file.seek(0)  # rewind: the content was already read above
                    temp_file.write(audio_file.read())
                    temp_file.flush()

                # Import librosa only when it is actually needed
                try:
                    import librosa
                except ImportError as e:
                    logger.error(f"Dipendenza mancante: {e}")
                    raise ImportError(f"Libreria richiesta non trovata: {e}") from e

                # Load and preprocess the audio
                audio, sr = librosa.load(temp_file_path, sr=16000, mono=True)
                transcript.audio_duration = len(audio) / sr

                # Load the model if not done yet
                self._load_model()

                # Transcription with Whisper
                result = self.model.transcribe(audio, language=language)

                # Post-processing of the text
                cleaned_text = self._clean_transcript_text(result["text"])

                # Update the record
                transcript.transcript_text = cleaned_text
                transcript.confidence_score = self._calculate_confidence(result)
                transcript.status = 'completed'
                transcript.save()

                logger.info(f"Trascrizione completata per transcript {transcript.transcript_id}")
                return transcript

            finally:
                # Best-effort removal of the temporary file; a cleanup failure
                # is logged but never replaces the outcome of the transcription.
                if temp_file_path is not None:
                    try:
                        if os.path.exists(temp_file_path):
                            os.unlink(temp_file_path)
                    except Exception as cleanup_error:
                        logger.warning(f"Impossibile rimuovere file temporaneo {temp_file_path}: {cleanup_error}")

        except Encounter.DoesNotExist as e:
            logger.error(f"Encounter {encounter_id} non trovato")
            raise ValueError(f"Encounter {encounter_id} non esistente") from e
        except Exception as e:
            logger.error(f"Errore durante la trascrizione: {e}")
            if transcript is not None:
                transcript.status = 'error'
                transcript.error_message = str(e)
                transcript.save()
            raise

    def _clean_transcript_text(self, text: str) -> str:
        """
        Cleans and normalizes the transcribed text for medical terminology

        Applies the ``_TEXT_REPLACEMENTS`` rules in order (spoken units to
        symbols, "N over M" to "N/M", percentages) and strips surrounding
        whitespace.

        :param text: Raw transcribed text
        :type text: str
        :returns: Cleaned and normalized text
        :rtype: str
        """
        for pattern, replacement in self._TEXT_REPLACEMENTS:
            text = pattern.sub(replacement, text)
        return text.strip()

    def _calculate_confidence(self, whisper_result) -> float:
        """
        Calculates a confidence score based on Whisper results

        Uses the mean of the per-segment ``avg_logprob`` values, each shifted
        by +1 and clamped to [0, 1]. When no segment carries a usable value,
        falls back to a coarse score derived from the text length.

        :param whisper_result: Result dictionary from Whisper transcription
        :type whisper_result: dict
        :returns: Confidence score between 0 and 1
        :rtype: float
        """
        # Whisper does not provide a confidence score directly, so heuristics
        # are used. ``or []`` tolerates a missing or None segment list.
        segments = whisper_result.get('segments') or []
        scores = [
            max(0.0, min(1.0, segment['avg_logprob'] + 1.0))
            for segment in segments
            if segment.get('avg_logprob') is not None
        ]
        if scores:
            return sum(scores) / len(scores)

        # Fallback: confidence based on the length of the text
        text_length = len(whisper_result.get('text') or '')
        if text_length > 100:
            return 0.8
        if text_length > 50:
            return 0.6
        return 0.4