Source code for services.extraction

"""
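Service for extracting structured clinical data from transcripts.

NOTE: the extraction path in this service relies on regex patterns and heuristic
rules rather than on an LLM call.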
"""

import json
import re
from typing import Dict, List, Optional, Any
from core.models import AudioTranscript, ClinicalData
import logging

logger = logging.getLogger(__name__)



[docs]
class ClinicalExtractionService:
    """
    Service for extracting structured clinical data from transcripts
    """
    

[docs]
    def __init__(self):
        """
        Initialize the extraction service
        """
        # Template for extraction based on reference projects
        self.extraction_template = {
            "informazioni_paziente": {
                "nome": "",
                "cognome": "",
                "data_nascita": "",
                "codice_fiscale": "",
                "sesso": "",
                "età": ""
            },
            "parametri_vitali": {
                "pressione_arteriosa": "",
                "frequenza_cardiaca": "",
                "temperatura": "",
                "saturazione_ossigeno": "",
                "frequenza_respiratoria": ""
            },
            "sintomi": [],
            "esami_clinici": [],
            "diagnosi": [],
            "terapie": [],
            "allergie": [],
            "storia_clinica": "",
            "note_mediche": "",
            "priorità_triage": ""
        }



[docs]
    def extract_clinical_data(self, transcript: AudioTranscript) -> ClinicalData:
        """
        Extract structured clinical data from a transcript

        The transcript text is parsed first and the ``ClinicalData`` record is only
        created once parsing has succeeded, so a parsing failure does not leave an
        empty record behind. A failure while populating or saving can still leave
        the freshly created record incomplete.

        :param transcript: AudioTranscript object to process
        :type transcript: AudioTranscript
        :returns: ClinicalData: Object with extracted clinical data
        :rtype: ClinicalData
        :raises Exception: Any error raised during extraction or persistence is
            logged with its traceback and re-raised unchanged
        """
        try:
            # Parse the text before touching the database: this step has no side
            # effects, so failing here leaves nothing to clean up.
            text = transcript.transcript_text
            extracted_data = self._extract_structured_data(text)

            # Create the clinical data record and copy the extracted values onto it
            clinical_data = ClinicalData.objects.create(
                transcript=transcript
            )
            self._populate_clinical_data_fields(clinical_data, extracted_data)

            # Compute the confidence score and persist everything
            clinical_data.confidence_score = self._calculate_extraction_confidence(extracted_data)
            clinical_data.save()

            logger.info("Estrazione completata per clinical_data %s", clinical_data.id)
            return clinical_data

        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception("Errore durante l'estrazione: %s", e)
            raise


    def _extract_structured_data(self, text: str) -> Dict[str, Any]:
        """
        Extract structured data from text using regex patterns and heuristic logic
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: Dictionary with extracted data
        :rtype: Dict[str, Any]
        """
        data = self.extraction_template.copy()
        
        # Estrazione informazioni paziente
        data["informazioni_paziente"] = self._extract_patient_info(text)
        
        # Estrazione parametri vitali
        data["parametri_vitali"] = self._extract_vital_signs(text)
        
        # Estrazione sintomi
        data["sintomi"] = self._extract_symptoms(text)
        
        # Estrazione esami
        data["esami_clinici"] = self._extract_clinical_tests(text)
        
        # Estrazione diagnosi
        data["diagnosi"] = self._extract_diagnoses(text)
        
        # Estrazione terapie
        data["terapie"] = self._extract_therapies(text)
        
        # Estrazione allergie
        data["allergie"] = self._extract_allergies(text)
        
        # Estrazione storia clinica
        data["storia_clinica"] = self._extract_medical_history(text)
        
        # Estrazione note mediche
        data["note_mediche"] = self._extract_medical_notes(text)
        
        # Determinazione priorità triage
        data["priorità_triage"] = self._determine_triage_priority(data)
        
        return data

    def _extract_patient_info(self, text: str) -> Dict[str, str]:
        """
        Extract patient information
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: Dictionary with patient information
        :rtype: Dict[str, str]
        """
        info = {}
        
        # Nome e cognome
        name_patterns = [
            r"il\s+paziente\s+(\w+)\s+(\w+)",
            r"la\s+paziente\s+(\w+)\s+(\w+)",
            r"signor[ea]?\s+(\w+)\s+(\w+)",
            r"nome\s*:\s*(\w+)\s+(\w+)"
        ]
        
        for pattern in name_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                info["nome"] = match.group(1)
                info["cognome"] = match.group(2)
                break
        
        # Età
        age_match = re.search(r"(\d{1,3})\s*anni?", text, re.IGNORECASE)
        if age_match:
            info["età"] = age_match.group(1)
        
        # Sesso
        if re.search(r"\b(maschio|uomo|signore?)\b", text, re.IGNORECASE):
            info["sesso"] = "M"
        elif re.search(r"\b(femmina|donna|signora)\b", text, re.IGNORECASE):
            info["sesso"] = "F"
        
        return info

    def _extract_vital_signs(self, text: str) -> Dict[str, str]:
        """
        Extract vital signs
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: Dictionary with vital signs
        :rtype: Dict[str, str]
        """
        vitals = {}
        
        # Pressione arteriosa
        bp_patterns = [
            r"pressione\s*(?:arteriosa)?\s*(?:è|di)?\s*(\d{2,3})/(\d{2,3})",
            r"(\d{2,3})/(\d{2,3})\s*mmHg",
            r"(\d{2,3})\s*su\s*(\d{2,3})"
        ]
        
        for pattern in bp_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                vitals["pressione_arteriosa"] = f"{match.group(1)}/{match.group(2)} mmHg"
                break
        
        # Frequenza cardiaca
        hr_patterns = [
            r"frequenza\s*cardiaca\s*(?:è|di)?\s*(\d{2,3})",
            r"(\d{2,3})\s*bpm",
            r"battiti\s*(?:al\s*minuto)?\s*(\d{2,3})"
        ]
        
        for pattern in hr_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                vitals["frequenza_cardiaca"] = f"{match.group(1)} bpm"
                break
        
        # Temperatura
        temp_patterns = [
            r"temperatura\s*(?:corporea)?\s*(?:è|di)?\s*(\d{2,3}(?:\.\d)?)\s*°?C?",
            r"febbre\s*(?:a)?\s*(\d{2,3}(?:\.\d)?)\s*°?C?"
        ]
        
        for pattern in temp_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                vitals["temperatura"] = f"{match.group(1)}°C"
                break
        
        # Saturazione ossigeno
        sat_patterns = [
            r"saturazione\s*(?:ossigeno)?\s*(?:è|di)?\s*(\d{2,3})%?",
            r"SpO2\s*(\d{2,3})%?"
        ]
        
        for pattern in sat_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                vitals["saturazione_ossigeno"] = f"{match.group(1)}%"
                break
        
        return vitals

    def _extract_symptoms(self, text: str) -> List[str]:
        """
        Extract symptoms from text
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: List of symptoms
        :rtype: List[str]
        """
        symptoms = []
        
        symptom_patterns = [
            r"dolore\s+(?:al|alla|ai|alle)\s+(\w+)",
            r"sintomi?\s*(?:di|sono|è|include)?\s*([^.]+)",
            r"si\s+presenta\s+con\s+([^.]+)",
            r"lamenta\s+([^.]+)",
            r"accusa\s+([^.]+)"
        ]
        
        for pattern in symptom_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                symptom = match.group(1).strip()
                if len(symptom) > 3 and symptom not in symptoms:
                    symptoms.append(symptom)
        
        return symptoms

    def _extract_clinical_tests(self, text: str) -> List[str]:
        """
        Extract clinical tests
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: List of clinical tests
        :rtype: List[str]
        """
        tests = []
        
        test_patterns = [
            r"esame\s+(?:del|della|dei|delle)?\s*(\w+)",
            r"analisi\s+(?:del|della|dei|delle)?\s*(\w+)",
            r"radiografia\s+(?:del|della|dei|delle)?\s*(\w+)",
            r"ecografia\s+(?:del|della|dei|delle)?\s*(\w+)",
            r"TAC\s+(?:del|della|dei|delle)?\s*(\w+)",
            r"risonanza\s+(?:del|della|dei|delle)?\s*(\w+)"
        ]
        
        for pattern in test_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                test = f"{match.group(0)}"
                if test not in tests:
                    tests.append(test)
        
        return tests

    def _extract_diagnoses(self, text: str) -> List[str]:
        """
        Extract diagnoses from text
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: List of diagnoses
        :rtype: List[str]
        """
        diagnoses = []
        
        diagnosis_patterns = [
            r"diagnosi\s*(?:è|di)?\s*([^.]+)",
            r"diagnosticato\s+(?:con)?\s*([^.]+)",
            r"presenta\s+(?:una|un)?\s*([^.]+)",
            r"sospetto\s+(?:di)?\s*([^.]+)"
        ]
        
        for pattern in diagnosis_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                diagnosis = match.group(1).strip()
                if len(diagnosis) > 3 and diagnosis not in diagnoses:
                    diagnoses.append(diagnosis)
        
        return diagnoses

    def _extract_therapies(self, text: str) -> List[str]:
        """
        Extract therapies and medications from text
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: List of therapies/medications
        :rtype: List[str]
        """
        therapies = []
        
        therapy_patterns = [
            r"terapia\s+(?:con)?\s*([^.]+)",
            r"farmaco\s*([^.]+)",
            r"prescri(?:tto|zione)\s*([^.]+)",
            r"somministrar[eio]\s*([^.]+)",
            r"assumere\s*([^.]+)"
        ]
        
        for pattern in therapy_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                therapy = match.group(1).strip()
                if len(therapy) > 3 and therapy not in therapies:
                    therapies.append(therapy)
        
        return therapies

    def _extract_allergies(self, text: str) -> List[str]:
        """
        Extract allergies from text
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: List of allergies
        :rtype: List[str]
        """
        allergies = []
        
        allergy_patterns = [
            r"allergi[ca]?\s+(?:a|al|alla|ai|alle)?\s*([^.]+)",
            r"intolleranz[ea]\s+(?:a|al|alla|ai|alle)?\s*([^.]+)",
            r"reazion[ei]\s+avvers[ea]\s+(?:a|al|alla|ai|alle)?\s*([^.]+)"
        ]
        
        for pattern in allergy_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                allergy = match.group(1).strip()
                if len(allergy) > 2 and allergy not in allergies:
                    allergies.append(allergy)
        
        return allergies

    def _extract_medical_history(self, text: str) -> str:
        """
        Extract medical history from text
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: Medical history as a string
        :rtype: str
        """
        history_patterns = [
            r"storia\s+clinic[a]?\s*:?\s*([^.]+)",
            r"anamnesi\s*:?\s*([^.]+)",
            r"precedenti\s+(?:medici|clinici)\s*:?\s*([^.]+)"
        ]
        
        for pattern in history_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        
        return ""

    def _extract_medical_notes(self, text: str) -> str:
        """
        Extract general medical notes from text
        
        :param text: Transcript text to analyze
        :type text: str
        :returns: Medical notes as a string
        :rtype: str
        """
        # Rimuovi parti già estratte e mantieni il resto come note
        cleaned_text = text
        
        # Rimuovi pattern già estratti
        patterns_to_remove = [
            r"pressione\s*(?:arteriosa)?\s*(?:è|di)?\s*\d{2,3}/\d{2,3}",
            r"frequenza\s*cardiaca\s*(?:è|di)?\s*\d{2,3}",
            r"temperatura\s*(?:corporea)?\s*(?:è|di)?\s*\d{2,3}(?:\.\d)?°?C?"
        ]
        
        for pattern in patterns_to_remove:
            cleaned_text = re.sub(pattern, "", cleaned_text, flags=re.IGNORECASE)
        
        return cleaned_text.strip()

    def _determine_triage_priority(self, data: Dict[str, Any]) -> str:
        """
        Determine triage priority based on extracted data
        
        :param data: Extracted clinical data
        :type data: Dict[str, Any]
        :returns: Triage priority level ("ALTA", "MEDIA", "BASSA")
        :rtype: str
        """
        # Logica di priorità basata sui sintomi e parametri vitali
        vitals = data.get("parametri_vitali", {})
        symptoms = data.get("sintomi", [])
        
        # Priorità ALTA se ci sono parametri vitali critici
        if vitals.get("pressione_arteriosa"):
            bp = vitals["pressione_arteriosa"]
            bp_match = re.search(r"(\d+)/(\d+)", bp)
            if bp_match:
                systolic = int(bp_match.group(1))
                diastolic = int(bp_match.group(2))
                if systolic > 180 or diastolic > 110 or systolic < 90:
                    return "ALTA"
        
        if vitals.get("frequenza_cardiaca"):
            hr_match = re.search(r"(\d+)", vitals["frequenza_cardiaca"])
            if hr_match:
                hr = int(hr_match.group(1))
                if hr > 120 or hr < 50:
                    return "ALTA"
        
        # Priorità ALTA per sintomi critici
        critical_symptoms = ["dolore toracico", "difficoltà respiratoria", "perdita coscienza"]
        for symptom in symptoms:
            for critical in critical_symptoms:
                if critical.lower() in symptom.lower():
                    return "ALTA"
        
        # Priorità MEDIA per sintomi moderati
        moderate_symptoms = ["dolore", "febbre", "nausea"]
        for symptom in symptoms:
            for moderate in moderate_symptoms:
                if moderate.lower() in symptom.lower():
                    return "MEDIA"
        
        return "BASSA"

    def _calculate_extraction_confidence(self, data: Dict[str, Any]) -> float:
        """
        Calculate a confidence score for the extraction
        
        :param data: Extracted clinical data
        :type data: Dict[str, Any]
        :returns: Confidence score between 0.0 and 1.0
        :rtype: float
        """
        total_fields = 0
        filled_fields = 0
        
        for section, content in data.items():
            if isinstance(content, dict):
                for field, value in content.items():
                    total_fields += 1
                    if value and value.strip():
                        filled_fields += 1
            elif isinstance(content, list):
                total_fields += 1
                if content:
                    filled_fields += 1
            elif isinstance(content, str):
                total_fields += 1
                if content and content.strip():
                    filled_fields += 1
        
        if total_fields == 0:
            return 0.0
        
        return filled_fields / total_fields

    def _populate_clinical_data_fields(self, clinical_data: ClinicalData, extracted_data: Dict[str, Any]) -> None:
        """
        Populate the ClinicalData model fields with extracted data
        
        :param clinical_data: ClinicalData object to populate
        :type clinical_data: ClinicalData
        :param extracted_data: Dictionary with extracted clinical data
        :type extracted_data: Dict[str, Any]
        :returns: None
        :rtype: None
        """
        try:
            # Informazioni paziente
            patient_info = extracted_data.get("informazioni_paziente", {})
            clinical_data.patient_name = f"{patient_info.get('nome', '')} {patient_info.get('cognome', '')}".strip()
            
            if patient_info.get('età'):
                try:
                    clinical_data.patient_age = int(patient_info['età'])
                except (ValueError, TypeError):
                    pass
            
            clinical_data.patient_gender = patient_info.get('sesso', '')
            
            # Anamnesi
            clinical_data.chief_complaint = extracted_data.get("sintomi_principali", "")
            clinical_data.history_present_illness = extracted_data.get("storia_clinica", "")
            
            # Liste JSON
            clinical_data.past_medical_history = extracted_data.get("storia_medica", [])
            clinical_data.medications = extracted_data.get("terapie", [])
            clinical_data.allergies = extracted_data.get("allergie", [])
            clinical_data.diagnosis = extracted_data.get("diagnosi", [])
            
            # Parametri vitali e esame obiettivo
            clinical_data.vital_signs = extracted_data.get("parametri_vitali", {})
            clinical_data.physical_examination = extracted_data.get("esami_clinici", {})
            
            # Valutazione e piano
            clinical_data.assessment = extracted_data.get("note_mediche", "")
            clinical_data.treatment_plan = extracted_data.get("piano_terapeutico", "")
            
        except Exception as e:
            logger.error(f"Errore nel popolare i campi di ClinicalData: {e}")

            # Non sollevare l'errore, continua con i campi che siamo riusciti a popolare