# Source code for services.extraction
"""
Service for extracting clinical data using LLM
"""
import json
import re
from typing import Dict, List, Optional, Any
from core.models import AudioTranscript, ClinicalData
import logging
logger = logging.getLogger(__name__)
# [docs]
class ClinicalExtractionService:
    """
    Service for extracting structured clinical data from transcripts.

    Every extractor in this class is heuristic: Italian-language regular
    expressions are run (case-insensitively) over the transcript text and the
    results are copied onto a ``ClinicalData`` record.
    """

    # Optional Italian articulated prepositions ("a ..." / "di ..." families).
    # Alternatives are listed longest-first and must be followed by whitespace
    # (or be the elided ``all'`` / ``dell'`` form) so that the optional group
    # can never swallow the first letters of the following word
    # (e.g. "alla" must not be consumed as "a" + "lla").
    _PREP_A = r"(?:(?:alla|alle|allo|agli|al|ai|a)\s+|all')?"
    _PREP_DI = r"(?:(?:della|delle|dello|degli|del|dei)\s+|dell')?"

    def __init__(self):
        """
        Initialize the extraction service.

        Sets ``self.extraction_template``, the skeleton of sections that
        ``_extract_structured_data`` fills in.
        """
        # Template for extraction based on reference projects
        self.extraction_template = {
            "informazioni_paziente": {
                "nome": "",
                "cognome": "",
                "data_nascita": "",
                "codice_fiscale": "",
                "sesso": "",
                "età": ""
            },
            "parametri_vitali": {
                "pressione_arteriosa": "",
                "frequenza_cardiaca": "",
                "temperatura": "",
                "saturazione_ossigeno": "",
                "frequenza_respiratoria": ""
            },
            "sintomi": [],
            "esami_clinici": [],
            "diagnosi": [],
            "terapie": [],
            "allergie": [],
            "storia_clinica": "",
            "note_mediche": "",
            "priorità_triage": ""
        }

    def extract_clinical_data(self, transcript: "AudioTranscript") -> "ClinicalData":
        """
        Extract structured clinical data from a transcript.

        :param transcript: AudioTranscript object to process
        :type transcript: AudioTranscript
        :returns: ClinicalData: Object with extracted clinical data
        :rtype: ClinicalData
        :raises Exception: If extraction fails (the error is logged and re-raised)
        """
        try:
            # Run the pure-Python extraction BEFORE creating the database row,
            # so a failure while parsing the text does not leave an empty
            # ClinicalData record behind.
            text = transcript.transcript_text or ""  # tolerate a missing text
            extracted_data = self._extract_structured_data(text)
            confidence = self._calculate_extraction_confidence(extracted_data)

            # Create the clinical data record and fill it from the extraction
            clinical_data = ClinicalData.objects.create(
                transcript=transcript
            )
            self._populate_clinical_data_fields(clinical_data, extracted_data)
            clinical_data.confidence_score = confidence
            clinical_data.save()
            logger.info("Estrazione completata per clinical_data %s", clinical_data.id)
            return clinical_data
        except Exception as e:
            # Boundary handler: log with traceback, then let the caller decide.
            # NOTE(review): if save() itself fails, the row created above is
            # left incomplete; there is no 'status' field to mark it.
            logger.exception("Errore durante l'estrazione: %s", e)
            raise

    @staticmethod
    def _collect_unique(patterns: List[str], text: str, min_length: int, group: int = 1) -> List[str]:
        """
        Collect de-duplicated, stripped regex captures in first-seen order.

        :param patterns: Regex patterns, each applied with ``re.IGNORECASE``
        :param text: Text to scan
        :param min_length: A capture is kept only if it is longer than this
        :param group: Which match group to keep (0 = the whole match)
        :returns: Captured strings, without duplicates
        :rtype: List[str]
        """
        found: List[str] = []
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                value = match.group(group).strip()
                if len(value) > min_length and value not in found:
                    found.append(value)
        return found

    def _extract_structured_data(self, text: str) -> Dict[str, Any]:
        """
        Extract structured data from text using regex patterns and heuristic logic.

        :param text: Transcript text to analyze
        :type text: str
        :returns: Dictionary with one entry per key of ``extraction_template``
        :rtype: Dict[str, Any]
        """
        # A shallow copy is sufficient: every section is replaced below, so the
        # template's nested containers are never mutated.
        data = self.extraction_template.copy()
        data["informazioni_paziente"] = self._extract_patient_info(text)
        data["parametri_vitali"] = self._extract_vital_signs(text)
        data["sintomi"] = self._extract_symptoms(text)
        data["esami_clinici"] = self._extract_clinical_tests(text)
        data["diagnosi"] = self._extract_diagnoses(text)
        data["terapie"] = self._extract_therapies(text)
        data["allergie"] = self._extract_allergies(text)
        data["storia_clinica"] = self._extract_medical_history(text)
        data["note_mediche"] = self._extract_medical_notes(text)
        # Triage must come last: it reads the vitals and symptoms set above.
        data["priorità_triage"] = self._determine_triage_priority(data)
        return data

    def _extract_patient_info(self, text: str) -> Dict[str, str]:
        """
        Extract patient information.

        :param text: Transcript text to analyze
        :type text: str
        :returns: Dictionary holding only the keys that were found, among
            ``nome``, ``cognome``, ``età`` and ``sesso``
        :rtype: Dict[str, str]
        """
        info: Dict[str, str] = {}

        # First and last name: the two words following a patient designator.
        name_patterns = [
            r"il\s+paziente\s+(\w+)\s+(\w+)",
            r"la\s+paziente\s+(\w+)\s+(\w+)",
            r"signor[ea]?\s+(\w+)\s+(\w+)",
            r"nome\s*:\s*(\w+)\s+(\w+)"
        ]
        for pattern in name_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                info["nome"] = match.group(1)
                info["cognome"] = match.group(2)
                break

        # Age: word boundaries keep the number whole ("1234 anni" is not an
        # age) and stop "ann" from matching inside longer words.
        age_match = re.search(r"\b(\d{1,3})\s*ann[io]\b", text, re.IGNORECASE)
        if age_match:
            info["età"] = age_match.group(1)

        # Sex: male keywords are checked first, as in the original heuristic.
        if re.search(r"\b(maschio|uomo|signore?)\b", text, re.IGNORECASE):
            info["sesso"] = "M"
        elif re.search(r"\b(femmina|donna|signora)\b", text, re.IGNORECASE):
            info["sesso"] = "F"
        return info

    def _extract_vital_signs(self, text: str) -> Dict[str, str]:
        """
        Extract vital signs.

        :param text: Transcript text to analyze
        :type text: str
        :returns: Dictionary holding only the vital signs that were found,
            each formatted with its unit
        :rtype: Dict[str, str]
        """
        vitals: Dict[str, str] = {}

        def first_match(patterns):
            # Return the first match among the patterns, tried in order.
            for pattern in patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    return match
            return None

        # Blood pressure
        match = first_match([
            r"pressione\s*(?:arteriosa)?\s*(?:è|di)?\s*(\d{2,3})/(\d{2,3})",
            r"(\d{2,3})/(\d{2,3})\s*mmHg",
            r"(\d{2,3})\s*su\s*(\d{2,3})"
        ])
        if match:
            vitals["pressione_arteriosa"] = f"{match.group(1)}/{match.group(2)} mmHg"

        # Heart rate
        match = first_match([
            r"frequenza\s*cardiaca\s*(?:è|di)?\s*(\d{2,3})",
            r"(\d{2,3})\s*bpm",
            r"battiti\s*(?:al\s*minuto)?\s*(\d{2,3})"
        ])
        if match:
            vitals["frequenza_cardiaca"] = f"{match.group(1)} bpm"

        # Temperature: Italian text usually writes decimals with a comma
        # ("38,5"), so accept both separators and normalise to a dot.
        match = first_match([
            r"temperatura\s*(?:corporea)?\s*(?:è|di)?\s*(\d{2,3}(?:[.,]\d{1,2})?)\s*°?C?",
            r"febbre\s*(?:a)?\s*(\d{2,3}(?:[.,]\d{1,2})?)\s*°?C?"
        ])
        if match:
            vitals["temperatura"] = f"{match.group(1).replace(',', '.')}°C"

        # Oxygen saturation
        match = first_match([
            r"saturazione\s*(?:ossigeno)?\s*(?:è|di)?\s*(\d{2,3})%?",
            r"SpO2\s*(\d{2,3})%?"
        ])
        if match:
            vitals["saturazione_ossigeno"] = f"{match.group(1)}%"

        # Respiratory rate: declared in the template, so extract it too.
        match = first_match([
            r"frequenza\s*respiratoria\s*(?:è|di)?\s*(\d{1,2})\b"
        ])
        if match:
            vitals["frequenza_respiratoria"] = f"{match.group(1)} atti/min"
        return vitals

    def _extract_symptoms(self, text: str) -> List[str]:
        """
        Extract symptoms from text.

        :param text: Transcript text to analyze
        :type text: str
        :returns: List of symptoms (unique, in order of discovery)
        :rtype: List[str]
        """
        symptom_patterns = [
            # Keep the whole phrase ("dolore alla testa"), not just the body
            # part, so the triage keyword "dolore" can match it.
            r"(dolore\s+(?:(?:alla|alle|allo|agli|al|ai)\s+|all')\w+)",
            # \b prevents matches inside "asintomatico"/"sintomatologia" and
            # stops "di" from eating the start of a word such as "diversi".
            r"\bsintom[oi]\b\s*:?\s*(?:(?:di|sono|è|include)\b\s*)?([^.]+)",
            r"si\s+presenta\s+con\s+([^.]+)",
            r"lamenta\s+([^.]+)",
            r"accusa\s+([^.]+)"
        ]
        return self._collect_unique(symptom_patterns, text, min_length=3)

    def _extract_clinical_tests(self, text: str) -> List[str]:
        """
        Extract clinical tests.

        :param text: Transcript text to analyze
        :type text: str
        :returns: List of clinical tests, each as the full matched phrase
            (e.g. "esame della tiroide")
        :rtype: List[str]
        """
        test_keywords = ["esame", "analisi", "radiografia", "ecografia", "TAC", "risonanza"]
        test_patterns = [
            r"\b" + keyword + r"\s+" + self._PREP_DI + r"(\w+)"
            for keyword in test_keywords
        ]
        # group=0: the stored value is the whole phrase, keyword included.
        return self._collect_unique(test_patterns, text, min_length=0, group=0)

    def _extract_diagnoses(self, text: str) -> List[str]:
        """
        Extract diagnoses from text.

        :param text: Transcript text to analyze
        :type text: str
        :returns: List of diagnoses (unique, in order of discovery)
        :rtype: List[str]
        """
        # Each optional connective is followed by \b so it is consumed only
        # as a whole word ("sospetto diabete" must yield "diabete", not "abete").
        diagnosis_patterns = [
            r"\bdiagnosi\b\s*:?\s*(?:(?:è|di)\b\s*)?([^.]+)",
            r"diagnosticato\s+(?:con\b\s*)?([^.]+)",
            # NOTE(review): this also fires on "si presenta con ...", which is
            # a symptom phrasing rather than a diagnosis.
            r"presenta\s+(?:un'|(?:una|un)\b\s*)?([^.]+)",
            r"sospetto\s+(?:di\b\s*)?([^.]+)"
        ]
        return self._collect_unique(diagnosis_patterns, text, min_length=3)

    def _extract_therapies(self, text: str) -> List[str]:
        """
        Extract therapies and medications from text.

        :param text: Transcript text to analyze
        :type text: str
        :returns: List of therapies/medications (unique, in order of discovery)
        :rtype: List[str]
        """
        therapy_patterns = [
            # "con" is consumed only as a whole word ("terapia continuativa"
            # must not lose its first three letters).
            r"terapia\s+(?:con\b\s*)?([^.]+)",
            r"farmaco\s*([^.]+)",
            r"prescri(?:tto|zione)\s*([^.]+)",
            r"somministrar[eio]\s*([^.]+)",
            r"assumere\s*([^.]+)"
        ]
        return self._collect_unique(therapy_patterns, text, min_length=3)

    def _extract_allergies(self, text: str) -> List[str]:
        """
        Extract allergies from text.

        :param text: Transcript text to analyze
        :type text: str
        :returns: List of allergies (unique, in order of discovery)
        :rtype: List[str]
        """
        allergy_patterns = [
            # allergi\w* covers allergia/allergie/allergico/allergica/...
            r"allergi\w*\s+" + self._PREP_A + r"([^.]+)",
            r"intolleranz[ea]\s+" + self._PREP_A + r"([^.]+)",
            r"reazion[ei]\s+avvers[ea]\s+" + self._PREP_A + r"([^.]+)"
        ]
        return self._collect_unique(allergy_patterns, text, min_length=2)

    def _extract_medical_history(self, text: str) -> str:
        """
        Extract medical history from text.

        :param text: Transcript text to analyze
        :type text: str
        :returns: The sentence fragment following the first history marker
            found, or an empty string when there is none
        :rtype: str
        """
        history_patterns = [
            r"storia\s+clinic[a]?\s*:?\s*([^.]+)",
            r"anamnesi\s*:?\s*([^.]+)",
            r"precedenti\s+(?:medici|clinici)\s*:?\s*([^.]+)"
        ]
        for pattern in history_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return ""

    def _extract_medical_notes(self, text: str) -> str:
        """
        Extract general medical notes from text.

        The notes are the transcript with the blood-pressure, heart-rate and
        temperature readings removed.

        :param text: Transcript text to analyze
        :type text: str
        :returns: Medical notes as a string
        :rtype: str
        """
        # Same readings as in _extract_vital_signs; the temperature pattern
        # accepts comma decimals so that "38,5" is removed in full.
        patterns_to_remove = [
            r"pressione\s*(?:arteriosa)?\s*(?:è|di)?\s*\d{2,3}/\d{2,3}",
            r"frequenza\s*cardiaca\s*(?:è|di)?\s*\d{2,3}",
            r"temperatura\s*(?:corporea)?\s*(?:è|di)?\s*\d{2,3}(?:[.,]\d{1,2})?°?C?"
        ]
        cleaned_text = text
        for pattern in patterns_to_remove:
            cleaned_text = re.sub(pattern, "", cleaned_text, flags=re.IGNORECASE)
        return cleaned_text.strip()

    def _determine_triage_priority(self, data: Dict[str, Any]) -> str:
        """
        Determine triage priority based on extracted data.

        :param data: Extracted clinical data
        :type data: Dict[str, Any]
        :returns: Triage priority level ("ALTA", "MEDIA", "BASSA")
        :rtype: str
        """
        vitals = data.get("parametri_vitali", {})
        symptoms = data.get("sintomi", [])

        # HIGH priority on out-of-range blood pressure
        bp_match = re.search(r"(\d+)/(\d+)", vitals.get("pressione_arteriosa") or "")
        if bp_match:
            systolic = int(bp_match.group(1))
            diastolic = int(bp_match.group(2))
            if systolic > 180 or diastolic > 110 or systolic < 90:
                return "ALTA"

        # HIGH priority on out-of-range heart rate
        hr_match = re.search(r"(\d+)", vitals.get("frequenza_cardiaca") or "")
        if hr_match:
            hr = int(hr_match.group(1))
            if hr > 120 or hr < 50:
                return "ALTA"

        lowered = [symptom.lower() for symptom in symptoms]

        # HIGH priority for critical symptoms
        critical_symptoms = ("dolore toracico", "difficoltà respiratoria", "perdita coscienza")
        if any(keyword in symptom for symptom in lowered for keyword in critical_symptoms):
            return "ALTA"

        # MEDIUM priority for moderate symptoms
        moderate_symptoms = ("dolore", "febbre", "nausea")
        if any(keyword in symptom for symptom in lowered for keyword in moderate_symptoms):
            return "MEDIA"
        return "BASSA"

    def _calculate_extraction_confidence(self, data: Dict[str, Any]) -> float:
        """
        Calculate a confidence score for the extraction.

        The score is the fraction of filled fields: every key of a dict
        section, every list section and every string section counts as one
        field. Sections of any other type are ignored.

        :param data: Extracted clinical data
        :type data: Dict[str, Any]
        :returns: Confidence score between 0.0 and 1.0
        :rtype: float
        """
        def is_filled(value: Any) -> bool:
            # Strings must contain non-whitespace text; any other value is
            # judged by truthiness instead of crashing on a missing .strip().
            if isinstance(value, str):
                return bool(value.strip())
            return bool(value)

        total_fields = 0
        filled_fields = 0
        for content in data.values():
            if isinstance(content, dict):
                total_fields += len(content)
                filled_fields += sum(1 for value in content.values() if is_filled(value))
            elif isinstance(content, (list, str)):
                total_fields += 1
                if is_filled(content):
                    filled_fields += 1
        if total_fields == 0:
            return 0.0
        return filled_fields / total_fields

    def _populate_clinical_data_fields(self, clinical_data: "ClinicalData", extracted_data: Dict[str, Any]) -> None:
        """
        Populate the ClinicalData model fields with extracted data.

        Best-effort: any error is logged and swallowed so the caller can still
        save whatever was assigned before the failure.

        :param clinical_data: ClinicalData object to populate
        :type clinical_data: ClinicalData
        :param extracted_data: Dictionary with extracted clinical data
        :type extracted_data: Dict[str, Any]
        :returns: None
        :rtype: None
        """
        try:
            # Patient information
            patient_info = extracted_data.get("informazioni_paziente", {})
            clinical_data.patient_name = f"{patient_info.get('nome', '')} {patient_info.get('cognome', '')}".strip()
            if patient_info.get('età'):
                try:
                    clinical_data.patient_age = int(patient_info['età'])
                except (ValueError, TypeError):
                    # Leave the age unset when it is not a valid integer.
                    pass
            clinical_data.patient_gender = patient_info.get('sesso', '')

            # History
            # NOTE(review): "sintomi_principali", "storia_medica" and
            # "piano_terapeutico" are not produced by _extract_structured_data,
            # so these three fields always receive their defaults here.
            # Confirm the intended source keys against the ClinicalData model.
            clinical_data.chief_complaint = extracted_data.get("sintomi_principali", "")
            clinical_data.history_present_illness = extracted_data.get("storia_clinica", "")

            # JSON lists
            clinical_data.past_medical_history = extracted_data.get("storia_medica", [])
            clinical_data.medications = extracted_data.get("terapie", [])
            clinical_data.allergies = extracted_data.get("allergie", [])
            clinical_data.diagnosis = extracted_data.get("diagnosi", [])

            # Vital signs and physical examination
            clinical_data.vital_signs = extracted_data.get("parametri_vitali", {})
            clinical_data.physical_examination = extracted_data.get("esami_clinici", {})

            # Assessment and plan
            clinical_data.assessment = extracted_data.get("note_mediche", "")
            clinical_data.treatment_plan = extracted_data.get("piano_terapeutico", "")
        except Exception as e:
            # Deliberately not re-raised: keep the fields populated so far.
            logger.exception("Errore nel popolare i campi di ClinicalData: %s", e)