# gradio_app.py

from dotenv import load_dotenv
import os
import gradio as gr

from brain_of_the_doctor import encode_image, analyze_image_with_query
from voice_of_the_patient import record_audio, transcribe_with_groq
from voice_of_the_doctor import text_to_speech_with_gtts, text_to_speech_with_elevenlabs

# Read a local .env file (if present) into the process environment before
# any key is looked up below.
load_dotenv()

# API credentials pulled from the environment; each is None when its
# variable is unset. GROQ_API_KEY is handed to the transcription helper.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")

# Instruction preamble for the vision model. process_inputs() places it in
# front of the patient's transcribed speech to build the query.
# The leading indentation on the continuation lines is part of the string
# literal and is sent to the model as-is.
system_prompt = """You have to act as a professional doctor, I know you are not but this is for learning purposes. 
            What's in this image? Do you find anything wrong with it medically? 
            If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in 
            your response. Your response should be in one long paragraph. Also, always answer as if you are answering to a real person.
            Do not say 'In the image I see' but say 'With what I see, I think you have ....'
            Don't respond as an AI model in markdown, your answer should mimic that of an actual doctor, not an AI bot. 
            Keep your answer concise (max 2 sentences). No preamble, start your answer right away, please."""

def process_inputs(audio_filepath, image_filepath):
    """Run one consultation: transcribe the question, analyze the image, voice the reply.

    Args:
        audio_filepath: Path to the recorded patient audio, or None/empty
            when nothing was recorded.
        image_filepath: Path to the uploaded image, or None/empty when no
            image was supplied.

    Returns:
        A 3-tuple ``(speech_to_text_output, doctor_response, voice_of_doctor)``
        in the order of the three Gradio output components: the
        transcription text, the doctor's textual answer, and the audio to
        play back.
    """
    # The audio input is None when the user submits without recording; skip
    # the transcription call rather than handing it a missing path.
    if audio_filepath:
        speech_to_text_output = transcribe_with_groq(
            GROQ_API_KEY=GROQ_API_KEY,
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3",
        ) or ""
    else:
        speech_to_text_output = ""

    if image_filepath:
        # Join with a space so the last prompt sentence and the first spoken
        # word do not run together; drop the transcript part when it is empty.
        query = " ".join(
            part for part in (system_prompt, speech_to_text_output.strip()) if part
        )
        doctor_response = analyze_image_with_query(
            query=query,
            encoded_image=encode_image(image_filepath),
            model="llama-3.2-11b-vision-preview",
        )
    else:
        doctor_response = "No image provided for me to analyze"

    # The spoken reply is written to final.mp3 in the current working directory.
    output_audio_path = os.path.join(os.getcwd(), "final.mp3")
    voice_of_doctor = text_to_speech_with_elevenlabs(
        input_text=doctor_response,
        output_filepath=output_audio_path,
    )
    # NOTE(review): the helper's return value is not visible from this file.
    # The audio output component is declared with type="filepath", so when
    # the helper returns nothing, fall back to the file it was asked to write.
    if voice_of_doctor is None and os.path.isfile(output_audio_path):
        voice_of_doctor = output_audio_path

    return speech_to_text_output, doctor_response, voice_of_doctor

# Input widgets, listed in the order process_inputs() receives its arguments.
_patient_audio = gr.Audio(sources=["microphone"], type="filepath")
_patient_image = gr.Image(type="filepath")

# Output widgets, listed in the order process_inputs() returns its values.
_transcript_box = gr.Textbox(label="Speech to Text")
_response_box = gr.Textbox(label="Doctor's Response")
_doctor_audio = gr.Audio(type="filepath", label="Doctor's Voice")

# Wire the widgets to the handler and start the web UI.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[_patient_audio, _patient_image],
    outputs=[_transcript_box, _response_box, _doctor_audio],
    title="MediVision AI",
)

iface.launch()