# -*- coding: utf-8 -*-
"""Lookingsoft Radiology Assistant

Automatically generated by Colab.

This file is adapted from:
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb

# Lookingsoft Radiology Assistant
## Developed by Lookingsoft Team

This demo showcases an AI-powered radiology assistant that leverages MedGemma
for medical image interpretation and Gemini’s native text-to-speech (TTS) for
natural voice output. The assistant transforms complex radiology reports into
easy-to-understand language and delivers them through a user-friendly,
voice-driven experience, making the insights in radiology images more
accessible.

### 🔐 Securing API Keys

We read API tokens from environment variables (e.g., Space secrets) to
authenticate with Hugging Face and Google’s Gemini APIs, keeping credentials
out of the source code.
"""
import os

import spaces  # ZeroGPU helper; provides the spaces.GPU decorator used below
from google import genai
from google.genai import types

# Read credentials from the environment (e.g., Hugging Face Space secrets).
gemini_api_key = os.getenv("GEMINI_API_KEY")
hf_token = os.environ.get("HF_TOKEN")

client = genai.Client(api_key=gemini_api_key)
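# Optional sanity check (an addition, not in the original notebook): warn at
# startup if either secret is missing, so the failure surfaces before the
# first request instead of mid-inference.
for _name, _value in (("GEMINI_API_KEY", gemini_api_key), ("HF_TOKEN", hf_token)):
    if not _value:
        print(f"Warning: {_name} is not set; calls that depend on it will fail.")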
| """### 🧠 Loading MedGemma for Radiology Insights | |
| Here, we load the MedGemma model—an image-text model optimized for medical contexts. We apply 4-bit quantization to enhance performance and reduce memory usage on GPUs. | |
| """ | |
import torch
from transformers import pipeline, BitsAndBytesConfig

model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
pipe = pipeline(
    "image-text-to-text",
    model="google/medgemma-4b-it",
    model_kwargs=model_kwargs,
    token=hf_token,
)
# Greedy decoding, so the same image and prompt yield a reproducible report.
pipe.model.generation_config.do_sample = False
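# Optional variant (an assumption, not part of the original notebook): if
# memory is still tight, bitsandbytes also supports NF4 quantization with
# bfloat16 compute, at some cost in fidelity:
#
# nf4_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model_kwargs["quantization_config"] = nf4_config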
| """### 🩻 Radiology Image Interpretation Logic | |
| This function uses MedGemma to generate a plain-language report based on a given prompt and medical image. It formats the input and passes it to the model for inference. | |
| """ | |
from PIL import Image


# On ZeroGPU Spaces, the GPU-bound function is wrapped with spaces.GPU so a
# GPU is attached for the duration of the call.
@spaces.GPU
def infer(prompt: str, image: Image.Image, system: str | None = None) -> str:
    messages = []
    if system:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system}],
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": image},
        ],
    })
    output = pipe(text=messages, max_new_tokens=2048)
    # The pipeline returns the whole chat; the report is the last message's content.
    return output[0]["generated_text"][-1]["content"]
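# Illustrative usage (hypothetical file name, not part of the app flow):
#
#   report = infer("Describe this X-ray", Image.open("chest_xray.png"))
#   print(report)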
| """### 🔊 Prepare for Gemini's Native TTS | |
| This helper function converts Gemini’s audio output into a `.wav` file—enabling the assistant to speak its reports in a natural-sounding voice. | |
| """ | |
import wave


def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to a WAV file (defaults: 16-bit mono at 24 kHz)."""
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm)
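# Quick self-check (an illustration, not used by the app): one second of
# silence in the same 16-bit mono 24 kHz format Gemini returns:
#
#   wave_file("silence.wav", b"\x00\x00" * 24000)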
| """### 🤖 Integrating Image Analysis and Voice Output | |
| This function combines the MedGemma analysis with Gemini’s TTS to produce both text and audio responses. | |
| """ | |
import gradio as gr
import requests


def _do_predictions(text, image_file, image_url, source_type):
    # Load the image either from the uploaded file or from a URL.
    if source_type == "url":
        image = Image.open(
            requests.get(image_url, headers={"User-Agent": "example"}, stream=True).raw
        )
    else:
        image = image_file
    report = infer(text, image)
    # Synthesize the report with Gemini's native TTS (prebuilt "Kore" voice).
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=report,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name="Kore",
                    )
                )
            ),
        ),
    )
    # The audio arrives as raw PCM bytes in the first candidate's inline data.
    data = response.candidates[0].content.parts[0].inline_data.data
    file_name = "out.wav"
    wave_file(file_name, data)
    return report, file_name
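# Illustrative call (hypothetical placeholder URL, mirroring what the UI does):
#
#   report, wav_path = _do_predictions(
#       "Describe this X-ray", None, "https://example.com/xray.jpg", "url"
#   )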
| """### 🖼️ Interactive Web UI with Gradio | |
| A user-friendly interface built with Gradio. Users can upload an image or provide a URL, enter a prompt, and receive both a text report and an audio explanation—powered by **MedGemma + Gemini TTS**. | |
| """ | |
def toggle_image_src(choice):
    # Show exactly one image-input widget, matching the selected source.
    if choice == "url":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=True), gr.update(visible=False)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Lookingsoft Radiology Assistant
        ## Developed by the Lookingsoft Team
        This assistant demonstrates the integration of MedGemma for medical image interpretation with Gemini’s native text-to-speech (TTS). It simplifies complex radiology reports into clear, spoken language, making insights more accessible and understandable for both professionals and patients.
        """
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Instructions", lines=2, interactive=True)
            with gr.Column():
                radio = gr.Radio(["file", "url"], value="file", label="Input Image Source")
                image_file = gr.Image(label="File", type="pil", visible=True)
                image_url = gr.Textbox(label="URL", visible=False)
            with gr.Row():
                submit = gr.Button("Generate")
        with gr.Column():
            output = gr.Textbox(label="Generated Report")
            audio_output = gr.Audio(label="Generated Report (wav)")
    submit.click(
        _do_predictions,
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output],
    )
    radio.change(toggle_image_src, radio, [image_file, image_url], queue=False, show_progress=False)
    gr.Examples(
        fn=_do_predictions,
        examples=[
            ["Describe this X-ray", Image.open(requests.get("https://google-rad-explain.hf.space/static/images/Effusion2.jpg", headers={"User-Agent": "example"}, stream=True).raw), None, "file"],
            ["Describe this CT", None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
        ],
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output],
    )
    gr.Markdown("""
    ### Disclaimer
    This demonstration is for educational purposes only. It is not intended to diagnose or treat any disease or condition and should not be considered medical advice.
    """)

demo.queue(max_size=8 * 4).launch(share=True)