Spaces:
Running
Running
Commit
·
ee1c502
1
Parent(s):
936a0d5
Initial commit.
Browse files
- app.py +68 -0
- requirements.txt +6 -0
app.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from ibm_watson import TextToSpeechV1
|
| 4 |
+
import os
|
| 5 |
+
from google import genai
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
# IBM Watson voice identifiers offered in the "Select Voice" dropdown.
voices = ["en-US_MichaelV3Voice", "en-US_LisaV3Voice", "en-US_AllisonV3Voice"]

# Gemini model name and client; the API key is read from the environment.
genai_model = "gemini-2.0-flash"
genai_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# IBM Text to Speech engine: IAM authentication plus the service URL, both
# taken from environment variables.
text_to_speech_authenticator = IAMAuthenticator(apikey=os.getenv("TEXT_TO_SPEECH_APIKEY"))
text_to_speech_engine = TextToSpeechV1(authenticator=text_to_speech_authenticator)
text_to_speech_engine.set_service_url(os.getenv("TEXT_TO_SPEECH_URL"))
|
| 27 |
+
|
| 28 |
+
def main():
    """Render the Streamlit page and run the Gemini -> IBM TTS pipeline.

    The page shows a text input, a voice selector and a "Synthesize" button.
    When the button is pressed, the input text is sent to Gemini with a
    request to add SSML tags, Markdown code fences are removed from the
    reply, and the result is synthesized to WAV audio by the IBM Text to
    Speech engine. The Gemini reply and the audio are shown in the second
    column.

    Nothing is sent to either service when the input is empty or when
    Gemini returns no usable text; a message is shown instead.
    """
    st.title("IBM TTS + Google Gemini")
    st.write("This experimental application utilizes Google Gemini's advanced language processing capabilities to automatically insert Speech Synthesis Markup Language (SSML) tags into input text. By then processing this enriched text through IBM's Text to Speech engine, the app aims to produce higher-quality, more nuanced, and human-like synthesized speech.")

    col1, col2 = st.columns(2)

    with col1:
        text = st.text_input("Input Text")
        voice = st.selectbox("Select Voice", voices)
        synthesize_clicked = st.button("Synthesize")

    if not synthesize_clicked:
        return

    with col2:
        # Avoid spending two API calls on an empty or whitespace-only prompt.
        if not text or not text.strip():
            st.warning("Please enter some text to synthesize.")
            return

        with st.spinner("Processing text input with Gemini..."):
            genai_response = genai_client.models.generate_content(
                model=genai_model,
                contents=[
                    "Your goal is to help the text-to-speech engine sound more natural.",
                    f'The input text is: "{text}"',
                    "- Improve the input text with SSML tags",
                    "Return only the XML tags and the text content"
                ]
            )

        # The reply text may be missing (None); guard before the regex so
        # re.sub does not raise TypeError.
        gemini_text = genai_response.text or ""

        # Drop Markdown code fences and the whitespace they leave behind so
        # only the SSML payload reaches the speech engine.
        text_to_synthesize = re.sub(r'```xml|```', '', gemini_text).strip()

        if not text_to_synthesize:
            st.error("Gemini returned no text to synthesize.")
            return

        with st.spinner("Synthesizing speech..."):
            text_to_speech_response = text_to_speech_engine.synthesize(
                text_to_synthesize,
                accept='audio/wav',
                voice=voice
            ).get_result().content

        st.write("Gemini Response")
        st.markdown(gemini_text)
        st.write("Generated Audio")
        st.audio(text_to_speech_response, format="audio/wav")
|
| 66 |
+
|
| 67 |
+
# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ibm-cloud-sdk-core
|
| 2 |
+
ibm-watson
|
| 3 |
+
gradio
|
| 4 |
+
google-genai
|
| 5 |
+
uvicorn
|
| 6 |
+
streamlit
|