Spaces:

matdmiller
/

tts-openai

Runtime error

App Files Files Community

tts-openai / app.py

matdmiller

added text chunking for text over 4,000 chars

d9a62b3 over 1 year ago

raw

history blame

6.28 kB

	# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.

	# %% auto 0
	# Public names exposed by a star-import of this module.
	__all__ = [
	    'secret_import_failed',
	    'tts_voices',
	    'launch_kwargs',
	    'split_text',
	    'concatenate_mp3',
	    'create_speech',
	    'get_input_text_len',
	]

	# %% app.ipynb 1
	#tts_openai_secrets.py content:
	#import os
	#os.environ['OPENAI_API_KEY'] = 'sk-XXXXXXXXXXXXXXXXXXXXXX'
	import os
	# Load the two secrets the app needs. Each is looked up in the process
	# environment first; if either one is missing, the local
	# `tts_openai_secrets` module is imported as a fallback.
	secret_import_failed = False

	# Only a missing key is an expected failure here, so catch KeyError rather
	# than using a bare `except:` (which would also swallow KeyboardInterrupt).
	try:
	    _ = os.environ['OPENAI_API_KEY']
	    print('OPENAI_API_KEY environment variable was found.')
	except KeyError:
	    print('OPENAI_API_KEY environment variable was not found.')
	    secret_import_failed = True

	try:
	    GRADIO_PASSWORD = os.environ['GRADIO_PASSWORD']
	    print('GRADIO_PASSWORD environment variable was found.')
	except KeyError:
	    print('GRADIO_PASSWORD environment variable was not found.')
	    secret_import_failed = True

	if secret_import_failed:
	    # Presumably this module populates os.environ as an import side effect
	    # -- verify that it sets GRADIO_PASSWORD as well as OPENAI_API_KEY.
	    # An ImportError or KeyError here is left to propagate: the app cannot
	    # run without both secrets.
	    import tts_openai_secrets
	    GRADIO_PASSWORD = os.environ['GRADIO_PASSWORD']
	    print('import tts_openai_secrets succeeded')

	# %% app.ipynb 3
	import gradio as gr
	import openai
	from pydub import AudioSegment
	import io

	# %% app.ipynb 4
	# Ask the API which models are available and keep those whose id contains
	# 'tts'. This is best effort: on any ordinary failure fall back to a single
	# known model id so the UI can still be built.
	try:
	    tts_models = [o.id for o in openai.models.list().data if 'tts' in o.id]
	    print('successfully got tts model list:', tts_models)
	except Exception as err:
	    # `Exception` (not a bare `except:`) so Ctrl-C / SystemExit still work.
	    tts_models = ['tts-1']
	    print(f'could not get tts model list ({type(err).__name__}); using default:', tts_models)

	# %% app.ipynb 5
	# Voice names offered in the "Voice" dropdown.
	tts_voices = [
	    'alloy',
	    'echo',
	    'fable',
	    'onyx',
	    'nova',
	    'shimmer',
	]

	# %% app.ipynb 6
	def split_text(input_text, max_length=4000, lookback=1000):
	    """Split ``input_text`` into consecutive chunks of at most ``max_length`` chars.

	    Each chunk boundary is chosen, in order of preference:

	    1. just after the last newline found in the final ``lookback`` characters
	       of the candidate chunk;
	    2. just after the last ``'. '`` found in that same window;
	    3. at exactly ``max_length`` characters (hard split).

	    No characters are dropped: ``''.join(split_text(t)) == t``.

	    Args:
	        input_text: The text to split.
	        max_length: Maximum length of any chunk. Must be at least 1.
	        lookback: Size of the window at the end of each candidate chunk that
	            is searched for a natural break. Values outside
	            ``0..max_length`` are clamped to that range.

	    Returns:
	        A list of chunks. Text that already fits (including the empty
	        string) is returned as a single-element list.

	    Raises:
	        ValueError: If ``max_length`` is less than 1 (a zero-width chunk
	            could never consume the text).
	    """
	    if max_length < 1:
	        raise ValueError('max_length must be at least 1')

	    total = len(input_text)
	    if total <= max_length:
	        return [input_text]

	    # Clamp the window so the search start can never go negative; a negative
	    # start would be interpreted by str.rfind as an offset from the END of
	    # the string and silently search the wrong region.
	    window = min(max(lookback, 0), max_length)

	    chunks = []
	    start = 0
	    # Walk the text by offset instead of repeatedly re-slicing the remainder.
	    while total - start > max_length:
	        limit = start + max_length
	        window_start = limit - window

	        split_point = limit
	        newline_index = input_text.rfind('\n', window_start, limit)
	        if newline_index != -1:
	            # Keep the newline with the chunk that precedes it.
	            split_point = newline_index + 1
	        else:
	            period_index = input_text.rfind('. ', window_start, limit)
	            if period_index != -1:
	                # Split after the space that follows the period.
	                split_point = period_index + 2

	        chunks.append(input_text[start:split_point])
	        start = split_point

	    # The loop only exits with 1..max_length characters left over.
	    chunks.append(input_text[start:])
	    return chunks

	# %% app.ipynb 7
	def concatenate_mp3(mp3_files):
	    """Join several MP3 byte strings into a single MP3 byte string.

	    Args:
	        mp3_files: List of MP3-encoded byte strings, in playback order.

	    Returns:
	        The sole element unchanged when the list holds exactly one item;
	        otherwise the bytes of a newly exported MP3 containing every clip
	        back to back.
	    """
	    # A lone clip needs no decoding or re-encoding.
	    if len(mp3_files) == 1:
	        return mp3_files[0]

	    # Decode each clip and append it to the running result.
	    merged = AudioSegment.empty()
	    for clip_bytes in mp3_files:
	        merged += AudioSegment.from_file(io.BytesIO(clip_bytes), format="mp3")

	    # Encode the merged audio into an in-memory buffer rather than a file.
	    buffer = io.BytesIO()
	    merged.export(buffer, format="mp3")
	    return buffer.getvalue()

	# %% app.ipynb 8
	def create_speech(input_text, model='tts-1', voice='alloy', progress=gr.Progress()):
	    """Turn ``input_text`` into speech and return the audio as MP3 bytes.

	    The text is split with ``split_text``, each chunk is sent to the OpenAI
	    speech endpoint in turn, and the per-chunk audio is merged with
	    ``concatenate_mp3``.

	    Args:
	        input_text: The text to speak.
	        model: Id of the TTS model to use.
	        voice: Name of the voice to use.
	        progress: Gradio progress tracker, called with a 0..1 fraction and a
	            description as chunks complete.

	    Returns:
	        The combined audio returned by ``concatenate_mp3``.

	    Raises:
	        gr.Error: If ``input_text`` is empty or ``None``.
	    """
	    # Fail early with a readable message instead of sending an empty request.
	    if not input_text:
	        raise gr.Error('Please enter some text to convert to speech.')

	    chunks = split_text(input_text)
	    total_chunks = len(chunks)

	    progress(0, desc="Starting TTS processing...")

	    audio_data = []
	    client = openai.OpenAI()
	    try:
	        for i, chunk in enumerate(chunks, start=1):
	            response = client.audio.speech.create(
	                model=model,
	                voice=voice,
	                input=chunk,
	                speed=1.0,
	            )
	            audio_data.append(response.content)
	            progress(i / total_chunks, desc=f"Processing chunk {i} of {total_chunks}")
	    finally:
	        # Release the client even when a request fails part-way through.
	        client.close()

	    combined_audio = concatenate_mp3(audio_data)

	    progress(1, desc="Processing completed")

	    return combined_audio


	# %% app.ipynb 9
	def get_input_text_len(input_text):
	    """Return the number of characters in ``input_text``.

	    ``None`` and the empty string both count as zero characters, so the
	    character counter never fails on a cleared input.
	    """
	    return len(input_text) if input_text else 0

	# %% app.ipynb 10
	# Build the Gradio UI: a text box, model/voice dropdowns, a character
	# counter, an audio output, and "Go" / "Clear" buttons.
	# NOTE(review): indentation was lost in this copy of the file, so which
	# widgets belong inside each gr.Row() cannot be determined from here --
	# restore the nesting from the source notebook.
	# NOTE(review): `head` is given plain text here; confirm that is intended.
	with gr.Blocks(title='OpenAI TTS', head='OpenAI TTS') as app:
	gr.Markdown("# OpenAI TTS")
	# NOTE(review): create_speech splits its input with split_text before
	# calling the API, so the "limit is 4,000 characters" wording below looks
	# out of date -- confirm and update the message.
	gr.Markdown("Start typing below and then click Go to create the speech from your text. The current limit is 4,000 characters.")
	with gr.Row():
	input_text = gr.Textbox(max_lines=100, label="Enter text here")
	with gr.Row():
	# Choices come from the module-level tts_models / tts_voices lists.
	tts_model_dropdown = gr.Dropdown(value='tts-1',choices=tts_models, label='Model')
	tts_voice_dropdown = gr.Dropdown(value='alloy',choices=tts_voices,label='Voice')
	input_text_length = gr.Label(label="Number of characters")
	output_audio = gr.Audio()
	# Refresh the character counter on the text box's `input` event.
	input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
	go_btn = gr.Button("Go")
	# "Go" runs create_speech(text, model, voice) and sends the result to the audio widget.
	go_btn.click(fn=create_speech, inputs=[input_text, tts_model_dropdown, tts_voice_dropdown], outputs=[output_audio])
	clear_btn = gr.Button('Clear')
	# "Clear" resets the text box to an empty string.
	clear_btn.click(fn=lambda: '', outputs=input_text)


	# %% app.ipynb 11
	# Keyword arguments for app.launch(): a fixed login name paired with the
	# password loaded at module start, plus the text passed as the login message.
	launch_kwargs = dict(
	    auth=('username', GRADIO_PASSWORD),
	    auth_message="Please log in to Mat's TTS App with username: username and password.",
	)

	# %% app.ipynb 13
	#.py launch
	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    app.launch(**launch_kwargs)