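"""Persian Book to Audio Chapters.

Gradio app that takes a Persian PDF book, splits its text into chapters,
and synthesizes each chapter to audio with a selectable Persian TTS model.
"""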
import tempfile, os, re
import gradio as gr
import fitz # PyMuPDF
from TTS.utils.synthesizer import Synthesizer
from TTS.utils.download import download_url
MODEL_NAMES = [
    "vits male1 (best)", "vits female (best)", "vits-male", "vits female1",
    "glowtts-male", "glowtts-female", "female tacotron2"
]
MAX_TXT_LEN = 800  # max characters per synthesis call; longer text is chunked in synthesize_text
# Load/download models if not already present.
# Each entry: [directory/display name, checkpoint file, config file, base download URL].
modelInfo = [
    ["vits-male", "best_model_65633.pth", "config-0.json", "https://huggingface.co/Kamtera/persian-tts-male-vits/resolve/main/"],
    ["vits female (best)", "checkpoint_48000.pth", "config-2.json", "https://huggingface.co/Kamtera/persian-tts-female-vits/resolve/main/"],
    ["glowtts-male", "best_model_77797.pth", "config-1.json", "https://huggingface.co/Kamtera/persian-tts-male-glow_tts/resolve/main/"],
    ["glowtts-female", "best_model.pth", "config.json", "https://huggingface.co/Kamtera/persian-tts-female-glow_tts/resolve/main/"],
    ["vits male1 (best)", "checkpoint_88000.pth", "config.json", "https://huggingface.co/Kamtera/persian-tts-male1-vits/resolve/main/"],
    ["vits female1", "checkpoint_50000.pth", "config.json", "https://huggingface.co/Kamtera/persian-tts-female1-vits/resolve/main/"],
    ["female tacotron2", "checkpoint_313000.pth", "config-2.json", "https://huggingface.co/Kamtera/persian-tts-female-tacotron2/resolve/main/"],
]
for d in modelInfo:
    if not os.path.exists(d[0]):
        os.makedirs(d[0])
        download_url(d[3] + d[1], d[0], "best_model.pth")
        download_url(d[3] + d[2], d[0], "config.json")
def split_chapters(text):
    # Split on heading lines that begin with "فصل" (chapter) or "بخش" (section).
    chapters = re.split(r'\n\s*(?:فصل|بخش)[^\n]*\n', text)
    return [ch.strip() for ch in chapters if ch.strip()]
def synthesize_text(text, synthesizer):
    # Synthesize in MAX_TXT_LEN-character chunks, writing each chunk to its own WAV file.
    chunks = [text[i:i + MAX_TXT_LEN] for i in range(0, len(text), MAX_TXT_LEN)]
    audio_paths = []
    for chunk in chunks:
        wav = synthesizer.tts(chunk)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            synthesizer.save_wav(wav, fp.name)
            audio_paths.append(fp.name)
    return audio_paths
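# Optional alternative to the external `sox` command used in pdf_to_chapter_audio
# below: a minimal sketch that concatenates the chunk WAVs with the standard-library
# `wave` module. It assumes every chunk shares the same channel count, sample width,
# and sample rate (true when they come from the same synthesizer). The helper name
# `merge_wavs` is illustrative and not part of the original app.
import wave

def merge_wavs(paths, out_path):
    # Copy the format parameters from the first chunk, then append all frames in order.
    with wave.open(paths[0], "rb") as first:
        params = first.getparams()
    with wave.open(out_path, "wb") as out:
        out.setparams(params)
        for p in paths:
            with wave.open(p, "rb") as w:
                out.writeframes(w.readframes(w.getnframes()))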
def pdf_to_chapter_audio(pdf_file, model_name):
    # Load the selected model.
    synthesizer = Synthesizer(f"{model_name}/best_model.pth", f"{model_name}/config.json")
    # Extract the PDF text.
    doc = fitz.open(pdf_file.name)
    text = "\n".join(page.get_text() for page in doc)
    chapters = split_chapters(text)
    chapter_audios = []
    for i, chapter in enumerate(chapters):
        paths = synthesize_text(chapter, synthesizer)
        chapter_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        # Merge the chunk WAVs into one file per chapter; requires the sox CLI on PATH.
        os.system(f"sox {' '.join(paths)} {chapter_path}")
        chapter_audios.append((f"Chapter {i + 1}", chapter_path))
    return chapter_audios
gr.Interface(
    fn=pdf_to_chapter_audio,
    inputs=[
        gr.File(label="Upload Persian PDF Book"),
        gr.Radio(label="Pick a TTS Model", choices=MODEL_NAMES, value="vits female (best)"),
    ],
    outputs=gr.Dataset(components=["text", gr.Audio(label="Chapter Audio", type="filepath")]),
    title="📚 Persian Book to Audio Chapters",
    description="Upload a Persian PDF book and convert each chapter into audio using a TTS model.",
).launch()