"""Gradio app that turns a Persian PDF book into one audio file per chapter.

On start-up the script makes sure every TTS model listed in ``modelInfo`` is
available locally (downloading missing files), then launches a Gradio
interface whose handler extracts the PDF text, splits it into chapters,
synthesizes each chapter in chunks and merges the chunks with ``sox``.
"""

import contextlib
import os
import re
import subprocess
import tempfile

import fitz  # PyMuPDF
import gradio as gr
from TTS.utils.download import download_url
from TTS.utils.synthesizer import Synthesizer

MODEL_NAMES = [
    "vits male1 (best)",
    "vits female (best)",
    "vits-male",
    "vits female1",
    "glowtts-male",
    "glowtts-female",
    "female tacotron2",
]

# Maximum number of characters handed to the synthesizer in a single call.
MAX_TXT_LEN = 800

# Load/download models if not already present.
# Each entry: [local directory (also the UI label), remote checkpoint file,
# remote config file, base URL].
modelInfo = [
    ["vits-male", "best_model_65633.pth", "config-0.json", "https://huggingface.co/Kamtera/persian-tts-male-vits/resolve/main/"],
    ["vits female (best)", "checkpoint_48000.pth", "config-2.json", "https://huggingface.co/Kamtera/persian-tts-female-vits/resolve/main/"],
    ["glowtts-male", "best_model_77797.pth", "config-1.json", "https://huggingface.co/Kamtera/persian-tts-male-glow_tts/resolve/main/"],
    ["glowtts-female", "best_model.pth", "config.json", "https://huggingface.co/Kamtera/persian-tts-female-glow_tts/resolve/main/"],
    ["vits male1 (best)", "checkpoint_88000.pth", "config.json", "https://huggingface.co/Kamtera/persian-tts-male1-vits/resolve/main/"],
    ["vits female1", "checkpoint_50000.pth", "config.json", "https://huggingface.co/Kamtera/persian-tts-female1-vits/resolve/main/"],
    ["female tacotron2", "checkpoint_313000.pth", "config-2.json", "https://huggingface.co/Kamtera/persian-tts-female-tacotron2/resolve/main/"],
]

for model_dir, checkpoint_name, config_name, base_url in modelInfo:
    os.makedirs(model_dir, exist_ok=True)
    # Check each file rather than the directory: a run interrupted between the
    # two downloads leaves the directory behind, and a directory-only check
    # would then never fetch the missing file.
    if not os.path.exists(os.path.join(model_dir, "best_model.pth")):
        download_url(base_url + checkpoint_name, model_dir, "best_model.pth")
    if not os.path.exists(os.path.join(model_dir, "config.json")):
        download_url(base_url + config_name, model_dir, "config.json")

# A line starting with the Persian words for "chapter" or "section" is a heading.
_CHAPTER_HEADING_RE = re.compile(r'\n\s*(?:فصل|بخش)[^\n]*\n')


def split_chapters(text):
    """Split *text* at chapter/section heading lines.

    The heading lines themselves are dropped. Returns the non-empty,
    whitespace-stripped pieces in document order; text before the first
    heading is returned as the first piece.
    """
    chapters = _CHAPTER_HEADING_RE.split(text)
    return [ch.strip() for ch in chapters if ch.strip()]


def _remove_files(paths):
    """Delete every file in *paths*, ignoring files that cannot be removed."""
    for path in paths:
        with contextlib.suppress(OSError):
            os.remove(path)


def _new_wav_path():
    """Create an empty temporary ``.wav`` file and return its path.

    The descriptor is closed immediately so other code (the synthesizer,
    ``sox``) can write to the path without a second open handle on it.
    """
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    return path


def _chunk_text(text, limit):
    """Split *text* into pieces of at most *limit* characters.

    A piece is cut at the last space or newline inside the window when there
    is one, so words are not split across two audio clips; a run of *limit*
    characters without whitespace is still cut hard at the limit.
    Whitespace-only pieces are skipped.
    """
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total:
            cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
            # cut > start guarantees forward progress on every iteration.
            if cut > start:
                end = cut
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks


def synthesize_text(text, synthesizer):
    """Synthesize *text* in chunks of at most ``MAX_TXT_LEN`` characters.

    Args:
        text: The text to speak.
        synthesizer: Object providing ``tts(text)`` and ``save_wav(wav, path)``.

    Returns:
        Paths of the temporary WAV files, one per chunk, in reading order.
        The caller owns these files and is responsible for deleting them.
    """
    audio_paths = []
    try:
        for chunk in _chunk_text(text, MAX_TXT_LEN):
            wav = synthesizer.tts(chunk)
            wav_path = _new_wav_path()
            audio_paths.append(wav_path)
            synthesizer.save_wav(wav, wav_path)
    except BaseException:
        # Do not leave half a chapter's worth of clips behind on failure.
        _remove_files(audio_paths)
        raise
    return audio_paths


def _merge_wavs(paths):
    """Concatenate the WAV files in *paths* into one file and return its path.

    A single input is returned unchanged. Otherwise ``sox`` is invoked with an
    argument list (no shell), the inputs are deleted afterwards, and a failed
    or missing ``sox`` raises instead of silently yielding an empty file.
    """
    if len(paths) == 1:
        return paths[0]
    merged_path = _new_wav_path()
    try:
        subprocess.run(["sox", *paths, merged_path], check=True)
    except (OSError, subprocess.CalledProcessError):
        _remove_files([merged_path])
        raise
    finally:
        _remove_files(paths)
    return merged_path


def pdf_to_chapter_audio(pdf_file, model_name):
    """Convert an uploaded PDF into a list of per-chapter audio files.

    Args:
        pdf_file: The upload from the ``gr.File`` input.
        model_name: Directory name of the TTS model to use (one of
            ``MODEL_NAMES``).

    Returns:
        A list of ``(label, wav_path)`` tuples, one per chapter.

    Raises:
        gr.Error: If no file was uploaded.
        subprocess.CalledProcessError, OSError: If merging with ``sox`` fails.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")
    # NOTE(review): the upload may arrive as an object exposing ``.name`` or as
    # a plain path string depending on the Gradio version — accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Load model
    synthesizer = Synthesizer(f"{model_name}/best_model.pth", f"{model_name}/config.json")

    # Extract PDF text
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    chapter_audios = []
    for i, chapter in enumerate(split_chapters(text)):
        paths = synthesize_text(chapter, synthesizer)
        if not paths:
            continue
        # Merge if multiple chunks
        chapter_audios.append((f"Chapter {i+1}", _merge_wavs(paths)))
    return chapter_audios


gr.Interface(
    fn=pdf_to_chapter_audio,
    inputs=[
        gr.File(label="Upload Persian PDF Book"),
        gr.Radio(label="Pick a TTS Model", choices=MODEL_NAMES, value="vits female (best)"),
    ],
    outputs=gr.Dataset(components=["text", gr.Audio(label="Chapter Audio", type='filepath')]),
    title="📚 Persian Book to Audio Chapters",
    description="Upload a Persian PDF book and convert each chapter into audio using a TTS model."
).launch()