| |
| """ |
| Simple example script for Pocket TTS ONNX inference. |
| |
| Usage: |
| python generate.py "Hello, this is a test." samples/reference.wav output.wav |
| python generate.py "Hello world" samples/expresso_02_ex03-ex01_calm_005.wav output.wav |
| """ |
|
|
| import argparse |
| import time |
| from pocket_tts_onnx import PocketTTSOnnx |
|
|
|
|
def main():
    """Synthesize speech from the command line and save it to a file.

    Parses the positional ``text``, ``voice`` and ``output`` arguments plus
    the optional ``--precision`` flag, loads ``PocketTTSOnnx`` at the chosen
    precision, generates audio for the text using the voice reference,
    prints load/generation timings, and writes the audio with
    ``tts.save_audio``.
    """
    parser = argparse.ArgumentParser(description="Generate speech with Pocket TTS ONNX")
    parser.add_argument("text", help="Text to synthesize")
    parser.add_argument("voice", help="Path to voice reference audio file")
    parser.add_argument("output", help="Output audio file path")
    parser.add_argument(
        "--precision",
        choices=["int8", "fp32"],
        default="int8",
        help="Model precision (default: int8)",
    )
    args = parser.parse_args()

    print(f"Loading models (precision={args.precision})...")
    # perf_counter() is monotonic and high-resolution, so elapsed-time
    # measurements are not skewed by system clock adjustments.
    load_start = time.perf_counter()
    tts = PocketTTSOnnx(precision=args.precision)
    load_time = time.perf_counter() - load_start
    print(f" Loaded in {load_time:.2f}s")

    print("Generating speech...")
    print(f" Text: {args.text}")
    print(f" Voice: {args.voice}")

    gen_start = time.perf_counter()
    audio = tts.generate(args.text, voice=args.voice)
    gen_time = time.perf_counter() - gen_start

    # Assumes len(audio) is the number of samples at tts.SAMPLE_RATE
    # -- TODO confirm against PocketTTSOnnx.generate.
    duration = len(audio) / tts.SAMPLE_RATE
    # RTFx = seconds of audio produced per second of compute. Guard against
    # a zero-length measured interval instead of raising ZeroDivisionError.
    rtfx = duration / gen_time if gen_time > 0 else float("inf")

    print(f" Generated {duration:.2f}s audio in {gen_time:.2f}s (RTFx: {rtfx:.2f}x)")

    tts.save_audio(audio, args.output)
    print(f" Saved to: {args.output}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|