Hassan-16 committed on
Commit
92ccd47
·
verified ·
1 Parent(s): 1509739

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -64
app.py CHANGED
@@ -23,8 +23,18 @@ CUDA_AVAILABLE = torch.cuda.is_available()
23
  device = "cuda" if CUDA_AVAILABLE else "cpu"
24
  logger.info(f"Using hardware: {device}")
25
 
26
- # Load a single model instance
27
- model = KModel("hexgrad/Kokoro-82M").to(device).eval()
 
 
 
 
 
 
 
 
 
 
28
 
29
  # Define pipelines for American ('a') and British ('b') English
30
  pipelines = {
@@ -39,7 +49,45 @@ try:
39
  except AttributeError as e:
40
  logger.warning(f"Could not set custom pronunciations: {e}")
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
 
43
  voice_path = os.path.join(VOICE_DIR, voice)
44
  if not os.path.exists(voice_path):
45
  raise FileNotFoundError(f"Voice file not found: {voice_path}")
@@ -47,16 +95,20 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
47
  pipeline = pipelines[voice[0]]
48
  use_gpu = use_gpu and CUDA_AVAILABLE
49
  try:
 
 
50
  generator = pipeline(text, voice=voice_path, speed=speed)
51
  for _, ps, audio in generator:
 
52
  return (24000, audio.numpy()), ps
53
  except gr.exceptions.Error as e:
54
  if use_gpu:
55
  gr.Warning(str(e))
56
- gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
57
  model.to("cpu")
58
  generator = pipeline(text, voice=voice_path, speed=speed)
59
  for _, ps, audio in generator:
 
60
  return (24000, audio.numpy()), ps
61
  else:
62
  raise gr.Error(e)
@@ -74,55 +126,23 @@ def tokenize_first(text, voice="af_bella.pt"):
74
  return ""
75
 
76
  def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
 
77
  voice_path = os.path.join(VOICE_DIR, voice)
78
  if not os.path.exists(voice_path):
79
  raise FileNotFoundError(f"Voice file not found: {voice_path}")
80
 
81
  pipeline = pipelines[voice[0]]
82
  use_gpu = use_gpu and CUDA_AVAILABLE
83
- first = True
84
- if not use_gpu:
85
  model.to("cpu")
 
86
  generator = pipeline(text, voice=voice_path, speed=speed)
87
  for _, _, audio in generator:
88
  yield 24000, audio.numpy()
89
  if first:
90
  first = False
91
  yield 24000, torch.zeros(1).numpy()
92
-
93
- # Dynamically load .pt voice files from VOICE_DIR
94
- def load_voice_choices():
95
- voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
96
- choices = {}
97
- for voice_file in voice_files:
98
- prefix = voice_file[:2]
99
- if prefix == 'af':
100
- label = f"๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ {voice_file[3:-3].capitalize()}"
101
- elif prefix == 'am':
102
- label = f"๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน {voice_file[3:-3].capitalize()}"
103
- elif prefix == 'bf':
104
- label = f"๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ {voice_file[3:-3].capitalize()}"
105
- elif prefix == 'bm':
106
- label = f"๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน {voice_file[3:-3].capitalize()}"
107
- else:
108
- label = f"Unknown {voice_file[:-3]}"
109
- choices[label] = voice_file
110
- return choices
111
-
112
- CHOICES = load_voice_choices()
113
-
114
- # Log available voices
115
- for label, voice_path in CHOICES.items():
116
- full_path = os.path.join(VOICE_DIR, voice_path)
117
- if not os.path.exists(full_path):
118
- logger.warning(f"Voice file not found: {full_path}")
119
- else:
120
- logger.info(f"Loaded voice: {label} ({voice_path})")
121
-
122
- # If no voices are found, add a default fallback
123
- if not CHOICES:
124
- logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
125
- CHOICES = {"๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella ๐Ÿ”ฅ": "af_bella.pt"}
126
 
127
  TOKEN_NOTE = '''
128
  ๐Ÿ’ก Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kหˆOkษ™ษนO/)
@@ -134,45 +154,43 @@ TOKEN_NOTE = '''
134
  โฌ†๏ธ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
135
  '''
136
 
137
- with gr.Blocks() as generate_tab:
138
- out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
139
- generate_btn = gr.Button("Generate", variant="primary")
140
- with gr.Accordion("Output Tokens", open=True):
141
- out_ps = gr.Textbox(interactive=False, show_label=False,
142
- info="Tokens used to generate the audio, up to 510 context length.")
143
- tokenize_btn = gr.Button("Tokenize", variant="secondary")
144
- gr.Markdown(TOKEN_NOTE)
145
-
146
- with gr.Blocks() as stream_tab:
147
- out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
148
- with gr.Row():
149
- stream_btn = gr.Button("Stream", variant="primary")
150
- stop_btn = gr.Button("Stop", variant="stop")
151
- with gr.Accordion("Note", open=True):
152
- gr.Markdown("โš ๏ธ There may be delays in streaming audio due to processing limitations.")
153
-
154
- with gr.Blocks() as app:
155
  with gr.Row():
156
  with gr.Column():
157
- text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
158
  with gr.Row():
159
  voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
160
  info="Quality and availability vary by language")
161
  use_gpu = gr.Dropdown(
162
- [("GPU ๏ฟฝ-held", True), ("CPU ๐ŸŒ", False)],
163
  value=CUDA_AVAILABLE,
164
  label="Hardware",
165
- info="GPU is usually faster, but may require CUDA support",
166
  interactive=CUDA_AVAILABLE
167
  )
168
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
169
  with gr.Column():
170
- gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
171
- generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu],
172
- outputs=[out_audio, out_ps])
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
174
  stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
175
  stop_btn.click(fn=None, cancels=[stream_event])
176
 
177
  if __name__ == "__main__":
178
- app.queue().launch()
 
 
 
23
# Select the inference device once at startup; all pipelines share it.
device = "cuda" if CUDA_AVAILABLE else "cpu"
logger.info(f"Using hardware: {device}")

# Persistent cache directory next to this file, so repeated launches reuse
# the downloaded model weights instead of re-fetching them.
MODEL_CACHE_DIR = os.path.join(os.path.dirname(__file__), "model_cache")
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)

# Load a single shared model instance with caching. Loading is timed so slow
# cold starts show up in the logs. A failure here is fatal — the app cannot
# serve any request without the model — so log and re-raise.
try:
    start_time = time.time()
    model = KModel("hexgrad/Kokoro-82M", cache_dir=MODEL_CACHE_DIR).to(device).eval()
    logger.info(f"Model loading time: {time.time() - start_time} seconds")
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    raise
 
39
  # Define pipelines for American ('a') and British ('b') English
40
  pipelines = {
 
49
  except AttributeError as e:
50
  logger.warning(f"Could not set custom pronunciations: {e}")
51
 
52
# Cached mapping of dropdown label -> voice filename; populated lazily by
# load_voice_choices() so the directory is scanned at most once.
VOICE_CHOICES = None

# Voice-filename prefix -> label prefix (flag + gender marker) for known voices.
_VOICE_LABEL_PREFIXES = {
    'af': '๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ',
    'am': '๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน',
    'bf': '๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ',
    'bm': '๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน',
}

def load_voice_choices():
    """Scan VOICE_DIR for .pt voice files and return {display label: filename}.

    The result is memoized in the module-global VOICE_CHOICES, so repeated
    calls do not re-scan the directory. When no voice files are found, a
    single placeholder entry is returned so the UI dropdown is never empty.

    Returns:
        dict[str, str]: dropdown label -> voice filename (e.g. "af_bella.pt").
    """
    global VOICE_CHOICES
    if VOICE_CHOICES is not None:
        return VOICE_CHOICES
    choices = {}
    for voice_file in os.listdir(VOICE_DIR):
        if not voice_file.endswith('.pt'):
            continue
        # Lookup table replaces the old if/elif chain over the 2-char prefix.
        label_prefix = _VOICE_LABEL_PREFIXES.get(voice_file[:2])
        if label_prefix is not None:
            # e.g. "af_bella.pt" -> "<flag> <gender> Bella"
            label = f"{label_prefix} {voice_file[3:-3].capitalize()}"
        else:
            label = f"Unknown {voice_file[:-3]}"
        choices[label] = voice_file
    if not choices:
        logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
        choices = {"๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella ๐Ÿ”ฅ": "af_bella.pt"}
    VOICE_CHOICES = choices
    return choices

CHOICES = load_voice_choices()
80
+
81
# Report every configured voice, warning when its backing file is missing.
for voice_label, voice_file in CHOICES.items():
    full_path = os.path.join(VOICE_DIR, voice_file)
    if os.path.exists(full_path):
        logger.info(f"Loaded voice: {voice_label} ({voice_file})")
    else:
        logger.warning(f"Voice file not found: {full_path}")
88
+
89
def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Synthesize *text* and return the first generated audio segment.

    Args:
        text: Input text to synthesize.
        voice: Voice filename inside VOICE_DIR; its first letter selects the
            American ('a') or British ('b') pipeline.
        speed: Playback speed multiplier.
        use_gpu: Request GPU inference; silently downgraded to CPU when CUDA
            is unavailable.

    Returns:
        ((24000, audio_ndarray), tokens) for the first yielded segment.

    Raises:
        FileNotFoundError: If the voice file does not exist.
        gr.Error: If generation fails (after one CPU retry when GPU was
            requested), or if the pipeline produces no audio at all.
    """
    start_time = time.time()
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    pipeline = pipelines[voice[0]]
    use_gpu = use_gpu and CUDA_AVAILABLE
    try:
        # Keep the shared model on CPU when GPU inference was not requested.
        if not use_gpu and model.device.type != "cpu":
            model.to("cpu")
        generator = pipeline(text, voice=voice_path, speed=speed)
        # Only the first yielded segment is wanted.
        for _, ps, audio in generator:
            logger.info(f"Generation time: {time.time() - start_time} seconds")
            return (24000, audio.numpy()), ps
    except gr.exceptions.Error as e:
        if use_gpu:
            # GPU path failed: surface the error and retry once on the CPU.
            gr.Warning(str(e))
            gr.Info("Retrying with CPU.")
            model.to("cpu")
            generator = pipeline(text, voice=voice_path, speed=speed)
            for _, ps, audio in generator:
                logger.info(f"Generation time (CPU retry): {time.time() - start_time} seconds")
                return (24000, audio.numpy()), ps
        else:
            # BUG FIX: pass a string to gr.Error (not the exception object)
            # and chain the original cause for debuggability.
            raise gr.Error(str(e)) from e
    # BUG FIX: previously fell through and returned None when the pipeline
    # yielded nothing; fail loudly so the UI reports the problem instead.
    raise gr.Error("No audio was generated for the given input.")
 
126
  return ""
127
 
128
def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Stream synthesized audio for *text*, yielding (24000, samples) chunks.

    Immediately after the first chunk a one-sample block of silence is also
    yielded, matching the original behavior. Raises FileNotFoundError when
    the requested voice file is absent.
    """
    start_time = time.time()
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    pipeline = pipelines[voice[0]]
    use_gpu = use_gpu and CUDA_AVAILABLE
    # Move the shared model to CPU when GPU inference was not requested.
    if not use_gpu and model.device.type != "cpu":
        model.to("cpu")

    # enumerate replaces the old manual `first` flag for the extra silence chunk.
    for chunk_index, (_, _, audio) in enumerate(pipeline(text, voice=voice_path, speed=speed)):
        yield 24000, audio.numpy()
        if chunk_index == 0:
            yield 24000, torch.zeros(1).numpy()
    logger.info(f"Streaming generation time: {time.time() - start_time} seconds")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  TOKEN_NOTE = '''
148
  ๐Ÿ’ก Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kหˆOkษ™ษนO/)
 
154
  โฌ†๏ธ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
155
  '''
156
 
157
# ---------------------------------------------------------------------------
# Gradio UI: input column (text, voice, hardware, speed) plus an output column
# with a one-shot "Generate" tab and a chunked "Stream" tab.
# ---------------------------------------------------------------------------
with gr.Blocks(theme="soft") as app:
    with gr.Row():
        with gr.Column():
            # NOTE(review): TEXT is assumed to be a default-sample constant
            # defined earlier in the file — confirm it exists.
            text = gr.Textbox(label="Input Text", value=TEXT, info="Arbitrarily many characters supported")
            with gr.Row():
                voice = gr.Dropdown(
                    list(CHOICES.items()),
                    # Prefer the Bella voice when available, else the first voice found.
                    value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0],
                    label="Voice",
                    info="Quality and availability vary by language",
                )
                use_gpu = gr.Dropdown(
                    [("GPU ๐Ÿš€", True), ("CPU ๐ŸŒ", False)],
                    value=CUDA_AVAILABLE,
                    label="Hardware",
                    info="GPU is faster but requires CUDA support",
                    interactive=CUDA_AVAILABLE,  # lock to CPU when CUDA is absent
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        with gr.Column():
            with gr.Tab(label="Generate"):
                out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
                generate_btn = gr.Button("Generate", variant="primary")
                with gr.Accordion("Output Tokens", open=True):
                    out_ps = gr.Textbox(interactive=False, show_label=False,
                                        info="Tokens used to generate the audio, up to 510 context length.")
                    tokenize_btn = gr.Button("Tokenize", variant="secondary")
                    gr.Markdown(TOKEN_NOTE)
            with gr.Tab(label="Stream"):
                out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
                with gr.Row():
                    stream_btn = gr.Button("Stream", variant="primary")
                    stop_btn = gr.Button("Stop", variant="stop")
                gr.Markdown("โš ๏ธ Streaming may have initial delays due to processing.")

    # Wire UI events to the synthesis helpers.
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
    stop_btn.click(fn=None, cancels=[stream_event])

if __name__ == "__main__":
    logger.info("Starting Gradio app...")
    # BUG FIX: streaming output and event cancellation (cancels=...) require
    # the request queue; the pre-refactor code used app.queue().launch() and
    # dropping .queue() breaks the Stream/Stop buttons.
    app.queue().launch()
    # launch() blocks until the server shuts down, so log accordingly.
    logger.info("Gradio app stopped.")