Delete app.py

#7
by deleted - opened
Files changed (1)
  1. app.py +0 -263
app.py DELETED
@@ -1,263 +0,0 @@
- import spaces
- import os
-
- import gradio as gr
- import numpy as np
-
- import torch
- import torchaudio
- from generator import Segment, load_csm_1b
- from huggingface_hub import hf_hub_download, login
- from watermarking import watermark
-
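- # Configuration comes from environment variables (assumed to be set in the Space settings):
- # HF_TOKEN authenticates the checkpoint download, GPU_TIMEOUT caps GPU seconds per call,
- # and WATERMARK_KEY is a space-separated list of integers used as the watermark key.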
- api_key = os.getenv("HF_TOKEN")
- gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
- CSM_1B_HF_WATERMARK = list(map(int, os.getenv("WATERMARK_KEY").split(" ")))
-
- login(token=api_key)
-
- SPACE_INTRO_TEXT = """\
- # Sesame CSM 1B
-
- Generate speech with CSM 1B (Conversational Speech Model).
- Code is available on GitHub: [SesameAILabs/csm](https://github.com/SesameAILabs/csm).
- The checkpoint is [hosted on Hugging Face](https://huggingface.co/sesame/csm-1b).
-
- Try out our interactive demo at [sesame.com/voicedemo](https://www.sesame.com/voicedemo),
- which uses a fine-tuned variant of CSM.
-
- The model has some capacity for non-English languages due to data contamination in the
- training data, but it is likely to perform poorly on them.
-
- ---
-
- """
-
- CONVO_INTRO_TEXT = """\
- ## Conversation content
-
- Each line is an utterance in the conversation to generate. Speakers alternate between A and B, starting with speaker A.
- """
-
- DEFAULT_CONVERSATION = """\
- Hey how are you doing.
- Pretty good, pretty good.
- I'm great, so happy to be speaking to you.
- Me too, this is some cool stuff huh?
- Yeah, I've been reading more about speech generation, and it really seems like context is important.
- Definitely.
- """
-
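- # Predefined reference voices: each entry pairs a transcript with the audio clip it
- # transcribes; this pairing is what conditions the generator on a speaker's voice.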
- SPEAKER_PROMPTS = {
-     "conversational_a": {
-         "text": (
-             "like revising for an exam I'd have to try and like keep up the momentum because I'd "
-             "start really early I'd be like okay I'm gonna start revising now and then like "
-             "you're revising for ages and then I just like start losing steam I didn't do that "
-             "for the exam we had recently to be fair that was a more of a last minute scenario "
-             "but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I "
-             "sort of start the day with this not like a panic but like a"
-         ),
-         "audio": "prompts/conversational_a.wav",
-     },
-     "conversational_b": {
-         "text": (
-             "like a super Mario level. Like it's very like high detail. And like, once you get "
-             "into the park, it just like, everything looks like a computer game and they have all "
-             "these, like, you know, if, if there's like a, you know, like in a Mario game, they "
-             "will have like a question block. And if you like, you know, punch it, a coin will "
-             "come out. So like everyone, when they come into the park, they get like this little "
-             "bracelet and then you can go punching question blocks around."
-         ),
-         "audio": "prompts/conversational_b.wav",
-     },
-     "read_speech_a": {
-         "text": (
-             "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little "
-             "like those of the sea eagle, and the ghost of his smile that flickered on his "
-             "singularly pale face, with a stern and insidious look, confronted me."
-         ),
-         "audio": "prompts/read_speech_a.wav",
-     },
-     "read_speech_b": {
-         "text": (
-             "He was such a big boy that he wore high boots and carried a jack knife. He gazed and "
-             "gazed at the cap, and could not keep from fingering the blue tassel."
-         ),
-         "audio": "prompts/read_speech_b.wav",
-     },
-     "read_speech_c": {
-         "text": (
-             "All passed so quickly, there was so much going on around him, the Tree quite forgot "
-             "to look to himself."
-         ),
-         "audio": "prompts/read_speech_c.wav",
-     },
-     "read_speech_d": {
-         "text": (
-             "Suddenly I was back in the old days Before you felt we ought to drift apart. It was "
-             "some trick-the way your eyebrows raise."
-         ),
-         "audio": "prompts/read_speech_d.wav",
-     },
- }
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- generator = load_csm_1b(device=device)
-
-
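- # @spaces.GPU reserves ZeroGPU hardware for each call, for at most gpu_timeout seconds.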
- @spaces.GPU(duration=gpu_timeout)
- def infer(
-     text_prompt_speaker_a,
-     text_prompt_speaker_b,
-     audio_prompt_speaker_a,
-     audio_prompt_speaker_b,
-     gen_conversation_input,
- ) -> tuple[int, np.ndarray]:
-     """Generates an audio conversation between two speakers."""
-
-     # Rough input-length cap; without it, generation could fail only after many
-     # utterances have already been produced.
-     if len(gen_conversation_input.strip() + text_prompt_speaker_a.strip() + text_prompt_speaker_b.strip()) >= 2000:
-         raise gr.Error("Prompts and conversation too long.", duration=30)
-
-     try:
-         return _infer(
-             text_prompt_speaker_a,
-             text_prompt_speaker_b,
-             audio_prompt_speaker_a,
-             audio_prompt_speaker_b,
-             gen_conversation_input,
-         )
-     except ValueError as e:
-         raise gr.Error(f"Error generating audio: {e}", duration=120) from e
-
-
- def _infer(
-     text_prompt_speaker_a,
-     text_prompt_speaker_b,
-     audio_prompt_speaker_a,
-     audio_prompt_speaker_b,
-     gen_conversation_input,
- ) -> tuple[int, np.ndarray]:
-     audio_prompt_a = prepare_prompt(text_prompt_speaker_a, 0, audio_prompt_speaker_a)
-     audio_prompt_b = prepare_prompt(text_prompt_speaker_b, 1, audio_prompt_speaker_b)
-
-     prompt_segments: list[Segment] = [audio_prompt_a, audio_prompt_b]
-     generated_segments: list[Segment] = []
-
-     conversation_lines = [line.strip() for line in gen_conversation_input.strip().split("\n") if line.strip()]
-     for i, line in enumerate(conversation_lines):
-         # Speakers alternate between A and B, starting with A
-         speaker_id = i % 2
-
-         audio_tensor = generator.generate(
-             text=line,
-             speaker=speaker_id,
-             context=prompt_segments + generated_segments,
-             max_audio_length_ms=30_000,
-         )
-         generated_segments.append(Segment(text=line, speaker=speaker_id, audio=audio_tensor))
-
-     # Concatenate all generations and convert to 16-bit int format
-     audio_tensors = [segment.audio for segment in generated_segments]
-     audio_tensor = torch.cat(audio_tensors, dim=0)
-
-     # This applies an imperceptible watermark to identify audio as AI-generated.
-     # Watermarking ensures transparency, dissuades misuse, and enables traceability.
-     # Please be a responsible AI citizen and keep the watermarking in place.
-     # If using CSM 1B in another application, use your own private key and keep it secret.
-     audio_tensor, wm_sample_rate = watermark(
-         generator._watermarker, audio_tensor, generator.sample_rate, CSM_1B_HF_WATERMARK
-     )
-     audio_tensor = torchaudio.functional.resample(
-         audio_tensor, orig_freq=wm_sample_rate, new_freq=generator.sample_rate
-     )
-
-     audio_array = (audio_tensor * 32768).to(torch.int16).cpu().numpy()
-
-     return generator.sample_rate, audio_array
-
-
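- # Helpers that turn a (transcript, audio file) pair into a Segment the generator can
- # condition on.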
181
- def prepare_prompt(text: str, speaker: int, audio_path: str) -> Segment:
182
- audio_tensor, _ = load_prompt_audio(audio_path)
183
- return Segment(text=text, speaker=speaker, audio=audio_tensor)
184
-
185
-
186
- def load_prompt_audio(audio_path: str) -> torch.Tensor:
187
- audio_tensor, sample_rate = torchaudio.load(audio_path)
188
- if audio_tensor.shape[0] != 1:
189
- gr.Warning("Warning: Audio prompt is multi-channel, converting to mono.", duration=15)
190
- audio_tensor = audio_tensor.mean(dim=0)
191
- audio_tensor = audio_tensor.squeeze(0)
192
- if sample_rate != generator.sample_rate:
193
- audio_tensor = torchaudio.functional.resample(
194
- audio_tensor, orig_freq=sample_rate, new_freq=generator.sample_rate
195
- )
196
- return audio_tensor, generator.sample_rate
197
-
198
-
- def create_speaker_prompt_ui(speaker_name: str):
-     speaker_dropdown = gr.Dropdown(
-         choices=list(SPEAKER_PROMPTS.keys()), label="Select a predefined speaker", value=speaker_name
-     )
-     with gr.Accordion("Or add your own voice prompt", open=False):
-         text_prompt_speaker = gr.Textbox(label="Speaker prompt", lines=4, value=SPEAKER_PROMPTS[speaker_name]["text"])
-         audio_prompt_speaker = gr.Audio(
-             label="Speaker prompt", type="filepath", value=SPEAKER_PROMPTS[speaker_name]["audio"]
-         )
-
-     return speaker_dropdown, text_prompt_speaker, audio_prompt_speaker
-
-
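- # Gradio UI: two speaker-prompt columns, the conversation text area, and a generate
- # button wired to infer().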
- with gr.Blocks() as app:
-     gr.Markdown(SPACE_INTRO_TEXT)
-     gr.Markdown("## Voices")
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("### Speaker A")
-             speaker_a_dropdown, text_prompt_speaker_a, audio_prompt_speaker_a = create_speaker_prompt_ui(
-                 "conversational_a"
-             )
-
-         with gr.Column():
-             gr.Markdown("### Speaker B")
-             speaker_b_dropdown, text_prompt_speaker_b, audio_prompt_speaker_b = create_speaker_prompt_ui(
-                 "conversational_b"
-             )
-
-     def update_audio(speaker):
-         if speaker in SPEAKER_PROMPTS:
-             return SPEAKER_PROMPTS[speaker]["audio"]
-         return None
-
-     def update_text(speaker):
-         if speaker in SPEAKER_PROMPTS:
-             return SPEAKER_PROMPTS[speaker]["text"]
-         return None
-
-     speaker_a_dropdown.change(fn=update_audio, inputs=[speaker_a_dropdown], outputs=[audio_prompt_speaker_a])
-     speaker_b_dropdown.change(fn=update_audio, inputs=[speaker_b_dropdown], outputs=[audio_prompt_speaker_b])
-
-     speaker_a_dropdown.change(fn=update_text, inputs=[speaker_a_dropdown], outputs=[text_prompt_speaker_a])
-     speaker_b_dropdown.change(fn=update_text, inputs=[speaker_b_dropdown], outputs=[text_prompt_speaker_b])
-
-     gr.Markdown(CONVO_INTRO_TEXT)
-
-     gen_conversation_input = gr.TextArea(label="Conversation", lines=20, value=DEFAULT_CONVERSATION)
-     generate_btn = gr.Button("Generate conversation", variant="primary")
-     gr.Markdown("GPU time is limited to 3 minutes; duplicate the Space for longer usage.")
-     audio_output = gr.Audio(label="Synthesized audio")
-
-     generate_btn.click(
-         infer,
-         inputs=[
-             text_prompt_speaker_a,
-             text_prompt_speaker_b,
-             audio_prompt_speaker_a,
-             audio_prompt_speaker_b,
-             gen_conversation_input,
-         ],
-         outputs=[audio_output],
-     )
-
- app.launch(ssr_mode=True, mcp_server=True)
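
For reference, the deleted file's core generation loop reduces to a short sketch. This is a minimal example, not the full app: it assumes the same generator module and omits the voice prompts, watermarking, and UI.

import torch
from generator import Segment, load_csm_1b

generator = load_csm_1b(device="cuda" if torch.cuda.is_available() else "cpu")
context: list[Segment] = []  # voice prompt segments would normally seed this

for i, line in enumerate(["Hey how are you doing.", "Pretty good, pretty good."]):
    # Speakers alternate A (0) and B (1), starting with A; each generated
    # utterance is appended to the context so later lines stay coherent.
    audio = generator.generate(
        text=line,
        speaker=i % 2,
        context=context,
        max_audio_length_ms=30_000,
    )
    context.append(Segment(text=line, speaker=i % 2, audio=audio))

waveform = torch.cat([seg.audio for seg in context], dim=0)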