fdaudens (HF Staff) committed
Commit fa0def6 · verified · 1 Parent(s): 8be53a2

Update app.py

Files changed (1):
  1. app.py +80 -69

app.py CHANGED
@@ -28,26 +28,30 @@ def load_model():
     processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
     return model, processor
 
-def chunk_audio(audio_file, chunk_length=30):
+def chunk_audio(audio_file_path, chunk_length=30):
     """Split audio into chunks"""
-    audio = AudioSegment.from_file(audio_file.name)
-
-    audios = []
-    target_rate = 16000
-    chunk_length_ms = chunk_length * 1000
-
-    for i in range(0, len(audio), chunk_length_ms):
-        chunk = audio[i:i + chunk_length_ms]
-        chunk = chunk.set_channels(1).set_frame_rate(target_rate)
-
-        buf = io.BytesIO()
-        chunk.export(buf, format="wav")
-        buf.seek(0)
-
-        rate, data = wavfile.read(buf)
-        audios.append(data)
-
-    return audios
+    try:
+        # audio_file_path is already a string path when type="filepath"
+        audio = AudioSegment.from_file(audio_file_path)
+
+        audios = []
+        target_rate = 16000
+        chunk_length_ms = chunk_length * 1000
+
+        for i in range(0, len(audio), chunk_length_ms):
+            chunk = audio[i:i + chunk_length_ms]
+            chunk = chunk.set_channels(1).set_frame_rate(target_rate)
+
+            buf = io.BytesIO()
+            chunk.export(buf, format="wav")
+            buf.seek(0)
+
+            rate, data = wavfile.read(buf)
+            audios.append(data)
+
+        return audios
+    except Exception as e:
+        raise gr.Error(f"Error processing audio file: {str(e)}. Make sure ffmpeg is installed.")
 
 @spaces.GPU(duration=120)
 def embed_audio_chunks(audios):
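A quick way to sanity-check the chunking logic above outside the Space (not part of this commit; chunk_segment is a made-up name): pydub slices AudioSegment objects by milliseconds, so a 65-second input with chunk_length=30 should yield chunks of 30 s, 30 s and 5 s, each a mono 16 kHz numpy array.

import io

from pydub import AudioSegment
from scipy.io import wavfile

def chunk_segment(audio, chunk_length=30, target_rate=16000):
    # Same logic as chunk_audio above, but taking an AudioSegment
    # directly so it can run on synthetic audio without ffmpeg.
    chunk_length_ms = chunk_length * 1000
    audios = []
    for i in range(0, len(audio), chunk_length_ms):
        chunk = audio[i:i + chunk_length_ms].set_channels(1).set_frame_rate(target_rate)
        buf = io.BytesIO()
        chunk.export(buf, format="wav")
        buf.seek(0)
        _, data = wavfile.read(buf)
        audios.append(data)
    return audios

chunks = chunk_segment(AudioSegment.silent(duration=65_000))
assert len(chunks) == 3                 # 30 s + 30 s + 5 s
assert len(chunks[0]) == 30 * 16000     # samples per full 16 kHz chunk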
@@ -108,66 +112,73 @@ def audio_to_base64(data, rate=16000):
     encoded_string = base64.b64encode(buf.read()).decode("utf-8")
     return encoded_string
 
-def process_audio_rag(audio_file, query, chunk_length=30, use_openai=False, openai_key=None):
+def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False, openai_key=None):
     """Main processing function"""
-    if not audio_file:
+    if not audio_file_path:
         return "Please upload an audio file", None, None
 
-    # Chunk audio
-    audios = chunk_audio(audio_file, chunk_length)
-
-    # Embed chunks
-    embeddings = embed_audio_chunks(audios)
-
-    # Search for relevant chunks
-    top_indices = search_audio(query, embeddings, audios)
-
-    # Prepare results
-    result_text = f"Found {len(top_indices)} relevant audio chunks:\n"
-    result_text += f"Chunk indices: {top_indices}\n\n"
-
-    # Save first result as audio file
-    first_chunk_path = "result_chunk.wav"
-    wavfile.write(first_chunk_path, 16000, audios[top_indices[0]])
+    if not query:
+        return "Please enter a search query", None, None
+
+    try:
+        # Chunk audio
+        audios = chunk_audio(audio_file_path, chunk_length)
+
+        # Embed chunks
+        embeddings = embed_audio_chunks(audios)
+
+        # Search for relevant chunks
+        top_indices = search_audio(query, embeddings, audios)
+
+        # Prepare results
+        result_text = f"Found {len(top_indices)} relevant audio chunks:\n"
+        result_text += f"Chunk indices: {top_indices}\n\n"
+
+        # Save first result as audio file
+        first_chunk_path = "result_chunk.wav"
+        wavfile.write(first_chunk_path, 16000, audios[top_indices[0]])
 
-    # Optional: Use OpenAI for answer generation
-    if use_openai and openai_key:
-        from openai import OpenAI
-        client = OpenAI(api_key=openai_key)
+        # Optional: Use OpenAI for answer generation
+        if use_openai and openai_key:
+            from openai import OpenAI
+            client = OpenAI(api_key=openai_key)
 
-        content = [{"type": "text", "text": f"Answer the query using the audio files. Query: {query}"}]
+            content = [{"type": "text", "text": f"Answer the query using the audio files. Query: {query}"}]
 
-        for idx in top_indices[:3]:  # Use top 3 chunks
-            content.extend([
-                {"type": "text", "text": f"Audio chunk #{idx}:"},
-                {
-                    "type": "input_audio",
-                    "input_audio": {
-                        "data": audio_to_base64(audios[idx]),
-                        "format": "wav"
+            for idx in top_indices[:3]:  # Use top 3 chunks
+                content.extend([
+                    {"type": "text", "text": f"Audio chunk #{idx}:"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_to_base64(audios[idx]),
+                            "format": "wav"
+                        }
                     }
-                }
-            ])
+                ])
 
-        try:
-            completion = client.chat.completions.create(
-                model="gpt-4o-audio-preview",
-                messages=[{"role": "user", "content": content}]
-            )
-            result_text += f"\nOpenAI Answer: {completion.choices[0].message.content}"
-        except Exception as e:
-            result_text += f"\nOpenAI Error: {str(e)}"
-
-    # Create audio visualization
-    import matplotlib.pyplot as plt
-    fig, ax = plt.subplots(figsize=(10, 4))
-    ax.plot(audios[top_indices[0]])
-    ax.set_title(f"Waveform of top matching chunk (#{top_indices[0]})")
-    ax.set_xlabel("Samples")
-    ax.set_ylabel("Amplitude")
-    plt.tight_layout()
-
-    return result_text, first_chunk_path, fig
+            try:
+                completion = client.chat.completions.create(
+                    model="gpt-4o-audio-preview",
+                    messages=[{"role": "user", "content": content}]
+                )
+                result_text += f"\nOpenAI Answer: {completion.choices[0].message.content}"
+            except Exception as e:
+                result_text += f"\nOpenAI Error: {str(e)}"
+
+        # Create audio visualization
+        import matplotlib.pyplot as plt
+        fig, ax = plt.subplots(figsize=(10, 4))
+        ax.plot(audios[top_indices[0]])
+        ax.set_title(f"Waveform of top matching chunk (#{top_indices[0]})")
+        ax.set_xlabel("Samples")
+        ax.set_ylabel("Amplitude")
+        plt.tight_layout()
+
+        return result_text, first_chunk_path, fig
+
+    except Exception as e:
+        return f"Error: {str(e)}", None, None
 
 # Create Gradio interface
 with gr.Blocks(title="AudioRAG Demo") as demo:
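Only the last two lines of audio_to_base64 appear as context in this hunk. A plausible reconstruction of the full helper, consistent with that tail and with the input_audio payload built above (the body is an assumption, not part of the diff):

import base64
import io

from scipy.io import wavfile

def audio_to_base64(data, rate=16000):
    # Presumed body: serialize the numpy chunk to an in-memory WAV,
    # then base64-encode it for the OpenAI input_audio payload.
    buf = io.BytesIO()
    wavfile.write(buf, rate, data)
    buf.seek(0)
    encoded_string = base64.b64encode(buf.read()).decode("utf-8")
    return encoded_string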
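embed_audio_chunks and search_audio are unchanged by this commit, but for context: ColQwen-style retrieval uses late interaction, scoring each query token against every patch of each audio chunk and summing the per-token maxima (MaxSim). A plain-PyTorch sketch of that ranking step, with made-up shapes and names (the app's actual search_audio may rely on colpali-engine's built-in scorer instead):

import torch

def maxsim_rank(query_emb, chunk_embs, top_k=3):
    # query_emb: (q_tokens, dim); chunk_embs: list of (patches, dim).
    # For each chunk, take each query token's best-matching patch
    # similarity, then sum over query tokens (ColBERT-style MaxSim).
    scores = []
    for chunk in chunk_embs:
        sim = query_emb @ chunk.T            # (q_tokens, patches)
        scores.append(sim.max(dim=1).values.sum().item())
    return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

# Toy example: 12 query tokens, five 30 s chunks of ~300 patches each.
q = torch.randn(12, 128)
chunks = [torch.randn(300, 128) for _ in range(5)]
top_indices = maxsim_rank(q, chunks)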
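The signature change from audio_file to audio_file_path only makes sense if the interface hands the callback a plain path string. A minimal sketch of the wiring this implies (component names and layout are guesses; the app's real Blocks body follows this line in app.py): with gr.Audio(type="filepath"), Gradio passes a str, which is why chunk_audio no longer dereferences audio_file.name.

import gradio as gr

with gr.Blocks(title="AudioRAG Demo") as demo:
    # type="filepath" makes Gradio pass a plain string path to the callback.
    audio_in = gr.Audio(type="filepath", label="Audio file")
    query_in = gr.Textbox(label="Search query")
    chunk_len = gr.Slider(10, 60, value=30, step=5, label="Chunk length (s)")
    use_openai = gr.Checkbox(label="Generate answer with OpenAI")
    openai_key = gr.Textbox(label="OpenAI API key", type="password")
    run_btn = gr.Button("Search")

    out_text = gr.Textbox(label="Results")
    out_audio = gr.Audio(label="Top matching chunk")
    out_plot = gr.Plot(label="Waveform")

    run_btn.click(
        process_audio_rag,
        inputs=[audio_in, query_in, chunk_len, use_openai, openai_key],
        outputs=[out_text, out_audio, out_plot],
    )

demo.launch()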