Polarisailabs committed (verified)
Commit c73fb78 · Parent(s): bcbfc1f

Upload 5 files
Files changed (6)
  1. .gitattributes +3 -0
  2. app.py +269 -0
  3. video_data/.DS_Store +0 -0
  4. video_data/1.mp4 +3 -0
  5. video_data/2.mp4 +3 -0
  6. video_data/3.mp4 +3 -0
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ video_data/1.mp4 filter=lfs diff=lfs merge=lfs -text
+ video_data/2.mp4 filter=lfs diff=lfs merge=lfs -text
+ video_data/3.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,269 @@
+ # -*- coding: utf-8 -*-
+ # Install required libraries if running outside Colab:
+ # !pip install gradio yt-dlp moviepy pillow speechrecognition llama-index lancedb google-generativeai
+
+ import gradio as gr
+ from moviepy import VideoFileClip  # moviepy >= 2.0; use "from moviepy.editor import VideoFileClip" on 1.x
+ from pathlib import Path
+ import speech_recognition as sr
+ from PIL import Image
+ import os
+ import json
+ import yt_dlp
+ import requests
+ import base64
+ from io import BytesIO
+
+ # Video processing helpers: download_video, video_to_images, video_to_audio,
+ # audio_to_text, prepare_all_videos
+
+ def plot_images(image_paths):
+     # Collects up to 7 existing image files for display in the gallery
+     img_files = []
+     for img_path in image_paths:
+         if os.path.isfile(img_path):
+             img_files.append(img_path)
+             if len(img_files) >= 7:
+                 break
+     return img_files
+
+ def download_video(video_url, output_video_path="./video_data/"):
+     ydl_opts = {
+         "format": "bestvideo+bestaudio/best",
+         "merge_output_format": "mp4",
+         "outtmpl": f"{output_video_path}/input_vid.mp4",
+         "noplaylist": True,
+         "quiet": False,
+         # Uncomment and set your cookie file path if required
+         # "cookiefile": "cookies.txt",
+     }
+     Path(output_video_path).mkdir(parents=True, exist_ok=True)
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         info = ydl.extract_info(video_url, download=True)
+         info = ydl.sanitize_info(info)
+     return {
+         "title": info.get("title"),
+         "uploader": info.get("uploader"),
+         "views": info.get("view_count"),
+     }
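+ # Example call (hypothetical URL; yt-dlp must be able to reach it):
+ #   meta = download_video("https://www.youtube.com/watch?v=...")
+ #   meta -> {"title": "...", "uploader": "...", "views": <int or None>}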
+
+ def video_to_images(video_path, output_folder):
+     Path(output_folder).mkdir(parents=True, exist_ok=True)
+     clip = VideoFileClip(video_path)
+     clip.write_images_sequence(
+         os.path.join(output_folder, "frame%04d.png"), fps=0.2
+     )
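+ # At fps=0.2, write_images_sequence samples one frame every 5 seconds
+ # (frame0000.png, frame0001.png, ...), so a 60 s clip yields about 12 frames (60 * 0.2).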
+
+ def video_to_audio(video_path, output_audio_path):
+     clip = VideoFileClip(video_path)
+     audio = clip.audio
+     if audio is None:
+         raise ValueError(f"No audio track found in {video_path}")
+     audio.write_audiofile(output_audio_path)
+
+ def audio_to_text(audio_path):
+     recognizer = sr.Recognizer()
+     try:
+         with sr.AudioFile(audio_path) as source:
+             audio_data = recognizer.record(source)
+         text = recognizer.recognize_google(audio_data)
+         return text
+     except sr.UnknownValueError:
+         print("Google Speech Recognition could not understand the audio.")
+     except sr.RequestError as e:
+         print(f"Could not request results: {e}")
+     return None
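+ # recognize_google calls Google's free Web Speech API via the SpeechRecognition
+ # package, so transcription needs network access; very long recordings may need
+ # to be split into chunks before transcription.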
+
+ def prepare_all_videos(
+     video_folder="./video_data/",
+     output_folder="./mixed_data/"
+ ):
+     """
+     Processes all video files in video_folder, extracting frames and a transcript
+     for each, and stores them in unique subfolders under output_folder.
+     Returns a list of metadata dicts for all videos.
+     """
+     Path(output_folder).mkdir(parents=True, exist_ok=True)
+     video_files = [f for f in os.listdir(video_folder)
+                    if f.lower().endswith(('.mp4', '.mov', '.avi', '.mkv'))]
+     all_metadata = []
+     for video_file in video_files:
+         video_path = os.path.join(video_folder, video_file)
+         video_name = Path(video_file).stem
+         video_output_folder = os.path.join(output_folder, video_name)
+         Path(video_output_folder).mkdir(parents=True, exist_ok=True)
+         audio_path = os.path.join(video_output_folder, "output_audio.wav")
+         # Extract frames and audio
+         video_to_images(video_path, video_output_folder)
+         video_to_audio(video_path, audio_path)
+         # Transcribe the audio, then drop the intermediate WAV file
+         text_data = audio_to_text(audio_path)
+         text_path = os.path.join(video_output_folder, "output_text.txt")
+         with open(text_path, "w") as file:
+             file.write(text_data if text_data else "")
+         os.remove(audio_path)
+         # Placeholder metadata; enhance as needed (e.g., from download_video info)
+         meta = {
+             "title": video_name,
+             "uploader": "unknown",
+             "views": "unknown",
+             "file": video_file
+         }
+         all_metadata.append({"meta": meta, "text": text_data, "folder": video_output_folder})
+     return all_metadata
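+ # Shape of the returned list (illustrative, for this repo's 1.mp4):
+ #   [{"meta": {"title": "1", "uploader": "unknown", "views": "unknown", "file": "1.mp4"},
+ #     "text": "<transcript or None>", "folder": "./mixed_data/1"}, ...]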
+
+ from llama_index.core.indices import MultiModalVectorStoreIndex
+ from llama_index.core import SimpleDirectoryReader, StorageContext, Settings
+ from llama_index.vector_stores.lancedb import LanceDBVectorStore
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+ def create_vector_db_for_all(image_txt_root_folder: str):
+     """
+     Loads every subfolder in image_txt_root_folder as documents for the vector DB
+     and returns a multimodal retriever over them.
+     """
+     text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
+     image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")
+     storage_context = StorageContext.from_defaults(
+         vector_store=text_store, image_store=image_store
+     )
+     Settings.embed_model = HuggingFaceEmbedding(
+         model_name="sentence-transformers/all-MiniLM-L6-v2"
+     )
+     # Load all per-video subfolders as documents
+     documents = []
+     for subfolder in Path(image_txt_root_folder).iterdir():
+         if subfolder.is_dir():
+             documents.extend(SimpleDirectoryReader(str(subfolder)).load_data())
+     index = MultiModalVectorStoreIndex.from_documents(
+         documents,
+         storage_context=storage_context,
+     )
+     retriever_engine = index.as_retriever(
+         similarity_top_k=2, image_similarity_top_k=3
+     )
+     return retriever_engine
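+ # similarity_top_k=2 caps the text results and image_similarity_top_k=3 the image
+ # results, so each query retrieves at most 2 transcript chunks and 3 frames.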
+
+ from llama_index.core.schema import ImageNode
+
+ def retrieve(retriever_engine, query_str):
+     retrieval_results = retriever_engine.retrieve(query_str)
+     retrieved_image = []
+     retrieved_text = []
+     for res_node in retrieval_results:
+         if isinstance(res_node.node, ImageNode):
+             retrieved_image.append(res_node.node.metadata["file_path"])
+         else:
+             retrieved_text.append(res_node.text)
+     return retrieved_image, retrieved_text
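+ # ImageNode results expose the source frame's path via metadata["file_path"]
+ # (populated by SimpleDirectoryReader); the remaining nodes contribute transcript text.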
+
+ qa_tmpl_str = (
+     "Given the provided information, including relevant images and retrieved "
+     "context from the video, accurately and precisely answer the query without "
+     "any additional prior knowledge.\n"
+     "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
+     "---------------------\n"
+     "Context: {context_str}\n"
+     "Metadata for video: {metadata_str}\n"
+     "---------------------\n"
+     "Query: {query_str}\n"
+     "Answer: "
+ )
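+ # {context_str}, {metadata_str}, and {query_str} are filled in via str.format()
+ # in gradio_chat below.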
+
+ # Model identifiers (OpenRouter) and their display labels
+ available_models = [
+     {"value": "meta-llama/llama-4-maverick:free", "label": "Llama"},
+     {"value": "qwen/qwen2.5-vl-72b-instruct:free", "label": "Qwen"},
+     {"value": "google/gemma-3-27b-it:free", "label": "Gemma"},
+     {"value": "moonshotai/kimi-vl-a3b-thinking:free", "label": "Kimi"},
+     {"value": "google/gemini-2.0-flash-exp:free", "label": "Gemini"},
+     # Add more models here if needed
+ ]
+
+ # Lookup tables between model values and labels
+ model_value_to_label = {item["value"]: item["label"] for item in available_models}
+ model_label_to_value = {item["label"]: item["value"] for item in available_models}
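+ # e.g. model_label_to_value["Llama"] -> "meta-llama/llama-4-maverick:free"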
+
+ # Gradio interface function
+ def gradio_chat(query, model_label):
+     output_video_path = "./video_data/"
+     output_folder = "./mixed_data/"
+
+     try:
+         # Process all videos (note: this re-extracts frames/transcripts and
+         # rebuilds the vector DB on every query; cache these for production use)
+         all_metadata = prepare_all_videos(output_video_path, output_folder)
+         # Combine metadata for all videos
+         metadata_str = json.dumps([item["meta"] for item in all_metadata])
+         retriever_engine = create_vector_db_for_all(output_folder)
+
+         img, txt = retrieve(retriever_engine=retriever_engine, query_str=query)
+         context_str = "".join(txt)
+         prompt = qa_tmpl_str.format(
+             context_str=context_str, query_str=query, metadata_str=metadata_str
+         )
+
+         OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]
+         headers = {
+             "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+             "Content-Type": "application/json",
+             # Optional attribution headers; fill in or remove the placeholders
+             "HTTP-Referer": "<YOUR_SITE_URL>",
+             "X-Title": "<YOUR_SITE_NAME>",
+         }
+
+         model_name = model_label_to_value.get(model_label, available_models[0]["value"])
+
+         messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+         image_paths = []
+         for img_path in img:
+             try:
+                 # Re-encode each retrieved frame as a base64 JPEG data URL
+                 image = Image.open(img_path).convert("RGB")
+                 buffered = BytesIO()
+                 image.save(buffered, format="JPEG")
+                 img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                 messages[0]["content"].append({
+                     "type": "image_url",
+                     "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
+                 })
+                 image_paths.append(img_path)
+             except Exception as e:
+                 print(f"Error loading image {img_path}: {e}")
+
+         data = {
+             "model": model_name,
+             "messages": messages,
+         }
+
+         response = requests.post(
+             url="https://openrouter.ai/api/v1/chat/completions",
+             headers=headers,
+             data=json.dumps(data)
+         )
+         response.raise_for_status()
+         result_text = response.json()["choices"][0]["message"]["content"]
+
+         return result_text, image_paths
+     except Exception as e:
+         return f"Error: {str(e)}", []
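+ # The request body follows OpenRouter's OpenAI-compatible chat schema, e.g.:
+ #   {"model": "google/gemma-3-27b-it:free",
+ #    "messages": [{"role": "user", "content": [
+ #        {"type": "text", "text": "..."},
+ #        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}]}]}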
+
+ # Gradio UI
+ gradio_ui = gr.Interface(
+     fn=gradio_chat,
+     inputs=[
+         gr.Textbox(label="", placeholder="Try: Best island in Maldives"),
+         gr.Dropdown(
+             choices=[item["label"] for item in available_models],
+             value=available_models[0]["label"],
+             label="Select Model:"
+         )
+     ],
+     outputs=[
+         gr.Textbox(label="Vega Response:"),
+         gr.Gallery(label="Relevant Images", allow_preview=True),
+     ],
+     title="",
+     description="",
+     theme=gr.themes.Default(primary_hue="sky")
+ )
+
+ if __name__ == "__main__":
+     # share=True requests a temporary public URL in addition to localhost
+     gradio_ui.launch(share=True)
video_data/.DS_Store ADDED
Binary file (6.15 kB)
video_data/1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1695c52c844d32219234109c0dfdc25e1829c4c52323ea6f5cbd449ba7acae4b
+ size 4718847
video_data/2.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b53690e38d5f6e564ce44510ef0cf3ab1ee976a5d0be4be8a3e3c9050728f7e
+ size 3656614
video_data/3.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c99b6d2b61823a876ad72b93f29941eaf75f09fd24a64ebc772ac7f05bf44e78
+ size 4640762