Update app.py
app.py CHANGED
@@ -2,21 +2,17 @@ import torch
 import gradio as gr
 import numpy as np
 import cv2
-from transformers import (
-    VideoMAEImageProcessor,
-    VideoMAEForPreTraining,
-    VideoMAEForVideoClassification,
-)
+from transformers import VideoMAEImageProcessor, VideoMAEForPreTraining, VideoMAEForVideoClassification
 
-#
-
-model_name_classify = "MCG-NTU/videomae-base"
+# Use the publicly available MCG-NJU model for both pretraining and classification
+model_name = "MCG-NJU/videomae-base"
 
-processor
-
-
+# Load processor and models
+processor = VideoMAEImageProcessor.from_pretrained(model_name)
+model_pretrain = VideoMAEForPreTraining.from_pretrained(model_name)
+model_classify = VideoMAEForVideoClassification.from_pretrained(model_name)
 
-#
+# Example labels for classification (replace with full NTU action list if needed)
 labels = [
     "drink water", "eat meal/snack", "brush teeth", "clapping", "writing",
     "reading", "wear jacket", "take off jacket", "put on a shoe", "take off a shoe"
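A note on this hunk: `MCG-NJU/videomae-base` is a self-supervised pretraining checkpoint, so the head that `VideoMAEForVideoClassification.from_pretrained` attaches to it is freshly initialized and the hand-written NTU-style `labels` list does not correspond to trained classes. A minimal sketch of an alternative, assuming the publicly available Kinetics-400 fine-tuned checkpoint would be acceptable for the demo (the checkpoint choice and variable names are this note's assumption, not part of the commit):

from transformers import VideoMAEForVideoClassification

# Sketch: take the label set from a checkpoint that ships a trained classification head.
# "MCG-NJU/videomae-base-finetuned-kinetics" is a public fine-tuned checkpoint; using it
# here is an assumption of this note, not something the commit does.
clf = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
# id2label is populated for fine-tuned checkpoints, so no manual label list is needed.
labels = [clf.config.id2label[i] for i in range(clf.config.num_labels)]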
@@ -37,20 +33,20 @@ def preprocess_video(video_path):
     return frames[:16]
 
 def predict_video(video):
+    if video is None:
+        return "", {}
     frames = preprocess_video(video.name)
     pixel_values = processor(frames, return_tensors="pt").pixel_values
 
-    #
+    # Masked positions for pretraining
     num_patches_per_frame = (model_pretrain.config.image_size // model_pretrain.config.patch_size) ** 2
     seq_length = (16 // model_pretrain.config.tubelet_size) * num_patches_per_frame
     bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()
 
     with torch.no_grad():
         outputs = model_pretrain(pixel_values, bool_masked_pos=bool_masked_pos)
-
     loss = outputs.loss.item()
 
-    # For classification: get logits and predict top 3 classes
     with torch.no_grad():
         outputs_class = model_classify(pixel_values)
     logits = outputs_class.logits
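For context, the body of `preprocess_video` sits outside the changed hunks; only its closing `return frames[:16]` is visible here. A rough, illustrative sketch of a cv2-based reader consistent with that return value and with the `processor(frames, ...)` call above (the function name is hypothetical and this is not the file's actual implementation):

import cv2

def preprocess_video_sketch(video_path, num_frames=16, size=224):
    # Read frames with OpenCV, convert BGR -> RGB, and resize to the model input size.
    cap = cv2.VideoCapture(video_path)
    frames = []
    while len(frames) < num_frames:
        ok, frame = cap.read()
        if not ok:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(cv2.resize(frame, (size, size)))
    cap.release()
    # Repeat the last frame if the clip is shorter than num_frames.
    while frames and len(frames) < num_frames:
        frames.append(frames[-1])
    return frames[:num_frames]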
@@ -58,64 +54,41 @@ def predict_video(video):
     top5_prob, top5_catid = torch.topk(probs, 3)
     top_actions = {labels[catid]: float(prob) for prob, catid in zip(top5_prob, top5_catid)}
 
-    return {
-        "Reconstruction Loss": f"{loss:.4f}",
-        "Top 3 Action Predictions": top_actions
-    }
+    return f"Reconstruction Loss: {loss:.4f}", top_actions
 
 def preprocess_image(image):
-    #
-    image = np.array(image.convert("RGB").resize((224,224)))
-    # Add batch and channel dimension
+    # Resize and convert to RGB numpy array
+    image = np.array(image.convert("RGB").resize((224, 224)))
     pixel_values = processor(image, return_tensors="pt").pixel_values
     return pixel_values
 
 def predict_image(image):
+    if image is None:
+        return "No image provided."
     pixel_values = preprocess_image(image)
-
-    # For pretraining (masked autoencoding), mask patches randomly
     num_patches = (model_pretrain.config.image_size // model_pretrain.config.patch_size) ** 2
     bool_masked_pos = torch.randint(0, 2, (1, num_patches)).bool()
 
     with torch.no_grad():
         outputs = model_pretrain(pixel_values, bool_masked_pos=bool_masked_pos)
     loss = outputs.loss.item()
-
     return f"Image Reconstruction Loss: {loss:.4f}"
 
-# Gradio interface with Tabs for Image and Video
 with gr.Blocks() as demo:
     gr.Markdown("# VideoMAE Demo: Image and Video Input")
 
     with gr.Tab("Video Input"):
         video_input = gr.Video(label="Upload Video (short clip)")
-
-
+        video_loss = gr.Textbox(label="Reconstruction Loss")
+        video_preds = gr.Label(num_top_classes=3, label="Top 3 Action Predictions")
         video_btn = gr.Button("Predict Video")
-
-        def video_predict_fn(video):
-            if video is None:
-                return "", {}
-            results = predict_video(video)
-            return results["Reconstruction Loss"], results["Top 3 Action Predictions"]
-
-        video_btn.click(
-            fn=video_predict_fn,
-            inputs=video_input,
-            outputs=[video_output_loss, video_output_preds],
-        )
+        video_btn.click(predict_video, inputs=video_input, outputs=[video_loss, video_preds])
 
     with gr.Tab("Image Input"):
         image_input = gr.Image(label="Upload Image")
-
+        image_loss = gr.Textbox(label="Reconstruction Loss")
         image_btn = gr.Button("Predict Image")
-
-        image_btn.click(
-            fn=predict_image,
-            inputs=image_input,
-            outputs=image_output,
-        )
+        image_btn.click(predict_image, inputs=image_input, outputs=image_loss)
 
 if __name__ == "__main__":
     demo.launch()
-
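One behavioural caveat in `predict_video`: `video.name` assumes Gradio passes a tempfile-like object, but depending on the Gradio version installed in the Space, `gr.Video` may hand the callback a plain filepath string instead, in which case `.name` raises an AttributeError. A small defensive sketch (the helper name is hypothetical):

def _resolve_video_path(video):
    # Accept either a plain filepath string (newer Gradio) or a tempfile-like object with .name.
    return video if isinstance(video, str) else video.name

`predict_video` could then call `preprocess_video(_resolve_video_path(video))` and behave the same under either API.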
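Finally, on the masking used in both predict functions: `torch.randint(0, 2, ...)` masks roughly half of the positions, whereas VideoMAE pretraining used a much higher masking ratio (around 0.9), so the reported reconstruction loss is sensitive to this choice. A sketch of a fixed-ratio mask, with the 0.9 default taken from the VideoMAE paper rather than from this commit (the helper name is hypothetical):

import torch

def make_bool_masked_pos(seq_length, mask_ratio=0.9):
    # Mark mask_ratio of the positions as masked (True), the rest as visible.
    num_masked = int(seq_length * mask_ratio)
    mask = torch.zeros(seq_length, dtype=torch.bool)
    mask[torch.randperm(seq_length)[:num_masked]] = True
    return mask.unsqueeze(0)  # shape (1, seq_length), as the model expects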