import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
import numpy as np
from PIL import Image
import gradio as gr
# FER2013 class labels, in the index order the model was trained with
FER2013_LABELS = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
# Path to the fine-tuned checkpoint (a saved state_dict)
MODEL_PATH = "v3-resnet18.pt"
# Build a ResNet-18 skeleton; the weights come from our checkpoint below
# (weights=None replaces the deprecated pretrained=False argument)
resnet_model = models.resnet18(weights=None)
# Get the number of input features for the final fully connected layer
num_ftrs = resnet_model.fc.in_features
num_classes = 7  # number of classes in FER2013
# Replace the final fully connected layer with a small MLP head
resnet_model.fc = nn.Sequential(
    # First hidden layer (input: num_ftrs, output: 512 nodes)
    nn.Linear(num_ftrs, 512),
    nn.ReLU(),
    nn.Dropout(0.5),  # fairly aggressive dropout to curb overfitting
    # Second hidden layer (input: 512, output: 256 nodes)
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Dropout(0.5),  # apply dropout again
    # Output layer (input: 256, output: num_classes logits)
    nn.Linear(256, num_classes)
)
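# Quick sanity checks on the replaced head (a sketch -- uncomment to inspect;
# for ResNet-18 the backbone's final feature width is 512):
# print(resnet_model.fc)                 # Sequential(Linear -> ReLU -> Dropout -> ... -> Linear)
# print(resnet_model.fc[0].in_features)  # 512 == num_ftrs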
# Load the fine-tuned weights (mapped to CPU first; moved to the target device below)
resnet_model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))
# Set the model to evaluation mode (disables dropout; BatchNorm uses running stats)
resnet_model.eval()
# Determine the device to use (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the device
resnet_model.to(device)
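# Optional smoke test (a sketch; assumes the checkpoint loaded cleanly) --
# uncomment to confirm a forward pass produces 7 logits on the chosen device:
# with torch.no_grad():
#     logits = resnet_model(torch.randn(1, 3, 224, 224, device=device))
#     print(logits.shape)  # expected: torch.Size([1, 7])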
# Define the image transformations
# ImageNet mean and std deviation (3 channels) -- required because the backbone
# was trained with this normalization
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
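# Normalization runs after ToTensor, so each channel is first scaled to [0, 1]
# and then shifted and scaled as (x - mean) / std. For example, the red channel
# of a pixel with raw value 128 becomes (128/255 - 0.485) / 0.229 ≈ 0.074.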
TARGET_SIZE = 224 # The spatial size expected by the pretrained ResNet-18 after transforms
# Transforms for the training set (includes augmentation). These are unused by
# the demo below and are kept to document the training-time pipeline.
train_transforms_resnet = T.Compose([
    T.Grayscale(num_output_channels=3),  # replicate 1-channel grayscale across 3 channels
    T.Resize(256),                       # resize the shorter side to 256
    T.RandomCrop(TARGET_SIZE),           # random 224x224 crop for augmentation
    T.RandomHorizontalFlip(),            # standard augmentation
    T.RandomRotation(degrees=10),        # slight rotation
    T.ColorJitter(brightness=0.2, contrast=0.2),  # brightness/contrast jitter
    T.ToTensor(),                        # PIL Image -> tensor scaled to [0, 1]
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)  # normalize with ImageNet stats
])
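# A sketch of how train_transforms_resnet would plug into a training pipeline
# (the "data/train" ImageFolder layout is a hypothetical example):
# from torchvision.datasets import ImageFolder
# from torch.utils.data import DataLoader
# train_ds = ImageFolder("data/train", transform=train_transforms_resnet)
# train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)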
# Transforms for the validation/testing set (deterministic preprocessing)
val_transforms_resnet = T.Compose([
    T.Grayscale(num_output_channels=3),  # replicate 1-channel grayscale across 3 channels
    T.Resize(256),                       # resize the shorter side to 256
    T.CenterCrop(TARGET_SIZE),           # central 224x224 crop for evaluation
    T.ToTensor(),                        # PIL Image -> tensor scaled to [0, 1]
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)  # normalize with ImageNet stats
])
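# The validation pipeline maps any PIL image to a normalized 3x224x224 tensor.
# A minimal check (a sketch; FER2013 source images are 48x48 grayscale):
# probe = Image.new("L", (48, 48))
# print(val_transforms_resnet(probe).shape)  # expected: torch.Size([3, 224, 224])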
def classify_expression(image: np.ndarray):
    """Predict FER2013 expression probabilities for a single face image."""
    # Gradio passes the image as a NumPy array (H x W x C, uint8, RGB); the
    # model, device, transforms, and labels are read from module scope.
    # Handle the None input Gradio sends when the user clears the image
    if image is None:
        return {}
    # Convert the NumPy array to a PIL Image
    try:
        pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
    except (TypeError, ValueError) as e:  # e.g. unexpected shape or dtype
        print(f"Error converting image to PIL: {e}")
        return {"Error": 0.0}
    # Apply the deterministic validation transforms
    input_tensor = val_transforms_resnet(pil_image)
    # Add a batch dimension (B x C x H x W) and move to the device
    input_tensor = input_tensor.unsqueeze(0).to(device)
    # Perform inference with gradient calculation disabled
    with torch.no_grad():
        outputs = resnet_model(input_tensor)
    # The outputs are raw logits; softmax turns them into probabilities.
    # Index [0] selects the only image in the batch.
    probabilities = torch.softmax(outputs, dim=1)[0]
    # Rank all classes; .item() moves each scalar back to the CPU
    top_probs, top_ids = torch.topk(probabilities, len(FER2013_LABELS))
    # Map labels to confidences in the format expected by gr.Label()
    confidences = {FER2013_LABELS[top_ids[i].item()]: top_probs[i].item()
                   for i in range(top_probs.size(0))}
    return confidences
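# Example call (a sketch -- Gradio normally supplies the array; a synthetic
# uint8 RGB frame stands in for a real photo here):
# dummy_frame = np.zeros((48, 48, 3), dtype=np.uint8)
# print(classify_expression(dummy_frame))  # dict mapping all 7 labels to probabilities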
# Define input and output components
input_image = gr.Image(type="numpy", label="Upload a face image")  # type="numpy" matches classify_expression's input
output_label = gr.Label(label="Predicted Expression", num_top_classes=len(FER2013_LABELS))  # show scores for all 7 classes
# Create the Gradio Interface
interface = gr.Interface(
    fn=classify_expression,  # the prediction function
    inputs=input_image,      # input component
    outputs=output_label,    # output component
    title="Ayoub Facial Expression Recognition",
    description="Upload an image of a face to classify its expression (Angry, Disgust, Fear, Happy, Sad, Surprise, Neutral) using a ResNet-18 model.",
    examples=[
        "examples/angry face.jpeg",
        "examples/happy face.jpeg",
        "examples/sad face.jpeg",
        "examples/surprised face.jpeg",
        "examples/happy face1.jpeg"
    ]
)

if __name__ == "__main__":
    interface.launch()