import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
import numpy as np
from PIL import Image

import gradio as gr

if __name__ == "__main__":
    # Define the labels
    FER2013_LABELS = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

    # Define the model path
    MODEL_PATH = "v3-resnet18.pt"

    # Build a ResNet-18 backbone without ImageNet weights (weights=None is the
    # modern form of the deprecated pretrained=False); the fine-tuned weights are loaded below.
    resnet_model = models.resnet18(weights=None)

    # Get the number of input features for the final fully connected layer
    num_ftrs = resnet_model.fc.in_features

    num_classes = len(FER2013_LABELS) # number of expression classes in FER2013

    # Replace the final fully connected layer
    resnet_model.fc = nn.Sequential(
        # First hidden layer (Input: num_ftrs, Output: 512 nodes)
        nn.Linear(num_ftrs, 512),
        nn.ReLU(),
        nn.Dropout(0.5), # Dropout for regularization

        # Second hidden layer (Input: 512, Output: 256 nodes)
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5), # Apply dropout again

        # Output layer (Input: 256, Output: num_classes logits)
        nn.Linear(256, num_classes)
    )

    # Load the pre-trained weights
    resnet_model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))

    # Set the model to evaluation mode
    resnet_model.eval()

    # determine the device to use
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the device
    resnet_model.to(device)

    # Define the image transformation

    # ImageNet mean and std deviation (for 3 channels) - necessary because the pretrained model was trained with these
    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]
    TARGET_SIZE = 224 # The spatial size expected by the pretrained ResNet-18 after transforms

    # Transforms for the training set (includes augmentation)
    train_transforms_resnet = T.Compose([
        T.Grayscale(num_output_channels=3), # Convert 1-channel grayscale to 3 channels by repeating
        T.Resize(256),                      # Resize the image (typically the shorter side) to 256
        T.RandomCrop(TARGET_SIZE),          # Take a random 224x224 crop for data augmentation
        T.RandomHorizontalFlip(),           # Standard data augmentation
        T.RandomRotation(degrees=10),       # add slight rotation
        T.ColorJitter(brightness=0.2, contrast=0.2), # add brightness/contrast jitter
        T.ToTensor(),                       # Convert PIL Image to PyTorch Tensor (scales to [0, 1])
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) # Normalize with ImageNet values
    ])
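
    # Note: the training transforms above are not used by this inference app;
    # they are kept only to document how the checkpoint was produced.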

    # Transforms for the validation/testing set (deterministic preprocessing)
    val_transforms_resnet = T.Compose([
        T.Grayscale(num_output_channels=3), # Convert 1-channel grayscale to 3 channels
        T.Resize(256),                      # Resize the image (typically the shorter side) to 256
        T.CenterCrop(TARGET_SIZE),          # Take the central 224x224 crop for evaluation
        T.ToTensor(),                       # Convert PIL Image to PyTorch Tensor (scales to [0, 1])
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) # Normalize with ImageNet values
    ])

    def classify_expression(image: np.ndarray):
        # The model, device, transforms, and labels defined above are only read,
        # so they are picked up from the enclosing module scope.

        # Gradio passes the image as a NumPy array (H x W x C, uint8, RGB)
        # Convert NumPy array to PIL Image
        # Handle potential None input if user clears the image
        if image is None:
            return {} # Return an empty result for gr.Label

        try:
            pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
        except (TypeError, ValueError) as e:
            print(f"Error converting image to PIL: {e}")
            return {"Error": 0.0} # Indicate an error

        # Apply transformations
        input_tensor = val_transforms_resnet(pil_image)

        # Add batch dimension (B x C x H x W) and move to device
        input_tensor = input_tensor.unsqueeze(0).to(device)

        # Perform inference
        with torch.no_grad(): # Disable gradient calculation for inference
            outputs = resnet_model(input_tensor)

        # Post-process output
        # outputs are raw logits
        probabilities = torch.softmax(outputs, dim=1)[0] # Get probabilities for the first (and only) image in the batch

        # Get top probabilities and corresponding labels
        # .item() below copies each scalar back to the CPU, so no explicit .cpu() call is needed
        top_probs, top_ids = torch.topk(probabilities, len(FER2013_LABELS)) # Get scores for all classes

        # Create a dictionary mapping labels to confidences for Gradio's gr.Label()
        confidences = {FER2013_LABELS[top_ids[i].item()]: top_probs[i].item() for i in range(top_probs.size(0))}

        return confidences
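
    # Optional quick sanity check (a sketch, left commented out so the app's
    # behaviour is unchanged): run classify_expression on a random RGB array to
    # confirm the model and transforms are wired together before launching the
    # UI. The array shape is arbitrary; the transforms resize and crop it anyway.
    # _dummy = (np.random.rand(300, 300, 3) * 255).astype(np.uint8)
    # print(classify_expression(_dummy))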



    # Define input and output components
    input_image = gr.Image(type="numpy", label="Upload a face image") # type="numpy" delivers the image as an RGB uint8 array
    output_label = gr.Label(label="Predicted Expression", num_top_classes=len(FER2013_LABELS)) # Show scores for all 7 classes

    # Create the Gradio Interface
    interface = gr.Interface(
        fn=classify_expression, # Your prediction function
        inputs=input_image, # Your input component
        outputs=output_label, # Your output component
        title="Ayoub Facial Expression Recognition", # Title for your demo
        description="Upload an image of a face to classify its expression (Angry, Disgust, Fear, Happy, Sad, Surprise, Neutral) using a ResNet-18 model.", # Description
        examples=[ "examples/angry face.jpeg",
                  "examples/happy face.jpeg",
                  "examples/sad face.jpeg",
                  "examples/surprised face.jpeg",
                  "examples/happy face1.jpeg"
        ]
    )

    interface.launch()
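    # During development, interface.launch(share=True) would additionally create a
    # temporary public URL, and server_name="0.0.0.0" makes the app reachable from
    # other machines on the local network.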