import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
import numpy as np
from PIL import Image

import gradio as gr

if __name__ == "__main__":
    # Define the labels
    FER2013_LABELS = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

    # Define the model path
    MODEL_PATH = "v3-resnet18.pt"

    # Build a ResNet-18 backbone without ImageNet weights (weights=None is the
    # modern form of the deprecated pretrained=False); the fine-tuned weights are loaded below.
    resnet_model = models.resnet18(weights=None)

    # Get the number of input features for the final fully connected layer
    num_ftrs = resnet_model.fc.in_features

    num_classes = len(FER2013_LABELS) # number of expression classes in FER2013

    # Replace the final fully connected layer
    resnet_model.fc = nn.Sequential(
        # First hidden layer (Input: num_ftrs, Output: 512 nodes)
        nn.Linear(num_ftrs, 512),
        nn.ReLU(),
        nn.Dropout(0.5), # Dropout for regularization

        # Second hidden layer (Input: 512, Output: 256 nodes)
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5), # Apply dropout again

        # Output layer (Input: 256, Output: num_classes logits)
        nn.Linear(256, num_classes)
    )

    # Load the pre-trained weights
    resnet_model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))

    # Set the model to evaluation mode
    resnet_model.eval()

    # determine the device to use
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the device
    resnet_model.to(device)

    # Define the image transformation

    # ImageNet mean and std deviation (for 3 channels) - necessary because the pretrained model was trained with these
    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]
    TARGET_SIZE = 224 # The spatial size expected by the pretrained ResNet-18 after transforms

    # Transforms for the training set (includes augmentation)
    train_transforms_resnet = T.Compose([
        T.Grayscale(num_output_channels=3), # Convert 1-channel grayscale to 3 channels by repeating
        T.Resize(256),                      # Resize the image (typically the shorter side) to 256
        T.RandomCrop(TARGET_SIZE),          # Take a random 224x224 crop for data augmentation
        T.RandomHorizontalFlip(),           # Standard data augmentation
        T.RandomRotation(degrees=10),       # add slight rotation
        T.ColorJitter(brightness=0.2, contrast=0.2), # add brightness/contrast jitter
        T.ToTensor(),                       # Convert PIL Image to PyTorch Tensor (scales to [0, 1])
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) # Normalize with ImageNet values
    ])
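
    # Note: the training transforms above are not used by this inference app;
    # they are kept only to document how the checkpoint was produced.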

    # Transforms for the validation/testing set (deterministic preprocessing)
    val_transforms_resnet = T.Compose([
        T.Grayscale(num_output_channels=3), # Convert 1-channel grayscale to 3 channels
        T.Resize(256),                      # Resize the image (typically the shorter side) to 256
        T.CenterCrop(TARGET_SIZE),          # Take the central 224x224 crop for evaluation
        T.ToTensor(),                       # Convert PIL Image to PyTorch Tensor (scales to [0, 1])
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) # Normalize with ImageNet values
    ])

    def classify_expression(image: np.ndarray):
        # The model, device, transforms, and labels defined above are only read,
        # so they are picked up from the enclosing module scope.

        # Gradio passes the image as a NumPy array (H x W x C, uint8, RGB)
        # Convert NumPy array to PIL Image
        # Handle potential None input if user clears the image
        if image is None:
            return {} # Return an empty result for gr.Label

        try:
            pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
        except (TypeError, ValueError) as e:
            print(f"Error converting image to PIL: {e}")
            return {"Error": 0.0} # Indicate an error

        # Apply transformations
        input_tensor = val_transforms_resnet(pil_image)

        # Add batch dimension (B x C x H x W) and move to device
        input_tensor = input_tensor.unsqueeze(0).to(device)

        # Perform inference
        with torch.no_grad(): # Disable gradient calculation for inference
            outputs = resnet_model(input_tensor)

        # Post-process output
        # outputs are raw logits
        probabilities = torch.softmax(outputs, dim=1)[0] # Get probabilities for the first (and only) image in the batch

        # Get top probabilities and corresponding labels
        # .item() below copies each scalar back to the CPU, so no explicit .cpu() call is needed
        top_probs, top_ids = torch.topk(probabilities, len(FER2013_LABELS)) # Get scores for all classes

        # Create a dictionary mapping labels to confidences for Gradio's gr.Label()
        confidences = {FER2013_LABELS[top_ids[i].item()]: top_probs[i].item() for i in range(top_probs.size(0))}

        return confidences
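
    # Optional quick sanity check (a sketch, left commented out so the app's
    # behaviour is unchanged): run classify_expression on a random RGB array to
    # confirm the model and transforms are wired together before launching the
    # UI. The array shape is arbitrary; the transforms resize and crop it anyway.
    # _dummy = (np.random.rand(300, 300, 3) * 255).astype(np.uint8)
    # print(classify_expression(_dummy))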



    # Define input and output components
    input_image = gr.Image(type="numpy", label="Upload a face image") # type="numpy" delivers the image as an RGB uint8 array
    output_label = gr.Label(label="Predicted Expression", num_top_classes=len(FER2013_LABELS)) # Show scores for all 7 classes

    # Create the Gradio Interface
    interface = gr.Interface(
        fn=classify_expression, # Your prediction function
        inputs=input_image, # Your input component
        outputs=output_label, # Your output component
        title="Ayoub Facial Expression Recognition", # Title for your demo
        description="Upload an image of a face to classify its expression (Angry, Disgust, Fear, Happy, Sad, Surprise, Neutral) using a ResNet-18 model.", # Description
        examples=[ "examples/angry face.jpeg",
                  "examples/happy face.jpeg",
                  "examples/sad face.jpeg",
                  "examples/surprised face.jpeg",
                  "examples/happy face1.jpeg"
        ]
    )

    interface.launch()
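    # During development, interface.launch(share=True) would additionally create a
    # temporary public URL, and server_name="0.0.0.0" makes the app reachable from
    # other machines on the local network.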