import json

import gradio as gr
import torch
import torch.nn as nn

# --- Step 1: Load the vocabularies ---
# These files are in your Hugging Face Space repository, so we can load them directly.
with open('char_to_int.json', 'r') as f:
    char_to_int = json.load(f)
with open('int_to_lang.json', 'r') as f:
    int_to_lang = json.load(f)
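# For reference, char_to_int is assumed to map single characters (plus the special
# <PAD> and <UNK> tokens) to integer ids, e.g. {"<PAD>": 0, "<UNK>": 1, "a": 2, ...},
# and int_to_lang maps stringified class indices to language names,
# e.g. {"0": "Python", "1": "JavaScript", ...}. Because JSON keys are always strings,
# int_to_lang is indexed with str(...) in the prediction function below.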

# --- Step 2: Re-define the Model Architecture ---
# This MUST be the exact same architecture as the one you trained.
# All the hyperparameters (embedding_dim, hidden_dim, etc.) must match.
class CodeClassifierRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
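        # Note: nn.LSTM only applies dropout between stacked layers, so it is
        # forced to 0 below when n_layers == 1 to avoid a PyTorch warning.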
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # * 2 for bidirectional

    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
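        # hidden has shape (n_layers * num_directions, batch, hidden_dim);
        # hidden[-2] is the top layer's final forward-direction state and hidden[-1]
        # its final backward-direction state, so concatenating them gives one
        # (batch, hidden_dim * 2) summary of the whole sequence.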
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)
        output = self.fc(hidden)
        return output

# --- Step 3: Instantiate the model and load the trained weights ---
# Set hyperparameters to match your training script.
PAD_IDX = char_to_int['<PAD>']
VOCAB_SIZE = len(char_to_int)
EMBEDDING_DIM = 128
HIDDEN_DIM = 192  # Must match the final trained model
OUTPUT_DIM = len(int_to_lang)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Create an instance of the model
model = CodeClassifierRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                          N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

# Load the saved state dictionary.
# We use map_location='cpu' because the Space runs on a CPU.
model.load_state_dict(torch.load('polyglot_classifier.pt', map_location='cpu'))
model.eval()  # Set the model to evaluation mode
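
# Optional smoke test (illustrative sketch): run a short dummy batch through the
# restored model to confirm the weights and the architecture above line up.
# dummy = torch.randint(0, VOCAB_SIZE, (1, 32))
# assert model(dummy).shape == (1, OUTPUT_DIM)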

# --- Step 4: Create the prediction function ---
def classify_code(code_snippet):
    if not code_snippet:
        return {}

    # 1. Convert the snippet to a tensor of character indices
    indexed = [char_to_int.get(c, char_to_int['<UNK>']) for c in code_snippet]
    tensor = torch.LongTensor(indexed).unsqueeze(0)  # Add batch dimension

    # 2. Make a prediction
    with torch.no_grad():
        prediction = model(tensor)

    # 3. Get probabilities using softmax
    probabilities = torch.softmax(prediction, dim=1)

    # 4. Get the top 5 predictions
    top5_probs, top5_indices = torch.topk(probabilities, 5)

    # 5. Format for Gradio output
    confidences = {int_to_lang[str(idx.item())]: prob.item()
                   for idx, prob in zip(top5_indices[0], top5_probs[0])}
    return confidences
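
# A quick way to try the function outside the UI (illustrative only; the scores
# depend entirely on the trained weights shipped with the Space):
# print(classify_code("def add(a, b):\n    return a + b"))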

# --- Step 5: Create and launch the Gradio Interface ---
iface = gr.Interface(
    fn=classify_code,
    inputs=gr.Code(language=None, label="Code Snippet"),
    outputs=gr.Label(num_top_classes=5, label="Predicted Language"),
    title="Polyglot Code Classifier",
    description="Enter a code snippet to see which programming language the AI thinks it is. This model was trained from scratch on a custom dataset.",
    examples=[
        ["def hello_world():\n print('Hello from Python!')"],
        ["function greet() {\n console.log('Hello from JavaScript!');\n}"],
        ["public class Main {\n public static void main(String[] args) {\n System.out.println(\"Hello, Java!\");\n }\n}"]
    ]
)

iface.launch()
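
# On a Hugging Face Space, launch() with no arguments is all that is needed;
# for a purely local test, iface.launch(share=True) would also expose a
# temporary public link (an optional Gradio feature, not required here).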