import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
# Ensure the necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
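# On NLTK >= 3.9, word_tokenize also needs the 'punkt_tab' resource; if
# tokenization raises a LookupError, uncomment the following line:
# nltk.download('punkt_tab')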
# Load stopwords and initialize the lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Clean and preprocess URL text
def preprocess_url(url):
    url = url.lower()                         # Convert to lowercase
    url = re.sub(r'https?://', '', url)       # Strip the http:// or https:// scheme
    url = re.sub(r'www\.', '', url)           # Strip www.
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)   # Replace special characters with spaces
    url = re.sub(r'\s+', ' ', url).strip()    # Collapse repeated whitespace
    tokens = word_tokenize(url)                                  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)
# Clean and preprocess HTML text
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)        # Remove HTML tags
    html = html.lower()                         # Convert to lowercase
    html = re.sub(r'https?://', '', html)       # Strip embedded URL schemes
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)   # Replace special characters with spaces
    html = re.sub(r'\s+', ' ', html).strip()    # Collapse repeated whitespace
    tokens = word_tokenize(html)                                 # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)
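# Illustrative examples of what the two preprocessors produce, traced by hand
# from the steps above (these are preprocessing results, not model output):
#
#   preprocess_url("https://www.example.com/login?user=1")
#   -> "example com login user 1"
#
#   preprocess_html("<p>Verify your account now!</p>")
#   -> "verify account"   # "your" and "now" are NLTK English stopwords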
# Load the trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Sequence lengths and vocabulary size; these must match the values used when
# the model and tokenizers were trained (max_words is not used below, but
# documents the tokenizer vocabulary size)
max_url_length = 180
max_html_length = 2000
max_words = 10000
# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)
with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)
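# The pickled artifacts above are assumed to have come from the training run,
# roughly like the sketch below (illustrative only: `train_urls` and
# `train_labels` are hypothetical names, not defined in this app). This also
# explains the otherwise-unused Tokenizer and LabelEncoder imports at the top.
#
#   url_tokenizer = Tokenizer(num_words=max_words)
#   url_tokenizer.fit_on_texts(train_urls)
#   with open('url_tokenizer.pkl', 'wb') as f:
#       pickle.dump(url_tokenizer, f)
#
#   label_encoder = LabelEncoder()
#   label_encoder.fit(train_labels)   # e.g. ['legitimate', 'phishing']
#   with open('label_encoder.pkl', 'wb') as f:
#       pickle.dump(label_encoder, f)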
# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)

    # Convert the cleaned text to padded integer sequences
    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length, padding='post', truncating='post')
    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length, padding='post', truncating='post')

    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    # ravel() flattens the (1, 1) prediction so the label encoder receives
    # the 1-D array it expects
    new_predictions = (new_predictions_prob > 0.6).astype(int).ravel()  # Adjust threshold if needed
    predicted_category = label_encoder.inverse_transform(new_predictions)[0]
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"
    return predicted_category.capitalize(), predicted_probability
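# Example call (requires the model and pickle files to be present; the exact
# label returned depends on the classes the label encoder was fitted with):
#
#   category, probability = predict_phishing(
#       "http://secure-login.example-bank.com/verify",
#       "<html><body><form>Enter your password</form></body></html>",
#   )
#   # -> e.g. ("Phishing", "0.9123")   # hypothetical output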
# Create the Gradio interface
interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Textbox(label="HTML Snippet", lines=10, placeholder="Paste HTML content here")
    ],
    outputs=[
        gr.Textbox(label="Predicted Category"),
        gr.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict whether the page is phishing or legitimate. Providing both inputs gives more accurate results.",
    live=True,
    css="""
    .interface-container {
        border: 2px solid #4CAF50;
        border-radius: 10px;
        padding: 20px;
        text-align: center;
    }
    .gr-textbox, .gr-textbox textarea, .gr-button {
        margin-left: auto !important;
        margin-right: auto !important;
    }
    """
)
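# Note: live=True re-runs predict_phishing on every keystroke, which is costly
# for a Keras model; dropping it would make inference run only on submit.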
# Footer text
footer = gr.Markdown("""
---
<div style="text-align: center;">
    Made with ❤️ by Ramadhirra<br>
    Model by Ramadhirra<br>
    WebUI by Ramadhirra
</div>
""")

# Combine the interface and footer
app = gr.Blocks()
with app:
    interface.render()
    footer.render()

# Launch the Gradio app
app.launch()