import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import re
# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Compile the model with standard loss and metrics
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
# Preprocessing functions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_url(url):
    # Lowercase, strip the scheme and "www.", replace non-alphanumerics with spaces,
    # then tokenize, drop English stopwords, and lemmatize.
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
def preprocess_html(html):
    # Strip HTML tags, lowercase, remove URL schemes, replace non-alphanumerics with
    # spaces, then tokenize, drop English stopwords, and lemmatize.
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
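# Example (illustrative): preprocess_url("https://www.example-login.com/verify?id=1")
# returns "example login com verify id 1"; the scheme and punctuation are stripped,
# and none of these tokens are English stopwords, so they pass through unchanged.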
# Define maximum lengths
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load datasets
url_df = pd.read_csv('url_data.csv')
html_df = pd.read_csv('html_data.csv')

# Clean the URL 'Data' column
url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)

# Clean the HTML 'Data' column
html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
# URL tokenization and padding (character-level)
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')

# HTML tokenization and padding (word-level)
html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
# Encode the 'Category' column
label_encoder = LabelEncoder()
url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])

# Split datasets into training and testing sets
url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(
    url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(
    html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
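# Note (assumption): the tokenizers and label encoder are refit on url_data.csv and
# html_data.csv at startup; this only reproduces the vocabulary and label mapping the
# saved model was trained with if the same CSVs are used here as during training.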
def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences
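# Illustrative shape check: preprocess_input(cleaned_url, url_tokenizer, max_url_length)
# yields an array of shape (1, 180), zero-padded (or truncated) after the
# character-level token ids; cleaned_url here stands for any preprocessed URL string.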
def get_prediction(input_text, input_type):
    is_url = input_type == "URL"
    if is_url:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
    prediction = model.predict(input_data)[0][0]
    return prediction
def ensemble_prediction(input_text, input_type, n_ensemble=5):
    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
    avg_prediction = np.mean(predictions)
    return avg_prediction
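# Note: model.predict runs with training=False, so repeated calls on the same input
# normally return identical values and the average matches a single prediction; the
# repetition only adds variance if stochastic layers are active at inference time.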
def phishing_detection(input_text, input_type):
    prediction = ensemble_prediction(input_text, input_type)
    threshold = 0.5  # decision threshold for the phishing class
    if prediction > threshold:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
        gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
    ],
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or an HTML snippet is a phishing site.",
    theme="default"
)

iface.launch()
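# Note: launch() serves the app locally and is picked up automatically on Hugging Face
# Spaces; passing share=True would create a temporary public link when running elsewhere.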