import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import re
# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Compile the model with standard loss and metrics
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
# Preprocessing functions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_url(url):
    # Lowercase, strip the scheme and "www.", replace non-alphanumerics with spaces,
    # then tokenize, drop English stopwords, and lemmatize.
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
def preprocess_html(html):
    # Strip HTML tags, lowercase, remove URL schemes, replace non-alphanumerics with
    # spaces, then tokenize, drop English stopwords, and lemmatize.
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
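# Example (illustrative): preprocess_url("https://www.example-login.com/verify?id=1")
# returns "example login com verify id 1"; the scheme and punctuation are stripped,
# and none of these tokens are English stopwords, so they pass through unchanged.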
# Define maximum lengths
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load datasets
url_df = pd.read_csv('url_data.csv')
html_df = pd.read_csv('html_data.csv')

# Clean the URL 'Data' column
url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)

# Clean the HTML 'Data' column
html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
# URL tokenization and padding (character-level)
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')

# HTML tokenization and padding (word-level)
html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
# Encode the 'Category' column
label_encoder = LabelEncoder()
url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])

# Split datasets into training and testing sets
url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(
    url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(
    html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
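# Note (assumption): the tokenizers and label encoder are refit on url_data.csv and
# html_data.csv at startup; this only reproduces the vocabulary and label mapping the
# saved model was trained with if the same CSVs are used here as during training.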
def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences
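# Illustrative shape check: preprocess_input(cleaned_url, url_tokenizer, max_url_length)
# yields an array of shape (1, 180), zero-padded (or truncated) after the
# character-level token ids; cleaned_url here stands for any preprocessed URL string.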
def get_prediction(input_text, input_type):
    is_url = input_type == "URL"
    if is_url:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
    prediction = model.predict(input_data)[0][0]
    return prediction
def ensemble_prediction(input_text, input_type, n_ensemble=5):
    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
    avg_prediction = np.mean(predictions)
    return avg_prediction
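# Note: model.predict runs with training=False, so repeated calls on the same input
# normally return identical values and the average matches a single prediction; the
# repetition only adds variance if stochastic layers are active at inference time.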
def phishing_detection(input_text, input_type):
    prediction = ensemble_prediction(input_text, input_type)
    threshold = 0.5  # decision threshold for the phishing class
    if prediction > threshold:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
        gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
    ],
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or an HTML snippet is a phishing site.",
    theme="default"
)

iface.launch()
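# Note: launch() serves the app locally and is picked up automatically on Hugging Face
# Spaces; passing share=True would create a temporary public link when running elsewhere.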