import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
# Ensure the necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
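# On NLTK >= 3.9, word_tokenize also needs the 'punkt_tab' resource; if
# tokenization raises a LookupError, uncomment the following line:
# nltk.download('punkt_tab')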
# Load stopwords and initialize the lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Clean and preprocess URL text
def preprocess_url(url):
    url = url.lower()                         # Convert to lowercase
    url = re.sub(r'https?://', '', url)       # Strip the http:// or https:// scheme
    url = re.sub(r'www\.', '', url)           # Strip www.
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)   # Replace special characters with spaces
    url = re.sub(r'\s+', ' ', url).strip()    # Collapse repeated whitespace
    tokens = word_tokenize(url)                                  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)
# Clean and preprocess HTML text
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)        # Remove HTML tags
    html = html.lower()                         # Convert to lowercase
    html = re.sub(r'https?://', '', html)       # Strip embedded URL schemes
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)   # Replace special characters with spaces
    html = re.sub(r'\s+', ' ', html).strip()    # Collapse repeated whitespace
    tokens = word_tokenize(html)                                 # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)
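# Illustrative examples of what the two preprocessors produce, traced by hand
# from the steps above (these are preprocessing results, not model output):
#
#   preprocess_url("https://www.example.com/login?user=1")
#   -> "example com login user 1"
#
#   preprocess_html("<p>Verify your account now!</p>")
#   -> "verify account"   # "your" and "now" are NLTK English stopwords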
# Load the trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Sequence lengths and vocabulary size; these must match the values used when
# the model and tokenizers were trained (max_words is not used below, but
# documents the tokenizer vocabulary size)
max_url_length = 180
max_html_length = 2000
max_words = 10000
# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)
with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)
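# The pickled artifacts above are assumed to have come from the training run,
# roughly like the sketch below (illustrative only: `train_urls` and
# `train_labels` are hypothetical names, not defined in this app). This also
# explains the otherwise-unused Tokenizer and LabelEncoder imports at the top.
#
#   url_tokenizer = Tokenizer(num_words=max_words)
#   url_tokenizer.fit_on_texts(train_urls)
#   with open('url_tokenizer.pkl', 'wb') as f:
#       pickle.dump(url_tokenizer, f)
#
#   label_encoder = LabelEncoder()
#   label_encoder.fit(train_labels)   # e.g. ['legitimate', 'phishing']
#   with open('label_encoder.pkl', 'wb') as f:
#       pickle.dump(label_encoder, f)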
# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)

    # Convert the cleaned text to padded integer sequences
    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length, padding='post', truncating='post')
    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length, padding='post', truncating='post')

    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    # ravel() flattens the (1, 1) prediction so the label encoder receives
    # the 1-D array it expects
    new_predictions = (new_predictions_prob > 0.6).astype(int).ravel()  # Adjust threshold if needed
    predicted_category = label_encoder.inverse_transform(new_predictions)[0]
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"
    return predicted_category.capitalize(), predicted_probability
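# Example call (requires the model and pickle files to be present; the exact
# label returned depends on the classes the label encoder was fitted with):
#
#   category, probability = predict_phishing(
#       "http://secure-login.example-bank.com/verify",
#       "<html><body><form>Enter your password</form></body></html>",
#   )
#   # -> e.g. ("Phishing", "0.9123")   # hypothetical output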
# Create the Gradio interface
interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Textbox(label="HTML Snippet", lines=10, placeholder="Paste HTML content here")
    ],
    outputs=[
        gr.Textbox(label="Predicted Category"),
        gr.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict whether the page is phishing or legitimate. Providing both inputs gives more accurate results.",
    live=True,
    css="""
    .interface-container {
        border: 2px solid #4CAF50;
        border-radius: 10px;
        padding: 20px;
        text-align: center;
    }
    .gr-textbox, .gr-textbox textarea, .gr-button {
        margin-left: auto !important;
        margin-right: auto !important;
    }
    """
)
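# Note: live=True re-runs predict_phishing on every keystroke, which is costly
# for a Keras model; dropping it would make inference run only on submit.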
# Footer text
footer = gr.Markdown("""
---
<div style="text-align: center;">
    Made with ❤️ by Ramadhirra<br>
    Model by Ramadhirra<br>
    WebUI by Ramadhirra
</div>
""")

# Combine the interface and footer
app = gr.Blocks()
with app:
    interface.render()
    footer.render()

# Launch the Gradio app
app.launch()