Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,8 @@ import os
|
|
| 8 |
import pandas as pd
|
| 9 |
from collections import Counter
|
| 10 |
import io
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Download NLTK data
|
| 13 |
try:
|
|
@@ -66,7 +68,74 @@ vectorizer = joblib.load(vectorizer_path)
|
|
| 66 |
|
| 67 |
# Spam indicators
|
| 68 |
SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
|
| 69 |
-
'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
def extract_urls(message):
|
| 72 |
"""Extract all URLs from the message"""
|
|
@@ -82,6 +151,10 @@ def analyze_email(message):
|
|
| 82 |
analysis['word_count'] = len(message.split())
|
| 83 |
analysis['char_count'] = len(message)
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
# Extract URLs
|
| 86 |
analysis['urls'] = extract_urls(message)
|
| 87 |
analysis['has_urls'] = len(analysis['urls']) > 0
|
|
@@ -96,6 +169,9 @@ def analyze_email(message):
|
|
| 96 |
found_keywords = [kw for kw in SPAM_KEYWORDS if kw in message_lower]
|
| 97 |
analysis['spam_keywords'] = found_keywords
|
| 98 |
|
|
|
|
|
|
|
|
|
|
| 99 |
return analysis
|
| 100 |
|
| 101 |
def highlight_spam_words(message, keywords):
|
|
@@ -106,9 +182,39 @@ def highlight_spam_words(message, keywords):
|
|
| 106 |
highlighted = pattern.sub(f'<mark style="background-color: #ffcccc; padding: 2px 4px; border-radius: 3px;">{kw}</mark>', highlighted)
|
| 107 |
return highlighted
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
def classify_email(message):
|
| 110 |
if not message.strip():
|
| 111 |
-
return "<div style='color:gray;'>Empty message</div>", "", "", ""
|
| 112 |
|
| 113 |
try:
|
| 114 |
# Get analysis
|
|
@@ -119,10 +225,11 @@ def classify_email(message):
|
|
| 119 |
vec = vectorizer.transform([cleaned])
|
| 120 |
pred = model.predict(vec)[0]
|
| 121 |
|
| 122 |
-
|
|
|
|
| 123 |
|
| 124 |
# Result card
|
| 125 |
-
if
|
| 126 |
result_html = """
|
| 127 |
<div style='border:2px solid #ff4d4d; border-radius:10px; background-color:#ffe6e6;
|
| 128 |
padding:15px; font-size:18px; font-weight:bold; text-align:center;'>
|
|
@@ -137,11 +244,19 @@ def classify_email(message):
|
|
| 137 |
</div>
|
| 138 |
"""
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
# Analysis details
|
| 141 |
details_html = f"""
|
| 142 |
<div style='background-color:#f8f9fa; padding:15px; border-radius:8px; margin-top:10px;'>
|
| 143 |
<h3 style='margin-top:0; color:#333;'>π Email Analysis</h3>
|
| 144 |
<table style='width:100%; border-collapse: collapse;'>
|
|
|
|
|
|
|
| 145 |
<tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
|
| 146 |
<tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
|
| 147 |
<tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'β οΈ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else 'β No'}</td></tr>
|
|
@@ -152,6 +267,21 @@ def classify_email(message):
|
|
| 152 |
</div>
|
| 153 |
"""
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
# URLs detected
|
| 156 |
if analysis['urls']:
|
| 157 |
urls_html = f"""
|
|
@@ -160,6 +290,9 @@ def classify_email(message):
|
|
| 160 |
<div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
|
| 161 |
{'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
|
| 162 |
</div>
|
|
|
|
|
|
|
|
|
|
| 163 |
</div>
|
| 164 |
"""
|
| 165 |
else:
|
|
@@ -179,15 +312,22 @@ def classify_email(message):
|
|
| 179 |
else:
|
| 180 |
keywords_html = ""
|
| 181 |
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
except Exception as e:
|
| 185 |
print(f"Prediction error: {e}")
|
| 186 |
-
return "<div style='color:gray;'>Error during classification</div>", "", "", ""
|
| 187 |
-
|
| 188 |
-
def get_statistics():
|
| 189 |
-
"""Generate statistics dashboard"""
|
| 190 |
-
return ""
|
| 191 |
|
| 192 |
def process_bulk_emails(file):
|
| 193 |
"""Process bulk emails from file"""
|
|
@@ -217,9 +357,15 @@ def process_bulk_emails(file):
|
|
| 217 |
vec = vectorizer.transform([cleaned])
|
| 218 |
pred = model.predict(vec)[0]
|
| 219 |
|
|
|
|
|
|
|
|
|
|
| 220 |
results.append({
|
| 221 |
'Email': str(email)[:100] + '...' if len(str(email)) > 100 else str(email),
|
| 222 |
-
'Classification': 'Spam' if pred == 1 else 'Not Spam'
|
|
|
|
|
|
|
|
|
|
| 223 |
})
|
| 224 |
|
| 225 |
results_df = pd.DataFrame(results)
|
|
@@ -228,22 +374,31 @@ def process_bulk_emails(file):
|
|
| 228 |
output_path = "spam_classification_results.csv"
|
| 229 |
results_df.to_csv(output_path, index=False)
|
| 230 |
|
|
|
|
|
|
|
|
|
|
| 231 |
summary = f"β
Processed {len(results)} emails\n"
|
| 232 |
-
summary += f"π΄ Spam: {
|
| 233 |
-
summary += f"π’ Not Spam: {len(
|
|
|
|
| 234 |
|
| 235 |
return summary, output_path
|
| 236 |
|
| 237 |
except Exception as e:
|
| 238 |
return f"Error processing file: {str(e)}", None
|
| 239 |
|
| 240 |
-
#
|
| 241 |
examples = [
|
| 242 |
-
["Congratulations! You've won a $1000 gift card. Click here!"],
|
| 243 |
-
["URGENT: Your account has been compromised. Verify your identity now
|
| 244 |
-
["Hi mom, I'll be home for dinner tonight."],
|
| 245 |
-
["Hello team, the project report is attached."],
|
| 246 |
-
["Hey John, are we still on for lunch tomorrow?"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
]
|
| 248 |
|
| 249 |
# Custom CSS
|
|
@@ -260,7 +415,7 @@ mark {animation: highlight 0.5s ease;}
|
|
| 260 |
# Gradio interface
|
| 261 |
with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Classifier") as demo:
|
| 262 |
gr.Markdown("# π§ Enhanced Email Spam Classifier")
|
| 263 |
-
gr.Markdown("*Advanced spam detection with
|
| 264 |
|
| 265 |
with gr.Tabs():
|
| 266 |
# Single Email Tab
|
|
@@ -280,36 +435,39 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Class
|
|
| 280 |
output_label = gr.HTML(label="π Result")
|
| 281 |
|
| 282 |
analysis_output = gr.HTML(label="π Analysis Details")
|
|
|
|
| 283 |
urls_output = gr.HTML(label="π URLs Found")
|
| 284 |
keywords_output = gr.HTML(label="π Keyword Highlights")
|
|
|
|
| 285 |
|
| 286 |
gr.Examples(
|
| 287 |
examples=examples,
|
| 288 |
inputs=input_text,
|
| 289 |
-
outputs=[output_label, analysis_output, urls_output, keywords_output],
|
| 290 |
fn=classify_email
|
| 291 |
)
|
| 292 |
|
| 293 |
submit_btn.click(
|
| 294 |
fn=classify_email,
|
| 295 |
inputs=input_text,
|
| 296 |
-
outputs=[output_label, analysis_output, urls_output, keywords_output]
|
| 297 |
)
|
| 298 |
input_text.submit(
|
| 299 |
fn=classify_email,
|
| 300 |
inputs=input_text,
|
| 301 |
-
outputs=[output_label, analysis_output, urls_output, keywords_output]
|
| 302 |
)
|
| 303 |
|
| 304 |
# Bulk Processing Tab
|
| 305 |
with gr.Tab("π¦ Bulk Processing"):
|
| 306 |
gr.Markdown("### Upload a CSV or TXT file with emails (one per line)")
|
|
|
|
| 307 |
with gr.Row():
|
| 308 |
with gr.Column():
|
| 309 |
file_input = gr.File(label="π Upload File", file_types=[".csv", ".txt"])
|
| 310 |
bulk_btn = gr.Button("π Process Bulk Emails", variant="primary")
|
| 311 |
with gr.Column():
|
| 312 |
-
bulk_output = gr.Textbox(label="π Processing Summary", lines=
|
| 313 |
download_output = gr.File(label="β¬οΈ Download Results")
|
| 314 |
|
| 315 |
bulk_btn.click(
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
from collections import Counter
|
| 10 |
import io
|
| 11 |
+
from langdetect import detect, DetectorFactory
|
| 12 |
+
DetectorFactory.seed = 0
|
| 13 |
|
| 14 |
# Download NLTK data
|
| 15 |
try:
|
|
|
|
| 68 |
|
| 69 |
# Spam indicators
|
| 70 |
# Generic spam vocabulary; matched as lower-case substrings of the message.
SPAM_KEYWORDS = [
    'win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
    'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan',
    'limited time', 'act now', 'expire', 'claim', 'bonus',
]

# Credential phishing keywords
CREDENTIAL_KEYWORDS = [
    'password', 'username', 'login', 'credential', 'signin', 'sign in',
    'verify account', 'confirm identity', 'update payment', 'billing information',
    'security alert', 'unusual activity', 'locked account', 'reset password',
]

# Language code mapping (detector language code -> display name).
# Codes missing from this table are shown upper-cased by detect_language().
LANGUAGE_NAMES = {
    'en': 'English', 'es': 'Spanish', 'fr': 'French', 'de': 'German', 'it': 'Italian',
    'pt': 'Portuguese', 'ru': 'Russian', 'zh-cn': 'Chinese', 'ja': 'Japanese', 'ko': 'Korean',
    'ar': 'Arabic', 'hi': 'Hindi', 'tr': 'Turkish', 'nl': 'Dutch', 'pl': 'Polish',
    # langdetect reports Traditional Chinese under its own code.
    'zh-tw': 'Chinese (Traditional)',
}
|
| 85 |
+
|
| 86 |
+
def detect_language(text):
    """Detect the language of the text.

    Args:
        text: Message body to analyse.

    Returns:
        The display name from ``LANGUAGE_NAMES`` when the detected code is
        listed there, the upper-cased language code otherwise, or
        ``"Unknown"`` when the input is blank, not a string, or detection
        fails.
    """
    # Nothing to detect on blank / non-text input; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"

    try:
        lang_code = detect(text)
        return LANGUAGE_NAMES.get(lang_code, lang_code.upper())
    except Exception:
        # Best-effort: a detection failure must not break the analysis.
        # ``Exception`` (rather than a bare ``except``) still lets
        # KeyboardInterrupt and SystemExit propagate.
        return "Unknown"
|
| 93 |
+
|
| 94 |
+
def detect_language_switching(text):
    """Detect if text switches between multiple languages (common in scams).

    The text is split on ``'.'`` and every fragment longer than 10
    characters (after stripping) is passed to ``detect``. Fragments the
    detector cannot handle are ignored.

    Args:
        text: Message body to analyse.

    Returns:
        A ``(switching, languages)`` tuple. ``languages`` holds the distinct
        detected language codes in first-seen order; ``switching`` is True
        when more than one distinct code was found. Non-string input yields
        ``(False, [])``.
    """
    if not isinstance(text, str):
        return False, []

    languages = []
    for sentence in text.split('.'):
        # Very short fragments are skipped, as in the original threshold.
        if len(sentence.strip()) <= 10:
            continue
        try:
            languages.append(detect(sentence))
        except Exception:
            # Best-effort per fragment; do not swallow KeyboardInterrupt or
            # SystemExit the way a bare ``except`` would.
            continue

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # list shown to the user is stable between runs (a set is not).
    unique_languages = list(dict.fromkeys(languages))
    return len(unique_languages) > 1, unique_languages
|
| 113 |
+
|
| 114 |
+
def check_credential_phishing(message):
    """Check if email is asking for credentials or personal info.

    Args:
        message: Raw email text.

    Returns:
        A ``(found_credential_keywords, phishing_patterns)`` tuple:
        the entries of ``CREDENTIAL_KEYWORDS`` that occur as substrings of
        the lower-cased message, and the labels of the heuristic patterns
        below that match it.
    """
    message_lower = message.lower()

    found_credential_keywords = [
        keyword for keyword in CREDENTIAL_KEYWORDS if keyword in message_lower
    ]

    # (regex, label) pairs for common phishing patterns. The leading ``\b``
    # anchors stop alternatives from matching inside unrelated words
    # ("press" in "express", "here" in "there", "in" in "running",
    # "lock" in "unlock"); the digit guards stop "24" matching inside "2024".
    # Trailing inflections ("clicking", "suspended", "48hrs") still match.
    pattern_checks = (
        (r'\b(?:click|tap|press).*\b(?:link|here|button)',
         "Suspicious call-to-action"),
        (r'\b(?:within|in)\b.*(?<!\d)(?:24|48|72)(?!\d).*(?:hour|hr)',
         "Time pressure tactics"),
        (r'\b(?:suspend|lock|close|terminate).*\b(?:account|access)',
         "Account threat"),
        (r'\b(?:confirm|verify|update).*\b(?:information|details|data)',
         "Information request"),
    )
    phishing_patterns = [
        label for pattern, label in pattern_checks
        if re.search(pattern, message_lower)
    ]

    return found_credential_keywords, phishing_patterns
|
| 139 |
|
| 140 |
def extract_urls(message):
|
| 141 |
"""Extract all URLs from the message"""
|
|
|
|
| 151 |
analysis['word_count'] = len(message.split())
|
| 152 |
analysis['char_count'] = len(message)
|
| 153 |
|
| 154 |
+
# Language detection
|
| 155 |
+
analysis['language'] = detect_language(message)
|
| 156 |
+
analysis['language_switching'], analysis['detected_languages'] = detect_language_switching(message)
|
| 157 |
+
|
| 158 |
# Extract URLs
|
| 159 |
analysis['urls'] = extract_urls(message)
|
| 160 |
analysis['has_urls'] = len(analysis['urls']) > 0
|
|
|
|
| 169 |
found_keywords = [kw for kw in SPAM_KEYWORDS if kw in message_lower]
|
| 170 |
analysis['spam_keywords'] = found_keywords
|
| 171 |
|
| 172 |
+
# Credential phishing check
|
| 173 |
+
analysis['credential_keywords'], analysis['phishing_patterns'] = check_credential_phishing(message)
|
| 174 |
+
|
| 175 |
return analysis
|
| 176 |
|
| 177 |
def highlight_spam_words(message, keywords):
|
|
|
|
| 182 |
highlighted = pattern.sub(f'<mark style="background-color: #ffcccc; padding: 2px 4px; border-radius: 3px;">{kw}</mark>', highlighted)
|
| 183 |
return highlighted
|
| 184 |
|
| 185 |
+
def generate_security_tips(analysis, is_spam):
    """Generate personalized security tips based on analysis.

    Args:
        analysis: Dict of analysis results. Keys read here are
            'credential_keywords', 'has_urls', 'phishing_patterns',
            'language_switching' and 'all_caps_words'; a missing key is
            treated as empty / False / 0 instead of raising KeyError.
        is_spam: Truthy when the classifier flagged the message as spam.

    Returns:
        A non-empty list of tip strings; when nothing specific applies a
        single generic reminder is returned.
    """
    # NOTE(review): the leading emoji were mis-encoded in the source this was
    # recovered from. The symbols for "Never share", "Hover over", "Check if",
    # "Contact the company" and "Language switching" are best-effort
    # reconstructions; confirm them against the repository.
    tips = []

    if is_spam:
        tips.append("⚠️ This email has been flagged as spam. Exercise caution.")

    if analysis.get('credential_keywords'):
        tips.append("🔐 Never share passwords or credentials via email.")
        tips.append("🛡️ Legitimate companies won't ask for sensitive info via email.")

    if analysis.get('has_urls'):
        tips.append("🔗 Hover over links before clicking to verify destination.")
        tips.append("🔍 Check if URL matches the official company website.")

    if analysis.get('phishing_patterns'):
        tips.append("⏰ Be suspicious of emails creating artificial urgency.")
        tips.append("📞 Contact the company directly using official contact info.")

    if analysis.get('language_switching'):
        tips.append("🌍 Language switching is a common tactic in international scams.")

    # ``or 0`` also covers an explicit None stored under the key.
    if (analysis.get('all_caps_words') or 0) > 3:
        tips.append("📢 Excessive capitalization is often used to create panic.")

    if not tips:
        tips.append("✅ Stay vigilant with all emails requesting action or information.")

    return tips
|
| 214 |
+
|
| 215 |
def classify_email(message):
|
| 216 |
if not message.strip():
|
| 217 |
+
return "<div style='color:gray;'>Empty message</div>", "", "", "", ""
|
| 218 |
|
| 219 |
try:
|
| 220 |
# Get analysis
|
|
|
|
| 225 |
vec = vectorizer.transform([cleaned])
|
| 226 |
pred = model.predict(vec)[0]
|
| 227 |
|
| 228 |
+
is_spam = pred == 1
|
| 229 |
+
result_type = "Spam" if is_spam else "Not Spam"
|
| 230 |
|
| 231 |
# Result card
|
| 232 |
+
if is_spam:
|
| 233 |
result_html = """
|
| 234 |
<div style='border:2px solid #ff4d4d; border-radius:10px; background-color:#ffe6e6;
|
| 235 |
padding:15px; font-size:18px; font-weight:bold; text-align:center;'>
|
|
|
|
| 244 |
</div>
|
| 245 |
"""
|
| 246 |
|
| 247 |
+
# Language info
|
| 248 |
+
lang_warning = ""
|
| 249 |
+
if analysis['language_switching']:
|
| 250 |
+
langs = ', '.join([LANGUAGE_NAMES.get(l, l) for l in analysis['detected_languages']])
|
| 251 |
+
lang_warning = f"<tr style='background-color:#fff3cd;'><td style='padding:5px;'><b>β οΈ Language Switching:</b></td><td>Yes ({langs})</td></tr>"
|
| 252 |
+
|
| 253 |
# Analysis details
|
| 254 |
details_html = f"""
|
| 255 |
<div style='background-color:#f8f9fa; padding:15px; border-radius:8px; margin-top:10px;'>
|
| 256 |
<h3 style='margin-top:0; color:#333;'>π Email Analysis</h3>
|
| 257 |
<table style='width:100%; border-collapse: collapse;'>
|
| 258 |
+
<tr><td style='padding:5px;'><b>Detected Language:</b></td><td>{analysis['language']}</td></tr>
|
| 259 |
+
{lang_warning}
|
| 260 |
<tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
|
| 261 |
<tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
|
| 262 |
<tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'β οΈ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else 'β No'}</td></tr>
|
|
|
|
| 267 |
</div>
|
| 268 |
"""
|
| 269 |
|
| 270 |
+
# Credential phishing warning
|
| 271 |
+
if analysis['credential_keywords'] or analysis['phishing_patterns']:
|
| 272 |
+
credential_html = f"""
|
| 273 |
+
<div style='background-color:#ffebee; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #d32f2f;'>
|
| 274 |
+
<h3 style='margin-top:0; color:#d32f2f;'>π Credential Phishing Alert!</h3>
|
| 275 |
+
{f"<p style='margin:5px 0;'><b>Suspicious Keywords:</b> {', '.join(analysis['credential_keywords'])}</p>" if analysis['credential_keywords'] else ""}
|
| 276 |
+
{f"<p style='margin:5px 0;'><b>Phishing Patterns:</b> {', '.join(analysis['phishing_patterns'])}</p>" if analysis['phishing_patterns'] else ""}
|
| 277 |
+
<p style='margin:10px 0 0 0; padding:10px; background-color:#fff; border-radius:5px;'>
|
| 278 |
+
<b>β οΈ Warning:</b> This email appears to be attempting to steal your credentials or personal information.
|
| 279 |
+
</p>
|
| 280 |
+
</div>
|
| 281 |
+
"""
|
| 282 |
+
else:
|
| 283 |
+
credential_html = ""
|
| 284 |
+
|
| 285 |
# URLs detected
|
| 286 |
if analysis['urls']:
|
| 287 |
urls_html = f"""
|
|
|
|
| 290 |
<div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
|
| 291 |
{'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
|
| 292 |
</div>
|
| 293 |
+
<p style='margin:10px 0 0 0; font-size:13px; color:#666;'>
|
| 294 |
+
π‘ Tip: Always verify URLs before clicking. Hover to see the actual destination.
|
| 295 |
+
</p>
|
| 296 |
</div>
|
| 297 |
"""
|
| 298 |
else:
|
|
|
|
| 312 |
else:
|
| 313 |
keywords_html = ""
|
| 314 |
|
| 315 |
+
# Security tips
|
| 316 |
+
tips = generate_security_tips(analysis, is_spam)
|
| 317 |
+
tips_html = f"""
|
| 318 |
+
<div style='background-color:#e8f5e9; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #4caf50;'>
|
| 319 |
+
<h3 style='margin-top:0; color:#2e7d32;'>π‘οΈ Security Tips</h3>
|
| 320 |
+
<ul style='margin:5px 0; padding-left:20px;'>
|
| 321 |
+
{''.join(['<li style="margin:5px 0;">' + tip + '</li>' for tip in tips])}
|
| 322 |
+
</ul>
|
| 323 |
+
</div>
|
| 324 |
+
"""
|
| 325 |
+
|
| 326 |
+
return result_html, details_html, credential_html, urls_html, keywords_html, tips_html
|
| 327 |
|
| 328 |
except Exception as e:
|
| 329 |
print(f"Prediction error: {e}")
|
| 330 |
+
return "<div style='color:gray;'>Error during classification</div>", "", "", "", "", ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
def process_bulk_emails(file):
|
| 333 |
"""Process bulk emails from file"""
|
|
|
|
| 357 |
vec = vectorizer.transform([cleaned])
|
| 358 |
pred = model.predict(vec)[0]
|
| 359 |
|
| 360 |
+
# Additional analysis
|
| 361 |
+
analysis = analyze_email(str(email))
|
| 362 |
+
|
| 363 |
results.append({
|
| 364 |
'Email': str(email)[:100] + '...' if len(str(email)) > 100 else str(email),
|
| 365 |
+
'Classification': 'Spam' if pred == 1 else 'Not Spam',
|
| 366 |
+
'Language': analysis['language'],
|
| 367 |
+
'Has_URLs': 'Yes' if analysis['has_urls'] else 'No',
|
| 368 |
+
'Credential_Risk': 'High' if analysis['credential_keywords'] else 'Low'
|
| 369 |
})
|
| 370 |
|
| 371 |
results_df = pd.DataFrame(results)
|
|
|
|
| 374 |
output_path = "spam_classification_results.csv"
|
| 375 |
results_df.to_csv(output_path, index=False)
|
| 376 |
|
| 377 |
+
spam_count = len([r for r in results if r['Classification'] == 'Spam'])
|
| 378 |
+
credential_risks = len([r for r in results if r['Credential_Risk'] == 'High'])
|
| 379 |
+
|
| 380 |
summary = f"β
Processed {len(results)} emails\n"
|
| 381 |
+
summary += f"π΄ Spam: {spam_count}\n"
|
| 382 |
+
summary += f"π’ Not Spam: {len(results) - spam_count}\n"
|
| 383 |
+
summary += f"π Credential Phishing Risk: {credential_risks}"
|
| 384 |
|
| 385 |
return summary, output_path
|
| 386 |
|
| 387 |
except Exception as e:
|
| 388 |
return f"Error processing file: {str(e)}", None
|
| 389 |
|
| 390 |
+
# Enhanced examples with more diverse scenarios
|
| 391 |
# Sample messages offered in the UI: each inner list is one example input
# (a single message string) for the classifier.
examples = [
    ["Congratulations! You've won a $1000 gift card. Click here to claim your prize now!"],
    ["URGENT: Your account has been compromised. Verify your identity now by clicking this link and entering your password."],
    ["Hi mom, I'll be home for dinner tonight. See you around 7pm."],
    ["Hello team, the project report is attached. Please review before tomorrow's meeting."],
    ["Hey John, are we still on for lunch tomorrow? Let me know!"],
    ["FINAL NOTICE: Your payment is overdue. Click here within 24 hours to update your billing information or your account will be suspended."],
    ["Dear customer, we detected unusual activity on your account. Please verify your login credentials immediately."],
    ["You have been selected for our exclusive offer! Limited time only - act now to receive FREE cash bonus!"],
    ["Meeting reminder: Don't forget about our team sync at 3pm today. Conference room B."],
    ["Your package delivery requires signature. Track your shipment here: http://suspicious-tracking-link.com"],
]
|
| 403 |
|
| 404 |
# Custom CSS
|
|
|
|
| 415 |
# Gradio interface
|
| 416 |
with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Classifier") as demo:
|
| 417 |
gr.Markdown("# π§ Enhanced Email Spam Classifier")
|
| 418 |
+
gr.Markdown("*Advanced spam detection with multi-language support and credential phishing detection*")
|
| 419 |
|
| 420 |
with gr.Tabs():
|
| 421 |
# Single Email Tab
|
|
|
|
| 435 |
output_label = gr.HTML(label="π Result")
|
| 436 |
|
| 437 |
analysis_output = gr.HTML(label="π Analysis Details")
|
| 438 |
+
credential_output = gr.HTML(label="π Credential Phishing Check")
|
| 439 |
urls_output = gr.HTML(label="π URLs Found")
|
| 440 |
keywords_output = gr.HTML(label="π Keyword Highlights")
|
| 441 |
+
tips_output = gr.HTML(label="π‘οΈ Security Tips")
|
| 442 |
|
| 443 |
gr.Examples(
|
| 444 |
examples=examples,
|
| 445 |
inputs=input_text,
|
| 446 |
+
outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output],
|
| 447 |
fn=classify_email
|
| 448 |
)
|
| 449 |
|
| 450 |
submit_btn.click(
|
| 451 |
fn=classify_email,
|
| 452 |
inputs=input_text,
|
| 453 |
+
outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output]
|
| 454 |
)
|
| 455 |
input_text.submit(
|
| 456 |
fn=classify_email,
|
| 457 |
inputs=input_text,
|
| 458 |
+
outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output]
|
| 459 |
)
|
| 460 |
|
| 461 |
# Bulk Processing Tab
|
| 462 |
with gr.Tab("π¦ Bulk Processing"):
|
| 463 |
gr.Markdown("### Upload a CSV or TXT file with emails (one per line)")
|
| 464 |
+
gr.Markdown("*Results will include spam classification, language detection, and credential phishing risk*")
|
| 465 |
with gr.Row():
|
| 466 |
with gr.Column():
|
| 467 |
file_input = gr.File(label="π Upload File", file_types=[".csv", ".txt"])
|
| 468 |
bulk_btn = gr.Button("π Process Bulk Emails", variant="primary")
|
| 469 |
with gr.Column():
|
| 470 |
+
bulk_output = gr.Textbox(label="π Processing Summary", lines=6)
|
| 471 |
download_output = gr.File(label="β¬οΈ Download Results")
|
| 472 |
|
| 473 |
bulk_btn.click(
|