Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -64,13 +64,16 @@ vectorizer_path = "tfidf_vectorizer.joblib"
|
|
| 64 |
model = joblib.load(model_path)
|
| 65 |
vectorizer = joblib.load(vectorizer_path)
|
| 66 |
|
| 67 |
-
# Session statistics
|
| 68 |
-
session_stats = {"total": 0, "spam": 0, "not_spam": 0, "history": []}
|
| 69 |
-
|
| 70 |
# Spam indicators
|
| 71 |
SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
|
| 72 |
'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan']
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def analyze_email(message):
|
| 75 |
"""Detailed email analysis"""
|
| 76 |
analysis = {}
|
|
@@ -78,7 +81,10 @@ def analyze_email(message):
|
|
| 78 |
# Basic stats
|
| 79 |
analysis['word_count'] = len(message.split())
|
| 80 |
analysis['char_count'] = len(message)
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
analysis['has_email'] = bool(re.search(r'\S+@\S+', message))
|
| 83 |
|
| 84 |
# Suspicious patterns
|
|
@@ -102,7 +108,7 @@ def highlight_spam_words(message, keywords):
|
|
| 102 |
|
| 103 |
def classify_email(message):
|
| 104 |
if not message.strip():
|
| 105 |
-
return "<div style='color:gray;'>Empty message</div>", "", ""
|
| 106 |
|
| 107 |
try:
|
| 108 |
# Get analysis
|
|
@@ -113,19 +119,7 @@ def classify_email(message):
|
|
| 113 |
vec = vectorizer.transform([cleaned])
|
| 114 |
pred = model.predict(vec)[0]
|
| 115 |
|
| 116 |
-
|
| 117 |
-
session_stats['total'] += 1
|
| 118 |
-
if pred == 1:
|
| 119 |
-
session_stats['spam'] += 1
|
| 120 |
-
result_type = "Spam"
|
| 121 |
-
else:
|
| 122 |
-
session_stats['not_spam'] += 1
|
| 123 |
-
result_type = "Not Spam"
|
| 124 |
-
|
| 125 |
-
session_stats['history'].append({
|
| 126 |
-
'message': message[:50] + '...' if len(message) > 50 else message,
|
| 127 |
-
'result': result_type
|
| 128 |
-
})
|
| 129 |
|
| 130 |
# Result card
|
| 131 |
if pred == 1:
|
|
@@ -150,7 +144,7 @@ def classify_email(message):
|
|
| 150 |
<table style='width:100%; border-collapse: collapse;'>
|
| 151 |
<tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
|
| 152 |
<tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
|
| 153 |
-
<tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'β οΈ Yes' if analysis['has_urls'] else 'β No'}</td></tr>
|
| 154 |
<tr><td style='padding:5px;'><b>Contains Emails:</b></td><td>{'Yes' if analysis['has_email'] else 'No'}</td></tr>
|
| 155 |
<tr><td style='padding:5px;'><b>ALL CAPS Words:</b></td><td>{analysis['all_caps_words']}</td></tr>
|
| 156 |
<tr><td style='padding:5px;'><b>Exclamation Marks:</b></td><td>{analysis['exclamation_marks']}</td></tr>
|
|
@@ -158,10 +152,23 @@ def classify_email(message):
|
|
| 158 |
</div>
|
| 159 |
"""
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
# Highlighted message with spam keywords
|
| 162 |
if analysis['spam_keywords']:
|
| 163 |
keywords_html = f"""
|
| 164 |
-
<div style='background-color:#
|
| 165 |
<h3 style='margin-top:0; color:#333;'>⚠️ Suspicious Keywords Found</h3>
|
| 166 |
<p style='margin:5px 0;'><b>Keywords:</b> {', '.join(analysis['spam_keywords'])}</p>
|
| 167 |
<div style='background-color:white; padding:10px; border-radius:5px; margin-top:10px; font-size:14px; line-height:1.6;'>
|
|
@@ -172,40 +179,15 @@ def classify_email(message):
|
|
| 172 |
else:
|
| 173 |
keywords_html = ""
|
| 174 |
|
| 175 |
-
return result_html, details_html, keywords_html
|
| 176 |
|
| 177 |
except Exception as e:
|
| 178 |
print(f"Prediction error: {e}")
|
| 179 |
-
return "<div style='color:gray;'>Error during classification</div>", "", ""
|
| 180 |
|
| 181 |
def get_statistics():
|
| 182 |
"""Generate statistics dashboard"""
|
| 183 |
-
|
| 184 |
-
return "<div style='text-align:center; color:gray; padding:20px;'>No emails checked yet</div>"
|
| 185 |
-
|
| 186 |
-
spam_pct = (session_stats['spam'] / session_stats['total']) * 100
|
| 187 |
-
not_spam_pct = (session_stats['not_spam'] / session_stats['total']) * 100
|
| 188 |
-
|
| 189 |
-
stats_html = f"""
|
| 190 |
-
<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding:20px; border-radius:10px; color:white;'>
|
| 191 |
-
<h2 style='margin-top:0; text-align:center;'>π Session Statistics</h2>
|
| 192 |
-
<div style='display:grid; grid-template-columns: repeat(3, 1fr); gap:15px; margin-top:15px;'>
|
| 193 |
-
<div style='background-color:rgba(255,255,255,0.2); padding:15px; border-radius:8px; text-align:center;'>
|
| 194 |
-
<div style='font-size:32px; font-weight:bold;'>{session_stats['total']}</div>
|
| 195 |
-
<div style='font-size:14px;'>Total Checked</div>
|
| 196 |
-
</div>
|
| 197 |
-
<div style='background-color:rgba(255,77,77,0.3); padding:15px; border-radius:8px; text-align:center;'>
|
| 198 |
-
<div style='font-size:32px; font-weight:bold;'>{session_stats['spam']}</div>
|
| 199 |
-
<div style='font-size:14px;'>Spam ({spam_pct:.1f}%)</div>
|
| 200 |
-
</div>
|
| 201 |
-
<div style='background-color:rgba(77,255,77,0.3); padding:15px; border-radius:8px; text-align:center;'>
|
| 202 |
-
<div style='font-size:32px; font-weight:bold;'>{session_stats['not_spam']}</div>
|
| 203 |
-
<div style='font-size:14px;'>Legitimate ({not_spam_pct:.1f}%)</div>
|
| 204 |
-
</div>
|
| 205 |
-
</div>
|
| 206 |
-
</div>
|
| 207 |
-
"""
|
| 208 |
-
return stats_html
|
| 209 |
|
| 210 |
def process_bulk_emails(file):
|
| 211 |
"""Process bulk emails from file"""
|
|
@@ -298,24 +280,25 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Class
|
|
| 298 |
output_label = gr.HTML(label="π Result")
|
| 299 |
|
| 300 |
analysis_output = gr.HTML(label="π Analysis Details")
|
|
|
|
| 301 |
keywords_output = gr.HTML(label="π Keyword Highlights")
|
| 302 |
|
| 303 |
gr.Examples(
|
| 304 |
examples=examples,
|
| 305 |
inputs=input_text,
|
| 306 |
-
outputs=[output_label, analysis_output, keywords_output],
|
| 307 |
fn=classify_email
|
| 308 |
)
|
| 309 |
|
| 310 |
submit_btn.click(
|
| 311 |
fn=classify_email,
|
| 312 |
inputs=input_text,
|
| 313 |
-
outputs=[output_label, analysis_output, keywords_output]
|
| 314 |
)
|
| 315 |
input_text.submit(
|
| 316 |
fn=classify_email,
|
| 317 |
inputs=input_text,
|
| 318 |
-
outputs=[output_label, analysis_output, keywords_output]
|
| 319 |
)
|
| 320 |
|
| 321 |
# Bulk Processing Tab
|
|
@@ -334,14 +317,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Class
|
|
| 334 |
inputs=file_input,
|
| 335 |
outputs=[bulk_output, download_output]
|
| 336 |
)
|
| 337 |
-
|
| 338 |
-
# Statistics Tab
|
| 339 |
-
with gr.Tab("π Statistics"):
|
| 340 |
-
stats_display = gr.HTML()
|
| 341 |
-
refresh_btn = gr.Button("π Refresh Statistics", variant="primary")
|
| 342 |
-
|
| 343 |
-
refresh_btn.click(fn=get_statistics, outputs=stats_display)
|
| 344 |
-
demo.load(fn=get_statistics, outputs=stats_display)
|
| 345 |
|
| 346 |
if __name__ == "__main__":
|
| 347 |
demo.launch()
|
|
|
|
| 64 |
model = joblib.load(model_path)
|
| 65 |
vectorizer = joblib.load(vectorizer_path)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
| 67 |
# Spam indicators
|
| 68 |
SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
|
| 69 |
'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan']
|
| 70 |
|
| 71 |
+
# Characters RFC 3986 permits in a URI: unreserved, gen-delims, sub-delims and
# "%" for percent-escapes. Listing them explicitly (instead of the ASCII range
# "$-_") keeps "<", ">", "\" and "^" out of a match, and lets "~" and "#"
# (user directories, fragments) stay in.
_URL_PATTERN = re.compile(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+")

# Sentence punctuation that usually belongs to the surrounding text rather
# than to the URL when it appears at the very end of a match.
_URL_TRAILING_PUNCTUATION = ".,;:!?'"


def extract_urls(message):
    """Extract all http(s) URLs from the message.

    Args:
        message: Text to scan. Falsy values (``None``, ``""``) yield ``[]``.

    Returns:
        A list of URL strings in order of appearance. Each URL has trailing
        sentence punctuation and any unbalanced closing parenthesis removed,
        and contains only characters that RFC 3986 allows in a URI.

    Note:
        The result never contains ``<``, ``>`` or ``"``, but callers that
        embed URLs in HTML should still escape them (for example ``&``).
    """
    if not message:
        return []

    urls = []
    for candidate in _URL_PATTERN.findall(message):
        url = candidate.rstrip(_URL_TRAILING_PUNCTUATION)
        # "(see http://example.com/page)" -> the final ")" closes the
        # sentence's parenthesis, not one inside the URL.
        while url.endswith(")") and url.count(")") > url.count("("):
            url = url[:-1].rstrip(_URL_TRAILING_PUNCTUATION)
        # Skip matches that are nothing but a scheme, e.g. "http://...".
        if url.split("://", 1)[1]:
            urls.append(url)
    return urls
|
| 76 |
+
|
| 77 |
def analyze_email(message):
|
| 78 |
"""Detailed email analysis"""
|
| 79 |
analysis = {}
|
|
|
|
| 81 |
# Basic stats
|
| 82 |
analysis['word_count'] = len(message.split())
|
| 83 |
analysis['char_count'] = len(message)
|
| 84 |
+
|
| 85 |
+
# Extract URLs
|
| 86 |
+
analysis['urls'] = extract_urls(message)
|
| 87 |
+
analysis['has_urls'] = len(analysis['urls']) > 0
|
| 88 |
analysis['has_email'] = bool(re.search(r'\S+@\S+', message))
|
| 89 |
|
| 90 |
# Suspicious patterns
|
|
|
|
| 108 |
|
| 109 |
def classify_email(message):
|
| 110 |
if not message.strip():
|
| 111 |
+
return "<div style='color:gray;'>Empty message</div>", "", "", ""
|
| 112 |
|
| 113 |
try:
|
| 114 |
# Get analysis
|
|
|
|
| 119 |
vec = vectorizer.transform([cleaned])
|
| 120 |
pred = model.predict(vec)[0]
|
| 121 |
|
| 122 |
+
result_type = "Spam" if pred == 1 else "Not Spam"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
# Result card
|
| 125 |
if pred == 1:
|
|
|
|
| 144 |
<table style='width:100%; border-collapse: collapse;'>
|
| 145 |
<tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
|
| 146 |
<tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
|
| 147 |
+
<tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'β οΈ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else 'β No'}</td></tr>
|
| 148 |
<tr><td style='padding:5px;'><b>Contains Emails:</b></td><td>{'Yes' if analysis['has_email'] else 'No'}</td></tr>
|
| 149 |
<tr><td style='padding:5px;'><b>ALL CAPS Words:</b></td><td>{analysis['all_caps_words']}</td></tr>
|
| 150 |
<tr><td style='padding:5px;'><b>Exclamation Marks:</b></td><td>{analysis['exclamation_marks']}</td></tr>
|
|
|
|
| 152 |
</div>
|
| 153 |
"""
|
| 154 |
|
| 155 |
+
# URLs detected
|
| 156 |
+
if analysis['urls']:
|
| 157 |
+
urls_html = f"""
|
| 158 |
+
<div style='background-color:#fff3cd; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #ff9800;'>
|
| 159 |
+
<h3 style='margin-top:0; color:#333;'>π URLs Detected</h3>
|
| 160 |
+
<div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
|
| 161 |
+
{'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
|
| 162 |
+
</div>
|
| 163 |
+
</div>
|
| 164 |
+
"""
|
| 165 |
+
else:
|
| 166 |
+
urls_html = ""
|
| 167 |
+
|
| 168 |
# Highlighted message with spam keywords
|
| 169 |
if analysis['spam_keywords']:
|
| 170 |
keywords_html = f"""
|
| 171 |
+
<div style='background-color:#ffebee; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #f44336;'>
|
| 172 |
<h3 style='margin-top:0; color:#333;'>⚠️ Suspicious Keywords Found</h3>
|
| 173 |
<p style='margin:5px 0;'><b>Keywords:</b> {', '.join(analysis['spam_keywords'])}</p>
|
| 174 |
<div style='background-color:white; padding:10px; border-radius:5px; margin-top:10px; font-size:14px; line-height:1.6;'>
|
|
|
|
| 179 |
else:
|
| 180 |
keywords_html = ""
|
| 181 |
|
| 182 |
+
return result_html, details_html, urls_html, keywords_html
|
| 183 |
|
| 184 |
except Exception as e:
|
| 185 |
print(f"Prediction error: {e}")
|
| 186 |
+
return "<div style='color:gray;'>Error during classification</div>", "", "", ""
|
| 187 |
|
| 188 |
def get_statistics():
|
| 189 |
"""Return an empty string; this stub no longer builds any statistics HTML."""
|
| 190 |
+
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
def process_bulk_emails(file):
|
| 193 |
"""Process bulk emails from file"""
|
|
|
|
| 280 |
output_label = gr.HTML(label="π Result")
|
| 281 |
|
| 282 |
analysis_output = gr.HTML(label="π Analysis Details")
|
| 283 |
+
urls_output = gr.HTML(label="π URLs Found")
|
| 284 |
keywords_output = gr.HTML(label="π Keyword Highlights")
|
| 285 |
|
| 286 |
gr.Examples(
|
| 287 |
examples=examples,
|
| 288 |
inputs=input_text,
|
| 289 |
+
outputs=[output_label, analysis_output, urls_output, keywords_output],
|
| 290 |
fn=classify_email
|
| 291 |
)
|
| 292 |
|
| 293 |
submit_btn.click(
|
| 294 |
fn=classify_email,
|
| 295 |
inputs=input_text,
|
| 296 |
+
outputs=[output_label, analysis_output, urls_output, keywords_output]
|
| 297 |
)
|
| 298 |
input_text.submit(
|
| 299 |
fn=classify_email,
|
| 300 |
inputs=input_text,
|
| 301 |
+
outputs=[output_label, analysis_output, urls_output, keywords_output]
|
| 302 |
)
|
| 303 |
|
| 304 |
# Bulk Processing Tab
|
|
|
|
| 317 |
inputs=file_input,
|
| 318 |
outputs=[bulk_output, download_output]
|
| 319 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
if __name__ == "__main__":
|
| 322 |
demo.launch()
|