Vedag812 committed on
Commit
921fc4a
·
verified ·
1 Parent(s): 4d007d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -24
app.py CHANGED
@@ -8,6 +8,8 @@ import os
8
  import pandas as pd
9
  from collections import Counter
10
  import io
 
 
11
 
12
  # Download NLTK data
13
  try:
@@ -66,7 +68,74 @@ vectorizer = joblib.load(vectorizer_path)
66
 
67
  # Spam indicators
68
  SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
69
- 'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def extract_urls(message):
72
  """Extract all URLs from the message"""
@@ -82,6 +151,10 @@ def analyze_email(message):
82
  analysis['word_count'] = len(message.split())
83
  analysis['char_count'] = len(message)
84
 
 
 
 
 
85
  # Extract URLs
86
  analysis['urls'] = extract_urls(message)
87
  analysis['has_urls'] = len(analysis['urls']) > 0
@@ -96,6 +169,9 @@ def analyze_email(message):
96
  found_keywords = [kw for kw in SPAM_KEYWORDS if kw in message_lower]
97
  analysis['spam_keywords'] = found_keywords
98
 
 
 
 
99
  return analysis
100
 
101
  def highlight_spam_words(message, keywords):
@@ -106,9 +182,39 @@ def highlight_spam_words(message, keywords):
106
  highlighted = pattern.sub(f'<mark style="background-color: #ffcccc; padding: 2px 4px; border-radius: 3px;">{kw}</mark>', highlighted)
107
  return highlighted
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  def classify_email(message):
110
  if not message.strip():
111
- return "<div style='color:gray;'>Empty message</div>", "", "", ""
112
 
113
  try:
114
  # Get analysis
@@ -119,10 +225,11 @@ def classify_email(message):
119
  vec = vectorizer.transform([cleaned])
120
  pred = model.predict(vec)[0]
121
 
122
- result_type = "Spam" if pred == 1 else "Not Spam"
 
123
 
124
  # Result card
125
- if pred == 1:
126
  result_html = """
127
  <div style='border:2px solid #ff4d4d; border-radius:10px; background-color:#ffe6e6;
128
  padding:15px; font-size:18px; font-weight:bold; text-align:center;'>
@@ -137,11 +244,19 @@ def classify_email(message):
137
  </div>
138
  """
139
 
 
 
 
 
 
 
140
  # Analysis details
141
  details_html = f"""
142
  <div style='background-color:#f8f9fa; padding:15px; border-radius:8px; margin-top:10px;'>
143
  <h3 style='margin-top:0; color:#333;'>πŸ“Š Email Analysis</h3>
144
  <table style='width:100%; border-collapse: collapse;'>
 
 
145
  <tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
146
  <tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
147
  <tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'⚠️ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else 'βœ“ No'}</td></tr>
@@ -152,6 +267,21 @@ def classify_email(message):
152
  </div>
153
  """
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # URLs detected
156
  if analysis['urls']:
157
  urls_html = f"""
@@ -160,6 +290,9 @@ def classify_email(message):
160
  <div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
161
  {'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
162
  </div>
 
 
 
163
  </div>
164
  """
165
  else:
@@ -179,15 +312,22 @@ def classify_email(message):
179
  else:
180
  keywords_html = ""
181
 
182
- return result_html, details_html, urls_html, keywords_html
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  except Exception as e:
185
  print(f"Prediction error: {e}")
186
- return "<div style='color:gray;'>Error during classification</div>", "", "", ""
187
-
188
- def get_statistics():
189
- """Generate statistics dashboard"""
190
- return ""
191
 
192
  def process_bulk_emails(file):
193
  """Process bulk emails from file"""
@@ -217,9 +357,15 @@ def process_bulk_emails(file):
217
  vec = vectorizer.transform([cleaned])
218
  pred = model.predict(vec)[0]
219
 
 
 
 
220
  results.append({
221
  'Email': str(email)[:100] + '...' if len(str(email)) > 100 else str(email),
222
- 'Classification': 'Spam' if pred == 1 else 'Not Spam'
 
 
 
223
  })
224
 
225
  results_df = pd.DataFrame(results)
@@ -228,22 +374,31 @@ def process_bulk_emails(file):
228
  output_path = "spam_classification_results.csv"
229
  results_df.to_csv(output_path, index=False)
230
 
 
 
 
231
  summary = f"βœ… Processed {len(results)} emails\n"
232
- summary += f"πŸ”΄ Spam: {len([r for r in results if r['Classification'] == 'Spam'])}\n"
233
- summary += f"🟒 Not Spam: {len([r for r in results if r['Classification'] == 'Not Spam'])}"
 
234
 
235
  return summary, output_path
236
 
237
  except Exception as e:
238
  return f"Error processing file: {str(e)}", None
239
 
240
- # Examples
241
  examples = [
242
- ["Congratulations! You've won a $1000 gift card. Click here!"],
243
- ["URGENT: Your account has been compromised. Verify your identity now!"],
244
- ["Hi mom, I'll be home for dinner tonight."],
245
- ["Hello team, the project report is attached."],
246
- ["Hey John, are we still on for lunch tomorrow?"]
 
 
 
 
 
247
  ]
248
 
249
  # Custom CSS
@@ -260,7 +415,7 @@ mark {animation: highlight 0.5s ease;}
260
  # Gradio interface
261
  with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Classifier") as demo:
262
  gr.Markdown("# πŸ“§ Enhanced Email Spam Classifier")
263
- gr.Markdown("*Advanced spam detection with detailed analysis and bulk processing*")
264
 
265
  with gr.Tabs():
266
  # Single Email Tab
@@ -280,36 +435,39 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Class
280
  output_label = gr.HTML(label="πŸ“Š Result")
281
 
282
  analysis_output = gr.HTML(label="πŸ“‹ Analysis Details")
 
283
  urls_output = gr.HTML(label="πŸ”— URLs Found")
284
  keywords_output = gr.HTML(label="πŸ”Ž Keyword Highlights")
 
285
 
286
  gr.Examples(
287
  examples=examples,
288
  inputs=input_text,
289
- outputs=[output_label, analysis_output, urls_output, keywords_output],
290
  fn=classify_email
291
  )
292
 
293
  submit_btn.click(
294
  fn=classify_email,
295
  inputs=input_text,
296
- outputs=[output_label, analysis_output, urls_output, keywords_output]
297
  )
298
  input_text.submit(
299
  fn=classify_email,
300
  inputs=input_text,
301
- outputs=[output_label, analysis_output, urls_output, keywords_output]
302
  )
303
 
304
  # Bulk Processing Tab
305
  with gr.Tab("πŸ“¦ Bulk Processing"):
306
  gr.Markdown("### Upload a CSV or TXT file with emails (one per line)")
 
307
  with gr.Row():
308
  with gr.Column():
309
  file_input = gr.File(label="πŸ“ Upload File", file_types=[".csv", ".txt"])
310
  bulk_btn = gr.Button("πŸš€ Process Bulk Emails", variant="primary")
311
  with gr.Column():
312
- bulk_output = gr.Textbox(label="πŸ“Š Processing Summary", lines=5)
313
  download_output = gr.File(label="⬇️ Download Results")
314
 
315
  bulk_btn.click(
 
8
  import pandas as pd
9
  from collections import Counter
10
  import io
11
+ from langdetect import detect, DetectorFactory
12
+ DetectorFactory.seed = 0
13
 
14
  # Download NLTK data
15
  try:
 
68
 
69
  # Spam indicators
70
  SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
71
+ 'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan',
72
+ 'limited time', 'act now', 'expire', 'claim', 'bonus']
73
+
74
# Phrases that check_credential_phishing looks for (as lowercase substrings)
# to flag messages asking for login or payment details.
CREDENTIAL_KEYWORDS = [
    'password',
    'username',
    'login',
    'credential',
    'signin',
    'sign in',
    'verify account',
    'confirm identity',
    'update payment',
    'billing information',
    'security alert',
    'unusual activity',
    'locked account',
    'reset password',
]

# Display names for detected language codes; codes missing from this table
# are shown upper-cased by detect_language.
LANGUAGE_NAMES = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'zh-cn': 'Chinese',
    'ja': 'Japanese',
    'ko': 'Korean',
    'ar': 'Arabic',
    'hi': 'Hindi',
    'tr': 'Turkish',
    'nl': 'Dutch',
    'pl': 'Polish',
}
85
+
86
def detect_language(text):
    """Return a display name for the language detected in ``text``.

    The code reported by ``detect`` is looked up in ``LANGUAGE_NAMES``; a
    code that is not in the table is returned upper-cased instead.

    Returns:
        The language name (or upper-cased code), or ``"Unknown"`` when
        detection raises.
    """
    try:
        lang_code = detect(text)
    except Exception:
        # Best-effort: a detection failure must not break the analysis.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return "Unknown"
    return LANGUAGE_NAMES.get(lang_code, lang_code.upper())
93
+
94
def detect_language_switching(text):
    """Detect whether ``text`` switches between languages (common in scams).

    The text is split on ``'.'`` and ``detect`` is run on every fragment that
    is longer than 10 characters once stripped. Fragments for which detection
    raises are ignored.

    Returns:
        A ``(switching, languages)`` tuple. ``languages`` holds the distinct
        detected codes in the order they were first seen, and ``switching``
        is True when more than one code was found. ``(False, [])`` is
        returned when ``text`` cannot be split at all.
    """
    try:
        fragments = text.split('.')
    except Exception:
        # Not a splittable string (e.g. None): report "no switching".
        return False, []

    languages = []
    for fragment in fragments:
        # Very short fragments are skipped rather than passed to detect.
        if len(fragment.strip()) <= 10:
            continue
        try:
            languages.append(detect(fragment))
        except Exception:
            # Detection failed for this fragment; the others still count.
            continue

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # language list shown to the user is stable between runs.
    unique_languages = list(dict.fromkeys(languages))
    return len(unique_languages) > 1, unique_languages
113
+
114
def check_credential_phishing(message):
    """Look for credential-related keywords and phishing phrasing in ``message``.

    Matching is done on the lower-cased message: keywords are plain substring
    checks against ``CREDENTIAL_KEYWORDS``, and the phrasing checks are
    regular-expression searches.

    Returns:
        A ``(keywords, patterns)`` tuple: the matching keywords in table
        order, and the labels of the phrasing checks that matched.
    """
    lowered = message.lower()
    keyword_hits = [kw for kw in CREDENTIAL_KEYWORDS if kw in lowered]

    # (regex, label) pairs, evaluated in this order.
    pattern_checks = (
        (r'(click|tap|press).*(link|here|button)', "Suspicious call-to-action"),
        (r'(within|in).*(24|48|72).*(hour|hr)', "Time pressure tactics"),
        (r'(suspend|lock|close|terminate).*(account|access)', "Account threat"),
        (r'(confirm|verify|update).*(information|details|data)', "Information request"),
    )
    pattern_hits = [
        label for regex, label in pattern_checks if re.search(regex, lowered)
    ]

    return keyword_hits, pattern_hits
139
 
140
  def extract_urls(message):
141
  """Extract all URLs from the message"""
 
151
  analysis['word_count'] = len(message.split())
152
  analysis['char_count'] = len(message)
153
 
154
+ # Language detection
155
+ analysis['language'] = detect_language(message)
156
+ analysis['language_switching'], analysis['detected_languages'] = detect_language_switching(message)
157
+
158
  # Extract URLs
159
  analysis['urls'] = extract_urls(message)
160
  analysis['has_urls'] = len(analysis['urls']) > 0
 
169
  found_keywords = [kw for kw in SPAM_KEYWORDS if kw in message_lower]
170
  analysis['spam_keywords'] = found_keywords
171
 
172
+ # Credential phishing check
173
+ analysis['credential_keywords'], analysis['phishing_patterns'] = check_credential_phishing(message)
174
+
175
  return analysis
176
 
177
  def highlight_spam_words(message, keywords):
 
182
  highlighted = pattern.sub(f'<mark style="background-color: #ffcccc; padding: 2px 4px; border-radius: 3px;">{kw}</mark>', highlighted)
183
  return highlighted
184
 
185
def generate_security_tips(analysis, is_spam):
    """Build the list of security tips shown for an analysed email.

    Args:
        analysis: Dict of analysis results. The keys read here are
            ``credential_keywords``, ``has_urls``, ``phishing_patterns``,
            ``language_switching`` and ``all_caps_words``; a missing key is
            treated as "nothing found".
        is_spam: Whether the classifier flagged the email as spam.

    Returns:
        A list of tip strings in a fixed order. If no condition applies, the
        list contains a single generic reminder.
    """
    tips = []

    if is_spam:
        tips.append("⚠️ This email has been flagged as spam. Exercise caution.")

    if analysis.get('credential_keywords'):
        tips.extend([
            "🔐 Never share passwords or credentials via email.",
            "🛡️ Legitimate companies won't ask for sensitive info via email.",
        ])

    if analysis.get('has_urls'):
        tips.extend([
            "🔗 Hover over links before clicking to verify destination.",
            "🌐 Check if URL matches the official company website.",
        ])

    if analysis.get('phishing_patterns'):
        tips.extend([
            "⏰ Be suspicious of emails creating artificial urgency.",
            "📞 Contact the company directly using official contact info.",
        ])

    if analysis.get('language_switching'):
        tips.append("🌍 Language switching is a common tactic in international scams.")

    # Only flag capitalization when more than three all-caps words were counted.
    if analysis.get('all_caps_words', 0) > 3:
        tips.append("📢 Excessive capitalization is often used to create panic.")

    if not tips:
        tips.append("✅ Stay vigilant with all emails requesting action or information.")

    return tips
214
+
215
  def classify_email(message):
216
  if not message.strip():
217
+ return "<div style='color:gray;'>Empty message</div>", "", "", "", ""
218
 
219
  try:
220
  # Get analysis
 
225
  vec = vectorizer.transform([cleaned])
226
  pred = model.predict(vec)[0]
227
 
228
+ is_spam = pred == 1
229
+ result_type = "Spam" if is_spam else "Not Spam"
230
 
231
  # Result card
232
+ if is_spam:
233
  result_html = """
234
  <div style='border:2px solid #ff4d4d; border-radius:10px; background-color:#ffe6e6;
235
  padding:15px; font-size:18px; font-weight:bold; text-align:center;'>
 
244
  </div>
245
  """
246
 
247
+ # Language info
248
+ lang_warning = ""
249
+ if analysis['language_switching']:
250
+ langs = ', '.join([LANGUAGE_NAMES.get(l, l) for l in analysis['detected_languages']])
251
+ lang_warning = f"<tr style='background-color:#fff3cd;'><td style='padding:5px;'><b>⚠️ Language Switching:</b></td><td>Yes ({langs})</td></tr>"
252
+
253
  # Analysis details
254
  details_html = f"""
255
  <div style='background-color:#f8f9fa; padding:15px; border-radius:8px; margin-top:10px;'>
256
  <h3 style='margin-top:0; color:#333;'>πŸ“Š Email Analysis</h3>
257
  <table style='width:100%; border-collapse: collapse;'>
258
+ <tr><td style='padding:5px;'><b>Detected Language:</b></td><td>{analysis['language']}</td></tr>
259
+ {lang_warning}
260
  <tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
261
  <tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
262
  <tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'⚠️ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else 'βœ“ No'}</td></tr>
 
267
  </div>
268
  """
269
 
270
+ # Credential phishing warning
271
+ if analysis['credential_keywords'] or analysis['phishing_patterns']:
272
+ credential_html = f"""
273
+ <div style='background-color:#ffebee; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #d32f2f;'>
274
+ <h3 style='margin-top:0; color:#d32f2f;'>πŸ” Credential Phishing Alert!</h3>
275
+ {f"<p style='margin:5px 0;'><b>Suspicious Keywords:</b> {', '.join(analysis['credential_keywords'])}</p>" if analysis['credential_keywords'] else ""}
276
+ {f"<p style='margin:5px 0;'><b>Phishing Patterns:</b> {', '.join(analysis['phishing_patterns'])}</p>" if analysis['phishing_patterns'] else ""}
277
+ <p style='margin:10px 0 0 0; padding:10px; background-color:#fff; border-radius:5px;'>
278
+ <b>⚠️ Warning:</b> This email appears to be attempting to steal your credentials or personal information.
279
+ </p>
280
+ </div>
281
+ """
282
+ else:
283
+ credential_html = ""
284
+
285
  # URLs detected
286
  if analysis['urls']:
287
  urls_html = f"""
 
290
  <div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
291
  {'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
292
  </div>
293
+ <p style='margin:10px 0 0 0; font-size:13px; color:#666;'>
294
+ πŸ’‘ Tip: Always verify URLs before clicking. Hover to see the actual destination.
295
+ </p>
296
  </div>
297
  """
298
  else:
 
312
  else:
313
  keywords_html = ""
314
 
315
+ # Security tips
316
+ tips = generate_security_tips(analysis, is_spam)
317
+ tips_html = f"""
318
+ <div style='background-color:#e8f5e9; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #4caf50;'>
319
+ <h3 style='margin-top:0; color:#2e7d32;'>πŸ›‘οΈ Security Tips</h3>
320
+ <ul style='margin:5px 0; padding-left:20px;'>
321
+ {''.join(['<li style="margin:5px 0;">' + tip + '</li>' for tip in tips])}
322
+ </ul>
323
+ </div>
324
+ """
325
+
326
+ return result_html, details_html, credential_html, urls_html, keywords_html, tips_html
327
 
328
  except Exception as e:
329
  print(f"Prediction error: {e}")
330
+ return "<div style='color:gray;'>Error during classification</div>", "", "", "", "", ""
 
 
 
 
331
 
332
  def process_bulk_emails(file):
333
  """Process bulk emails from file"""
 
357
  vec = vectorizer.transform([cleaned])
358
  pred = model.predict(vec)[0]
359
 
360
+ # Additional analysis
361
+ analysis = analyze_email(str(email))
362
+
363
  results.append({
364
  'Email': str(email)[:100] + '...' if len(str(email)) > 100 else str(email),
365
+ 'Classification': 'Spam' if pred == 1 else 'Not Spam',
366
+ 'Language': analysis['language'],
367
+ 'Has_URLs': 'Yes' if analysis['has_urls'] else 'No',
368
+ 'Credential_Risk': 'High' if analysis['credential_keywords'] else 'Low'
369
  })
370
 
371
  results_df = pd.DataFrame(results)
 
374
  output_path = "spam_classification_results.csv"
375
  results_df.to_csv(output_path, index=False)
376
 
377
+ spam_count = len([r for r in results if r['Classification'] == 'Spam'])
378
+ credential_risks = len([r for r in results if r['Credential_Risk'] == 'High'])
379
+
380
  summary = f"βœ… Processed {len(results)} emails\n"
381
+ summary += f"πŸ”΄ Spam: {spam_count}\n"
382
+ summary += f"🟒 Not Spam: {len(results) - spam_count}\n"
383
+ summary += f"πŸ” Credential Phishing Risk: {credential_risks}"
384
 
385
  return summary, output_path
386
 
387
  except Exception as e:
388
  return f"Error processing file: {str(e)}", None
389
 
390
# Sample messages passed to gr.Examples; each entry of `examples` is a
# one-element list wrapping a single message.
_EXAMPLE_MESSAGES = (
    "Congratulations! You've won a $1000 gift card. Click here to claim your prize now!",
    "URGENT: Your account has been compromised. Verify your identity now by clicking this link and entering your password.",
    "Hi mom, I'll be home for dinner tonight. See you around 7pm.",
    "Hello team, the project report is attached. Please review before tomorrow's meeting.",
    "Hey John, are we still on for lunch tomorrow? Let me know!",
    "FINAL NOTICE: Your payment is overdue. Click here within 24 hours to update your billing information or your account will be suspended.",
    "Dear customer, we detected unusual activity on your account. Please verify your login credentials immediately.",
    "You have been selected for our exclusive offer! Limited time only - act now to receive FREE cash bonus!",
    "Meeting reminder: Don't forget about our team sync at 3pm today. Conference room B.",
    "Your package delivery requires signature. Track your shipment here: http://suspicious-tracking-link.com",
)
examples = [[message] for message in _EXAMPLE_MESSAGES]
403
 
404
  # Custom CSS
 
415
  # Gradio interface
416
  with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Classifier") as demo:
417
  gr.Markdown("# πŸ“§ Enhanced Email Spam Classifier")
418
+ gr.Markdown("*Advanced spam detection with multi-language support and credential phishing detection*")
419
 
420
  with gr.Tabs():
421
  # Single Email Tab
 
435
  output_label = gr.HTML(label="πŸ“Š Result")
436
 
437
  analysis_output = gr.HTML(label="πŸ“‹ Analysis Details")
438
+ credential_output = gr.HTML(label="πŸ” Credential Phishing Check")
439
  urls_output = gr.HTML(label="πŸ”— URLs Found")
440
  keywords_output = gr.HTML(label="πŸ”Ž Keyword Highlights")
441
+ tips_output = gr.HTML(label="πŸ›‘οΈ Security Tips")
442
 
443
  gr.Examples(
444
  examples=examples,
445
  inputs=input_text,
446
+ outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output],
447
  fn=classify_email
448
  )
449
 
450
  submit_btn.click(
451
  fn=classify_email,
452
  inputs=input_text,
453
+ outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output]
454
  )
455
  input_text.submit(
456
  fn=classify_email,
457
  inputs=input_text,
458
+ outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output]
459
  )
460
 
461
  # Bulk Processing Tab
462
  with gr.Tab("πŸ“¦ Bulk Processing"):
463
  gr.Markdown("### Upload a CSV or TXT file with emails (one per line)")
464
+ gr.Markdown("*Results will include spam classification, language detection, and credential phishing risk*")
465
  with gr.Row():
466
  with gr.Column():
467
  file_input = gr.File(label="πŸ“ Upload File", file_types=[".csv", ".txt"])
468
  bulk_btn = gr.Button("πŸš€ Process Bulk Emails", variant="primary")
469
  with gr.Column():
470
+ bulk_output = gr.Textbox(label="πŸ“Š Processing Summary", lines=6)
471
  download_output = gr.File(label="⬇️ Download Results")
472
 
473
  bulk_btn.click(