Spaces:

Vedag812
/

email_spam

Sleeping

App Files Files Community

Vedag812 committed on Oct 6

Commit

921fc4a

verified ·

1 Parent(s): 4d007d4

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -24

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ import os
 import pandas as pd
 from collections import Counter
 import io
 # Download NLTK data
 try:
@@ -66,7 +68,74 @@ vectorizer = joblib.load(vectorizer_path)
 # Spam indicators
 SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
-                 'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan']
 def extract_urls(message):
     """Extract all URLs from the message"""
@@ -82,6 +151,10 @@ def analyze_email(message):
     analysis['word_count'] = len(message.split())
     analysis['char_count'] = len(message)
     # Extract URLs
     analysis['urls'] = extract_urls(message)
     analysis['has_urls'] = len(analysis['urls']) > 0
@@ -96,6 +169,9 @@ def analyze_email(message):
     found_keywords = [kw for kw in SPAM_KEYWORDS if kw in message_lower]
     analysis['spam_keywords'] = found_keywords
     return analysis
 def highlight_spam_words(message, keywords):
@@ -106,9 +182,39 @@ def highlight_spam_words(message, keywords):
         highlighted = pattern.sub(f'<mark style="background-color: #ffcccc; padding: 2px 4px; border-radius: 3px;">{kw}</mark>', highlighted)
     return highlighted
 def classify_email(message):
     if not message.strip():
-        return "<div style='color:gray;'>Empty message</div>", "", "", ""
     try:
         # Get analysis
@@ -119,10 +225,11 @@ def classify_email(message):
         vec = vectorizer.transform([cleaned])
         pred = model.predict(vec)[0]
-        result_type = "Spam" if pred == 1 else "Not Spam"
         # Result card
-        if pred == 1:
             result_html = """
             <div style='border:2px solid #ff4d4d; border-radius:10px; background-color:#ffe6e6;
                         padding:15px; font-size:18px; font-weight:bold; text-align:center;'>
@@ -137,11 +244,19 @@ def classify_email(message):
             </div>
             """
         # Analysis details
         details_html = f"""
         <div style='background-color:#f8f9fa; padding:15px; border-radius:8px; margin-top:10px;'>
             <h3 style='margin-top:0; color:#333;'>📊 Email Analysis</h3>
             <table style='width:100%; border-collapse: collapse;'>
                 <tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
                 <tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
                 <tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'⚠️ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else '✓ No'}</td></tr>
@@ -152,6 +267,21 @@ def classify_email(message):
         </div>
         """
         # URLs detected
         if analysis['urls']:
             urls_html = f"""
@@ -160,6 +290,9 @@ def classify_email(message):
                 <div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
                     {'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
                 </div>
             </div>
             """
         else:
@@ -179,15 +312,22 @@ def classify_email(message):
         else:
             keywords_html = ""
-        return result_html, details_html, urls_html, keywords_html
     except Exception as e:
         print(f"Prediction error: {e}")
-        return "<div style='color:gray;'>Error during classification</div>", "", "", ""
-def get_statistics():
-    """Generate statistics dashboard"""
-    return ""
 def process_bulk_emails(file):
     """Process bulk emails from file"""
@@ -217,9 +357,15 @@ def process_bulk_emails(file):
             vec = vectorizer.transform([cleaned])
             pred = model.predict(vec)[0]
             results.append({
                 'Email': str(email)[:100] + '...' if len(str(email)) > 100 else str(email),
-                'Classification': 'Spam' if pred == 1 else 'Not Spam'
             })
         results_df = pd.DataFrame(results)
@@ -228,22 +374,31 @@ def process_bulk_emails(file):
         output_path = "spam_classification_results.csv"
         results_df.to_csv(output_path, index=False)
         summary = f"✅ Processed {len(results)} emails\n"
-        summary += f"🔴 Spam: {len([r for r in results if r['Classification'] == 'Spam'])}\n"
-        summary += f"🟢 Not Spam: {len([r for r in results if r['Classification'] == 'Not Spam'])}"
         return summary, output_path
     except Exception as e:
         return f"Error processing file: {str(e)}", None
-# Examples
 examples = [
-    ["Congratulations! You've won a $1000 gift card. Click here!"],
-    ["URGENT: Your account has been compromised. Verify your identity now!"],
-    ["Hi mom, I'll be home for dinner tonight."],
-    ["Hello team, the project report is attached."],
-    ["Hey John, are we still on for lunch tomorrow?"]
 ]
 # Custom CSS
@@ -260,7 +415,7 @@ mark {animation: highlight 0.5s ease;}
 # Gradio interface
 with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Classifier") as demo:
     gr.Markdown("# 📧 Enhanced Email Spam Classifier")
-    gr.Markdown("*Advanced spam detection with detailed analysis and bulk processing*")
     with gr.Tabs():
         # Single Email Tab
@@ -280,36 +435,39 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Class
                     output_label = gr.HTML(label="📊 Result")
             analysis_output = gr.HTML(label="📋 Analysis Details")
             urls_output = gr.HTML(label="🔗 URLs Found")
             keywords_output = gr.HTML(label="🔎 Keyword Highlights")
             gr.Examples(
                 examples=examples,
                 inputs=input_text,
-                outputs=[output_label, analysis_output, urls_output, keywords_output],
                 fn=classify_email
             )
             submit_btn.click(
                 fn=classify_email,
                 inputs=input_text,
-                outputs=[output_label, analysis_output, urls_output, keywords_output]
             )
             input_text.submit(
                 fn=classify_email,
                 inputs=input_text,
-                outputs=[output_label, analysis_output, urls_output, keywords_output]
             )
         # Bulk Processing Tab
         with gr.Tab("📦 Bulk Processing"):
             gr.Markdown("### Upload a CSV or TXT file with emails (one per line)")
             with gr.Row():
                 with gr.Column():
                     file_input = gr.File(label="📁 Upload File", file_types=[".csv", ".txt"])
                     bulk_btn = gr.Button("🚀 Process Bulk Emails", variant="primary")
                 with gr.Column():
-                    bulk_output = gr.Textbox(label="📊 Processing Summary", lines=5)
                     download_output = gr.File(label="⬇️ Download Results")
             bulk_btn.click(

 import pandas as pd
 from collections import Counter
 import io
+from langdetect import detect, DetectorFactory
+DetectorFactory.seed = 0
 # Download NLTK data
 try:
 # Spam indicators
 SPAM_KEYWORDS = ['win', 'winner', 'congratulations', 'free', 'urgent', 'click', 'verify',
+                 'account', 'suspended', 'prize', 'lottery', 'cash', 'credit', 'loan',
+                 'limited time', 'act now', 'expire', 'claim', 'bonus']
+# Credential phishing keywords
+CREDENTIAL_KEYWORDS = ['password', 'username', 'login', 'credential', 'signin', 'sign in',
+                       'verify account', 'confirm identity', 'update payment', 'billing information',
+                       'security alert', 'unusual activity', 'locked account', 'reset password']
+# Language code mapping
+LANGUAGE_NAMES = {
+    'en': 'English', 'es': 'Spanish', 'fr': 'French', 'de': 'German', 'it': 'Italian',
+    'pt': 'Portuguese', 'ru': 'Russian', 'zh-cn': 'Chinese', 'ja': 'Japanese', 'ko': 'Korean',
+    'ar': 'Arabic', 'hi': 'Hindi', 'tr': 'Turkish', 'nl': 'Dutch', 'pl': 'Polish'
+}
def detect_language(text):
    """Return a human-readable name for the language of *text*.

    The code reported by ``detect`` is looked up in ``LANGUAGE_NAMES``;
    codes without an entry are returned upper-cased instead.

    Args:
        text: The message body to inspect.

    Returns:
        The language name (or upper-cased code), or ``"Unknown"`` when
        ``detect`` raises for the given input.
    """
    try:
        lang_code = detect(text)
    except Exception:
        # Detection is best-effort, so any library failure maps to "Unknown".
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate as they should.
        return "Unknown"
    return LANGUAGE_NAMES.get(lang_code, lang_code.upper())
def detect_language_switching(text):
    """Report whether *text* mixes several languages (common in scams).

    The text is split on ``.`` and each fragment longer than 10 characters
    (after stripping whitespace) is passed to ``detect``. Fragments for
    which detection fails are ignored.

    Args:
        text: The message body to inspect.

    Returns:
        A ``(switching, languages)`` tuple. ``languages`` lists the distinct
        language codes in the order they were first seen, and ``switching``
        is ``True`` when more than one code was found. Non-string input
        yields ``(False, [])``.
    """
    if not isinstance(text, str):
        return False, []

    languages = []
    for sentence in text.split('.'):
        # Very short fragments give unreliable detections, so skip them.
        if len(sentence.strip()) <= 10:
            continue
        try:
            languages.append(detect(sentence))
        except Exception:
            # Best-effort per fragment: an undetectable fragment must not
            # discard the languages already found in the others.
            continue

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # list shown to the user is stable across runs (set order is not).
    unique_languages = list(dict.fromkeys(languages))
    return len(unique_languages) > 1, unique_languages
# Heuristic phishing phrasings, compiled once and applied to the lower-cased
# message. Trigger words are anchored at a word start (and the hour counts
# isolated from neighbouring digits) so that unrelated text such as
# "impressed ... there" or "meeting ... 2024 ... three" is not flagged.
_PHISHING_PATTERNS = (
    (re.compile(r'\b(?:click|tap|press).*\b(?:link|here|button)'),
     "Suspicious call-to-action"),
    (re.compile(r'\b(?:within|in)\b.*(?<!\d)(?:24|48|72)(?!\d).*(?<![a-z])(?:hour|hr)'),
     "Time pressure tactics"),
    (re.compile(r'\b(?:suspend|lock|close|terminate).*\b(?:account|access)'),
     "Account threat"),
    (re.compile(r'\b(?:confirm|verify|update).*\b(?:information|details|data)'),
     "Information request"),
)


def check_credential_phishing(message):
    """Check whether an email asks for credentials or personal information.

    Args:
        message: The raw email text.

    Returns:
        A ``(keywords, patterns)`` tuple: the entries of
        ``CREDENTIAL_KEYWORDS`` found in the lower-cased message (substring
        match, in list order), and the labels of the phishing phrasings
        from ``_PHISHING_PATTERNS`` that matched.
    """
    message_lower = message.lower()

    found_credential_keywords = [
        keyword for keyword in CREDENTIAL_KEYWORDS if keyword in message_lower
    ]
    phishing_patterns = [
        label
        for pattern, label in _PHISHING_PATTERNS
        if pattern.search(message_lower)
    ]
    return found_credential_keywords, phishing_patterns
 def extract_urls(message):
     """Extract all URLs from the message"""
     analysis['word_count'] = len(message.split())
     analysis['char_count'] = len(message)
+    # Language detection
+    analysis['language'] = detect_language(message)
+    analysis['language_switching'], analysis['detected_languages'] = detect_language_switching(message)
     # Extract URLs
     analysis['urls'] = extract_urls(message)
     analysis['has_urls'] = len(analysis['urls']) > 0
     found_keywords = [kw for kw in SPAM_KEYWORDS if kw in message_lower]
     analysis['spam_keywords'] = found_keywords
+    # Credential phishing check
+    analysis['credential_keywords'], analysis['phishing_patterns'] = check_credential_phishing(message)
     return analysis
 def highlight_spam_words(message, keywords):
         highlighted = pattern.sub(f'<mark style="background-color: #ffcccc; padding: 2px 4px; border-radius: 3px;">{kw}</mark>', highlighted)
     return highlighted
def generate_security_tips(analysis, is_spam):
    """Generate personalized security tips based on an email analysis.

    Args:
        analysis: Mapping of analysis results. The keys read are
            ``credential_keywords``, ``has_urls``, ``phishing_patterns``,
            ``language_switching`` and ``all_caps_words``; a missing key is
            treated as "nothing found" instead of raising ``KeyError``.
        is_spam: Whether the classifier flagged the message as spam.

    Returns:
        A non-empty list of tip strings. When no indicator applies, a
        single generic reminder is returned.
    """
    tips = []

    if is_spam:
        tips.append("⚠️ This email has been flagged as spam. Exercise caution.")

    if analysis.get('credential_keywords'):
        tips.append("🔐 Never share passwords or credentials via email.")
        tips.append("🛡️ Legitimate companies won't ask for sensitive info via email.")

    if analysis.get('has_urls'):
        tips.append("🔗 Hover over links before clicking to verify destination.")
        tips.append("🌐 Check if URL matches the official company website.")

    if analysis.get('phishing_patterns'):
        tips.append("⏰ Be suspicious of emails creating artificial urgency.")
        tips.append("📞 Contact the company directly using official contact info.")

    if analysis.get('language_switching'):
        tips.append("🌍 Language switching is a common tactic in international scams.")

    # ``or 0`` also covers an explicit None stored under the key.
    if (analysis.get('all_caps_words') or 0) > 3:
        tips.append("📢 Excessive capitalization is often used to create panic.")

    if not tips:
        tips.append("✅ Stay vigilant with all emails requesting action or information.")

    return tips
 def classify_email(message):
     if not message.strip():
+        return "<div style='color:gray;'>Empty message</div>", "", "", "", ""
     try:
         # Get analysis
         vec = vectorizer.transform([cleaned])
         pred = model.predict(vec)[0]
+        is_spam = pred == 1
+        result_type = "Spam" if is_spam else "Not Spam"
         # Result card
+        if is_spam:
             result_html = """
             <div style='border:2px solid #ff4d4d; border-radius:10px; background-color:#ffe6e6;
                         padding:15px; font-size:18px; font-weight:bold; text-align:center;'>
             </div>
             """
+        # Language info
+        lang_warning = ""
+        if analysis['language_switching']:
+            langs = ', '.join([LANGUAGE_NAMES.get(l, l) for l in analysis['detected_languages']])
+            lang_warning = f"<tr style='background-color:#fff3cd;'><td style='padding:5px;'><b>⚠️ Language Switching:</b></td><td>Yes ({langs})</td></tr>"
         # Analysis details
         details_html = f"""
         <div style='background-color:#f8f9fa; padding:15px; border-radius:8px; margin-top:10px;'>
             <h3 style='margin-top:0; color:#333;'>📊 Email Analysis</h3>
             <table style='width:100%; border-collapse: collapse;'>
+                <tr><td style='padding:5px;'><b>Detected Language:</b></td><td>{analysis['language']}</td></tr>
+                {lang_warning}
                 <tr><td style='padding:5px;'><b>Word Count:</b></td><td>{analysis['word_count']}</td></tr>
                 <tr><td style='padding:5px;'><b>Character Count:</b></td><td>{analysis['char_count']}</td></tr>
                 <tr><td style='padding:5px;'><b>Contains URLs:</b></td><td>{'⚠️ Yes (' + str(len(analysis['urls'])) + ')' if analysis['has_urls'] else '✓ No'}</td></tr>
         </div>
         """
+        # Credential phishing warning
+        if analysis['credential_keywords'] or analysis['phishing_patterns']:
+            credential_html = f"""
+            <div style='background-color:#ffebee; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #d32f2f;'>
+                <h3 style='margin-top:0; color:#d32f2f;'>🔐 Credential Phishing Alert!</h3>
+                {f"<p style='margin:5px 0;'><b>Suspicious Keywords:</b> {', '.join(analysis['credential_keywords'])}</p>" if analysis['credential_keywords'] else ""}
+                {f"<p style='margin:5px 0;'><b>Phishing Patterns:</b> {', '.join(analysis['phishing_patterns'])}</p>" if analysis['phishing_patterns'] else ""}
+                <p style='margin:10px 0 0 0; padding:10px; background-color:#fff; border-radius:5px;'>
+                    <b>⚠️ Warning:</b> This email appears to be attempting to steal your credentials or personal information.
+                </p>
+            </div>
+            """
+        else:
+            credential_html = ""
         # URLs detected
         if analysis['urls']:
             urls_html = f"""
                 <div style='background-color:white; padding:10px; border-radius:5px; font-size:14px;'>
                     {'<br>'.join(['<a href="' + url + '" target="_blank" style="color:#d32f2f; word-break:break-all;">' + url + '</a>' for url in analysis['urls']])}
                 </div>
+                <p style='margin:10px 0 0 0; font-size:13px; color:#666;'>
+                    💡 Tip: Always verify URLs before clicking. Hover to see the actual destination.
+                </p>
             </div>
             """
         else:
         else:
             keywords_html = ""
+        # Security tips
+        tips = generate_security_tips(analysis, is_spam)
+        tips_html = f"""
+        <div style='background-color:#e8f5e9; padding:15px; border-radius:8px; margin-top:10px; border-left:4px solid #4caf50;'>
+            <h3 style='margin-top:0; color:#2e7d32;'>🛡️ Security Tips</h3>
+            <ul style='margin:5px 0; padding-left:20px;'>
+                {''.join(['<li style="margin:5px 0;">' + tip + '</li>' for tip in tips])}
+            </ul>
+        </div>
+        """
+        return result_html, details_html, credential_html, urls_html, keywords_html, tips_html
     except Exception as e:
         print(f"Prediction error: {e}")
+        return "<div style='color:gray;'>Error during classification</div>", "", "", "", "", ""
 def process_bulk_emails(file):
     """Process bulk emails from file"""
             vec = vectorizer.transform([cleaned])
             pred = model.predict(vec)[0]
+            # Additional analysis
+            analysis = analyze_email(str(email))
             results.append({
                 'Email': str(email)[:100] + '...' if len(str(email)) > 100 else str(email),
+                'Classification': 'Spam' if pred == 1 else 'Not Spam',
+                'Language': analysis['language'],
+                'Has_URLs': 'Yes' if analysis['has_urls'] else 'No',
+                'Credential_Risk': 'High' if analysis['credential_keywords'] else 'Low'
             })
         results_df = pd.DataFrame(results)
         output_path = "spam_classification_results.csv"
         results_df.to_csv(output_path, index=False)
+        spam_count = len([r for r in results if r['Classification'] == 'Spam'])
+        credential_risks = len([r for r in results if r['Credential_Risk'] == 'High'])
         summary = f"✅ Processed {len(results)} emails\n"
+        summary += f"🔴 Spam: {spam_count}\n"
+        summary += f"🟢 Not Spam: {len(results) - spam_count}\n"
+        summary += f"🔐 Credential Phishing Risk: {credential_risks}"
         return summary, output_path
     except Exception as e:
         return f"Error processing file: {str(e)}", None
+# Enhanced examples with more diverse scenarios
 examples = [
+    ["Congratulations! You've won a $1000 gift card. Click here to claim your prize now!"],
+    ["URGENT: Your account has been compromised. Verify your identity now by clicking this link and entering your password."],
+    ["Hi mom, I'll be home for dinner tonight. See you around 7pm."],
+    ["Hello team, the project report is attached. Please review before tomorrow's meeting."],
+    ["Hey John, are we still on for lunch tomorrow? Let me know!"],
+    ["FINAL NOTICE: Your payment is overdue. Click here within 24 hours to update your billing information or your account will be suspended."],
+    ["Dear customer, we detected unusual activity on your account. Please verify your login credentials immediately."],
+    ["You have been selected for our exclusive offer! Limited time only - act now to receive FREE cash bonus!"],
+    ["Meeting reminder: Don't forget about our team sync at 3pm today. Conference room B."],
+    ["Your package delivery requires signature. Track your shipment here: http://suspicious-tracking-link.com"]
 ]
 # Custom CSS
 # Gradio interface
 with gr.Blocks(css=css, theme=gr.themes.Soft(), title="Enhanced Email Spam Classifier") as demo:
     gr.Markdown("# 📧 Enhanced Email Spam Classifier")
+    gr.Markdown("*Advanced spam detection with multi-language support and credential phishing detection*")
     with gr.Tabs():
         # Single Email Tab
                     output_label = gr.HTML(label="📊 Result")
             analysis_output = gr.HTML(label="📋 Analysis Details")
+            credential_output = gr.HTML(label="🔐 Credential Phishing Check")
             urls_output = gr.HTML(label="🔗 URLs Found")
             keywords_output = gr.HTML(label="🔎 Keyword Highlights")
+            tips_output = gr.HTML(label="🛡️ Security Tips")
             gr.Examples(
                 examples=examples,
                 inputs=input_text,
+                outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output],
                 fn=classify_email
             )
             submit_btn.click(
                 fn=classify_email,
                 inputs=input_text,
+                outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output]
             )
             input_text.submit(
                 fn=classify_email,
                 inputs=input_text,
+                outputs=[output_label, analysis_output, credential_output, urls_output, keywords_output, tips_output]
             )
         # Bulk Processing Tab
         with gr.Tab("📦 Bulk Processing"):
             gr.Markdown("### Upload a CSV or TXT file with emails (one per line)")
+            gr.Markdown("*Results will include spam classification, language detection, and credential phishing risk*")
             with gr.Row():
                 with gr.Column():
                     file_input = gr.File(label="📁 Upload File", file_types=[".csv", ".txt"])
                     bulk_btn = gr.Button("🚀 Process Bulk Emails", variant="primary")
                 with gr.Column():
+                    bulk_output = gr.Textbox(label="📊 Processing Summary", lines=6)
                     download_output = gr.File(label="⬇️ Download Results")
             bulk_btn.click(