Update src/streamlit_app.py

src/streamlit_app.py  CHANGED  +282 -59
@@ -1,5 +1,6 @@
 """
-Review Validator -
+Review Validator - Advanced Edition
+With explainability graphs + PDF report download
 """
 
 import os
@@ -13,9 +14,20 @@ import matplotlib
 matplotlib.use('Agg')  # use non-GUI backend for Streamlit
 import matplotlib.pyplot as plt
 import requests
-import urllib.parse
 import math
 import warnings
+import re
+from collections import Counter
+from datetime import datetime
+import textwrap
+
+# Try to import ReportLab for PDF generation
+try:
+    from reportlab.lib.pagesizes import A4
+    from reportlab.pdfgen import canvas
+    HAVE_REPORTLAB = True
+except ImportError:
+    HAVE_REPORTLAB = False
 
 # --- Setup: Silence the technical noise ---
 warnings.filterwarnings("ignore")
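
The new try/except import is the usual optional-dependency gate: the app records a single HAVE_REPORTLAB flag at import time and branches on it later, rather than crashing at startup when the library is absent. A minimal sketch of the same pattern in isolation (the export_pdf helper name is illustrative, not from this commit):

    # Degrade gracefully when an optional library is missing.
    try:
        from reportlab.pdfgen import canvas  # optional dependency
        HAVE_REPORTLAB = True
    except ImportError:
        HAVE_REPORTLAB = False

    def export_pdf(data):  # illustrative helper, not in this diff
        if not HAVE_REPORTLAB:
            raise RuntimeError("Install reportlab to enable PDF export.")
        # ... build the PDF with canvas ...
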
@@ -30,7 +42,7 @@ st.set_page_config(
 )
 
 # ==========================================
-# 🧠
+# 🧠 MODELS (Better public detectors)
 # ==========================================
 
 # 1. Text AI Detector: ModernBERT-based detector (0 = human, 1 = AI)
@@ -54,9 +66,9 @@ MODEL_CAPTION = "Salesforce/blip-image-captioning-base"
 def get_token():
     """
     Safely retrieves HF_TOKEN.
-    Priority 1:
+    Priority 1: Env var (Spaces)
     Priority 2: Streamlit Secrets (Local)
-    Optional
+    Optional - app still runs if missing.
     """
     token = os.environ.get("HF_TOKEN")
     if token:
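
Only the docstring changes here, but it now pins down the lookup order the helper implements. A sketch of a body consistent with that order (the st.secrets branch is an assumption, since the rest of the function sits outside this hunk):

    def get_token():
        token = os.environ.get("HF_TOKEN")   # Priority 1: env var (Spaces)
        if token:
            return token
        try:
            return st.secrets["HF_TOKEN"]    # Priority 2: Streamlit secrets (local)
        except Exception:
            return None                      # Optional: app still runs without it
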
@@ -164,7 +176,6 @@ def load_ai_squad():
     squad = {}
     errors = []
 
-    # token is optional for public models - only pass if available
     token_arg = {"token": HF_TOKEN} if HF_TOKEN else {}
 
     # TEXT MODELS
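
token_arg is built once and splatted into each loader call, so the token keyword is only passed when a token actually exists. In isolation (the pipeline call and model id are assumptions; the diff only shows the **token_arg splat):

    from transformers import pipeline  # assumed loader behind the squad
    token_arg = {"token": HF_TOKEN} if HF_TOKEN else {}
    # With a token: pipeline(..., token=HF_TOKEN); without one: a plain call.
    detector = pipeline("text-classification", model="some/model-id", **token_arg)
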
@@ -213,20 +224,113 @@ def load_ai_squad():
             **token_arg
         )
     except Exception as e:
-        # Caption is optional - not critical
         errors.append(f"Caption model: {e}")
 
-    # If literally nothing loaded, return None so main() can show a clean error
     if not squad:
         return None, "No models could be loaded. Check internet / HF token / requirements."
 
-    # If some models failed, join messages (optional debug info)
     err_msg = "\n".join(errors) if errors else None
     return squad, err_msg
 
+# --- Utility: Basic text stats for explainability ---
+STOPWORDS = set([
+    "the","a","an","is","are","am","and","or","in","on","at","of","to","for",
+    "this","that","it","was","with","as","by","be","from","has","have","had",
+    "i","you","we","they","he","she","my","our","their","your"
+])
+
+def split_sentences(text: str):
+    # simple sentence splitter
+    parts = re.split(r'[.!?]+', text)
+    return [s.strip() for s in parts if s.strip()]
+
+def tokenize_words(text: str):
+    tokens = re.findall(r"[A-Za-z']+", text.lower())
+    return tokens
+
+def analyze_text_structure(text: str):
+    sentences = split_sentences(text)
+    words = tokenize_words(text)
+
+    num_sentences = max(len(sentences), 1)
+    num_words = len(words)
+
+    sent_lengths = [len(tokenize_words(s)) for s in sentences] or [0]
+    avg_sent_len = sum(sent_lengths) / len(sent_lengths)
+    var_sent_len = float(np.var(sent_lengths)) if len(sent_lengths) > 1 else 0.0
+
+    # vocabulary diversity
+    vocab = set(w for w in words if w not in STOPWORDS)
+    vocab_size = len(vocab)
+    ttr = (vocab_size / num_words) if num_words > 0 else 0.0  # type-token ratio
+
+    # top words
+    filtered = [w for w in words if w not in STOPWORDS]
+    counter = Counter(filtered)
+    top_words = counter.most_common(10)
+
+    return {
+        "num_sentences": num_sentences,
+        "num_words": num_words,
+        "avg_sentence_len": avg_sent_len,
+        "var_sentence_len": var_sent_len,
+        "ttr": ttr,
+        "top_words": top_words,
+        "sentence_lengths": sent_lengths,
+    }
+
+def explain_text(res, stats, strict_mode: bool):
+    """
+    Heuristic explanation based on AI score + grammar + structure.
+    Returns a list of bullet strings.
+    """
+    bot = res["bot_score"]
+    gram = res["grammar_score"]
+    mood = res["mood_label"]
+    avg_len = stats["avg_sentence_len"]
+    var_len = stats["var_sentence_len"]
+    ttr = stats["ttr"]
+
+    reasons = []
+
+    # AI-likeness
+    if bot >= 85:
+        reasons.append("High AI-likeness score - model strongly associates this style with AI text.")
+    elif bot >= 65:
+        reasons.append("Moderate AI-likeness score - some patterns resemble AI-generated writing.")
+    else:
+        reasons.append("Low AI-likeness score - style leans closer to typical human-written reviews.")
+
+    # Grammar
+    if gram >= 85 and bot >= 70:
+        reasons.append("Grammar is near-perfect and very consistent, which is common in AI text.")
+    elif gram >= 85 and bot < 50:
+        reasons.append("Grammar is very clean but the AI score is low - this could be a careful human reviewer.")
+    elif gram < 60:
+        reasons.append("Grammar has noticeable imperfections, more typical of casual human writing.")
+
+    # Sentence structure
+    if var_len < 5 and avg_len > 12 and bot >= 70:
+        reasons.append("Sentence length is very uniform and long, which often appears in AI outputs.")
+    elif var_len > 15:
+        reasons.append("Sentence length varies a lot, which is more natural for human writing.")
+
+    # Vocabulary diversity
+    if ttr < 0.3 and bot >= 70:
+        reasons.append("Vocabulary diversity is low despite longer text, hinting at a templated or generated style.")
+    elif ttr > 0.45:
+        reasons.append("Vocabulary diversity is relatively high, which often indicates a human author.")
+
+    # Mood-based explanation
+    reasons.append(f"Overall sentiment detected: **{mood}**.")
+
+    if strict_mode:
+        reasons.append("Strict mode: thresholds are higher, so AI flags are more conservative but precise.")
+
+    return reasons
+
 # --- Logic: Analyze Text ---
 def check_text(text, squad):
-    # If fake detector missing, no point pretending
     if 'fake' not in squad:
         return {
             "bot_score": 0,
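
To make the new stats concrete, a quick usage sketch of analyze_text_structure (the sample review is invented here):

    sample = "Great phone. Battery lasts two days! Camera is sharp."
    stats = analyze_text_structure(sample)
    # 3 sentences, 9 word tokens; "is" is the only stopword, so the
    # type-token ratio is 8 unique content words / 9 tokens ≈ 0.89
    print(stats["num_sentences"], stats["avg_sentence_len"], round(stats["ttr"], 2))
    # -> 3 3.0 0.89
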
@@ -237,15 +341,14 @@ def check_text(text, squad):
             "error_msg": "AI text detector not loaded."
         }
 
-    # 1. Bot / AI Check
-    res_fake = squad['fake'](text[:512])[0]
+    # 1. Bot / AI Check
+    res_fake = squad['fake'](text[:512])[0]
     raw_label = res_fake.get('label', '1')
     raw_score = float(res_fake.get('score', 0.5))
 
     try:
         label_id = int(raw_label)
     except ValueError:
-        # in case model changes label format later
         label_id = 1 if "1" in str(raw_label) else 0
 
     if label_id == 1:
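
The parsing here assumes the detector emits {'label': '0'|'1', 'score': p} with 1 meaning AI, and the ValueError fallback catches string labels such as 'LABEL_1'. Traced by hand (the example output is invented; the ai_prob branch bodies sit just below this hunk):

    res_fake = {"label": "LABEL_1", "score": 0.92}
    try:
        label_id = int(res_fake["label"])                  # fails: not a bare digit
    except ValueError:
        label_id = 1 if "1" in res_fake["label"] else 0    # -> 1
    # label_id == 1 -> ai_prob = 0.92 -> bot_score = 92.0
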
@@ -255,7 +358,7 @@ def check_text(text, squad):
 
     bot_score = ai_prob * 100.0
 
-    # 2. Mood
+    # 2. Mood
     mood_label = "Unknown"
     if 'mood' in squad:
         try:
@@ -264,7 +367,7 @@ def check_text(text, squad):
         except Exception:
             mood_label = "Unknown"
 
-    # 3. Grammar
+    # 3. Grammar (CoLA)
     grammar_score = 50.0
     if 'grammar' in squad:
         try:
@@ -286,14 +389,9 @@ def check_text(text, squad):
 
 # --- Logic: Analyze Image ---
 def check_image(img, squad):
-    """
-    Returns AI probability and debug scores.
-    Uses a single main detector to avoid conflicts.
-    """
     caption_text = "Caption unavailable"
     ai_chance = 0.0
 
-    # 1. Image AI Detector
     if 'img_main' in squad:
         try:
             preds = squad['img_main'](img)
@@ -307,14 +405,12 @@ def check_image(img, squad):
             elif "real" in label:
                 ai_prob = 1 - score
             else:
-                # unknown label - assume score ~ AI probability
                 ai_prob = score
 
             ai_chance = ai_prob * 100.0
         except Exception:
             ai_chance = 0.0
 
-    # 2. Captioning (optional, does not affect AI score)
     if 'caption' in squad:
         try:
             cap_res = squad['caption'](img)
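
check_image normalizes the detector's label vocabulary by keyword: a "real" label inverts the confidence, and unknown labels are taken at face value. A compact sketch of the mapping (the AI-side keywords in the first branch sit above this hunk, so they are assumed here):

    def normalize_ai_prob(label: str, score: float) -> float:
        label = label.lower()
        if "artificial" in label or "fake" in label:   # assumed AI-side keywords
            return score           # score already means "probability of AI"
        if "real" in label:
            return 1 - score       # confidence in "real" -> AI probability
        return score               # unknown label: assume score ~ AI probability
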
@@ -325,17 +421,13 @@ def check_image(img, squad):
 
     return {
         "ai_chance": ai_chance,
-        "match": 1.0,
+        "match": 1.0,
         "score_a": ai_chance,
         "score_b": ai_chance,
         "caption": caption_text
     }
 
 def get_image_from_url(url):
-    """
-    Safely fetch image from URL.
-    If anything goes wrong, return None instead of crashing (prevents 402-style front-end errors).
-    """
     try:
         headers = {
             'User-Agent': (
@@ -352,12 +444,12 @@ def get_image_from_url(url):
         return None
 
 # --- Plotting ---
+
 def breakdown_chart(stats):
-    """Horizontal bar chart for text analysis breakdown"""
     labels = ['AI-Likeness', 'Grammar Quality']
     values = [stats['bot_score'], stats['grammar_score']]
 
-    fig, ax = plt.subplots(figsize=(
+    fig, ax = plt.subplots(figsize=(4, 2))
     y_pos = np.arange(len(labels))
 
     ax.barh(y_pos, values, align='center', height=0.6)
@@ -375,6 +467,103 @@ def breakdown_chart(stats):
     plt.tight_layout()
     return fig
 
+def sentence_length_chart(stats):
+    lens = stats["sentence_lengths"]
+    fig, ax = plt.subplots(figsize=(4, 2))
+    ax.hist(lens, bins=min(len(lens), 8) or 1, edgecolor='black')
+    ax.set_xlabel("Sentence length (words)")
+    ax.set_ylabel("Count")
+    ax.set_title("Sentence Length Distribution")
+    plt.tight_layout()
+    return fig
+
+def word_freq_chart(stats):
+    top_words = stats["top_words"]
+    if not top_words:
+        fig, ax = plt.subplots(figsize=(4, 2))
+        ax.text(0.5, 0.5, "Not enough text", ha='center', va='center')
+        ax.axis('off')
+        return fig
+
+    words, freqs = zip(*top_words)
+    fig, ax = plt.subplots(figsize=(4, 2))
+    x = np.arange(len(words))
+    ax.bar(x, freqs)
+    ax.set_xticks(x)
+    ax.set_xticklabels(words, rotation=45, ha='right')
+    ax.set_ylabel("Frequency")
+    ax.set_title("Top Words (excluding stopwords)")
+    plt.tight_layout()
+    return fig
+
+# --- PDF REPORT GENERATION ---
+
+def generate_pdf_report(platform, review_text, text_res, text_stats, image_info):
+    """
+    Returns PDF bytes. Requires ReportLab.
+    image_info: dict or None
+    """
+    buffer = io.BytesIO()
+    c = canvas.Canvas(buffer, pagesize=A4)
+    width, height = A4
+    y = height - 50
+
+    def write_line(text, font="Helvetica", size=10, leading=14):
+        nonlocal y
+        c.setFont(font, size)
+        wrapped = textwrap.wrap(text, width=90)
+        for line in wrapped:
+            if y < 50:
+                c.showPage()
+                y = height - 50
+                c.setFont(font, size)
+            c.drawString(50, y, line)
+            y -= leading
+
+    # Header
+    c.setFont("Helvetica-Bold", 16)
+    c.drawString(50, y, "Review Validator Report")
+    y -= 25
+    c.setFont("Helvetica", 10)
+    c.drawString(50, y, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    y -= 15
+    c.drawString(50, y, f"Platform: {platform}")
+    y -= 25
+
+    # Scores
+    write_line("=== Text Analysis ===", font="Helvetica-Bold", size=12)
+    write_line(f"AI-Likeness Score: {text_res['bot_score']:.1f}%")
+    write_line(f"Grammar Quality: {text_res['grammar_score']:.1f}%")
+    write_line(f"Sentiment: {text_res['mood_label']}")
+    y -= 10
+
+    # Structure stats
+    write_line("Text Structure:", font="Helvetica-Bold", size=11)
+    write_line(f"- Sentences: {text_stats['num_sentences']}")
+    write_line(f"- Words: {text_stats['num_words']}")
+    write_line(f"- Average sentence length: {text_stats['avg_sentence_len']:.1f} words")
+    write_line(f"- Sentence length variance: {text_stats['var_sentence_len']:.1f}")
+    write_line(f"- Vocabulary diversity (TTR): {text_stats['ttr']:.2f}")
+    y -= 10
+
+    # Review text
+    write_line("Original Review:", font="Helvetica-Bold", size=11)
+    write_line(review_text or "[empty review]")
+    y -= 10
+
+    # Image analysis
+    if image_info is not None:
+        write_line("=== Image Analysis ===", font="Helvetica-Bold", size=12)
+        write_line(f"AI Probability: {image_info['ai_chance']:.1f}%")
+        write_line(f"Caption (approx): {image_info['caption']}")
+        y -= 10
+
+    c.showPage()
+    c.save()
+    pdf_bytes = buffer.getvalue()
+    buffer.close()
+    return pdf_bytes
+
 # --- PAGES ---
 
 def landing_page():
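
Because generate_pdf_report returns raw bytes, the result can feed st.download_button directly or be written straight to disk. A usage sketch with made-up inputs (requires reportlab installed, i.e. HAVE_REPORTLAB is True):

    pdf_bytes = generate_pdf_report(
        "Amazon",                                   # platform (example value)
        "Great phone, fast delivery.",              # review_text
        {"bot_score": 42.0, "grammar_score": 88.0, "mood_label": "Positive"},
        analyze_text_structure("Great phone, fast delivery."),
        None,                                       # image_info: no image analyzed
    )
    with open("review_validator_report.pdf", "wb") as f:
        f.write(pdf_bytes)
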
@@ -382,7 +571,7 @@ def landing_page():
    <div class="hero-box">
        <div class="hero-title">🛡️ Review Validator</div>
        <div class="hero-subtitle">
-
+            Advanced AI-powered review and image analysis with graphs, explainability, and exportable reports.
        </div>
    </div>
    """, unsafe_allow_html=True)
@@ -392,24 +581,24 @@ def landing_page():
         st.markdown("""
         <div class="feature-card">
             <span class="emoji-icon">🤖</span>
-            <h3>AI Text
-            <p>
+            <h3>AI Text Detector</h3>
+            <p>Modern models estimate whether a review looks AI-generated or human-written.</p>
         </div>
         """, unsafe_allow_html=True)
     with c2:
         st.markdown("""
         <div class="feature-card">
             <span class="emoji-icon">📸</span>
-            <h3>
-            <p>
+            <h3>Image Authenticity</h3>
+            <p>Checks if product photos look real or AI-generated, with approximate captions.</p>
         </div>
         """, unsafe_allow_html=True)
     with c3:
         st.markdown("""
         <div class="feature-card">
-            <span class="emoji-icon"
-            <h3>
-            <p>
+            <span class="emoji-icon">📊</span>
+            <h3>Explainable Reports</h3>
+            <p>Graphs, breakdowns, explanations, and PDF report downloads for sharing.</p>
         </div>
         """, unsafe_allow_html=True)
 
@@ -465,12 +654,13 @@ def detector_page(squad, warnings_text=None):
         if txt_input.strip():
             with st.spinner("Analyzing text..."):
                 res = check_text(txt_input, squad)
-
+                stats = analyze_text_structure(txt_input)
+                st.session_state['text_res'] = (res, stats, strict_mode, platform, txt_input)
         else:
             st.warning("Please paste a review first.")
 
     if 'text_res' in st.session_state:
-        res,
+        res, stats, strict_mode_saved, platform_saved, review_text_saved = st.session_state['text_res']
 
         if res.get("error"):
             st.error(res.get("error_msg", "Text models failed to load."))
@@ -481,8 +671,8 @@ def detector_page(squad, warnings_text=None):
         grammar_score = res['grammar_score']
         mood_label = res['mood_label']
 
-        # Thresholds
-        if
+        # Thresholds
+        if strict_mode_saved:
             t_high = 90
             t_mid = 70
         else:
@@ -524,25 +714,57 @@ def detector_page(squad, warnings_text=None):
         )
 
         st.write("")
-
-        with
-            st.markdown("#### 📊
+        g1, g2, g3 = st.columns(3)
+        with g1:
+            st.markdown("#### 📊 Scores")
             fig = breakdown_chart(res)
-            st.pyplot(fig)
-        with
-            st.markdown("####
-
-
-
-
-
-
+            st.pyplot(fig, use_container_width=True)
+        with g2:
+            st.markdown("#### 📈 Sentence Lengths")
+            fig2 = sentence_length_chart(stats)
+            st.pyplot(fig2, use_container_width=True)
+        with g3:
+            st.markdown("#### 🔤 Top Words")
+            fig3 = word_freq_chart(stats)
+            st.pyplot(fig3, use_container_width=True)
+
+        st.markdown("#### 💡 Verdict & Explanation")
+        if verdict_type == "error":
+            st.error(verdict_text)
+        elif verdict_type == "warning":
+            st.warning(verdict_text)
+        else:
+            st.success(verdict_text)
+
+        reasons = explain_text(res, stats, strict_mode_saved)
+        for r in reasons:
+            st.markdown(f"- {r}")
 
-
-
-
-
+        st.markdown(
+            "<small>Note: These scores and explanations are signals, not absolute proof. "
+            "Always combine them with your own judgement.</small>",
+            unsafe_allow_html=True
+        )
+
+        # PDF report button
+        st.write("")
+        if HAVE_REPORTLAB:
+            img_info_for_pdf = st.session_state.get("img_res_for_pdf", None)
+            pdf_bytes = generate_pdf_report(
+                platform_saved,
+                review_text_saved,
+                res,
+                stats,
+                img_info_for_pdf
             )
+            st.download_button(
+                "📄 Download PDF Report",
+                data=pdf_bytes,
+                file_name="review_validator_report.pdf",
+                mime="application/pdf",
+            )
+        else:
+            st.info("PDF report requires reportlab. Add `reportlab` to requirements.txt to enable export.")
 
     # --- IMAGE TAB ---
     with tab2:
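
Storing the whole (res, stats, strict_mode, platform, txt_input) tuple in st.session_state is what keeps the verdict, charts, and PDF button alive across Streamlit's script rerun on every widget interaction. The pattern, stripped to its core:

    if st.button("Analyze"):  # compute once, on the click's rerun
        st.session_state["text_res"] = (
            check_text(txt_input, squad),
            analyze_text_structure(txt_input),
            strict_mode, platform, txt_input,
        )

    if "text_res" in st.session_state:  # render from state on every rerun
        res, stats, strict_saved, platform_saved, text_saved = st.session_state["text_res"]
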
@@ -586,6 +808,8 @@ def detector_page(squad, warnings_text=None):
                 data = check_image(target_img, squad)
                 st.session_state['img_res'] = (data, strict_img)
                 st.session_state['current_img'] = target_img
+                # store a simplified version for PDF report
+                st.session_state['img_res_for_pdf'] = data
 
         with col_view:
             if 'current_img' in st.session_state:
@@ -647,8 +871,7 @@ def main():
 
     warnings_text = None
     if err:
-
-        warnings_text = "Some features may be limited:\n" + err.replace("\n", "<br>")
+        warnings_text = "Some features may be limited:<br>" + err.replace("\n", "<br>")
 
     if st.session_state['page'] == 'landing':
         landing_page()