"""
Review Validator - Advanced Edition
With explainability graphs + PDF report download + Google Reverse Image Search
"""
import os
import io
import time
import base64
import numpy as np
import streamlit as st
from transformers import pipeline, logging as hf_logging
from PIL import Image
import matplotlib
matplotlib.use('Agg') # use non-GUI backend for Streamlit
import matplotlib.pyplot as plt
import requests
import math
import warnings
import re
from collections import Counter
from datetime import datetime
import textwrap
# Try to import ReportLab for PDF generation
try:
    from reportlab.lib.pagesizes import A4
    from reportlab.pdfgen import canvas
    HAVE_REPORTLAB = True
except ImportError:
    HAVE_REPORTLAB = False
# --- Setup: Silence the technical noise ---
warnings.filterwarnings("ignore")
hf_logging.set_verbosity_error()
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
st.set_page_config(
    page_title="Review Validator",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# ==========================================
# 🧠 MODELS (Better public detectors)
# ==========================================
# 1. Text AI Detector: ModernBERT-based detector (0 = human, 1 = AI)
MODEL_FAKE = "AICodexLab/answerdotai-ModernBERT-base-ai-detector"
# 2. Mood Scanner: Sentiment model
MODEL_MOOD = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# 3. Grammar Checker: Acceptability (CoLA)
MODEL_GRAMMAR = "textattack/roberta-base-CoLA"
# 4. Image Detector: Modern real vs fake classifier
MODEL_IMG_MAIN = "prithivMLmods/Mirage-Photo-Classifier"
# 5. Image Captioner (Optional): Describes the image content
MODEL_CAPTION = "Salesforce/blip-image-captioning-base"
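# NOTE (assumption): check_text() below relies on the label convention from the
# detector's model card (label "0" = human, "1" = AI) and converts whatever
# label/score pair the pipeline returns into a single 0-100 AI-likeness score.
# If MODEL_FAKE is swapped for another detector, re-verify that mapping first.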
# ==========================================
# --- Robust Secrets Management (NON-CRASHING) ---
def get_token():
    """
    Safely retrieves HF_TOKEN.
    Priority 1: Env var (Spaces)
    Priority 2: Streamlit Secrets (Local)
    Optional – app still runs if missing.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return token
    try:
        if hasattr(st, "secrets") and "HF_TOKEN" in st.secrets:
            return st.secrets["HF_TOKEN"]
    except Exception:
        pass
    return None

def get_serpapi_key():
    """
    Safely retrieves SERPAPI_KEY.
    Priority 1: Env var
    Priority 2: Streamlit Secrets
    """
    key = os.environ.get("SERPAPI_KEY")
    if key:
        return key
    try:
        if hasattr(st, "secrets") and "SERPAPI_KEY" in st.secrets:
            return st.secrets["SERPAPI_KEY"]
    except Exception:
        pass
    return None
HF_TOKEN = get_token()
SERPAPI_KEY = get_serpapi_key()
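# Both keys are optional by design: HF_TOKEN only helps with gated or
# rate-limited model downloads, and SERPAPI_KEY only gates the reverse
# image search feature.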
# --- Custom CSS ---
def inject_custom_css():
    st.markdown("""
    <style>
    .stApp {
        background-color: #FFFFFF;
        color: #333333;
        font-family: 'Helvetica Neue', sans-serif;
    }
    h1, h2, h3 { color: #2C3E50; }
    h1 { font-weight: 800; }
    h2 { font-weight: 600; }
    .hero-box {
        padding: 40px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 20px;
        color: white;
        text-align: center;
        margin-bottom: 30px;
    }
    .hero-title { font-size: 3rem; font-weight: bold; margin-bottom: 10px; }
    .hero-subtitle { font-size: 1.2rem; opacity: 0.9; }
    .feature-card {
        background: #F8F9FA;
        padding: 20px;
        border-radius: 15px;
        border: 1px solid #EEEEEE;
        text-align: center;
        transition: transform 0.2s, box-shadow 0.2s;
    }
    .feature-card:hover {
        transform: translateY(-5px);
        border-color: #764ba2;
        box-shadow: 0 4px 12px rgba(0,0,0,0.06);
    }
    .emoji-icon { font-size: 3rem; margin-bottom: 10px; display: block; }
    .stat-box {
        text-align: center;
        padding: 15px;
        border-radius: 12px;
        background: white;
        box-shadow: 0 4px 6px rgba(0,0,0,0.05);
        border: 1px solid #EEE;
    }
    .stat-num { font-size: 24px; font-weight: 900; color: #333; }
    .stat-txt { font-size: 12px; text-transform: uppercase; color: #777; letter-spacing: 1px; }
    .analysis-box {
        background: #f0f7ff;
        border-left: 5px solid #4285F4;
        padding: 15px;
        border-radius: 5px;
        margin-top: 15px;
    }
    .warning-box {
        background: #fff6e5;
        border-left: 5px solid #ffb74d;
        padding: 10px 15px;
        border-radius: 5px;
        font-size: 0.85rem;
        margin-top: 8px;
    }
    .reverse-search-box {
        background: #f0fff4;
        border-left: 5px solid #48bb78;
        padding: 15px;
        border-radius: 5px;
        margin-top: 15px;
    }
    .result-item {
        background: white;
        padding: 12px;
        border-radius: 8px;
        margin: 8px 0;
        border: 1px solid #e2e8f0;
        transition: box-shadow 0.2s;
    }
    .result-item:hover {
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    }
    .stButton>button {
        border-radius: 30px;
        font-weight: bold;
        border: none;
        padding: 0.6rem 2.2rem;
        transition: all 0.3s;
    }
    .stButton>button:hover {
        transform: translateY(-1px);
        box-shadow: 0 3px 8px rgba(0,0,0,0.15);
    }
    </style>
    """, unsafe_allow_html=True)
# --- Load Models (Safe Mode, No Hard Crash) ---
@st.cache_resource(show_spinner=False)
def load_ai_squad():
    """
    Load all models. Never hard-crash the app.
    If some models fail, we still return a partial squad.
    """
    squad = {}
    errors = []
    token_arg = {"token": HF_TOKEN} if HF_TOKEN else {}
    # TEXT MODELS
    try:
        squad['fake'] = pipeline(
            "text-classification",
            model=MODEL_FAKE,
            **token_arg
        )
    except Exception as e:
        errors.append(f"Fake detector: {e}")
    try:
        squad['mood'] = pipeline(
            "sentiment-analysis",
            model=MODEL_MOOD,
            tokenizer=MODEL_MOOD,
            **token_arg
        )
    except Exception as e:
        errors.append(f"Mood model: {e}")
    try:
        squad['grammar'] = pipeline(
            "text-classification",
            model=MODEL_GRAMMAR,
            **token_arg
        )
    except Exception as e:
        errors.append(f"Grammar model: {e}")
    # IMAGE MODELS
    try:
        squad['img_main'] = pipeline(
            "image-classification",
            model=MODEL_IMG_MAIN,
            **token_arg
        )
    except Exception as e:
        errors.append(f"Image main model: {e}")
    try:
        squad['caption'] = pipeline(
            "image-to-text",
            model=MODEL_CAPTION,
            **token_arg
        )
    except Exception as e:
        errors.append(f"Caption model: {e}")
    if not squad:
        return None, "No models could be loaded. Check internet / HF token / requirements."
    err_msg = "\n".join(errors) if errors else None
    return squad, err_msg
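# Usage sketch (the app's entry-point routing is not shown in this excerpt;
# names below are illustrative):
#   squad, load_warnings = load_ai_squad()
#   if squad is None:
#       st.error(load_warnings)
#       st.stop()
#   detector_page(squad, warnings_text=load_warnings)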
# --- Utility: Basic text stats for explainability ---
STOPWORDS = {
    "the", "a", "an", "is", "are", "am", "and", "or", "in", "on", "at", "of", "to", "for",
    "this", "that", "it", "was", "with", "as", "by", "be", "from", "has", "have", "had",
    "i", "you", "we", "they", "he", "she", "my", "our", "their", "your",
}

def split_sentences(text: str):
    # simple sentence splitter
    parts = re.split(r'[.!?]+', text)
    return [s.strip() for s in parts if s.strip()]

def tokenize_words(text: str):
    return re.findall(r"[A-Za-z']+", text.lower())
def analyze_text_structure(text: str):
    sentences = split_sentences(text)
    words = tokenize_words(text)
    num_sentences = max(len(sentences), 1)
    num_words = len(words)
    sent_lengths = [len(tokenize_words(s)) for s in sentences] or [0]
    avg_sent_len = sum(sent_lengths) / len(sent_lengths)
    var_sent_len = float(np.var(sent_lengths)) if len(sent_lengths) > 1 else 0.0
    # vocabulary diversity
    vocab = set(w for w in words if w not in STOPWORDS)
    vocab_size = len(vocab)
    ttr = (vocab_size / num_words) if num_words > 0 else 0.0  # type-token ratio
    # top words
    filtered = [w for w in words if w not in STOPWORDS]
    counter = Counter(filtered)
    top_words = counter.most_common(10)
    return {
        "num_sentences": num_sentences,
        "num_words": num_words,
        "avg_sentence_len": avg_sent_len,
        "var_sentence_len": var_sent_len,
        "ttr": ttr,
        "top_words": top_words,
        "sentence_lengths": sent_lengths,
    }
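# Example (illustrative):
#   analyze_text_structure("Great phone. Works well!")
#   -> {'num_sentences': 2, 'num_words': 4, 'avg_sentence_len': 2.0,
#       'var_sentence_len': 0.0, 'ttr': 1.0,
#       'top_words': [('great', 1), ('phone', 1), ('works', 1), ('well', 1)],
#       'sentence_lengths': [2, 2]}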
def explain_text(res, stats, strict_mode: bool):
    """
    Heuristic explanation based on AI score + grammar + structure.
    Returns a list of bullet strings.
    """
    bot = res["bot_score"]
    gram = res["grammar_score"]
    mood = res["mood_label"]
    avg_len = stats["avg_sentence_len"]
    var_len = stats["var_sentence_len"]
    ttr = stats["ttr"]
    reasons = []
    # AI-likeness
    if bot >= 85:
        reasons.append("High AI-likeness score – the model strongly associates this style with AI text.")
    elif bot >= 65:
        reasons.append("Moderate AI-likeness score – some patterns resemble AI-generated writing.")
    else:
        reasons.append("Low AI-likeness score – the style leans closer to typical human-written reviews.")
    # Grammar
    if gram >= 85 and bot >= 70:
        reasons.append("Grammar is near-perfect and very consistent, which is common in AI text.")
    elif gram >= 85 and bot < 50:
        reasons.append("Grammar is very clean but the AI score is low – this could be a careful human reviewer.")
    elif gram < 60:
        reasons.append("Grammar has noticeable imperfections, which is more typical of casual human writing.")
    # Sentence structure
    if var_len < 5 and avg_len > 12 and bot >= 70:
        reasons.append("Sentence length is very uniform and long, a pattern that often appears in AI outputs.")
    elif var_len > 15:
        reasons.append("Sentence length varies a lot, which is more natural for human writing.")
    # Vocabulary diversity
    if ttr < 0.3 and bot >= 70:
        reasons.append("Vocabulary diversity is low despite the longer text, hinting at a templated or generated style.")
    elif ttr > 0.45:
        reasons.append("Vocabulary diversity is relatively high, which often indicates a human author.")
    # Mood-based explanation
    reasons.append(f"Overall sentiment detected: **{mood}**.")
    if strict_mode:
        reasons.append("Strict mode: thresholds are higher, so AI flags are more conservative but more precise.")
    return reasons
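# Example (illustrative): with a low bot_score and strict mode on, the bullets
# might read:
#   ["Low AI-likeness score – the style leans closer to typical human-written reviews.",
#    "Overall sentiment detected: **positive**.",
#    "Strict mode: thresholds are higher, so AI flags are more conservative but more precise."]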
# --- Logic: Analyze Text ---
def check_text(text, squad):
    if 'fake' not in squad:
        return {
            "bot_score": 0,
            "mood_label": "Unavailable",
            "grammar_score": 0,
            "mood_confidence": 0,
            "error": True,
            "error_msg": "AI text detector not loaded."
        }
    # 1. Bot / AI check (detector labels: 0 = human, 1 = AI)
    res_fake = squad['fake'](text[:512])[0]
    raw_label = res_fake.get('label', '1')
    raw_score = float(res_fake.get('score', 0.5))
    try:
        label_id = int(raw_label)
    except ValueError:
        label_id = 1 if "1" in str(raw_label) else 0
    if label_id == 1:
        ai_prob = raw_score
    else:
        ai_prob = 1 - raw_score
    bot_score = ai_prob * 100.0
    # 2. Mood
    mood_label = "Unknown"
    if 'mood' in squad:
        try:
            res_mood = squad['mood'](text[:512])[0]
            mood_label = res_mood.get('label', 'Unknown')
        except Exception:
            mood_label = "Unknown"
    # 3. Grammar (CoLA: LABEL_1 = linguistically acceptable)
    grammar_score = 50.0
    if 'grammar' in squad:
        try:
            res_grammar = squad['grammar'](text[:512])[0]
            glabel = res_grammar.get('label', 'LABEL_0')
            gscore = float(res_grammar.get('score', 0.5))
            grammar_score = (gscore if glabel == 'LABEL_1' else (1 - gscore)) * 100.0
        except Exception:
            grammar_score = 50.0
    return {
        "bot_score": bot_score,
        "mood_label": mood_label,
        "grammar_score": grammar_score,
        "mood_confidence": 0,
        "error": False,
        "error_msg": None
    }
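# Example (illustrative): for a short, genuine-sounding review this might return
#   {"bot_score": 12.3, "mood_label": "positive", "grammar_score": 88.0,
#    "mood_confidence": 0, "error": False, "error_msg": None}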
# --- Logic: Analyze Image ---
def check_image(img, squad):
    caption_text = "Caption unavailable"
    ai_chance = 0.0
    if 'img_main' in squad:
        try:
            preds = squad['img_main'](img)
            if isinstance(preds, list) and preds:
                best = max(preds, key=lambda x: x.get('score', 0))
                label = str(best.get('label', '')).lower()
                score = float(best.get('score', 0.5))
                if "fake" in label or "ai" in label:
                    ai_prob = score
                elif "real" in label:
                    ai_prob = 1 - score
                else:
                    ai_prob = score
                ai_chance = ai_prob * 100.0
        except Exception:
            ai_chance = 0.0
    if 'caption' in squad:
        try:
            cap_res = squad['caption'](img)
            if isinstance(cap_res, list) and cap_res:
                caption_text = cap_res[0].get('generated_text', caption_text)
        except Exception:
            pass
    return {
        "ai_chance": ai_chance,
        "match": 1.0,
        "score_a": ai_chance,
        "score_b": ai_chance,
        "caption": caption_text
    }
def get_image_from_url(url):
    try:
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        r = requests.get(url, headers=headers, timeout=7, stream=True)
        if r.status_code != 200:
            return None
        return Image.open(io.BytesIO(r.content)).convert("RGB")
    except Exception:
        return None
# --- GOOGLE REVERSE IMAGE SEARCH (SerpAPI) ---
def reverse_image_search(image_obj):
    """
    Performs Google reverse image search using SerpAPI.
    Args:
        image_obj: PIL Image object
    Returns:
        dict with 'success', 'results', 'error' keys
    """
    if not SERPAPI_KEY:
        return {
            "success": False,
            "error": "SERPAPI_KEY not configured. Add it to secrets or environment variables.",
            "results": []
        }
    try:
        # Convert PIL Image to base64
        buffered = io.BytesIO()
        image_obj.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        # SerpAPI endpoint for Google Lens (reverse image search)
        url = "https://serpapi.com/search"
        params = {
            "engine": "google_lens",
            "api_key": SERPAPI_KEY,
            "url": f"data:image/jpeg;base64,{img_str}"
        }
        response = requests.get(url, params=params, timeout=15)
        if response.status_code != 200:
            return {
                "success": False,
                "error": f"SerpAPI returned status code {response.status_code}",
                "results": []
            }
        data = response.json()
        # Extract visual matches
        visual_matches = data.get("visual_matches", [])
        results = []
        for match in visual_matches[:10]:  # limit to top 10 results
            results.append({
                "title": match.get("title", "No title"),
                "link": match.get("link", ""),
                "source": match.get("source", "Unknown"),
                "thumbnail": match.get("thumbnail", "")
            })
        return {
            "success": True,
            "results": results,
            "error": None
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Reverse search failed: {str(e)}",
            "results": []
        }
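# NOTE (assumption): SerpAPI's Google Lens engine is documented around publicly
# reachable image URLs, and a multi-megabyte base64 data URI sent as a GET
# parameter may exceed URL length limits or be rejected upstream. If searches
# consistently fail, hosting the image at a temporary public URL and passing
# that instead is the safer route.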
# --- Plotting ---
def breakdown_chart(stats):
    labels = ['AI-Likeness', 'Grammar Quality']
    values = [stats['bot_score'], stats['grammar_score']]
    fig, ax = plt.subplots(figsize=(4, 2))
    y_pos = np.arange(len(labels))
    ax.barh(y_pos, values, align='center', height=0.6)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()
    ax.set_xlabel('Score (0-100)')
    ax.set_xlim(0, 100)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_color('#DDD')
    plt.tight_layout()
    return fig
def sentence_length_chart(stats):
    lens = stats["sentence_lengths"]
    fig, ax = plt.subplots(figsize=(4, 2))
    ax.hist(lens, bins=min(len(lens), 8) or 1, edgecolor='black')
    ax.set_xlabel("Sentence length (words)")
    ax.set_ylabel("Count")
    ax.set_title("Sentence Length Distribution")
    plt.tight_layout()
    return fig
def word_freq_chart(stats):
    top_words = stats["top_words"]
    if not top_words:
        fig, ax = plt.subplots(figsize=(4, 2))
        ax.text(0.5, 0.5, "Not enough text", ha='center', va='center')
        ax.axis('off')
        return fig
    words, freqs = zip(*top_words)
    fig, ax = plt.subplots(figsize=(4, 2))
    x = np.arange(len(words))
    ax.bar(x, freqs)
    ax.set_xticks(x)
    ax.set_xticklabels(words, rotation=45, ha='right')
    ax.set_ylabel("Frequency")
    ax.set_title("Top Words (excluding stopwords)")
    plt.tight_layout()
    return fig
# --- PDF REPORT GENERATION ---
def generate_pdf_report(platform, review_text, text_res, text_stats, image_info, reverse_search_data=None):
    """
    Returns PDF bytes. Requires ReportLab.
    image_info: dict or None
    reverse_search_data: dict with reverse search results or None
    """
    if not HAVE_REPORTLAB:
        # Guard against calling this without ReportLab installed; `canvas`
        # would otherwise raise a NameError.
        raise RuntimeError("ReportLab is not installed; cannot generate a PDF report.")
    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    y = height - 50

    def write_line(text, font="Helvetica", size=10, leading=14):
        nonlocal y
        c.setFont(font, size)
        wrapped = textwrap.wrap(text, width=90)
        for line in wrapped:
            if y < 50:
                c.showPage()
                y = height - 50
                c.setFont(font, size)
            c.drawString(50, y, line)
            y -= leading

    # Header
    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, y, "Review Validator Report")
    y -= 25
    c.setFont("Helvetica", 10)
    c.drawString(50, y, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    y -= 15
    c.drawString(50, y, f"Platform: {platform}")
    y -= 25
    # Scores
    write_line("=== Text Analysis ===", font="Helvetica-Bold", size=12)
    write_line(f"AI-Likeness Score: {text_res['bot_score']:.1f}%")
    write_line(f"Grammar Quality: {text_res['grammar_score']:.1f}%")
    write_line(f"Sentiment: {text_res['mood_label']}")
    y -= 10
    # Structure stats
    write_line("Text Structure:", font="Helvetica-Bold", size=11)
    write_line(f"- Sentences: {text_stats['num_sentences']}")
    write_line(f"- Words: {text_stats['num_words']}")
    write_line(f"- Average sentence length: {text_stats['avg_sentence_len']:.1f} words")
    write_line(f"- Sentence length variance: {text_stats['var_sentence_len']:.1f}")
    write_line(f"- Vocabulary diversity (TTR): {text_stats['ttr']:.2f}")
    y -= 10
    # Review text
    write_line("Original Review:", font="Helvetica-Bold", size=11)
    write_line(review_text or "[empty review]")
    y -= 10
    # Image analysis
    if image_info is not None:
        write_line("=== Image Analysis ===", font="Helvetica-Bold", size=12)
        write_line(f"AI Probability: {image_info['ai_chance']:.1f}%")
        write_line(f"Caption (approx): {image_info['caption']}")
        y -= 10
    # Reverse search results
    if reverse_search_data and reverse_search_data.get('success'):
        write_line("=== Reverse Image Search Results ===", font="Helvetica-Bold", size=12)
        results = reverse_search_data.get('results', [])
        if results:
            write_line(f"Found {len(results)} matches online:")
            for i, result in enumerate(results[:5], 1):
                write_line(f"{i}. {result['title']}")
                write_line(f"   Source: {result['source']}")
                write_line(f"   Link: {result['link']}")
                y -= 5
        else:
            write_line("No matches found.")
        y -= 10
    c.showPage()
    c.save()
    pdf_bytes = buffer.getvalue()
    buffer.close()
    return pdf_bytes
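# Usage sketch (assumes the standard Streamlit download flow; names are
# illustrative):
#   if HAVE_REPORTLAB:
#       pdf_bytes = generate_pdf_report(platform, txt, res, stats, image_info=None)
#       st.download_button("Download PDF Report", data=pdf_bytes,
#                          file_name="review_report.pdf", mime="application/pdf")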
# --- PAGES ---
def landing_page():
    st.markdown("""
    <div class="hero-box">
        <div class="hero-title">🛡️ Review Validator</div>
        <div class="hero-subtitle">
            Advanced AI-powered review and image analysis with graphs, explainability, reverse image search, and exportable reports.
        </div>
    </div>
    """, unsafe_allow_html=True)
    c1, c2, c3 = st.columns(3)
    with c1:
        st.markdown("""
        <div class="feature-card">
            <span class="emoji-icon">🤖</span>
            <h3>AI Text Detector</h3>
            <p>Modern models estimate whether a review looks AI-generated or human-written.</p>
        </div>
        """, unsafe_allow_html=True)
    with c2:
        st.markdown("""
        <div class="feature-card">
            <span class="emoji-icon">📸</span>
            <h3>Image Authenticity</h3>
            <p>Checks if product photos look real or AI-generated, with approximate captions.</p>
        </div>
        """, unsafe_allow_html=True)
    with c3:
        st.markdown("""
        <div class="feature-card">
            <span class="emoji-icon">🔍</span>
            <h3>Reverse Image Search</h3>
            <p>Find where the image appears online using Google reverse image search.</p>
        </div>
        """, unsafe_allow_html=True)
    st.write("")
    st.write("")
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        if st.button("🚀 START CHECKING REVIEWS", type="primary", use_container_width=True):
            st.session_state['page'] = 'detector'
            st.rerun()
def detector_page(squad, warnings_text=None):
    # Header & Selector
    c1, c2 = st.columns([3, 1])
    with c1:
        st.markdown("### 🛒 Select the Website")
        platform = st.selectbox(
            "Where is this review from?",
            ["Amazon", "Flipkart", "Zomato", "Swiggy", "Myntra", "Other"],
            label_visibility="collapsed"
        )
    with c2:
        if st.button("⬅️ Back Home"):
            st.session_state['page'] = 'landing'
            st.rerun()
    st.divider()
    if warnings_text:
        st.markdown(f"""
        <div class="warning-box">
            <strong>Note:</strong><br>{warnings_text}
        </div>
        """, unsafe_allow_html=True)
    tab1, tab2 = st.tabs(["📝 Check Review Text", "📸 Check Product Image"])
    # --- TEXT TAB ---
    with tab1:
        col1, col2 = st.columns([2, 1])
        with col1:
            txt_input = st.text_area(
                "Paste Review Here:",
                height=150,
                placeholder="Example: I ordered this yesterday and the quality is amazing...",
                key="txt_input"
            )
        with col2:
            st.info("💡 Tip: Paste the full review for the best result.")
            strict_mode = st.checkbox("Use Strict AI Mode (safer, but misses some cases)", value=True)
        if st.button("Analyze Text", type="primary", use_container_width=True):
            if txt_input.strip():
                with st.spinner("Analyzing text..."):
                    res = check_text(txt_input, squad)
                    stats = analyze_text_structure(txt_input)
                    st.session_state['text_res'] = (res, stats, strict_mode, platform, txt_input)
            else:
                st.warning("Please paste a review first.")
        if 'text_res' in st.session_state:
            res, stats, strict_mode_saved, platform_saved, review_text_saved = st.session_state['text_res']
            if res.get("error"):
                st.error(res.get("error_msg", "Text models failed to load."))
            else:
                st.markdown("---")
                bot_score = res['bot_score']
                grammar_score = res['grammar_score']
                mood_label = res['mood_label']
                # Thresholds
                if strict_mode_saved:
                    t_high = 90
                    t_mid = 70
                else:
                    t_high = 75
                    t_mid = 55
                if bot_score >= t_high:
                    verdict_text = "🚨 Very likely AI-generated"
                    verdict_type = "error"
                elif bot_score >= t_mid:
                    verdict_text = "🤔 Suspicious / Mixed"
                    verdict_type = "warning"
                else:
                    verdict_text = "✅ Likely human-written"
                    verdict_type = "success"
                k1, k2, k3 = st.columns(3)
                color = "red" if bot_score > 50 else "green"
                k1.markdown(
                    f"""<div class="stat-box">
                    <div class="stat-num" style="color:{color}">{bot_score:.0f}%