"""
Review Validator - Final Version with SerpAPI Integration
"""
import os
import io
import warnings
from collections import Counter
import numpy as np
import streamlit as st
from transformers import pipeline, logging as hf_logging
from PIL import Image
import matplotlib
import matplotlib.pyplot as plt
import requests
from reportlab.lib.pagesizes import A4
from reportlab.platypus import (
SimpleDocTemplate,
Paragraph,
Spacer,
Table,
TableStyle,
)
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
# ------------------- SILENCE NOISE -------------------
warnings.filterwarnings("ignore")
hf_logging.set_verbosity_error()
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
matplotlib.use("Agg")
st.set_page_config(
page_title="Review Validator",
page_icon="🛡️",
layout="wide",
initial_sidebar_state="collapsed",
)
# ------------------- MODEL NAMES -------------------
MODEL_FAKE = "openai-community/roberta-base-openai-detector"
MODEL_MOOD = "cardiffnlp/twitter-roberta-base-sentiment-latest"
MODEL_GRAMMAR = "textattack/roberta-base-CoLA"
MODEL_IMG_A = "dima806/ai_generated_image_detection"
MODEL_IMG_B = "umm-maybe/AI-image-detector"
MODEL_CAPTION = "Salesforce/blip-image-captioning-base"
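# Label conventions assumed by the code below: the detector model emits
# "Real"/"Fake", the sentiment model emits lowercase "positive"/"neutral"/
# "negative", and the CoLA grammar model emits "LABEL_1" for acceptable text.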
# ------------------- TOKENS / SECRETS -------------------
def get_hf_token():
token = os.environ.get("HF_TOKEN")
if token:
return token
try:
if hasattr(st, "secrets") and "HF_TOKEN" in st.secrets:
return st.secrets["HF_TOKEN"]
except Exception:
pass
return None
def get_serpapi_key():
key = os.environ.get("SERPAPI_KEY")
if key:
return key
try:
if hasattr(st, "secrets") and "SERPAPI_KEY" in st.secrets:
return st.secrets["SERPAPI_KEY"]
except Exception:
pass
return None
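# Example .streamlit/secrets.toml (values are placeholders):
#   HF_TOKEN = "hf_xxxxxxxxxxxxxxxx"
#   SERPAPI_KEY = "your-serpapi-key"
# Environment variables with the same names take precedence over secrets.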
HF_TOKEN = get_hf_token()
# ------------------- CSS -------------------
def inject_custom_css():
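    # NOTE: the CSS payload below is empty in this version; any custom styles
    # for the app would go inside the triple-quoted string.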
st.markdown(
"""
""",
unsafe_allow_html=True,
)
# ------------------- LOAD MODELS -------------------
@st.cache_resource(show_spinner=False)
def load_ai_squad():
squad = {}
if not HF_TOKEN:
return None, "HF_TOKEN missing. Set it in env or Streamlit secrets."
try:
try:
squad["fake"] = pipeline(
"text-classification", model=MODEL_FAKE, token=HF_TOKEN
)
except Exception as e:
print("Fake model error:", e)
try:
squad["mood"] = pipeline(
"sentiment-analysis",
model=MODEL_MOOD,
tokenizer=MODEL_MOOD,
token=HF_TOKEN,
)
except Exception as e:
print("Mood model error:", e)
try:
squad["grammar"] = pipeline(
"text-classification", model=MODEL_GRAMMAR, token=HF_TOKEN
)
except Exception as e:
print("Grammar model error:", e)
try:
squad["img_a"] = pipeline(
"image-classification", model=MODEL_IMG_A, token=HF_TOKEN
)
squad["img_b"] = pipeline(
"image-classification", model=MODEL_IMG_B, token=HF_TOKEN
)
squad["caption"] = pipeline(
"image-to-text", model=MODEL_CAPTION, token=HF_TOKEN
)
except Exception as e:
print("Image model error:", e)
except Exception as e:
return None, str(e)
return squad, None
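# Usage sketch: loaded once per session thanks to the cache decorator.
#   squad, err = load_ai_squad()
#   if err:
#       st.error(err)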
# ------------------- TEXT HELPERS -------------------
def compute_text_stats(text: str):
sentences = [
s.strip()
for s in text.replace("!", ".").replace("?", ".").split(".")
if s.strip()
]
words = text.split()
word_count = len(words)
sent_lengths = [len(s.split()) for s in sentences] if sentences else []
avg_sent_len = np.mean(sent_lengths) if sent_lengths else 0.0
vocab = {w.lower().strip(".,!?\"'") for w in words if w.strip()}
vocab_size = len(vocab)
ttr = (vocab_size / word_count * 100) if word_count > 0 else 0.0
cleaned = [w.lower().strip(".,!?\"'") for w in words if w.strip()]
common = Counter(cleaned).most_common(8)
return {
"sentence_count": len(sentences),
"word_count": word_count,
"avg_sentence_length": avg_sent_len,
"vocab_size": vocab_size,
"type_token_ratio": ttr,
"sentence_lengths": sent_lengths,
"top_words": common,
}
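# Illustrative example: compute_text_stats("Great phone. Battery lasts long!")
# yields sentence_count=2, word_count=5, and a type-token ratio of 100%
# because no word repeats.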
def explain_text(res, stats):
lines = []
bot = res["bot_score"]
gram = res["grammar_score"]
mood = res["mood_label"]
if bot > 70:
lines.append(
"The AI-likeness score is high, indicating that the review strongly resembles machine-generated text."
)
elif bot > 40:
lines.append(
"The AI-likeness score is in a borderline range, so the review should be treated with caution."
)
else:
lines.append(
"The AI-likeness score is low, suggesting the review is likely human-written."
)
if gram > 80:
lines.append(
"Grammar quality is unusually clean and consistent, which sometimes correlates with AI-written or heavily edited content."
)
elif gram < 40:
lines.append(
"Grammar quality is weak, which can indicate spammy content but usually not advanced AI writing."
)
else:
lines.append(
"Grammar quality is moderate and falls within a typical human writing range."
)
lines.append(
f"The sentiment model detects a {mood.lower()} tone, which can be cross-checked with the context of the review."
)
lines.append(
f"The review contains {stats['sentence_count']} sentences and {stats['word_count']} words, with an average of {stats['avg_sentence_length']:.1f} words per sentence."
)
lines.append(
f"The vocabulary richness (type-token ratio) is approximately {stats['type_token_ratio']:.1f}%, indicating how repetitive or diverse the language is."
)
return "\n\n".join(lines)
def check_text(text, squad):
if "fake" not in squad:
return {"error": True}
res_fake = squad["fake"](text[:512])[0]
bot = res_fake["score"] if res_fake["label"] == "Fake" else 1 - res_fake["score"]
mood_label = "Unknown"
if "mood" in squad:
res_m = squad["mood"](text[:512])[0]
mood_label = res_m["label"]
grammar_score = 0.5
if "grammar" in squad:
res_g = squad["grammar"](text[:512])[0]
grammar_score = (
res_g["score"] if res_g["label"] == "LABEL_1" else 1 - res_g["score"]
)
stats = compute_text_stats(text)
return {
"bot_score": bot * 100,
"mood_label": mood_label,
"grammar_score": grammar_score * 100,
"stats": stats,
"error": False,
}
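# Example return shape (values illustrative):
#   {"bot_score": 12.3, "mood_label": "positive", "grammar_score": 71.0,
#    "stats": {...}, "error": False}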
# ------------------- IMAGE HELPERS -------------------
def get_image_from_url(url: str):
"""
Returns (PIL.Image or None, error_message or None)
Handles 403 cleanly instead of throwing exceptions.
"""
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0 Safari/537.36"
}
r = requests.get(url, headers=headers, timeout=10)
if r.status_code == 403:
return None, (
"The image host returned HTTP 403 (Forbidden). "
"This usually means the server is blocking automated downloads. "
"Download the image manually and upload it as a file instead."
)
if r.status_code != 200:
return None, f"Image host returned HTTP {r.status_code}."
img = Image.open(io.BytesIO(r.content)).convert("RGB")
return img, None
except Exception as e:
return None, f"Error fetching image: {e}"
def check_image(img, squad):
score_a = 0.0
score_b = 0.0
caption = "Analysis unavailable."
ai_words = ["fake", "artificial", "ai", "generated"]
if "img_a" in squad:
try:
for r in squad["img_a"](img):
if any(w in r["label"].lower() for w in ai_words):
score_a = max(score_a, r["score"])
except Exception as e:
print("img_a error:", e)
if "img_b" in squad:
try:
for r in squad["img_b"](img):
if any(w in r["label"].lower() for w in ai_words):
score_b = max(score_b, r["score"])
except Exception as e:
print("img_b error:", e)
else:
score_b = score_a
if "caption" in squad:
try:
cap_res = squad["caption"](img)
caption = cap_res[0]["generated_text"]
except Exception:
pass
avg_ai = (score_a + score_b) / 2
match = 1.0 - abs(score_a - score_b)
return {
"ai_chance": avg_ai * 100,
"match": match,
"score_a": score_a * 100,
"score_b": score_b * 100,
"caption": caption,
}
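# Interpretation sketch: ai_chance averages the two detectors, and match
# (0..1) rises as they agree; e.g. score_a=0.9 and score_b=0.8 give
# ai_chance=85.0 and match=0.9.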
# ------------------- SERPAPI REVERSE IMAGE -------------------
def serpapi_reverse_image_search(image_url: str, api_key: str):
"""
Google Reverse Image Search using SerpAPI.
Returns dict or None, and error_message if any.
"""
if not api_key:
return None, "SerpAPI key not configured."
if not image_url:
return None, "No image URL provided."
try:
params = {
"engine": "google_reverse_image",
"image_url": image_url,
"api_key": api_key,
"output": "json",
}
resp = requests.get("https://serpapi.com/search", params=params, timeout=25)
if resp.status_code == 403:
return None, (
"SerpAPI returned HTTP 403 (Forbidden). "
"Check that the API key is valid and you have enough quota."
)
if resp.status_code != 200:
return None, f"SerpAPI HTTP {resp.status_code}: {resp.text[:180]}"
data = resp.json()
        # generate_pdf reads "count" and "top_links", so derive them here.
        # SerpAPI may expose matches as "visual_matches" (Google Lens) or
        # "image_results" (Google Reverse Image); fall back between the two.
        matches = data.get("visual_matches") or data.get("image_results") or []
        result = {
            "best_guess": data.get("image_guess"),
            "visual_matches": matches,
            "count": len(matches),
            "top_links": matches[:5],
        }
return result, None
except Exception as e:
return None, f"Error calling SerpAPI: {e}"
# ------------------- PLOTS -------------------
def breakdown_chart(res):
labels = ["Bot Probability", "Grammar Quality"]
vals = [res["bot_score"], res["grammar_score"]]
fig, ax = plt.subplots(figsize=(4, 2.2))
y = np.arange(len(labels))
ax.barh(y, vals)
ax.set_yticks(y)
ax.set_yticklabels(labels)
ax.invert_yaxis()
ax.set_xlim(0, 100)
for i, v in enumerate(vals):
ax.text(v + 1, i, f"{v:.0f}%", va="center", fontsize=8)
plt.tight_layout()
return fig
def sentence_length_hist(stats):
fig, ax = plt.subplots(figsize=(4, 2.2))
if stats["sentence_lengths"]:
ax.hist(
stats["sentence_lengths"],
bins=min(8, len(stats["sentence_lengths"])),
)
ax.set_xlabel("Words per sentence")
ax.set_ylabel("Frequency")
ax.set_title("Sentence Length Distribution")
plt.tight_layout()
return fig
def word_frequency_chart(stats):
fig, ax = plt.subplots(figsize=(4, 2.2))
top = stats["top_words"]
if top:
words = [w for w, _ in top]
counts = [c for _, c in top]
ax.bar(words, counts)
        # Pin tick positions before relabeling to avoid a matplotlib warning.
        ax.set_xticks(range(len(words)))
        ax.set_xticklabels(words, rotation=45, ha="right", fontsize=8)
ax.set_title("Top Word Frequency")
plt.tight_layout()
return fig
# ------------------- PDF REPORT -------------------
def generate_pdf(text_input, text_res, image_res, reverse_res, platform):
buf = io.BytesIO()
doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=30, rightMargin=30)
styles = getSampleStyleSheet()
elems = []
elems.append(Paragraph("Review Validator Report", styles["Title"]))
elems.append(Spacer(1, 6))
elems.append(Paragraph(f"Platform: {platform}", styles["Normal"]))
elems.append(Spacer(1, 10))
if text_input:
elems.append(Paragraph("Input Review Text", styles["Heading2"]))
elems.append(Spacer(1, 4))
        # ReportLab Paragraphs use <br/> tags, not literal newlines.
        safe = text_input.replace("\n", "<br/>")
elems.append(Paragraph(safe, styles["Normal"]))
elems.append(Spacer(1, 8))
if text_res and not text_res.get("error", False):
stats = text_res["stats"]
elems.append(Paragraph("Text Authenticity Analysis", styles["Heading2"]))
data = [
["Bot-likeness", f"{text_res['bot_score']:.1f}%"],
["Grammar Quality", f"{text_res['grammar_score']:.1f}%"],
["Sentiment", text_res["mood_label"]],
["Sentence Count", str(stats["sentence_count"])],
["Word Count", str(stats["word_count"])],
["Avg. Sentence Length", f"{stats['avg_sentence_length']:.1f}"],
["Type-Token Ratio", f"{stats['type_token_ratio']:.1f}%"],
]
tbl = Table(data, hAlign="LEFT")
tbl.setStyle(
TableStyle(
[
("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),
("GRID", (0, 0), (-1, -1), 0.25, colors.grey),
("BOX", (0, 0), (-1, -1), 0.25, colors.black),
]
)
)
elems.append(tbl)
elems.append(Spacer(1, 8))
explanation = explain_text(text_res, stats)
elems.append(Paragraph("Interpretation", styles["Heading3"]))
for para in explanation.split("\n\n"):
elems.append(Paragraph(para, styles["Normal"]))
elems.append(Spacer(1, 3))
if image_res:
elems.append(Spacer(1, 8))
elems.append(Paragraph("Image Authenticity Analysis", styles["Heading2"]))
data2 = [
["AI-likeness (avg)", f"{image_res['ai_chance']:.1f}%"],
["Model A Score", f"{image_res['score_a']:.1f}%"],
["Model B Score", f"{image_res['score_b']:.1f}%"],
["Model Agreement", f"{image_res['match']*100:.1f}%"],
]
t2 = Table(data2, hAlign="LEFT")
t2.setStyle(
TableStyle(
[
("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),
("GRID", (0, 0), (-1, -1), 0.25, colors.grey),
("BOX", (0, 0), (-1, -1), 0.25, colors.black),
]
)
)
elems.append(t2)
elems.append(Spacer(1, 4))
elems.append(Paragraph(f"Caption: {image_res['caption']}", styles["Normal"]))
if reverse_res:
elems.append(Spacer(1, 8))
elems.append(Paragraph("Reverse Image Search (SerpAPI)", styles["Heading2"]))
best = reverse_res.get("best_guess")
count = reverse_res.get("count", 0)
elems.append(Paragraph(f"Visual matches found: {count}", styles["Normal"]))
if best:
elems.append(Paragraph(f"Google best guess: {best}", styles["Normal"]))
links = reverse_res.get("top_links", [])
if links:
elems.append(Spacer(1, 4))
elems.append(Paragraph("Top Matching Sources:", styles["Heading3"]))
for item in links:
line = f"{item.get('title') or item.get('link')} (source: {item.get('source')})"
elems.append(Paragraph(line, styles["Normal"]))
elems.append(Spacer(1, 2))
doc.build(elems)
pdf_bytes = buf.getvalue()
buf.close()
return pdf_bytes
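# Usage sketch (widget label and file name are illustrative):
#   pdf = generate_pdf(review_text, text_res, image_res, reverse_res, "Amazon")
#   st.download_button("Download PDF report", pdf,
#                      file_name="review_report.pdf", mime="application/pdf")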
# ------------------- UI: LANDING -------------------
def landing_page():
st.markdown(
"""
- Transformer-based models estimate how likely a review was written by AI.
- Dual detectors plus image captioning assess whether a photo is real or AI-generated.
- SerpAPI's Google Reverse Image search shows where else the image appears online.