# StealthWriter / detector/custom_model.py
import os
import nltk

# === Environment Setup ===
# Cache locations must be set before transformers is imported; the library
# resolves its cache paths at import time.
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"
nltk.data.path.append("/tmp/nltk_data")

import torch
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors
from io import BytesIO
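# NOTE (assumption, not in the original file): sent_tokenize() needs the NLTK
# "punkt" tokenizer data. If it is not already present under /tmp/nltk_data,
# a one-time download at startup keeps the first request from failing:
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", download_dir="/tmp/nltk_data", quiet=True)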
# === Model Source (Hugging Face or Local) ===
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
hf_token = os.getenv("HF_TOKEN")
MODEL_PATH = "AlyanAkram/stealth-roberta" if USE_HF_MODEL else "./detector/models/roberta-detector"
if USE_HF_MODEL:
    print("πŸ” Loading model from Hugging Face Hub...")
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, token=hf_token)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, token=hf_token)
else:
    print("πŸ“ Loading model from local files...")
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device
# === Constants ===
AI_THRESHOLD = 0.5
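# Sentences whose AI-class probability meets or exceeds AI_THRESHOLD are
# labelled AI-generated; everything below it counts as human-written.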
# === Main Analysis Function ===
def analyze_text(text: str):
    """Run the detector over every sentence of `text`.

    Returns a dict with the overall AI percentage, sentence counts, and
    per-paragraph lists of (sentence, is_ai, ai_probability) tuples.
    """
    results = []
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    ai_count, total_sentences = 0, 0

    for paragraph in paragraphs:
        sentence_results = []
        for sentence in sent_tokenize(paragraph):
            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
            ai_prob = probs[1].item()  # index 1 = AI-written class
            is_ai = ai_prob >= AI_THRESHOLD
            sentence_results.append((sentence, is_ai, ai_prob))
            total_sentences += 1
            if is_ai:
                ai_count += 1
        results.append(sentence_results)

    return {
        "overall_ai_percent": round((ai_count / total_sentences) * 100, 2) if total_sentences else 0,
        "total_sentences": total_sentences,
        "ai_sentences": ai_count,
        "results": results,
    }
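# Illustrative usage (the sample text below is hypothetical, not from the project):
#
#     report = analyze_text("First paragraph, sentence one. Sentence two.\nSecond paragraph.")
#     print(report["overall_ai_percent"], "% of", report["total_sentences"], "sentences flagged")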
# === PDF Report Generator (In-Memory) ===
def generate_pdf_report(results: dict, filename: str) -> BytesIO:
    """Render the analyze_text() output as a simple A4 PDF and return it in memory.

    Sentences flagged as AI-written are drawn on a cyan highlight bar.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height, font_size = 18, 12

    # Header. The built-in Helvetica font has no emoji glyphs, so the header
    # text is kept to plain characters.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"AI Detection Report: {filename}")
    y -= 25
    c.setFont("Helvetica", font_size)
    c.drawString(x, y, f"AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30

    for para_result in results["results"]:
        for sentence, is_ai, _ in para_result:
            sentence = sentence.strip()
            if not sentence:
                continue
            if y < 50:  # start a new page when the cursor nears the bottom margin
                c.showPage()
                y = height - 50

            # Naive word wrap: accumulate words until the line exceeds the
            # printable width, then flush it (with a cyan highlight for AI text).
            words = sentence.split()
            current_line = ""
            for word in words:
                test_line = f"{current_line} {word}".strip()
                if c.stringWidth(test_line, "Helvetica", font_size) > width - 80:
                    if is_ai:
                        c.setFillColor(colors.cyan)
                        c.rect(x - 2, y - 4, c.stringWidth(current_line, "Helvetica", font_size) + 4, line_height + 2, fill=True, stroke=False)
                    c.setFillColor(colors.black)
                    c.drawString(x, y, current_line)
                    y -= line_height
                    current_line = word
                else:
                    current_line = test_line

            # Flush whatever is left on the last line of the sentence.
            if current_line:
                if is_ai:
                    c.setFillColor(colors.cyan)
                    c.rect(x - 2, y - 4, c.stringWidth(current_line, "Helvetica", font_size) + 4, line_height + 2, fill=True, stroke=False)
                c.setFillColor(colors.black)
                c.drawString(x, y, current_line)
                y -= line_height
        y -= line_height  # blank line between paragraphs

    c.save()
    buffer.seek(0)
    return buffer
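# Minimal end-to-end sketch (assumption: this module is normally driven by an
# API layer elsewhere in the project; the sample text and output path below
# are purely illustrative).
if __name__ == "__main__":
    sample = "This is a short human-written note.\nThis paragraph might be flagged as AI-generated."
    analysis = analyze_text(sample)
    print(f"AI sentences: {analysis['ai_sentences']}/{analysis['total_sentences']} "
          f"({analysis['overall_ai_percent']}%)")
    pdf = generate_pdf_report(analysis, "sample.txt")
    with open("/tmp/sample_report.pdf", "wb") as f:
        f.write(pdf.getvalue())
    print("Report written to /tmp/sample_report.pdf")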