"""Sentence-level AI-text detection with a RoBERTa classifier and a PDF report.

The module loads a tokenizer/model pair at import time (from the Hugging Face
Hub or from local files, depending on ``USE_HF_MODEL``) and exposes:

* ``analyze_text`` - classify every sentence of a text.
* ``generate_pdf_report`` - render those results into an in-memory PDF.
"""
import os

# === Environment Setup ===
# These must be set BEFORE transformers / nltk are imported: both libraries
# resolve their cache / data directories when they are first imported, so
# assigning the variables afterwards has no effect on where files are stored.
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"

from io import BytesIO  # noqa: E402
from pathlib import Path  # noqa: E402,F401

import nltk  # noqa: E402
import torch  # noqa: E402
from nltk.tokenize import sent_tokenize  # noqa: E402
from reportlab.lib import colors  # noqa: E402
from reportlab.lib.pagesizes import A4  # noqa: E402
from reportlab.pdfgen import canvas  # noqa: E402
from transformers import (  # noqa: E402
    RobertaForSequenceClassification,
    RobertaTokenizer,
)

# Keep the explicit search-path entry as well, without adding a duplicate.
if "/tmp/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/tmp/nltk_data")

# === Model Source (Hugging Face or Local) ===
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
hf_token = os.getenv("HF_TOKEN")
MODEL_PATH = (
    "AlyanAkram/stealth-roberta"
    if USE_HF_MODEL
    else "./detector/models/roberta-detector"
)

if USE_HF_MODEL:
    print("🔐 Loading model from Hugging Face Hub...")
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, token=hf_token)
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_PATH, token=hf_token
    )
else:
    print("📁 Loading model from local files...")
    tokenizer = RobertaTokenizer.from_pretrained(
        MODEL_PATH, local_files_only=True
    )
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_PATH, local_files_only=True
    )

# Inference only: switch to eval mode and move to the GPU when one is available.
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device

# === Constants ===
# A sentence is flagged as AI-written when its class-1 probability reaches this.
AI_THRESHOLD = 0.5


def _score_sentence(sentence: str) -> float:
    """Return the softmax probability of class index 1 for *sentence*.

    Index 1 is treated as the "AI" label by the caller; the label order comes
    from the loaded model - NOTE(review): confirm against the model config.
    """
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    return probs[1].item()


# === Main Analysis Function ===
def analyze_text(text: str) -> dict:
    """Classify every sentence of *text* as AI-written or not.

    The text is split into paragraphs on newlines (blank lines are dropped),
    each paragraph is split into sentences with ``sent_tokenize``, and each
    sentence is scored individually by the model.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict with the keys:

        * ``"overall_ai_percent"`` - percentage of sentences flagged as AI,
          rounded to two decimals (``0`` when there are no sentences).
        * ``"total_sentences"`` - number of sentences scored.
        * ``"ai_sentences"`` - number of sentences flagged as AI.
        * ``"results"`` - one list per paragraph, each holding
          ``(sentence, is_ai, ai_prob)`` tuples.
    """
    results = []
    ai_count, total_sentences = 0, 0

    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    for paragraph in paragraphs:
        sentence_results = []
        for sentence in sent_tokenize(paragraph):
            ai_prob = _score_sentence(sentence)
            is_ai = ai_prob >= AI_THRESHOLD
            sentence_results.append((sentence, is_ai, ai_prob))
            total_sentences += 1
            if is_ai:
                ai_count += 1
        results.append(sentence_results)

    return {
        "overall_ai_percent": (
            round((ai_count / total_sentences) * 100, 2)
            if total_sentences
            else 0
        ),
        "total_sentences": total_sentences,
        "ai_sentences": ai_count,
        "results": results,
    }


def _wrap_sentence(sentence: str, max_width: float, measure) -> list:
    """Greedily wrap *sentence* into lines no wider than *max_width*.

    Args:
        sentence: Text to wrap; words are separated on whitespace.
        max_width: Maximum rendered width of a line.
        measure: Callable returning the rendered width of a string.

    Returns:
        The wrapped lines, in order. A single word wider than *max_width* is
        kept on a line of its own rather than producing an empty line.
    """
    lines = []
    current = ""
    for word in sentence.split():
        candidate = f"{current} {word}".strip()
        # Only flush when there is something to flush; otherwise an overlong
        # first word would emit a blank line before itself.
        if current and measure(candidate) > max_width:
            lines.append(current)
            current = word
        else:
            current = candidate
    if current:
        lines.append(current)
    return lines


# === PDF Report Generator (In-Memory) ===
def generate_pdf_report(results: dict, filename: str) -> BytesIO:
    """Render analysis results as an A4 PDF held in memory.

    Sentences flagged as AI are drawn on a cyan highlight. Text is wrapped to
    the page width, and a new page is started whenever the next line would
    fall below the bottom margin.

    Args:
        results: A dict shaped like the return value of ``analyze_text``
            (reads ``"overall_ai_percent"``, ``"total_sentences"`` and
            ``"results"``).
        filename: Name shown in the report heading.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the PDF.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height, font_size = 18, 12
    body_font = "Helvetica"
    max_text_width = width - 80  # 40pt margin on each side
    bottom_margin = 50

    def measure(s: str) -> float:
        return c.stringWidth(s, body_font, font_size)

    # NOTE(review): the emoji in the two headings are outside the standard
    # Helvetica encoding and will likely render as placeholder glyphs - confirm.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"📄 AI Detection Report: {filename}")
    y -= 25
    c.setFont(body_font, font_size)
    c.drawString(
        x,
        y,
        f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences",
    )
    y -= 30

    for para_result in results["results"]:
        for sentence, is_ai, _ in para_result:
            sentence = sentence.strip()
            if not sentence:
                continue
            for line in _wrap_sentence(sentence, max_text_width, measure):
                # Check before every line (not just once per sentence) so a
                # long wrapped sentence cannot run off the bottom of the page.
                if y < bottom_margin:
                    c.showPage()
                    # showPage() resets the graphics state, so set the font again.
                    c.setFont(body_font, font_size)
                    y = height - 50
                if is_ai:
                    c.setFillColor(colors.cyan)
                    c.rect(
                        x - 2,
                        y - 4,
                        measure(line) + 4,
                        line_height + 2,
                        fill=True,
                        stroke=False,
                    )
                    c.setFillColor(colors.black)
                c.drawString(x, y, line)
                y -= line_height
        # Blank line between paragraphs.
        y -= line_height

    c.save()
    buffer.seek(0)
    return buffer