Spaces:
Running
Running
import os

# === Environment Setup ===
# The cache locations must be exported BEFORE transformers / nltk are imported:
# those libraries read these variables when they are first imported, so
# assigning them afterwards leaves the default cache directories in use.
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"

from io import BytesIO  # noqa: E402
from pathlib import Path  # noqa: E402

import nltk  # noqa: E402
import torch  # noqa: E402
from nltk.tokenize import sent_tokenize  # noqa: E402
from reportlab.lib import colors  # noqa: E402
from reportlab.lib.pagesizes import A4  # noqa: E402
from reportlab.pdfgen import canvas  # noqa: E402
from transformers import RobertaTokenizer, RobertaForSequenceClassification  # noqa: E402

# Also register the directory explicitly on NLTK's search path.
nltk.data.path.append("/tmp/nltk_data")
# === Model Source (Hugging Face or Local) ===
# USE_HF_MODEL=1 loads the checkpoint from the Hugging Face Hub, passing
# HF_TOKEN as the access token; any other value loads the local checkpoint.
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
hf_token = os.getenv("HF_TOKEN")

if USE_HF_MODEL:
    MODEL_PATH = "AlyanAkram/stealth-roberta"
    print("π Loading model from Hugging Face Hub...")
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, token=hf_token)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, token=hf_token)
else:
    MODEL_PATH = "./detector/models/roberta-detector"
    print("π Loading model from local files...")
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

# Switch to evaluation mode and move to the GPU when one is available.
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")
# Read the device back from the model so inputs are sent to the same place.
device = next(model.parameters()).device

# === Constants ===
# A sentence is flagged as AI when its class-1 probability is >= this value.
AI_THRESHOLD = 0.5
# === Main Analysis Function ===
def analyze_text(text: str):
    """Classify every sentence of ``text`` and summarise the AI share.

    The text is split on newlines into paragraphs (blank lines are dropped),
    each paragraph is split into sentences with ``sent_tokenize``, and each
    sentence is scored on its own by the module-level model. The softmax
    probability at index 1 is used as the AI probability; a sentence is
    flagged when that probability is at least ``AI_THRESHOLD``.

    Returns:
        A dict with keys ``overall_ai_percent`` (flagged sentences as a
        percentage rounded to 2 decimals, or 0 when there are no sentences),
        ``total_sentences``, ``ai_sentences`` and ``results`` (one list per
        paragraph of ``(sentence, is_ai, ai_prob)`` tuples).
    """
    per_paragraph = []
    flagged = 0
    seen = 0

    # Strip each line and discard the ones that end up empty.
    for block in filter(None, map(str.strip, text.split("\n"))):
        scored = []
        for sent in sent_tokenize(block):
            encoded = tokenizer(
                sent,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                logits = model(**encoded).logits
            # Single-sentence batch: row 0, class index 1.
            ai_prob = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()
            is_ai = ai_prob >= AI_THRESHOLD
            scored.append((sent, is_ai, ai_prob))
            seen += 1
            if is_ai:
                flagged += 1
        per_paragraph.append(scored)

    if seen:
        percent = round((flagged / seen) * 100, 2)
    else:
        percent = 0

    return {
        "overall_ai_percent": percent,
        "total_sentences": seen,
        "ai_sentences": flagged,
        "results": per_paragraph,
    }
# === PDF Report Generator (In-Memory) ===
def generate_pdf_report(results: dict, filename: str) -> BytesIO:
    """Render analysis results as an A4 PDF and return it as an in-memory buffer.

    The report has a bold title line containing ``filename``, a summary line
    built from ``results['overall_ai_percent']`` and
    ``results['total_sentences']``, and then every sentence from
    ``results['results']`` word-wrapped to the page width. Lines belonging to
    sentences flagged as AI are drawn on a cyan highlight box. One blank line
    is left after each paragraph.

    Args:
        results: Dict with ``overall_ai_percent``, ``total_sentences`` and
            ``results`` (a list of paragraphs, each a list of
            ``(sentence, is_ai, ai_prob)`` tuples).
        filename: Name shown in the report title.

    Returns:
        A ``BytesIO`` holding the finished PDF, rewound to position 0.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height, font_size = 18, 12
    body_font = "Helvetica"
    max_text_width = width - 80  # 40pt margin on each side
    bottom_margin = 50

    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"π AI Detection Report: {filename}")
    y -= 25
    c.setFont(body_font, font_size)
    c.drawString(x, y, f"π§ AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30

    def draw_line(line, highlight):
        """Draw one wrapped line at the cursor, starting a new page if needed."""
        nonlocal y
        # Check the margin for every line, not once per sentence; otherwise a
        # long wrapped sentence keeps drawing past the bottom of the page.
        if y < bottom_margin:
            c.showPage()
            # A new page starts from a fresh graphics state, so set the body
            # font again rather than depending on the canvas default.
            c.setFont(body_font, font_size)
            y = height - 50
        if highlight:
            c.setFillColor(colors.cyan)
            c.rect(
                x - 2,
                y - 4,
                c.stringWidth(line, body_font, font_size) + 4,
                line_height + 2,
                fill=True,
                stroke=False,
            )
            c.setFillColor(colors.black)
        c.drawString(x, y, line)
        y -= line_height

    for para_result in results["results"]:
        for sentence, is_ai, _ in para_result:
            words = sentence.split()
            if not words:
                continue  # empty or whitespace-only sentence
            current_line = ""
            for word in words:
                test_line = f"{current_line} {word}".strip()
                too_wide = c.stringWidth(test_line, body_font, font_size) > max_text_width
                # Flush only when there is something to flush: a single word
                # wider than the text area must not emit an empty line first.
                if current_line and too_wide:
                    draw_line(current_line, is_ai)
                    current_line = word
                else:
                    current_line = test_line
            # words is non-empty here, so current_line holds the final line.
            draw_line(current_line, is_ai)
        # Blank line between paragraphs.
        y -= line_height

    c.save()
    buffer.seek(0)
    return buffer