Spaces:

AlyanAkram
/

StealthWriter

Running

App Files Files Community

StealthWriter / detector /custom_model.py

AlyanAkram

Update detector/custom_model.py

6480e0c verified 2 months ago

raw

history blame contribute delete

4.43 kB

	import os
	from io import BytesIO
	from pathlib import Path

	# === Environment Setup ===
	# These variables must be exported BEFORE transformers / huggingface_hub /
	# nltk are imported: those libraries resolve their cache and data
	# directories while being imported, so assigning the variables afterwards
	# has no effect on the current process.
	# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
	# favour of HF_HOME; it is still set here for older versions - confirm
	# against the pinned transformers version.
	os.environ["HF_HOME"] = "/tmp/hf_home"
	os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home"
	os.environ["NLTK_DATA"] = "/tmp/nltk_data"

	import nltk  # noqa: E402  (deliberately imported after the environment is set)
	import torch  # noqa: E402
	from nltk.tokenize import sent_tokenize  # noqa: E402
	from reportlab.lib import colors  # noqa: E402
	from reportlab.lib.pagesizes import A4  # noqa: E402
	from reportlab.pdfgen import canvas  # noqa: E402
	from transformers import RobertaTokenizer, RobertaForSequenceClassification  # noqa: E402

	# With NLTK_DATA exported before the import, nltk normally registers the
	# directory on its own; append only when it is missing so the search path
	# does not end up with a duplicate entry.
	if "/tmp/nltk_data" not in nltk.data.path:
	    nltk.data.path.append("/tmp/nltk_data")

	# === Model Source (Hugging Face or Local) ===
	# USE_HF_MODEL=1 selects the Hub checkpoint (authenticated with HF_TOKEN);
	# any other value, or an unset variable, selects the local directory.
	USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
	hf_token = os.getenv("HF_TOKEN")

	if USE_HF_MODEL:
	    MODEL_PATH = "AlyanAkram/stealth-roberta"
	    print("🔐 Loading model from Hugging Face Hub...")
	    _load_options = {"token": hf_token}
	else:
	    MODEL_PATH = "./detector/models/roberta-detector"
	    print("📁 Loading model from local files...")
	    _load_options = {"local_files_only": True}

	# The tokenizer and the classifier are loaded from the same location with
	# the same options, tokenizer first.
	tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, **_load_options)
	model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, **_load_options)

	# Switch to inference mode and move to the GPU when one is available.
	model.eval()
	model.to("cuda" if torch.cuda.is_available() else "cpu")

	# Read the device back from the parameters so that inputs built later are
	# placed exactly where the weights live.
	device = next(model.parameters()).device

	# === Constants ===
	# Decision cutoff used by analyze_text: a sentence is flagged as AI when the
	# softmax probability at class index 1 is greater than or equal to this value.
	AI_THRESHOLD = 0.5

	# === Main Analysis Function ===
	def analyze_text(text: str):
	    """Score every sentence of *text* and summarise how many were flagged.

	    The text is split on newlines; lines that are empty after stripping are
	    dropped and each remaining line is treated as one paragraph. Every
	    paragraph is split into sentences with ``sent_tokenize`` and each
	    sentence is classified on its own by the module-level model.

	    Args:
	        text: Raw input text.

	    Returns:
	        A dict with the keys ``overall_ai_percent`` (share of flagged
	        sentences, rounded to two decimals, or ``0`` when there are no
	        sentences), ``total_sentences``, ``ai_sentences`` and ``results``.
	        ``results`` holds one list per paragraph, each containing
	        ``(sentence, is_ai, ai_prob)`` tuples.
	    """
	    per_paragraph = []

	    for raw_line in text.split("\n"):
	        block = raw_line.strip()
	        if not block:
	            continue

	        scored = []
	        for sent in sent_tokenize(block):
	            encoded = tokenizer(
	                sent,
	                return_tensors="pt",
	                truncation=True,
	                padding=True,
	                max_length=512,
	            ).to(device)

	            with torch.no_grad():
	                logits = model(**encoded).logits
	                # Index 1 of the softmax output is the score treated as
	                # the "AI" probability.
	                ai_prob = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()

	            scored.append((sent, ai_prob >= AI_THRESHOLD, ai_prob))

	        per_paragraph.append(scored)

	    # Derive the counters from the collected tuples.
	    sentence_total = sum(len(scored) for scored in per_paragraph)
	    flagged_total = sum(
	        1 for scored in per_paragraph for _, flagged, _ in scored if flagged
	    )

	    if sentence_total:
	        percent = round((flagged_total / sentence_total) * 100, 2)
	    else:
	        percent = 0

	    return {
	        "overall_ai_percent": percent,
	        "total_sentences": sentence_total,
	        "ai_sentences": flagged_total,
	        "results": per_paragraph,
	    }

	# === PDF Report Generator (In-Memory) ===
	def generate_pdf_report(results: dict, filename: str) -> BytesIO:
	    """Render an analysis result as an A4 PDF held entirely in memory.

	    The report has a title line, a one-line summary and then every sentence
	    word-wrapped to the page width. Sentences flagged as AI are drawn on a
	    cyan highlight. Each sentence starts on a new line and one blank line
	    separates paragraphs.

	    Args:
	        results: Dict providing ``overall_ai_percent``, ``total_sentences``
	            and ``results``; ``results["results"]`` is a list of paragraphs,
	            each a list of ``(sentence, is_ai, score)`` tuples.
	        filename: Name shown in the report title.

	    Returns:
	        A ``BytesIO`` containing the finished PDF, rewound to position 0.

	    Raises:
	        KeyError: If ``results`` lacks one of the keys listed above.
	    """
	    body_font = "Helvetica"
	    line_height, font_size = 18, 12
	    bottom_margin = 50

	    buffer = BytesIO()
	    c = canvas.Canvas(buffer, pagesize=A4)
	    width, height = A4
	    x, y = 40, height - 60
	    max_text_width = width - 80  # 40pt margin on each side

	    def emit_line(line: str, highlight: bool) -> None:
	        """Draw one wrapped line at the cursor, breaking the page if needed."""
	        nonlocal y
	        # Checked per line rather than per sentence, so a long wrapped
	        # sentence can no longer run past the bottom margin.
	        if y < bottom_margin:
	            c.showPage()
	            # A new page starts from a fresh graphics state, so re-select
	            # the body font explicitly instead of relying on the default.
	            c.setFont(body_font, font_size)
	            y = height - 50
	        if highlight:
	            c.setFillColor(colors.cyan)
	            c.rect(
	                x - 2,
	                y - 4,
	                c.stringWidth(line, body_font, font_size) + 4,
	                line_height + 2,
	                fill=True,
	                stroke=False,
	            )
	            c.setFillColor(colors.black)
	        c.drawString(x, y, line)
	        y -= line_height

	    # NOTE(review): the two header strings contain emoji, which the built-in
	    # Helvetica fonts presumably cannot render - confirm in the generated PDF.
	    c.setFont("Helvetica-Bold", 14)
	    c.drawString(x, y, f"📄 AI Detection Report: {filename}")
	    y -= 25
	    c.setFont(body_font, font_size)
	    c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
	    y -= 30

	    for para_result in results["results"]:
	        for sentence, is_ai, _ in para_result:
	            current_line = ""
	            # A blank sentence yields no words and therefore draws nothing.
	            for word in sentence.split():
	                candidate = f"{current_line} {word}".strip()
	                too_wide = c.stringWidth(candidate, body_font, font_size) > max_text_width
	                # Flush only when something is pending: a first word that is
	                # wider than the text area must not produce an empty line
	                # (it is still drawn unbroken and may overrun the margin).
	                if too_wide and current_line:
	                    emit_line(current_line, is_ai)
	                    current_line = word
	                else:
	                    current_line = candidate

	            if current_line:
	                emit_line(current_line, is_ai)

	        # Blank line between paragraphs.
	        y -= line_height

	    c.save()
	    buffer.seek(0)
	    return buffer