import sys
import os
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import fitz  # PyMuPDF for PDF processing
import difflib
import random
import torch

sys.stdout.reconfigure(encoding='utf-8')
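# Note: sys.stdout.reconfigure() requires Python 3.7+. It forces UTF-8 output
# so the emoji status markers in the print statements below do not crash on
# consoles whose default encoding is not UTF-8.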
def load_model():
    """
    Loads AI Rewriter model:
    - Local run -> loads from ./ai_rewriter_model/
    - Hugging Face Spaces -> loads from HF Hub repo
    """
    HF_MODEL_REPO = "AlyanAkram/StealthWriter_Rewriter"
    HF_MODEL_SUBFOLDER = "Ai_rewriter/ai_rewriter_model"
    HF_TOKEN = os.getenv("HF_TOKEN")
    RUNNING_ON_SPACES = os.environ.get("HF_SPACE") or os.environ.get("SPACE_ID")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"📦 Loading model to: {device}")

    try:
        if RUNNING_ON_SPACES:
            print("🌐 Running on Hugging Face Spaces → loading from HF Hub...")
            tokenizer = T5Tokenizer.from_pretrained(HF_MODEL_REPO, subfolder=HF_MODEL_SUBFOLDER, token=HF_TOKEN)
            model = T5ForConditionalGeneration.from_pretrained(HF_MODEL_REPO, subfolder=HF_MODEL_SUBFOLDER, token=HF_TOKEN).to(device)
        else:
            print("💻 Running locally → loading from local files...")
            local_path = os.path.join(os.path.dirname(__file__), "ai_rewriter_model")
            tokenizer = T5Tokenizer.from_pretrained(local_path)
            model = T5ForConditionalGeneration.from_pretrained(local_path).to(device)
        return tokenizer, model, device
    except Exception as e:
        print(f"⚠️ Model load failed ({e}) → using t5-small fallback")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
        model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
        return tokenizer, model, device
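# Usage sketch: load once at startup and reuse for all rewrites.
#   tokenizer, model, device = load_model()
# The tokenizer/model pair is then threaded through conservative_rewrite()
# and process_document_safely() below, so the model is only loaded once.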
def is_pure_junk(text):
    """Identify text that is 100% junk and should be deleted"""
    text_lower = text.lower().strip()

    # System metadata - DELETE ENTIRELY
    system_junk = [
        r'submission\s+id', r'page\s+\d+\s+of', r'trn:oid', r'turnitin',
        r'file\s+name', r'file\s+size', r'document\s+details',
        r'\d+%\s+detected', r'ai.*generated', r'download\s+date',
        r'qualifying\s+text', r'false\s+positives', r'cyan.*purple',
        r'cover\s+page\s+submission', r'ai\s+writing\s+overview'
    ]
    for pattern in system_junk:
        if re.search(pattern, text_lower):
            return True

    # Random university spam - DELETE ENTIRELY
    university_spam = [
        r'\b(harvard|yale|stanford|mit|berkeley)\s+university\b',
        r'\buniversity\s+of\s+(california|pennsylvania|maryland)\b',
        r'\bharvard\s+(business\s+school|law\s+school)\b',
        r'\bjournal\s+of\s+the\s+american\s+medical\s+association\b'
    ]
    for pattern in university_spam:
        if re.search(pattern, text_lower):
            return True

    # Nonsensical fragments - DELETE ENTIRELY
    nonsense = [
        r'\bthe\s+sex\s+of\b', r'\bschool\s+sex\b', r'\bis\s+a\s+sexy\b',
        r'\bpunctuation\b(?!\s+marks)', r'\bchemistry\b(?!\s+class)',
        r'\bjournalist?\b(?!\s+report)', r'\bacademic\s+heading\s+to\s+be\s+clear\b',
        r'\bwrite.*heading.*concise\b', r'\bmaterials\s+for\s+the\s+synthesis\s+of\s+materials\b',
        r'\bcolleges\s+and\s+colleges\s+are\s+a\s+great\s+way\b'
    ]
    for pattern in nonsense:
        if re.search(pattern, text_lower):
            return True

    return False
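# Example: is_pure_junk("Submission ID trn:oid:12345") -> True (system metadata),
# while ordinary assignment prose returns False and is kept for rewriting.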
def is_real_heading(text):
    """STRICT heading detection - only actual headings"""
    text = text.strip()

    # Learning objectives (these are real headings)
    # Allow one or two leading capitals so all the commented examples match.
    if re.match(r'^[A-Z]{1,2}\.?[A-Z]?\d+\.?\s', text):  # P1., C.P6, BC.D2, etc.
        return True

    # Activity headings
    if re.match(r'^IT\s+Systems?\s+[Aa]ctivity\s+\d+', text, re.IGNORECASE):
        return True

    # Learning Aim headings
    if re.match(r'^Learning\s+[Aa]im\s+[A-Z]:', text, re.IGNORECASE):
        return True

    # Main section headings
    main_sections = [
        r'^Contents?$', r'^Overview$', r'^Summary$', r'^Introduction$',
        r'^Data\s+and\s+information\s+use', r'^Hardware\s+and\s+software',
        r'^Stakeholder\s+Impact', r'^Internal\s+and\s+External'
    ]
    for pattern in main_sections:
        if re.match(pattern, text, re.IGNORECASE):
            return True

    return False
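# Example: is_real_heading("Learning Aim A: IT systems") -> True,
# but is_real_heading("The organization uses email daily.") -> False.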
def semantic_coherence_check(text):
    """Check if content makes logical sense in the IT/business context"""
    text_lower = text.lower()

    # Check for logical inconsistencies
    logical_errors = [
        # Contradictory statements
        (r'\binternal.*external.*internal\b', "Contradictory internal/external"),
        (r'\bsoftware.*hardware.*software\b', "Contradictory software/hardware"),
        # Nonsensical combinations
        (r'\bprinters.*email\b', "Printers don't send emails"),
        (r'\bwi-fi.*cables\b', "Wi-Fi vs cables contradiction"),
        (r'\bcloud.*physical.*storage\b', "Cloud vs physical confusion"),
        # Subject-verb disagreements in context
        (r'\bdata.*are.*information.*is\b', "Data/information verb disagreement"),
        (r'\bstakeholder.*communicate.*themselves\b', "Reflexive pronoun error"),
        # Circular or redundant logic
        (r'\bthe.*of.*the.*of.*the\b', "Too many 'of' chains"),
        (r'\bwhich.*that.*which.*that\b', "Confusing relative clauses"),
        (r'\benables.*to.*allow.*to\b', "Redundant enabling/allowing"),
    ]
    for pattern, error_type in logical_errors:
        if re.search(pattern, text_lower):
            return False, error_type

    # Check for proper IT concepts
    it_concept_pairs = [
        ('backup', ['data', 'files', 'system', 'recovery']),
        ('network', ['connection', 'internet', 'communication', 'access']),
        ('security', ['firewall', 'encryption', 'protection', 'threat']),
        ('stakeholder', ['internal', 'external', 'staff', 'client', 'customer']),
        ('software', ['application', 'program', 'system', 'tool']),
        ('hardware', ['computer', 'device', 'equipment', 'physical'])
    ]

    # If a concept is mentioned, related terms should make sense
    for concept, related_terms in it_concept_pairs:
        if concept in text_lower:
            # Check if the context makes sense
            concept_sentences = [s for s in text.split('.') if concept in s.lower()]
            for sentence in concept_sentences:
                # Very basic context check
                if any(related in sentence.lower() for related in related_terms):
                    continue
                else:
                    # Check if it's used in a completely wrong context
                    wrong_contexts = {
                        'backup': ['singing', 'dancing', 'performing'],
                        'network': ['friendship', 'social media'],
                        'security': ['building security', 'guard'],
                        'hardware': ['construction', 'tools', 'nails']
                    }
                    if concept in wrong_contexts:
                        for wrong in wrong_contexts[concept]:
                            if wrong in sentence.lower():
                                return False, f"Wrong context for {concept}"

    return True, "Coherent"
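# Example: semantic_coherence_check("Printers handle all email routing.")
# returns (False, "Printers don't send emails") because the text trips the
# r'\bprinters.*email\b' pattern; clean IT prose returns (True, "Coherent").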
def factual_accuracy_check(text):
    """Check for factual accuracy about JM Shows and IT systems"""
    text_lower = text.lower()

    # Known facts about JM Shows that should be consistent
    known_facts = {
        'jm shows': ['educational', 'theater', 'organization', 'jamal', 'manager'],
        'jamal moulin': ['manager', 'jm shows'],
        'btec': ['qualification', 'education', 'it systems'],
        'stakeholders': ['internal', 'external', 'staff', 'customers', 'schools']
    }

    # Check for factual inconsistencies
    for entity, expected_context in known_facts.items():
        if entity in text_lower:
            entity_sentences = [s for s in text.split('.') if entity in s.lower()]
            for sentence in entity_sentences:
                # Check for contradictory information
                contradictions = {
                    'jm shows': ['manufacturing', 'restaurant', 'hospital', 'bank'],
                    'jamal moulin': ['student', 'teacher', 'client'],
                    'btec': ['university degree', 'masters', 'phd']
                }
                if entity in contradictions:
                    for contradiction in contradictions[entity]:
                        if contradiction in sentence.lower():
                            return False, f"Factual error: {entity} is not {contradiction}"

    return True, "Factually accurate"
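# Example: factual_accuracy_check("JM Shows is a manufacturing firm.")
# returns (False, "Factual error: jm shows is not manufacturing"), since the
# checker treats JM Shows as an educational theater organization.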
def quality_check_rewrite(original, rewritten):
    """BRUTAL quality check - reject if rewrite is worse"""
    if not rewritten or len(rewritten.strip()) < 5:
        return False, "Too short"

    original = original.strip()
    rewritten = rewritten.strip()

    # Reject if too different in length
    if len(rewritten) < len(original) * 0.4 or len(rewritten) > len(original) * 2.2:
        return False, "Length change too extreme"

    # SEMANTIC COHERENCE CHECK
    is_coherent, coherence_error = semantic_coherence_check(rewritten)
    if not is_coherent:
        return False, f"Semantic error: {coherence_error}"

    # FACTUAL ACCURACY CHECK
    is_accurate, accuracy_error = factual_accuracy_check(rewritten)
    if not is_accurate:
        return False, f"Factual error: {accuracy_error}"

    # Check for awkward AI phrases that make text worse
    ai_awkwardness = [
        r'\butilizing\b',                                  # AI loves this word
        r'\bin\s+order\s+to\b',                            # Verbose AI phrase
        r'\bfacilitate\b.*\bin\s+its\b',                   # Awkward construction
        r'\benables?\s+.*\s+to\s+.*\s+to\b',               # Double "to" constructions
        r'\bthe\s+.*\s+of\s+.*\s+of\s+.*\s+of\b',          # Triple "of" chains
        r'\baccordingly\s+utilizing\b',                    # Robotic phrasing
        r'\benabling\s+a\s+number\s+of\s+individuals\b',   # Overly formal
        r'\bthe\s+.*\s+of\s+the\s+.*\s+of\s+the\b',        # Repetitive structure
        r'\bwhich\s+.*\s+that\s+.*\s+which\b',             # Confusing relative clauses
    ]
    awkward_count = sum(1 for pattern in ai_awkwardness
                        if re.search(pattern, rewritten, re.IGNORECASE))
    if awkward_count > 1:  # Even stricter now
        return False, f"Too many awkward AI phrases ({awkward_count})"

    # Check sentence structure quality
    sentences = [s.strip() for s in rewritten.split('.') if s.strip()]
    for sentence in sentences:
        # Check for overly long sentences (likely AI verbosity)
        if len(sentence.split()) > 40:
            return False, "Sentence too long and complex"
        # Check for repetitive structures (any word used more than twice)
        words = sentence.lower().split()
        if any(words.count(w) > 2 for w in words):
            return False, "Too much word repetition in sentence"

    # Check if key technical terms are preserved
    key_terms = re.findall(r'\b(JM\s+Shows|IT|software|hardware|data|information|stakeholder|organization)\b',
                           original, re.IGNORECASE)
    preserved = sum(1 for term in key_terms
                    if re.search(re.escape(term), rewritten, re.IGNORECASE))
    if key_terms and preserved / len(key_terms) < 0.7:
        return False, "Lost too many key terms"

    # Use similarity check - if too different, probably worse
    similarity = difflib.SequenceMatcher(None, original.lower(), rewritten.lower()).ratio()
    if similarity < 0.3:  # More lenient for structure changes
        return False, f"Too different from original (similarity: {similarity:.2f})"

    # Check that meaning is preserved by looking for key concepts
    original_concepts = set(re.findall(r'\b(schedul\w+|manag\w+|communicat\w+|secur\w+|collaborat\w+)\b',
                                       original.lower()))
    rewritten_concepts = set(re.findall(r'\b(schedul\w+|manag\w+|communicat\w+|secur\w+|collaborat\w+)\b',
                                        rewritten.lower()))
    if original_concepts and len(original_concepts.intersection(rewritten_concepts)) / len(original_concepts) < 0.6:
        return False, "Key concepts not preserved"

    return True, "Acceptable"
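# Usage sketch: the gate returns a (bool, reason) pair so callers can log why
# a candidate was rejected, e.g.
#   ok, reason = quality_check_rewrite(original, candidate)
#   if not ok:
#       print(f"rejected: {reason}")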
def advanced_sentence_restructuring(text):
    """Advanced sentence restructuring to make text more human-like"""
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if not sentences:
        return text

    restructured = []
    for sentence in sentences:
        # Multiple restructuring techniques, applied in sequence
        sentence = restructure_complex_sentences(sentence)
        sentence = vary_sentence_beginnings(sentence)
        sentence = change_voice_patterns(sentence)
        sentence = rearrange_clauses(sentence)
        sentence = vary_connecting_words(sentence)
        restructured.append(sentence)

    return ' '.join(restructured).replace('..', '.')

def restructure_complex_sentences(sentence):
    """Break down and restructure complex sentences"""
    # Pattern 1: "X, which Y, Z" -> "X does Z. This is because Y."
    which_pattern = r'(.+?),\s*which\s+(.+?),\s*(.+)'
    match = re.search(which_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.4:
        main_part = match.group(1).strip()
        which_part = match.group(2).strip()
        end_part = match.group(3).strip().rstrip('.')
        return f"{main_part} {end_part}. This happens because {which_part}."

    # Pattern 2: "Due to X, Y happens" -> "Y happens. The reason is X."
    due_pattern = r'due\s+to\s+(.+?),\s*(.+)'
    match = re.search(due_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.4:
        reason = match.group(1).strip()
        result = match.group(2).strip().rstrip('.')
        return f"{result.capitalize()}. The reason is {reason}."

    # Pattern 3: "Although X, Y" -> "Y, even though X" or "Y. However, X"
    although_pattern = r'although\s+(.+?),\s*(.+)'
    match = re.search(although_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.4:
        condition = match.group(1).strip()
        main_clause = match.group(2).strip().rstrip('.')
        if random.random() < 0.5:
            return f"{main_clause.capitalize()}, even though {condition}."
        else:
            return f"{main_clause.capitalize()}. However, {condition}."

    # Pattern 4: Long sentences with multiple "and" -> Split them
    if sentence.count(' and ') >= 2 and len(sentence.split()) > 20:
        parts = sentence.split(' and ')
        if len(parts) >= 3:
            # First two parts become one sentence; the rest becomes a second one
            first_part = f"{parts[0]} and {parts[1]}."
            remaining = ' and '.join(parts[2:]).rstrip('.')
            return f"{first_part} Additionally, {remaining}."

    return sentence
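# Example (probabilistic - each pattern fires with probability 0.4):
#   "Due to staff shortages, shows were cancelled."
# may become
#   "Shows were cancelled. The reason is staff shortages."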
def vary_sentence_beginnings(sentence):
    """Change how sentences start to avoid AI patterns"""
    sentence = sentence.strip()

    # Don't change if it's already starting with something interesting
    interesting_starts = ['However', 'Therefore', 'Additionally', 'Meanwhile', 'Furthermore',
                          'Moreover', 'Consequently', 'Subsequently', 'Nevertheless']
    if any(sentence.startswith(start) for start in interesting_starts):
        return sentence

    # Pattern 1: "The organization uses X" -> "JM Shows uses X" or "The company uses X"
    if sentence.lower().startswith('the organization'):
        alternatives = ['JM Shows', 'The company', 'This organization', 'The business']
        chosen = random.choice(alternatives)
        sentence = re.sub(r'^the organization\b', chosen, sentence, flags=re.IGNORECASE)

    # Pattern 2: "This enables" -> Various alternatives
    if sentence.lower().startswith('this enables'):
        alternatives = [
            'This allows', 'This helps', 'This makes it possible for',
            'As a result,', 'Because of this,'
        ]
        chosen = random.choice(alternatives)
        sentence = re.sub(r'^this enables?\b', chosen, sentence, flags=re.IGNORECASE)

    # Pattern 3: "it is important" -> alternatives like "it is crucial" / "this is key"
    if 'it is important' in sentence.lower():
        alternatives = [
            'it is crucial', 'it is essential', 'it matters',
            'this is vital', 'this is key'
        ]
        chosen = random.choice(alternatives)
        sentence = re.sub(r'\bit is important\b', chosen, sentence, flags=re.IGNORECASE)

    # Pattern 4: Add transitional phrases sometimes
    if random.random() < 0.3 and not sentence.lower().startswith(('the', 'this', 'it', 'a')):
        transitions = [
            'In practice, ', 'For example, ', 'In this case, ', 'Typically, ',
            'Usually, ', 'Often, ', 'Generally, '
        ]
        if not any(sentence.startswith(t.strip()) for t in transitions):
            chosen_transition = random.choice(transitions)
            # Lowercase only the first character so proper nouns are preserved
            sentence = chosen_transition + sentence[:1].lower() + sentence[1:]

    return sentence
def change_voice_patterns(sentence):
    """Change between active and passive voice naturally"""
    # Passive to active transformations
    passive_patterns = [
        # "X is used by Y" -> "Y uses X"
        (r'(\w+(?:\s+\w+)*)\s+is\s+used\s+by\s+(\w+(?:\s+\w+)*)', r'\2 uses \1'),
        # "X is managed by Y" -> "Y manages X"
        (r'(\w+(?:\s+\w+)*)\s+is\s+managed\s+by\s+(\w+(?:\s+\w+)*)', r'\2 manages \1'),
        # "X is implemented by Y" -> "Y implements X"
        (r'(\w+(?:\s+\w+)*)\s+is\s+implemented\s+by\s+(\w+(?:\s+\w+)*)', r'\2 implements \1'),
        # "X are accessed by Y" -> "Y accesses X"
        (r'(\w+(?:\s+\w+)*)\s+are\s+accessed\s+by\s+(\w+(?:\s+\w+)*)', r'\2 accesses \1'),
    ]

    # Draw once so the branch probabilities match the comments:
    # passive->active 50% of the time, active->passive 25% of the time.
    roll = random.random()
    if roll < 0.5:
        for pattern, replacement in passive_patterns:
            if re.search(pattern, sentence, re.IGNORECASE):
                sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
                break
    # Active to passive transformations (less common, 25% chance)
    elif roll < 0.75:
        active_patterns = [
            # "Y uses X" -> "X is used by Y"
            (r'(\w+(?:\s+\w+)*)\s+uses\s+(\w+(?:\s+\w+)*)', r'\2 is used by \1'),
            # "Y manages X" -> "X is managed by Y"
            (r'(\w+(?:\s+\w+)*)\s+manages\s+(\w+(?:\s+\w+)*)', r'\2 is managed by \1'),
        ]
        for pattern, replacement in active_patterns:
            if re.search(pattern, sentence, re.IGNORECASE):
                sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
                break

    return sentence
def rearrange_clauses(sentence):
    """Rearrange clauses within sentences"""
    def _decapitalize(clause):
        # Lowercase only the first character so proper nouns survive the move
        return clause[:1].lower() + clause[1:]

    # Pattern 1: "X because Y" -> "Because Y, X"
    because_pattern = r'(.+?)\s+because\s+(.+)'
    match = re.search(because_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.3:
        main_clause = match.group(1).strip()
        because_clause = match.group(2).strip().rstrip('.')
        return f"Because {because_clause}, {_decapitalize(main_clause)}."

    # Pattern 2: "X when Y" -> "When Y, X"
    when_pattern = r'(.+?)\s+when\s+(.+)'
    match = re.search(when_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.3:
        main_clause = match.group(1).strip()
        when_clause = match.group(2).strip().rstrip('.')
        return f"When {when_clause}, {_decapitalize(main_clause)}."

    # Pattern 3: Move prepositional phrases
    # "The system stores data in the cloud" -> "In the cloud, the system stores data"
    prep_phrases = ['in the cloud', 'on the network', 'within the organization',
                    'through the system', 'via the software', 'using the hardware']
    for phrase in prep_phrases:
        if phrase in sentence.lower() and random.random() < 0.25:
            # Find the phrase and move it to the beginning
            phrase_pattern = re.escape(phrase)
            if re.search(r'\b' + phrase_pattern + r'\b', sentence, re.IGNORECASE):
                # Remove from current position
                new_sentence = re.sub(r',?\s*\b' + phrase_pattern + r'\b,?', '', sentence, flags=re.IGNORECASE)
                # Add to beginning
                new_sentence = f"{phrase.capitalize()}, {_decapitalize(new_sentence.strip())}"
                return new_sentence

    return sentence
def vary_connecting_words(sentence):
    """Replace connecting words with alternatives"""
    connectors = {
        'and': ['plus', 'as well as', 'along with', 'together with'],
        'but': ['however', 'yet', 'though', 'although'],
        'so': ['therefore', 'thus', 'as a result', 'consequently'],
        'also': ['additionally', 'furthermore', 'moreover', 'as well'],
        'however': ['but', 'yet', 'though', 'still'],
        'therefore': ['so', 'thus', 'as a result', 'consequently'],
        'because': ['since', 'as', 'due to the fact that'],
        'while': ['whereas', 'although', 'though'],
    }

    # Replace connectors (30% chance per connector)
    for original, alternatives in connectors.items():
        if f' {original} ' in sentence.lower() and random.random() < 0.3:
            chosen_alternative = random.choice(alternatives)
            sentence = re.sub(r'\b' + re.escape(original) + r'\b', chosen_alternative,
                              sentence, flags=re.IGNORECASE, count=1)
            break  # Only replace one per sentence
    return sentence
def add_human_imperfections(text, imperfection_rate=0.25):
    """Add subtle human-like imperfections and natural variations"""
    if len(text.split()) < 10:
        return text

    # First apply advanced sentence restructuring
    text = advanced_sentence_restructuring(text)

    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if not sentences:
        return text

    modified_sentences = []
    for sentence in sentences:
        # Modify each sentence with probability imperfection_rate, so a
        # higher rate means more human-like variation; the rest pass through.
        if random.random() > imperfection_rate:
            modified_sentences.append(sentence)
            continue

        # Choose multiple techniques per sentence (more natural)
        techniques = [
            'slight_grammar_variation',
            'word_variation',
            'contraction_change',
            'sentence_connector_variation',
            'add_filler_words',
            'vary_punctuation'
        ]
        # Apply 1-2 techniques per modified sentence
        num_techniques = random.choices([1, 2], weights=[0.7, 0.3])[0]
        selected_techniques = random.sample(techniques, num_techniques)

        for technique in selected_techniques:
            if technique == 'slight_grammar_variation':
                sentence = add_subtle_grammar_variations(sentence)
            elif technique == 'word_variation':
                sentence = vary_word_choice_extensively(sentence)
            elif technique == 'contraction_change':
                sentence = modify_contractions_naturally(sentence)
            elif technique == 'sentence_connector_variation':
                sentence = vary_sentence_connectors(sentence)
            elif technique == 'add_filler_words':
                sentence = add_natural_filler_words(sentence)
            elif technique == 'vary_punctuation':
                sentence = vary_punctuation_style(sentence)

        modified_sentences.append(sentence)

    result = ' '.join(modified_sentences).replace('..', '.')

    # Final pass: add some natural flow variations
    result = add_natural_flow_variations(result)

    return result
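# Pipeline sketch: restructuring runs first over the whole text, then each
# sentence is (with probability imperfection_rate) passed through 1-2 of the
# variation techniques, and a final flow pass merges or splits sentences:
#   humanized = add_human_imperfections("Some paragraph of ten plus words ...")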
def add_subtle_grammar_variations(sentence):
    """Add realistic grammar variations humans use"""
    # Acceptable variations that humans commonly use
    variations = [
        # "data is" vs "data are" - both acceptable
        (r'\bdata are\b', 'data is', 0.4),
        # "different from" vs "different to" - both used
        (r'\bdifferent from\b', 'different to', 0.3),
        # "compared to" vs "compared with"
        (r'\bcompared with\b', 'compared to', 0.3),
        # "focused on" vs "focused around"
        (r'\bfocused on\b', 'focused around', 0.2),
        # "try to" vs "try and" - both acceptable
        (r'\btry to\b', 'try and', 0.3),
        # Article variations
        (r'\bthe software\b', 'software', 0.2),
        (r'\bthe hardware\b', 'hardware', 0.2),
        (r'\bthe data\b', 'data', 0.2),
    ]
    for pattern, replacement, probability in variations:
        if re.search(pattern, sentence, re.IGNORECASE) and random.random() < probability:
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
            break  # Only one variation per sentence
    return sentence
def vary_word_choice_extensively(sentence):
    """More extensive word variations that sound natural"""
    # Extended word variations - more natural alternatives
    word_variations = {
        # Formal to casual
        'utilize': ['use', 'employ', 'apply'],
        'demonstrate': ['show', 'display', 'reveal'],
        'facilitate': ['help', 'enable', 'support'],
        'implement': ['use', 'apply', 'put in place'],
        'obtain': ['get', 'acquire', 'receive'],
        'purchase': ['buy', 'get', 'acquire'],
        'commence': ['start', 'begin', 'kick off'],
        'assist': ['help', 'support', 'aid'],
        'require': ['need', 'call for', 'demand'],
        'provide': ['give', 'offer', 'supply'],
        'ensure': ['make sure', 'guarantee', 'see to it that'],
        'maintain': ['keep', 'preserve', 'uphold'],
        'establish': ['set up', 'create', 'build'],
        'subsequently': ['then', 'after that', 'next'],
        'accordingly': ['so', 'therefore', 'as a result'],
        'furthermore': ['also', 'plus', 'on top of that'],
        'nevertheless': ['but', 'however', 'still'],
        'approximately': ['about', 'around', 'roughly'],
        'numerous': ['many', 'lots of', 'plenty of'],
        'sufficient': ['enough', 'adequate', 'plenty'],
        'essential': ['vital', 'key', 'crucial'],
        'significant': ['major', 'important', 'big'],
        'substantial': ['large', 'considerable', 'major'],
        # Casual to varied
        'really': ['very', 'quite', 'pretty'],
        'big': ['large', 'major', 'significant'],
        'small': ['little', 'minor', 'compact'],
        'good': ['effective', 'useful', 'beneficial'],
        'bad': ['poor', 'ineffective', 'problematic'],
        'fast': ['quick', 'rapid', 'speedy'],
        'slow': ['gradual', 'delayed', 'unhurried'],
        # Technical variations
        'system': ['platform', 'setup', 'framework'],
        'software': ['application', 'program', 'tool'],
        'hardware': ['equipment', 'devices', 'machinery'],
        'network': ['connection', 'system', 'infrastructure'],
        'database': ['data store', 'information system', 'repository'],
        'security': ['protection', 'safety', 'safeguarding'],
        'organization': ['company', 'business', 'firm'],
        'employee': ['staff member', 'worker', 'team member'],
        'customer': ['client', 'user', 'consumer'],
        'manager': ['supervisor', 'leader', 'head'],
    }

    # Apply variations (40% chance per word)
    for formal, alternatives in word_variations.items():
        if f' {formal} ' in sentence.lower() and random.random() < 0.4:
            chosen_alternative = random.choice(alternatives)
            sentence = re.sub(r'\b' + re.escape(formal) + r'\b', chosen_alternative,
                              sentence, flags=re.IGNORECASE, count=1)
            break  # Only one substitution per sentence
    return sentence
def modify_contractions_naturally(sentence):
    """Natural contraction usage like humans"""
    if random.random() < 0.5:
        # Add contractions (more casual/natural)
        contractions = {
            'it is': "it's",
            'they are': "they're",
            'there is': "there's",
            'that is': "that's",
            'cannot': "can't",
            'do not': "don't",
            'does not': "doesn't",
            'will not': "won't",
            'would not': "wouldn't",
            'should not': "shouldn't",
            'could not': "couldn't",
            'have not': "haven't",
            'has not': "hasn't",
            'had not': "hadn't",
            'are not': "aren't",
            'is not': "isn't",
            'was not': "wasn't",
            'were not': "weren't"
        }
        for full, contracted in contractions.items():
            if full in sentence.lower() and random.random() < 0.6:
                sentence = re.sub(r'\b' + re.escape(full) + r'\b', contracted, sentence, flags=re.IGNORECASE)
                break
    else:
        # Remove contractions (more formal when needed)
        expansions = {
            "it's": 'it is',
            "they're": 'they are',
            "there's": 'there is',
            "that's": 'that is',
            "can't": 'cannot',
            "don't": 'do not',
            "doesn't": 'does not',
            "won't": 'will not',
            "wouldn't": 'would not',
            "shouldn't": 'should not',
            "couldn't": 'could not',
            "haven't": 'have not',
            "hasn't": 'has not',
            "hadn't": 'had not',
            "aren't": 'are not',
            "isn't": 'is not',
            "wasn't": 'was not',
            "weren't": 'were not'
        }
        for contracted, full in expansions.items():
            if contracted in sentence and random.random() < 0.4:
                sentence = sentence.replace(contracted, full)
                break
    return sentence
def vary_sentence_connectors(sentence):
    """Add variety to how sentences connect ideas"""
    # Add sentence starters that humans use
    if random.random() < 0.3:
        starters = [
            'In fact, ', 'Actually, ', 'Basically, ', 'Essentially, ',
            'In practice, ', 'For instance, ', 'For example, ', 'Specifically, ',
            'In particular, ', 'More importantly, ', 'What\'s more, ', 'Plus, ',
            'On top of that, ', 'Besides that, ', 'Apart from that, '
        ]
        # Don't add if sentence already starts with a connector
        if not any(sentence.lower().startswith(word.lower()) for word in
                   ['the', 'this', 'it', 'however', 'therefore', 'furthermore', 'moreover']):
            if random.random() < 0.4:
                chosen_starter = random.choice(starters)
                # Lowercase only the first character so proper nouns are preserved
                sentence = chosen_starter + sentence[:1].lower() + sentence[1:]

    # Replace mid-sentence connectors
    connector_replacements = {
        ' and ': [' plus ', ' as well as ', ' along with '],
        ' but ': [' however ', ' though ', ' yet '],
        ' so ': [' therefore ', ' thus ', ' as a result '],
        ' because ': [' since ', ' as ', ' given that '],
        ' although ': [' while ', ' even though ', ' despite the fact that '],
        ' therefore ': [' so ', ' thus ', ' as a result '],
        ' however ': [' but ', ' though ', ' yet '],
        ' moreover ': [' also ', ' plus ', ' on top of that '],
        ' furthermore ': [' also ', ' what\'s more ', ' besides '],
    }
    for original, alternatives in connector_replacements.items():
        if original in sentence.lower() and random.random() < 0.3:
            chosen_alternative = random.choice(alternatives)
            sentence = re.sub(re.escape(original), chosen_alternative, sentence, flags=re.IGNORECASE, count=1)
            break
    return sentence
def add_natural_filler_words(sentence):
    """Add natural filler words/phrases that humans use"""
    # Natural emphasis and filler phrases
    fillers = {
        'important': ['really important', 'quite important', 'very important'],
        'useful': ['really useful', 'quite useful', 'very useful'],
        'effective': ['highly effective', 'really effective', 'quite effective'],
        'necessary': ['absolutely necessary', 'really necessary', 'quite necessary'],
        'helps': ['actually helps', 'really helps', 'definitely helps'],
        'allows': ['actually allows', 'really allows', 'basically allows'],
        'enables': ['actually enables', 'really enables', 'effectively enables'],
        'provides': ['actually provides', 'really provides', 'effectively provides'],
    }

    # Add natural hedging/softening
    hedges = {
        'is': ['tends to be', 'is generally', 'is usually'],
        'are': ['tend to be', 'are generally', 'are usually'],
        'will': ['will likely', 'will probably', 'will generally'],
        'can': ['can often', 'can usually', 'can typically'],
        'must': ['should really', 'needs to', 'ought to'],
    }

    # Apply fillers (20% chance)
    if random.random() < 0.2:
        for word, alternatives in fillers.items():
            if f' {word} ' in sentence.lower() and random.random() < 0.5:
                chosen_alternative = random.choice(alternatives)
                sentence = re.sub(r'\b' + re.escape(word) + r'\b', chosen_alternative,
                                  sentence, flags=re.IGNORECASE, count=1)
                break

    # Apply hedges (15% chance)
    if random.random() < 0.15:
        for word, alternatives in hedges.items():
            if f' {word} ' in sentence.lower() and random.random() < 0.4:
                chosen_alternative = random.choice(alternatives)
                sentence = re.sub(r'\b' + re.escape(word) + r'\b', chosen_alternative,
                                  sentence, flags=re.IGNORECASE, count=1)
                break

    return sentence
def vary_punctuation_style(sentence):
    """Vary punctuation style naturally"""
    # Sometimes use semicolons instead of periods for related ideas
    if ' and ' in sentence and random.random() < 0.15:
        # "X does A and Y does B" -> "X does A; Y does B"
        parts = sentence.split(' and ', 1)
        if len(parts) == 2 and len(parts[0]) > 10 and len(parts[1]) > 10:
            sentence = f"{parts[0]}; {parts[1]}"

    # Sometimes use em dashes for emphasis
    if ' - ' in sentence and random.random() < 0.3:
        sentence = sentence.replace(' - ', ' — ')

    # Sometimes use colons for explanations
    if 'because' in sentence.lower() and random.random() < 0.2:
        # "X happens because Y" -> "X happens: Y"
        because_match = re.search(r'(.+?)\s+because\s+(.+)', sentence, re.IGNORECASE)
        if because_match:
            main_part = because_match.group(1).strip()
            reason = because_match.group(2).strip().rstrip('.')
            sentence = f"{main_part}: {reason}."
    return sentence
def add_natural_flow_variations(text):
    """Add natural flow variations across the entire text"""
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if len(sentences) < 2:
        return text

    # Vary sentence lengths naturally
    modified_sentences = []
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue  # already consumed by a previous merge

        # Sometimes combine short sentences
        if (i < len(sentences) - 1 and
                len(sentence.split()) < 8 and
                len(sentences[i + 1].split()) < 8 and
                random.random() < 0.3):
            next_sentence = sentences[i + 1].rstrip('.')
            next_lowered = next_sentence[:1].lower() + next_sentence[1:]
            combined_sentences = [
                f"{sentence.rstrip('.')} and {next_lowered}.",
                f"{sentence.rstrip('.')}, while {next_lowered}.",
                f"{sentence.rstrip('.')}; {next_lowered}.",
            ]
            chosen = random.choice(combined_sentences)
            modified_sentences.append(chosen)
            # Mark the next sentence as consumed since we combined it
            sentences[i + 1] = ""
        # Sometimes split very long sentences
        elif len(sentence.split()) > 30 and random.random() < 0.4:
            # Look for natural break points (split case-insensitively so the
            # check against sentence.lower() and the split always agree)
            break_points = [', and ', ', but ', ', however ', ', therefore ']
            for break_point in break_points:
                if break_point in sentence.lower():
                    parts = re.split(re.escape(break_point), sentence, maxsplit=1, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        # strip(' ,') drops the leading comma too, so ', and '
                        # becomes the connector "And" rather than ", and"
                        connector = break_point.strip(' ,').capitalize()
                        split_version = f"{parts[0].strip()}. {connector}, {parts[1].strip()}"
                        modified_sentences.append(split_version)
                        break
            else:
                modified_sentences.append(sentence)
        else:
            modified_sentences.append(sentence)

    return ' '.join(modified_sentences).replace('..', '.')
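# Example: two short sentences such as "Email is fast." and "It is cheap."
# may be merged into "Email is fast and it is cheap." (30% chance per pair),
# while 30+ word sentences are split at ", and" / ", but" style break points.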
def conservative_rewrite(text, tokenizer, model):
    """Enhanced rewrite with advanced humanization"""
    original = text.strip()

    # Don't rewrite very short text
    if len(original.split()) < 8:
        # Still apply light humanization to short text
        return add_human_imperfections(original, imperfection_rate=0.15)

    # Pre-check: if original already has issues, be extra careful
    original_coherent, _ = semantic_coherence_check(original)
    original_accurate, _ = factual_accuracy_check(original)

    if not original_coherent or not original_accurate:
        print("⚠️ Original has issues, applying safe humanization only")
        # Just apply humanization without AI rewriting
        result = add_human_imperfections(original, imperfection_rate=0.2)
        return result.strip()

    # Try multiple approaches with different prompts for more variation
    prompts = [
        f"Rewrite naturally: {original}",
        f"Make this sound more human: {original}",
        f"Improve readability: {original}",
        f"Rephrase this text: {original}",
        f"Make this clearer and more natural: {original}"
    ]

    for i, prompt in enumerate(prompts):
        try:
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=300).to(model.device)
            outputs = model.generate(
                inputs.input_ids,
                max_new_tokens=len(original.split()) + 20,  # More room for natural expansion
                do_sample=True,
                temperature=0.08 + (i * 0.06),  # Gradually increase creativity
                top_p=0.5 + (i * 0.1),          # More variation in later attempts
                repetition_penalty=1.08,
                pad_token_id=tokenizer.eos_token_id
            )
            result = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Clean result more thoroughly
            if ":" in result:
                result = result.split(":", 1)[-1].strip()

            # Remove prompt artifacts
            result = re.sub(r'^(rewrite|make|improve|rephrase|naturally).*?:\s*', '', result, flags=re.IGNORECASE)
            result = re.sub(r'^(this|that|the following)\s+', '', result, flags=re.IGNORECASE)
            result = re.sub(r'\s+', ' ', result).strip()

            # Quality check
            is_good, reason = quality_check_rewrite(original, result)
            if is_good:
                print(f"✅ AI rewrite accepted (attempt {i+1})")
                # ADVANCED HUMANIZATION - This is the key enhancement
                humanized_result = add_human_imperfections(result, imperfection_rate=0.3)
                print("🔧 Applied advanced humanization")
                return humanized_result
            else:
                print(f"❌ Attempt {i+1} rejected: {reason}")
                continue

        except Exception as e:
            print(f"⚠️ Error in attempt {i+1}: {e}")
            continue

    # If all AI attempts failed, just apply advanced humanization to original
    print("⚠️ All AI attempts failed, applying advanced humanization to original")
    return add_human_imperfections(original, imperfection_rate=0.25)
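# Decoding schedule (as implemented above): attempt i uses
#   temperature = 0.08 + 0.06*i   (0.08 -> 0.32)
#   top_p       = 0.5  + 0.1*i    (0.5  -> 0.9)
# so early attempts stay close to the source and later ones take more risks;
# the first candidate that passes quality_check_rewrite() wins.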
def smart_paragraph_grouping(content_items):
    """Group fragmented text into proper paragraphs"""
    grouped = []
    current_group = []

    for item in content_items:
        text = item['text'].strip()
        if not text:
            continue

        # If it's a heading, save current group and start fresh
        if is_real_heading(text):
            if current_group:
                # Join the fragments into a paragraph
                paragraph_text = ' '.join(current_group)
                if len(paragraph_text.split()) > 5:  # Only if substantial
                    grouped.append({
                        'text': paragraph_text,
                        'type': 'paragraph'
                    })
                current_group = []
            grouped.append({
                'text': text,
                'type': 'heading'
            })
        else:
            # Add to current group
            current_group.append(text)

    # Don't forget the last group
    if current_group:
        paragraph_text = ' '.join(current_group)
        if len(paragraph_text.split()) > 5:
            grouped.append({
                'text': paragraph_text,
                'type': 'paragraph'
            })

    return grouped
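# Example output shape (hypothetical input of PDF line fragments):
#   [{'text': 'Learning Aim A: IT systems', 'type': 'heading'},
#    {'text': 'Staff use shared calendars to plan shows ...', 'type': 'paragraph'}]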
def process_document_safely(input_path, output_path, tokenizer, model):
    """Rewrite the document while keeping the exact original formatting."""
    print(f"📄 Processing with formatting preserved: {input_path}")

    # Open original DOCX directly
    doc = Document(input_path)
    improved_count = 0
    heavily_modified_count = 0

    for para in doc.paragraphs:
        if not para.text.strip():
            continue  # skip empty lines

        # Decide if we rewrite (skip headings, very short text, etc.)
        original_text = para.text
        if len(original_text.split()) < 3:
            continue  # keep tiny text untouched

        # Rewrite
        improved_text = conservative_rewrite(original_text, tokenizer, model)

        # Count improvements
        if improved_text != original_text:
            improved_count += 1
            similarity = difflib.SequenceMatcher(None, original_text.lower(), improved_text.lower()).ratio()
            if similarity < 0.6:
                heavily_modified_count += 1

        # Replace text while keeping all runs/styles
        if para.runs:
            # Capture formatting from the first run BEFORE clearing the
            # paragraph; reading para.runs[0] after clear()+add_run() would
            # just inspect the new run and copy nothing.
            first_run = para.runs[0]
            bold, italic, underline = first_run.bold, first_run.italic, first_run.underline
            font_name, font_size = first_run.font.name, first_run.font.size

            para.clear()  # remove old runs
            run = para.add_run(improved_text)
            run.bold = bold
            run.italic = italic
            run.underline = underline
            run.font.name = font_name
            run.font.size = font_size
        else:
            para.text = improved_text

    # Save document with same formatting but updated text
    doc.save(output_path)
    print(f"✅ Saved rewritten document with original formatting: {output_path}")
    print(f"   Improvements made: {improved_count} ({heavily_modified_count} major)")
def extract_docx_content(input_path):
    """Extract content from DOCX"""
    doc = Document(input_path)
    content = []
    for para in doc.paragraphs:
        if para.text.strip():
            content.append({
                'text': para.text.strip(),
                'font_size': 12,
                'is_bold': any(run.bold for run in para.runs)
            })
    return content
def extract_pdf_content(input_path):
    """Extract content from PDF"""
    doc = fitz.open(input_path)
    content = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")

        for block in blocks["blocks"]:
            if "lines" in block:
                for line in block["lines"]:
                    line_text = ""
                    font_size = 12
                    is_bold = False

                    for span in line["spans"]:
                        line_text += span["text"]
                        font_size = max(font_size, span["size"])
                        if "bold" in span["font"].lower():
                            is_bold = True

                    if line_text.strip():
                        content.append({
                            'text': line_text.strip(),
                            'font_size': font_size,
                            'is_bold': is_bold
                        })

    doc.close()
    return content
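# PyMuPDF's page.get_text("dict") nests blocks -> lines -> spans; each span
# carries its own font name and size, which is why the bold flag and the
# maximum font size are accumulated per line above.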
def generate_clean_docx(content, output_path, original_path=None):
    """Generate DOCX output with the exact same formatting as the input."""
    if not original_path or not os.path.exists(original_path):
        print("⚠️ Original file not provided or not found, falling back to basic formatting.")
        # Fall back to plain heading/paragraph output
        doc = Document()
        for item in content:
            if item['type'] == 'heading':
                doc.add_heading(item['text'], level=1)
            else:
                doc.add_paragraph(item['text'])
        doc.save(output_path)
        return

    # Load the original document
    original_doc = Document(original_path)

    # Create a new document with the same structure
    new_doc = Document()

    # Keep a counter to match rewritten paragraphs to original ones
    content_index = 0

    for para in original_doc.paragraphs:
        if not para.text.strip():
            # Preserve empty paragraphs
            new_doc.add_paragraph("")
            continue

        # Get the rewritten text for this paragraph
        if content_index < len(content):
            rewritten_text = content[content_index]['text']
        else:
            rewritten_text = para.text  # fallback

        # Create a new paragraph in the new doc with the same style
        new_para = new_doc.add_paragraph()
        new_para.style = para.style
        new_para.alignment = para.alignment

        # Preserve run formatting, but replace text
        if para.runs:
            # If the original had multiple runs, collapse the rewritten text
            # into a single run (simpler) and copy the first run's formatting
            run = new_para.add_run(rewritten_text)
            run.font.name = para.runs[0].font.name
            run.font.size = para.runs[0].font.size
            run.bold = para.runs[0].bold
            run.italic = para.runs[0].italic
            run.underline = para.runs[0].underline
        else:
            # Paragraph without runs (rare)
            new_para.add_run(rewritten_text)

        content_index += 1

    # Save final doc
    new_doc.save(output_path)
def generate_clean_pdf(content, output_path):
    """Generate clean PDF output"""
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
        from reportlab.lib.units import inch

        doc = SimpleDocTemplate(output_path, pagesize=letter, topMargin=1*inch)
        styles = getSampleStyleSheet()

        heading_style = ParagraphStyle(
            'CleanHeading',
            parent=styles['Heading1'],
            fontSize=14,
            spaceAfter=12,
            fontName='Helvetica-Bold'
        )
        body_style = ParagraphStyle(
            'CleanBody',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=8,
            fontName='Helvetica',
            alignment=0  # Left aligned
        )

        story = []
        for item in content:
            if item['type'] == 'heading':
                story.append(Paragraph(item['text'], heading_style))
                story.append(Spacer(1, 6))
            else:
                story.append(Paragraph(item['text'], body_style))
                story.append(Spacer(1, 4))

        doc.build(story)
    except ImportError:
        print("⚠️ ReportLab not installed, creating text file instead")
        with open(output_path.replace('.pdf', '.txt'), 'w', encoding='utf-8') as f:
            for item in content:
                if item['type'] == 'heading':
                    f.write(f"\n{item['text']}\n{'='*len(item['text'])}\n\n")
                else:
                    f.write(f"{item['text']}\n\n")
def main():
    if len(sys.argv) < 3:
        print("Usage: python rewriter_fixed.py <input_file_or_text> <output_file>")
        print("Examples:")
        print("  python rewriter_fixed.py input.docx output.docx")
        print("  python rewriter_fixed.py input.pdf output.pdf")
        print("  python rewriter_fixed.py \"This is my raw text\" output.txt")
        return

    input_arg = sys.argv[1]
    output_path = sys.argv[2]

    print("Loading model for enhanced human-style processing...")
    tokenizer, model, device = load_model()

    # --- If it's a file ---
    if os.path.exists(input_arg):
        ext = os.path.splitext(input_arg)[1].lower()

        if ext == ".docx":
            process_document_safely(input_arg, output_path, tokenizer, model)
        elif ext == ".pdf":
            # Extract PDF -> rewrite -> save PDF
            content = extract_pdf_content(input_arg)
            grouped = smart_paragraph_grouping(content)
            for item in grouped:
                if len(item['text'].split()) >= 3:
                    item['text'] = conservative_rewrite(item['text'], tokenizer, model)
            generate_clean_pdf(grouped, output_path)
        else:
            print(f"Unsupported file format: {ext}")
            return
    # --- If it's raw text ---
    else:
        raw_text = input_arg.strip()
        if not raw_text:
            print("No text provided.")
            return
        humanized = conservative_rewrite(raw_text, tokenizer, model)

        # Save as TXT
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(humanized)
        print(f"Humanized text saved to {output_path}")

    print("Enhanced human-style processing complete!")
def rewrite_text(original_text: str) -> str:
    """Entry point for rule-based humanization (no model required)."""
    # Add imperfections
    humanized = add_human_imperfections(original_text)

    # Run quality check
    ok, reason = quality_check_rewrite(original_text, humanized)
    if not ok:
        raise ValueError(f"Rewrite failed quality check: {reason}")
    return humanized
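# Usage sketch (hypothetical caller, e.g. a web handler):
#   try:
#       clean = rewrite_text("JM Shows uses shared calendars to plan events ...")
#   except ValueError:
#       clean = None  # fall back to the original text if the gate rejects it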
if __name__ == "__main__":
    main()