"""StealthWriter rewriter (Ai_rewriter/rewriter_fixed.py)."""
import sys
import os
import re
import difflib
import random

import torch
import fitz  # PyMuPDF for PDF processing
from transformers import T5Tokenizer, T5ForConditionalGeneration
from docx import Document

# Force UTF-8 output so the emoji status messages don't crash on
# consoles with a legacy default encoding (e.g. Windows cp1252)
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')
def load_model():
"""
    Loads the AI Rewriter model:
    - Local run -> loads from ./ai_rewriter_model/
    - Hugging Face Spaces -> loads from the HF Hub repo
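
    Example (illustrative):
        tokenizer, model, device = load_model()
        # tokenizer/model are then ready for generate() on `device`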
"""
HF_MODEL_REPO = "AlyanAkram/StealthWriter_Rewriter"
HF_MODEL_SUBFOLDER = "Ai_rewriter/ai_rewriter_model"
HF_TOKEN = os.getenv("HF_TOKEN")
RUNNING_ON_SPACES = os.environ.get("HF_SPACE") or os.environ.get("SPACE_ID")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"πŸ“¦ Loading model to: {device}")
try:
if RUNNING_ON_SPACES:
print("🌐 Running on Hugging Face Spaces β†’ loading from HF Hub...")
tokenizer = T5Tokenizer.from_pretrained(HF_MODEL_REPO, subfolder=HF_MODEL_SUBFOLDER, token=HF_TOKEN)
model = T5ForConditionalGeneration.from_pretrained(HF_MODEL_REPO, subfolder=HF_MODEL_SUBFOLDER, token=HF_TOKEN).to(device)
else:
print("πŸ’» Running locally β†’ loading from local files...")
local_path = os.path.join(os.path.dirname(__file__), "ai_rewriter_model")
tokenizer = T5Tokenizer.from_pretrained(local_path)
model = T5ForConditionalGeneration.from_pretrained(local_path).to(device)
return tokenizer, model, device
except Exception as e:
print(f"⚠️ Model load failed ({e}) β€” using t5-small fallback")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
return tokenizer, model, device
def is_pure_junk(text):
"""Identify text that is 100% junk and should be deleted"""
text_lower = text.lower().strip()
# System metadata - DELETE ENTIRELY
system_junk = [
r'submission\s+id', r'page\s+\d+\s+of', r'trn:oid', r'turnitin',
r'file\s+name', r'file\s+size', r'document\s+details',
r'\d+%\s+detected', r'ai.*generated', r'download\s+date',
r'qualifying\s+text', r'false\s+positives', r'cyan.*purple',
r'cover\s+page\s+submission', r'ai\s+writing\s+overview'
]
for pattern in system_junk:
if re.search(pattern, text_lower):
return True
# Random university spam - DELETE ENTIRELY
university_spam = [
r'\b(harvard|yale|stanford|mit|berkeley)\s+university\b',
r'\buniversity\s+of\s+(california|pennsylvania|maryland)\b',
r'\bharvard\s+(business\s+school|law\s+school)\b',
r'\bjournal\s+of\s+the\s+american\s+medical\s+association\b'
]
for pattern in university_spam:
if re.search(pattern, text_lower):
return True
# Nonsensical fragments - DELETE ENTIRELY
nonsense = [
r'\bthe\s+sex\s+of\b', r'\bschool\s+sex\b', r'\bis\s+a\s+sexy\b',
r'\bpunctuation\b(?!\s+marks)', r'\bchemistry\b(?!\s+class)',
r'\bjournalist?\b(?!\s+report)', r'\bacademic\s+heading\s+to\s+be\s+clear\b',
r'\bwrite.*heading.*concise\b', r'\bmaterials\s+for\s+the\s+synthesis\s+of\s+materials\b',
r'\bcolleges\s+and\s+colleges\s+are\s+a\s+great\s+way\b'
]
for pattern in nonsense:
if re.search(pattern, text_lower):
return True
return False
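
# Illustrative spot-checks (assumed strings, not drawn from any real document):
#     is_pure_junk("Submission ID trn:oid:1234")    -> True   (system metadata)
#     is_pure_junk("JM Shows manages its bookings") -> False  (real content)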
def is_real_heading(text):
"""STRICT heading detection - only actual headings"""
text = text.strip()
# Learning objectives (these are real headings)
    if re.match(r'^[A-Z]{1,2}\.?[A-Z]?\d+\.?\s', text):  # P1., C.P6, BC.D2, etc.
return True
# Activity headings
if re.match(r'^IT\s+Systems?\s+[Aa]ctivity\s+\d+', text, re.IGNORECASE):
return True
# Learning Aim headings
if re.match(r'^Learning\s+[Aa]im\s+[A-Z]:', text, re.IGNORECASE):
return True
# Main section headings
main_sections = [
r'^Contents?$', r'^Overview$', r'^Summary$', r'^Introduction$',
r'^Data\s+and\s+information\s+use', r'^Hardware\s+and\s+software',
r'^Stakeholder\s+Impact', r'^Internal\s+and\s+External'
]
for pattern in main_sections:
if re.match(pattern, text, re.IGNORECASE):
return True
return False
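
# Illustrative (assumed strings):
#     is_real_heading("Learning Aim A: Systems and data")  -> True
#     is_real_heading("The system stores data securely.")  -> False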
def semantic_coherence_check(text):
"""Check if content makes logical sense in the IT/business context"""
text_lower = text.lower()
# Check for logical inconsistencies
logical_errors = [
# Contradictory statements
(r'\binternal.*external.*internal\b', "Contradictory internal/external"),
(r'\bsoftware.*hardware.*software\b', "Contradictory software/hardware"),
# Nonsensical combinations
(r'\bprinters.*email\b', "Printers don't send emails"),
(r'\bwi-fi.*cables\b', "Wi-Fi vs cables contradiction"),
(r'\bcloud.*physical.*storage\b', "Cloud vs physical confusion"),
# Subject-verb disagreements in context
(r'\bdata.*are.*information.*is\b', "Data/information verb disagreement"),
(r'\bstakeholder.*communicate.*themselves\b', "Reflexive pronoun error"),
# Circular or redundant logic
(r'\bthe.*of.*the.*of.*the\b', "Too many 'of' chains"),
(r'\bwhich.*that.*which.*that\b', "Confusing relative clauses"),
(r'\benables.*to.*allow.*to\b', "Redundant enabling/allowing"),
]
for pattern, error_type in logical_errors:
if re.search(pattern, text_lower):
return False, error_type
# Check for proper IT concepts
it_concept_pairs = [
('backup', ['data', 'files', 'system', 'recovery']),
('network', ['connection', 'internet', 'communication', 'access']),
('security', ['firewall', 'encryption', 'protection', 'threat']),
('stakeholder', ['internal', 'external', 'staff', 'client', 'customer']),
('software', ['application', 'program', 'system', 'tool']),
('hardware', ['computer', 'device', 'equipment', 'physical'])
]
# If a concept is mentioned, related terms should make sense
for concept, related_terms in it_concept_pairs:
if concept in text_lower:
# Check if the context makes sense
concept_sentences = [s for s in text.split('.') if concept in s.lower()]
for sentence in concept_sentences:
# Very basic context check
if any(related in sentence.lower() for related in related_terms):
continue
else:
# Check if it's used in a completely wrong context
wrong_contexts = {
'backup': ['singing', 'dancing', 'performing'],
'network': ['friendship', 'social media'],
'security': ['building security', 'guard'],
'hardware': ['construction', 'tools', 'nails']
}
if concept in wrong_contexts:
for wrong in wrong_contexts[concept]:
if wrong in sentence.lower():
return False, f"Wrong context for {concept}"
return True, "Coherent"
def factual_accuracy_check(text):
"""Check for factual accuracy about JM Shows and IT systems"""
text_lower = text.lower()
    # Known facts about JM Shows that should stay consistent.
    # (The value lists document the expected context for each entity; only
    # the keys are matched below — the contradiction lists in the loop do
    # the actual checking.)
    known_facts = {
        'jm shows': ['educational', 'theater', 'organization', 'jamal', 'manager'],
        'jamal moulin': ['manager', 'jm shows'],
        'btec': ['qualification', 'education', 'it systems'],
        'stakeholders': ['internal', 'external', 'staff', 'customers', 'schools']
    }
# Check for factual inconsistencies
for entity, expected_context in known_facts.items():
if entity in text_lower:
entity_sentences = [s for s in text.split('.') if entity in s.lower()]
for sentence in entity_sentences:
# Check for contradictory information
contradictions = {
'jm shows': ['manufacturing', 'restaurant', 'hospital', 'bank'],
'jamal moulin': ['student', 'teacher', 'client'],
'btec': ['university degree', 'masters', 'phd']
}
if entity in contradictions:
for contradiction in contradictions[entity]:
if contradiction in sentence.lower():
return False, f"Factual error: {entity} is not {contradiction}"
return True, "Factually accurate"
def quality_check_rewrite(original, rewritten):
"""BRUTAL quality check - reject if rewrite is worse"""
if not rewritten or len(rewritten.strip()) < 5:
return False, "Too short"
original = original.strip()
rewritten = rewritten.strip()
# Reject if too different in length
if len(rewritten) < len(original) * 0.4 or len(rewritten) > len(original) * 2.2:
return False, "Length change too extreme"
# SEMANTIC COHERENCE CHECK
is_coherent, coherence_error = semantic_coherence_check(rewritten)
if not is_coherent:
return False, f"Semantic error: {coherence_error}"
# FACTUAL ACCURACY CHECK
is_accurate, accuracy_error = factual_accuracy_check(rewritten)
if not is_accurate:
return False, f"Factual error: {accuracy_error}"
# Check for awkward AI phrases that make text worse
ai_awkwardness = [
r'\butilizing\b', # AI loves this word
r'\bin\s+order\s+to\b', # Verbose AI phrase
r'\bfacilitate\b.*\bin\s+its\b', # Awkward construction
r'\benables?\s+.*\s+to\s+.*\s+to\b', # Double "to" constructions
r'\bthe\s+.*\s+of\s+.*\s+of\s+.*\s+of\b', # Triple "of" chains
r'\baccordingly\s+utilizing\b', # Robotic phrasing
r'\benabling\s+a\s+number\s+of\s+individuals\b', # Overly formal
r'\bthe\s+.*\s+of\s+the\s+.*\s+of\s+the\b', # Repetitive structure
r'\bwhich\s+.*\s+that\s+.*\s+which\b', # Confusing relative clauses
]
awkward_count = sum(1 for pattern in ai_awkwardness
if re.search(pattern, rewritten, re.IGNORECASE))
if awkward_count > 1: # Even stricter now
return False, f"Too many awkward AI phrases ({awkward_count})"
# Check sentence structure quality
sentences = [s.strip() for s in rewritten.split('.') if s.strip()]
for sentence in sentences:
# Check for overly long sentences (likely AI verbosity)
if len(sentence.split()) > 40:
return False, "Sentence too long and complex"
# Check for repetitive structures
words = sentence.lower().split()
        if any(words.count(w) > 2 for w in set(words)):
return False, "Too much word repetition in sentence"
# Check if key technical terms are preserved
key_terms = re.findall(r'\b(JM\s+Shows|IT|software|hardware|data|information|stakeholder|organization)\b',
original, re.IGNORECASE)
preserved = sum(1 for term in key_terms
if re.search(re.escape(term), rewritten, re.IGNORECASE))
if key_terms and preserved / len(key_terms) < 0.7:
return False, "Lost too many key terms"
# Use similarity check - if too different, probably worse
similarity = difflib.SequenceMatcher(None, original.lower(), rewritten.lower()).ratio()
if similarity < 0.3: # More lenient for structure changes
return False, f"Too different from original (similarity: {similarity:.2f})"
# Check that meaning is preserved by looking for key concepts
original_concepts = set(re.findall(r'\b(schedul\w+|manag\w+|communicat\w+|secur\w+|collaborat\w+)\b',
original.lower()))
rewritten_concepts = set(re.findall(r'\b(schedul\w+|manag\w+|communicat\w+|secur\w+|collaborat\w+)\b',
rewritten.lower()))
if original_concepts and len(original_concepts.intersection(rewritten_concepts)) / len(original_concepts) < 0.6:
return False, "Key concepts not preserved"
return True, "Acceptable"
def advanced_sentence_restructuring(text):
"""Advanced sentence restructuring to make text more human-like"""
sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
if not sentences:
return text
restructured = []
for sentence in sentences:
original_sentence = sentence
# Multiple restructuring techniques
sentence = restructure_complex_sentences(sentence)
sentence = vary_sentence_beginnings(sentence)
sentence = change_voice_patterns(sentence)
sentence = rearrange_clauses(sentence)
sentence = vary_connecting_words(sentence)
restructured.append(sentence)
return ' '.join(restructured).replace('..', '.')
def restructure_complex_sentences(sentence):
"""Break down and restructure complex sentences"""
# Pattern 1: "X, which Y, Z" -> "X does Z. This is because Y."
which_pattern = r'(.+?),\s*which\s+(.+?),\s*(.+)'
match = re.search(which_pattern, sentence, re.IGNORECASE)
if match and random.random() < 0.4:
main_part = match.group(1).strip()
which_part = match.group(2).strip()
end_part = match.group(3).strip().rstrip('.')
return f"{main_part} {end_part}. This happens because {which_part}."
# Pattern 2: "Due to X, Y happens" -> "Y happens. The reason is X."
due_pattern = r'due\s+to\s+(.+?),\s*(.+)'
match = re.search(due_pattern, sentence, re.IGNORECASE)
if match and random.random() < 0.4:
reason = match.group(1).strip()
result = match.group(2).strip().rstrip('.')
return f"{result.capitalize()}. The reason is {reason}."
# Pattern 3: "Although X, Y" -> "Y, even though X" or "Y. However, X"
although_pattern = r'although\s+(.+?),\s*(.+)'
match = re.search(although_pattern, sentence, re.IGNORECASE)
if match and random.random() < 0.4:
condition = match.group(1).strip()
main_clause = match.group(2).strip().rstrip('.')
if random.random() < 0.5:
return f"{main_clause.capitalize()}, even though {condition}."
else:
return f"{main_clause.capitalize()}. However, {condition}."
# Pattern 4: Long sentences with multiple "and" -> Split them
if sentence.count(' and ') >= 2 and len(sentence.split()) > 20:
parts = sentence.split(' and ')
if len(parts) >= 3:
# Take first two parts, make them one sentence
# Rest becomes second sentence
first_part = f"{parts[0]} and {parts[1]}."
remaining = ' and '.join(parts[2:]).rstrip('.')
return f"{first_part} Additionally, {remaining}."
return sentence
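
# Illustrative transform (Pattern 2, fires with probability 0.4):
#     "Due to budget limits, the upgrade was delayed."
#     -> "The upgrade was delayed. The reason is budget limits."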
def vary_sentence_beginnings(sentence):
"""Change how sentences start to avoid AI patterns"""
sentence = sentence.strip()
# Don't change if it's already starting with something interesting
interesting_starts = ['However', 'Therefore', 'Additionally', 'Meanwhile', 'Furthermore',
'Moreover', 'Consequently', 'Subsequently', 'Nevertheless']
if any(sentence.startswith(start) for start in interesting_starts):
return sentence
# Pattern 1: "The organization uses X" -> "JM Shows uses X" or "The company uses X"
if sentence.lower().startswith('the organization'):
alternatives = ['JM Shows', 'The company', 'This organization', 'The business']
chosen = random.choice(alternatives)
sentence = re.sub(r'^the organization\b', chosen, sentence, flags=re.IGNORECASE)
# Pattern 2: "This enables" -> Various alternatives
    if sentence.lower().startswith('this enables'):
        # Verb-phrase replacements only: adverbial openers like "As a result,"
        # would leave "As a result, staff to communicate", which is ungrammatical
        alternatives = ['This allows', 'This helps', 'This makes it possible for']
        chosen = random.choice(alternatives)
        sentence = re.sub(r'^this enables?\b', chosen, sentence, flags=re.IGNORECASE)
# Pattern 3: "It is important" -> "X is crucial" or "X matters because"
if 'it is important' in sentence.lower():
alternatives = [
'it is crucial', 'it is essential', 'it matters',
'this is vital', 'this is key'
]
chosen = random.choice(alternatives)
sentence = re.sub(r'\bit is important\b', chosen, sentence, flags=re.IGNORECASE)
# Pattern 4: Add transitional phrases sometimes
    if random.random() < 0.3 and not sentence.lower().startswith(('the ', 'this ', 'it ', 'a ')):
        transitions = [
            'In practice, ', 'For example, ', 'In this case, ', 'Typically, ',
            'Usually, ', 'Often, ', 'Generally, '
        ]
        if not any(sentence.startswith(t.strip()) for t in transitions):
            chosen_transition = random.choice(transitions)
            # Lowercase only the first letter so proper nouns survive
            sentence = chosen_transition + sentence[:1].lower() + sentence[1:]
return sentence
def change_voice_patterns(sentence):
"""Change between active and passive voice naturally"""
# Passive to active transformations
passive_patterns = [
# "X is used by Y" -> "Y uses X"
(r'(\w+(?:\s+\w+)*)\s+is\s+used\s+by\s+(\w+(?:\s+\w+)*)', r'\2 uses \1'),
# "X is managed by Y" -> "Y manages X"
(r'(\w+(?:\s+\w+)*)\s+is\s+managed\s+by\s+(\w+(?:\s+\w+)*)', r'\2 manages \1'),
# "X is implemented by Y" -> "Y implements X"
(r'(\w+(?:\s+\w+)*)\s+is\s+implemented\s+by\s+(\w+(?:\s+\w+)*)', r'\2 implements \1'),
# "X are accessed by Y" -> "Y accesses X"
(r'(\w+(?:\s+\w+)*)\s+are\s+accessed\s+by\s+(\w+(?:\s+\w+)*)', r'\2 accesses \1'),
]
# Apply passive to active (50% chance)
if random.random() < 0.5:
for pattern, replacement in passive_patterns:
if re.search(pattern, sentence, re.IGNORECASE):
sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
break
    # Active to passive transformations (25% of the remaining half, ~12.5% overall)
    elif random.random() < 0.25:
active_patterns = [
# "Y uses X" -> "X is used by Y"
(r'(\w+(?:\s+\w+)*)\s+uses\s+(\w+(?:\s+\w+)*)', r'\2 is used by \1'),
# "Y manages X" -> "X is managed by Y"
(r'(\w+(?:\s+\w+)*)\s+manages\s+(\w+(?:\s+\w+)*)', r'\2 is managed by \1'),
]
for pattern, replacement in active_patterns:
if re.search(pattern, sentence, re.IGNORECASE):
sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
break
return sentence
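
# Illustrative (passive-to-active branch, 50% chance; assumed input):
#     "The network is managed by staff"  ->  "staff manages The network"
# Note the naive regex swap preserves the original article's capital mid-sentence;
# that is a known rough edge of this technique rather than intended output.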
def rearrange_clauses(sentence):
"""Rearrange clauses within sentences"""
# Pattern 1: "X because Y" -> "Because Y, X"
because_pattern = r'(.+?)\s+because\s+(.+)'
match = re.search(because_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.3:
        main_clause = match.group(1).strip()
        because_clause = match.group(2).strip().rstrip('.')
        # Lowercase only the first letter so proper nouns (e.g. JM Shows) survive
        return f"Because {because_clause}, {main_clause[:1].lower() + main_clause[1:]}."
# Pattern 2: "X when Y" -> "When Y, X"
when_pattern = r'(.+?)\s+when\s+(.+)'
match = re.search(when_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.3:
        main_clause = match.group(1).strip()
        when_clause = match.group(2).strip().rstrip('.')
        return f"When {when_clause}, {main_clause[:1].lower() + main_clause[1:]}."
# Pattern 3: Move prepositional phrases
# "The system stores data in the cloud" -> "In the cloud, the system stores data"
prep_phrases = ['in the cloud', 'on the network', 'within the organization',
'through the system', 'via the software', 'using the hardware']
for phrase in prep_phrases:
if phrase in sentence.lower() and random.random() < 0.25:
# Find the phrase and move it to the beginning
phrase_pattern = re.escape(phrase)
if re.search(r'\b' + phrase_pattern + r'\b', sentence, re.IGNORECASE):
                # Remove from current position
                new_sentence = re.sub(r',?\s*\b' + phrase_pattern + r'\b,?', '', sentence, flags=re.IGNORECASE).strip()
                # Add to beginning, lowering only the first letter so proper nouns survive
                new_sentence = f"{phrase.capitalize()}, {new_sentence[:1].lower() + new_sentence[1:]}"
                return new_sentence
return sentence
def vary_connecting_words(sentence):
"""Replace connecting words with alternatives"""
connectors = {
'and': ['plus', 'as well as', 'along with', 'together with'],
'but': ['however', 'yet', 'though', 'although'],
'so': ['therefore', 'thus', 'as a result', 'consequently'],
'also': ['additionally', 'furthermore', 'moreover', 'as well'],
'however': ['but', 'yet', 'though', 'still'],
'therefore': ['so', 'thus', 'as a result', 'consequently'],
'because': ['since', 'as', 'due to the fact that'],
'while': ['whereas', 'although', 'though'],
}
# Replace connectors (30% chance per connector)
for original, alternatives in connectors.items():
if f' {original} ' in sentence.lower() and random.random() < 0.3:
chosen_alternative = random.choice(alternatives)
sentence = re.sub(r'\b' + re.escape(original) + r'\b', chosen_alternative,
sentence, flags=re.IGNORECASE, count=1)
break # Only replace one per sentence
return sentence
def add_human_imperfections(text, imperfection_rate=0.25):
"""Add subtle human-like imperfections and natural variations"""
if len(text.split()) < 10:
return text
# First apply advanced sentence restructuring
text = advanced_sentence_restructuring(text)
sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
if not sentences:
return text
modified_sentences = []
for sentence in sentences:
        # Modify this sentence with probability `imperfection_rate`;
        # the original comparison was inverted (a higher rate skipped more)
        if random.random() > imperfection_rate:
            modified_sentences.append(sentence)
            continue
# Choose multiple techniques per sentence (more natural)
techniques = [
'slight_grammar_variation',
'word_variation',
'contraction_change',
'sentence_connector_variation',
'add_filler_words',
'vary_punctuation'
]
# Apply 1-2 techniques per modified sentence
num_techniques = random.choices([1, 2], weights=[0.7, 0.3])[0]
selected_techniques = random.sample(techniques, num_techniques)
for technique in selected_techniques:
if technique == 'slight_grammar_variation':
sentence = add_subtle_grammar_variations(sentence)
elif technique == 'word_variation':
sentence = vary_word_choice_extensively(sentence)
elif technique == 'contraction_change':
sentence = modify_contractions_naturally(sentence)
elif technique == 'sentence_connector_variation':
sentence = vary_sentence_connectors(sentence)
elif technique == 'add_filler_words':
sentence = add_natural_filler_words(sentence)
elif technique == 'vary_punctuation':
sentence = vary_punctuation_style(sentence)
modified_sentences.append(sentence)
result = ' '.join(modified_sentences).replace('..', '.')
# Final pass: add some natural flow variations
result = add_natural_flow_variations(result)
return result
def add_subtle_grammar_variations(sentence):
"""Add realistic grammar variations humans use"""
# Acceptable variations that humans commonly use
variations = [
# "data is" vs "data are" - both acceptable
(r'\bdata are\b', 'data is', 0.4),
# "different from" vs "different to" - both used
(r'\bdifferent from\b', 'different to', 0.3),
# "compared to" vs "compared with"
(r'\bcompared with\b', 'compared to', 0.3),
# "focused on" vs "focused around"
(r'\bfocused on\b', 'focused around', 0.2),
# "try to" vs "try and" - both acceptable
(r'\btry to\b', 'try and', 0.3),
# Article variations
(r'\bthe software\b', 'software', 0.2),
(r'\bthe hardware\b', 'hardware', 0.2),
(r'\bthe data\b', 'data', 0.2),
]
for pattern, replacement, probability in variations:
if re.search(pattern, sentence, re.IGNORECASE) and random.random() < probability:
sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
break # Only one variation per sentence
return sentence
def vary_word_choice_extensively(sentence):
"""More extensive word variations that sound natural"""
# Extended word variations - more natural alternatives
word_variations = {
# Formal to casual
'utilize': ['use', 'employ', 'apply'],
'demonstrate': ['show', 'display', 'reveal'],
'facilitate': ['help', 'enable', 'support'],
'implement': ['use', 'apply', 'put in place'],
'obtain': ['get', 'acquire', 'receive'],
'purchase': ['buy', 'get', 'acquire'],
'commence': ['start', 'begin', 'kick off'],
'assist': ['help', 'support', 'aid'],
'require': ['need', 'call for', 'demand'],
'provide': ['give', 'offer', 'supply'],
'ensure': ['make sure', 'guarantee', 'see to it that'],
'maintain': ['keep', 'preserve', 'uphold'],
'establish': ['set up', 'create', 'build'],
'subsequently': ['then', 'after that', 'next'],
'accordingly': ['so', 'therefore', 'as a result'],
'furthermore': ['also', 'plus', 'on top of that'],
'nevertheless': ['but', 'however', 'still'],
'approximately': ['about', 'around', 'roughly'],
'numerous': ['many', 'lots of', 'plenty of'],
'sufficient': ['enough', 'adequate', 'plenty'],
'essential': ['vital', 'key', 'crucial'],
'significant': ['major', 'important', 'big'],
'substantial': ['large', 'considerable', 'major'],
# Casual to varied
'really': ['very', 'quite', 'pretty'],
'big': ['large', 'major', 'significant'],
'small': ['little', 'minor', 'compact'],
'good': ['effective', 'useful', 'beneficial'],
'bad': ['poor', 'ineffective', 'problematic'],
'fast': ['quick', 'rapid', 'speedy'],
'slow': ['gradual', 'delayed', 'unhurried'],
# Technical variations
'system': ['platform', 'setup', 'framework'],
'software': ['application', 'program', 'tool'],
'hardware': ['equipment', 'devices', 'machinery'],
'network': ['connection', 'system', 'infrastructure'],
'database': ['data store', 'information system', 'repository'],
'security': ['protection', 'safety', 'safeguarding'],
'organization': ['company', 'business', 'firm'],
'employee': ['staff member', 'worker', 'team member'],
'customer': ['client', 'user', 'consumer'],
'manager': ['supervisor', 'leader', 'head'],
}
# Apply variations (40% chance per word)
for formal, alternatives in word_variations.items():
if f' {formal} ' in sentence.lower() and random.random() < 0.4:
chosen_alternative = random.choice(alternatives)
sentence = re.sub(r'\b' + re.escape(formal) + r'\b', chosen_alternative,
sentence, flags=re.IGNORECASE, count=1)
break # Only one substitution per sentence
return sentence
def modify_contractions_naturally(sentence):
"""Natural contraction usage like humans"""
if random.random() < 0.5:
# Add contractions (more casual/natural)
contractions = {
'it is': "it's",
'they are': "they're",
'there is': "there's",
'that is': "that's",
'cannot': "can't",
'do not': "don't",
'does not': "doesn't",
'will not': "won't",
'would not': "wouldn't",
'should not': "shouldn't",
'could not': "couldn't",
'have not': "haven't",
'has not': "hasn't",
'had not': "hadn't",
'are not': "aren't",
'is not': "isn't",
'was not': "wasn't",
'were not': "weren't"
}
for full, contracted in contractions.items():
if full in sentence.lower() and random.random() < 0.6:
sentence = re.sub(r'\b' + re.escape(full) + r'\b', contracted, sentence, flags=re.IGNORECASE)
break
else:
# Remove contractions (more formal when needed)
expansions = {
"it's": 'it is',
"they're": 'they are',
"there's": 'there is',
"that's": 'that is',
"can't": 'cannot',
"don't": 'do not',
"doesn't": 'does not',
"won't": 'will not',
"wouldn't": 'would not',
"shouldn't": 'should not',
"couldn't": 'could not',
"haven't": 'have not',
"hasn't": 'has not',
"hadn't": 'had not',
"aren't": 'are not',
"isn't": 'is not',
"wasn't": 'was not',
"weren't": 'were not'
}
for contracted, full in expansions.items():
if contracted in sentence and random.random() < 0.4:
sentence = sentence.replace(contracted, full)
break
return sentence
def vary_sentence_connectors(sentence):
"""Add variety to how sentences connect ideas"""
# Add sentence starters that humans use
if random.random() < 0.3:
starters = [
'In fact, ', 'Actually, ', 'Basically, ', 'Essentially, ',
'In practice, ', 'For instance, ', 'For example, ', 'Specifically, ',
'In particular, ', 'More importantly, ', 'What\'s more, ', 'Plus, ',
'On top of that, ', 'Besides that, ', 'Apart from that, '
]
        # Don't add if sentence already starts with a connector; compare the
        # first word so 'it' doesn't also match 'its', etc.
        first_word = sentence.lower().split()[0] if sentence.split() else ''
        if first_word not in ('the', 'this', 'it', 'however', 'therefore', 'furthermore', 'moreover'):
            if random.random() < 0.4:
                chosen_starter = random.choice(starters)
                # Lowercase only the first letter so proper nouns survive
                sentence = chosen_starter + sentence[:1].lower() + sentence[1:]
# Replace mid-sentence connectors
connector_replacements = {
' and ': [' plus ', ' as well as ', ' along with '],
' but ': [' however ', ' though ', ' yet '],
' so ': [' therefore ', ' thus ', ' as a result '],
' because ': [' since ', ' as ', ' given that '],
' although ': [' while ', ' even though ', ' despite the fact that '],
' therefore ': [' so ', ' thus ', ' as a result '],
' however ': [' but ', ' though ', ' yet '],
' moreover ': [' also ', ' plus ', ' on top of that '],
' furthermore ': [' also ', ' what\'s more ', ' besides '],
}
for original, alternatives in connector_replacements.items():
if original in sentence.lower() and random.random() < 0.3:
chosen_alternative = random.choice(alternatives)
sentence = re.sub(re.escape(original), chosen_alternative, sentence, flags=re.IGNORECASE, count=1)
break
return sentence
def add_natural_filler_words(sentence):
"""Add natural filler words/phrases that humans use"""
# Natural emphasis and filler phrases
fillers = {
'important': ['really important', 'quite important', 'very important'],
'useful': ['really useful', 'quite useful', 'very useful'],
'effective': ['highly effective', 'really effective', 'quite effective'],
'necessary': ['absolutely necessary', 'really necessary', 'quite necessary'],
'helps': ['actually helps', 'really helps', 'definitely helps'],
'allows': ['actually allows', 'really allows', 'basically allows'],
'enables': ['actually enables', 'really enables', 'effectively enables'],
'provides': ['actually provides', 'really provides', 'effectively provides'],
}
# Add natural hedging/softening
hedges = {
'is': ['tends to be', 'is generally', 'is usually'],
'are': ['tend to be', 'are generally', 'are usually'],
'will': ['will likely', 'will probably', 'will generally'],
'can': ['can often', 'can usually', 'can typically'],
'must': ['should really', 'needs to', 'ought to'],
}
# Apply fillers (20% chance)
if random.random() < 0.2:
for word, alternatives in fillers.items():
if f' {word} ' in sentence.lower() and random.random() < 0.5:
chosen_alternative = random.choice(alternatives)
sentence = re.sub(r'\b' + re.escape(word) + r'\b', chosen_alternative,
sentence, flags=re.IGNORECASE, count=1)
break
# Apply hedges (15% chance)
if random.random() < 0.15:
for word, alternatives in hedges.items():
if f' {word} ' in sentence.lower() and random.random() < 0.4:
chosen_alternative = random.choice(alternatives)
sentence = re.sub(r'\b' + re.escape(word) + r'\b', chosen_alternative,
sentence, flags=re.IGNORECASE, count=1)
break
return sentence
def vary_punctuation_style(sentence):
"""Vary punctuation style naturally"""
# Sometimes use semicolons instead of periods for related ideas
if ' and ' in sentence and random.random() < 0.15:
# "X does A and Y does B" -> "X does A; Y does B"
parts = sentence.split(' and ', 1)
if len(parts) == 2 and len(parts[0]) > 10 and len(parts[1]) > 10:
sentence = f"{parts[0]}; {parts[1]}"
# Sometimes use em dashes for emphasis
if ' - ' in sentence and random.random() < 0.3:
        sentence = sentence.replace(' - ', ' \u2014 ')  # em dash (U+2014)
# Sometimes use colons for explanations
if 'because' in sentence.lower() and random.random() < 0.2:
# "X happens because Y" -> "X happens: Y"
because_match = re.search(r'(.+?)\s+because\s+(.+)', sentence, re.IGNORECASE)
if because_match:
main_part = because_match.group(1).strip()
reason = because_match.group(2).strip().rstrip('.')
sentence = f"{main_part}: {reason}."
return sentence
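
# Illustrative (20% colon branch, assumed input):
#     "Backups run nightly because data matters."
#     -> "Backups run nightly: data matters."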
def add_natural_flow_variations(text):
"""Add natural flow variations across the entire text"""
sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
if len(sentences) < 2:
return text
# Vary sentence lengths naturally
modified_sentences = []
for i, sentence in enumerate(sentences):
        # Sometimes combine short sentences (skip slots already emptied below)
        if (sentence.strip() and
                i < len(sentences) - 1 and
                len(sentence.split()) < 8 and
                len(sentences[i + 1].split()) < 8 and
                random.random() < 0.3):
            next_sentence = sentences[i + 1].rstrip('.')
            # Lowercase only the first letter so proper nouns survive the join
            next_joined = next_sentence[:1].lower() + next_sentence[1:]
            combined_sentences = [
                f"{sentence.rstrip('.')} and {next_joined}.",
                f"{sentence.rstrip('.')}, while {next_joined}.",
                f"{sentence.rstrip('.')}; {next_joined}.",
            ]
chosen = random.choice(combined_sentences)
modified_sentences.append(chosen)
# Skip the next sentence since we combined it
sentences[i + 1] = ""
# Sometimes split very long sentences
elif len(sentence.split()) > 30 and random.random() < 0.4:
# Look for natural break points
break_points = [', and ', ', but ', ', however ', ', therefore ']
for break_point in break_points:
                if break_point in sentence.lower():
                    # Case-insensitive split so ", However " still matches
                    parts = re.split(re.escape(break_point), sentence, maxsplit=1, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        # strip(' ,') removes the leading comma too; the old
                        # strip().rstrip(',') left ", and" as the connector
                        connector = break_point.strip(' ,').capitalize()
split_version = f"{parts[0].strip()}. {connector}, {parts[1].strip()}"
modified_sentences.append(split_version)
break
else:
modified_sentences.append(sentence)
else:
if sentence.strip(): # Only add non-empty sentences
modified_sentences.append(sentence)
return ' '.join(modified_sentences).replace('..', '.')
def conservative_rewrite(text, tokenizer, model):
"""Enhanced rewrite with advanced humanization"""
original = text.strip()
# Don't rewrite very short text
if len(original.split()) < 8:
# Still apply light humanization to short text
return add_human_imperfections(original, imperfection_rate=0.15)
# Pre-check: if original already has issues, be extra careful
original_coherent, _ = semantic_coherence_check(original)
original_accurate, _ = factual_accuracy_check(original)
if not original_coherent or not original_accurate:
print(f"⚠️ Original has issues, applying safe humanization only")
# Just apply humanization without AI rewriting
result = add_human_imperfections(original, imperfection_rate=0.2)
return result.strip()
# Try multiple approaches with different prompts for more variation
prompts = [
f"Rewrite naturally: {original}",
f"Make this sound more human: {original}",
f"Improve readability: {original}",
f"Rephrase this text: {original}",
f"Make this clearer and more natural: {original}"
]
for i, prompt in enumerate(prompts):
try:
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=300).to(model.device)
outputs = model.generate(
inputs.input_ids,
max_new_tokens=len(original.split()) + 20, # More room for natural expansion
do_sample=True,
temperature=0.08 + (i * 0.06), # Gradually increase creativity
top_p=0.5 + (i * 0.1), # More variation in later attempts
repetition_penalty=1.08,
pad_token_id=tokenizer.eos_token_id
)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Clean result more thoroughly
if ":" in result:
result = result.split(":", 1)[-1].strip()
# Remove prompt artifacts
result = re.sub(r'^(rewrite|make|improve|rephrase|naturally).*?:\s*', '', result, flags=re.IGNORECASE)
result = re.sub(r'^(this|that|the following)\s+', '', result, flags=re.IGNORECASE)
result = re.sub(r'\s+', ' ', result).strip()
# Quality check
is_good, reason = quality_check_rewrite(original, result)
if is_good:
print(f"βœ… AI rewrite accepted (attempt {i+1})")
# ADVANCED HUMANIZATION - This is the key enhancement
humanized_result = add_human_imperfections(result, imperfection_rate=0.3)
print(f"πŸ§‘ Applied advanced humanization")
return humanized_result
else:
print(f"❌ Attempt {i+1} rejected: {reason}")
continue
except Exception as e:
print(f"⚠️ Error in attempt {i+1}: {e}")
continue
# If all AI attempts failed, just apply advanced humanization to original
print(f"⚠️ All AI attempts failed, applying advanced humanization to original")
return add_human_imperfections(original, imperfection_rate=0.25)
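
# Illustrative usage (assumes load_model() succeeded):
#     tokenizer, model, device = load_model()
#     out = conservative_rewrite("JM Shows uses IT systems to manage its data.",
#                                tokenizer, model)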
def smart_paragraph_grouping(content_items):
"""Group fragmented text into proper paragraphs"""
grouped = []
current_group = []
for item in content_items:
text = item['text'].strip()
if not text:
continue
# If it's a heading, save current group and start fresh
if is_real_heading(text):
if current_group:
# Join the fragments into a paragraph
paragraph_text = ' '.join(current_group)
if len(paragraph_text.split()) > 5: # Only if substantial
grouped.append({
'text': paragraph_text,
'type': 'paragraph'
})
current_group = []
grouped.append({
'text': text,
'type': 'heading'
})
else:
# Add to current group
current_group.append(text)
# Don't forget the last group
if current_group:
paragraph_text = ' '.join(current_group)
if len(paragraph_text.split()) > 5:
grouped.append({
'text': paragraph_text,
'type': 'paragraph'
})
return grouped
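
# Illustrative (assumed fragments): non-heading lines are merged into one paragraph:
#     smart_paragraph_grouping([
#         {'text': 'Learning Aim A: IT Systems'},
#         {'text': 'Staff use software'},
#         {'text': 'to manage daily bookings and records.'},
#     ])
#     -> [{'text': 'Learning Aim A: IT Systems', 'type': 'heading'},
#         {'text': 'Staff use software to manage daily bookings and records.',
#          'type': 'paragraph'}]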
def process_document_safely(input_path, output_path, tokenizer, model):
"""Rewrite the document while keeping the exact original formatting."""
print(f"πŸ”’ Processing with formatting preserved: {input_path}")
# Open original DOCX directly
doc = Document(input_path)
improved_count = 0
heavily_modified_count = 0
for para in doc.paragraphs:
if not para.text.strip():
continue # skip empty lines
# Decide if we rewrite (skip headings, very short text, etc.)
original_text = para.text
if len(original_text.split()) < 3:
continue # keep tiny text untouched
# Rewrite
improved_text = conservative_rewrite(original_text, tokenizer, model)
# Count improvements
if improved_text != original_text:
improved_count += 1
similarity = difflib.SequenceMatcher(None, original_text.lower(), improved_text.lower()).ratio()
if similarity < 0.6:
heavily_modified_count += 1
        # Replace text while keeping all runs/styles
        if para.runs:
            # Capture the first run's formatting BEFORE clearing: after
            # para.clear() + add_run(), para.runs[0] is the brand-new run,
            # so the old code copied the new run's formatting onto itself
            orig_run = para.runs[0]
            bold, italic, underline = orig_run.bold, orig_run.italic, orig_run.underline
            font_name, font_size = orig_run.font.name, orig_run.font.size
            para.clear()  # remove old runs
            run = para.add_run(improved_text)
            run.bold = bold
            run.italic = italic
            run.underline = underline
            run.font.name = font_name
            run.font.size = font_size
else:
para.text = improved_text
# Save document with same formatting but updated text
doc.save(output_path)
print(f"βœ… Saved rewritten document with original formatting: {output_path}")
print(f" Improvements made: {improved_count} ({heavily_modified_count} major)")
def extract_docx_content(input_path):
"""Extract content from DOCX"""
doc = Document(input_path)
content = []
for para in doc.paragraphs:
if para.text.strip():
content.append({
'text': para.text.strip(),
'font_size': 12,
'is_bold': any(run.bold for run in para.runs)
})
return content
def extract_pdf_content(input_path):
"""Extract content from PDF"""
doc = fitz.open(input_path)
content = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
blocks = page.get_text("dict")
for block in blocks["blocks"]:
if "lines" in block:
for line in block["lines"]:
line_text = ""
font_size = 12
is_bold = False
for span in line["spans"]:
line_text += span["text"]
font_size = max(font_size, span["size"])
if "bold" in span["font"].lower():
is_bold = True
if line_text.strip():
content.append({
'text': line_text.strip(),
'font_size': font_size,
'is_bold': is_bold
})
doc.close()
return content
def generate_clean_docx(content, output_path, original_path=None):
"""Generate DOCX output with the exact same formatting as the input."""
if not original_path or not os.path.exists(original_path):
print("⚠️ Original file not provided or not found, falling back to basic formatting.")
        # Fall back to basic unstyled output
doc = Document()
for item in content:
if item['type'] == 'heading':
doc.add_heading(item['text'], level=1)
else:
doc.add_paragraph(item['text'])
doc.save(output_path)
return
# Load the original document
original_doc = Document(original_path)
# Create a new document with the same structure
new_doc = Document()
# We'll keep a counter to match rewritten paragraphs to original ones
content_index = 0
for para in original_doc.paragraphs:
if not para.text.strip():
# Preserve empty paragraphs
new_doc.add_paragraph("")
continue
# Get the rewritten text for this paragraph
if content_index < len(content):
rewritten_text = content[content_index]['text']
else:
rewritten_text = para.text # fallback
# Create a new paragraph in the new doc with the same style
new_para = new_doc.add_paragraph()
new_para.style = para.style
new_para.alignment = para.alignment
# Preserve each run's formatting, but replace text
        if para.runs:
            # Collapse the rewritten text into a single run; simpler than
            # redistributing it across the original runs
run = new_para.add_run(rewritten_text)
# Copy formatting from the first run of original
run.font.name = para.runs[0].font.name
run.font.size = para.runs[0].font.size
run.bold = para.runs[0].bold
run.italic = para.runs[0].italic
run.underline = para.runs[0].underline
else:
# Paragraph without runs (rare)
new_para.add_run(rewritten_text)
content_index += 1
# Save final doc
new_doc.save(output_path)
def generate_clean_pdf(content, output_path):
"""Generate clean PDF output"""
try:
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.units import inch
doc = SimpleDocTemplate(output_path, pagesize=letter, topMargin=1*inch)
styles = getSampleStyleSheet()
heading_style = ParagraphStyle(
'CleanHeading',
parent=styles['Heading1'],
fontSize=14,
spaceAfter=12,
fontName='Helvetica-Bold'
)
body_style = ParagraphStyle(
'CleanBody',
parent=styles['Normal'],
fontSize=11,
spaceAfter=8,
fontName='Helvetica',
alignment=0 # Left aligned
)
story = []
for item in content:
if item['type'] == 'heading':
story.append(Paragraph(item['text'], heading_style))
story.append(Spacer(1, 6))
else:
story.append(Paragraph(item['text'], body_style))
story.append(Spacer(1, 4))
doc.build(story)
except ImportError:
print("⚠️ ReportLab not installed, creating text file instead")
with open(output_path.replace('.pdf', '.txt'), 'w', encoding='utf-8') as f:
for item in content:
if item['type'] == 'heading':
f.write(f"\n{item['text']}\n{'='*len(item['text'])}\n\n")
else:
f.write(f"{item['text']}\n\n")
def main():
if len(sys.argv) < 3:
print("Usage: python rewriter_fixed.py <input_file_or_text> <output_file>")
print("Examples:")
print(" python rewriter_fixed.py input.docx output.docx")
print(" python rewriter_fixed.py input.pdf output.pdf")
print(" python rewriter_fixed.py \"This is my raw text\" output.txt")
return
input_arg = sys.argv[1]
output_path = sys.argv[2]
print(" Loading model for enhanced human-style processing...")
tokenizer, model, device = load_model()
# --- If it's a file ---
if os.path.exists(input_arg):
ext = os.path.splitext(input_arg)[1].lower()
if ext == ".docx":
process_document_safely(input_arg, output_path, tokenizer, model)
elif ext == ".pdf":
            # Extract PDF -> rewrite -> save PDF
content = extract_pdf_content(input_arg)
grouped = smart_paragraph_grouping(content)
for item in grouped:
if len(item['text'].split()) >= 3:
item['text'] = conservative_rewrite(item['text'], tokenizer, model)
generate_clean_pdf(grouped, output_path)
else:
print(f" Unsupported file format: {ext}")
return
# --- If it's raw text ---
else:
raw_text = input_arg.strip()
if not raw_text:
print(" No text provided.")
return
humanized = conservative_rewrite(raw_text, tokenizer, model)
# Save as TXT
with open(output_path, "w", encoding="utf-8") as f:
f.write(humanized)
print(f" Humanized text saved to {output_path}")
print(" Enhanced human-style processing complete!")
def rewrite_text(original_text: str) -> str:
    """Lightweight entry point: humanizes text without the T5 model.

    For model-backed rewriting use conservative_rewrite() instead.
    """
    # Add imperfections
    humanized = add_human_imperfections(original_text)
# Run quality check
ok, reason = quality_check_rewrite(original_text, humanized)
if not ok:
raise ValueError(f"Rewrite failed quality check: {reason}")
return humanized
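
# Illustrative usage of the model-free path (may raise ValueError when the
# stochastic humanization happens to trip the quality gate):
#     clean = rewrite_text("JM Shows uses software to manage its data.")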
if __name__ == "__main__":
main()