import sys
import os
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import fitz  # PyMuPDF for PDF processing
import difflib
import random
import torch

sys.stdout.reconfigure(encoding='utf-8')
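# Note: sys.stdout.reconfigure() requires Python 3.7+. It forces UTF-8 output
# so the emoji status markers in the print statements below do not crash on
# consoles whose default encoding is not UTF-8.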
def load_model():
    """
    Loads AI Rewriter model:
    - Local run -> loads from ./ai_rewriter_model/
    - Hugging Face Spaces -> loads from HF Hub repo
    """
    HF_MODEL_REPO = "AlyanAkram/StealthWriter_Rewriter"
    HF_MODEL_SUBFOLDER = "Ai_rewriter/ai_rewriter_model"
    HF_TOKEN = os.getenv("HF_TOKEN")
    RUNNING_ON_SPACES = os.environ.get("HF_SPACE") or os.environ.get("SPACE_ID")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"📦 Loading model to: {device}")

    try:
        if RUNNING_ON_SPACES:
            print("🌐 Running on Hugging Face Spaces → loading from HF Hub...")
            tokenizer = T5Tokenizer.from_pretrained(HF_MODEL_REPO, subfolder=HF_MODEL_SUBFOLDER, token=HF_TOKEN)
            model = T5ForConditionalGeneration.from_pretrained(HF_MODEL_REPO, subfolder=HF_MODEL_SUBFOLDER, token=HF_TOKEN).to(device)
        else:
            print("💻 Running locally → loading from local files...")
            local_path = os.path.join(os.path.dirname(__file__), "ai_rewriter_model")
            tokenizer = T5Tokenizer.from_pretrained(local_path)
            model = T5ForConditionalGeneration.from_pretrained(local_path).to(device)
        return tokenizer, model, device
    except Exception as e:
        print(f"⚠️ Model load failed ({e}) → using t5-small fallback")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
        model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
        return tokenizer, model, device
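# Usage sketch: load once at startup and reuse for all rewrites.
#   tokenizer, model, device = load_model()
# The tokenizer/model pair is then threaded through conservative_rewrite()
# and process_document_safely() below, so the model is only loaded once.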
def is_pure_junk(text):
    """Identify text that is 100% junk and should be deleted"""
    text_lower = text.lower().strip()

    # System metadata - DELETE ENTIRELY
    system_junk = [
        r'submission\s+id', r'page\s+\d+\s+of', r'trn:oid', r'turnitin',
        r'file\s+name', r'file\s+size', r'document\s+details',
        r'\d+%\s+detected', r'ai.*generated', r'download\s+date',
        r'qualifying\s+text', r'false\s+positives', r'cyan.*purple',
        r'cover\s+page\s+submission', r'ai\s+writing\s+overview'
    ]
    for pattern in system_junk:
        if re.search(pattern, text_lower):
            return True

    # Random university spam - DELETE ENTIRELY
    university_spam = [
        r'\b(harvard|yale|stanford|mit|berkeley)\s+university\b',
        r'\buniversity\s+of\s+(california|pennsylvania|maryland)\b',
        r'\bharvard\s+(business\s+school|law\s+school)\b',
        r'\bjournal\s+of\s+the\s+american\s+medical\s+association\b'
    ]
    for pattern in university_spam:
        if re.search(pattern, text_lower):
            return True

    # Nonsensical fragments - DELETE ENTIRELY
    nonsense = [
        r'\bthe\s+sex\s+of\b', r'\bschool\s+sex\b', r'\bis\s+a\s+sexy\b',
        r'\bpunctuation\b(?!\s+marks)', r'\bchemistry\b(?!\s+class)',
        r'\bjournalist?\b(?!\s+report)', r'\bacademic\s+heading\s+to\s+be\s+clear\b',
        r'\bwrite.*heading.*concise\b', r'\bmaterials\s+for\s+the\s+synthesis\s+of\s+materials\b',
        r'\bcolleges\s+and\s+colleges\s+are\s+a\s+great\s+way\b'
    ]
    for pattern in nonsense:
        if re.search(pattern, text_lower):
            return True

    return False
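# Example: is_pure_junk("Submission ID trn:oid:12345") -> True (system metadata),
# while ordinary assignment prose returns False and is kept for rewriting.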
def is_real_heading(text):
    """STRICT heading detection - only actual headings"""
    text = text.strip()

    # Learning objectives (these are real headings)
    # Allow one or two leading capitals so all the commented examples match.
    if re.match(r'^[A-Z]{1,2}\.?[A-Z]?\d+\.?\s', text):  # P1., C.P6, BC.D2, etc.
        return True

    # Activity headings
    if re.match(r'^IT\s+Systems?\s+[Aa]ctivity\s+\d+', text, re.IGNORECASE):
        return True

    # Learning Aim headings
    if re.match(r'^Learning\s+[Aa]im\s+[A-Z]:', text, re.IGNORECASE):
        return True

    # Main section headings
    main_sections = [
        r'^Contents?$', r'^Overview$', r'^Summary$', r'^Introduction$',
        r'^Data\s+and\s+information\s+use', r'^Hardware\s+and\s+software',
        r'^Stakeholder\s+Impact', r'^Internal\s+and\s+External'
    ]
    for pattern in main_sections:
        if re.match(pattern, text, re.IGNORECASE):
            return True

    return False
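# Example: is_real_heading("Learning Aim A: IT systems") -> True,
# but is_real_heading("The organization uses email daily.") -> False.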
def semantic_coherence_check(text):
    """Check if content makes logical sense in the IT/business context"""
    text_lower = text.lower()

    # Check for logical inconsistencies
    logical_errors = [
        # Contradictory statements
        (r'\binternal.*external.*internal\b', "Contradictory internal/external"),
        (r'\bsoftware.*hardware.*software\b', "Contradictory software/hardware"),
        # Nonsensical combinations
        (r'\bprinters.*email\b', "Printers don't send emails"),
        (r'\bwi-fi.*cables\b', "Wi-Fi vs cables contradiction"),
        (r'\bcloud.*physical.*storage\b', "Cloud vs physical confusion"),
        # Subject-verb disagreements in context
        (r'\bdata.*are.*information.*is\b', "Data/information verb disagreement"),
        (r'\bstakeholder.*communicate.*themselves\b', "Reflexive pronoun error"),
        # Circular or redundant logic
        (r'\bthe.*of.*the.*of.*the\b', "Too many 'of' chains"),
        (r'\bwhich.*that.*which.*that\b', "Confusing relative clauses"),
        (r'\benables.*to.*allow.*to\b', "Redundant enabling/allowing"),
    ]
    for pattern, error_type in logical_errors:
        if re.search(pattern, text_lower):
            return False, error_type

    # Check for proper IT concepts
    it_concept_pairs = [
        ('backup', ['data', 'files', 'system', 'recovery']),
        ('network', ['connection', 'internet', 'communication', 'access']),
        ('security', ['firewall', 'encryption', 'protection', 'threat']),
        ('stakeholder', ['internal', 'external', 'staff', 'client', 'customer']),
        ('software', ['application', 'program', 'system', 'tool']),
        ('hardware', ['computer', 'device', 'equipment', 'physical'])
    ]

    # If a concept is mentioned, related terms should make sense
    for concept, related_terms in it_concept_pairs:
        if concept in text_lower:
            # Check if the context makes sense
            concept_sentences = [s for s in text.split('.') if concept in s.lower()]
            for sentence in concept_sentences:
                # Very basic context check
                if any(related in sentence.lower() for related in related_terms):
                    continue
                else:
                    # Check if it's used in a completely wrong context
                    wrong_contexts = {
                        'backup': ['singing', 'dancing', 'performing'],
                        'network': ['friendship', 'social media'],
                        'security': ['building security', 'guard'],
                        'hardware': ['construction', 'tools', 'nails']
                    }
                    if concept in wrong_contexts:
                        for wrong in wrong_contexts[concept]:
                            if wrong in sentence.lower():
                                return False, f"Wrong context for {concept}"

    return True, "Coherent"
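# Example: semantic_coherence_check("Printers handle all email routing.")
# returns (False, "Printers don't send emails") because the text trips the
# r'\bprinters.*email\b' pattern; clean IT prose returns (True, "Coherent").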
def factual_accuracy_check(text):
    """Check for factual accuracy about JM Shows and IT systems"""
    text_lower = text.lower()

    # Known facts about JM Shows that should be consistent
    known_facts = {
        'jm shows': ['educational', 'theater', 'organization', 'jamal', 'manager'],
        'jamal moulin': ['manager', 'jm shows'],
        'btec': ['qualification', 'education', 'it systems'],
        'stakeholders': ['internal', 'external', 'staff', 'customers', 'schools']
    }

    # Check for factual inconsistencies
    for entity, expected_context in known_facts.items():
        if entity in text_lower:
            entity_sentences = [s for s in text.split('.') if entity in s.lower()]
            for sentence in entity_sentences:
                # Check for contradictory information
                contradictions = {
                    'jm shows': ['manufacturing', 'restaurant', 'hospital', 'bank'],
                    'jamal moulin': ['student', 'teacher', 'client'],
                    'btec': ['university degree', 'masters', 'phd']
                }
                if entity in contradictions:
                    for contradiction in contradictions[entity]:
                        if contradiction in sentence.lower():
                            return False, f"Factual error: {entity} is not {contradiction}"

    return True, "Factually accurate"
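# Example: factual_accuracy_check("JM Shows is a manufacturing firm.")
# returns (False, "Factual error: jm shows is not manufacturing"), since the
# checker treats JM Shows as an educational theater organization.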
def quality_check_rewrite(original, rewritten):
    """BRUTAL quality check - reject if rewrite is worse"""
    if not rewritten or len(rewritten.strip()) < 5:
        return False, "Too short"

    original = original.strip()
    rewritten = rewritten.strip()

    # Reject if too different in length
    if len(rewritten) < len(original) * 0.4 or len(rewritten) > len(original) * 2.2:
        return False, "Length change too extreme"

    # SEMANTIC COHERENCE CHECK
    is_coherent, coherence_error = semantic_coherence_check(rewritten)
    if not is_coherent:
        return False, f"Semantic error: {coherence_error}"

    # FACTUAL ACCURACY CHECK
    is_accurate, accuracy_error = factual_accuracy_check(rewritten)
    if not is_accurate:
        return False, f"Factual error: {accuracy_error}"

    # Check for awkward AI phrases that make text worse
    ai_awkwardness = [
        r'\butilizing\b',                                  # AI loves this word
        r'\bin\s+order\s+to\b',                            # Verbose AI phrase
        r'\bfacilitate\b.*\bin\s+its\b',                   # Awkward construction
        r'\benables?\s+.*\s+to\s+.*\s+to\b',               # Double "to" constructions
        r'\bthe\s+.*\s+of\s+.*\s+of\s+.*\s+of\b',          # Triple "of" chains
        r'\baccordingly\s+utilizing\b',                    # Robotic phrasing
        r'\benabling\s+a\s+number\s+of\s+individuals\b',   # Overly formal
        r'\bthe\s+.*\s+of\s+the\s+.*\s+of\s+the\b',        # Repetitive structure
        r'\bwhich\s+.*\s+that\s+.*\s+which\b',             # Confusing relative clauses
    ]
    awkward_count = sum(1 for pattern in ai_awkwardness
                        if re.search(pattern, rewritten, re.IGNORECASE))
    if awkward_count > 1:  # Even stricter now
        return False, f"Too many awkward AI phrases ({awkward_count})"

    # Check sentence structure quality
    sentences = [s.strip() for s in rewritten.split('.') if s.strip()]
    for sentence in sentences:
        # Check for overly long sentences (likely AI verbosity)
        if len(sentence.split()) > 40:
            return False, "Sentence too long and complex"
        # Check for repetitive structures (any word used more than twice)
        words = sentence.lower().split()
        if any(words.count(w) > 2 for w in words):
            return False, "Too much word repetition in sentence"

    # Check if key technical terms are preserved
    key_terms = re.findall(r'\b(JM\s+Shows|IT|software|hardware|data|information|stakeholder|organization)\b',
                           original, re.IGNORECASE)
    preserved = sum(1 for term in key_terms
                    if re.search(re.escape(term), rewritten, re.IGNORECASE))
    if key_terms and preserved / len(key_terms) < 0.7:
        return False, "Lost too many key terms"

    # Use similarity check - if too different, probably worse
    similarity = difflib.SequenceMatcher(None, original.lower(), rewritten.lower()).ratio()
    if similarity < 0.3:  # More lenient for structure changes
        return False, f"Too different from original (similarity: {similarity:.2f})"

    # Check that meaning is preserved by looking for key concepts
    original_concepts = set(re.findall(r'\b(schedul\w+|manag\w+|communicat\w+|secur\w+|collaborat\w+)\b',
                                       original.lower()))
    rewritten_concepts = set(re.findall(r'\b(schedul\w+|manag\w+|communicat\w+|secur\w+|collaborat\w+)\b',
                                        rewritten.lower()))
    if original_concepts and len(original_concepts.intersection(rewritten_concepts)) / len(original_concepts) < 0.6:
        return False, "Key concepts not preserved"

    return True, "Acceptable"
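# Usage sketch: the gate returns a (bool, reason) pair so callers can log why
# a candidate was rejected, e.g.
#   ok, reason = quality_check_rewrite(original, candidate)
#   if not ok:
#       print(f"rejected: {reason}")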
def advanced_sentence_restructuring(text):
    """Advanced sentence restructuring to make text more human-like"""
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if not sentences:
        return text

    restructured = []
    for sentence in sentences:
        # Multiple restructuring techniques, applied in sequence
        sentence = restructure_complex_sentences(sentence)
        sentence = vary_sentence_beginnings(sentence)
        sentence = change_voice_patterns(sentence)
        sentence = rearrange_clauses(sentence)
        sentence = vary_connecting_words(sentence)
        restructured.append(sentence)

    return ' '.join(restructured).replace('..', '.')

def restructure_complex_sentences(sentence):
    """Break down and restructure complex sentences"""
    # Pattern 1: "X, which Y, Z" -> "X does Z. This is because Y."
    which_pattern = r'(.+?),\s*which\s+(.+?),\s*(.+)'
    match = re.search(which_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.4:
        main_part = match.group(1).strip()
        which_part = match.group(2).strip()
        end_part = match.group(3).strip().rstrip('.')
        return f"{main_part} {end_part}. This happens because {which_part}."

    # Pattern 2: "Due to X, Y happens" -> "Y happens. The reason is X."
    due_pattern = r'due\s+to\s+(.+?),\s*(.+)'
    match = re.search(due_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.4:
        reason = match.group(1).strip()
        result = match.group(2).strip().rstrip('.')
        return f"{result.capitalize()}. The reason is {reason}."

    # Pattern 3: "Although X, Y" -> "Y, even though X" or "Y. However, X"
    although_pattern = r'although\s+(.+?),\s*(.+)'
    match = re.search(although_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.4:
        condition = match.group(1).strip()
        main_clause = match.group(2).strip().rstrip('.')
        if random.random() < 0.5:
            return f"{main_clause.capitalize()}, even though {condition}."
        else:
            return f"{main_clause.capitalize()}. However, {condition}."

    # Pattern 4: Long sentences with multiple "and" -> Split them
    if sentence.count(' and ') >= 2 and len(sentence.split()) > 20:
        parts = sentence.split(' and ')
        if len(parts) >= 3:
            # First two parts become one sentence; the rest becomes a second one
            first_part = f"{parts[0]} and {parts[1]}."
            remaining = ' and '.join(parts[2:]).rstrip('.')
            return f"{first_part} Additionally, {remaining}."

    return sentence
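# Example (probabilistic - each pattern fires with probability 0.4):
#   "Due to staff shortages, shows were cancelled."
# may become
#   "Shows were cancelled. The reason is staff shortages."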
def vary_sentence_beginnings(sentence):
    """Change how sentences start to avoid AI patterns"""
    sentence = sentence.strip()

    # Don't change if it's already starting with something interesting
    interesting_starts = ['However', 'Therefore', 'Additionally', 'Meanwhile', 'Furthermore',
                          'Moreover', 'Consequently', 'Subsequently', 'Nevertheless']
    if any(sentence.startswith(start) for start in interesting_starts):
        return sentence

    # Pattern 1: "The organization uses X" -> "JM Shows uses X" or "The company uses X"
    if sentence.lower().startswith('the organization'):
        alternatives = ['JM Shows', 'The company', 'This organization', 'The business']
        chosen = random.choice(alternatives)
        sentence = re.sub(r'^the organization\b', chosen, sentence, flags=re.IGNORECASE)

    # Pattern 2: "This enables" -> Various alternatives
    if sentence.lower().startswith('this enables'):
        alternatives = [
            'This allows', 'This helps', 'This makes it possible for',
            'As a result,', 'Because of this,'
        ]
        chosen = random.choice(alternatives)
        sentence = re.sub(r'^this enables?\b', chosen, sentence, flags=re.IGNORECASE)

    # Pattern 3: "it is important" -> alternatives like "it is crucial" / "this is key"
    if 'it is important' in sentence.lower():
        alternatives = [
            'it is crucial', 'it is essential', 'it matters',
            'this is vital', 'this is key'
        ]
        chosen = random.choice(alternatives)
        sentence = re.sub(r'\bit is important\b', chosen, sentence, flags=re.IGNORECASE)

    # Pattern 4: Add transitional phrases sometimes
    if random.random() < 0.3 and not sentence.lower().startswith(('the', 'this', 'it', 'a')):
        transitions = [
            'In practice, ', 'For example, ', 'In this case, ', 'Typically, ',
            'Usually, ', 'Often, ', 'Generally, '
        ]
        if not any(sentence.startswith(t.strip()) for t in transitions):
            chosen_transition = random.choice(transitions)
            # Lowercase only the first character so proper nouns are preserved
            sentence = chosen_transition + sentence[:1].lower() + sentence[1:]

    return sentence
def change_voice_patterns(sentence):
    """Change between active and passive voice naturally"""
    # Passive to active transformations
    passive_patterns = [
        # "X is used by Y" -> "Y uses X"
        (r'(\w+(?:\s+\w+)*)\s+is\s+used\s+by\s+(\w+(?:\s+\w+)*)', r'\2 uses \1'),
        # "X is managed by Y" -> "Y manages X"
        (r'(\w+(?:\s+\w+)*)\s+is\s+managed\s+by\s+(\w+(?:\s+\w+)*)', r'\2 manages \1'),
        # "X is implemented by Y" -> "Y implements X"
        (r'(\w+(?:\s+\w+)*)\s+is\s+implemented\s+by\s+(\w+(?:\s+\w+)*)', r'\2 implements \1'),
        # "X are accessed by Y" -> "Y accesses X"
        (r'(\w+(?:\s+\w+)*)\s+are\s+accessed\s+by\s+(\w+(?:\s+\w+)*)', r'\2 accesses \1'),
    ]

    # Draw once so the branch probabilities match the comments:
    # passive->active 50% of the time, active->passive 25% of the time.
    roll = random.random()
    if roll < 0.5:
        for pattern, replacement in passive_patterns:
            if re.search(pattern, sentence, re.IGNORECASE):
                sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
                break
    # Active to passive transformations (less common, 25% chance)
    elif roll < 0.75:
        active_patterns = [
            # "Y uses X" -> "X is used by Y"
            (r'(\w+(?:\s+\w+)*)\s+uses\s+(\w+(?:\s+\w+)*)', r'\2 is used by \1'),
            # "Y manages X" -> "X is managed by Y"
            (r'(\w+(?:\s+\w+)*)\s+manages\s+(\w+(?:\s+\w+)*)', r'\2 is managed by \1'),
        ]
        for pattern, replacement in active_patterns:
            if re.search(pattern, sentence, re.IGNORECASE):
                sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
                break

    return sentence
def rearrange_clauses(sentence):
    """Rearrange clauses within sentences"""
    def _decapitalize(clause):
        # Lowercase only the first character so proper nouns survive the move
        return clause[:1].lower() + clause[1:]

    # Pattern 1: "X because Y" -> "Because Y, X"
    because_pattern = r'(.+?)\s+because\s+(.+)'
    match = re.search(because_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.3:
        main_clause = match.group(1).strip()
        because_clause = match.group(2).strip().rstrip('.')
        return f"Because {because_clause}, {_decapitalize(main_clause)}."

    # Pattern 2: "X when Y" -> "When Y, X"
    when_pattern = r'(.+?)\s+when\s+(.+)'
    match = re.search(when_pattern, sentence, re.IGNORECASE)
    if match and random.random() < 0.3:
        main_clause = match.group(1).strip()
        when_clause = match.group(2).strip().rstrip('.')
        return f"When {when_clause}, {_decapitalize(main_clause)}."

    # Pattern 3: Move prepositional phrases
    # "The system stores data in the cloud" -> "In the cloud, the system stores data"
    prep_phrases = ['in the cloud', 'on the network', 'within the organization',
                    'through the system', 'via the software', 'using the hardware']
    for phrase in prep_phrases:
        if phrase in sentence.lower() and random.random() < 0.25:
            # Find the phrase and move it to the beginning
            phrase_pattern = re.escape(phrase)
            if re.search(r'\b' + phrase_pattern + r'\b', sentence, re.IGNORECASE):
                # Remove from current position
                new_sentence = re.sub(r',?\s*\b' + phrase_pattern + r'\b,?', '', sentence, flags=re.IGNORECASE)
                # Add to beginning
                new_sentence = f"{phrase.capitalize()}, {_decapitalize(new_sentence.strip())}"
                return new_sentence

    return sentence
def vary_connecting_words(sentence):
    """Replace connecting words with alternatives"""
    connectors = {
        'and': ['plus', 'as well as', 'along with', 'together with'],
        'but': ['however', 'yet', 'though', 'although'],
        'so': ['therefore', 'thus', 'as a result', 'consequently'],
        'also': ['additionally', 'furthermore', 'moreover', 'as well'],
        'however': ['but', 'yet', 'though', 'still'],
        'therefore': ['so', 'thus', 'as a result', 'consequently'],
        'because': ['since', 'as', 'due to the fact that'],
        'while': ['whereas', 'although', 'though'],
    }

    # Replace connectors (30% chance per connector)
    for original, alternatives in connectors.items():
        if f' {original} ' in sentence.lower() and random.random() < 0.3:
            chosen_alternative = random.choice(alternatives)
            sentence = re.sub(r'\b' + re.escape(original) + r'\b', chosen_alternative,
                              sentence, flags=re.IGNORECASE, count=1)
            break  # Only replace one per sentence
    return sentence
def add_human_imperfections(text, imperfection_rate=0.25):
    """Add subtle human-like imperfections and natural variations"""
    if len(text.split()) < 10:
        return text

    # First apply advanced sentence restructuring
    text = advanced_sentence_restructuring(text)

    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if not sentences:
        return text

    modified_sentences = []
    for sentence in sentences:
        # Modify each sentence with probability imperfection_rate, so a
        # higher rate means more human-like variation; the rest pass through.
        if random.random() > imperfection_rate:
            modified_sentences.append(sentence)
            continue

        # Choose multiple techniques per sentence (more natural)
        techniques = [
            'slight_grammar_variation',
            'word_variation',
            'contraction_change',
            'sentence_connector_variation',
            'add_filler_words',
            'vary_punctuation'
        ]
        # Apply 1-2 techniques per modified sentence
        num_techniques = random.choices([1, 2], weights=[0.7, 0.3])[0]
        selected_techniques = random.sample(techniques, num_techniques)

        for technique in selected_techniques:
            if technique == 'slight_grammar_variation':
                sentence = add_subtle_grammar_variations(sentence)
            elif technique == 'word_variation':
                sentence = vary_word_choice_extensively(sentence)
            elif technique == 'contraction_change':
                sentence = modify_contractions_naturally(sentence)
            elif technique == 'sentence_connector_variation':
                sentence = vary_sentence_connectors(sentence)
            elif technique == 'add_filler_words':
                sentence = add_natural_filler_words(sentence)
            elif technique == 'vary_punctuation':
                sentence = vary_punctuation_style(sentence)

        modified_sentences.append(sentence)

    result = ' '.join(modified_sentences).replace('..', '.')

    # Final pass: add some natural flow variations
    result = add_natural_flow_variations(result)

    return result
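# Pipeline sketch: restructuring runs first over the whole text, then each
# sentence is (with probability imperfection_rate) passed through 1-2 of the
# variation techniques, and a final flow pass merges or splits sentences:
#   humanized = add_human_imperfections("Some paragraph of ten plus words ...")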
def add_subtle_grammar_variations(sentence):
    """Add realistic grammar variations humans use"""
    # Acceptable variations that humans commonly use
    variations = [
        # "data is" vs "data are" - both acceptable
        (r'\bdata are\b', 'data is', 0.4),
        # "different from" vs "different to" - both used
        (r'\bdifferent from\b', 'different to', 0.3),
        # "compared to" vs "compared with"
        (r'\bcompared with\b', 'compared to', 0.3),
        # "focused on" vs "focused around"
        (r'\bfocused on\b', 'focused around', 0.2),
        # "try to" vs "try and" - both acceptable
        (r'\btry to\b', 'try and', 0.3),
        # Article variations
        (r'\bthe software\b', 'software', 0.2),
        (r'\bthe hardware\b', 'hardware', 0.2),
        (r'\bthe data\b', 'data', 0.2),
    ]
    for pattern, replacement, probability in variations:
        if re.search(pattern, sentence, re.IGNORECASE) and random.random() < probability:
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
            break  # Only one variation per sentence
    return sentence
def vary_word_choice_extensively(sentence):
    """More extensive word variations that sound natural"""
    # Extended word variations - more natural alternatives
    word_variations = {
        # Formal to casual
        'utilize': ['use', 'employ', 'apply'],
        'demonstrate': ['show', 'display', 'reveal'],
        'facilitate': ['help', 'enable', 'support'],
        'implement': ['use', 'apply', 'put in place'],
        'obtain': ['get', 'acquire', 'receive'],
        'purchase': ['buy', 'get', 'acquire'],
        'commence': ['start', 'begin', 'kick off'],
        'assist': ['help', 'support', 'aid'],
        'require': ['need', 'call for', 'demand'],
        'provide': ['give', 'offer', 'supply'],
        'ensure': ['make sure', 'guarantee', 'see to it that'],
        'maintain': ['keep', 'preserve', 'uphold'],
        'establish': ['set up', 'create', 'build'],
        'subsequently': ['then', 'after that', 'next'],
        'accordingly': ['so', 'therefore', 'as a result'],
        'furthermore': ['also', 'plus', 'on top of that'],
        'nevertheless': ['but', 'however', 'still'],
        'approximately': ['about', 'around', 'roughly'],
        'numerous': ['many', 'lots of', 'plenty of'],
        'sufficient': ['enough', 'adequate', 'plenty'],
        'essential': ['vital', 'key', 'crucial'],
        'significant': ['major', 'important', 'big'],
        'substantial': ['large', 'considerable', 'major'],
        # Casual to varied
        'really': ['very', 'quite', 'pretty'],
        'big': ['large', 'major', 'significant'],
        'small': ['little', 'minor', 'compact'],
        'good': ['effective', 'useful', 'beneficial'],
        'bad': ['poor', 'ineffective', 'problematic'],
        'fast': ['quick', 'rapid', 'speedy'],
        'slow': ['gradual', 'delayed', 'unhurried'],
        # Technical variations
        'system': ['platform', 'setup', 'framework'],
        'software': ['application', 'program', 'tool'],
        'hardware': ['equipment', 'devices', 'machinery'],
        'network': ['connection', 'system', 'infrastructure'],
        'database': ['data store', 'information system', 'repository'],
        'security': ['protection', 'safety', 'safeguarding'],
        'organization': ['company', 'business', 'firm'],
        'employee': ['staff member', 'worker', 'team member'],
        'customer': ['client', 'user', 'consumer'],
        'manager': ['supervisor', 'leader', 'head'],
    }

    # Apply variations (40% chance per word)
    for formal, alternatives in word_variations.items():
        if f' {formal} ' in sentence.lower() and random.random() < 0.4:
            chosen_alternative = random.choice(alternatives)
            sentence = re.sub(r'\b' + re.escape(formal) + r'\b', chosen_alternative,
                              sentence, flags=re.IGNORECASE, count=1)
            break  # Only one substitution per sentence
    return sentence
def modify_contractions_naturally(sentence):
    """Natural contraction usage like humans"""
    if random.random() < 0.5:
        # Add contractions (more casual/natural)
        contractions = {
            'it is': "it's",
            'they are': "they're",
            'there is': "there's",
            'that is': "that's",
            'cannot': "can't",
            'do not': "don't",
            'does not': "doesn't",
            'will not': "won't",
            'would not': "wouldn't",
            'should not': "shouldn't",
            'could not': "couldn't",
            'have not': "haven't",
            'has not': "hasn't",
            'had not': "hadn't",
            'are not': "aren't",
            'is not': "isn't",
            'was not': "wasn't",
            'were not': "weren't"
        }
        for full, contracted in contractions.items():
            if full in sentence.lower() and random.random() < 0.6:
                sentence = re.sub(r'\b' + re.escape(full) + r'\b', contracted, sentence, flags=re.IGNORECASE)
                break
    else:
        # Remove contractions (more formal when needed)
        expansions = {
            "it's": 'it is',
            "they're": 'they are',
            "there's": 'there is',
            "that's": 'that is',
            "can't": 'cannot',
            "don't": 'do not',
            "doesn't": 'does not',
            "won't": 'will not',
            "wouldn't": 'would not',
            "shouldn't": 'should not',
            "couldn't": 'could not',
            "haven't": 'have not',
            "hasn't": 'has not',
            "hadn't": 'had not',
            "aren't": 'are not',
            "isn't": 'is not',
            "wasn't": 'was not',
            "weren't": 'were not'
        }
        for contracted, full in expansions.items():
            if contracted in sentence and random.random() < 0.4:
                sentence = sentence.replace(contracted, full)
                break
    return sentence
def vary_sentence_connectors(sentence):
    """Add variety to how sentences connect ideas"""
    # Add sentence starters that humans use
    if random.random() < 0.3:
        starters = [
            'In fact, ', 'Actually, ', 'Basically, ', 'Essentially, ',
            'In practice, ', 'For instance, ', 'For example, ', 'Specifically, ',
            'In particular, ', 'More importantly, ', 'What\'s more, ', 'Plus, ',
            'On top of that, ', 'Besides that, ', 'Apart from that, '
        ]
        # Don't add if sentence already starts with a connector
        if not any(sentence.lower().startswith(word.lower()) for word in
                   ['the', 'this', 'it', 'however', 'therefore', 'furthermore', 'moreover']):
            if random.random() < 0.4:
                chosen_starter = random.choice(starters)
                # Lowercase only the first character so proper nouns are preserved
                sentence = chosen_starter + sentence[:1].lower() + sentence[1:]

    # Replace mid-sentence connectors
    connector_replacements = {
        ' and ': [' plus ', ' as well as ', ' along with '],
        ' but ': [' however ', ' though ', ' yet '],
        ' so ': [' therefore ', ' thus ', ' as a result '],
        ' because ': [' since ', ' as ', ' given that '],
        ' although ': [' while ', ' even though ', ' despite the fact that '],
        ' therefore ': [' so ', ' thus ', ' as a result '],
        ' however ': [' but ', ' though ', ' yet '],
        ' moreover ': [' also ', ' plus ', ' on top of that '],
        ' furthermore ': [' also ', ' what\'s more ', ' besides '],
    }
    for original, alternatives in connector_replacements.items():
        if original in sentence.lower() and random.random() < 0.3:
            chosen_alternative = random.choice(alternatives)
            sentence = re.sub(re.escape(original), chosen_alternative, sentence, flags=re.IGNORECASE, count=1)
            break
    return sentence
def add_natural_filler_words(sentence):
    """Add natural filler words/phrases that humans use"""
    # Natural emphasis and filler phrases
    fillers = {
        'important': ['really important', 'quite important', 'very important'],
        'useful': ['really useful', 'quite useful', 'very useful'],
        'effective': ['highly effective', 'really effective', 'quite effective'],
        'necessary': ['absolutely necessary', 'really necessary', 'quite necessary'],
        'helps': ['actually helps', 'really helps', 'definitely helps'],
        'allows': ['actually allows', 'really allows', 'basically allows'],
        'enables': ['actually enables', 'really enables', 'effectively enables'],
        'provides': ['actually provides', 'really provides', 'effectively provides'],
    }

    # Add natural hedging/softening
    hedges = {
        'is': ['tends to be', 'is generally', 'is usually'],
        'are': ['tend to be', 'are generally', 'are usually'],
        'will': ['will likely', 'will probably', 'will generally'],
        'can': ['can often', 'can usually', 'can typically'],
        'must': ['should really', 'needs to', 'ought to'],
    }

    # Apply fillers (20% chance)
    if random.random() < 0.2:
        for word, alternatives in fillers.items():
            if f' {word} ' in sentence.lower() and random.random() < 0.5:
                chosen_alternative = random.choice(alternatives)
                sentence = re.sub(r'\b' + re.escape(word) + r'\b', chosen_alternative,
                                  sentence, flags=re.IGNORECASE, count=1)
                break

    # Apply hedges (15% chance)
    if random.random() < 0.15:
        for word, alternatives in hedges.items():
            if f' {word} ' in sentence.lower() and random.random() < 0.4:
                chosen_alternative = random.choice(alternatives)
                sentence = re.sub(r'\b' + re.escape(word) + r'\b', chosen_alternative,
                                  sentence, flags=re.IGNORECASE, count=1)
                break

    return sentence
def vary_punctuation_style(sentence):
    """Vary punctuation style naturally"""
    # Sometimes use semicolons instead of periods for related ideas
    if ' and ' in sentence and random.random() < 0.15:
        # "X does A and Y does B" -> "X does A; Y does B"
        parts = sentence.split(' and ', 1)
        if len(parts) == 2 and len(parts[0]) > 10 and len(parts[1]) > 10:
            sentence = f"{parts[0]}; {parts[1]}"

    # Sometimes use em dashes for emphasis
    if ' - ' in sentence and random.random() < 0.3:
        sentence = sentence.replace(' - ', ' — ')

    # Sometimes use colons for explanations
    if 'because' in sentence.lower() and random.random() < 0.2:
        # "X happens because Y" -> "X happens: Y"
        because_match = re.search(r'(.+?)\s+because\s+(.+)', sentence, re.IGNORECASE)
        if because_match:
            main_part = because_match.group(1).strip()
            reason = because_match.group(2).strip().rstrip('.')
            sentence = f"{main_part}: {reason}."
    return sentence
def add_natural_flow_variations(text):
    """Add natural flow variations across the entire text"""
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if len(sentences) < 2:
        return text

    # Vary sentence lengths naturally
    modified_sentences = []
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue  # already consumed by a previous merge

        # Sometimes combine short sentences
        if (i < len(sentences) - 1 and
                len(sentence.split()) < 8 and
                len(sentences[i + 1].split()) < 8 and
                random.random() < 0.3):
            next_sentence = sentences[i + 1].rstrip('.')
            next_lowered = next_sentence[:1].lower() + next_sentence[1:]
            combined_sentences = [
                f"{sentence.rstrip('.')} and {next_lowered}.",
                f"{sentence.rstrip('.')}, while {next_lowered}.",
                f"{sentence.rstrip('.')}; {next_lowered}.",
            ]
            chosen = random.choice(combined_sentences)
            modified_sentences.append(chosen)
            # Mark the next sentence as consumed since we combined it
            sentences[i + 1] = ""
        # Sometimes split very long sentences
        elif len(sentence.split()) > 30 and random.random() < 0.4:
            # Look for natural break points (split case-insensitively so the
            # check against sentence.lower() and the split always agree)
            break_points = [', and ', ', but ', ', however ', ', therefore ']
            for break_point in break_points:
                if break_point in sentence.lower():
                    parts = re.split(re.escape(break_point), sentence, maxsplit=1, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        # strip(' ,') drops the leading comma too, so ', and '
                        # becomes the connector "And" rather than ", and"
                        connector = break_point.strip(' ,').capitalize()
                        split_version = f"{parts[0].strip()}. {connector}, {parts[1].strip()}"
                        modified_sentences.append(split_version)
                        break
            else:
                modified_sentences.append(sentence)
        else:
            modified_sentences.append(sentence)

    return ' '.join(modified_sentences).replace('..', '.')
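# Example: two short sentences such as "Email is fast." and "It is cheap."
# may be merged into "Email is fast and it is cheap." (30% chance per pair),
# while 30+ word sentences are split at ", and" / ", but" style break points.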
def conservative_rewrite(text, tokenizer, model):
    """Enhanced rewrite with advanced humanization"""
    original = text.strip()

    # Don't rewrite very short text
    if len(original.split()) < 8:
        # Still apply light humanization to short text
        return add_human_imperfections(original, imperfection_rate=0.15)

    # Pre-check: if original already has issues, be extra careful
    original_coherent, _ = semantic_coherence_check(original)
    original_accurate, _ = factual_accuracy_check(original)

    if not original_coherent or not original_accurate:
        print("⚠️ Original has issues, applying safe humanization only")
        # Just apply humanization without AI rewriting
        result = add_human_imperfections(original, imperfection_rate=0.2)
        return result.strip()

    # Try multiple approaches with different prompts for more variation
    prompts = [
        f"Rewrite naturally: {original}",
        f"Make this sound more human: {original}",
        f"Improve readability: {original}",
        f"Rephrase this text: {original}",
        f"Make this clearer and more natural: {original}"
    ]

    for i, prompt in enumerate(prompts):
        try:
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=300).to(model.device)
            outputs = model.generate(
                inputs.input_ids,
                max_new_tokens=len(original.split()) + 20,  # More room for natural expansion
                do_sample=True,
                temperature=0.08 + (i * 0.06),  # Gradually increase creativity
                top_p=0.5 + (i * 0.1),          # More variation in later attempts
                repetition_penalty=1.08,
                pad_token_id=tokenizer.eos_token_id
            )
            result = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Clean result more thoroughly
            if ":" in result:
                result = result.split(":", 1)[-1].strip()

            # Remove prompt artifacts
            result = re.sub(r'^(rewrite|make|improve|rephrase|naturally).*?:\s*', '', result, flags=re.IGNORECASE)
            result = re.sub(r'^(this|that|the following)\s+', '', result, flags=re.IGNORECASE)
            result = re.sub(r'\s+', ' ', result).strip()

            # Quality check
            is_good, reason = quality_check_rewrite(original, result)
            if is_good:
                print(f"✅ AI rewrite accepted (attempt {i+1})")
                # ADVANCED HUMANIZATION - This is the key enhancement
                humanized_result = add_human_imperfections(result, imperfection_rate=0.3)
                print("🔧 Applied advanced humanization")
                return humanized_result
            else:
                print(f"❌ Attempt {i+1} rejected: {reason}")
                continue

        except Exception as e:
            print(f"⚠️ Error in attempt {i+1}: {e}")
            continue

    # If all AI attempts failed, just apply advanced humanization to original
    print("⚠️ All AI attempts failed, applying advanced humanization to original")
    return add_human_imperfections(original, imperfection_rate=0.25)
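# Decoding schedule (as implemented above): attempt i uses
#   temperature = 0.08 + 0.06*i   (0.08 -> 0.32)
#   top_p       = 0.5  + 0.1*i    (0.5  -> 0.9)
# so early attempts stay close to the source and later ones take more risks;
# the first candidate that passes quality_check_rewrite() wins.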
def smart_paragraph_grouping(content_items):
    """Group fragmented text into proper paragraphs"""
    grouped = []
    current_group = []

    for item in content_items:
        text = item['text'].strip()
        if not text:
            continue

        # If it's a heading, save current group and start fresh
        if is_real_heading(text):
            if current_group:
                # Join the fragments into a paragraph
                paragraph_text = ' '.join(current_group)
                if len(paragraph_text.split()) > 5:  # Only if substantial
                    grouped.append({
                        'text': paragraph_text,
                        'type': 'paragraph'
                    })
                current_group = []
            grouped.append({
                'text': text,
                'type': 'heading'
            })
        else:
            # Add to current group
            current_group.append(text)

    # Don't forget the last group
    if current_group:
        paragraph_text = ' '.join(current_group)
        if len(paragraph_text.split()) > 5:
            grouped.append({
                'text': paragraph_text,
                'type': 'paragraph'
            })

    return grouped
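# Example output shape (hypothetical input of PDF line fragments):
#   [{'text': 'Learning Aim A: IT systems', 'type': 'heading'},
#    {'text': 'Staff use shared calendars to plan shows ...', 'type': 'paragraph'}]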
def process_document_safely(input_path, output_path, tokenizer, model):
    """Rewrite the document while keeping the exact original formatting."""
    print(f"📄 Processing with formatting preserved: {input_path}")

    # Open original DOCX directly
    doc = Document(input_path)
    improved_count = 0
    heavily_modified_count = 0

    for para in doc.paragraphs:
        if not para.text.strip():
            continue  # skip empty lines

        # Decide if we rewrite (skip headings, very short text, etc.)
        original_text = para.text
        if len(original_text.split()) < 3:
            continue  # keep tiny text untouched

        # Rewrite
        improved_text = conservative_rewrite(original_text, tokenizer, model)

        # Count improvements
        if improved_text != original_text:
            improved_count += 1
            similarity = difflib.SequenceMatcher(None, original_text.lower(), improved_text.lower()).ratio()
            if similarity < 0.6:
                heavily_modified_count += 1

        # Replace text while keeping all runs/styles
        if para.runs:
            # Capture formatting from the first run BEFORE clearing the
            # paragraph; reading para.runs[0] after clear()+add_run() would
            # just inspect the new run and copy nothing.
            first_run = para.runs[0]
            bold, italic, underline = first_run.bold, first_run.italic, first_run.underline
            font_name, font_size = first_run.font.name, first_run.font.size

            para.clear()  # remove old runs
            run = para.add_run(improved_text)
            run.bold = bold
            run.italic = italic
            run.underline = underline
            run.font.name = font_name
            run.font.size = font_size
        else:
            para.text = improved_text

    # Save document with same formatting but updated text
    doc.save(output_path)
    print(f"✅ Saved rewritten document with original formatting: {output_path}")
    print(f"   Improvements made: {improved_count} ({heavily_modified_count} major)")
def extract_docx_content(input_path):
    """Extract content from DOCX"""
    doc = Document(input_path)
    content = []
    for para in doc.paragraphs:
        if para.text.strip():
            content.append({
                'text': para.text.strip(),
                'font_size': 12,
                'is_bold': any(run.bold for run in para.runs)
            })
    return content
def extract_pdf_content(input_path):
    """Extract content from PDF"""
    doc = fitz.open(input_path)
    content = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")

        for block in blocks["blocks"]:
            if "lines" in block:
                for line in block["lines"]:
                    line_text = ""
                    font_size = 12
                    is_bold = False

                    for span in line["spans"]:
                        line_text += span["text"]
                        font_size = max(font_size, span["size"])
                        if "bold" in span["font"].lower():
                            is_bold = True

                    if line_text.strip():
                        content.append({
                            'text': line_text.strip(),
                            'font_size': font_size,
                            'is_bold': is_bold
                        })

    doc.close()
    return content
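# PyMuPDF's page.get_text("dict") nests blocks -> lines -> spans; each span
# carries its own font name and size, which is why the bold flag and the
# maximum font size are accumulated per line above.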
def generate_clean_docx(content, output_path, original_path=None):
    """Generate DOCX output with the exact same formatting as the input."""
    if not original_path or not os.path.exists(original_path):
        print("⚠️ Original file not provided or not found, falling back to basic formatting.")
        # Fall back to plain heading/paragraph output
        doc = Document()
        for item in content:
            if item['type'] == 'heading':
                doc.add_heading(item['text'], level=1)
            else:
                doc.add_paragraph(item['text'])
        doc.save(output_path)
        return

    # Load the original document
    original_doc = Document(original_path)

    # Create a new document with the same structure
    new_doc = Document()

    # Keep a counter to match rewritten paragraphs to original ones
    content_index = 0

    for para in original_doc.paragraphs:
        if not para.text.strip():
            # Preserve empty paragraphs
            new_doc.add_paragraph("")
            continue

        # Get the rewritten text for this paragraph
        if content_index < len(content):
            rewritten_text = content[content_index]['text']
        else:
            rewritten_text = para.text  # fallback

        # Create a new paragraph in the new doc with the same style
        new_para = new_doc.add_paragraph()
        new_para.style = para.style
        new_para.alignment = para.alignment

        # Preserve run formatting, but replace text
        if para.runs:
            # If the original had multiple runs, collapse the rewritten text
            # into a single run (simpler) and copy the first run's formatting
            run = new_para.add_run(rewritten_text)
            run.font.name = para.runs[0].font.name
            run.font.size = para.runs[0].font.size
            run.bold = para.runs[0].bold
            run.italic = para.runs[0].italic
            run.underline = para.runs[0].underline
        else:
            # Paragraph without runs (rare)
            new_para.add_run(rewritten_text)

        content_index += 1

    # Save final doc
    new_doc.save(output_path)
def generate_clean_pdf(content, output_path):
    """Generate clean PDF output"""
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
        from reportlab.lib.units import inch

        doc = SimpleDocTemplate(output_path, pagesize=letter, topMargin=1*inch)
        styles = getSampleStyleSheet()

        heading_style = ParagraphStyle(
            'CleanHeading',
            parent=styles['Heading1'],
            fontSize=14,
            spaceAfter=12,
            fontName='Helvetica-Bold'
        )
        body_style = ParagraphStyle(
            'CleanBody',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=8,
            fontName='Helvetica',
            alignment=0  # Left aligned
        )

        story = []
        for item in content:
            if item['type'] == 'heading':
                story.append(Paragraph(item['text'], heading_style))
                story.append(Spacer(1, 6))
            else:
                story.append(Paragraph(item['text'], body_style))
                story.append(Spacer(1, 4))

        doc.build(story)
    except ImportError:
        print("⚠️ ReportLab not installed, creating text file instead")
        with open(output_path.replace('.pdf', '.txt'), 'w', encoding='utf-8') as f:
            for item in content:
                if item['type'] == 'heading':
                    f.write(f"\n{item['text']}\n{'='*len(item['text'])}\n\n")
                else:
                    f.write(f"{item['text']}\n\n")
def main():
    if len(sys.argv) < 3:
        print("Usage: python rewriter_fixed.py <input_file_or_text> <output_file>")
        print("Examples:")
        print("  python rewriter_fixed.py input.docx output.docx")
        print("  python rewriter_fixed.py input.pdf output.pdf")
        print("  python rewriter_fixed.py \"This is my raw text\" output.txt")
        return

    input_arg = sys.argv[1]
    output_path = sys.argv[2]

    print("Loading model for enhanced human-style processing...")
    tokenizer, model, device = load_model()

    # --- If it's a file ---
    if os.path.exists(input_arg):
        ext = os.path.splitext(input_arg)[1].lower()

        if ext == ".docx":
            process_document_safely(input_arg, output_path, tokenizer, model)
        elif ext == ".pdf":
            # Extract PDF -> rewrite -> save PDF
            content = extract_pdf_content(input_arg)
            grouped = smart_paragraph_grouping(content)
            for item in grouped:
                if len(item['text'].split()) >= 3:
                    item['text'] = conservative_rewrite(item['text'], tokenizer, model)
            generate_clean_pdf(grouped, output_path)
        else:
            print(f"Unsupported file format: {ext}")
            return
    # --- If it's raw text ---
    else:
        raw_text = input_arg.strip()
        if not raw_text:
            print("No text provided.")
            return
        humanized = conservative_rewrite(raw_text, tokenizer, model)

        # Save as TXT
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(humanized)
        print(f"Humanized text saved to {output_path}")

    print("Enhanced human-style processing complete!")
def rewrite_text(original_text: str) -> str:
    """Entry point for rule-based humanization (no model required)."""
    # Add imperfections
    humanized = add_human_imperfections(original_text)

    # Run quality check
    ok, reason = quality_check_rewrite(original_text, humanized)
    if not ok:
        raise ValueError(f"Rewrite failed quality check: {reason}")
    return humanized
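# Usage sketch (hypothetical caller, e.g. a web handler):
#   try:
#       clean = rewrite_text("JM Shows uses shared calendars to plan events ...")
#   except ValueError:
#       clean = None  # fall back to the original text if the gate rejects it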
if __name__ == "__main__":
    main()