EdysorEdutech commited on
Commit
8660488
·
verified ·
1 Parent(s): 61dd362

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +530 -697
app.py CHANGED
@@ -26,34 +26,24 @@ class HumanLikeVariations:
26
  """Add human-like variations and intentional imperfections"""
27
 
28
  def __init__(self):
29
- # Common human writing patterns - MASSIVELY EXPANDED for Originality AI
30
  self.casual_transitions = [
31
- "So, ", "Well, ", "Now, ", "Actually, ", "Basically, ",
32
- "You know, ", "I mean, ", "Thing is, ", "Honestly, ",
33
- "Look, ", "Listen, ", "See, ", "Okay, ", "Right, ",
34
- "Anyway, ", "Besides, ", "Plus, ", "Also, ", "Oh, ",
35
- "Hey, ", "Alright, ", "Sure, ", "Fine, ", "Obviously, ",
36
- "Clearly, ", "Seriously, ", "Literally, ", "Frankly, ",
37
- "To be honest, ", "Truth is, ", "In fact, ", "Believe it or not, ",
38
- "Here's the thing, ", "Let me tell you, ", "Get this, ",
39
- "Funny thing is, ", "Interestingly, ", "Surprisingly, ",
40
- "Let's be real here, ", "Can we talk about ", "Quick question: ",
41
- "Real talk: ", "Hot take: ", "Unpopular opinion: ", "Fun fact: ",
42
- "Pro tip: ", "Side note: ", "Random thought: ", "Food for thought: ",
43
- "Just saying, ", "Not gonna lie, ", "For what it's worth, ",
44
- "If you ask me, ", "Between you and me, ", "Here's my take: ",
45
- "Let's face it, ", "No kidding, ", "Seriously though, ",
46
- "But wait, ", "Hold on, ", "Check this out: ", "Guess what? ",
47
- "You know what? ", "Tell you what, ", "I'll be honest, ",
48
- "Here's the deal: ", "Bottom line: ", "Long story short, ",
49
- "Point is, ", "Fact is, ", "Reality is, ", "Thing is though, ",
50
- "What's more, ", "Better yet, ", "Even better, ", "Even worse, ",
51
- "Funny enough, ", "Weird thing is, ", "Strange but true: ",
52
- "Believe me when I say, ", "Trust me on this, ", "I kid you not, ",
53
- "No joke, ", "For real though, ", "I'm telling you, ",
54
- "And get this - ", "But here's the kicker: ", "Plot twist: ",
55
- "Spoiler alert: ", "News flash: ", "Reality check: ",
56
- "Let me break it down: ", "Here's what happened: ", "So here's the thing: "
57
  ]
58
 
59
  self.filler_phrases = [
@@ -78,12 +68,7 @@ class HumanLikeVariations:
78
  "to the best of my knowledge", "if I'm not mistaken", "correct me if I'm wrong",
79
  "you know what", "here's the deal", "bottom line", "at any rate",
80
  "all in all", "when you think about it", "come to think of it",
81
- "now that I think about it", "if we're being honest", "to be fair",
82
- "like I said", "as I mentioned", "as we discussed", "going back to",
83
- "on that note", "speaking of which", "which reminds me", "by the way",
84
- "just a thought", "just my two cents", "if you ask me", "in my book",
85
- "the way I see it", "from where I'm standing", "in my humble opinion",
86
- "not to mention", "let alone", "much less", "aside from that"
87
  ]
88
 
89
  self.human_connectors = [
@@ -111,46 +96,34 @@ class HumanLikeVariations:
111
  ". And honestly?", ". But seriously,", ". And you know what?",
112
  ", which brings me to", ". This reminds me of", ", speaking of which",
113
  ". Funny enough,", ". Weird thing is,", ". Strange but true:",
114
- ", and I mean", ". I'm not kidding when I say", ", and trust me on this",
115
- ". But here's where it gets interesting:", ". Now here's the crazy part:",
116
- ", and this is important", ", and this is key", ", and this matters because",
117
- ". I'll tell you why:", ". Here's my reasoning:", ". Let me put it this way:",
118
- ", which - by the way -", ", and - no joke -", ", but - and this is crucial -"
119
  ]
120
 
121
- # NEW: Common human typos and variations - EXPANDED
122
  self.common_typos = {
123
- "the": ["teh", "th", "hte", "thhe"],
124
- "and": ["adn", "nad", "an", "andd"],
125
- "that": ["taht", "htat", "tha", "thatt"],
126
- "with": ["wiht", "wtih", "iwth", "withh"],
127
- "have": ["ahve", "hvae", "hav", "havve"],
128
- "from": ["form", "fro", "frmo", "fromm"],
129
- "they": ["tehy", "thye", "htey", "tehyy"],
130
- "which": ["whihc", "wich", "whcih", "whichh"],
131
- "their": ["thier", "theri", "tehir", "theirr"],
132
- "would": ["woudl", "wuold", "woul", "wouldd"],
133
- "there": ["tehre", "theer", "ther", "theree"],
134
- "could": ["coudl", "cuold", "coud", "couldd"],
135
- "people": ["poeple", "peopel", "pepole", "peolpe"],
136
- "through": ["thorugh", "throught", "trhough", "thoruhg"],
137
- "because": ["becuase", "becasue", "beacuse", "becausee"],
138
- "before": ["beofre", "befroe", "befor", "beforee"],
139
- "different": ["differnt", "differnet", "diferent", "differrent"],
140
- "between": ["bewteen", "betwen", "betewen", "beetween"],
141
- "important": ["improtant", "importnat", "importan", "importantt"],
142
- "information": ["infromation", "informaiton", "informaton", "informatoin"],
143
- "really": ["realy", "raelly", "realyl", "reallyy"],
144
- "something": ["someting", "somethign", "sometihng", "somethhing"],
145
- "actually": ["actualy", "acutally", "atcually", "actuallyy"],
146
- "probably": ["probaly", "probalby", "probabily", "probablyy"],
147
- "definitely": ["definately", "definitly", "definatly", "defintely"],
148
- "necessary": ["neccessary", "neccesary", "necessery", "nesessary"],
149
- "government": ["goverment", "governmnet", "govermnet", "govenrment"],
150
- "business": ["buisness", "busines", "businness", "bussiness"]
151
  }
152
 
153
- # NEW: Human-like sentence starters for variety - MASSIVELY EXPANDED
154
  self.varied_starters = [
155
  "When it comes to", "As for", "Regarding", "In terms of",
156
  "With respect to", "Concerning", "Speaking of", "About",
@@ -167,81 +140,16 @@ class HumanLikeVariations:
167
  "You might wonder", "You might ask", "You may think",
168
  "Some people say", "Many believe", "It's often said",
169
  "Research shows", "Studies indicate", "Evidence suggests",
170
- "Experience tells us", "History shows", "Time has shown",
171
- "I've noticed that", "I've found that", "I've seen that",
172
- "In my experience,", "From what I understand,", "As I see it,",
173
- "Let me be clear:", "Let me clarify:", "To be specific:",
174
- "Here's my thought:", "Here's my view:", "My take is:",
175
- "Can we just acknowledge", "Let's be real about", "Time to admit",
176
- "Nobody talks about how", "Everyone forgets that", "People overlook",
177
- "It's funny how", "It's weird that", "It's strange how",
178
- "Ever notice how", "Ever wonder why", "Ever think about",
179
- "You gotta admit", "You have to agree", "You can't deny",
180
- "I used to think", "I always thought", "I never realized",
181
- "Turns out,", "As it happens,", "Funny story:",
182
- "Real quick -", "Side note -", "Random thought -",
183
- "Not to be that person, but", "Call me crazy, but", "Maybe it's just me, but",
184
- "This might sound weird, but", "This might be controversial, but",
185
- "Hot take:", "Unpopular opinion:", "Controversial thought:",
186
- "Life hack:", "Pro tip:", "Word of advice:",
187
- "Question for you:", "Riddle me this:", "Tell me this:",
188
- "PSA:", "Reminder:", "Don't forget:",
189
- "Breaking news:", "Update:", "FYI:",
190
- "Confession time:", "True story:", "No lie:"
191
- ]
192
-
193
- # NEW: Personal opinions and reactions
194
- self.personal_reactions = [
195
- "And honestly? I'm here for it.",
196
- "Which, like, blew my mind.",
197
- "And I was like, wait, what?",
198
- "Not gonna lie, this surprised me.",
199
- "I mean, who would've thought?",
200
- "This literally changed everything for me.",
201
- "And that's when it hit me.",
202
- "I had to do a double-take on this one.",
203
- "This is where things get wild.",
204
- "Okay, but here's where it gets good.",
205
- "And this is the part that gets me every time.",
206
- "I'm still processing this, to be honest.",
207
- "This keeps me up at night, not gonna lie.",
208
- "Every time I think about this, I'm amazed.",
209
- "This is the kind of thing that makes you go 'hmm'.",
210
- "And yes, I'm totally serious about this.",
211
- "I know, I know, it sounds crazy, but hear me out.",
212
- "This might be my favorite part, actually.",
213
- "And this - this is why I love this topic.",
214
- "Hold up, because this next part is crucial.",
215
- "Brace yourself for this one.",
216
- "You're gonna want to sit down for this.",
217
- "This is the game-changer right here.",
218
- "And this, my friends, is where the magic happens.",
219
- "This right here? This is the good stuff.",
220
- "I could talk about this all day, honestly.",
221
- "This never gets old for me.",
222
- "Every single time, this amazes me.",
223
- "And boom - mind blown.",
224
- "This is what we call a mic drop moment.",
225
- "Can we just take a moment to appreciate this?",
226
- "This deserves way more attention, if you ask me.",
227
- "Why isn't everyone talking about this?",
228
- "This should be common knowledge by now.",
229
- "How is this not a bigger deal?",
230
- "Seriously, why don't they teach this in school?",
231
- "This changed my whole perspective, not even joking.",
232
- "Once you see this, you can't unsee it.",
233
- "This is one of those 'aha!' moments.",
234
- "And that's when everything clicked for me."
235
  ]
236
 
237
  def add_human_touch(self, text):
238
- """Add subtle human-like imperfections - ULTRA NATURAL PATTERNS"""
239
  sentences = text.split('. ')
240
  modified_sentences = []
241
 
242
  # Track what we've used to avoid patterns
243
- used_transitions = set()
244
- used_reactions = set()
245
 
246
  for i, sent in enumerate(sentences):
247
  if not sent.strip():
@@ -250,106 +158,23 @@ class HumanLikeVariations:
250
  # Always use contractions where natural
251
  sent = self.apply_contractions(sent)
252
 
253
- # Add personal voice (25% chance)
254
- if random.random() < 0.25 and i > 0:
255
- # Pick unused reaction
256
- available_reactions = [r for r in self.personal_reactions if r not in used_reactions]
257
- if available_reactions:
258
- reaction = random.choice(available_reactions)
259
- used_reactions.add(reaction)
260
- sent = sent + " " + reaction
261
-
262
- # Add thinking-out-loud elements (20% chance)
263
- if random.random() < 0.20 and len(sent.split()) > 10:
264
- thinking_phrases = [
265
- "- wait, actually, ",
266
- "- hmm, let me think - ",
267
- "- okay so ",
268
- "- oh right, ",
269
- "- correction: ",
270
- "- or wait, maybe ",
271
- "- scratch that, "
272
  ]
273
- pos = random.randint(len(sent.split())//3, 2*len(sent.split())//3)
274
- words = sent.split()
275
- insert_phrase = random.choice(thinking_phrases)
276
- words.insert(pos, insert_phrase)
277
- sent = ' '.join(words)
278
-
279
- # Add natural errors (15% chance)
280
- if random.random() < 0.15 and len(sent.split()) > 15:
281
- sent = self.add_realistic_errors(sent)
282
 
283
  modified_sentences.append(sent)
284
 
285
  return '. '.join(modified_sentences)
286
 
287
- def add_realistic_errors(self, text):
288
- """Add very realistic human errors"""
289
- error_type = random.choice(['typo', 'double_word', 'comma', 'homophone', 'capitalization'])
290
-
291
- if error_type == 'typo':
292
- words = text.split()
293
- if len(words) > 5:
294
- # Pick a common word to typo
295
- for _ in range(3): # Try 3 times to find a typo-able word
296
- idx = random.randint(2, len(words)-2)
297
- word = words[idx].lower().strip('.,!?;:')
298
- if word in self.common_typos:
299
- typo = random.choice(self.common_typos[word])
300
- # Preserve original capitalization and punctuation
301
- if words[idx][0].isupper():
302
- typo = typo[0].upper() + typo[1:]
303
- # Re-add punctuation if any
304
- if words[idx][-1] in '.,!?;:':
305
- typo += words[idx][-1]
306
- words[idx] = typo
307
- break
308
- text = ' '.join(words)
309
-
310
- elif error_type == 'double_word':
311
- words = text.split()
312
- if len(words) > 10:
313
- # Common words that get doubled
314
- double_candidates = ['the', 'a', 'to', 'in', 'on', 'at', 'for', 'and', 'but', 'or']
315
- for _ in range(3):
316
- idx = random.randint(3, len(words)-3)
317
- if words[idx].lower() in double_candidates:
318
- words.insert(idx+1, words[idx].lower())
319
- break
320
- text = ' '.join(words)
321
-
322
- elif error_type == 'comma':
323
- # Remove Oxford comma or add unnecessary comma
324
- if ', and' in text and random.random() < 0.5:
325
- text = text.replace(', and', ' and', 1)
326
- elif ' and' in text and ', and' not in text and random.random() < 0.3:
327
- text = text.replace(' and', ', and', 1)
328
-
329
- elif error_type == 'homophone':
330
- homophones = [
331
- ('your', "you're"), ("you're", 'your'),
332
- ('its', "it's"), ("it's", 'its'),
333
- ('their', 'there'), ('there', 'their'),
334
- ('then', 'than'), ('than', 'then'),
335
- ('to', 'too'), ('effect', 'affect')
336
- ]
337
- for original, replacement in homophones:
338
- if f' {original} ' in text and random.random() < 0.3:
339
- text = text.replace(f' {original} ', f' {replacement} ', 1)
340
- break
341
-
342
- elif error_type == 'capitalization':
343
- # Occasionally fail to capitalize after period
344
- matches = list(re.finditer(r'\. ([a-z])', text))
345
- if matches and random.random() < 0.3:
346
- match = random.choice(matches)
347
- # Don't change if it's a common lowercase starter like "e.g."
348
- if match.group(1) not in ['e', 'i', 'v']:
349
- text = text # Keep lowercase for more natural error
350
-
351
- return text
352
-
353
  def apply_contractions(self, text):
354
  """Apply common contractions - EXPANDED"""
355
  contractions = {
@@ -370,47 +195,75 @@ class HumanLikeVariations:
370
  "we would": "we'd", "they would": "they'd", "could have": "could've",
371
  "should have": "should've", "would have": "would've", "might have": "might've",
372
  "must have": "must've", "there has": "there's", "here is": "here's",
373
- "let us": "let's", "that will": "that'll", "who will": "who'll",
374
- "shall not": "shan't", "need not": "needn't", "dare not": "daren't",
375
- "ought not": "oughtn't", "might not": "mightn't", "must not": "mustn't",
376
- "there are": "there're", "where are": "where're", "what are": "what're",
377
- "how are": "how're", "why are": "why're", "who are": "who're"
378
  }
379
 
380
- # Apply contractions with very high probability (95%)
381
  for full, contr in contractions.items():
382
- if random.random() < 0.95:
383
  text = re.sub(r'\b' + full + r'\b', contr, text, flags=re.IGNORECASE)
384
 
385
  return text
386
 
387
  def add_minor_errors(self, text):
388
- """Add very minor, human-like errors - ULTRA REALISTIC"""
389
- # Occasionally miss Oxford comma (20% chance)
390
- if random.random() < 0.20:
 
391
  text = re.sub(r'(\w+), (\w+), and (\w+)', r'\1, \2 and \3', text)
392
 
393
- # Sometimes use 'which' instead of 'that' (10% chance)
394
- if random.random() < 0.10:
 
395
  matches = re.finditer(r'\b(\w+) that (\w+)', text)
396
- for match in list(matches)[:1]:
397
- if match.group(1).lower() not in ['believe', 'think', 'know', 'say', 'so']:
398
  text = text.replace(match.group(0), f"{match.group(1)} which {match.group(2)}", 1)
399
 
400
- # Add occasional typos (5% chance per sentence)
401
  sentences = text.split('. ')
402
  for i, sent in enumerate(sentences):
403
- if random.random() < 0.05 and len(sent.split()) > 15:
404
- sent = self.add_realistic_errors(sent)
405
- sentences[i] = sent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
  text = '. '.join(sentences)
408
 
409
- # Mix up dash styles occasionally
410
- if random.random() < 0.15:
411
- text = text.replace(' - ', ' ', 1) # em dash
412
- elif random.random() < 0.15:
413
- text = text.replace(' - ', ' — ', 1) # en dash
 
 
 
 
 
 
 
 
 
 
 
414
 
415
  return text
416
 
@@ -426,86 +279,79 @@ class HumanLikeVariations:
426
  # Natural contractions throughout
427
  sentence = self.apply_contractions(sentence)
428
 
429
- # Add varied sentence starters (30% chance)
430
- if random.random() < 0.30 and i > 0:
431
- starter = random.choice(self.varied_starters)
432
- sentence = starter + " " + sentence[0].lower() + sentence[1:] if len(sentence) > 1 else sentence
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
- # Add filler phrases naturally (25% chance)
435
- if random.random() < 0.25 and len(sentence.split()) > 8:
436
  words = sentence.split()
437
- pos = random.randint(2, len(words)-2)
438
- filler = random.choice(self.filler_phrases)
439
- words.insert(pos, filler)
440
- sentence = ' '.join(words)
441
-
442
- # Add personal asides (15% chance)
443
- if random.random() < 0.15:
444
- asides = [
445
- " (yeah, I know)",
446
- " (trust me on this)",
447
- " (I checked)",
448
- " (not even kidding)",
449
- " (seriously)",
450
- " (I mean it)",
451
- " (for real)",
452
- " (no joke)",
453
- " (true story)",
454
- " (I promise)"
455
- ]
456
- aside_pos = random.randint(len(sentence)//3, 2*len(sentence)//3)
457
- sentence = sentence[:aside_pos] + random.choice(asides) + sentence[aside_pos:]
458
-
459
- # Natural sentence combinations (25% chance)
460
- if i < len(sentences) - 1 and random.random() < 0.25:
461
  next_sent = sentences[i+1].strip()
462
- if next_sent and len(sentence.split()) + len(next_sent.split()) < 30:
463
- connector = random.choice(self.human_connectors)
464
- sentence = sentence.rstrip('.') + connector + next_sent[0].lower() + next_sent[1:]
465
- sentences[i+1] = "" # Mark as processed
 
 
 
 
466
 
467
  result_sentences.append(sentence)
468
 
469
  return ' '.join([s for s in result_sentences if s])
470
 
471
- def split_into_sentences_advanced(self, text):
472
- """Split text into sentences"""
473
- # Simple regex-based splitting
474
- sentences = re.split(r'(?<=[.!?])\s+', text)
475
- return [s for s in sentences if s and len(s.strip()) > 0]
476
-
477
  def vary_sentence_start(self, sentence):
478
  """Vary sentence beginning to avoid repetitive patterns"""
479
- if not sentence or len(sentence.split()) < 5:
 
 
 
 
480
  return sentence
481
 
482
- # Much more variety in sentence transformations
483
  variations = [
484
- lambda s: random.choice(self.varied_starters) + " " + s[0].lower() + s[1:],
485
- lambda s: "You know what? " + s,
486
- lambda s: "Here's the thing: " + s[0].lower() + s[1:],
487
- lambda s: "Funny enough, " + s[0].lower() + s[1:],
488
- lambda s: s + " Just saying.",
489
- lambda s: s + " Think about it.",
490
- lambda s: s + " Makes sense, right?",
491
- lambda s: "Okay, so " + s[0].lower() + s[1:],
492
- lambda s: "Real talk - " + s[0].lower() + s[1:],
493
- lambda s: s + " And that's facts.",
494
- lambda s: "Not gonna lie, " + s[0].lower() + s[1:],
495
- lambda s: s + " Period.",
496
- lambda s: "Can we talk about how " + s[0].lower() + s[1:] + "?",
497
  lambda s: s, # Keep original sometimes
498
  ]
499
 
500
- # Higher chance of variation
501
- if random.random() < 0.4:
502
- variation = random.choice(variations)
503
- try:
504
- return variation(sentence)
505
- except:
506
- return sentence
507
-
508
- return sentence
509
 
510
  class SelectiveGrammarFixer:
511
  """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""
@@ -551,6 +397,9 @@ class SelectiveGrammarFixer:
551
 
552
  result = ' '.join(fixed_sentences)
553
 
 
 
 
554
  return result
555
 
556
  def fix_basic_punctuation_errors(self, text):
@@ -558,42 +407,42 @@ class SelectiveGrammarFixer:
558
  if not text:
559
  return text
560
 
561
- # Fix double spaces (human-like error to keep some)
562
- text = re.sub(r'\s{3,}', ' ', text) # Only fix 3+ spaces
563
 
564
- # Fix space before punctuation (but might keep some for naturalness)
565
- if random.random() < 0.8: # 80% chance to fix
566
- text = re.sub(r'\s+([.,!?;:])', r'\1', text)
567
 
568
  # Fix missing space after punctuation (human-like)
569
  text = re.sub(r'([.,!?])([A-Z])', r'\1 \2', text)
570
 
571
- # Fix accidental double punctuation (but keep some ..)
572
- text = re.sub(r'([!?])\1+', r'\1', text)
573
- text = re.sub(r'\.{4,}', '...', text) # Fix 4+ periods to ellipsis
574
 
575
- # Fix "i" capitalization (but miss some for naturalness)
576
- if random.random() < 0.9: # 90% chance to fix
577
- text = re.sub(r'\bi\b', 'I', text)
578
 
579
  return text
580
 
581
  def preserve_natural_variations(self, text):
582
  """Keep some natural human-like variations"""
 
583
  # Only fix if really broken
584
  if text.count('.') == 0 and len(text.split()) > 20:
585
  # Long text with no periods - needs fixing
586
  words = text.split()
587
- # Add periods every 15-25 words naturally
588
  new_text = []
589
  for i, word in enumerate(words):
590
  new_text.append(word)
591
- if i > 0 and i % random.randint(15, 30) == 0:
592
  if word[-1] not in '.!?,;:':
593
  new_text[-1] = word + '.'
594
- # Capitalize next word
595
  if i + 1 < len(words) and words[i + 1][0].islower():
596
- words[i + 1] = words[i + 1][0].upper() + words[i + 1][1:]
 
 
597
  text = ' '.join(new_text)
598
 
599
  return text
@@ -631,12 +480,12 @@ class EnhancedDipperHumanizer:
631
  print("spaCy model not found, using NLTK for sentence splitting")
632
 
633
  try:
634
- # Load Dipper paraphraser
635
  print("Loading Dipper paraphraser model...")
636
  self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
637
  self.model = T5ForConditionalGeneration.from_pretrained(
638
  "kalpeshk2011/dipper-paraphraser-xxl",
639
- device_map="auto",
640
  torch_dtype=torch.float16,
641
  low_cpu_mem_usage=True
642
  )
@@ -667,7 +516,7 @@ class EnhancedDipperHumanizer:
667
  self.bart_model = AutoModelForSeq2SeqLM.from_pretrained(
668
  "eugenesiow/bart-paraphrase",
669
  torch_dtype=torch.float16,
670
- device_map="auto"
671
  )
672
  self.bart_tokenizer = AutoTokenizer.from_pretrained("eugenesiow/bart-paraphrase")
673
  self.use_bart = True
@@ -680,16 +529,118 @@ class EnhancedDipperHumanizer:
680
  self.human_variations = HumanLikeVariations()
681
 
682
  def add_natural_human_patterns(self, text):
683
- """Add natural human writing patterns"""
684
- return self.human_variations.add_natural_human_patterns(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  def vary_sentence_start(self, sentence):
687
- """Vary sentence beginning"""
688
- return self.human_variations.vary_sentence_start(sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
 
690
  def apply_contractions(self, text):
691
- """Apply contractions"""
692
- return self.human_variations.apply_contractions(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
 
694
  def preserve_keywords(self, text, keywords):
695
  """Mark keywords to preserve them during paraphrasing"""
@@ -723,7 +674,7 @@ class EnhancedDipperHumanizer:
723
  return modified_text, keyword_map
724
 
725
  def restore_keywords_robust(self, text, keyword_map):
726
- """Restore keywords with more flexible pattern matching - FIXED VERSION"""
727
  if not keyword_map:
728
  return text
729
 
@@ -753,9 +704,8 @@ class EnhancedDipperHumanizer:
753
  if match:
754
  num = match.group(1)
755
 
756
- # EXPANDED patterns to catch more variations
757
  patterns = [
758
- # Standard variations
759
  (f'__KW{num}__', keyword),
760
  (f'__ KW{num}__', keyword),
761
  (f'__KW {num}__', keyword),
@@ -770,97 +720,32 @@ class EnhancedDipperHumanizer:
770
  (f'__KW{num}_', keyword),
771
  (f'_KW{num}__', keyword),
772
  (f'kw{num}', keyword),
773
- (f'``KW{num}__', keyword),
774
- (f'``KKW{num}', keyword),
775
- # New patterns to catch the issues in your output
776
- (f'KW{num}:', keyword), # Catches "KW0:12:"
777
- (f'KW{num}123', keyword), # Catches "KW0123"
778
- (f'Kw{num}', keyword),
779
- (f'kW{num}', keyword),
780
- (f'KW{num}[^0-9]', keyword), # Catches KW followed by non-digit
781
- (f'KW{num}(?![0-9])', keyword), # Lookahead to ensure no digit follows
782
- # Patterns with spaces and punctuation
783
- (f'KW {num}:', keyword),
784
- (f'KW{num} ', keyword),
785
- (f' KW{num}', keyword),
786
- (f'KW{num},', keyword),
787
- (f'KW{num}.', keyword),
788
- (f'KW{num};', keyword),
789
- (f'KW{num}!', keyword),
790
- (f'KW{num}?', keyword),
791
- # Triple patterns (for "KW kw kw")
792
- (f'KW kw kw', keyword),
793
- (f'kw kw kw', keyword),
794
- (f'Kw kw kw', keyword),
795
  ]
796
 
797
  for pattern, replacement in patterns:
798
- # Use regex for more flexible matching
799
- if '(?!' in pattern or '[^' in pattern:
800
- # This is already a regex pattern
801
- regex_pattern = pattern
802
- else:
803
- # Escape the pattern for regex
804
- regex_pattern = re.escape(pattern)
805
-
806
- matches = list(re.finditer(regex_pattern, restored_text))
807
- for match in matches:
808
- start_pos = match.start()
809
- end_pos = match.end()
810
-
811
  # Check if this position has already been replaced
812
- if not any(pos in replaced_positions for pos in range(start_pos, end_pos)):
813
- print(f"Found pattern '{match.group()}' at position {start_pos}, replacing with {replacement}")
814
-
815
- # Replace this specific occurrence
816
- before = restored_text[:start_pos]
817
- after = restored_text[end_pos:]
818
- restored_text = before + replacement + after
819
-
820
  # Mark new positions as replaced
821
- replaced_positions.update(range(start_pos, start_pos + len(replacement)))
822
-
823
- # Break after first replacement to avoid issues
824
- break
825
-
826
- # Third pass: Clean up any remaining KW patterns with numbers
827
- # This catches cases like "KW0:12:" where the number might vary
828
- remaining_kw_patterns = re.findall(r'\bKW\d+[:;.,!?\s]|\bKW\d+\d+\b|\bKw\d+\b|\bkw\d+\b|\bKW\s*kw\s*kw\b', restored_text)
829
 
830
- if remaining_kw_patterns:
831
- print(f"Found remaining KW patterns: {remaining_kw_patterns}")
832
-
833
- # Replace remaining patterns with keywords in order
834
- keyword_values = list(keyword_map.values())
835
- keyword_index = 0
836
-
837
- for pattern in remaining_kw_patterns:
838
- if keyword_index < len(keyword_values):
839
- # Find the position of this pattern
840
- pattern_pos = restored_text.find(pattern)
841
- if pattern_pos != -1 and not any(pos in replaced_positions for pos in range(pattern_pos, pattern_pos + len(pattern))):
842
- # Extract just the KW part and any trailing punctuation
843
- clean_pattern = pattern.rstrip('0123456789:;.,!?\s')
844
- trailing = pattern[len(clean_pattern):]
845
-
846
- # Replace with keyword + any trailing punctuation
847
- replacement = keyword_values[keyword_index]
848
- if trailing and trailing[0] in ':;.,!?':
849
- replacement += trailing[0]
850
-
851
- before = restored_text[:pattern_pos]
852
- after = restored_text[pattern_pos + len(pattern):]
853
- restored_text = before + replacement + after
854
-
855
- replaced_positions.update(range(pattern_pos, pattern_pos + len(replacement)))
856
- keyword_index += 1
857
-
858
- # Fourth pass: Clean up any backticks or quotes that shouldn't be there
859
  restored_text = re.sub(r'``+', '', restored_text)
 
860
  restored_text = re.sub(r"''", '"', restored_text)
861
  restored_text = re.sub(r'""', '"', restored_text)
862
 
863
- # Fifth pass: Look for any remaining underscore patterns
 
864
  if '___' in restored_text and keyword_map:
865
  # Find all occurrences of multiple underscores
866
  underscore_matches = list(re.finditer(r'_{3,}', restored_text))
@@ -878,13 +763,10 @@ class EnhancedDipperHumanizer:
878
  replaced_positions.update(range(start, start + len(keyword_values[i])))
879
 
880
  # Final cleanup: Remove any remaining KW patterns that weren't caught
881
- # This is a last resort to clean up any stragglers
882
- restored_text = re.sub(r'\bKW\d+\b', '', restored_text)
883
- restored_text = re.sub(r'\bKw\d+\b', '', restored_text)
884
- restored_text = re.sub(r'\bkw\d+\b', '', restored_text)
885
-
886
- # Clean up any double spaces created by removals
887
- restored_text = re.sub(r'\s+', ' ', restored_text)
888
 
889
  # Log final result
890
  print(f"Final restored text: {restored_text[:100]}...")
@@ -914,6 +796,7 @@ class EnhancedDipperHumanizer:
914
  return True
915
 
916
  # Special handling for content inside tables
 
917
  if parent:
918
  # Check if we're inside a table
919
  is_in_table = any(p.name == 'table' for p in parent.parents)
@@ -941,7 +824,7 @@ class EnhancedDipperHumanizer:
941
  if any(handler in parent.attrs for handler in event_handlers):
942
  return True
943
 
944
- # Special check for testimonial cards
945
  if parent:
946
  ancestors_to_check = []
947
  current = parent
@@ -960,7 +843,7 @@ class EnhancedDipperHumanizer:
960
  elif isinstance(classes, str) and 'testimonial-card' in classes:
961
  return True
962
 
963
- # Skip if parent or element has skip-worthy classes/IDs
964
  skip_indicators = [
965
  'cta-', 'button', 'btn', 'heading', 'title', 'caption',
966
  'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
@@ -974,7 +857,7 @@ class EnhancedDipperHumanizer:
974
  'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
975
  ]
976
 
977
- # Check only immediate parent and grandparent
978
  elements_to_check = [parent]
979
  if parent and parent.parent:
980
  elements_to_check.append(parent.parent)
@@ -1043,7 +926,7 @@ class EnhancedDipperHumanizer:
1043
  return False
1044
 
1045
  def clean_model_output_enhanced(self, text):
1046
- """Enhanced cleaning that preserves more natural structure and keywords"""
1047
  if not text:
1048
  return ""
1049
 
@@ -1075,20 +958,15 @@ class EnhancedDipperHumanizer:
1075
  text = re.sub(r'- or maybe I should say -', '', text)
1076
  text = re.sub(r'- or rather,', '', text)
1077
  text = re.sub(r'- think about it -', '', text)
1078
- text = re.sub(r'- hmm, let me think -', '', text)
1079
- text = re.sub(r'- correction:', '', text)
1080
- text = re.sub(r'- or wait, maybe', '', text)
1081
- text = re.sub(r'- scratch that,', '', text)
1082
 
1083
  # Clean up multiple spaces
1084
  text = re.sub(r'\s+', ' ', text)
1085
 
1086
- # IMPORTANT: Be very careful about removing leading characters
1087
- # Check for keyword placeholders more thoroughly
1088
- if not re.match(r'^(__KW\d+__|_?KW\d+|kw\d+|Kw\d+)', text):
1089
- # Only remove leading non-letter characters if it's definitely not a placeholder
1090
- # But be more conservative - only remove clearly wrong characters
1091
- text = re.sub(r'^[^\w_]+', '', text)
1092
 
1093
  # If we accidentally removed too much, use original
1094
  if len(text) < len(original) * 0.5:
@@ -1122,17 +1000,17 @@ class EnhancedDipperHumanizer:
1122
  continue
1123
 
1124
  try:
1125
- # MAXIMUM diversity for Originality AI
1126
  has_keywords = any(placeholder in sentence for placeholder in keyword_map.keys())
1127
  if has_keywords:
1128
- lex_diversity = 70 # High for keywords
1129
- order_diversity = 30
1130
  elif len(sentence.split()) < 10:
1131
- lex_diversity = 90 # Very high for short
1132
- order_diversity = 45
1133
  else:
1134
- lex_diversity = 98 # MAXIMUM diversity
1135
- order_diversity = 60 # MAXIMUM order diversity
1136
 
1137
  lex_code = int(100 - lex_diversity)
1138
  order_code = int(100 - order_diversity)
@@ -1159,23 +1037,23 @@ class EnhancedDipperHumanizer:
1159
  else:
1160
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
1161
 
1162
- # Generate with maximum variation
1163
  original_length = len(sentence.split())
1164
- max_new_length = int(original_length * 1.5)
1165
 
1166
- # Maximum variation parameters
1167
- temp = 1.0 if has_keywords else 1.4
1168
- top_p_val = 0.92
1169
 
1170
  with torch.no_grad():
1171
  outputs = self.model.generate(
1172
  **inputs,
1173
  max_length=max_new_length + 20,
1174
- min_length=max(5, int(original_length * 0.6)),
1175
  do_sample=True,
1176
  top_p=top_p_val,
1177
  temperature=temp,
1178
- no_repeat_ngram_size=5, # Allow more repetition for naturalness
1179
  num_beams=1, # Greedy for more randomness
1180
  early_stopping=True
1181
  )
@@ -1267,8 +1145,8 @@ class EnhancedDipperHumanizer:
1267
  last_word = words[-1]
1268
 
1269
  # Remove if it's clearly cut off (1-2 chars, no vowels)
1270
- # But don't remove valid short words
1271
- short_valid_words = {'is', 'of', 'to', 'in', 'on', 'at', 'by', 'or', 'if', 'so', 'up', 'no', 'we', 'he', 'me', 'be', 'do', 'go', 'as', 'it'}
1272
  if (len(last_word) <= 2 and
1273
  last_word.lower() not in short_valid_words and
1274
  not any(c in 'aeiouAEIOU' for c in last_word)):
@@ -1289,7 +1167,7 @@ class EnhancedDipperHumanizer:
1289
  generated += '.'
1290
  elif orig_stripped.endswith('!'):
1291
  # Check if generated seems exclamatory
1292
- exclaim_words = ['amazing', 'incredible', 'fantastic', 'terrible', 'awful', 'wonderful', 'excellent', 'wow', 'oh']
1293
  if any(word in generated.lower() for word in exclaim_words):
1294
  generated += '!'
1295
  else:
@@ -1359,12 +1237,12 @@ class EnhancedDipperHumanizer:
1359
  with torch.no_grad():
1360
  outputs = self.bart_model.generate(
1361
  **inputs,
1362
- max_length=int(original_length * 1.5) + 10,
1363
- min_length=max(5, int(original_length * 0.5)),
1364
  num_beams=2,
1365
- temperature=1.2, # Higher temperature for more variation
1366
  do_sample=True,
1367
- top_p=0.92,
1368
  early_stopping=True
1369
  )
1370
 
@@ -1390,13 +1268,12 @@ class EnhancedDipperHumanizer:
1390
  return text
1391
 
1392
  def apply_sentence_variation(self, text):
1393
- """Apply natural sentence structure variations - ULTRA HUMAN-LIKE FLOW"""
1394
  sentences = self.split_into_sentences_advanced(text)
1395
  varied_sentences = []
1396
 
1397
  # Track patterns to ensure variety
1398
  last_sentence_length = 0
1399
- sentence_rhythms = []
1400
 
1401
  for i, sentence in enumerate(sentences):
1402
  if not sentence.strip():
@@ -1405,154 +1282,39 @@ class EnhancedDipperHumanizer:
1405
  words = sentence.split()
1406
  current_length = len(words)
1407
 
1408
- # Create natural rhythm variation
1409
- if i > 0:
1410
- # After short sentence, maybe go longer
1411
- if last_sentence_length < 10 and random.random() < 0.7:
1412
- # Add elaboration
1413
- elaborations = [
1414
- " Let me explain what I mean.",
1415
- " Here's why this matters.",
1416
- " And that's just the beginning.",
1417
- " But there's more to it.",
1418
- " This is important to understand.",
1419
- " Think about the implications.",
1420
- " Consider what this means."
1421
- ]
1422
- sentence += random.choice(elaborations)
1423
- # After long sentence, maybe go shorter
1424
- elif last_sentence_length > 25 and random.random() < 0.6:
1425
- # Truncate if possible
1426
- if ',' in sentence and sentence.count(',') > 1:
1427
- # Keep only first part
1428
- parts = sentence.split(',')
1429
- sentence = parts[0] + '.'
1430
-
1431
- # Natural sentence combinations for flow
1432
  if (i < len(sentences) - 1 and
1433
- current_length < 12 and
1434
- len(sentences[i+1].split()) < 12 and
1435
- random.random() < 0.35):
1436
 
1437
  next_sent = sentences[i+1].strip()
1438
- # Combine with natural connectors
1439
- connectors = [
1440
- ', and ', ', but ', ', so ', ', which means ',
1441
- ' - and ', ' - but ', ', though ',
1442
- '. Actually, ', '. Plus, ', '. Also, '
1443
- ]
1444
- connector = random.choice(connectors)
1445
-
1446
- if connector.startswith('.'):
1447
- combined = sentence + connector + next_sent
1448
- else:
1449
- combined = sentence.rstrip('.') + connector + next_sent[0].lower() + next_sent[1:]
1450
-
1451
- varied_sentences.append(combined)
1452
- sentences[i+1] = ""
1453
- last_sentence_length = len(combined.split())
1454
- continue
1455
-
1456
- # Add rhetorical questions occasionally
1457
- if random.random() < 0.08 and i < len(sentences) - 1:
1458
- rhetorical = [
1459
- " Make sense?",
1460
- " See what I mean?",
1461
- " Getting the picture?",
1462
- " Following me so far?",
1463
- " Sound familiar?",
1464
- " Crazy, right?",
1465
- " Wild, isn't it?"
1466
- ]
1467
- sentence += random.choice(rhetorical)
1468
 
1469
  varied_sentences.append(sentence)
1470
  last_sentence_length = current_length
1471
 
1472
  return ' '.join([s for s in varied_sentences if s])
1473
 
1474
- def add_natural_flow_variations(self, text):
1475
- """Add more natural flow and rhythm variations for Originality AI"""
1476
- sentences = self.split_into_sentences_advanced(text)
1477
- enhanced_sentences = []
1478
-
1479
- for i, sentence in enumerate(sentences):
1480
- if not sentence.strip():
1481
- continue
1482
-
1483
- # Add stream-of-consciousness elements (15% chance)
1484
- if random.random() < 0.15 and len(sentence.split()) > 10:
1485
- stream_elements = [
1486
- " - wait, actually, ",
1487
- " - hmm, ",
1488
- " - okay so ",
1489
- " - oh right, ",
1490
- " - correction: ",
1491
- " - or wait, maybe ",
1492
- " - scratch that, "
1493
- ]
1494
- words = sentence.split()
1495
- pos = random.randint(len(words)//4, 3*len(words)//4)
1496
- words.insert(pos, random.choice(stream_elements))
1497
- sentence = ' '.join(words)
1498
-
1499
- # Add human-like self-corrections (10% chance)
1500
- if random.random() < 0.10:
1501
- corrections = [
1502
- " - or rather, ",
1503
- " - well, actually, ",
1504
- " - I mean, ",
1505
- " - or should I say, ",
1506
- " - correction: ",
1507
- " - let me rephrase: ",
1508
- " - wait, no, "
1509
- ]
1510
- words = sentence.split()
1511
- if len(words) > 8:
1512
- pos = random.randint(len(words)//2, len(words)-3)
1513
- correction = random.choice(corrections)
1514
- words.insert(pos, correction)
1515
- sentence = ' '.join(words)
1516
-
1517
- # Add thinking-out-loud patterns (12% chance)
1518
- if random.random() < 0.12 and i > 0:
1519
- thinking_patterns = [
1520
- "Come to think of it, ",
1521
- "Actually, you know what? ",
1522
- "Wait, here's a thought: ",
1523
- "Oh, and another thing - ",
1524
- "Speaking of which, ",
1525
- "This reminds me, ",
1526
- "Now that I mention it, ",
1527
- "Funny you should ask, because ",
1528
- "You know what's interesting? ",
1529
- "Here's what gets me: ",
1530
- "Can I be honest? ",
1531
- "Between you and me, "
1532
- ]
1533
- pattern = random.choice(thinking_patterns)
1534
- sentence = pattern + sentence[0].lower() + sentence[1:] if len(sentence) > 1 else sentence
1535
-
1536
- # Add emphatic repetitions (8% chance)
1537
- if random.random() < 0.08 and len(sentence.split()) > 6:
1538
- # Find a key word to repeat for emphasis
1539
- words = sentence.split()
1540
- important_words = [w for w in words if len(w) > 4 and w[0].islower()]
1541
- if important_words:
1542
- word_to_repeat = random.choice(important_words)
1543
- emphatic_patterns = [
1544
- f". {word_to_repeat.capitalize()}.",
1545
- f" - yes, {word_to_repeat} -",
1546
- f". I said {word_to_repeat}.",
1547
- f" ({word_to_repeat}!)",
1548
- f". {word_to_repeat.capitalize()}, people!"
1549
- ]
1550
- sentence += random.choice(emphatic_patterns)
1551
-
1552
- enhanced_sentences.append(sentence)
1553
-
1554
- return ' '.join(enhanced_sentences)
1555
-
1556
  def fix_punctuation(self, text):
1557
  """Comprehensive punctuation and formatting fixes"""
1558
  if not text:
@@ -1564,27 +1326,26 @@ class EnhancedDipperHumanizer:
1564
  # Fix weird symbols and characters using safe replacements
1565
  text = text.replace('<>', '') # Remove empty angle brackets
1566
 
1567
- # Normalize quotes
1568
  text = text.replace('«', '"').replace('»', '"')
1569
  text = text.replace('„', '"').replace('"', '"').replace('"', '"')
1570
  text = text.replace(''', "'").replace(''', "'")
1571
  text = text.replace('–', '-').replace('—', '-')
1572
 
1573
  # Fix colon issues
1574
- text = re.sub(r'\.:', ':', text)
1575
- text = re.sub(r':\s*\.', ':', text)
1576
 
1577
- # Fix basic spacing (but keep some human errors)
1578
- text = re.sub(r'\s{3,}', ' ', text) # Only fix 3+ spaces
1579
- if random.random() < 0.9: # 90% chance to fix
1580
- text = re.sub(r'\s+([.,!?;:])', r'\1', text)
1581
- text = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', text)
1582
- text = re.sub(r'([.!?])\s*\1+', r'\1', text)
1583
 
1584
  # Fix colons
1585
- text = re.sub(r':\s*([.,!?])', ':', text)
1586
- text = re.sub(r'([.,!?])\s*:', ':', text)
1587
- text = re.sub(r':+', ':', text)
1588
 
1589
  # Fix quotes and parentheses
1590
  text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
@@ -1592,6 +1353,7 @@ class EnhancedDipperHumanizer:
1592
  text = re.sub(r'\(\s*([^)]*?)\s*\)', r'(\1)', text)
1593
 
1594
  # Fix sentence capitalization more carefully
 
1595
  sentences = re.split(r'(?<=[.!?])\s+', text)
1596
  fixed_sentences = []
1597
 
@@ -1599,44 +1361,51 @@ class EnhancedDipperHumanizer:
1599
  if not sentence:
1600
  continue
1601
 
1602
- # Only capitalize if needed
 
1603
  words = sentence.split()
1604
  if words:
1605
  first_word = words[0]
 
1606
  if (first_word[0].islower() and
1607
  not self.is_likely_acronym_or_proper_noun(first_word) and
1608
  not first_word.startswith('__KW') and
1609
  not first_word.startswith('_kw')):
 
1610
  sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
1611
 
1612
  fixed_sentences.append(sentence)
1613
 
1614
  text = ' '.join(fixed_sentences)
1615
 
1616
- # Fix common issues (but not all for naturalness)
1617
- if random.random() < 0.95: # 95% chance to fix
1618
- text = re.sub(r'\bi\b', 'I', text)
1619
- text = re.sub(r'\.{4,}', '...', text) # Fix 4+ periods
1620
- text = re.sub(r',{3,}', ',', text) # Fix 3+ commas
 
 
 
1621
 
1622
  # Fix abbreviations
1623
  text = re.sub(r'\betc\s*\.\s*\.', 'etc.', text)
1624
  text = re.sub(r'\be\.g\s*\.\s*[,\s]', 'e.g., ', text)
1625
  text = re.sub(r'\bi\.e\s*\.\s*[,\s]', 'i.e., ', text)
1626
 
1627
- # Fix numbers with periods
1628
  text = re.sub(r'(\d+)\.\s+', r'\1. ', text)
1629
 
1630
  # Fix bold/strong tags punctuation
1631
  text = self.fix_bold_punctuation(text)
1632
 
1633
- # Clean up remaining issues
1634
- text = re.sub(r'\s+([.,!?;:])', r'\1', text)
1635
- text = re.sub(r'([.,!?;:])\s{2,}', r'\1 ', text)
1636
 
1637
  # Ensure ending punctuation
1638
  text = text.strip()
1639
  if text and text[-1] not in '.!?':
 
1640
  if not text.endswith(':'):
1641
  text += '.'
1642
 
@@ -1646,11 +1415,13 @@ class EnhancedDipperHumanizer:
1646
  """Fix punctuation issues around bold/strong tags"""
1647
  # Check if this is likely a list item with colon pattern
1648
  def is_list_item_with_colon(text):
 
1649
  list_pattern = r'^\s*(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
1650
  return bool(re.search(list_pattern, text))
1651
 
1652
  # If it's a list item with colon, preserve the format
1653
  if is_list_item_with_colon(text):
 
1654
  text = re.sub(r'<(strong|b)>\s*([^:]+)\s*:\s*</\1>', r'<\1>\2:</\1>', text)
1655
  return text
1656
 
@@ -1666,12 +1437,14 @@ class EnhancedDipperHumanizer:
1666
 
1667
  # Check if this is a list header (contains colon at the end)
1668
  if content.endswith(':'):
 
1669
  return f'<{tag}>{content}</{tag}>'
1670
 
1671
  # Remove any periods at the start or end of bold content
1672
  content = content.strip('.')
1673
 
1674
  # Check if this bold text is at the start of a sentence
 
1675
  start_pos = match.start()
1676
  is_sentence_start = (start_pos == 0 or
1677
  (start_pos > 2 and text[start_pos-2:start_pos] in ['. ', '! ', '? ', '\n\n']))
@@ -1685,24 +1458,25 @@ class EnhancedDipperHumanizer:
1685
  # Fix bold/strong tags
1686
  text = re.sub(bold_pattern, fix_bold_match, text)
1687
 
1688
- # Fix spacing around bold/strong tags
1689
  if not is_list_item_with_colon(text):
1690
- text = re.sub(r'\.\s*<(strong|b)>', r'. <\1>', text)
1691
- text = re.sub(r'</(strong|b)>\s*\.', r'</\1>.', text)
1692
- text = re.sub(r'([.!?])\s*<(strong|b)>', r'\1 <\2>', text)
1693
- text = re.sub(r'</(strong|b)>\s+([a-z])', lambda m: f'</{m.group(1)}> {m.group(2)}', text)
1694
 
1695
  # Remove duplicate periods around bold tags
1696
  text = re.sub(r'\.\s*</(strong|b)>\s*\.', r'</\1>.', text)
1697
  text = re.sub(r'\.\s*<(strong|b)>\s*\.', r'. <\1>', text)
1698
 
1699
  # Fix cases where bold content ends a sentence
 
1700
  text = re.sub(r'</(strong|b)>\s+([A-Z])', r'</\1>. \2', text)
1701
 
1702
  # Don't remove these for list items
1703
  if not is_list_item_with_colon(text):
1704
- text = re.sub(r'<(strong|b)>\s*:\s*</\1>', ':', text)
1705
- text = re.sub(r'<(strong|b)>\s*\.\s*</\1>', '.', text)
1706
 
1707
  return text
1708
 
@@ -1711,7 +1485,7 @@ class EnhancedDipperHumanizer:
1711
  soup = BeautifulSoup(html_content, 'html.parser')
1712
  text_elements = []
1713
 
1714
- # Get all text nodes
1715
  for element in soup.find_all(string=True):
1716
  # Skip script, style, and noscript content completely
1717
  if element.parent.name in ['script', 'style', 'noscript']:
@@ -1733,11 +1507,11 @@ class EnhancedDipperHumanizer:
1733
  html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
1734
 
1735
  # Fix spacing issues
1736
- html_text = re.sub(r'>\s+<', '><', html_text)
1737
- html_text = re.sub(r'\s+>', '>', html_text)
1738
- html_text = re.sub(r'<\s+', '<', html_text)
1739
 
1740
- # Fix common word errors
1741
  html_text = html_text.replace('down loaded', 'downloaded')
1742
  html_text = html_text.replace('But your document', 'Your document')
1743
 
@@ -1751,6 +1525,7 @@ class EnhancedDipperHumanizer:
1751
  # Find all paragraph tags
1752
  for p_tag in soup.find_all('p'):
1753
  # Skip paragraphs that are inside special elements
 
1754
  skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
1755
  'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
1756
  'div.quiz-container', 'div.question-container', 'div.results']
@@ -1799,6 +1574,7 @@ class EnhancedDipperHumanizer:
1799
  continue
1800
 
1801
  # Skip if the text node's immediate parent isn't the p tag
 
1802
  if text_node.parent != p_tag:
1803
  continue
1804
 
@@ -1836,6 +1612,68 @@ class EnhancedDipperHumanizer:
1836
  text_node.insert_after(new_node)
1837
  text_node.extract()
1838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1839
  def process_html(self, html_content, primary_keywords="", secondary_keywords="", progress_callback=None):
1840
  """Main processing function with progress callback"""
1841
  if not html_content.strip():
@@ -1868,9 +1706,10 @@ class EnhancedDipperHumanizer:
1868
  # Combine keywords and clean them
1869
  all_keywords = []
1870
  if primary_keywords:
 
1871
  for k in primary_keywords.split(','):
1872
  cleaned = k.strip()
1873
- if cleaned and len(cleaned) > 1:
1874
  all_keywords.append(cleaned)
1875
  if secondary_keywords:
1876
  for k in secondary_keywords.split(','):
@@ -1915,7 +1754,7 @@ class EnhancedDipperHumanizer:
1915
  if text_has_keywords:
1916
  print(f"Debug: Processing text with keywords: {original_text[:50]}...")
1917
 
1918
- # First pass with Dipper (with maximum diversity)
1919
  paraphrased_text = self.paraphrase_with_dipper(
1920
  original_text,
1921
  keywords=all_keywords
@@ -1924,7 +1763,7 @@ class EnhancedDipperHumanizer:
1924
  # Verify no placeholders remain
1925
  if '__KW' in paraphrased_text or '___' in paraphrased_text:
1926
  print(f"Warning: Placeholder or underscores found in paraphrased text: {paraphrased_text[:100]}...")
1927
- # Try to restore again
1928
  temp_map = {}
1929
  for j, keyword in enumerate(all_keywords):
1930
  temp_map[f'__KW{j:03d}__'] = keyword
@@ -1932,27 +1771,24 @@ class EnhancedDipperHumanizer:
1932
 
1933
  # Second pass with BART for longer texts (increased probability)
1934
  if self.use_bart and len(paraphrased_text.split()) > 8:
1935
- # 60% chance to use BART for maximum variation
1936
- if random.random() < 0.6:
1937
  paraphrased_text = self.paraphrase_with_bart(
1938
  paraphrased_text,
1939
  keywords=all_keywords
1940
  )
1941
 
1942
- # Apply maximum sentence variation
1943
  paraphrased_text = self.apply_sentence_variation(paraphrased_text)
1944
 
1945
  # Add natural flow variations
1946
  paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
1947
 
1948
- # Add extra human touch
1949
- paraphrased_text = self.human_variations.add_human_touch(paraphrased_text)
1950
-
1951
  # Fix punctuation and formatting
1952
  paraphrased_text = self.fix_punctuation(paraphrased_text)
1953
 
1954
- # Final check for any remaining placeholders
1955
- if '___' in paraphrased_text or '__KW' in paraphrased_text or 'KW0' in paraphrased_text:
1956
  print(f"Error: Unresolved placeholders in final text")
1957
  # Use original text if we can't resolve placeholders
1958
  paraphrased_text = original_text
@@ -1973,20 +1809,17 @@ class EnhancedDipperHumanizer:
1973
  # Wrap keywords with <strong> tags in paragraphs
1974
  self.wrap_keywords_in_paragraphs(soup, all_keywords)
1975
 
1976
- # Post-process the entire HTML
1977
  result = str(soup)
1978
  result = self.post_process_html(result)
1979
 
1980
- # Final safety check for any remaining placeholders
1981
- if '__KW' in result or re.search(r'_{3,}', result) or re.search(r'\bKW\d+', result):
1982
- print("Warning: Found placeholders in final HTML output")
1983
- # Attempt final cleanup
1984
  for i, keyword in enumerate(all_keywords):
1985
  result = result.replace(f'__KW{i:03d}__', keyword)
1986
- result = re.sub(f'\\bKW{i:03d}\\b', keyword, result)
1987
- result = re.sub(f'\\bKW{i}\\b', keyword, result)
1988
- result = re.sub(r'_{3,}', '', result)
1989
- result = re.sub(r'\bKW\d+\b', '', result)
1990
 
1991
  # Restore all script tags
1992
  for idx, script_content in enumerate(preserved_scripts):
@@ -2001,7 +1834,7 @@ class EnhancedDipperHumanizer:
2001
  # Validate and fix HTML syntax
2002
  result = self.validate_and_fix_html(result)
2003
 
2004
- # Count skipped elements
2005
  all_text_elements = soup.find_all(string=True)
2006
  skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
2007
 
@@ -2015,13 +1848,14 @@ class EnhancedDipperHumanizer:
2015
  import traceback
2016
  error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
2017
  print(error_msg)
 
2018
  return f"<!-- {error_msg} -->\n{html_content}"
2019
 
2020
  def post_process_html(self, html_text):
2021
  """Post-process the entire HTML to fix formatting issues"""
2022
- # Fix empty angle brackets
2023
- html_text = re.sub(r'<>\s*([^<>]+?)\s*(?=\.|\s|<)', r'\1', html_text)
2024
- html_text = re.sub(r'<>', '', html_text)
2025
 
2026
  # Fix double angle brackets around bold tags
2027
  html_text = re.sub(r'<<b>>', '<b>', html_text)
@@ -2030,9 +1864,9 @@ class EnhancedDipperHumanizer:
2030
  html_text = re.sub(r'<</strong>>', '</strong>', html_text)
2031
 
2032
  # Fix periods around bold/strong tags
2033
- html_text = re.sub(r'\.\s*<(b|strong)>', '. <\1>', html_text)
2034
- html_text = re.sub(r'</(b|strong)>\s*\.', '</\1>.', html_text)
2035
- html_text = re.sub(r'\.<<(b|strong)>>', '. <\1>', html_text)
2036
  html_text = re.sub(r'</(b|strong)>>\.', '</\1>.', html_text)
2037
 
2038
  # Fix periods after colons
@@ -2044,15 +1878,19 @@ class EnhancedDipperHumanizer:
2044
  # Check if this line contains a list pattern with bold
2045
  list_pattern = r'(?:^|\s)(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
2046
  if re.search(list_pattern, line):
 
2047
  return line
2048
 
2049
  # Not a list item, apply regular fixes
 
2050
  line = re.sub(r'<(strong|b)>\s*\.\s*([^<]+)\s*\.\s*</\1>', r'<\1>\2</\1>', line)
 
 
2051
  line = re.sub(r'</(strong|b)>\s*([.!?])', r'</\1>\2', line)
2052
 
2053
  return line
2054
 
2055
- # Process line by line
2056
  lines = html_text.split('\n')
2057
  processed_lines = [process_line(line) for line in lines]
2058
  html_text = '\n'.join(processed_lines)
@@ -2078,7 +1916,8 @@ class EnhancedDipperHumanizer:
2078
  # Look for bold/strong tags and check their context
2079
  html_text = re.sub(r'(^|.*?)(<(?:strong|b)>)([a-zA-Z])', fix_bold_sentence_start, html_text)
2080
 
2081
- # Clean up spacing around bold tags
 
2082
  segments = re.split(r'(<(?:strong|b)>[^<]*:</(?:strong|b)>)', html_text)
2083
  cleaned_segments = []
2084
 
@@ -2089,7 +1928,9 @@ class EnhancedDipperHumanizer:
2089
  # Apply spacing fixes to non-list segments
2090
  segment = re.sub(r'\s+<(strong|b)>', r' <\1>', segment)
2091
  segment = re.sub(r'</(strong|b)>\s+', r'</\1> ', segment)
 
2092
  segment = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', segment)
 
2093
  segment = re.sub(r'\.<(strong|b)>\.', '. <\1>', segment)
2094
  segment = re.sub(r'\.</(strong|b)>\.', '</\1>.', segment)
2095
  cleaned_segments.append(segment)
@@ -2097,15 +1938,16 @@ class EnhancedDipperHumanizer:
2097
  html_text = ''.join(cleaned_segments)
2098
 
2099
  # Final cleanup
2100
- html_text = re.sub(r'\.{2,}', '.', html_text)
2101
- html_text = re.sub(r',{2,}', ',', html_text)
2102
- html_text = re.sub(r':{2,}', ':', html_text)
2103
- html_text = re.sub(r'\s+([.,!?;:])', r'\1', html_text)
2104
 
2105
- # Fix empty bold tags
2106
  html_text = re.sub(r'<(strong|b)>\s*</\1>', '', html_text)
2107
 
2108
  # Fix specific patterns in lists/stats
 
2109
  html_text = re.sub(r'(\d+[,\d]*\+?)\s*\.\s*\n', r'\1\n', html_text)
2110
 
2111
  # Clean up any remaining double brackets
@@ -2169,33 +2011,24 @@ iface = gr.Interface(
2169
  lines=10,
2170
  label="Humanized HTML Output"
2171
  ),
2172
- title="Enhanced Dipper AI Humanizer v2 - 100% Human Score Optimized",
2173
  description="""
2174
  Ultra-aggressive humanizer optimized to achieve 100% human scores on both Undetectable AI and Originality AI.
2175
 
2176
- 🚀 **NEW ENHANCEMENTS for 100% Human Score:**
2177
- - **MAXIMUM diversity settings** (98% lexical, 60% order) for extreme variation
2178
- - **Personal voice injection**: Opinions, reactions, and thinking-out-loud elements
2179
- - **Stream-of-consciousness patterns**: Natural self-corrections and tangents
2180
- - **Emphatic repetitions**: Human-like emphasis patterns ("Yes, I said X!")
2181
- - **Ultra-realistic errors**: Typos, double words, homophone mix-ups
2182
- - **Rhetorical questions**: "Make sense?" "Following me?" "Wild, right?"
2183
- - **60+ casual transitions**: From "So," to "Plot twist:" to "Between you and me,"
2184
- - **Natural asides**: "(yeah, I know)" "(trust me on this)" "(not even kidding)"
2185
-
2186
- ✅ **Key Features:**
2187
- - Fixed keyword restoration system - no more KW0 or placeholder issues!
2188
  - Keywords inside <p> tags are automatically wrapped with <strong> tags
2189
- - Preserves all HTML structure, scripts, and styles
2190
- - Skips headings, CTAs, tables, testimonials, and existing bold/strong content
2191
- - Designed to fool even the strictest AI detectors!
2192
 
2193
- 💡 **Tips for Best Results:**
2194
- - Use 3-5 primary keywords for best preservation
2195
- - Longer content = better humanization results
2196
- - Processing takes 5-10 minutes for large documents
2197
 
2198
- The tool creates genuinely human-like writing that passes Originality AI's strictest tests!
2199
  """,
2200
  examples=[
2201
  ["""<article>
 
26
  """Add human-like variations and intentional imperfections"""
27
 
28
  def __init__(self):
29
+ # Common human writing patterns - EXPANDED for Originality AI
30
  self.casual_transitions = [
31
+ "So, ", "Well, ", "Now, ", "Actually, ", "Basically, ",
32
+ "You know, ", "I mean, ", "Thing is, ", "Honestly, ",
33
+ "Look, ", "Listen, ", "See, ", "Okay, ", "Right, ",
34
+ "Anyway, ", "Besides, ", "Plus, ", "Also, ", "Oh, ",
35
+ "Hey, ", "Alright, ", "Sure, ", "Fine, ", "Obviously, ",
36
+ "Clearly, ", "Seriously, ", "Literally, ", "Frankly, ",
37
+ "To be honest, ", "Truth is, ", "In fact, ", "Believe it or not, ",
38
+ "Here's the thing, ", "Let me tell you, ", "Get this, ",
39
+ "Funny thing is, ", "Interestingly, ", "Surprisingly, ",
40
+ "Let's be real here, ", "Can we talk about ", "Quick question: ",
41
+ "Real talk: ", "Hot take: ", "Unpopular opinion: ", "Fun fact: ",
42
+ "Pro tip: ", "Side note: ", "Random thought: ", "Food for thought: ",
43
+ "Just saying, ", "Not gonna lie, ", "For what it's worth, ",
44
+ "If you ask me, ", "Between you and me, ", "Here's my take: ",
45
+ "Let's face it, ", "No kidding, ", "Seriously though, ",
46
+ "But wait, ", "Hold on, ", "Check this out: ", "Guess what? "
 
 
 
 
 
 
 
 
 
 
47
  ]
48
 
49
  self.filler_phrases = [
 
68
  "to the best of my knowledge", "if I'm not mistaken", "correct me if I'm wrong",
69
  "you know what", "here's the deal", "bottom line", "at any rate",
70
  "all in all", "when you think about it", "come to think of it",
71
+ "now that I think about it", "if we're being honest", "to be fair"
 
 
 
 
 
72
  ]
73
 
74
  self.human_connectors = [
 
96
  ". And honestly?", ". But seriously,", ". And you know what?",
97
  ", which brings me to", ". This reminds me of", ", speaking of which",
98
  ". Funny enough,", ". Weird thing is,", ". Strange but true:",
99
+ ", and I mean", ". I'm not kidding when I say", ", and trust me on this"
 
 
 
 
100
  ]
101
 
102
+ # NEW: Common human typos and variations
103
  self.common_typos = {
104
+ "the": ["teh", "th", "hte"],
105
+ "and": ["adn", "nad", "an"],
106
+ "that": ["taht", "htat", "tha"],
107
+ "with": ["wiht", "wtih", "iwth"],
108
+ "have": ["ahve", "hvae", "hav"],
109
+ "from": ["form", "fro", "frmo"],
110
+ "they": ["tehy", "thye", "htey"],
111
+ "which": ["whihc", "wich", "whcih"],
112
+ "their": ["thier", "theri", "tehir"],
113
+ "would": ["woudl", "wuold", "woul"],
114
+ "there": ["tehre", "theer", "ther"],
115
+ "could": ["coudl", "cuold", "coud"],
116
+ "people": ["poeple", "peopel", "pepole"],
117
+ "through": ["thorugh", "throught", "trhough"],
118
+ "because": ["becuase", "becasue", "beacuse"],
119
+ "before": ["beofre", "befroe", "befor"],
120
+ "different": ["differnt", "differnet", "diferent"],
121
+ "between": ["bewteen", "betwen", "betewen"],
122
+ "important": ["improtant", "importnat", "importan"],
123
+ "information": ["infromation", "informaiton", "informaton"]
 
 
 
 
 
 
 
 
124
  }
125
 
126
+ # NEW: Human-like sentence starters for variety
127
  self.varied_starters = [
128
  "When it comes to", "As for", "Regarding", "In terms of",
129
  "With respect to", "Concerning", "Speaking of", "About",
 
140
  "You might wonder", "You might ask", "You may think",
141
  "Some people say", "Many believe", "It's often said",
142
  "Research shows", "Studies indicate", "Evidence suggests",
143
+ "Experience tells us", "History shows", "Time has shown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  ]
145
 
146
  def add_human_touch(self, text):
147
+ """Add subtle human-like imperfections - NATURAL PATTERNS ONLY"""
148
  sentences = text.split('. ')
149
  modified_sentences = []
150
 
151
  # Track what we've used to avoid patterns
152
+ used_transitions = []
 
153
 
154
  for i, sent in enumerate(sentences):
155
  if not sent.strip():
 
158
  # Always use contractions where natural
159
  sent = self.apply_contractions(sent)
160
 
161
+ # Add VERY occasional natural errors (5% chance)
162
+ if random.random() < 0.05 and len(sent.split()) > 15:
163
+ error_types = [
164
+ # Missing comma in compound sentence
165
+ lambda s: s.replace(", and", " and", 1) if ", and" in s else s,
166
+ # Wrong homophone
167
+ lambda s: s.replace("their", "there", 1) if "their" in s and random.random() < 0.3 else s,
168
+ # Missing apostrophe
169
+ lambda s: s.replace("it's", "its", 1) if "it's" in s and random.random() < 0.3 else s,
 
 
 
 
 
 
 
 
 
 
170
  ]
171
+ error_func = random.choice(error_types)
172
+ sent = error_func(sent)
 
 
 
 
 
 
 
173
 
174
  modified_sentences.append(sent)
175
 
176
  return '. '.join(modified_sentences)
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def apply_contractions(self, text):
179
  """Apply common contractions - EXPANDED"""
180
  contractions = {
 
195
  "we would": "we'd", "they would": "they'd", "could have": "could've",
196
  "should have": "should've", "would have": "would've", "might have": "might've",
197
  "must have": "must've", "there has": "there's", "here is": "here's",
198
+ "let us": "let's", "that will": "that'll", "who will": "who'll"
 
 
 
 
199
  }
200
 
 
201
  for full, contr in contractions.items():
202
+ if random.random() < 0.8: # 80% chance to apply each contraction
203
  text = re.sub(r'\b' + full + r'\b', contr, text, flags=re.IGNORECASE)
204
 
205
  return text
206
 
207
  def add_minor_errors(self, text):
208
+ """Add very minor, human-like errors - MORE REALISTIC BUT CONTROLLED"""
209
+ # Occasionally miss Oxford comma (15% chance)
210
+ if random.random() < 0.15:
211
+ # Only in lists, not random commas
212
  text = re.sub(r'(\w+), (\w+), and (\w+)', r'\1, \2 and \3', text)
213
 
214
+ # Sometimes use 'which' instead of 'that' (8% chance)
215
+ if random.random() < 0.08:
216
+ # Only for non-restrictive clauses
217
  matches = re.finditer(r'\b(\w+) that (\w+)', text)
218
+ for match in list(matches)[:1]: # Only first occurrence
219
+ if match.group(1).lower() not in ['believe', 'think', 'know', 'say']:
220
  text = text.replace(match.group(0), f"{match.group(1)} which {match.group(2)}", 1)
221
 
222
+ # NEW: Add very occasional typos (2% chance per sentence) - REDUCED AND CONTROLLED
223
  sentences = text.split('. ')
224
  for i, sent in enumerate(sentences):
225
+ if random.random() < 0.02 and len(sent.split()) > 15: # Only in longer sentences
226
+ words = sent.split()
227
+ # Pick a random word to potentially typo
228
+ word_idx = random.randint(len(words)//2, len(words)-2) # Avoid start/end
229
+ word = words[word_idx].lower()
230
+
231
+ # Only typo common words where typo won't break meaning
232
+ safe_typos = {
233
+ 'the': 'teh',
234
+ 'and': 'adn',
235
+ 'that': 'taht',
236
+ 'with': 'wtih',
237
+ 'from': 'form',
238
+ 'because': 'becuase'
239
+ }
240
+
241
+ if word in safe_typos and random.random() < 0.5:
242
+ typo = safe_typos[word]
243
+ # Preserve original capitalization
244
+ if words[word_idx][0].isupper():
245
+ typo = typo[0].upper() + typo[1:]
246
+ words[word_idx] = typo
247
+ sentences[i] = ' '.join(words)
248
 
249
  text = '. '.join(sentences)
250
 
251
+ # Skip double words - too distracting
252
+
253
+ # Mix up common homophones occasionally (2% chance) - ONLY SAFE ONES
254
+ if random.random() < 0.02:
255
+ safe_homophones = [
256
+ ('its', "it's"), # Very common mistake
257
+ ('your', "you're"), # Another common one
258
+ ]
259
+ for pair in safe_homophones:
260
+ # Check context to avoid breaking meaning
261
+ if f" {pair[0]} " in text and random.random() < 0.3:
262
+ # Find one instance and check it's safe to replace
263
+ pattern = rf'\b{pair[0]}\s+(\w+ing|\w+ed)\b' # its + verb = likely should be it's
264
+ if re.search(pattern, text):
265
+ text = re.sub(pattern, f"{pair[1]} \\1", text, count=1)
266
+ break
267
 
268
  return text
269
 
 
279
  # Natural contractions throughout
280
  sentence = self.apply_contractions(sentence)
281
 
282
+ # Add natural speech patterns (15% chance)
283
+ if random.random() < 0.15 and len(sentence.split()) > 10:
284
+ # Natural interruptions that humans actually use
285
+ if random.random() < 0.5:
286
+ # Add "you know" or "I mean" naturally
287
+ words = sentence.split()
288
+ if len(words) > 6:
289
+ pos = random.randint(3, len(words)-3)
290
+ if random.random() < 0.5:
291
+ words.insert(pos, "you know,")
292
+ else:
293
+ words.insert(pos, "I mean,")
294
+ sentence = ' '.join(words)
295
+ else:
296
+ # Start with natural opener
297
+ openers = ["Look,", "See,", "Thing is,", "Honestly,", "Actually,"]
298
+ sentence = random.choice(openers) + " " + sentence[0].lower() + sentence[1:]
299
 
300
+ # Add subtle errors that humans make (8% chance)
301
+ if random.random() < 0.08:
302
  words = sentence.split()
303
+ if len(words) > 5:
304
+ # Common comma omissions
305
+ if ", and" in sentence and random.random() < 0.3:
306
+ sentence = sentence.replace(", and", " and", 1)
307
+ # Double words occasionally
308
+ elif random.random() < 0.2:
309
+ idx = random.randint(1, len(words)-2)
310
+ if words[idx].lower() in ['the', 'a', 'to', 'in', 'on', 'at']:
311
+ words.insert(idx+1, words[idx])
312
+ sentence = ' '.join(words)
313
+
314
+ # Natural sentence combinations (20% chance)
315
+ if i < len(sentences) - 1 and random.random() < 0.2:
 
 
 
 
 
 
 
 
 
 
 
316
  next_sent = sentences[i+1].strip()
317
+ if next_sent and len(sentence.split()) + len(next_sent.split()) < 25:
318
+ # Natural connectors based on content
319
+ if any(w in next_sent.lower() for w in ['but', 'however', 'although']):
320
+ sentence = sentence.rstrip('.') + ", but " + next_sent[0].lower() + next_sent[1:]
321
+ sentences[i+1] = "" # Mark as processed
322
+ elif any(w in next_sent.lower() for w in ['also', 'too', 'as well']):
323
+ sentence = sentence.rstrip('.') + " and " + next_sent[0].lower() + next_sent[1:]
324
+ sentences[i+1] = "" # Mark as processed
325
 
326
  result_sentences.append(sentence)
327
 
328
  return ' '.join([s for s in result_sentences if s])
329
 
 
 
 
 
 
 
330
  def vary_sentence_start(self, sentence):
331
  """Vary sentence beginning to avoid repetitive patterns"""
332
+ if not sentence:
333
+ return sentence
334
+
335
+ words = sentence.split()
336
+ if len(words) < 5:
337
  return sentence
338
 
339
+ # Different ways to start sentences naturally
340
  variations = [
341
+ lambda s: "When " + s[0].lower() + s[1:] + ", it makes sense.",
342
+ lambda s: "If you think about it, " + s[0].lower() + s[1:],
343
+ lambda s: s + " This is important.",
344
+ lambda s: "The thing about " + words[0].lower() + " " + ' '.join(words[1:]) + " is clear.",
345
+ lambda s: "What's interesting is " + s[0].lower() + s[1:],
 
 
 
 
 
 
 
 
346
  lambda s: s, # Keep original sometimes
347
  ]
348
 
349
+ # Pick a random variation
350
+ variation = random.choice(variations)
351
+ try:
352
+ return variation(sentence)
353
+ except:
354
+ return sentence
 
 
 
355
 
356
  class SelectiveGrammarFixer:
357
  """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""
 
397
 
398
  result = ' '.join(fixed_sentences)
399
 
400
+ # Add natural human variations (but we need to reference the main class method)
401
+ # This will be called from the smart_fix method instead
402
+
403
  return result
404
 
405
  def fix_basic_punctuation_errors(self, text):
 
407
  if not text:
408
  return text
409
 
410
+ # Fix double spaces (human-like error)
411
+ text = re.sub(r'\s{2,}', ' ', text)
412
 
413
+ # Fix space before punctuation (common error)
414
+ text = re.sub(r'\s+([.,!?;:])', r'\1', text)
 
415
 
416
  # Fix missing space after punctuation (human-like)
417
  text = re.sub(r'([.,!?])([A-Z])', r'\1 \2', text)
418
 
419
+ # Fix accidental double punctuation
420
+ text = re.sub(r'([.!?])\1+', r'\1', text)
 
421
 
422
+ # Fix "i" capitalization (common human error to fix)
423
+ text = re.sub(r'\bi\b', 'I', text)
 
424
 
425
  return text
426
 
427
  def preserve_natural_variations(self, text):
428
  """Keep some natural human-like variations"""
429
+ # Don't fix everything - leave some variety
430
  # Only fix if really broken
431
  if text.count('.') == 0 and len(text.split()) > 20:
432
  # Long text with no periods - needs fixing
433
  words = text.split()
434
+ # Add periods every 15-25 words naturally (more variation)
435
  new_text = []
436
  for i, word in enumerate(words):
437
  new_text.append(word)
438
+ if i > 0 and i % random.randint(12, 25) == 0:
439
  if word[-1] not in '.!?,;:':
440
  new_text[-1] = word + '.'
441
+ # Capitalize next word if it's not an acronym
442
  if i + 1 < len(words) and words[i + 1][0].islower():
443
+ # Check if it's not likely an acronym
444
+ if not words[i + 1].isupper():
445
+ words[i + 1] = words[i + 1][0].upper() + words[i + 1][1:]
446
  text = ' '.join(new_text)
447
 
448
  return text
 
480
  print("spaCy model not found, using NLTK for sentence splitting")
481
 
482
  try:
483
+ # Load Dipper paraphraser WITHOUT 8-bit quantization for better performance
484
  print("Loading Dipper paraphraser model...")
485
  self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
486
  self.model = T5ForConditionalGeneration.from_pretrained(
487
  "kalpeshk2011/dipper-paraphraser-xxl",
488
+ device_map="auto", # This will distribute across 4xL40S automatically
489
  torch_dtype=torch.float16,
490
  low_cpu_mem_usage=True
491
  )
 
516
  self.bart_model = AutoModelForSeq2SeqLM.from_pretrained(
517
  "eugenesiow/bart-paraphrase",
518
  torch_dtype=torch.float16,
519
+ device_map="auto" # Distribute across GPUs
520
  )
521
  self.bart_tokenizer = AutoTokenizer.from_pretrained("eugenesiow/bart-paraphrase")
522
  self.use_bart = True
 
529
  self.human_variations = HumanLikeVariations()
530
 
531
  def add_natural_human_patterns(self, text):
532
+ """Add natural human writing patterns that Originality AI associates with human text"""
533
+ sentences = self.split_into_sentences_advanced(text)
534
+ result_sentences = []
535
+
536
+ for i, sentence in enumerate(sentences):
537
+ if not sentence.strip():
538
+ continue
539
+
540
+ # Natural contractions throughout
541
+ sentence = self.apply_contractions(sentence)
542
+
543
+ # Add natural speech patterns (15% chance)
544
+ if random.random() < 0.15 and len(sentence.split()) > 10:
545
+ # Natural interruptions that humans actually use
546
+ if random.random() < 0.5:
547
+ # Add "you know" or "I mean" naturally
548
+ words = sentence.split()
549
+ if len(words) > 6:
550
+ pos = random.randint(3, len(words)-3)
551
+ if random.random() < 0.5:
552
+ words.insert(pos, "you know,")
553
+ else:
554
+ words.insert(pos, "I mean,")
555
+ sentence = ' '.join(words)
556
+ else:
557
+ # Start with natural opener
558
+ openers = ["Look,", "See,", "Thing is,", "Honestly,", "Actually,"]
559
+ sentence = random.choice(openers) + " " + sentence[0].lower() + sentence[1:]
560
+
561
+ # Add subtle errors that humans make (8% chance)
562
+ if random.random() < 0.08:
563
+ words = sentence.split()
564
+ if len(words) > 5:
565
+ # Common comma omissions
566
+ if ", and" in sentence and random.random() < 0.3:
567
+ sentence = sentence.replace(", and", " and", 1)
568
+ # Double words occasionally
569
+ elif random.random() < 0.2:
570
+ idx = random.randint(1, len(words)-2)
571
+ if words[idx].lower() in ['the', 'a', 'to', 'in', 'on', 'at']:
572
+ words.insert(idx+1, words[idx])
573
+ sentence = ' '.join(words)
574
+
575
+ # Natural sentence combinations (20% chance)
576
+ if i < len(sentences) - 1 and random.random() < 0.2:
577
+ next_sent = sentences[i+1].strip()
578
+ if next_sent and len(sentence.split()) + len(next_sent.split()) < 25:
579
+ # Natural connectors based on content
580
+ if any(w in next_sent.lower() for w in ['but', 'however', 'although']):
581
+ sentence = sentence.rstrip('.') + ", but " + next_sent[0].lower() + next_sent[1:]
582
+ sentences[i+1] = "" # Mark as processed
583
+ elif any(w in next_sent.lower() for w in ['also', 'too', 'as well']):
584
+ sentence = sentence.rstrip('.') + " and " + next_sent[0].lower() + next_sent[1:]
585
+ sentences[i+1] = "" # Mark as processed
586
+
587
+ result_sentences.append(sentence)
588
+
589
+ return ' '.join([s for s in result_sentences if s])
590
 
591
  def vary_sentence_start(self, sentence):
592
+ """Vary sentence beginning to avoid repetitive patterns"""
593
+ if not sentence:
594
+ return sentence
595
+
596
+ words = sentence.split()
597
+ if len(words) < 5:
598
+ return sentence
599
+
600
+ # Different ways to start sentences naturally
601
+ variations = [
602
+ lambda s: "When " + s[0].lower() + s[1:] + ", it makes sense.",
603
+ lambda s: "If you think about it, " + s[0].lower() + s[1:],
604
+ lambda s: s + " This is important.",
605
+ lambda s: "The thing about " + words[0].lower() + " " + ' '.join(words[1:]) + " is clear.",
606
+ lambda s: "What's interesting is " + s[0].lower() + s[1:],
607
+ lambda s: s, # Keep original sometimes
608
+ ]
609
+
610
+ # Pick a random variation
611
+ variation = random.choice(variations)
612
+ try:
613
+ return variation(sentence)
614
+ except:
615
+ return sentence
616
 
617
  def apply_contractions(self, text):
618
+ """Apply common contractions to make text more natural"""
619
+ contractions = {
620
+ "it is": "it's", "that is": "that's", "there is": "there's",
621
+ "he is": "he's", "she is": "she's", "what is": "what's",
622
+ "where is": "where's", "who is": "who's", "how is": "how's",
623
+ "cannot": "can't", "will not": "won't", "do not": "don't",
624
+ "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
625
+ "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
626
+ "are not": "aren't", "was not": "wasn't", "were not": "weren't",
627
+ "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
628
+ "I am": "I'm", "you are": "you're", "we are": "we're",
629
+ "they are": "they're", "I have": "I've", "you have": "you've",
630
+ "we have": "we've", "they have": "they've", "I will": "I'll",
631
+ "you will": "you'll", "he will": "he'll", "she will": "she'll",
632
+ "we will": "we'll", "they will": "they'll", "I would": "I'd",
633
+ "you would": "you'd", "he would": "he'd", "she would": "she'd",
634
+ "we would": "we'd", "they would": "they'd", "could have": "could've",
635
+ "should have": "should've", "would have": "would've", "might have": "might've",
636
+ "must have": "must've", "there has": "there's", "here is": "here's",
637
+ "let us": "let's", "that will": "that'll", "who will": "who'll"
638
+ }
639
+
640
+ for full, contr in contractions.items():
641
+ text = re.sub(r'\b' + full + r'\b', contr, text, flags=re.IGNORECASE)
642
+
643
+ return text
644
 
645
  def preserve_keywords(self, text, keywords):
646
  """Mark keywords to preserve them during paraphrasing"""
 
674
  return modified_text, keyword_map
675
 
676
  def restore_keywords_robust(self, text, keyword_map):
677
+ """Restore keywords with more flexible pattern matching"""
678
  if not keyword_map:
679
  return text
680
 
 
704
  if match:
705
  num = match.group(1)
706
 
707
+ # Various patterns the model might create
708
  patterns = [
 
709
  (f'__KW{num}__', keyword),
710
  (f'__ KW{num}__', keyword),
711
  (f'__KW {num}__', keyword),
 
720
  (f'__KW{num}_', keyword),
721
  (f'_KW{num}__', keyword),
722
  (f'kw{num}', keyword),
723
+ (f'``KW{num}__', keyword), # Handle backtick corruption
724
+ (f'``KKW{num}', keyword), # Handle double K corruption
725
+ (f'KW{num}', keyword), # Simple pattern
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  ]
727
 
728
  for pattern, replacement in patterns:
729
+ if pattern in restored_text:
 
 
 
 
 
 
 
 
 
 
 
 
730
  # Check if this position has already been replaced
731
+ start_pos = restored_text.find(pattern)
732
+ if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
733
+ print(f"Found pattern '{pattern}', replacing with {replacement}")
734
+ restored_text = restored_text.replace(pattern, replacement, 1) # Replace only first occurrence
 
 
 
 
735
  # Mark new positions as replaced
736
+ for match in re.finditer(re.escape(replacement), restored_text):
737
+ replaced_positions.update(range(match.start(), match.end()))
738
+ break # Move to next placeholder after successful replacement
 
 
 
 
 
739
 
740
+ # Third pass: Clean up any backticks or quotes that shouldn't be there
741
+ # Remove double backticks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  restored_text = re.sub(r'``+', '', restored_text)
743
+ # Fix double quotes
744
  restored_text = re.sub(r"''", '"', restored_text)
745
  restored_text = re.sub(r'""', '"', restored_text)
746
 
747
+ # Fourth pass: Look for remaining underscore patterns
748
+ # But be more careful about replacement
749
  if '___' in restored_text and keyword_map:
750
  # Find all occurrences of multiple underscores
751
  underscore_matches = list(re.finditer(r'_{3,}', restored_text))
 
763
  replaced_positions.update(range(start, start + len(keyword_values[i])))
764
 
765
  # Final cleanup: Remove any remaining KW patterns that weren't caught
766
+ # But only if they're not part of an already replaced keyword
767
+ remaining_kw_patterns = re.findall(r'\bKW\d{3}\b', restored_text)
768
+ if remaining_kw_patterns:
769
+ print(f"Warning: Found remaining KW patterns: {remaining_kw_patterns}")
 
 
 
770
 
771
  # Log final result
772
  print(f"Final restored text: {restored_text[:100]}...")
 
796
  return True
797
 
798
  # Special handling for content inside tables
799
+ # Skip if it's inside strong/b/h1-h6 tags AND also inside a table
800
  if parent:
801
  # Check if we're inside a table
802
  is_in_table = any(p.name == 'table' for p in parent.parents)
 
824
  if any(handler in parent.attrs for handler in event_handlers):
825
  return True
826
 
827
+ # Special check for testimonial cards - check up to 3 levels of ancestors
828
  if parent:
829
  ancestors_to_check = []
830
  current = parent
 
843
  elif isinstance(classes, str) and 'testimonial-card' in classes:
844
  return True
845
 
846
+ # Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
847
  skip_indicators = [
848
  'cta-', 'button', 'btn', 'heading', 'title', 'caption',
849
  'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
 
857
  'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
858
  ]
859
 
860
+ # Check only immediate parent and grandparent (not all ancestors)
861
  elements_to_check = [parent]
862
  if parent and parent.parent:
863
  elements_to_check.append(parent.parent)
 
926
  return False
927
 
928
  def clean_model_output_enhanced(self, text):
929
+ """Enhanced cleaning that preserves more natural structure"""
930
  if not text:
931
  return ""
932
 
 
958
  text = re.sub(r'- or maybe I should say -', '', text)
959
  text = re.sub(r'- or rather,', '', text)
960
  text = re.sub(r'- think about it -', '', text)
 
 
 
 
961
 
962
  # Clean up multiple spaces
963
  text = re.sub(r'\s+', ' ', text)
964
 
965
+ # Remove leading non-letter characters carefully
966
+ # IMPORTANT: Preserve keyword placeholders
967
+ if not re.match(r'^(__KW\d+__|KW\d+)', text):
968
+ # Only remove if it doesn't start with a placeholder
969
+ text = re.sub(r'^[^a-zA-Z_]+', '', text)
 
970
 
971
  # If we accidentally removed too much, use original
972
  if len(text) < len(original) * 0.5:
 
1000
  continue
1001
 
1002
  try:
1003
+ # ULTRA-HIGH diversity for Originality AI
1004
  has_keywords = any(placeholder in sentence for placeholder in keyword_map.keys())
1005
  if has_keywords:
1006
+ lex_diversity = 60 # Moderate for keywords
1007
+ order_diversity = 20
1008
  elif len(sentence.split()) < 10:
1009
+ lex_diversity = 85 # Very high for short
1010
+ order_diversity = 40
1011
  else:
1012
+ lex_diversity = 95 # Maximum diversity
1013
+ order_diversity = 50 # Maximum order diversity
1014
 
1015
  lex_code = int(100 - lex_diversity)
1016
  order_code = int(100 - order_diversity)
 
1037
  else:
1038
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
1039
 
1040
+ # Generate with appropriate variation
1041
  original_length = len(sentence.split())
1042
+ max_new_length = int(original_length * 1.4)
1043
 
1044
+ # High variation parameters
1045
+ temp = 0.95 if has_keywords else 1.3
1046
+ top_p_val = 0.9
1047
 
1048
  with torch.no_grad():
1049
  outputs = self.model.generate(
1050
  **inputs,
1051
  max_length=max_new_length + 20,
1052
+ min_length=max(5, int(original_length * 0.7)),
1053
  do_sample=True,
1054
  top_p=top_p_val,
1055
  temperature=temp,
1056
+ no_repeat_ngram_size=4, # Allow more repetition for naturalness
1057
  num_beams=1, # Greedy for more randomness
1058
  early_stopping=True
1059
  )
 
1145
  last_word = words[-1]
1146
 
1147
  # Remove if it's clearly cut off (1-2 chars, no vowels)
1148
+ # But don't remove valid short words like "is", "of", "to", etc.
1149
+ short_valid_words = {'is', 'of', 'to', 'in', 'on', 'at', 'by', 'or', 'if', 'so', 'up', 'no', 'we', 'he', 'me', 'be', 'do', 'go'}
1150
  if (len(last_word) <= 2 and
1151
  last_word.lower() not in short_valid_words and
1152
  not any(c in 'aeiouAEIOU' for c in last_word)):
 
1167
  generated += '.'
1168
  elif orig_stripped.endswith('!'):
1169
  # Check if generated seems exclamatory
1170
+ exclaim_words = ['amazing', 'incredible', 'fantastic', 'terrible', 'awful', 'wonderful', 'excellent']
1171
  if any(word in generated.lower() for word in exclaim_words):
1172
  generated += '!'
1173
  else:
 
1237
  with torch.no_grad():
1238
  outputs = self.bart_model.generate(
1239
  **inputs,
1240
+ max_length=int(original_length * 1.4) + 10,
1241
+ min_length=max(5, int(original_length * 0.6)),
1242
  num_beams=2,
1243
+ temperature=1.1, # Higher temperature
1244
  do_sample=True,
1245
+ top_p=0.9,
1246
  early_stopping=True
1247
  )
1248
 
 
1268
  return text
1269
 
1270
  def apply_sentence_variation(self, text):
1271
+ """Apply natural sentence structure variations - HUMAN-LIKE FLOW"""
1272
  sentences = self.split_into_sentences_advanced(text)
1273
  varied_sentences = []
1274
 
1275
  # Track patterns to ensure variety
1276
  last_sentence_length = 0
 
1277
 
1278
  for i, sentence in enumerate(sentences):
1279
  if not sentence.strip():
 
1282
  words = sentence.split()
1283
  current_length = len(words)
1284
 
1285
+ # Natural sentence length variation
1286
+ if last_sentence_length > 20 and current_length > 20:
1287
+ # Break up if two long sentences in a row
1288
+ if ',' in sentence:
1289
+ parts = sentence.split(',', 1)
1290
+ if len(parts) == 2 and len(parts[1].split()) > 8:
1291
+ varied_sentences.append(parts[0].strip() + '.')
1292
+ second_part = parts[1].strip()
1293
+ if second_part and second_part[0].islower():
1294
+ second_part = second_part[0].upper() + second_part[1:]
1295
+ varied_sentences.append(second_part)
1296
+ last_sentence_length = len(parts[1].split())
1297
+ continue
1298
+
1299
+ # Natural combinations for flow
 
 
 
 
 
 
 
 
 
1300
  if (i < len(sentences) - 1 and
1301
+ current_length < 10 and
1302
+ len(sentences[i+1].split()) < 10):
 
1303
 
1304
  next_sent = sentences[i+1].strip()
1305
+ # Only combine if it makes semantic sense
1306
+ if next_sent and any(next_sent.lower().startswith(w) for w in ['it', 'this', 'that', 'which']):
1307
+ combined = sentence.rstrip('.') + ' ' + next_sent[0].lower() + next_sent[1:]
1308
+ varied_sentences.append(combined)
1309
+ sentences[i+1] = ""
1310
+ last_sentence_length = len(combined.split())
1311
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1312
 
1313
  varied_sentences.append(sentence)
1314
  last_sentence_length = current_length
1315
 
1316
  return ' '.join([s for s in varied_sentences if s])
1317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1318
  def fix_punctuation(self, text):
1319
  """Comprehensive punctuation and formatting fixes"""
1320
  if not text:
 
1326
  # Fix weird symbols and characters using safe replacements
1327
  text = text.replace('<>', '') # Remove empty angle brackets
1328
 
1329
+ # Normalize quotes - use replace instead of regex for problematic characters
1330
  text = text.replace('«', '"').replace('»', '"')
1331
  text = text.replace('„', '"').replace('"', '"').replace('"', '"')
1332
  text = text.replace(''', "'").replace(''', "'")
1333
  text = text.replace('–', '-').replace('—', '-')
1334
 
1335
  # Fix colon issues
1336
+ text = re.sub(r'\.:', ':', text) # Remove period before colon
1337
+ text = re.sub(r':\s*\.', ':', text) # Remove period after colon
1338
 
1339
+ # Fix basic spacing
1340
+ text = re.sub(r'\s+', ' ', text) # Multiple spaces to single
1341
+ text = re.sub(r'\s+([.,!?;:])', r'\1', text) # Remove space before punctuation
1342
+ text = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', text) # Remove double punctuation
1343
+ text = re.sub(r'([.!?])\s*\1+', r'\1', text) # Remove repeated punctuation
 
1344
 
1345
  # Fix colons
1346
+ text = re.sub(r':\s*([.,!?])', ':', text) # Remove punctuation after colon
1347
+ text = re.sub(r'([.,!?])\s*:', ':', text) # Remove punctuation before colon
1348
+ text = re.sub(r':+', ':', text) # Multiple colons to one
1349
 
1350
  # Fix quotes and parentheses
1351
  text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
 
1353
  text = re.sub(r'\(\s*([^)]*?)\s*\)', r'(\1)', text)
1354
 
1355
  # Fix sentence capitalization more carefully
1356
+ # Split on ACTUAL sentence endings only
1357
  sentences = re.split(r'(?<=[.!?])\s+', text)
1358
  fixed_sentences = []
1359
 
 
1361
  if not sentence:
1362
  continue
1363
 
1364
+ # Only capitalize the first letter if it's actually lowercase
1365
+ # and not part of a special case (like iPhone, eBay, etc.)
1366
  words = sentence.split()
1367
  if words:
1368
  first_word = words[0]
1369
+ # Check if it's not an acronym or proper noun that should stay lowercase
1370
  if (first_word[0].islower() and
1371
  not self.is_likely_acronym_or_proper_noun(first_word) and
1372
  not first_word.startswith('__KW') and
1373
  not first_word.startswith('_kw')):
1374
+ # Only capitalize if it's a regular word
1375
  sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
1376
 
1377
  fixed_sentences.append(sentence)
1378
 
1379
  text = ' '.join(fixed_sentences)
1380
 
1381
+ # Fix common issues
1382
+ text = re.sub(r'\bi\b', 'I', text) # Capitalize 'I'
1383
+ text = re.sub(r'\.{2,}', '.', text) # Multiple periods to one
1384
+ text = re.sub(r',{2,}', ',', text) # Multiple commas to one
1385
+ text = re.sub(r'\s*,\s*,\s*', ', ', text) # Double commas with spaces
1386
+
1387
+ # Remove weird artifacts
1388
+ text = re.sub(r'\b(CHAPTER\s+[IVX]+|SECTION\s+\d+)\b[^\w]*', '', text, flags=re.IGNORECASE)
1389
 
1390
  # Fix abbreviations
1391
  text = re.sub(r'\betc\s*\.\s*\.', 'etc.', text)
1392
  text = re.sub(r'\be\.g\s*\.\s*[,\s]', 'e.g., ', text)
1393
  text = re.sub(r'\bi\.e\s*\.\s*[,\s]', 'i.e., ', text)
1394
 
1395
+ # Fix numbers with periods (like "1. " at start of lists)
1396
  text = re.sub(r'(\d+)\.\s+', r'\1. ', text)
1397
 
1398
  # Fix bold/strong tags punctuation
1399
  text = self.fix_bold_punctuation(text)
1400
 
1401
+ # Clean up any remaining issues
1402
+ text = re.sub(r'\s+([.,!?;:])', r'\1', text) # Final space cleanup
1403
+ text = re.sub(r'([.,!?;:])\s{2,}', r'\1 ', text) # Fix multiple spaces after punctuation
1404
 
1405
  # Ensure ending punctuation
1406
  text = text.strip()
1407
  if text and text[-1] not in '.!?':
1408
+ # Don't add period if it ends with colon (likely a list header)
1409
  if not text.endswith(':'):
1410
  text += '.'
1411
 
 
1415
  """Fix punctuation issues around bold/strong tags"""
1416
  # Check if this is likely a list item with colon pattern
1417
  def is_list_item_with_colon(text):
1418
+ # Pattern: starts with or contains <strong>Text:</strong> or <b>Text:</b>
1419
  list_pattern = r'^\s*(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
1420
  return bool(re.search(list_pattern, text))
1421
 
1422
  # If it's a list item with colon, preserve the format
1423
  if is_list_item_with_colon(text):
1424
+ # Just clean up spacing but preserve the colon inside bold
1425
  text = re.sub(r'<(strong|b)>\s*([^:]+)\s*:\s*</\1>', r'<\1>\2:</\1>', text)
1426
  return text
1427
 
 
1437
 
1438
  # Check if this is a list header (contains colon at the end)
1439
  if content.endswith(':'):
1440
+ # Preserve list headers with colons
1441
  return f'<{tag}>{content}</{tag}>'
1442
 
1443
  # Remove any periods at the start or end of bold content
1444
  content = content.strip('.')
1445
 
1446
  # Check if this bold text is at the start of a sentence
1447
+ # (preceded by nothing, or by '. ', '! ', '? ')
1448
  start_pos = match.start()
1449
  is_sentence_start = (start_pos == 0 or
1450
  (start_pos > 2 and text[start_pos-2:start_pos] in ['. ', '! ', '? ', '\n\n']))
 
1458
  # Fix bold/strong tags
1459
  text = re.sub(bold_pattern, fix_bold_match, text)
1460
 
1461
+ # Fix spacing around bold/strong tags (but not for list items)
1462
  if not is_list_item_with_colon(text):
1463
+ text = re.sub(r'\.\s*<(strong|b)>', r'. <\1>', text) # Period before bold
1464
+ text = re.sub(r'</(strong|b)>\s*\.', r'</\1>.', text) # Period after bold
1465
+ text = re.sub(r'([.!?])\s*<(strong|b)>', r'\1 <\2>', text) # Space after sentence end
1466
+ text = re.sub(r'</(strong|b)>\s+([a-z])', lambda m: f'</{m.group(1)}> {m.group(2)}', text) # Keep lowercase after bold if mid-sentence
1467
 
1468
  # Remove duplicate periods around bold tags
1469
  text = re.sub(r'\.\s*</(strong|b)>\s*\.', r'</\1>.', text)
1470
  text = re.sub(r'\.\s*<(strong|b)>\s*\.', r'. <\1>', text)
1471
 
1472
  # Fix cases where bold content ends a sentence
1473
+ # If bold is followed by a new sentence (capital letter), add period
1474
  text = re.sub(r'</(strong|b)>\s+([A-Z])', r'</\1>. \2', text)
1475
 
1476
  # Don't remove these for list items
1477
  if not is_list_item_with_colon(text):
1478
+ text = re.sub(r'<(strong|b)>\s*:\s*</\1>', ':', text) # Remove empty bold colons
1479
+ text = re.sub(r'<(strong|b)>\s*\.\s*</\1>', '.', text) # Remove empty bold periods
1480
 
1481
  return text
1482
 
 
1485
  soup = BeautifulSoup(html_content, 'html.parser')
1486
  text_elements = []
1487
 
1488
+ # Get all text nodes using string instead of text (fixing deprecation)
1489
  for element in soup.find_all(string=True):
1490
  # Skip script, style, and noscript content completely
1491
  if element.parent.name in ['script', 'style', 'noscript']:
 
1507
  html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
1508
 
1509
  # Fix spacing issues
1510
+ html_text = re.sub(r'>\s+<', '><', html_text) # Remove extra spaces between tags
1511
+ html_text = re.sub(r'\s+>', '>', html_text) # Remove spaces before closing >
1512
+ html_text = re.sub(r'<\s+', '<', html_text) # Remove spaces after opening <
1513
 
1514
+ # Fix common word errors that might occur during processing
1515
  html_text = html_text.replace('down loaded', 'downloaded')
1516
  html_text = html_text.replace('But your document', 'Your document')
1517
 
 
1525
  # Find all paragraph tags
1526
  for p_tag in soup.find_all('p'):
1527
  # Skip paragraphs that are inside special elements
1528
+ # Check if paragraph is inside any of these elements
1529
  skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
1530
  'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
1531
  'div.quiz-container', 'div.question-container', 'div.results']
 
1574
  continue
1575
 
1576
  # Skip if the text node's immediate parent isn't the p tag
1577
+ # (to avoid nested elements)
1578
  if text_node.parent != p_tag:
1579
  continue
1580
 
 
1612
  text_node.insert_after(new_node)
1613
  text_node.extract()
1614
 
1615
+ def add_natural_flow_variations(self, text):
1616
+ """Add more natural flow and rhythm variations for Originality AI"""
1617
+ sentences = self.split_into_sentences_advanced(text)
1618
+ enhanced_sentences = []
1619
+
1620
+ for i, sentence in enumerate(sentences):
1621
+ if not sentence.strip():
1622
+ continue
1623
+
1624
+ # Add stream-of-consciousness elements (10% chance)
1625
+ if random.random() < 0.1 and len(sentence.split()) > 10:
1626
+ stream_elements = [
1627
+ " - wait, let me back up - ",
1628
+ " - actually, scratch that - ",
1629
+ " - or maybe I should say - ",
1630
+ " - hmm, how do I put this - ",
1631
+ " - okay, here's the thing - ",
1632
+ " - you know what I mean? - "
1633
+ ]
1634
+ words = sentence.split()
1635
+ pos = random.randint(len(words)//4, 3*len(words)//4)
1636
+ words.insert(pos, random.choice(stream_elements))
1637
+ sentence = ' '.join(words)
1638
+
1639
+ # Add human-like self-corrections (5% chance)
1640
+ if random.random() < 0.05:
1641
+ corrections = [
1642
+ " - or rather, ",
1643
+ " - well, actually, ",
1644
+ " - I mean, ",
1645
+ " - or should I say, ",
1646
+ " - correction: "
1647
+ ]
1648
+ words = sentence.split()
1649
+ if len(words) > 8:
1650
+ pos = random.randint(len(words)//2, len(words)-3)
1651
+ correction = random.choice(corrections)
1652
+ # Repeat a concept with variation
1653
+ repeated_word_idx = random.randint(max(0, pos-5), pos-1)
1654
+ if repeated_word_idx < len(words):
1655
+ words.insert(pos, correction)
1656
+ sentence = ' '.join(words)
1657
+
1658
+ # Add thinking-out-loud patterns (8% chance)
1659
+ if random.random() < 0.08 and i > 0:
1660
+ thinking_patterns = [
1661
+ "Come to think of it, ",
1662
+ "Actually, you know what? ",
1663
+ "Wait, here's a thought: ",
1664
+ "Oh, and another thing - ",
1665
+ "Speaking of which, ",
1666
+ "This reminds me, ",
1667
+ "Now that I mention it, ",
1668
+ "Funny you should ask, because "
1669
+ ]
1670
+ pattern = random.choice(thinking_patterns)
1671
+ sentence = pattern + sentence[0].lower() + sentence[1:] if len(sentence) > 1 else sentence
1672
+
1673
+ enhanced_sentences.append(sentence)
1674
+
1675
+ return ' '.join(enhanced_sentences)
1676
+
1677
  def process_html(self, html_content, primary_keywords="", secondary_keywords="", progress_callback=None):
1678
  """Main processing function with progress callback"""
1679
  if not html_content.strip():
 
1706
  # Combine keywords and clean them
1707
  all_keywords = []
1708
  if primary_keywords:
1709
+ # Clean and validate each keyword
1710
  for k in primary_keywords.split(','):
1711
  cleaned = k.strip()
1712
+ if cleaned and len(cleaned) > 1: # Skip empty or single-char keywords
1713
  all_keywords.append(cleaned)
1714
  if secondary_keywords:
1715
  for k in secondary_keywords.split(','):
 
1754
  if text_has_keywords:
1755
  print(f"Debug: Processing text with keywords: {original_text[:50]}...")
1756
 
1757
+ # First pass with Dipper (with adjusted diversity)
1758
  paraphrased_text = self.paraphrase_with_dipper(
1759
  original_text,
1760
  keywords=all_keywords
 
1763
  # Verify no placeholders remain
1764
  if '__KW' in paraphrased_text or '___' in paraphrased_text:
1765
  print(f"Warning: Placeholder or underscores found in paraphrased text: {paraphrased_text[:100]}...")
1766
+ # Try to restore again with the enhanced function
1767
  temp_map = {}
1768
  for j, keyword in enumerate(all_keywords):
1769
  temp_map[f'__KW{j:03d}__'] = keyword
 
1771
 
1772
  # Second pass with BART for longer texts (increased probability)
1773
  if self.use_bart and len(paraphrased_text.split()) > 8:
1774
+ # 50% chance to use BART for more variation (reduced from 60%)
1775
+ if random.random() < 0.5:
1776
  paraphrased_text = self.paraphrase_with_bart(
1777
  paraphrased_text,
1778
  keywords=all_keywords
1779
  )
1780
 
1781
+ # Apply sentence variation
1782
  paraphrased_text = self.apply_sentence_variation(paraphrased_text)
1783
 
1784
  # Add natural flow variations
1785
  paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
1786
 
 
 
 
1787
  # Fix punctuation and formatting
1788
  paraphrased_text = self.fix_punctuation(paraphrased_text)
1789
 
1790
+ # Final check for any remaining placeholders or underscores
1791
+ if '___' in paraphrased_text or '__KW' in paraphrased_text:
1792
  print(f"Error: Unresolved placeholders in final text")
1793
  # Use original text if we can't resolve placeholders
1794
  paraphrased_text = original_text
 
1809
  # Wrap keywords with <strong> tags in paragraphs
1810
  self.wrap_keywords_in_paragraphs(soup, all_keywords)
1811
 
1812
+ # Post-process the entire HTML to fix bold/strong formatting
1813
  result = str(soup)
1814
  result = self.post_process_html(result)
1815
 
1816
+ # Final safety check for any remaining placeholders or underscores
1817
+ if '__KW' in result or re.search(r'_{3,}', result):
1818
+ print("Warning: Found placeholders or multiple underscores in final HTML output")
1819
+ # Attempt to clean them with keywords
1820
  for i, keyword in enumerate(all_keywords):
1821
  result = result.replace(f'__KW{i:03d}__', keyword)
1822
+ result = re.sub(r'_{3,}', keyword, result, count=1)
 
 
 
1823
 
1824
  # Restore all script tags
1825
  for idx, script_content in enumerate(preserved_scripts):
 
1834
  # Validate and fix HTML syntax
1835
  result = self.validate_and_fix_html(result)
1836
 
1837
+ # Count skipped elements properly
1838
  all_text_elements = soup.find_all(string=True)
1839
  skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
1840
 
 
1848
  import traceback
1849
  error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
1850
  print(error_msg)
1851
+ # Return original HTML with error message prepended as HTML comment
1852
  return f"<!-- {error_msg} -->\n{html_content}"
1853
 
1854
  def post_process_html(self, html_text):
1855
  """Post-process the entire HTML to fix formatting issues"""
1856
+ # Fix empty angle brackets that might appear
1857
+ html_text = re.sub(r'<>\s*([^<>]+?)\s*(?=\.|\s|<)', r'\1', html_text) # Remove <> around text
1858
+ html_text = re.sub(r'<>', '', html_text) # Remove any remaining empty <>
1859
 
1860
  # Fix double angle brackets around bold tags
1861
  html_text = re.sub(r'<<b>>', '<b>', html_text)
 
1864
  html_text = re.sub(r'<</strong>>', '</strong>', html_text)
1865
 
1866
  # Fix periods around bold/strong tags
1867
+ html_text = re.sub(r'\.\s*<(b|strong)>', r'. <\1>', html_text) # Period before bold
1868
+ html_text = re.sub(r'</(b|strong)>\s*\.', r'</\1>.', html_text) # Period after bold
1869
+ html_text = re.sub(r'\.<<(b|strong)>>', r'. <\1>', html_text) # Fix double bracket cases
1870
  html_text = re.sub(r'</(b|strong)>>\.', r'</\1>.', html_text)
1871
 
1872
  # Fix periods after colons
 
1878
  # Check if this line contains a list pattern with bold
1879
  list_pattern = r'(?:^|\s)(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
1880
  if re.search(list_pattern, line):
1881
+ # This is a list item, preserve the colon format
1882
  return line
1883
 
1884
  # Not a list item, apply regular fixes
1885
+ # Remove periods immediately inside bold tags
1886
  line = re.sub(r'<(strong|b)>\s*\.\s*([^<]+)\s*\.\s*</\1>', r'<\1>\2</\1>', line)
1887
+
1888
+ # Fix sentence endings with bold
1889
  line = re.sub(r'</(strong|b)>\s*([.!?])', r'</\1>\2', line)
1890
 
1891
  return line
1892
 
1893
+ # Process line by line to preserve list formatting
1894
  lines = html_text.split('\n')
1895
  processed_lines = [process_line(line) for line in lines]
1896
  html_text = '\n'.join(processed_lines)
 
1916
  # Look for bold/strong tags and check their context
1917
  html_text = re.sub(r'(^|.*?)(<(?:strong|b)>)([a-zA-Z])', fix_bold_sentence_start, html_text)
1918
 
1919
+ # Clean up spacing around bold tags (but preserve list formatting)
1920
+ # Split into segments to handle list items separately
1921
  segments = re.split(r'(<(?:strong|b)>[^<]*:</(?:strong|b)>)', html_text)
1922
  cleaned_segments = []
1923
 
 
1928
  # Apply spacing fixes to non-list segments
1929
  segment = re.sub(r'\s+<(strong|b)>', r' <\1>', segment)
1930
  segment = re.sub(r'</(strong|b)>\s+', r'</\1> ', segment)
1931
+ # Fix punctuation issues
1932
  segment = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', segment)
1933
+ # Fix periods inside/around bold
1934
  segment = re.sub(r'\.<(strong|b)>\.', r'. <\1>', segment)
1935
  segment = re.sub(r'\.</(strong|b)>\.', r'</\1>.', segment)
1936
  cleaned_segments.append(segment)
 
1938
  html_text = ''.join(cleaned_segments)
1939
 
1940
  # Final cleanup
1941
+ html_text = re.sub(r'\.{2,}', '.', html_text) # Multiple periods
1942
+ html_text = re.sub(r',{2,}', ',', html_text) # Multiple commas
1943
+ html_text = re.sub(r':{2,}', ':', html_text) # Multiple colons
1944
+ html_text = re.sub(r'\s+([.,!?;:])', r'\1', html_text) # Space before punctuation
1945
 
1946
+ # Fix empty bold tags (but not those with just colons)
1947
  html_text = re.sub(r'<(strong|b)>\s*</\1>', '', html_text)
1948
 
1949
  # Fix specific patterns in lists/stats
1950
+ # Pattern like "5,000+" should not have period after
1951
  html_text = re.sub(r'(\d+[,\d]*\+?)\s*\.\s*\n', r'\1\n', html_text)
1952
 
1953
  # Clean up any remaining double brackets
 
2011
  lines=10,
2012
  label="Humanized HTML Output"
2013
  ),
2014
+ title="Enhanced Dipper AI Humanizer - Optimized for Originality AI",
2015
  description="""
2016
  Ultra-aggressive humanizer optimized to achieve 100% human scores on both Undetectable AI and Originality AI.
2017
 
2018
+ Key Features:
2019
+ - Maximum diversity settings (90% lexical, 40% order) for natural variation
2020
+ - Enhanced human patterns: personal opinions, self-corrections, thinking-out-loud
2021
+ - Natural typos, contractions, and conversational flow
2022
+ - Stream-of-consciousness elements and rhetorical questions
2023
+ - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
2024
+ - Fixed placeholder system that preserves keywords
 
 
 
 
 
2025
  - Keywords inside <p> tags are automatically wrapped with <strong> tags
2026
+ - Skips content in <strong>, <b>, and heading tags (including inside tables)
2027
+ - Designed to pass the strictest AI detection systems
 
2028
 
2029
+ The tool creates genuinely human-like writing patterns that fool even the most sophisticated detectors!
 
 
 
2030
 
2031
+ ⚠️ Note: Processing may take 5-10 minutes for large HTML documents.
2032
  """,
2033
  examples=[
2034
  ["""<article>