Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -663,122 +663,237 @@ class EnhancedDipperHumanizer:
|
|
| 663 |
return text
|
| 664 |
|
| 665 |
def should_skip_element(self, element, text):
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
return True
|
| 669 |
-
|
| 670 |
-
# Skip JavaScript code inside script tags - CRITICAL FIX
|
| 671 |
-
parent = element.parent
|
| 672 |
-
if parent and parent.name in ['script', 'style', 'noscript']:
|
| 673 |
-
return True
|
| 674 |
-
|
| 675 |
-
# Also check if we're inside a script tag at any level
|
| 676 |
-
for ancestor in element.parents:
|
| 677 |
-
if ancestor.name in ['script', 'style', 'noscript']:
|
| 678 |
return True
|
| 679 |
-
|
| 680 |
-
# Rest of your existing skip logic...
|
| 681 |
-
return False
|
| 682 |
-
|
| 683 |
-
def extract_text_from_html(self, html_content):
    """Extract the text nodes of an HTML document that may be paraphrased.

    Every <script> tag has its content swapped for a unique placeholder so
    that the caller can restore the original tag after processing.

    Args:
        html_content: HTML markup as a string.

    Returns:
        tuple: ``(soup, text_elements, script_placeholders)`` where
        ``soup`` is the parsed document, ``text_elements`` is a list of
        ``{'text': stripped_text, 'element': text_node}`` dicts, and
        ``script_placeholders`` maps each placeholder to the original
        serialized <script> tag.
    """
    # bs4 is already a dependency of this file (BeautifulSoup is used below).
    from bs4.element import CData, Comment, Declaration, Doctype, ProcessingInstruction

    # find_all(string=True) also yields these special string nodes. They are
    # markup, not prose: replacing one with a plain string would turn a
    # comment or doctype into visible text.
    non_content_strings = (CData, Comment, Declaration, Doctype, ProcessingInstruction)

    soup = BeautifulSoup(html_content, 'html.parser')
    text_elements = []

    # CRITICAL: Preserve all script tags completely
    script_placeholders = {}
    for i, script in enumerate(soup.find_all('script')):
        placeholder = f"###SCRIPT_CONTENT_{i}###"
        # Serialize before overwriting the content, so the original survives.
        script_placeholders[placeholder] = str(script)
        script.string = placeholder

    # Get all text nodes
    for element in soup.find_all(string=True):
        if isinstance(element, non_content_strings):
            continue

        # Skip script, style, and noscript content completely
        if element.parent.name in ('script', 'style', 'noscript'):
            continue

        text = element.strip()
        if not text:
            continue

        # Skip if it's a script placeholder
        if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
            continue

        if not self.should_skip_element(element, text):
            text_elements.append({
                'text': text,
                'element': element
            })

    return soup, text_elements, script_placeholders
|
| 715 |
-
|
| 716 |
-
def process_html(self, html_content, progress_callback=None):
|
| 717 |
-
"""Main processing function with progress callback"""
|
| 718 |
-
if not html_content.strip():
|
| 719 |
-
return "Please provide HTML content."
|
| 720 |
-
|
| 721 |
-
try:
|
| 722 |
-
# Extract text elements with script preservation
|
| 723 |
-
soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
|
| 724 |
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
processed_count = 0
|
| 730 |
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
|
|
|
|
|
|
| 737 |
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
lex_diversity=60,
|
| 746 |
-
order_diversity=20
|
| 747 |
-
)
|
| 748 |
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 758 |
|
| 759 |
-
|
| 760 |
-
if
|
| 761 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 762 |
|
| 763 |
-
#
|
| 764 |
-
|
|
|
|
| 765 |
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
|
|
|
| 769 |
|
| 770 |
-
#
|
| 771 |
-
|
| 772 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
-
|
| 775 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
|
| 783 |
def is_likely_acronym_or_proper_noun(self, word):
|
| 784 |
"""Check if a word is likely an acronym or part of a proper noun"""
|
|
@@ -1317,63 +1432,43 @@ def process_html(self, html_content, progress_callback=None):
|
|
| 1317 |
|
| 1318 |
return text
|
| 1319 |
|
| 1320 |
-
def
|
| 1321 |
-
"""
|
| 1322 |
-
|
| 1323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1324 |
|
| 1325 |
-
#
|
| 1326 |
-
|
| 1327 |
-
# Skip script, style, and noscript content completely
|
| 1328 |
-
if element.parent.name in ['script', 'style', 'noscript']:
|
| 1329 |
-
continue
|
| 1330 |
-
|
| 1331 |
-
text = element.strip()
|
| 1332 |
-
if text and not self.should_skip_element(element, text):
|
| 1333 |
-
text_elements.append({
|
| 1334 |
-
'text': text,
|
| 1335 |
-
'element': element
|
| 1336 |
-
})
|
| 1337 |
|
| 1338 |
-
|
| 1339 |
-
|
| 1340 |
-
|
| 1341 |
-
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
-
|
| 1345 |
-
|
| 1346 |
-
|
| 1347 |
-
|
| 1348 |
-
|
| 1349 |
-
|
| 1350 |
-
|
| 1351 |
-
|
| 1352 |
-
|
| 1353 |
-
|
| 1354 |
-
|
| 1355 |
-
)
|
| 1356 |
-
|
| 1357 |
-
# Fix DOCTYPE
|
| 1358 |
-
html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
|
| 1359 |
-
|
| 1360 |
-
# Fix spacing issues (but not inside scripts)
|
| 1361 |
-
html_text = re.sub(r'>\s+<', '><', html_text)
|
| 1362 |
-
html_text = re.sub(r'\s+>', '>', html_text)
|
| 1363 |
-
html_text = re.sub(r'<\s+', '<', html_text)
|
| 1364 |
-
|
| 1365 |
-
# Fix common word errors that might occur during processing
|
| 1366 |
-
html_text = html_text.replace('down loaded', 'downloaded')
|
| 1367 |
-
html_text = html_text.replace('But your document', 'Your document')
|
| 1368 |
-
|
| 1369 |
-
# Restore script content
|
| 1370 |
-
for placeholder, script_content in script_placeholders.items():
|
| 1371 |
-
html_text = html_text.replace(
|
| 1372 |
-
f'<script>{placeholder}</script>',
|
| 1373 |
-
f'<script>{script_content}</script>'
|
| 1374 |
-
)
|
| 1375 |
-
|
| 1376 |
-
return html_text
|
| 1377 |
|
| 1378 |
def add_natural_flow_variations(self, text):
|
| 1379 |
"""Add more natural flow and rhythm variations for Originality AI"""
|
|
@@ -1437,127 +1532,6 @@ def process_html(self, html_content, progress_callback=None):
|
|
| 1437 |
|
| 1438 |
return ' '.join(enhanced_sentences)
|
| 1439 |
|
| 1440 |
-
def process_html(self, html_content, progress_callback=None):
    """Main processing function with progress callback.

    Script and style tags are swapped for placeholders, the remaining text
    elements are paraphrased, and the preserved tags are put back before the
    HTML is post-processed and validated.

    Args:
        html_content: HTML markup as a string.
        progress_callback: Optional callable invoked as
            ``progress_callback(done, total)`` after each processed element.

    Returns:
        str: The processed HTML. A prompt message is returned when the input
        is blank. If anything raises, the original HTML is returned, prefixed
        with an HTML comment holding the error and traceback.
    """
    if not html_content.strip():
        return "Please provide HTML content."

    # html_content is rebound to the placeholder version below; keep the
    # untouched input so the error path can return it verbatim.
    original_html = html_content

    # Store all script and style content to preserve it
    script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
    style_placeholder = "###STYLE_PLACEHOLDER_{}###"
    preserved_scripts = []
    preserved_styles = []

    # Temporarily replace script and style tags with placeholders
    soup_temp = BeautifulSoup(html_content, 'html.parser')

    # Preserve all script tags
    for idx, script in enumerate(soup_temp.find_all('script')):
        placeholder = script_placeholder.format(idx)
        preserved_scripts.append(str(script))
        script.replace_with(placeholder)

    # Preserve all style tags
    for idx, style in enumerate(soup_temp.find_all('style')):
        placeholder = style_placeholder.format(idx)
        preserved_styles.append(str(style))
        style.replace_with(placeholder)

    # Get the modified HTML
    html_content = str(soup_temp)

    try:
        # Extract text elements. Any extra return values are ignored.
        # NOTE(review): other versions of extract_text_from_html return a
        # third item (script placeholders) - confirm which one is live.
        soup, text_elements, *_ = self.extract_text_from_html(html_content)

        total_elements = len(text_elements)
        print(f"Found {total_elements} text elements to process (after filtering)")

        # Process each text element
        processed_count = 0

        for i, element_info in enumerate(text_elements):
            original_text = element_info['text']

            # Skip placeholders
            if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
                continue

            # Skip very short texts
            if len(original_text.split()) < 3:
                continue

            # First pass with Dipper
            paraphrased_text = self.paraphrase_with_dipper(
                original_text,
                lex_diversity=60,
                order_diversity=20
            )

            # Second pass with BART for longer texts (balanced probability)
            if self.use_bart and len(paraphrased_text.split()) > 8:
                # 30% chance to use BART for more variation (balanced)
                if random.random() < 0.3:
                    paraphrased_text = self.paraphrase_with_bart(paraphrased_text)

            # Apply sentence variation
            paraphrased_text = self.apply_sentence_variation(paraphrased_text)

            # Add natural flow variations
            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)

            # Fix punctuation and formatting
            paraphrased_text = self.fix_punctuation(paraphrased_text)

            # Final quality check
            if paraphrased_text and len(paraphrased_text.split()) >= 3:
                element_info['element'].replace_with(NavigableString(paraphrased_text))
                processed_count += 1

            # Progress update
            if progress_callback:
                progress_callback(i + 1, total_elements)

            if i % 10 == 0 or i == total_elements - 1:
                progress = (i + 1) / total_elements * 100
                print(f"Progress: {progress:.1f}%")

        # Get the processed HTML
        result = str(soup)

        # Restore all script tags
        for idx, script_content in enumerate(preserved_scripts):
            placeholder = script_placeholder.format(idx)
            result = result.replace(placeholder, script_content)

        # Restore all style tags
        for idx, style_content in enumerate(preserved_styles):
            placeholder = style_placeholder.format(idx)
            result = result.replace(placeholder, style_content)

        # Post-process the entire HTML to fix bold/strong formatting
        result = self.post_process_html(result)

        # Validate and fix HTML syntax
        result = self.validate_and_fix_html(result)

        # Count skipped elements properly
        all_text_elements = soup.find_all(string=True)
        skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements

        print(f"Successfully processed {processed_count} text elements")
        print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
        print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")

        return result

    except Exception as e:
        import traceback
        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        # Return the ORIGINAL HTML (scripts and styles intact) with the error
        # message prepended as an HTML comment.
        return f"<!-- {error_msg} -->\n{original_html}"
|
| 1560 |
-
|
| 1561 |
def post_process_html(self, html_text):
|
| 1562 |
"""Post-process the entire HTML to fix formatting issues"""
|
| 1563 |
# Fix empty angle brackets that might appear
|
|
|
|
| 663 |
return text
|
| 664 |
|
| 665 |
def should_skip_element(self, element, text):
    """Determine if an element should be skipped from paraphrasing.

    A text node is skipped when it is very short, sits inside non-content or
    structural markup (script/style, headings, bold text, tables, links,
    buttons), or when its nearest containers look like UI widgets, call-outs
    or testimonials judging by their class, id, style or event attributes.

    Args:
        element: The text node under consideration. Only ``element.parent``
            and ``element.parents`` are read.
        text: The node's text content.

    Returns:
        bool: True to leave the text untouched, False if it may be
        paraphrased.
    """
    skip_indicators = (
        'button', 'btn', 'heading', 'title', 'caption',
        'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
        'warning', 'info', 'success', 'error', 'code', 'pre',
        'stats-grid', 'testimonial-card',
        'cta-box', 'quiz-container', 'contact-form',
        'faq-question', 'sidebar', 'widget', 'banner',
        'author-intro', 'testimonial', 'review', 'feedback',
        'floating-', 'stat-', 'progress-', 'option', 'results',
        'question-container', 'quiz-',
        'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown',
    )
    # NOTE(review): patterns are matched as plain substrings, so 'of' also
    # hits short texts such as "often used" - presumably meant for counters
    # like "Question 1 of 5"; confirm before tightening.
    ui_patterns = (
        'click', 'download', 'learn more', 'read more', 'sign up',
        'get started', 'try now', 'buy now', 'next', 'previous',
        'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
        'check out:', 'see also:', 'related:', 'question', 'of',
    )

    def class_names(node):
        """Return the node's class attribute as a list of strings."""
        raw = node.get('class') or []
        # The class attribute may be a list of names or one plain string.
        return [raw] if isinstance(raw, str) else [str(name) for name in raw]

    if not text or len(text.strip()) < 3:
        return True

    # Skip anything nested at any depth inside script/style/noscript.
    # element.parents starts at the immediate parent, so one pass suffices.
    if any(node.name in ('script', 'style', 'noscript') for node in element.parents):
        return True

    parent = element.parent
    word_count = len(text.split())

    if parent:
        # Skip headings/title, bold text, and CTAs (links and buttons).
        if parent.name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title',
                           'strong', 'b', 'button', 'a'):
            return True

        # Skip table content: directly in a cell, or anywhere below a table.
        # This covers formatting tags nested inside tables as well.
        if parent.name in ('td', 'th') or any(p.name == 'table' for p in parent.parents):
            return True

        # Skip table of contents (detected from the parent's serialized markup).
        parent_text = str(parent).lower()
        if any(toc in parent_text for toc in ('table of contents', 'toc-', 'contents')):
            return True

        # Skip if parent has onclick or other event handlers.
        if parent.attrs:
            event_handlers = ('onclick', 'onchange', 'onsubmit', 'onload',
                              'onmouseover', 'onmouseout')
            if any(handler in parent.attrs for handler in event_handlers):
                return True

        # Testimonial cards: look at the parent and up to two levels above it.
        node = parent
        for _ in range(3):
            if not node:
                break
            if any('testimonial-card' in name for name in class_names(node)):
                return True
            node = node.parent

    # Skip if the immediate parent or grandparent (not all ancestors) carries
    # a skip-worthy class or id.
    for node in (parent, parent.parent if parent else None):
        if not node:
            continue

        class_str = ' '.join(name.lower() for name in class_names(node))
        if any(indicator in class_str for indicator in skip_indicators):
            return True

        elem_id = str(node.get('id', '')).lower()
        if any(indicator in elem_id for indicator in skip_indicators):
            return True

    # Skip short phrases that might be UI elements.
    if word_count <= 5:
        lowered = text.lower()
        if any(pattern in lowered for pattern in ui_patterns):
            return True

    # Skip short content in styled containers, unless inside a paragraph.
    if parent and parent.name in ('div', 'section', 'aside', 'blockquote'):
        style = parent.get('style', '')
        if ('border' in style or 'background' in style) and word_count <= 20:
            if not any(p.name == 'p' for p in parent.parents):
                return True

    return False
|
| 797 |
+
|
| 798 |
+
def extract_text_from_html(self, html_content):
    """Extract the text nodes of an HTML document that may be paraphrased.

    Every <script> tag has its content swapped for a unique placeholder so
    that the caller can restore the original tag after processing.

    Args:
        html_content: HTML markup as a string.

    Returns:
        tuple: ``(soup, text_elements, script_placeholders)`` where
        ``soup`` is the parsed document, ``text_elements`` is a list of
        ``{'text': stripped_text, 'element': text_node}`` dicts, and
        ``script_placeholders`` maps each placeholder to the original
        serialized <script> tag.
    """
    # bs4 is already a dependency of this file (BeautifulSoup is used below).
    from bs4.element import CData, Comment, Declaration, Doctype, ProcessingInstruction

    # find_all(string=True) also yields these special string nodes. They are
    # markup, not prose: replacing one with a plain string would turn a
    # comment or doctype into visible text.
    non_content_strings = (CData, Comment, Declaration, Doctype, ProcessingInstruction)

    soup = BeautifulSoup(html_content, 'html.parser')
    text_elements = []

    # CRITICAL: Preserve all script tags completely
    script_placeholders = {}
    for i, script in enumerate(soup.find_all('script')):
        placeholder = f"###SCRIPT_CONTENT_{i}###"
        # Serialize before overwriting the content, so the original survives.
        script_placeholders[placeholder] = str(script)
        script.string = placeholder

    # Get all text nodes
    for element in soup.find_all(string=True):
        if isinstance(element, non_content_strings):
            continue

        # Skip script, style, and noscript content completely
        if element.parent.name in ('script', 'style', 'noscript'):
            continue

        text = element.strip()
        if not text:
            continue

        # Skip if it's a script placeholder
        if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
            continue

        if not self.should_skip_element(element, text):
            text_elements.append({
                'text': text,
                'element': element
            })

    return soup, text_elements, script_placeholders
|
| 830 |
+
|
| 831 |
+
def process_html(self, html_content, progress_callback=None):
    """Main processing function with progress callback.

    Extracts the paraphrasable text elements, rewrites each one in place,
    restores the preserved <script> tags and runs the HTML clean-up passes.

    Args:
        html_content: HTML markup as a string.
        progress_callback: Optional callable invoked as
            ``progress_callback(done, total)`` after each processed element.

    Returns:
        str: The processed HTML. A prompt message is returned when the input
        is blank. If anything raises, the original HTML is returned, prefixed
        with an HTML comment holding the error and traceback.
    """
    if not html_content.strip():
        return "Please provide HTML content."

    try:
        # Extract text elements with script preservation
        soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)

        total_elements = len(text_elements)
        print(f"Found {total_elements} text elements to process (after filtering)")

        # Process each text element
        processed_count = 0

        for i, element_info in enumerate(text_elements):
            original_text = element_info['text']

            # Skip placeholders
            if "###SCRIPT_" in original_text:
                continue

            # Skip very short texts
            if len(original_text.split()) < 3:
                continue

            # Process the text with the paraphraser
            paraphrased_text = self.paraphrase_with_dipper(
                original_text,
                lex_diversity=60,
                order_diversity=20
            )

            # Apply the remaining text transformations
            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
            paraphrased_text = self.fix_punctuation(paraphrased_text)

            # Final quality check
            if paraphrased_text and len(paraphrased_text.split()) >= 3:
                element_info['element'].replace_with(NavigableString(paraphrased_text))
                processed_count += 1

            # Progress update
            if progress_callback:
                progress_callback(i + 1, total_elements)

        # Get the processed HTML
        result_html = str(soup)

        # CRITICAL: Restore all script content exactly as it was.
        # The opening tag may carry attributes (type, src, async, ...), so it
        # is matched with a pattern and not the literal "<script>". A callback
        # is used so backslashes in the script are inserted verbatim.
        def restore_script(match):
            return script_placeholders.get(match.group(1), match.group(0))

        result_html = re.sub(
            r'<script\b[^>]*>(###SCRIPT_CONTENT_\d+###)</script>',
            restore_script,
            result_html,
            flags=re.IGNORECASE,
        )

        # Post-process the entire HTML
        result_html = self.post_process_html(result_html)
        result_html = self.validate_and_fix_html(result_html)

        print(f"Successfully processed {processed_count} text elements")
        return result_html

    except Exception as e:
        import traceback
        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return f"<!-- {error_msg} -->\n{html_content}"
|
| 897 |
|
| 898 |
def is_likely_acronym_or_proper_noun(self, word):
|
| 899 |
"""Check if a word is likely an acronym or part of a proper noun"""
|
|
|
|
| 1432 |
|
| 1433 |
return text
|
| 1434 |
|
| 1435 |
+
def validate_and_fix_html(self, html_text):
    """Fix common HTML syntax errors after processing.

    Script bodies are set aside first so that the whitespace and wording
    fixes below cannot alter JavaScript, then put back unchanged.

    Args:
        html_text: HTML markup as a string.

    Returns:
        str: The cleaned-up HTML.
    """
    # First, protect script content. The opening tag may carry attributes,
    # so the whole element is matched by pattern; only the body between the
    # tags is swapped for a placeholder.
    script_bodies = []

    def stash_script(match):
        script_bodies.append(match.group(2))
        index = len(script_bodies) - 1
        return f"{match.group(1)}<!--SCRIPT_PLACEHOLDER_{index}-->{match.group(3)}"

    html_text = re.sub(
        r'(<script\b[^>]*>)(.*?)(</script\s*>)',
        stash_script,
        html_text,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Fix DOCTYPE
    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)

    # Fix spacing issues (but not inside scripts)
    html_text = re.sub(r'>\s+<', '><', html_text)
    html_text = re.sub(r'\s+>', '>', html_text)
    html_text = re.sub(r'<\s+', '<', html_text)

    # Fix common word errors that might occur during processing
    html_text = html_text.replace('down loaded', 'downloaded')
    html_text = html_text.replace('But your document', 'Your document')

    # Restore script content in a single pass. A callback is used so that
    # backslashes in the script are inserted verbatim, and a placeholder-like
    # string with an unknown index is left untouched.
    def restore_script(match):
        index = int(match.group(1))
        if index < len(script_bodies):
            return script_bodies[index]
        return match.group(0)

    html_text = re.sub(r'<!--SCRIPT_PLACEHOLDER_(\d+)-->', restore_script, html_text)

    return html_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1472 |
|
| 1473 |
def add_natural_flow_variations(self, text):
|
| 1474 |
"""Add more natural flow and rhythm variations for Originality AI"""
|
|
|
|
| 1532 |
|
| 1533 |
return ' '.join(enhanced_sentences)
|
| 1534 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1535 |
def post_process_html(self, html_text):
|
| 1536 |
"""Post-process the entire HTML to fix formatting issues"""
|
| 1537 |
# Fix empty angle brackets that might appear
|