Spaces:

EdysorEdutech
/

human_final

Paused

App Files Community

EdysorEdutech committed on Aug 4, 2025

Commit

dcfc371

verified ·

1 Parent(s): 942fc21

Update app.py

Browse files

Files changed (1) hide show

app.py +253 -279

app.py CHANGED Viewed

@@ -663,122 +663,237 @@ class EnhancedDipperHumanizer:
         return text
     def should_skip_element(self, element, text):
-    """Determine if an element should be skipped from paraphrasing"""
-    if not text or len(text.strip()) < 3:
-        return True
-    # Skip JavaScript code inside script tags - CRITICAL FIX
-    parent = element.parent
-    if parent and parent.name in ['script', 'style', 'noscript']:
-        return True
-    # Also check if we're inside a script tag at any level
-    for ancestor in element.parents:
-        if ancestor.name in ['script', 'style', 'noscript']:
             return True
-    # Rest of your existing skip logic...
-    return False
-def extract_text_from_html(self, html_content):
-    """Extract text elements from HTML with skip logic"""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    text_elements = []
-    # CRITICAL: Preserve all script tags completely
-    script_tags = soup.find_all('script')
-    script_placeholders = {}
-    for i, script in enumerate(script_tags):
-        placeholder = f"###SCRIPT_CONTENT_{i}###"
-        script_placeholders[placeholder] = str(script)
-        script.string = placeholder
-    # Get all text nodes
-    for element in soup.find_all(string=True):
-        # Skip script, style, and noscript content completely
-        if element.parent.name in ['script', 'style', 'noscript']:
-            continue
-        # Skip if it's a script placeholder
-        text = element.strip()
-        if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
-            continue
-        if text and not self.should_skip_element(element, text):
-            text_elements.append({
-                'text': text,
-                'element': element
-            })
-    return soup, text_elements, script_placeholders
-def process_html(self, html_content, progress_callback=None):
-    """Main processing function with progress callback"""
-    if not html_content.strip():
-        return "Please provide HTML content."
-    try:
-        # Extract text elements with script preservation
-        soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
-        total_elements = len(text_elements)
-        print(f"Found {total_elements} text elements to process (after filtering)")
-        # Process each text element
-        processed_count = 0
-        for i, element_info in enumerate(text_elements):
-            original_text = element_info['text']
-            # Skip placeholders
-            if "###SCRIPT_" in original_text:
-                continue
-            # Skip very short texts
-            if len(original_text.split()) < 3:
-                continue
-            # Process the text with your existing logic
-            paraphrased_text = self.paraphrase_with_dipper(
-                original_text,
-                lex_diversity=60,
-                order_diversity=20
-            )
-            # Apply other transformations...
-            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
-            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
-            paraphrased_text = self.fix_punctuation(paraphrased_text)
-            # Final quality check
-            if paraphrased_text and len(paraphrased_text.split()) >= 3:
-                element_info['element'].replace_with(NavigableString(paraphrased_text))
-                processed_count += 1
-            # Progress update
-            if progress_callback:
-                progress_callback(i + 1, total_elements)
-        # Get the processed HTML
-        result_html = str(soup)
-        # CRITICAL: Restore all script content exactly as it was
-        for placeholder, original_script in script_placeholders.items():
-            result_html = result_html.replace(f"<script>{placeholder}</script>", original_script)
-        # Post-process the entire HTML
-        result_html = self.post_process_html(result_html)
-        result_html = self.validate_and_fix_html(result_html)
-        print(f"Successfully processed {processed_count} text elements")
-        return result_html
-    except Exception as e:
-        import traceback
-        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
-        print(error_msg)
-        return f"<!-- {error_msg} -->\n{html_content}"
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""
@@ -1317,63 +1432,43 @@ def process_html(self, html_content, progress_callback=None):
         return text
-    def extract_text_from_html(self, html_content):
-        """Extract text elements from HTML with skip logic"""
-        soup = BeautifulSoup(html_content, 'html.parser')
-        text_elements = []
-        # Get all text nodes using string instead of text (fixing deprecation)
-        for element in soup.find_all(string=True):
-            # Skip script, style, and noscript content completely
-            if element.parent.name in ['script', 'style', 'noscript']:
-                continue
-            text = element.strip()
-            if text and not self.should_skip_element(element, text):
-                text_elements.append({
-                    'text': text,
-                    'element': element
-                })
-        return soup, text_elements
-    def validate_and_fix_html(self, html_text):
-    """Fix common HTML syntax errors after processing"""
-    # First, protect script content
-    script_pattern = r'<script[^>]*>(.*?)</script>'
-    scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
-    script_placeholders = {}
-    for i, script_content in enumerate(scripts):
-        placeholder = f"<!--SCRIPT_PLACEHOLDER_{i}-->"
-        script_placeholders[placeholder] = script_content
-        html_text = html_text.replace(
-            f'<script>{script_content}</script>',
-            f'<script>{placeholder}</script>',
-            1
-        )
-    # Fix DOCTYPE
-    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
-    # Fix spacing issues (but not inside scripts)
-    html_text = re.sub(r'>\s+<', '><', html_text)
-    html_text = re.sub(r'\s+>', '>', html_text)
-    html_text = re.sub(r'<\s+', '<', html_text)
-    # Fix common word errors that might occur during processing
-    html_text = html_text.replace('down loaded', 'downloaded')
-    html_text = html_text.replace('But your document', 'Your document')
-    # Restore script content
-    for placeholder, script_content in script_placeholders.items():
-        html_text = html_text.replace(
-            f'<script>{placeholder}</script>',
-            f'<script>{script_content}</script>'
-        )
-    return html_text
     def add_natural_flow_variations(self, text):
         """Add more natural flow and rhythm variations for Originality AI"""
@@ -1437,127 +1532,6 @@ def process_html(self, html_content, progress_callback=None):
         return ' '.join(enhanced_sentences)
-    def process_html(self, html_content, progress_callback=None):
-        """Main processing function with progress callback"""
-        if not html_content.strip():
-            return "Please provide HTML content."
-        # Store all script and style content to preserve it
-        script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
-        style_placeholder = "###STYLE_PLACEHOLDER_{}###"
-        preserved_scripts = []
-        preserved_styles = []
-        # Temporarily replace script and style tags with placeholders
-        soup_temp = BeautifulSoup(html_content, 'html.parser')
-        # Preserve all script tags
-        for idx, script in enumerate(soup_temp.find_all('script')):
-            placeholder = script_placeholder.format(idx)
-            preserved_scripts.append(str(script))
-            script.replace_with(placeholder)
-        # Preserve all style tags
-        for idx, style in enumerate(soup_temp.find_all('style')):
-            placeholder = style_placeholder.format(idx)
-            preserved_styles.append(str(style))
-            style.replace_with(placeholder)
-        # Get the modified HTML
-        html_content = str(soup_temp)
-        try:
-            # Extract text elements
-            soup, text_elements = self.extract_text_from_html(html_content)
-            total_elements = len(text_elements)
-            print(f"Found {total_elements} text elements to process (after filtering)")
-            # Process each text element
-            processed_count = 0
-            for i, element_info in enumerate(text_elements):
-                original_text = element_info['text']
-                # Skip placeholders
-                if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
-                    continue
-                # Skip very short texts
-                if len(original_text.split()) < 3:
-                    continue
-                # First pass with Dipper
-                paraphrased_text = self.paraphrase_with_dipper(
-                    original_text,
-                    lex_diversity=60,
-                    order_diversity=20
-                )
-                # Second pass with BART for longer texts (balanced probability)
-                if self.use_bart and len(paraphrased_text.split()) > 8:
-                    # 30% chance to use BART for more variation (balanced)
-                    if random.random() < 0.3:
-                        paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
-                # Apply sentence variation
-                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
-                # Add natural flow variations
-                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
-                # Fix punctuation and formatting
-                paraphrased_text = self.fix_punctuation(paraphrased_text)
-                # Final quality check
-                if paraphrased_text and len(paraphrased_text.split()) >= 3:
-                    element_info['element'].replace_with(NavigableString(paraphrased_text))
-                    processed_count += 1
-                # Progress update
-                if progress_callback:
-                    progress_callback(i + 1, total_elements)
-                if i % 10 == 0 or i == total_elements - 1:
-                    progress = (i + 1) / total_elements * 100
-                    print(f"Progress: {progress:.1f}%")
-            # Get the processed HTML
-            result = str(soup)
-            # Restore all script tags
-            for idx, script_content in enumerate(preserved_scripts):
-                placeholder = script_placeholder.format(idx)
-                result = result.replace(placeholder, script_content)
-            # Restore all style tags
-            for idx, style_content in enumerate(preserved_styles):
-                placeholder = style_placeholder.format(idx)
-                result = result.replace(placeholder, style_content)
-            # Post-process the entire HTML to fix bold/strong formatting
-            result = self.post_process_html(result)
-            # Validate and fix HTML syntax
-            result = self.validate_and_fix_html(result)
-            # Count skipped elements properly
-            all_text_elements = soup.find_all(string=True)
-            skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
-            print(f"Successfully processed {processed_count} text elements")
-            print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
-            print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")
-            return result
-        except Exception as e:
-            import traceback
-            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
-            print(error_msg)
-            # Return original HTML with error message prepended as HTML comment
-            return f"<!-- {error_msg} -->\n{html_content}"
     def post_process_html(self, html_text):
         """Post-process the entire HTML to fix formatting issues"""
         # Fix empty angle brackets that might appear

         return text
     def should_skip_element(self, element, text):
+        """Determine if an element should be skipped from paraphrasing"""
+        if not text or len(text.strip()) < 3:
             return True
+        # Skip JavaScript code inside script tags - CRITICAL FIX
+        parent = element.parent
+        if parent and parent.name in ['script', 'style', 'noscript']:
+            return True
+        # Also check if we're inside a script tag at any level
+        for ancestor in element.parents:
+            if ancestor.name in ['script', 'style', 'noscript']:
+                return True
+        # Skip headings (h1-h6)
+        if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
+            return True
+        # Skip content inside <strong> and <b> tags
+        if parent and parent.name in ['strong', 'b']:
+            return True
+        # Skip table content
+        if parent and (parent.name in ['td', 'th'] or any(p.name == 'table' for p in parent.parents)):
+            return True
+        # Special handling for content inside tables
+        # Skip if it's inside strong/b/h1-h6 tags AND also inside a table
+        if parent:
+            # Check if we're inside a table
+            is_in_table = any(p.name == 'table' for p in parent.parents)
+            if is_in_table:
+                # If we're in a table, skip any text that's inside formatting tags
+                if parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'i']:
+                    return True
+                # Also check if parent's parent is a formatting tag
+                if parent.parent and parent.parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                    return True
+        # Skip table of contents
+        if parent:
+            parent_text = str(parent).lower()
+            if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
+                return True
+        # Skip CTAs and buttons
+        if parent and parent.name in ['button', 'a']:
+            return True
+        # Skip if parent has onclick or other event handlers
+        if parent and parent.attrs:
+            event_handlers = ['onclick', 'onchange', 'onsubmit', 'onload', 'onmouseover', 'onmouseout']
+            if any(handler in parent.attrs for handler in event_handlers):
+                return True
+        # Special check for testimonial cards - check up to 3 levels of ancestors
+        if parent:
+            ancestors_to_check = []
+            current = parent
+            for _ in range(3):  # Check up to 3 levels up
+                if current:
+                    ancestors_to_check.append(current)
+                    current = current.parent
+            # Check if any ancestor has testimonial-card class
+            for ancestor in ancestors_to_check:
+                if ancestor and ancestor.get('class'):
+                    classes = ancestor.get('class', [])
+                    if isinstance(classes, list):
+                        if any('testimonial-card' in str(cls) for cls in classes):
+                            return True
+                    elif isinstance(classes, str) and 'testimonial-card' in classes:
+                        return True
+        # Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
+        skip_indicators = [
+            'button', 'btn', 'heading', 'title', 'caption',
+            'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
+            'warning', 'info', 'success', 'error', 'code', 'pre',
+            'stats-grid', 'testimonial-card',
+            'cta-box', 'quiz-container', 'contact-form',
+            'faq-question', 'sidebar', 'widget', 'banner',
+            'author-intro', 'testimonial', 'review', 'feedback',
+            'floating-', 'stat-', 'progress-', 'option', 'results',
+            'question-container', 'quiz-',
+            'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
+        ]
+        # Check only immediate parent and grandparent (not all ancestors)
+        elements_to_check = [parent]
+        if parent and parent.parent:
+            elements_to_check.append(parent.parent)
+        for elem in elements_to_check:
+            if not elem:
+                continue
+            # Check element's class
+            elem_class = elem.get('class', [])
+            if isinstance(elem_class, list):
+                class_str = ' '.join(str(cls).lower() for cls in elem_class)
+                if any(indicator in class_str for indicator in skip_indicators):
+                    return True
+            # Check element's ID
+            elem_id = elem.get('id', '')
+            if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
+                return True
+        # Skip short phrases that might be UI elements
+        word_count = len(text.split())
+        if word_count <= 5:
+            ui_patterns = [
+                'click', 'download', 'learn more', 'read more', 'sign up',
+                'get started', 'try now', 'buy now', 'next', 'previous',
+                'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
+                'check out:', 'see also:', 'related:', 'question', 'of'
+            ]
+            if any(pattern in text.lower() for pattern in ui_patterns):
+                return True
+        # Skip very short content in styled containers
+        if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
+            style = parent.get('style', '')
+            if 'border' in style or 'background' in style:
+                if word_count <= 20:
+                    # But don't skip if it's inside a paragraph
+                    if not any(p.name == 'p' for p in parent.parents):
+                        return True
+        return False
+    def extract_text_from_html(self, html_content):
+        """Extract text elements from HTML with skip logic"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        text_elements = []
+        # CRITICAL: Preserve all script tags completely
+        script_tags = soup.find_all('script')
+        script_placeholders = {}
+        for i, script in enumerate(script_tags):
+            placeholder = f"###SCRIPT_CONTENT_{i}###"
+            script_placeholders[placeholder] = str(script)
+            script.string = placeholder
+        # Get all text nodes
+        for element in soup.find_all(string=True):
+            # Skip script, style, and noscript content completely
+            if element.parent.name in ['script', 'style', 'noscript']:
+                continue
+            # Skip if it's a script placeholder
+            text = element.strip()
+            if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
+                continue
+            if text and not self.should_skip_element(element, text):
+                text_elements.append({
+                    'text': text,
+                    'element': element
+                })
+        return soup, text_elements, script_placeholders
+    def process_html(self, html_content, progress_callback=None):
+        """Main processing function with progress callback"""
+        if not html_content.strip():
+            return "Please provide HTML content."
+        try:
+            # Extract text elements with script preservation
+            soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
+            total_elements = len(text_elements)
+            print(f"Found {total_elements} text elements to process (after filtering)")
+            # Process each text element
+            processed_count = 0
+            for i, element_info in enumerate(text_elements):
+                original_text = element_info['text']
+                # Skip placeholders
+                if "###SCRIPT_" in original_text:
+                    continue
+                # Skip very short texts
+                if len(original_text.split()) < 3:
+                    continue
+                # Process the text with your existing logic
+                paraphrased_text = self.paraphrase_with_dipper(
+                    original_text,
+                    lex_diversity=60,
+                    order_diversity=20
+                )
+                # Apply other transformations...
+                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+                paraphrased_text = self.fix_punctuation(paraphrased_text)
+                # Final quality check
+                if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                    element_info['element'].replace_with(NavigableString(paraphrased_text))
+                    processed_count += 1
+                # Progress update
+                if progress_callback:
+                    progress_callback(i + 1, total_elements)
+            # Get the processed HTML
+            result_html = str(soup)
+            # CRITICAL: Restore all script content exactly as it was
+            for placeholder, original_script in script_placeholders.items():
+                result_html = result_html.replace(f"<script>{placeholder}</script>", original_script)
+            # Post-process the entire HTML
+            result_html = self.post_process_html(result_html)
+            result_html = self.validate_and_fix_html(result_html)
+            print(f"Successfully processed {processed_count} text elements")
+            return result_html
+        except Exception as e:
+            import traceback
+            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            return f"<!-- {error_msg} -->\n{html_content}"
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""
         return text
+    def validate_and_fix_html(self, html_text):
+        """Fix common HTML syntax errors after processing"""
+        # First, protect script content
+        script_pattern = r'<script[^>]*>(.*?)</script>'
+        scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
+        script_placeholders = {}
+        for i, script_content in enumerate(scripts):
+            placeholder = f"<!--SCRIPT_PLACEHOLDER_{i}-->"
+            script_placeholders[placeholder] = script_content
+            html_text = html_text.replace(
+                f'<script>{script_content}</script>',
+                f'<script>{placeholder}</script>',
+                1
+            )
+        # Fix DOCTYPE
+        html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+        # Fix spacing issues (but not inside scripts)
+        html_text = re.sub(r'>\s+<', '><', html_text)
+        html_text = re.sub(r'\s+>', '>', html_text)
+        html_text = re.sub(r'<\s+', '<', html_text)
+        # Fix common word errors that might occur during processing
+        html_text = html_text.replace('down loaded', 'downloaded')
+        html_text = html_text.replace('But your document', 'Your document')
+        # Restore script content
+        for placeholder, script_content in script_placeholders.items():
+            html_text = html_text.replace(
+                f'<script>{placeholder}</script>',
+                f'<script>{script_content}</script>'
+            )
+        return html_text
     def add_natural_flow_variations(self, text):
         """Add more natural flow and rhythm variations for Originality AI"""
         return ' '.join(enhanced_sentences)
     def post_process_html(self, html_text):
         """Post-process the entire HTML to fix formatting issues"""
         # Fix empty angle brackets that might appear