Spaces:

reddit-tools-HF
/

processing-bestofredditorupdates

Running

App Files Files Community

derek-thomas committed on Apr 19, 2024

Commit

7d5b5ca

1 Parent(s): 5d0ccb5

Trying spoiler tag handling

Browse files

Files changed (1) hide show

src/build_nomic.py +41 -10

src/build_nomic.py CHANGED Viewed

@@ -17,17 +17,42 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
 nomic.login(NOMIC_KEY)
 logger = setup_logger(__name__)
 def count_words(text):
     words = text.split()
     return len(words)
-def convert_markdown_to_html(markdown_text):
-    html = markdown.markdown(markdown_text)
     return html
 def delete_old_nomic():
     logger.info(f"Trying to delete old version of nomic Atlas...")
     try:
@@ -41,6 +66,20 @@ def delete_old_nomic():
         logger.info(f"Failed to delete old version of nomic Atlas.")
 def build_nomic(dataset):
     df = dataset['train'].to_pandas()
@@ -65,14 +104,6 @@ def build_nomic(dataset):
     df['url'] = 'https://www.reddit.com' + df['permalink']
     df['html_content'] = df['content'].apply(convert_markdown_to_html)
-    # Regex to extract subreddit
-    subreddit_re = re.compile(r'r/(\w+)')
-    def extract_subreddit(text):
-        match = subreddit_re.search(text)
-        if match:
-            return 'r/' + match.group(1)
-        return ''
     # Apply the function
     df['subreddit'] = df['content'].apply(extract_subreddit)

 nomic.login(NOMIC_KEY)
 logger = setup_logger(__name__)
+# Regex to extract subreddit
+subreddit_re = re.compile(r'r/(\w+)')
 def count_words(text):
     """Return the number of whitespace-separated tokens in *text*."""
     return len(text.split())
def preprocess_markdown(text):
    """Replace Reddit spoiler markup (``>!spoiler!<``) with a styled HTML span.

    Each spoiler is rendered as black text on a black background; inline
    ``onmouseover``/``onmouseout`` handlers switch the text colour so the
    content becomes readable while the pointer is over it.

    Args:
        text: Markdown source that may contain ``>!...!<`` spoiler spans.

    Returns:
        The text with every spoiler span replaced by a ``<span>`` element;
        text without spoilers is returned unchanged.

    NOTE(review): a second definition of ``preprocess_markdown`` appears
    later in this module and rebinds the name; keep the two in sync or
    remove one of them.
    """
    # Inline CSS that hides the spoiler: black text on a black background.
    spoiler_style = 'background-color: black; color: black;'
    # ``style.color`` takes a bare colour value, not a full CSS declaration.
    # White is used because the background stays black while hovering.
    hover_color = 'white'
    opening_tag = (
        '<span class="spoiler" style="' + spoiler_style + '"'
        ' onmouseover="this.style.color=\'' + hover_color + '\'"'
        ' onmouseout="this.style.color=\'black\'">'
    )
    # A callable replacement inserts the captured text verbatim, so neither
    # the template nor the spoiler content is parsed for backslash escapes.
    return re.sub(
        r'>!(.*?)!<',
        lambda match: opening_tag + match.group(1) + '</span>',
        text,
    )
def convert_markdown_to_html(text):
    """Convert markdown *text* to HTML.

    The text is first passed through ``preprocess_markdown`` and the result
    is then rendered with ``markdown.markdown``.
    """
    return markdown.markdown(preprocess_markdown(text))
def extract_subreddit(text):
    """Return the first ``r/<name>`` mention found in *text*.

    The module-level ``subreddit_re`` pattern is searched against the text;
    an empty string is returned when nothing matches.
    """
    found = subreddit_re.search(text)
    return ('r/' + found.group(1)) if found else ''
 def delete_old_nomic():
     logger.info(f"Trying to delete old version of nomic Atlas...")
     try:
         logger.info(f"Failed to delete old version of nomic Atlas.")
def preprocess_markdown(text):
    """Replace Reddit spoiler markup (``>!spoiler!<``) with a styled HTML span.

    Each spoiler is rendered as black text on a black background; inline
    ``onmouseover``/``onmouseout`` handlers switch the text colour so the
    content becomes readable while the pointer is over it.

    Args:
        text: Markdown source that may contain ``>!...!<`` spoiler spans.

    Returns:
        The text with every spoiler span replaced by a ``<span>`` element;
        text without spoilers is returned unchanged.

    NOTE(review): this module also defines ``preprocess_markdown`` earlier
    on; this later definition is the one left bound to the name. Consider
    removing one of the two copies.
    """
    # Inline CSS that hides the spoiler: black text on a black background.
    spoiler_style = 'background-color: black; color: black;'
    # ``style.color`` takes a bare colour value, not a full CSS declaration.
    # White is used because the background stays black while hovering.
    hover_color = 'white'
    opening_tag = (
        '<span class="spoiler" style="' + spoiler_style + '"'
        ' onmouseover="this.style.color=\'' + hover_color + '\'"'
        ' onmouseout="this.style.color=\'black\'">'
    )
    # A callable replacement inserts the captured text verbatim, so neither
    # the template nor the spoiler content is parsed for backslash escapes.
    return re.sub(
        r'>!(.*?)!<',
        lambda match: opening_tag + match.group(1) + '</span>',
        text,
    )
 def build_nomic(dataset):
     df = dataset['train'].to_pandas()
     df['url'] = 'https://www.reddit.com' + df['permalink']
     df['html_content'] = df['content'].apply(convert_markdown_to_html)
     # Apply the function
     df['subreddit'] = df['content'].apply(extract_subreddit)