derek-thomas
committed on
Commit
·
fdc091a
1
Parent(s):
8ba4837
Adding html and topic modeling on subreddit
Browse files- requirements.txt +2 -1
- src/build_nomic.py +29 -5
requirements.txt
CHANGED
|
@@ -7,4 +7,5 @@ tqdm==4.66.1
|
|
| 7 |
beautifulsoup4==4.12.2
|
| 8 |
lxml==4.9.3
|
| 9 |
rich==13.3.4
|
| 10 |
-
nomic==3.0.15
|
|
|
|
|
|
| 7 |
beautifulsoup4==4.12.2
|
| 8 |
lxml==4.9.3
|
| 9 |
rich==13.3.4
|
| 10 |
+
nomic==3.0.15
|
| 11 |
+
markdown==3.6
|
src/build_nomic.py
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
|
| 2 |
import os
|
| 3 |
-
import
|
| 4 |
import time
|
| 5 |
|
|
|
|
| 6 |
import nomic
|
| 7 |
-
from nomic import atlas
|
| 8 |
-
from nomic.dataset import AtlasClass
|
| 9 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from src.my_logger import setup_logger
|
| 12 |
|
|
@@ -20,6 +23,11 @@ def count_words(text):
|
|
| 20 |
return len(words)
|
| 21 |
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def delete_old_nomic():
|
| 24 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
| 25 |
try:
|
|
@@ -32,11 +40,12 @@ def delete_old_nomic():
|
|
| 32 |
except:
|
| 33 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
| 34 |
|
|
|
|
| 35 |
def build_nomic(dataset):
|
| 36 |
df = dataset['train'].to_pandas()
|
| 37 |
|
| 38 |
-
non_embedding_columns = ['date_utc', 'title', 'flair', '
|
| 39 |
-
'score', 'score_percentile']
|
| 40 |
|
| 41 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
| 42 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
|
@@ -53,6 +62,20 @@ def build_nomic(dataset):
|
|
| 53 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
| 54 |
|
| 55 |
df['word_count'] = df['content'].apply(count_words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
delete_old_nomic()
|
| 58 |
|
|
@@ -62,5 +85,6 @@ def build_nomic(dataset):
|
|
| 62 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
| 63 |
id_field='id',
|
| 64 |
identifier='BORU Subreddit Neural Search',
|
|
|
|
| 65 |
)
|
| 66 |
logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
|
|
|
|
| 1 |
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
|
| 2 |
import os
|
| 3 |
+
import re
|
| 4 |
import time
|
| 5 |
|
| 6 |
+
import markdown
|
| 7 |
import nomic
|
|
|
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from nomic import atlas, Nomic
|
| 11 |
+
from nomic.dataset import AtlasClass
|
| 12 |
+
from nomic.data_inference import NomicTopicOptions
|
| 13 |
|
| 14 |
from src.my_logger import setup_logger
|
| 15 |
|
|
|
|
| 23 |
return len(words)
|
| 24 |
|
| 25 |
|
| 26 |
+
def convert_markdown_to_html(markdown_text):
    """Render a Markdown string to its HTML equivalent.

    Thin wrapper around ``markdown.markdown`` so the conversion can be
    applied per-row (e.g. via ``Series.apply``) when building the dataset.
    """
    return markdown.markdown(markdown_text)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
def delete_old_nomic():
|
| 32 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
| 33 |
try:
|
|
|
|
| 40 |
except:
|
| 41 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
| 42 |
|
| 43 |
+
|
| 44 |
def build_nomic(dataset):
|
| 45 |
df = dataset['train'].to_pandas()
|
| 46 |
|
| 47 |
+
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
|
| 48 |
+
'score', 'score_percentile', 'html_content', 'subreddit']
|
| 49 |
|
| 50 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
| 51 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
|
|
|
| 62 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
| 63 |
|
| 64 |
df['word_count'] = df['content'].apply(count_words)
|
| 65 |
+
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
| 66 |
+
|
| 67 |
+
# Regex to extract subreddit
|
| 68 |
+
subreddit_re = re.compile(r'r/(\w+)')
|
| 69 |
+
def extract_subreddit(text):
    """Return the first subreddit name found in *text*, or '' if none.

    Relies on the enclosing scope's pre-compiled ``subreddit_re``
    (pattern ``r/(\w+)``) and yields only the captured name, without
    the ``r/`` prefix.
    """
    found = subreddit_re.search(text)
    return found.group(1) if found else ''
|
| 74 |
+
|
| 75 |
+
# Apply the function
|
| 76 |
+
df['subreddit'] = df['content'].apply(extract_subreddit)
|
| 77 |
+
|
| 78 |
+
topic_options = NomicTopicOptions(build_topic_model=True, community_description_target_field='subreddit')
|
| 79 |
|
| 80 |
delete_old_nomic()
|
| 81 |
|
|
|
|
| 85 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
| 86 |
id_field='id',
|
| 87 |
identifier='BORU Subreddit Neural Search',
|
| 88 |
+
topic_model=topic_options
|
| 89 |
)
|
| 90 |
logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
|