derek-thomas
committed on
Commit
·
fdc091a
1
Parent(s):
8ba4837
Adding html and topic modeling on subreddit
Browse files- requirements.txt +2 -1
- src/build_nomic.py +29 -5
requirements.txt
CHANGED
|
@@ -7,4 +7,5 @@ tqdm==4.66.1
|
|
| 7 |
beautifulsoup4==4.12.2
|
| 8 |
lxml==4.9.3
|
| 9 |
rich==13.3.4
|
| 10 |
-
nomic==3.0.15
|
|
|
|
|
|
| 7 |
beautifulsoup4==4.12.2
|
| 8 |
lxml==4.9.3
|
| 9 |
rich==13.3.4
|
| 10 |
+
nomic==3.0.15
|
| 11 |
+
markdown==3.6
|
src/build_nomic.py
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
|
| 2 |
import os
|
| 3 |
-
import
|
| 4 |
import time
|
| 5 |
|
|
|
|
| 6 |
import nomic
|
| 7 |
-
from nomic import atlas
|
| 8 |
-
from nomic.dataset import AtlasClass
|
| 9 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from src.my_logger import setup_logger
|
| 12 |
|
|
@@ -20,6 +23,11 @@ def count_words(text):
|
|
| 20 |
return len(words)
|
| 21 |
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def delete_old_nomic():
|
| 24 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
| 25 |
try:
|
|
@@ -32,11 +40,12 @@ def delete_old_nomic():
|
|
| 32 |
except:
|
| 33 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
| 34 |
|
|
|
|
| 35 |
def build_nomic(dataset):
|
| 36 |
df = dataset['train'].to_pandas()
|
| 37 |
|
| 38 |
-
non_embedding_columns = ['date_utc', 'title', 'flair', '
|
| 39 |
-
'score', 'score_percentile']
|
| 40 |
|
| 41 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
| 42 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
|
@@ -53,6 +62,20 @@ def build_nomic(dataset):
|
|
| 53 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
| 54 |
|
| 55 |
df['word_count'] = df['content'].apply(count_words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
delete_old_nomic()
|
| 58 |
|
|
@@ -62,5 +85,6 @@ def build_nomic(dataset):
|
|
| 62 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
| 63 |
id_field='id',
|
| 64 |
identifier='BORU Subreddit Neural Search',
|
|
|
|
| 65 |
)
|
| 66 |
logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
|
|
|
|
| 1 |
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
|
| 2 |
import os
|
| 3 |
+
import re
|
| 4 |
import time
|
| 5 |
|
| 6 |
+
import markdown
|
| 7 |
import nomic
|
|
|
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from nomic import atlas, Nomic
|
| 11 |
+
from nomic.dataset import AtlasClass
|
| 12 |
+
from nomic.data_inference import NomicTopicOptions
|
| 13 |
|
| 14 |
from src.my_logger import setup_logger
|
| 15 |
|
|
|
|
| 23 |
return len(words)
|
| 24 |
|
| 25 |
|
| 26 |
+
def convert_markdown_to_html(markdown_text):
    """Render a Markdown string to its HTML equivalent.

    Thin wrapper around ``markdown.markdown`` so the conversion can be
    applied per-row (e.g. via ``Series.apply``) when building the dataset.
    """
    return markdown.markdown(markdown_text)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
def delete_old_nomic():
|
| 32 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
| 33 |
try:
|
|
|
|
| 40 |
except:
|
| 41 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
| 42 |
|
| 43 |
+
|
| 44 |
def build_nomic(dataset):
|
| 45 |
df = dataset['train'].to_pandas()
|
| 46 |
|
| 47 |
+
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
|
| 48 |
+
'score', 'score_percentile', 'html_content', 'subreddit']
|
| 49 |
|
| 50 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
| 51 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
|
|
|
| 62 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
| 63 |
|
| 64 |
df['word_count'] = df['content'].apply(count_words)
|
| 65 |
+
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
| 66 |
+
|
| 67 |
+
# Regex to extract subreddit
|
| 68 |
+
subreddit_re = re.compile(r'r/(\w+)')
|
| 69 |
+
def extract_subreddit(text):
    """Return the first subreddit name found in *text*, or '' if none.

    Relies on the enclosing scope's pre-compiled ``subreddit_re``
    (pattern ``r/(\w+)``) and yields only the captured name, without
    the ``r/`` prefix.
    """
    found = subreddit_re.search(text)
    return found.group(1) if found else ''
|
| 74 |
+
|
| 75 |
+
# Apply the function
|
| 76 |
+
df['subreddit'] = df['content'].apply(extract_subreddit)
|
| 77 |
+
|
| 78 |
+
topic_options = NomicTopicOptions(build_topic_model=True, community_description_target_field='subreddit')
|
| 79 |
|
| 80 |
delete_old_nomic()
|
| 81 |
|
|
|
|
| 85 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
| 86 |
id_field='id',
|
| 87 |
identifier='BORU Subreddit Neural Search',
|
| 88 |
+
topic_model=topic_options
|
| 89 |
)
|
| 90 |
logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
|