derek-thomas
committed on
Commit
·
fb9efd9
1
Parent(s):
6621d73
Filtering nsfw content for nomic
Browse files
- src/build_nomic.py +4 -0
src/build_nomic.py
CHANGED
|
@@ -87,6 +87,10 @@ def preprocess_markdown(text):
|
|
| 87 |
def build_nomic(dataset):
|
| 88 |
df = dataset['train'].to_pandas()
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
| 91 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
| 92 |
|
|
|
|
| 87 |
def build_nomic(dataset):
|
| 88 |
df = dataset['train'].to_pandas()
|
| 89 |
|
| 90 |
+
# Drop rows whose content, title, flair or permalink mentions 'nsfw' (case-insensitive) so they are not displayed in Nomic
|
| 91 |
+
df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
|
| 92 |
+
lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1)]
|
| 93 |
+
|
| 94 |
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
| 95 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
| 96 |
|