derek-thomas
committed on
Commit
·
7d5b5ca
1
Parent(s):
5d0ccb5
Trying spoiler tag handling
Browse files- src/build_nomic.py +41 -10
src/build_nomic.py
CHANGED
|
@@ -17,17 +17,42 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
|
|
| 17 |
nomic.login(NOMIC_KEY)
|
| 18 |
logger = setup_logger(__name__)
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def count_words(text):
|
| 22 |
words = text.split()
|
| 23 |
return len(words)
|
| 24 |
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
return html
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def delete_old_nomic():
|
| 32 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
| 33 |
try:
|
|
@@ -41,6 +66,20 @@ def delete_old_nomic():
|
|
| 41 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
| 42 |
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def build_nomic(dataset):
|
| 45 |
df = dataset['train'].to_pandas()
|
| 46 |
|
|
@@ -65,14 +104,6 @@ def build_nomic(dataset):
|
|
| 65 |
df['url'] = 'https://www.reddit.com' + df['permalink']
|
| 66 |
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
| 67 |
|
| 68 |
-
# Regex to extract subreddit
|
| 69 |
-
subreddit_re = re.compile(r'r/(\w+)')
|
| 70 |
-
def extract_subreddit(text):
|
| 71 |
-
match = subreddit_re.search(text)
|
| 72 |
-
if match:
|
| 73 |
-
return 'r/' + match.group(1)
|
| 74 |
-
return ''
|
| 75 |
-
|
| 76 |
# Apply the function
|
| 77 |
df['subreddit'] = df['content'].apply(extract_subreddit)
|
| 78 |
|
|
|
|
| 17 |
nomic.login(NOMIC_KEY)
|
| 18 |
logger = setup_logger(__name__)
|
| 19 |
|
| 20 |
+
# Regex to extract subreddit
|
| 21 |
+
subreddit_re = re.compile(r'r/(\w+)')
|
| 22 |
+
|
| 23 |
|
| 24 |
def count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())
|
| 27 |
|
| 28 |
|
| 29 |
+
def preprocess_markdown(text):
    """Rewrite Reddit spoiler markup in *text* as inline-styled HTML spans.

    Reddit writes spoilers as ``>!hidden text!<``. Each occurrence is replaced
    by a ``<span class="spoiler">`` that is blacked out by default and changes
    its text colour while the pointer hovers over it. Text without spoiler
    markup is returned unchanged.

    NOTE: a second function with this same name is defined further down in
    this module and overrides this one at call time; keep the two in sync.

    Args:
        text: Raw Reddit markdown.

    Returns:
        The markdown with every spoiler replaced by an HTML span.
    """
    # Inline CSS that hides the spoiler: black text on a black background.
    spoiler_style = 'background-color: black; color: black;'
    # Value assigned to ``element.style.color`` on hover. It must be a bare
    # CSS colour value; a full declaration such as 'color: inherit;' is not
    # a valid value and the browser ignores the assignment.
    hover_color = 'inherit'

    span_template = (
        f'<span class="spoiler" style="{spoiler_style}" '
        f'''onmouseover="this.style.color='{hover_color}'" '''
        '''onmouseout="this.style.color='black'">'''
        r'\1</span>'
    )

    # The opening delimiter is ``>!`` and the closing one is ``!<``. The
    # non-greedy group keeps several spoilers on one line separate.
    return re.sub(r'>!(.*?)!<', span_template, text)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def convert_markdown_to_html(text):
    """Render Reddit markdown *text* to HTML.

    The text is first passed through ``preprocess_markdown`` and the result
    is then handed to ``markdown.markdown``.
    """
    return markdown.markdown(preprocess_markdown(text))
|
| 47 |
|
| 48 |
|
| 49 |
+
def extract_subreddit(text):
    """Return the first ``r/<name>`` mention in *text*, or ``''`` if none.

    Matching is done with the module-level ``subreddit_re`` pattern.
    """
    found = subreddit_re.search(text)
    if not found:
        return ''
    return 'r/' + found.group(1)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
def delete_old_nomic():
|
| 57 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
| 58 |
try:
|
|
|
|
| 66 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
| 67 |
|
| 68 |
|
| 69 |
+
def preprocess_markdown(text):
    """Rewrite Reddit spoiler markup in *text* as inline-styled HTML spans.

    Reddit writes spoilers as ``>!hidden text!<``. Each occurrence is replaced
    by a ``<span class="spoiler">`` that is blacked out by default and changes
    its text colour while the pointer hovers over it. Text without spoiler
    markup is returned unchanged.

    NOTE: this module also defines a function with this same name earlier in
    the file; this later definition is the one bound at call time.

    Args:
        text: Raw Reddit markdown.

    Returns:
        The markdown with every spoiler replaced by an HTML span.
    """
    # Inline CSS that hides the spoiler: black text on a black background.
    spoiler_style = 'background-color: black; color: black;'
    # Value assigned to ``element.style.color`` on hover. It must be a bare
    # CSS colour value; a full declaration such as 'color: inherit;' is not
    # a valid value and the browser ignores the assignment.
    hover_color = 'inherit'

    span_template = (
        f'<span class="spoiler" style="{spoiler_style}" '
        f'''onmouseover="this.style.color='{hover_color}'" '''
        '''onmouseout="this.style.color='black'">'''
        r'\1</span>'
    )

    # The opening delimiter is ``>!`` and the closing one is ``!<``. The
    # non-greedy group keeps several spoilers on one line separate.
    return re.sub(r'>!(.*?)!<', span_template, text)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
def build_nomic(dataset):
|
| 84 |
df = dataset['train'].to_pandas()
|
| 85 |
|
|
|
|
| 104 |
df['url'] = 'https://www.reddit.com' + df['permalink']
|
| 105 |
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Apply the function
|
| 108 |
df['subreddit'] = df['content'].apply(extract_subreddit)
|
| 109 |
|