import os
from pathlib import Path

from fastapi import BackgroundTasks, Response, status
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic
from src.readme_update import update_dataset_readme

proj_dir = Path(__file__).parent

logger = setup_logger(__name__)
logger.info("Starting Application...")

SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", "secret")

intro_md = """
# Processing BORU
This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/).
Have you ever been curious to find stories similar to one of your favorites? This can help!

- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
- The lasso can help you search within a smaller region that you drag with your mouse
- The filter can help you narrow by field:
    - Filtering posts that are `CONCLUDED`
    - Filtering popular posts
    - Filtering by date
- The search can help you look up posts by keyword

Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""

details_md = """
# Details
## Creation Details
1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates).
2. It then takes the updates from that dataset and gets embeddings by leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings)
    - [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) uses [zero-spaces](https://huggingface.co/zero-gpu-explorers), a free GPU service, to run the [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) model
    - I'm calling this via [gradio_client](https://www.gradio.app/docs/client), which allows any space to be used as an API (see the sketch below)
3. The calculated embeddings are stored in this dataset: [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed)
4. These get visualized by [Nomic Atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py)
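
As a rough sketch of step 2 (the endpoint name and argument shape here are assumptions, not the Space's published API), the `gradio_client` call looks something like this:

```python
from gradio_client import Client

# gradio_client lets any public Space be driven like an API.
client = Client("reddit-tools-HF/nomic-embeddings")

# Hypothetical endpoint and argument: the real Space may expose a different api_name
# and expect batched input rather than a single string.
embedding = client.predict("Text of a BORU post to embed", api_name="/predict")
```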
| | """ |
| |
|
url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map"
html_str = f'<iframe src="{url}" style="border:none;height:1024px;width:100%" allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'

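# Three-tab UI: the embedded Atlas map, a live view of the logs, and the pipeline details.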
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.HTML(html_str)
    with gr.Tab("Logs"):
        gr.Markdown("# Logs")
        output = gr.HTML(log_file_to_html_string, every=1)
    with gr.Tab("Details"):
        gr.Markdown(details_md)

# Wrap the Gradio UI in a WebhooksServer so the Space serves the app and can receive Hub webhooks.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)

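# Registered with WebhooksServer, this endpoint is typically exposed at /webhooks/dataset_repo.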
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload, task_queue: BackgroundTasks):
    if not payload.event.scope.startswith("repo"):
        return Response("No task scheduled", status_code=status.HTTP_200_OK)

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
    task_queue.add_task(_process_webhook, payload=payload)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)

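# Full pipeline for a webhook event: reload the datasets, merge in the new rows,
# push the processed dataset to the Hub, refresh its README, and rebuild the Nomic Atlas map.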
def _process_webhook(payload: WebhookPayload):
    logger.info("Loading new dataset...")
    dataset, original_dataset = load_datasets()
    logger.info("Loaded new dataset")

    logger.info("Merging and Updating rows...")
    dataset, updated_row_count = merge_and_update_datasets(dataset, original_dataset)
    logger.info("Merged and Updated rows")

    logger.info("Pushing processed data to the Hugging Face Hub...")
    dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
    logger.info("Pushed processed data to the Hugging Face Hub")

    update_dataset_readme(dataset_name=PROCESSED_DATASET, subreddit=SUBREDDIT, new_rows=updated_row_count)
    logger.info("Updated README.")

    logger.info("Building Nomic...")
    build_nomic(dataset=dataset)
    logger.info("Built Nomic")

    logger.info("Update from webhook completed!")

if __name__ == '__main__':
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)