Improve data processing

- app.py: +10 -13
- load_dataframe.py: +39 -13
app.py
CHANGED

@@ -5,7 +5,7 @@ import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

-
+from load_dataframe import get_data


def aggregated_data(df, aggregation_level="week"):

@@ -25,17 +25,11 @@ def aggregated_data(df, aggregation_level="week"):

    # Calculate the growth rate
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
-
-    print("Type of growth rate:", growth_rate)
-    print("Growth rate:", type(growth_rate))
-
-    # growth_rate = growth_rate.dropna()
-
-    print("Growht rate after removing nan:", growth_rate)
+    growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()

    # Display the average growth rate as a big number
    average_growth_rate = growth_rate.mean()
-    st.metric(label=f"{aggregation_level.capitalize()}ly
+    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=f"{average_growth_rate:.2f}%")

    # Create the plot
    plt.figure(figsize=(12, 6))

@@ -109,12 +103,15 @@ def main():
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    # TODO use this instead
-
-
+    df = get_data()
+
+    print(df.head())
+
+    # df = pd.read_csv('daily_papers_enriched (3).csv')
    df = df.drop(['Unnamed: 0'], axis=1) if 'Unnamed: 0' in df.columns else df
    # Use date as index
-    df = df.set_index('date')
-    df.index = pd.to_datetime(df.index)
+    # df = df.set_index('date')
+    # df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    if selection == "Daily/weekly/monthly data":
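The cleanup added to aggregated_data guards the metric against values that pct_change() naturally produces: NaN for the first period and +/-inf whenever the previous period's value was zero. A minimal sketch with made-up numbers (the series values are illustrative, not from the dataset):

import pandas as pd

# Illustrative input: the first period has 0% of papers with artifacts.
percentage_papers_with_artifacts = pd.Series([0.0, 10.0, 12.0, 12.0])

growth_rate = percentage_papers_with_artifacts.pct_change() * 100
# -> [NaN, inf, 20.0, 0.0]: NaN for the first period, inf from dividing by zero

growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
print(growth_rate.mean())  # 10.0, a finite average that st.metric can format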
load_dataframe.py
CHANGED

@@ -20,7 +20,7 @@ class PaperInfo:
    num_comments: int


-def get_df() -> pd.DataFrame:
+def get_df(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Load the initial dataset as a Pandas dataframe.
    """

@@ -39,7 +39,16 @@ def get_df() -> pd.DataFrame:
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
        )
        paper_info.append(info)
-
+
+    df = pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
+
+    # set date as index
+    df = df.set_index('date')
+    df.index = pd.to_datetime(df.index)
+    # only include data between start_date and end_date
+    df = df[(df.index >= start_date) & (df.index <= end_date)]
+
+    return df


def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:

@@ -152,21 +161,15 @@ def check_hf_mention(batch):
    return batch


-
-def get_data() -> pd.DataFrame:
+def process_data(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Load the dataset and enrich it with metadata.
    """
-    # step 1. load as
-    df = get_df()
-    df['date'] = pd.to_datetime(df['date'])
-
-    # step 2. enrich using PapersWithCode API
+    # step 1. load as HF dataset
+    df = get_df(start_date, end_date)
    dataset = Dataset.from_pandas(df)

-    #
-    # dataset = dataset.select(range(10))
-
+    # step 2. enrich using PapersWithCode API
    dataset = dataset.map(add_metadata_batch, batched=True, batch_size=4, num_proc=cpu_count(), fn_kwargs={"client": PapersWithCodeClient()})

    # step 3. enrich using Hugging Face API

@@ -184,4 +187,27 @@
    print("First few rows of the dataset:")
    print(dataframe.head())

-    return dataframe
+    return dataframe
+
+
+@st.cache_data
+def get_data() -> pd.DataFrame:
+
+    # step 1: load pre-processed data
+    df = load_dataset("nielsr/daily-papers-enriched", split="train").to_pandas()
+    df = df.set_index('date')
+    df = df.sort_index()
+    df.index = pd.to_datetime(df.index)
+
+    # step 2: check how much extra data we need to process
+    latest_day = df.iloc[-1].name.strftime('%d-%m-%Y')
+    today = pd.Timestamp.today().strftime('%d-%m-%Y')
+
+    # step 3: process the missing data
+    if latest_day < today:
+        print(f"Processing data from {latest_day} to {today}")
+        new_df = process_data(start_date=latest_day, end_date=today)
+        new_df = new_df[new_df.index > latest_day]
+        df = pd.concat([df, new_df])
+
+    return df
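The new get_data() follows a load-then-top-up pattern: the already-enriched dataset is pulled from the Hub, the most recent day in its index is compared against today, and only the missing tail is run through process_data() before being concatenated back on; because it is wrapped in @st.cache_data, Streamlit reruns reuse the result instead of repeating the API calls. A minimal sketch of that pattern with toy data (the dates and the fetch_new_rows stand-in for process_data() are invented for illustration, and the sketch compares Timestamps directly rather than formatted date strings):

import pandas as pd

def fetch_new_rows(start_date, end_date) -> pd.DataFrame:
    # hypothetical stand-in for process_data(): pretend these rows were just scraped
    idx = pd.date_range(start_date, end_date, freq="D")
    return pd.DataFrame({"num_papers": [1] * len(idx)}, index=idx)

# step 1: "cached" data, already processed up to 2024-06-02
df = pd.DataFrame({"num_papers": [3, 4]},
                  index=pd.to_datetime(["2024-06-01", "2024-06-02"]))

# step 2: how far does the cached data reach, and what is today?
latest_day = df.index[-1]
today = pd.Timestamp("2024-06-04")  # fixed here so the example is reproducible

# step 3: process only the missing days and append them
if latest_day < today:
    new_df = fetch_new_rows(latest_day, today)
    new_df = new_df[new_df.index > latest_day]  # drop the day that is already cached
    df = pd.concat([df, new_df])

print(df)  # four rows, 2024-06-01 through 2024-06-04, with only two newly computed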