Spaces:
Build error
Build error
Update requirements
Browse files
- app.py +37 -15
- load_dataframe.py +43 -0
- requirements.txt +5 -5
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import pandas as pd
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
|
| 7 |
# from load_dataframe import get_data
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def aggregated_data(df, aggregation_level="week"):
|
|
@@ -22,6 +23,13 @@ def aggregated_data(df, aggregation_level="week"):
|
|
| 22 |
# Calculate the percentage of papers with artifacts
|
| 23 |
percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Create the plot
|
| 26 |
plt.figure(figsize=(12, 6))
|
| 27 |
plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with on least 1 Artifact')
|
|
@@ -40,38 +48,49 @@ def aggregated_data(df, aggregation_level="week"):
|
|
| 40 |
|
| 41 |
|
| 42 |
def display_data(df):
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
st.markdown(f"""
|
| 46 |
-
##
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
""")
|
| 50 |
|
| 51 |
st.write("Papers with at least one artifact")
|
| 52 |
-
|
| 53 |
-
st.dataframe(df[df['has_artifact']],
|
| 54 |
hide_index=True,
|
| 55 |
-
column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
|
| 56 |
column_config={"github": st.column_config.LinkColumn(),
|
| 57 |
-
"paper_page": st.column_config.LinkColumn()
|
| 58 |
-
|
|
|
|
|
|
|
| 59 |
|
| 60 |
st.write("Papers without artifacts")
|
| 61 |
-
st.
|
| 62 |
hide_index=True,
|
| 63 |
-
column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
|
| 64 |
column_config={"github": st.column_config.LinkColumn(),
|
| 65 |
"paper_page": st.column_config.LinkColumn()},
|
| 66 |
-
width=2000
|
|
|
|
| 67 |
|
| 68 |
st.write("Papers with a HF mention in README but no artifacts")
|
| 69 |
-
st.
|
| 70 |
hide_index=True,
|
| 71 |
-
column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
|
| 72 |
column_config={"github": st.column_config.LinkColumn(),
|
| 73 |
"paper_page": st.column_config.LinkColumn()},
|
| 74 |
-
width=2000
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
def main():
|
|
@@ -90,6 +109,9 @@ def main():
|
|
| 90 |
df.index = pd.to_datetime(df.index)
|
| 91 |
df = df.sort_index()
|
| 92 |
|
|
|
|
|
|
|
|
|
|
| 93 |
if selection == "Daily/weekly/monthly data":
|
| 94 |
# Button to select day, month or week
|
| 95 |
# Add streamlit selectbox.
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
|
| 7 |
# from load_dataframe import get_data
|
| 8 |
+
from urllib.parse import quote
|
| 9 |
|
| 10 |
|
| 11 |
def aggregated_data(df, aggregation_level="week"):
|
|
|
|
| 23 |
# Calculate the percentage of papers with artifacts
|
| 24 |
percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
|
| 25 |
|
| 26 |
+
# Calculate the growth rate
|
| 27 |
+
growth_rate = percentage_papers_with_artifacts.pct_change() * 100
|
| 28 |
+
|
| 29 |
+
# Display the latest growth rate as a big number
|
| 30 |
+
latest_growth_rate = growth_rate.iloc[-1] if not growth_rate.empty else 0
|
| 31 |
+
st.metric(label=f"{aggregation_level.capitalize()}ly Growth Rate", value=f"{latest_growth_rate:.2f}%")
|
| 32 |
+
|
| 33 |
# Create the plot
|
| 34 |
plt.figure(figsize=(12, 6))
|
| 35 |
plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with on least 1 Artifact')
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
def display_data(df):
|
| 51 |
+
df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
|
| 52 |
+
num_artifacts = df['has_artifact'].sum()
|
| 53 |
+
percentage_of_at_least_one_artifact = num_artifacts / df.shape[0] if df.shape[0] > 0 else 0
|
| 54 |
+
percentage_of_at_least_one_artifact = round(percentage_of_at_least_one_artifact * 100, 2)
|
| 55 |
+
|
| 56 |
+
# add reached out column
|
| 57 |
+
df['reached_out'] = [False for _ in range(df.shape[0])]
|
| 58 |
|
| 59 |
st.markdown(f"""
|
| 60 |
+
## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact
|
| 61 |
+
|
| 62 |
+
* Number of papers: {df.shape[0]}
|
| 63 |
+
* Number of papers with a Github link: {df['github'].notnull().sum()}
|
| 64 |
+
* Number of papers with at least one HF artifact: {num_artifacts}
|
| 65 |
""")
|
| 66 |
|
| 67 |
st.write("Papers with at least one artifact")
|
| 68 |
+
st.data_editor(df[df['has_artifact']],
|
|
|
|
| 69 |
hide_index=True,
|
| 70 |
+
column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
|
| 71 |
column_config={"github": st.column_config.LinkColumn(),
|
| 72 |
+
"paper_page": st.column_config.LinkColumn(),
|
| 73 |
+
"paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)')},
|
| 74 |
+
width=2000,
|
| 75 |
+
key="papers_with_artifacts")
|
| 76 |
|
| 77 |
st.write("Papers without artifacts")
|
| 78 |
+
st.data_editor(df[~df['has_artifact']],
|
| 79 |
hide_index=True,
|
| 80 |
+
column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
|
| 81 |
column_config={"github": st.column_config.LinkColumn(),
|
| 82 |
"paper_page": st.column_config.LinkColumn()},
|
| 83 |
+
width=2000,
|
| 84 |
+
key="papers_without_artifacts")
|
| 85 |
|
| 86 |
st.write("Papers with a HF mention in README but no artifacts")
|
| 87 |
+
st.data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
|
| 88 |
hide_index=True,
|
| 89 |
+
column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
|
| 90 |
column_config={"github": st.column_config.LinkColumn(),
|
| 91 |
"paper_page": st.column_config.LinkColumn()},
|
| 92 |
+
width=2000,
|
| 93 |
+
key="papers_with_hf_mention_no_artifacts")
|
| 94 |
|
| 95 |
|
| 96 |
def main():
|
|
|
|
| 109 |
df.index = pd.to_datetime(df.index)
|
| 110 |
df = df.sort_index()
|
| 111 |
|
| 112 |
+
# hack: include title in URL column
|
| 113 |
+
df['updated_url'] = df.apply(lambda row: f'{row["paper_page"]}/title/{quote(row["title"])}', axis=1)
|
| 114 |
+
|
| 115 |
if selection == "Daily/weekly/monthly data":
|
| 116 |
# Button to select day, month or week
|
| 117 |
# Add streamlit selectbox.
|
load_dataframe.py
CHANGED
|
@@ -21,6 +21,10 @@ class PaperInfo:
|
|
| 21 |
|
| 22 |
|
| 23 |
def get_df() -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
df = pd.merge(
|
| 25 |
left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
|
| 26 |
right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
|
|
@@ -112,6 +116,42 @@ def add_hf_assets(batch):
|
|
| 112 |
return batch
|
| 113 |
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
@st.cache_data
|
| 116 |
def get_data() -> pd.DataFrame:
|
| 117 |
"""
|
|
@@ -132,6 +172,9 @@ def get_data() -> pd.DataFrame:
|
|
| 132 |
# step 3. enrich using Hugging Face API
|
| 133 |
dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
|
| 134 |
|
|
|
|
|
|
|
|
|
|
| 135 |
# return as Pandas dataframe
|
| 136 |
dataframe = dataset.to_pandas()
|
| 137 |
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
def get_df() -> pd.DataFrame:
|
| 24 |
+
"""
|
| 25 |
+
Load the initial dataset as a Pandas dataframe.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
df = pd.merge(
|
| 29 |
left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
|
| 30 |
right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
|
|
|
|
| 116 |
return batch
|
| 117 |
|
| 118 |
|
| 119 |
+
def check_hf_mention(batch):
|
| 120 |
+
"""
|
| 121 |
+
Check if the README of a paper's GitHub repository mentions Hugging Face.
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
hf_mentions = []
|
| 125 |
+
for github_url in batch["github"]:
|
| 126 |
+
hf_mention = 0
|
| 127 |
+
if github_url != "":
|
| 128 |
+
# fetch the README text directly from raw.githubusercontent.com
|
| 129 |
+
owner = github_url.split("/")[-2]
|
| 130 |
+
repo = github_url.split("/")[-1]
|
| 131 |
+
branch = "main"
|
| 132 |
+
url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
|
| 133 |
+
response = requests.get(url)
|
| 134 |
+
|
| 135 |
+
if response.status_code != 200:
|
| 136 |
+
# try master branch as second attempt
|
| 137 |
+
branch = "master"
|
| 138 |
+
url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
|
| 139 |
+
response = requests.get(url)
|
| 140 |
+
|
| 141 |
+
if response.status_code == 200:
|
| 142 |
+
# get text
|
| 143 |
+
text = response.text
|
| 144 |
+
if "huggingface" in text.lower() or "hugging face" in text.lower():
|
| 145 |
+
hf_mention = 1
|
| 146 |
+
|
| 147 |
+
hf_mentions.append(hf_mention)
|
| 148 |
+
|
| 149 |
+
# store the per-paper Hugging Face mention flags (0 or 1) on the batch
|
| 150 |
+
batch["hf_mention"] = hf_mentions
|
| 151 |
+
|
| 152 |
+
return batch
|
| 153 |
+
|
| 154 |
+
|
| 155 |
@st.cache_data
|
| 156 |
def get_data() -> pd.DataFrame:
|
| 157 |
"""
|
|
|
|
| 172 |
# step 3. enrich using Hugging Face API
|
| 173 |
dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
|
| 174 |
|
| 175 |
+
# step 4. check if Hugging Face is mentioned in the README
|
| 176 |
+
dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())
|
| 177 |
+
|
| 178 |
# return as Pandas dataframe
|
| 179 |
dataframe = dataset.to_pandas()
|
| 180 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
streamlit
|
| 2 |
-
|
| 3 |
-
tqdm
|
| 4 |
-
datasets
|
| 5 |
-
paperswithcode
|
|
|
|
| 1 |
+
streamlit==1.36.0
|
| 2 |
+
matplotlib==3.7.0
|
| 3 |
+
tqdm==4.66.4
|
| 4 |
+
datasets==2.20.0
|
| 5 |
+
paperswithcode-client==0.3.1
|