Spaces:

CraftJarvis
/

Minecraft-VLM-Leaderboard

Sleeping

App Files Files Community

zhwang4ai committed on Mar 18

Commit

229c9d9

verified ·

1 Parent(s): 96b85d6

Create app.py

Browse files

Files changed (1) hide show

app.py +115 -0

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import json
+from pathlib import Path
+import gradio as gr
+import pandas as pd
+# HTML heading rendered at the top of the page through gr.HTML.
+TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for Minecraft</h1>"""
+# Markdown blurb rendered under the heading. A plain string: there are no
+# placeholders to interpolate, so no f-prefix is needed.
+DESCRIPTION = """
+Evaluation of VLM on Minecraft
+"""
+# Not referenced by any of the code shown in this commit.
+BENCHMARKS_TO_SKIP = []
+def get_leaderboard_df(score_path, threshold: str = "20"):
+    """Load the score file and flatten it into one row per model.
+
+    The JSON document at ``score_path`` must map each model name to a dict of
+    metrics. Scalar metrics are copied unchanged. A metric whose value is
+    itself a dict contributes only the entry stored under ``threshold``; its
+    other entries are ignored, and the metric is omitted from the row when
+    that entry is missing (it then shows up as NaN in the table).
+
+    Args:
+        score_path: Path of the JSON score file.
+        threshold: Key selected from nested metric dicts.
+
+    Returns:
+        A DataFrame with a "Model" column plus one column per metric.
+    """
+    # JSON is UTF-8 by specification; do not rely on the locale default.
+    with open(score_path, "r", encoding="utf-8") as f:
+        scores = json.load(f)
+    rows = []
+    for model, metrics in scores.items():
+        row = {"Model": model}
+        for key, value in metrics.items():
+            if not isinstance(value, dict):
+                row[key] = value
+            elif threshold in value:
+                # The column keeps the bare metric name; the threshold is implicit.
+                row[key] = value[threshold]
+        rows.append(row)
+    return pd.DataFrame(rows)
+# Flattened score table, loaded once when the module is executed and read by
+# the callbacks and widgets defined further down.
+SCORE_PATH = "output/score.json"
+leaderboard_df = get_leaderboard_df(SCORE_PATH)
+def agg_df(df, agg: str = "max"):
+    """Collapse the score table to one row per model, ranked by average.
+
+    Args:
+        df: Table with a "Model" column and, optionally, a "Date" column.
+            The remaining columns are treated as metrics.
+        agg: Aggregation applied to each model's rows (handed to
+            ``groupby(...).agg``), e.g. "min", "max" or "mean".
+
+    Returns:
+        A new DataFrame with "Model", an inserted "Average" column (row-wise
+        mean of the numeric metrics) and the metrics, with every numeric
+        value multiplied by 100 and rows sorted by "Average", highest first.
+        The input frame is not modified.
+    """
+    # drop() already returns a new frame, so no defensive copy is required.
+    # errors="ignore" keeps score files without a "Date" column working.
+    per_model = df.drop(columns="Date", errors="ignore").groupby("Model").agg(agg).reset_index()
+    per_model.insert(loc=1, column="Average", value=per_model.mean(axis=1, numeric_only=True))
+    # Scale to percentages (presumably scores are fractions in [0, 1]; verify
+    # against the score file). "Average" is numeric, so it is scaled as well.
+    numeric_cols = per_model.select_dtypes(include=["number"]).columns
+    per_model[numeric_cols] *= 100.0
+    return per_model.sort_values(by=["Average"], ascending=False)
+def filter_and_search(cols: list[str], search_query: str, agg: str):
+    """Rebuild the leaderboard table from the current widget values.
+
+    Args:
+        cols: Metric columns to display; an empty selection shows them all.
+        search_query: Model-name search terms separated by ";". A model is
+            kept when its name contains any term (case-insensitive, matched
+            literally). An empty query keeps every model.
+        agg: Aggregation forwarded to ``agg_df``.
+
+    Returns:
+        The aggregated table, filtered by model name and restricted to the
+        selected columns. When columns are selected, "Average" is recomputed
+        over just those columns.
+    """
+    import re  # local import: only needed to escape the search terms
+    df = agg_df(leaderboard_df, agg)
+    # Blank terms (e.g. from a trailing ";") would otherwise match every model.
+    terms = [term.strip().lower() for term in (search_query or "").split(";")]
+    terms = [term for term in terms if term]
+    if terms:
+        # Escape the terms so that regex metacharacters typed by the user
+        # (such as "(" or "+") are matched literally instead of raising.
+        pattern = "|".join(re.escape(term) for term in terms)
+        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
+        # Drop any columns which are all NaN
+        df = df.dropna(how="all", axis=1)
+    index_cols = list(leaderboard_df.columns[:1])
+    # Keep only requested columns that are still present: the aggregation
+    # removes "Date", and all-NaN columns may have been dropped just above.
+    # "Average" is re-inserted below, so it must not be selected twice.
+    selected = [
+        c for c in (cols or [])
+        if c in df.columns and c != "Average" and c not in index_cols
+    ]
+    if selected:
+        df = df[index_cols + selected]
+        # Hide models that have no value in any of the selected columns.
+        df = df.dropna(how="all", axis=0, subset=selected)
+        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
+    return df
+# Page layout: heading, description, search box + aggregation choice, column
+# picker, the leaderboard table and a note about the threshold in use.
+with gr.Blocks() as demo:
+    gr.HTML(TITLE)
+    with gr.Column():
+        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
+        with gr.Row():
+            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
+            agg = gr.Radio(
+                ["min", "max", "mean"],
+                value="max",
+                label="Aggregation",
+                info="How to aggregate results for each model",
+            )
+        # TODO: a "Threshold" selector (choices "20", "50", "100", "200") is
+        # not wired up yet; the table only contains the "20" results.
+        with gr.Row():
+            cols_bar = gr.CheckboxGroup(
+                # "Date" is removed by agg_df before the table is rendered, so
+                # it cannot be offered as a selectable column.
+                choices=[c for c in leaderboard_df.columns[1:] if c not in ("Average", "Date")],
+                show_label=False,
+                info="Select columns to display",
+            )
+        with gr.Group():
+            leaderboard_table = gr.Dataframe(
+                value=leaderboard_df,
+                wrap=True,
+                column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
+            )
+        threshold_text = gr.HTML("Threshold corresponding to the values of gui and embodied: 20")
+    # Every control re-renders the table from the same three inputs.
+    table_inputs = [cols_bar, search_bar, agg]
+    cols_bar.change(filter_and_search, inputs=table_inputs, outputs=[leaderboard_table])
+    agg.change(filter_and_search, inputs=table_inputs, outputs=[leaderboard_table])
+    search_bar.submit(filter_and_search, inputs=table_inputs, outputs=[leaderboard_table])
+    # Render the aggregated table on page load as well, so the first view
+    # matches what the callbacks above produce (percentages, "Average" column)
+    # instead of the raw, un-aggregated scores.
+    demo.load(filter_and_search, inputs=table_inputs, outputs=[leaderboard_table])
+demo.launch()