akazemian committed (verified)
Commit b3908ac · Parent(s): 7aabe91

Upload folder using huggingface_hub

Files changed (3):
  1. app.py +123 -120
  2. index.csv +0 -0
  3. temp.py +74 -0
app.py CHANGED
@@ -3,36 +3,34 @@ from pathlib import Path
 import html as _py_html
 import pandas as pd
 import gradio as gr
+from huggingface_hub import hf_hub_download
 
-# ----------- FIXED PATHS -----------
-REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio")  # /{model_name}/*.html
-FIXED_MANIFEST = Path("/data/atlask/BAU-Quant/manifest_val.csv")
-# -----------------------------------
+# ----------- HF DATASET CONFIG -----------
+HF_DATASET_REPO = "akazemian/audio-html"   # <-- change if needed
+INDEX_FILENAME = "index.csv"
+# -----------------------------------------
 
 DB_PATH = "library.csv"
 ALLOWED_EXTS = {".html"}
 
-# Columns in DB
-EXTRA_COLS = ["model_name", "category", "dataset"]
+# Columns in DB (no model_name)
+EXTRA_COLS = ["category", "dataset"]
 BASE_COLS = ["id","filename","path","tags","keywords","notes","uploaded_at"]
 ALL_DB_COLS = BASE_COLS + EXTRA_COLS
 
-# Columns shown in the table (order)
-TABLE_COLS = ["id","filename","model_name","category","dataset",
+# Columns shown in the table (no model_name)
+TABLE_COLS = ["id","filename","category","dataset",
               "tags","keywords","notes","uploaded_at"]
 
 # ---------- DB helpers ----------
 def _load_db() -> pd.DataFrame:
     if os.path.exists(DB_PATH):
         df = pd.read_csv(DB_PATH)
-        # migrate: ensure all required columns exist
         for c in ALL_DB_COLS:
             if c not in df.columns:
                 df[c] = ""
-        # normalize text-ish fields
-        for c in ["tags","keywords","notes","model_name","category","dataset"]:
+        for c in ["tags","keywords","notes","category","dataset"]:
             df[c] = df[c].fillna("").astype(str)
-        # keep only our known columns in stable order
         return df[ALL_DB_COLS]
     return pd.DataFrame(columns=ALL_DB_COLS)
@@ -61,103 +59,92 @@ def _df_from_table_value(table_value):
         return pd.DataFrame(table_value, columns=cols)
     return pd.DataFrame(columns=cols)
 
-# ---------- Manifest helpers ----------
-def _stem_for_match(p: Path) -> str:
-    stem = p.stem
-    if "chunk" in stem:
-        stem = stem.split("_chunk")[0]
-    return stem
-
-def _load_manifest():
-    if not FIXED_MANIFEST.exists():
-        return None
-    mf = pd.read_csv(FIXED_MANIFEST)
-    if "file_name" not in mf.columns:
-        return None
-    mf = mf.copy()
-    def mk_from_str(s: str):
-        st = Path(str(s)).stem
-        return st.split("_chunk")[0] if "chunk" in st else st
-    mf["__match_key"] = mf["file_name"].astype(str).apply(mk_from_str)
-    return mf
-
-# ---------- Sync by model ----------
+# ---------- Load HF index ----------
+def _load_hf_index() -> pd.DataFrame:
+    """
+    Download + read index.csv from the HF dataset repo.
+    Required columns: id, filename, relpath, category, dataset, tags, keywords, notes, uploaded_at
+    """
+    local = hf_hub_download(
+        repo_id=HF_DATASET_REPO,
+        repo_type="dataset",
+        filename=INDEX_FILENAME,
+    )
+    df = pd.read_csv(local)
+    for c in ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]:
+        if c not in df.columns:
+            df[c] = ""
+    # normalize types
+    for c in ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]:
+        df[c] = df[c].fillna("").astype(str)
+    return df
+
+# ---------- Sync by model (prefix inside HF dataset) ----------
 def sync_model(model_name: str):
-    f"""
-    Index all .html reports under {REPORTS_ROOT}/{model_name}.
-    Adds NEW files to DB (by exact path), sets model_name,
-    and fills category/dataset from the fixed manifest if present.
+    """
+    Load index.csv from HF, add rows for the selected model (by relpath prefix),
+    store HF URIs in DB, and show only that model’s files.
     """
     model_name = (model_name or "").strip()
     if not model_name:
-        return gr.Info("Please enter a model name."), None, None, None, None
+        return gr.Info("Please enter a model name."), None, None, None, None, ""
+
+    try:
+        idx = _load_hf_index()
+    except Exception as e:
+        traceback.print_exc()
+        return gr.Info(f"Failed to load index from HF: {e}"), None, None, None, None, ""
 
-    folder = REPORTS_ROOT / model_name
-    if not folder.exists():
-        return gr.Info(f"Folder not found: {folder}"), None, None, None, None
+    # rows like "{model_name}/.../file.html"
+    subset = idx[idx["relpath"].str.startswith(model_name + "/")]
+    if subset.empty:
+        return gr.Info(f"No HTML files found for model '{model_name}' on {HF_DATASET_REPO}"), None, None, None, None, ""
 
     df = _load_db()
-    manifest = _load_manifest()
     now = datetime.datetime.now().isoformat(timespec="seconds")
     new_rows = []
 
-    for p in sorted(folder.glob("*.html")):
-        if p.suffix.lower() not in ALLOWED_EXTS:
-            continue
-
-        # if already indexed, optionally backfill model_name and skip creating a new row
-        existing = df["path"] == str(p)
-        if existing.any():
-            idxs = df.index[existing]
-            for i in idxs:
-                if (df.at[i, "model_name"] or "") != model_name:
-                    df.at[i, "model_name"] = model_name
+    for _, r in subset.iterrows():
+        relpath = r["relpath"]
+        hub_uri = f"hf://{HF_DATASET_REPO}/{relpath}"
+        if (df["path"] == hub_uri).any():
             continue
-
-        category, dataset = "", ""
-        if manifest is not None:
-            mk = _stem_for_match(p)
-            hit = manifest[manifest["__match_key"].str.contains(mk, na=False)]
-            if not hit.empty:
-                if "audio_category" in hit.columns:
-                    category = str(hit.iloc[0]["audio_category"])
-                if "dataset" in hit.columns:
-                    dataset = str(hit.iloc[0]["dataset"])
-
-        uid = uuid.uuid4().hex[:8]
         new_rows.append({
-            "id": uid,
-            "filename": p.name,
-            "path": str(p),  # keep absolute path; no copying
-            "tags": "",
-            "keywords": "",
-            "notes": "",
-            "uploaded_at": now,
-            "model_name": model_name,
-            "category": category,
-            "dataset": dataset
+            "id": r["id"] if r["id"] else uuid.uuid4().hex[:8],
+            "filename": r["filename"],
+            "path": hub_uri,  # store HF URI
+            "tags": r["tags"],
+            "keywords": r["keywords"],
+            "notes": r["notes"],
+            "uploaded_at": r["uploaded_at"] or now,
+            "category": r["category"],
+            "dataset": r["dataset"]
         })
 
     if new_rows:
         df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
-
-    _save_db(df)
-    # show refreshed view scoped to this model
-    return refresh_view("", [], "", "", model_name)
+        _save_db(df)
 
+    current_model = model_name  # remember which model prefix is active
+    return refresh_view("", [], "", "", current_model) + (current_model,)
 
 # ---------- Search / filters ----------
-def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filter):
+def refresh_view(query, tag_filters, category_filter, dataset_filter, current_model):
     df = _load_db()
 
+    # scope to current model prefix in HF URI if provided
+    if current_model:
+        prefix = f"hf://{HF_DATASET_REPO}/{current_model}/"
+        df = df[df["path"].astype(str).str.startswith(prefix)]
+
     # tag vocabulary
     all_tags = sorted({t.strip()
                        for s in df["tags"].dropna().astype(str).tolist()
                        for t in s.split(",") if t.strip()})
     all_cats = sorted([c for c in df["category"].dropna().astype(str).unique() if c])
     all_sets = sorted([c for c in df["dataset"].dropna().astype(str).unique() if c])
-    all_models = sorted([c for c in df["model_name"].dropna().astype(str).unique() if c])
 
-    # free-text query across filename/tags/keywords/notes/category/dataset/model
+    # free-text query across filename/tags/keywords/notes/category/dataset
     if query:
         q = query.lower()
         mask = (
@@ -166,8 +153,7 @@ def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filt
             df["keywords"].str.lower().str.contains(q, na=False) |
             df["notes"].str.lower().str.contains(q, na=False) |
             df["category"].str.lower().str.contains(q, na=False) |
-            df["dataset"].str.lower().str.contains(q, na=False) |
-            df["model_name"].str.lower().str.contains(q, na=False)
+            df["dataset"].str.lower().str.contains(q, na=False)
         )
         df = df[mask]
 
@@ -181,8 +167,6 @@ def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filt
         df = df[df["category"] == category_filter]
     if dataset_filter:
         df = df[df["dataset"] == dataset_filter]
-    if model_filter:
-        df = df[df["model_name"] == model_filter]
 
     df = df.sort_values("uploaded_at", ascending=False).reset_index(drop=True)
     view = df[TABLE_COLS].copy()
@@ -193,7 +177,6 @@ def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filt
         gr.update(choices=all_tags),
         gr.update(choices=[""] + all_cats, value=category_filter or ""),
         gr.update(choices=[""] + all_sets, value=dataset_filter or ""),
-        gr.update(choices=[""] + all_models, value=model_filter or ""),
         count_text
     )
 
@@ -233,12 +216,24 @@ def select_row(evt: gr.SelectData, table_value):
     if rec.empty:
         return "<em>Could not find file for this row.</em>", ""
 
-    path = rec["path"].values[0]
-    if not os.path.exists(path):
-        return f"<em>File not found:</em> <code>{_py_html.escape(path)}</code>", f"📄 {row['filename']}"
-
-    with open(path, "r", encoding="utf-8") as f:
-        raw_html = f.read()
+    path_str = rec["path"].values[0]
+
+    # Hub-backed path -> lazy download
+    if str(path_str).startswith("hf://"):
+        rest = str(path_str)[len("hf://"):]
+        org, name, relpath = rest.split("/", 2)  # repo id is "org/name": two segments
+        local_path = hf_hub_download(repo_id=f"{org}/{name}", repo_type="dataset", filename=relpath)
+        raw_html = Path(local_path).read_text(encoding="utf-8")
+    elif str(path_str).startswith("http"):
+        # if you ever swap to CDN URLs, iframe the URL directly
+        iframe = f'<iframe style="width:100%;height:720px;border:1px solid #ddd;border-radius:8px;" src="{_py_html.escape(path_str)}"></iframe>'
+        return iframe, f"📄 {row['filename']}"
+    else:
+        # local file fallback (not used for HF flow, kept for compatibility)
+        p = Path(path_str)
+        if not p.exists():
+            return f"<em>File not found:</em> <code>{_py_html.escape(str(p))}</code>", f"📄 {row['filename']}"
+        raw_html = p.read_text(encoding="utf-8")
 
     iframe = _iframe_from_html_string(raw_html, height_px=720)
     return iframe, f"📄 {row['filename']}"
@@ -247,11 +242,11 @@ def select_row(evt: gr.SelectData, table_value):
         return f"<pre>Failed to render (see terminal):\n{_py_html.escape(str(e))}</pre>", ""
 
 # ---------- Save edits ----------
-def save_edits(edited_table):
+def save_edits(edited_table, current_model):
     if edited_table is None or not len(edited_table):
         return gr.Info("Nothing to save.")
     df_db = _load_db()
-    editable_cols = ["model_name","category","dataset","tags","keywords","notes"]
+    editable_cols = ["category","dataset","tags","keywords","notes"]
     for c in editable_cols:
         edited_table[c] = edited_table[c].fillna("").astype(str)
     for _, row in edited_table.iterrows():
@@ -260,8 +255,8 @@ def save_edits(edited_table):
         for c in editable_cols:
             df_db.at[i[0], c] = row[c]
     _save_db(df_db)
-    # return refreshed table only
-    return refresh_view("", [], "", "", "")[0]
+    # return refreshed table only (respect current_model scope)
+    return refresh_view("", [], "", "", current_model)[0]
 
 # -------------------- UI --------------------
 # CSS that targets only the three buttons via elem_id
@@ -303,25 +298,24 @@ custom_css = """
 }
 """
 
-
 with gr.Blocks(title="Audio HTML Library", css=custom_css) as demo:
     gr.Markdown("## 🎧 Audio Reconstruction Reports — sync • search • view")
+    current_model = gr.State("")  # remembers active model prefix inside HF repo
 
    with gr.Row():
        with gr.Column(scale=1):
            # Choose model & sync
-            gr.Markdown(f"**Model folder:** `{REPORTS_ROOT}/model_name`")
+            gr.Markdown(f"**Model prefix on HF dataset:** `{HF_DATASET_REPO}/<model_name>/...`")
            model_in = gr.Textbox(label="Model name", placeholder="e.g., WavCochV8192")
-            sync_btn = gr.Button("Sync this model", elem_id="sync-btn")  # ⬅️ give id
+            sync_btn = gr.Button("Sync this model", elem_id="sync-btn")
 
            # Search & filters
            gr.Markdown("---\n**Search & filter**")
-            query = gr.Textbox(label="Keyword search (filename/tags/notes/category/dataset/model)", placeholder="type to search…")
+            query = gr.Textbox(label="Keyword search (filename/tags/notes/category/dataset)", placeholder="type to search…")
            tag_filter = gr.CheckboxGroup(choices=[], label="Filter by tags (AND)")
            category_filter = gr.Dropdown(choices=[], label="Category")
            dataset_filter = gr.Dropdown(choices=[], label="Dataset")
-            model_filter = gr.Dropdown(choices=[], label="Model")
-            refresh_btn = gr.Button("Refresh", elem_id="refresh-btn")  # ⬅️ give id
+            refresh_btn = gr.Button("Refresh", elem_id="refresh-btn")
 
        with gr.Column(scale=2):
            # Count of current view
@@ -336,31 +330,40 @@ with gr.Blocks(title="Audio HTML Library", css=custom_css) as demo:
                col_count=(len(TABLE_COLS), "fixed")
            )
            with gr.Row():
-                save_btn = gr.Button("Save Edits", elem_id="save-btn")  # ⬅️ give id
+                save_btn = gr.Button("Save Edits", elem_id="save-btn")
            preview_label = gr.Markdown("")
            preview_html = gr.HTML("")
 
-    # wiring: sync
-    sync_btn.click(sync_model, [model_in],
-                   [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    # wiring: sync (also sets current_model)
+    sync_btn.click(
+        sync_model,
+        [model_in],
+        [table, tag_filter, category_filter, dataset_filter, count_md, current_model]
+    )
 
-    # wiring: refresh + live filters
-    refresh_btn.click(refresh_view,
-                      [query, tag_filter, category_filter, dataset_filter, model_filter],
-                      [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    # wiring: refresh + live filters (respect current_model)
+    refresh_btn.click(
+        refresh_view,
+        [query, tag_filter, category_filter, dataset_filter, current_model],
+        [table, tag_filter, category_filter, dataset_filter, count_md]
+    )
 
-    for comp in (query, tag_filter, category_filter, dataset_filter, model_filter):
-        comp.change(refresh_view,
-                    [query, tag_filter, category_filter, dataset_filter, model_filter],
-                    [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    for comp in (query, tag_filter, category_filter, dataset_filter):
+        comp.change(
+            refresh_view,
+            [query, tag_filter, category_filter, dataset_filter, current_model],
+            [table, tag_filter, category_filter, dataset_filter, count_md]
+        )
 
    table.select(select_row, [table], [preview_html, preview_label])
-    save_btn.click(save_edits, [table], [table])
+    save_btn.click(save_edits, [table, current_model], [table])
 
-    # initial load
-    demo.load(refresh_view,
-              [query, tag_filter, category_filter, dataset_filter, model_filter],
-              [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    # initial load (no model yet)
+    demo.load(
+        refresh_view,
+        [query, tag_filter, category_filter, dataset_filter, current_model],
+        [table, tag_filter, category_filter, dataset_filter, count_md]
+    )
 
 if __name__ == "__main__":
-    demo.launch(share=True)  # auth is optional but recommended
+    demo.launch(share=True)  # auth optional
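
The subtle point in the new app.py flow is the `hf://` path scheme: `sync_model` writes URIs of the form `hf://{HF_DATASET_REPO}/{relpath}` into `library.csv`, and `select_row` must parse them back, remembering that a Hub repo id spans two path segments (`org/name`). A minimal sketch of that round-trip (helper names here are illustrative, not part of the commit):

```python
# Sketch of the hf:// URI round-trip used by app.py; helper names are hypothetical.
from pathlib import Path
from huggingface_hub import hf_hub_download

HF_DATASET_REPO = "akazemian/audio-html"

def to_hub_uri(relpath: str) -> str:
    # sync_model stores e.g. "hf://akazemian/audio-html/WavCochV8192/clip.html"
    return f"hf://{HF_DATASET_REPO}/{relpath}"

def from_hub_uri(uri: str) -> tuple[str, str]:
    rest = uri[len("hf://"):]
    org, name, relpath = rest.split("/", 2)  # repo id is the first two segments
    return f"{org}/{name}", relpath

def fetch_html(uri: str) -> str:
    # Mirrors select_row: resolve the URI, lazily download, read the HTML.
    repo_id, relpath = from_hub_uri(uri)
    local = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=relpath)
    return Path(local).read_text(encoding="utf-8")
```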
index.csv ADDED
The diff for this file is too large to render. See raw diff
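The file itself is too large to render, but its schema is pinned down by `_load_hf_index` in app.py and the builder in temp.py. A hypothetical one-row example (values invented for illustration):

```python
# Hypothetical index.csv row; the column set matches what _load_hf_index expects.
import pandas as pd

row = {
    "id": "a1b2c3d4",                     # 8-char uuid4 hex, as in app.py
    "filename": "clip.html",
    "relpath": "WavCochV8192/clip.html",  # "{model_name}/.../file.html"
    "category": "speech",                 # invented example value
    "dataset": "AudioSet",                # invented example value
    "tags": "",
    "keywords": "",
    "notes": "",
    "uploaded_at": "2025-01-01T00:00:00",
}
pd.DataFrame([row]).to_csv("index.csv", index=False)
```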
 
temp.py ADDED
@@ -0,0 +1,74 @@
+# upload_htmls_and_index.py
+import posixpath
+from pathlib import Path
+import pandas as pd
+from huggingface_hub import HfApi
+
+REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
+DATASET_REPO = "akazemian/audio-html"
+
+# --- replace ONLY your upload block with this (keep the rest of the file) ---
+from huggingface_hub import HfApi
+
+api = HfApi()
+REPORTS_ROOT = REPORTS_ROOT.resolve()  # your existing constant
+
+# Upload per model subfolder, but call upload_large_folder on the PARENT
+# (older huggingface_hub versions don't support path_in_repo)
+for sub in sorted([p for p in REPORTS_ROOT.iterdir() if p.is_dir()]):
+    model = sub.name
+    print(f"[HF] upload_large_folder: {REPORTS_ROOT} (include {model}/**/*.html) -> {DATASET_REPO}")
+    api.upload_large_folder(
+        repo_id=DATASET_REPO,
+        repo_type="dataset",
+        folder_path=str(REPORTS_ROOT),          # parent folder
+        allow_patterns=[f"{model}/**/*.html"],  # only this model's files
+    )
+    print(f"✓ uploaded {model}")
+# --- end replacement ---
+
+# (B) Build index.csv from your existing library.csv (no model_name)
+library = pd.read_csv("library.csv")
+
+def ensure_cols(df, cols):
+    for c in cols:
+        if c not in df.columns:
+            df[c] = ""
+    return df
+
+library = ensure_cols(
+    library,
+    ["id","filename","path","tags","keywords","notes","uploaded_at","category","dataset"]
+)
+
+def local_to_relpath(local_path: str) -> str:
+    # Make path relative to REPORTS_ROOT and normalize to POSIX for HF
+    rel = Path(local_path).resolve().relative_to(REPORTS_ROOT)
+    return posixpath.join(*rel.parts)
+
+# Only keep rows that actually point to .html files under REPORTS_ROOT
+keep = library["path"].astype(str).str.endswith(".html", na=False) & \
+       library["path"].astype(str).str.startswith(str(REPORTS_ROOT), na=False)
+idx = library[keep].copy()
+
+# Derive relpath inside the HF dataset from the absolute local path
+idx["relpath"] = idx["path"].apply(local_to_relpath)
+
+index_cols = ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]
+index_df = idx[index_cols].copy()
+index_df.to_csv("index.csv", index=False)
+
+# (C) Upload index.csv to the dataset repo (small, separate commit)
+from huggingface_hub import CommitOperationAdd
+api.create_commit(
+    repo_id=DATASET_REPO,
+    repo_type="dataset",
+    operations=[CommitOperationAdd(path_in_repo="index.csv", path_or_fileobj="index.csv")],
+    commit_message=f"Add/update index.csv ({len(index_df)} rows)"
+)
+
+print("Done: uploaded HTMLs (large-folder) and index.csv")
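
After temp.py runs, the upload and the index should agree; a small sanity check along these lines (a sketch, not part of the commit) can confirm that every indexed relpath exists in the dataset repo:

```python
# Sketch: verify every relpath in index.csv exists in the uploaded dataset repo.
import pandas as pd
from huggingface_hub import HfApi

api = HfApi()
repo_files = set(api.list_repo_files("akazemian/audio-html", repo_type="dataset"))
missing = [r for r in pd.read_csv("index.csv")["relpath"] if r not in repo_files]
print(f"{len(missing)} indexed file(s) missing from the repo")
```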