akazemian committed (verified)
Commit 4fe3afe · 1 Parent(s): e119d3b

Upload folder using huggingface_hub

Files changed (3):
  1. app.py +45 -33
  2. sync_library_and_hf.py +1 -0
  3. temp.py +2 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import html as _py_html
 import pandas as pd
 import gradio as gr
 from huggingface_hub import hf_hub_download
+from urllib.parse import unquote  # add at top
 
 # ----------- HF DATASET CONFIG -----------
 HF_DATASET_REPO = "akazemian/audio-html"  # <-- change if needed
@@ -91,67 +92,78 @@ def _load_hf_index() -> pd.DataFrame:
     return df
 
 # ---------- Sync by model (prefix inside HF dataset) ----------
+from urllib.parse import unquote  # ensure this import exists at top
+
 def sync_model(model_name: str):
-    model_name = (model_name or "").strip()
-    if not model_name:
+    raw = (model_name or "").strip()
+    if not raw:
         return gr.Info("Please enter a model name."), None, None, None, "", ""
 
-    # 1) read index from HF and filter to this model prefix
+    # 1) read index from HF and filter to this model prefix (accept raw or URL-decoded)
     try:
         idx = _load_hf_index()
     except Exception as e:
         traceback.print_exc()
         return gr.Info(f"Failed to load index from HF: {e}"), None, None, None, "", ""
 
-    sub = idx[idx["relpath"].astype(str).str.startswith(f"{model_name}/")]
+    decoded = unquote(raw)
+    rel = idx["relpath"].astype(str)
+    sub = idx[rel.str.startswith(f"{raw}/") | rel.str.startswith(f"{decoded}/")]
     if sub.empty:
-        return gr.Info(f"No HTML files found for model '{model_name}' on {HF_DATASET_REPO}"), None, None, None, "", ""
+        return gr.Info(
+            f"No HTML files found for model '{raw}'. "
+            "Tip: if you copied from the URL, use '=' instead of '%3D'."
+        ), None, None, None, "", ""
 
-    # 2) load local DB, backfill hf_path for existing rows of this model
+    # 2) load local DB, backfill hf_path for existing rows of this model (by filename)
    db = _load_db()
-
-    # any existing rows whose filename appears in this model → rewrite hf_path
    if not db.empty:
-        # Build map: filename -> relpath (works if filenames are unique per model)
        rel_by_fname = dict(zip(sub["filename"].astype(str), sub["relpath"].astype(str)))
-        mask_model_rows = db["filename"].isin(rel_by_fname.keys())
-        db.loc[mask_model_rows, "hf_path"] = db.loc[mask_model_rows, "filename"].map(
-            lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname.get(fn, fn)}"
-        )
+        mask_model_rows = db["filename"].astype(str).isin(rel_by_fname.keys())
+        if mask_model_rows.any():
+            db.loc[mask_model_rows, "hf_path"] = db.loc[mask_model_rows, "filename"].map(
+                lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname.get(str(fn), str(fn))}"
+            )
 
    # 3) add any missing rows from HF index
    now = datetime.datetime.now().isoformat(timespec="seconds")
-    new_rows = []
    existing_hf = set(db["hf_path"].astype(str))
+    new_rows = []
    for _, r in sub.iterrows():
-        hf_uri = f"hf://{HF_DATASET_REPO}/{r['relpath']}"
+        rp = str(r["relpath"])
+        hf_uri = f"hf://{HF_DATASET_REPO}/{rp}"
        if hf_uri in existing_hf:
            continue
-        # try to find by filename match in local path rows
-        if not db[db["filename"] == r["filename"]].empty:
-            # row exists, just set hf_path (done above), skip adding duplicate line
+        # If a row with same filename exists already, we updated its hf_path above; skip adding duplicate
+        if not db[db["filename"].astype(str) == str(r["filename"])].empty:
            continue
        new_rows.append({
-            "id": (r["id"] or uuid.uuid4().hex[:8]),
-            "filename": r["filename"],
-            "path": "",  # no local path known
-            "hf_path": hf_uri,  # <-- set hf path
-            "tags": r["tags"],
-            "keywords": r["keywords"],
-            "notes": r["notes"],
-            "uploaded_at": r["uploaded_at"] or now,
-            "category": r["category"],
-            "dataset": r["dataset"],
+            "id": (str(r["id"]) if str(r.get("id", "")) else uuid.uuid4().hex[:8]),
+            "filename": str(r["filename"]),
+            "path": "",  # local path unknown in HF flow
+            "hf_path": hf_uri,
+            "tags": str(r.get("tags", "")),
+            "keywords": str(r.get("keywords", "")),
+            "notes": str(r.get("notes", "")),
+            "uploaded_at": (str(r.get("uploaded_at", "")) or now),
+            "category": str(r.get("category", "")),
+            "dataset": str(r.get("dataset", "")),
        })
 
    if new_rows:
        db = pd.concat([db, pd.DataFrame(new_rows)], ignore_index=True)
-        _save_db(db)
-    else:
-        _save_db(db)  # still write back if we updated hf_path on existing rows
 
-    current_model = model_name
-    return refresh_view("", [], "", "", current_model) + (current_model, "HF")
+    _save_db(db)
+
+    # Use decoded model for downstream filtering
+    current_model = decoded
+    # outputs: [table, tag_filter, category_filter, dataset_filter, count_md, current_model]
+    return refresh_view("", [], "", "", current_model) + (current_model,)
+
+
+
+# allow user to paste either "wavcoch_audio-preds-sr=16000" or the URL-encoded "%3D" form
+
 
 # def sync_model(model_name: str):
 #     """
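
The functional change in this hunk is that sync_model now matches relpath prefixes against both the raw input and its URL-decoded form, so a model name pasted from a URL ("%3D" in place of "=") still resolves. A minimal, self-contained sketch of that matching logic; the index rows and model names below are made up for illustration, only the filter mirrors app.py:

import pandas as pd
from urllib.parse import unquote

# Hypothetical index rows; only the filtering logic below mirrors app.py.
idx = pd.DataFrame({"relpath": [
    "wavcoch_audio-preds-sr=16000/a.html",
    "wavcoch_audio-preds-sr=16000/b.html",
    "other_model/c.html",
]})

def filter_model(idx: pd.DataFrame, raw: str) -> pd.DataFrame:
    decoded = unquote(raw)  # "%3D" -> "=", a no-op if already decoded
    rel = idx["relpath"].astype(str)
    return idx[rel.str.startswith(f"{raw}/") | rel.str.startswith(f"{decoded}/")]

print(len(filter_model(idx, "wavcoch_audio-preds-sr=16000")))    # 2
print(len(filter_model(idx, "wavcoch_audio-preds-sr%3D16000")))  # 2 (decoded form matches)

Both spellings select the same rows, and sync_model then carries the decoded name forward as current_model for downstream filtering.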
sync_library_and_hf.py CHANGED
@@ -3,6 +3,7 @@
 import argparse, datetime, uuid, posixpath, sys, traceback
 from pathlib import Path
 from typing import List, Tuple
+from urllib.parse import unquote  # add at top
 
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, CommitOperationAdd
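
This diff only adds the unquote import here; its use site isn't shown. A plausible application, consistent with the app.py change, is normalizing a URL-encoded model name passed on the command line. The --model flag below is an assumption, not this script's actual interface:

import argparse
from urllib.parse import unquote

# Hypothetical CLI: the --model flag is an assumption, not this script's real interface.
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True, help="model prefix inside the HF dataset")
args = parser.parse_args()

model = unquote(args.model.strip())  # "wavcoch_audio-preds-sr%3D16000" -> "...sr=16000"
print(f"syncing prefix: {model}/")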
temp.py CHANGED
@@ -2,6 +2,8 @@
 import posixpath
 from pathlib import Path
 import pandas as pd
+from urllib.parse import unquote  # add at top
+
 from huggingface_hub import HfApi
 
 REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
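
For context on the hf:// URIs that sync_model writes into the local DB (hf://{HF_DATASET_REPO}/{relpath}): one way to resolve such a URI back to a cached local file is hf_hub_download. A sketch, assuming the URI always has the form hf://<owner>/<repo>/<relpath> and the repo is a dataset:

from huggingface_hub import hf_hub_download

def resolve_hf_uri(uri: str) -> str:
    """Download the file behind an 'hf://{owner}/{repo}/{relpath}' URI, return its local path."""
    assert uri.startswith("hf://"), uri
    owner, repo, relpath = uri[len("hf://"):].split("/", 2)
    return hf_hub_download(
        repo_id=f"{owner}/{repo}",
        filename=relpath,
        repo_type="dataset",  # audio-html is a dataset repo
    )

# e.g. resolve_hf_uri("hf://akazemian/audio-html/<model>/<file>.html")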