Upload folder using huggingface_hub

Files changed:
- app.py +45 -33
- sync_library_and_hf.py +1 -0
- temp.py +2 -0
app.py CHANGED

@@ -4,6 +4,7 @@ import html as _py_html
 import pandas as pd
 import gradio as gr
 from huggingface_hub import hf_hub_download
+from urllib.parse import unquote  # add at top
 
 # ----------- HF DATASET CONFIG -----------
 HF_DATASET_REPO = "akazemian/audio-html"  # <-- change if needed
@@ -91,67 +92,78 @@ def _load_hf_index() -> pd.DataFrame:
     return df
 
 # ---------- Sync by model (prefix inside HF dataset) ----------
+from urllib.parse import unquote  # ensure this import exists at top
+
 def sync_model(model_name: str):
-    if not model_name.strip():
+    raw = (model_name or "").strip()
+    if not raw:
         return gr.Info("Please enter a model name."), None, None, None, "", ""
 
-    # 1) read index from HF and filter to this model prefix
+    # 1) read index from HF and filter to this model prefix (accept raw or URL-decoded)
     try:
         idx = _load_hf_index()
     except Exception as e:
         traceback.print_exc()
         return gr.Info(f"Failed to load index from HF: {e}"), None, None, None, "", ""
 
-    sub = idx[idx["relpath"].astype(str).str.startswith(f"{model_name}/")]
+    decoded = unquote(raw)
+    rel = idx["relpath"].astype(str)
+    sub = idx[rel.str.startswith(f"{raw}/") | rel.str.startswith(f"{decoded}/")]
     if sub.empty:
-        return gr.Info(f"No HTML files found for model '{model_name}'."), None, None, None, "", ""
+        return gr.Info(
+            f"No HTML files found for model '{raw}'. "
+            "Tip: if you copied from the URL, use '=' instead of '%3D'."
+        ), None, None, None, "", ""
 
-    # 2) load local DB, backfill hf_path for existing rows of this model
+    # 2) load local DB, backfill hf_path for existing rows of this model (by filename)
     db = _load_db()
-
-    # any existing rows whose filename appears in this model → rewrite hf_path
     if not db.empty:
-        # Build map: filename -> relpath (works if filenames are unique per model)
         rel_by_fname = dict(zip(sub["filename"].astype(str), sub["relpath"].astype(str)))
-        mask_model_rows = db["filename"].isin(rel_by_fname.keys())
-        db.loc[mask_model_rows, "hf_path"] = db.loc[mask_model_rows, "filename"].map(
-            lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname[fn]}"
-        )
+        mask_model_rows = db["filename"].astype(str).isin(rel_by_fname.keys())
+        if mask_model_rows.any():
+            db.loc[mask_model_rows, "hf_path"] = db.loc[mask_model_rows, "filename"].map(
+                lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname.get(str(fn), str(fn))}"
+            )
 
     # 3) add any missing rows from HF index
     now = datetime.datetime.now().isoformat(timespec="seconds")
-    new_rows = []
     existing_hf = set(db["hf_path"].astype(str))
+    new_rows = []
     for _, r in sub.iterrows():
-        hf_uri = f"hf://{HF_DATASET_REPO}/{r['relpath']}"
+        rp = str(r["relpath"])
+        hf_uri = f"hf://{HF_DATASET_REPO}/{rp}"
         if hf_uri in existing_hf:
             continue
-        # row exists, just set hf_path (done above), skip adding duplicate line
-        if not db[db["filename"] == r["filename"]].empty:
+        # If a row with same filename exists already, we updated its hf_path above; skip adding duplicate
+        if not db[db["filename"].astype(str) == str(r["filename"])].empty:
             continue
         new_rows.append({
-            "id": (r["id"] if str(r.get("id", "")) else uuid.uuid4().hex[:8]),
-            "filename": r["filename"],
-            "path": "",  # local path unknown in HF flow
-            "hf_path": hf_uri,
-            "tags": r.get("tags", ""),
-            "keywords": r.get("keywords", ""),
-            "notes": r.get("notes", ""),
-            "uploaded_at": r.get("uploaded_at", "") or now,
-            "category": r.get("category", ""),
-            "dataset": r.get("dataset", ""),
+            "id": (str(r["id"]) if str(r.get("id", "")) else uuid.uuid4().hex[:8]),
+            "filename": str(r["filename"]),
+            "path": "",  # local path unknown in HF flow
+            "hf_path": hf_uri,
+            "tags": str(r.get("tags", "")),
+            "keywords": str(r.get("keywords", "")),
+            "notes": str(r.get("notes", "")),
+            "uploaded_at": (str(r.get("uploaded_at", "")) or now),
+            "category": str(r.get("category", "")),
+            "dataset": str(r.get("dataset", "")),
         })
 
     if new_rows:
         db = pd.concat([db, pd.DataFrame(new_rows)], ignore_index=True)
-        _save_db(db)
-    else:
-        _save_db(db)  # still write back if we updated hf_path on existing rows
 
-    return refresh_view("", [], "", "", model_name) + (model_name,)
+    _save_db(db)
+
+    # Use decoded model for downstream filtering
+    current_model = decoded
+    # outputs: [table, tag_filter, category_filter, dataset_filter, count_md, current_model]
+    return refresh_view("", [], "", "", current_model) + (current_model,)
+
+# allow user to paste either "wavcoch_audio-preds-sr=16000" or the URL-encoded "%3D" form
 
 # def sync_model(model_name: str):
 #     """
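Steps 2 and 3 are what make the sync idempotent: rows already in the local DB get hf_path rewritten in place, and only index entries with an unseen hf:// URI and filename are appended. A compact sketch of both guards on toy data (the DB rows and relpaths are hypothetical; the repo id is the one configured above):

import pandas as pd

HF_DATASET_REPO = "akazemian/audio-html"

# Hypothetical local DB: clip1 is already known but has no hf_path yet.
db = pd.DataFrame({"filename": ["clip1.html"], "hf_path": [""]})

# Hypothetical index slice for one model.
sub = pd.DataFrame({
    "filename": ["clip1.html", "clip2.html"],
    "relpath": ["m/clip1.html", "m/clip2.html"],
})

# Guard 1: backfill hf_path on existing rows whose filename appears in this model.
rel_by_fname = dict(zip(sub["filename"].astype(str), sub["relpath"].astype(str)))
mask = db["filename"].astype(str).isin(rel_by_fname.keys())
db.loc[mask, "hf_path"] = db.loc[mask, "filename"].map(
    lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname.get(str(fn), str(fn))}"
)

# Guard 2: append only rows whose hf:// URI and filename are both unseen.
existing_hf = set(db["hf_path"].astype(str))
new_rows = [
    {"filename": str(r["filename"]),
     "hf_path": f"hf://{HF_DATASET_REPO}/{r['relpath']}"}
    for _, r in sub.iterrows()
    if f"hf://{HF_DATASET_REPO}/{r['relpath']}" not in existing_hf
    and db[db["filename"].astype(str) == str(r["filename"])].empty
]
db = pd.concat([db, pd.DataFrame(new_rows)], ignore_index=True)
print(db)  # clip1 backfilled in place; clip2 appended once; rerunning adds nothing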
sync_library_and_hf.py CHANGED
@@ -3,6 +3,7 @@
 import argparse, datetime, uuid, posixpath, sys, traceback
 from pathlib import Path
 from typing import List, Tuple
+from urllib.parse import unquote  # add at top
 
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, CommitOperationAdd
temp.py CHANGED
@@ -2,6 +2,8 @@
 import posixpath
 from pathlib import Path
 import pandas as pd
+from urllib.parse import unquote  # add at top
+
 from huggingface_hub import HfApi
 
 REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
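
Since the DB now stores hf:// URIs instead of local paths, a viewer has to resolve them back to real files at display time. One way to do that with hf_hub_download, assuming the hf://<owner>/<name>/<relpath> layout written by sync_model above (resolve_hf_uri is a hypothetical helper for illustration, not a function from these files):

from huggingface_hub import hf_hub_download

def resolve_hf_uri(hf_uri: str) -> str:
    """Map 'hf://<owner>/<name>/<relpath>' to a locally cached file path."""
    assert hf_uri.startswith("hf://")
    owner, name, relpath = hf_uri[len("hf://"):].split("/", 2)  # relpath keeps its slashes
    return hf_hub_download(
        repo_id=f"{owner}/{name}",
        filename=relpath,
        repo_type="dataset",  # the HTML files live in a dataset repo
    )

# e.g. resolve_hf_uri("hf://akazemian/audio-html/<model>/<file>.html")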