akazemian committed (verified)
Commit 4fe3afe · 1 Parent(s): e119d3b

Upload folder using huggingface_hub

Files changed (3):
  1. app.py +45 -33
  2. sync_library_and_hf.py +1 -0
  3. temp.py +2 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import html as _py_html
 import pandas as pd
 import gradio as gr
 from huggingface_hub import hf_hub_download
+from urllib.parse import unquote  # add at top
 
 # ----------- HF DATASET CONFIG -----------
 HF_DATASET_REPO = "akazemian/audio-html"  # <-- change if needed
@@ -91,67 +92,78 @@ def _load_hf_index() -> pd.DataFrame:
     return df
 
 # ---------- Sync by model (prefix inside HF dataset) ----------
+from urllib.parse import unquote  # ensure this import exists at top
+
 def sync_model(model_name: str):
-    model_name = (model_name or "").strip()
-    if not model_name:
+    raw = (model_name or "").strip()
+    if not raw:
         return gr.Info("Please enter a model name."), None, None, None, "", ""
 
-    # 1) read index from HF and filter to this model prefix
+    # 1) read index from HF and filter to this model prefix (accept raw or URL-decoded)
     try:
         idx = _load_hf_index()
     except Exception as e:
         traceback.print_exc()
         return gr.Info(f"Failed to load index from HF: {e}"), None, None, None, "", ""
 
-    sub = idx[idx["relpath"].astype(str).str.startswith(f"{model_name}/")]
+    decoded = unquote(raw)
+    rel = idx["relpath"].astype(str)
+    sub = idx[rel.str.startswith(f"{raw}/") | rel.str.startswith(f"{decoded}/")]
     if sub.empty:
-        return gr.Info(f"No HTML files found for model '{model_name}' on {HF_DATASET_REPO}"), None, None, None, "", ""
+        return gr.Info(
+            f"No HTML files found for model '{raw}'. "
+            "Tip: if you copied from the URL, use '=' instead of '%3D'."
+        ), None, None, None, "", ""
 
-    # 2) load local DB, backfill hf_path for existing rows of this model
+    # 2) load local DB, backfill hf_path for existing rows of this model (by filename)
    db = _load_db()
-
-    # any existing rows whose filename appears in this model → rewrite hf_path
    if not db.empty:
-        # Build map: filename -> relpath (works if filenames are unique per model)
        rel_by_fname = dict(zip(sub["filename"].astype(str), sub["relpath"].astype(str)))
-        mask_model_rows = db["filename"].isin(rel_by_fname.keys())
-        db.loc[mask_model_rows, "hf_path"] = db.loc[mask_model_rows, "filename"].map(
-            lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname.get(fn, fn)}"
-        )
+        mask_model_rows = db["filename"].astype(str).isin(rel_by_fname.keys())
+        if mask_model_rows.any():
+            db.loc[mask_model_rows, "hf_path"] = db.loc[mask_model_rows, "filename"].map(
+                lambda fn: f"hf://{HF_DATASET_REPO}/{rel_by_fname.get(str(fn), str(fn))}"
+            )
 
    # 3) add any missing rows from HF index
    now = datetime.datetime.now().isoformat(timespec="seconds")
-    new_rows = []
    existing_hf = set(db["hf_path"].astype(str))
+    new_rows = []
    for _, r in sub.iterrows():
-        hf_uri = f"hf://{HF_DATASET_REPO}/{r['relpath']}"
+        rp = str(r["relpath"])
+        hf_uri = f"hf://{HF_DATASET_REPO}/{rp}"
        if hf_uri in existing_hf:
            continue
-        # try to find by filename match in local path rows
-        if not db[db["filename"] == r["filename"]].empty:
-            # row exists, just set hf_path (done above), skip adding duplicate line
+        # If a row with same filename exists already, we updated its hf_path above; skip adding duplicate
+        if not db[db["filename"].astype(str) == str(r["filename"])].empty:
            continue
        new_rows.append({
-            "id": (r["id"] or uuid.uuid4().hex[:8]),
-            "filename": r["filename"],
-            "path": "",  # no local path known
-            "hf_path": hf_uri,  # <-- set hf path
-            "tags": r["tags"],
-            "keywords": r["keywords"],
-            "notes": r["notes"],
-            "uploaded_at": r["uploaded_at"] or now,
-            "category": r["category"],
-            "dataset": r["dataset"],
+            "id": (str(r["id"]) if str(r.get("id", "")) else uuid.uuid4().hex[:8]),
+            "filename": str(r["filename"]),
+            "path": "",  # local path unknown in HF flow
+            "hf_path": hf_uri,
+            "tags": str(r.get("tags", "")),
+            "keywords": str(r.get("keywords", "")),
+            "notes": str(r.get("notes", "")),
+            "uploaded_at": (str(r.get("uploaded_at", "")) or now),
+            "category": str(r.get("category", "")),
+            "dataset": str(r.get("dataset", "")),
        })
 
    if new_rows:
        db = pd.concat([db, pd.DataFrame(new_rows)], ignore_index=True)
-        _save_db(db)
-    else:
-        _save_db(db)  # still write back if we updated hf_path on existing rows
 
-    current_model = model_name
-    return refresh_view("", [], "", "", current_model) + (current_model, "HF")
+    _save_db(db)
+
+    # Use decoded model for downstream filtering
+    current_model = decoded
+    # outputs: [table, tag_filter, category_filter, dataset_filter, count_md, current_model]
+    return refresh_view("", [], "", "", current_model) + (current_model,)
+
+
+
+# allow user to paste either "wavcoch_audio-preds-sr=16000" or the URL-encoded "%3D" form
+
 
 # def sync_model(model_name: str):
 #     """
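
The functional change in this hunk is that sync_model now matches relpath prefixes against both the raw input and its URL-decoded form, so a model name pasted from a URL ("%3D" in place of "=") still resolves. A minimal, self-contained sketch of that matching logic; the index rows and model names below are made up for illustration, only the filter mirrors app.py:

import pandas as pd
from urllib.parse import unquote

# Hypothetical index rows; only the filtering logic below mirrors app.py.
idx = pd.DataFrame({"relpath": [
    "wavcoch_audio-preds-sr=16000/a.html",
    "wavcoch_audio-preds-sr=16000/b.html",
    "other_model/c.html",
]})

def filter_model(idx: pd.DataFrame, raw: str) -> pd.DataFrame:
    decoded = unquote(raw)  # "%3D" -> "=", a no-op if already decoded
    rel = idx["relpath"].astype(str)
    return idx[rel.str.startswith(f"{raw}/") | rel.str.startswith(f"{decoded}/")]

print(len(filter_model(idx, "wavcoch_audio-preds-sr=16000")))    # 2
print(len(filter_model(idx, "wavcoch_audio-preds-sr%3D16000")))  # 2 (decoded form matches)

Both spellings select the same rows, and sync_model then carries the decoded name forward as current_model for downstream filtering.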
sync_library_and_hf.py CHANGED
@@ -3,6 +3,7 @@
 import argparse, datetime, uuid, posixpath, sys, traceback
 from pathlib import Path
 from typing import List, Tuple
+from urllib.parse import unquote  # add at top
 
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, CommitOperationAdd
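
This diff only adds the unquote import here; its use site isn't shown. A plausible application, consistent with the app.py change, is normalizing a URL-encoded model name passed on the command line. The --model flag below is an assumption, not this script's actual interface:

import argparse
from urllib.parse import unquote

# Hypothetical CLI: the --model flag is an assumption, not this script's real interface.
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True, help="model prefix inside the HF dataset")
args = parser.parse_args()

model = unquote(args.model.strip())  # "wavcoch_audio-preds-sr%3D16000" -> "...sr=16000"
print(f"syncing prefix: {model}/")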
temp.py CHANGED
@@ -2,6 +2,8 @@
 import posixpath
 from pathlib import Path
 import pandas as pd
+from urllib.parse import unquote  # add at top
+
 from huggingface_hub import HfApi
 
 REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
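
For context on the hf:// URIs that sync_model writes into the local DB (hf://{HF_DATASET_REPO}/{relpath}): one way to resolve such a URI back to a cached local file is hf_hub_download. A sketch, assuming the URI always has the form hf://<owner>/<repo>/<relpath> and the repo is a dataset:

from huggingface_hub import hf_hub_download

def resolve_hf_uri(uri: str) -> str:
    """Download the file behind an 'hf://{owner}/{repo}/{relpath}' URI, return its local path."""
    assert uri.startswith("hf://"), uri
    owner, repo, relpath = uri[len("hf://"):].split("/", 2)
    return hf_hub_download(
        repo_id=f"{owner}/{repo}",
        filename=relpath,
        repo_type="dataset",  # audio-html is a dataset repo
    )

# e.g. resolve_hf_uri("hf://akazemian/audio-html/<model>/<file>.html")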