akazemian committed (verified)
Commit b3908ac · Parent(s): 7aabe91

Upload folder using huggingface_hub

Files changed (3):
  1. app.py +123 -120
  2. index.csv +0 -0
  3. temp.py +74 -0
app.py CHANGED
@@ -3,36 +3,34 @@ from pathlib import Path
 import html as _py_html
 import pandas as pd
 import gradio as gr
+from huggingface_hub import hf_hub_download
 
-# ----------- FIXED PATHS -----------
-REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio")  # /{model_name}/*.html
-FIXED_MANIFEST = Path("/data/atlask/BAU-Quant/manifest_val.csv")
-# -----------------------------------
+# ----------- HF DATASET CONFIG -----------
+HF_DATASET_REPO = "akazemian/audio-html"   # <-- change if needed
+INDEX_FILENAME = "index.csv"
+# -----------------------------------------
 
 DB_PATH = "library.csv"
 ALLOWED_EXTS = {".html"}
 
-# Columns in DB
-EXTRA_COLS = ["model_name", "category", "dataset"]
+# Columns in DB (no model_name)
+EXTRA_COLS = ["category", "dataset"]
 BASE_COLS = ["id","filename","path","tags","keywords","notes","uploaded_at"]
 ALL_DB_COLS = BASE_COLS + EXTRA_COLS
 
-# Columns shown in the table (order)
-TABLE_COLS = ["id","filename","model_name","category","dataset",
+# Columns shown in the table (no model_name)
+TABLE_COLS = ["id","filename","category","dataset",
               "tags","keywords","notes","uploaded_at"]
 
 # ---------- DB helpers ----------
 def _load_db() -> pd.DataFrame:
     if os.path.exists(DB_PATH):
         df = pd.read_csv(DB_PATH)
-        # migrate: ensure all required columns exist
         for c in ALL_DB_COLS:
             if c not in df.columns:
                 df[c] = ""
-        # normalize text-ish fields
-        for c in ["tags","keywords","notes","model_name","category","dataset"]:
+        for c in ["tags","keywords","notes","category","dataset"]:
             df[c] = df[c].fillna("").astype(str)
-        # keep only our known columns in stable order
         return df[ALL_DB_COLS]
     return pd.DataFrame(columns=ALL_DB_COLS)
@@ -61,103 +59,92 @@ def _df_from_table_value(table_value):
         return pd.DataFrame(table_value, columns=cols)
     return pd.DataFrame(columns=cols)
 
-# ---------- Manifest helpers ----------
-def _stem_for_match(p: Path) -> str:
-    stem = p.stem
-    if "chunk" in stem:
-        stem = stem.split("_chunk")[0]
-    return stem
-
-def _load_manifest():
-    if not FIXED_MANIFEST.exists():
-        return None
-    mf = pd.read_csv(FIXED_MANIFEST)
-    if "file_name" not in mf.columns:
-        return None
-    mf = mf.copy()
-    def mk_from_str(s: str):
-        st = Path(str(s)).stem
-        return st.split("_chunk")[0] if "chunk" in st else st
-    mf["__match_key"] = mf["file_name"].astype(str).apply(mk_from_str)
-    return mf
-
-# ---------- Sync by model ----------
+# ---------- Load HF index ----------
+def _load_hf_index() -> pd.DataFrame:
+    """
+    Download + read index.csv from the HF dataset repo.
+    Required columns: id, filename, relpath, category, dataset, tags, keywords, notes, uploaded_at
+    """
+    local = hf_hub_download(
+        repo_id=HF_DATASET_REPO,
+        repo_type="dataset",
+        filename=INDEX_FILENAME,
+    )
+    df = pd.read_csv(local)
+    for c in ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]:
+        if c not in df.columns:
+            df[c] = ""
+    # normalize types
+    for c in ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]:
+        df[c] = df[c].fillna("").astype(str)
+    return df
+
+# ---------- Sync by model (prefix inside HF dataset) ----------
 def sync_model(model_name: str):
-    f"""
-    Index all .html reports under {REPORTS_ROOT}/{model_name}.
-    Adds NEW files to DB (by exact path), sets model_name,
-    and fills category/dataset from the fixed manifest if present.
+    """
+    Load index.csv from HF, add rows for the selected model (by relpath prefix),
+    store HF URIs in DB, and show only that model’s files.
     """
     model_name = (model_name or "").strip()
     if not model_name:
-        return gr.Info("Please enter a model name."), None, None, None, None
+        return gr.Info("Please enter a model name."), None, None, None, None, ""
+
+    try:
+        idx = _load_hf_index()
+    except Exception as e:
+        traceback.print_exc()
+        return gr.Info(f"Failed to load index from HF: {e}"), None, None, None, None, ""
 
-    folder = REPORTS_ROOT / model_name
-    if not folder.exists():
-        return gr.Info(f"Folder not found: {folder}"), None, None, None, None
+    # rows like "{model_name}/.../file.html"
+    subset = idx[idx["relpath"].str.startswith(model_name + "/")]
+    if subset.empty:
+        return gr.Info(f"No HTML files found for model '{model_name}' on {HF_DATASET_REPO}"), None, None, None, None, ""
 
     df = _load_db()
-    manifest = _load_manifest()
     now = datetime.datetime.now().isoformat(timespec="seconds")
     new_rows = []
 
-    for p in sorted(folder.glob("*.html")):
-        if p.suffix.lower() not in ALLOWED_EXTS:
-            continue
-
-        # if already indexed, optionally backfill model_name and skip creating a new row
-        existing = df["path"] == str(p)
-        if existing.any():
-            idxs = df.index[existing]
-            for i in idxs:
-                if (df.at[i, "model_name"] or "") != model_name:
-                    df.at[i, "model_name"] = model_name
+    for _, r in subset.iterrows():
+        relpath = r["relpath"]
+        hub_uri = f"hf://{HF_DATASET_REPO}/{relpath}"
+        if (df["path"] == hub_uri).any():
             continue
-
-        category, dataset = "", ""
-        if manifest is not None:
-            mk = _stem_for_match(p)
-            hit = manifest[manifest["__match_key"].str.contains(mk, na=False)]
-            if not hit.empty:
-                if "audio_category" in hit.columns:
-                    category = str(hit.iloc[0]["audio_category"])
-                if "dataset" in hit.columns:
-                    dataset = str(hit.iloc[0]["dataset"])
-
-        uid = uuid.uuid4().hex[:8]
         new_rows.append({
-            "id": uid,
-            "filename": p.name,
-            "path": str(p),  # keep absolute path; no copying
-            "tags": "",
-            "keywords": "",
-            "notes": "",
-            "uploaded_at": now,
-            "model_name": model_name,
-            "category": category,
-            "dataset": dataset
+            "id": r["id"] if r["id"] else uuid.uuid4().hex[:8],
+            "filename": r["filename"],
+            "path": hub_uri,  # store HF URI
+            "tags": r["tags"],
+            "keywords": r["keywords"],
+            "notes": r["notes"],
+            "uploaded_at": r["uploaded_at"] or now,
+            "category": r["category"],
+            "dataset": r["dataset"]
         })
 
     if new_rows:
         df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
-
-    _save_db(df)
-    # show refreshed view scoped to this model
-    return refresh_view("", [], "", "", model_name)
+        _save_db(df)
 
+    current_model = model_name  # remember which model prefix is active
+    return refresh_view("", [], "", "", current_model) + (current_model,)
 
 # ---------- Search / filters ----------
-def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filter):
+def refresh_view(query, tag_filters, category_filter, dataset_filter, current_model):
     df = _load_db()
 
+    # scope to current model prefix in HF URI if provided
+    if current_model:
+        prefix = f"hf://{HF_DATASET_REPO}/{current_model}/"
+        df = df[df["path"].astype(str).str.startswith(prefix)]
+
     # tag vocabulary
     all_tags = sorted({t.strip()
                        for s in df["tags"].dropna().astype(str).tolist()
                        for t in s.split(",") if t.strip()})
     all_cats = sorted([c for c in df["category"].dropna().astype(str).unique() if c])
     all_sets = sorted([c for c in df["dataset"].dropna().astype(str).unique() if c])
-    all_models = sorted([c for c in df["model_name"].dropna().astype(str).unique() if c])
 
-    # free-text query across filename/tags/keywords/notes/category/dataset/model
+    # free-text query across filename/tags/keywords/notes/category/dataset
     if query:
         q = query.lower()
         mask = (
@@ -166,8 +153,7 @@ def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filt
             df["keywords"].str.lower().str.contains(q, na=False) |
             df["notes"].str.lower().str.contains(q, na=False) |
             df["category"].str.lower().str.contains(q, na=False) |
-            df["dataset"].str.lower().str.contains(q, na=False) |
-            df["model_name"].str.lower().str.contains(q, na=False)
+            df["dataset"].str.lower().str.contains(q, na=False)
         )
         df = df[mask]
 
@@ -181,8 +167,6 @@ def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filt
         df = df[df["category"] == category_filter]
     if dataset_filter:
         df = df[df["dataset"] == dataset_filter]
-    if model_filter:
-        df = df[df["model_name"] == model_filter]
 
     df = df.sort_values("uploaded_at", ascending=False).reset_index(drop=True)
     view = df[TABLE_COLS].copy()
@@ -193,7 +177,6 @@ def refresh_view(query, tag_filters, category_filter, dataset_filter, model_filt
         gr.update(choices=all_tags),
         gr.update(choices=[""] + all_cats, value=category_filter or ""),
         gr.update(choices=[""] + all_sets, value=dataset_filter or ""),
-        gr.update(choices=[""] + all_models, value=model_filter or ""),
         count_text
     )
 
@@ -233,12 +216,24 @@ def select_row(evt: gr.SelectData, table_value):
     if rec.empty:
         return "<em>Could not find file for this row.</em>", ""
 
-    path = rec["path"].values[0]
-    if not os.path.exists(path):
-        return f"<em>File not found:</em> <code>{_py_html.escape(path)}</code>", f"📄 {row['filename']}"
-
-    with open(path, "r", encoding="utf-8") as f:
-        raw_html = f.read()
+    path_str = rec["path"].values[0]
+
+    # Hub-backed path -> lazy download
+    if str(path_str).startswith("hf://"):
+        rest = str(path_str)[len("hf://"):]
+        org, name, relpath = rest.split("/", 2)  # repo id is "org/name": two segments
+        local_path = hf_hub_download(repo_id=f"{org}/{name}", repo_type="dataset", filename=relpath)
+        raw_html = Path(local_path).read_text(encoding="utf-8")
+    elif str(path_str).startswith("http"):
+        # if you ever swap to CDN URLs, iframe the URL directly
+        iframe = f'<iframe style="width:100%;height:720px;border:1px solid #ddd;border-radius:8px;" src="{_py_html.escape(path_str)}"></iframe>'
+        return iframe, f"📄 {row['filename']}"
+    else:
+        # local file fallback (not used for HF flow, kept for compatibility)
+        p = Path(path_str)
+        if not p.exists():
+            return f"<em>File not found:</em> <code>{_py_html.escape(str(p))}</code>", f"📄 {row['filename']}"
+        raw_html = p.read_text(encoding="utf-8")
 
     iframe = _iframe_from_html_string(raw_html, height_px=720)
     return iframe, f"📄 {row['filename']}"
@@ -247,11 +242,11 @@ def select_row(evt: gr.SelectData, table_value):
         return f"<pre>Failed to render (see terminal):\n{_py_html.escape(str(e))}</pre>", ""
 
 # ---------- Save edits ----------
-def save_edits(edited_table):
+def save_edits(edited_table, current_model):
     if edited_table is None or not len(edited_table):
         return gr.Info("Nothing to save.")
     df_db = _load_db()
-    editable_cols = ["model_name","category","dataset","tags","keywords","notes"]
+    editable_cols = ["category","dataset","tags","keywords","notes"]
     for c in editable_cols:
         edited_table[c] = edited_table[c].fillna("").astype(str)
     for _, row in edited_table.iterrows():
@@ -260,8 +255,8 @@ def save_edits(edited_table):
         for c in editable_cols:
             df_db.at[i[0], c] = row[c]
     _save_db(df_db)
-    # return refreshed table only
-    return refresh_view("", [], "", "", "")[0]
+    # return refreshed table only (respect current_model scope)
+    return refresh_view("", [], "", "", current_model)[0]
 
 # -------------------- UI --------------------
 # CSS that targets only the three buttons via elem_id
@@ -303,25 +298,24 @@ custom_css = """
 }
 """
 
-
 with gr.Blocks(title="Audio HTML Library", css=custom_css) as demo:
     gr.Markdown("## 🎧 Audio Reconstruction Reports — sync • search • view")
+    current_model = gr.State("")  # remembers active model prefix inside HF repo
 
    with gr.Row():
        with gr.Column(scale=1):
            # Choose model & sync
-            gr.Markdown(f"**Model folder:** `{REPORTS_ROOT}/model_name`")
+            gr.Markdown(f"**Model prefix on HF dataset:** `{HF_DATASET_REPO}/<model_name>/...`")
            model_in = gr.Textbox(label="Model name", placeholder="e.g., WavCochV8192")
-            sync_btn = gr.Button("Sync this model", elem_id="sync-btn")  # ⬅️ give id
+            sync_btn = gr.Button("Sync this model", elem_id="sync-btn")
 
            # Search & filters
            gr.Markdown("---\n**Search & filter**")
-            query = gr.Textbox(label="Keyword search (filename/tags/notes/category/dataset/model)", placeholder="type to search…")
+            query = gr.Textbox(label="Keyword search (filename/tags/notes/category/dataset)", placeholder="type to search…")
            tag_filter = gr.CheckboxGroup(choices=[], label="Filter by tags (AND)")
            category_filter = gr.Dropdown(choices=[], label="Category")
            dataset_filter = gr.Dropdown(choices=[], label="Dataset")
-            model_filter = gr.Dropdown(choices=[], label="Model")
-            refresh_btn = gr.Button("Refresh", elem_id="refresh-btn")  # ⬅️ give id
+            refresh_btn = gr.Button("Refresh", elem_id="refresh-btn")
 
        with gr.Column(scale=2):
            # Count of current view
@@ -336,31 +330,40 @@ with gr.Blocks(title="Audio HTML Library", css=custom_css) as demo:
                col_count=(len(TABLE_COLS), "fixed")
            )
            with gr.Row():
-                save_btn = gr.Button("Save Edits", elem_id="save-btn")  # ⬅️ give id
+                save_btn = gr.Button("Save Edits", elem_id="save-btn")
            preview_label = gr.Markdown("")
            preview_html = gr.HTML("")
 
-    # wiring: sync
-    sync_btn.click(sync_model, [model_in],
-                   [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    # wiring: sync (also sets current_model)
+    sync_btn.click(
+        sync_model,
+        [model_in],
+        [table, tag_filter, category_filter, dataset_filter, count_md, current_model]
+    )
 
-    # wiring: refresh + live filters
-    refresh_btn.click(refresh_view,
-                      [query, tag_filter, category_filter, dataset_filter, model_filter],
-                      [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    # wiring: refresh + live filters (respect current_model)
+    refresh_btn.click(
+        refresh_view,
+        [query, tag_filter, category_filter, dataset_filter, current_model],
+        [table, tag_filter, category_filter, dataset_filter, count_md]
+    )
 
-    for comp in (query, tag_filter, category_filter, dataset_filter, model_filter):
-        comp.change(refresh_view,
-                    [query, tag_filter, category_filter, dataset_filter, model_filter],
-                    [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    for comp in (query, tag_filter, category_filter, dataset_filter):
+        comp.change(
+            refresh_view,
+            [query, tag_filter, category_filter, dataset_filter, current_model],
+            [table, tag_filter, category_filter, dataset_filter, count_md]
+        )
 
    table.select(select_row, [table], [preview_html, preview_label])
-    save_btn.click(save_edits, [table], [table])
+    save_btn.click(save_edits, [table, current_model], [table])
 
-    # initial load
-    demo.load(refresh_view,
-              [query, tag_filter, category_filter, dataset_filter, model_filter],
-              [table, tag_filter, category_filter, dataset_filter, model_filter, count_md])
+    # initial load (no model yet)
+    demo.load(
+        refresh_view,
+        [query, tag_filter, category_filter, dataset_filter, current_model],
+        [table, tag_filter, category_filter, dataset_filter, count_md]
+    )
 
 if __name__ == "__main__":
-    demo.launch(share=True)  # auth is optional but recommended
+    demo.launch(share=True)  # auth optional
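
The subtle point in the new app.py flow is the `hf://` path scheme: `sync_model` writes URIs of the form `hf://{HF_DATASET_REPO}/{relpath}` into `library.csv`, and `select_row` must parse them back, remembering that a Hub repo id spans two path segments (`org/name`). A minimal sketch of that round-trip (helper names here are illustrative, not part of the commit):

```python
# Sketch of the hf:// URI round-trip used by app.py; helper names are hypothetical.
from pathlib import Path
from huggingface_hub import hf_hub_download

HF_DATASET_REPO = "akazemian/audio-html"

def to_hub_uri(relpath: str) -> str:
    # sync_model stores e.g. "hf://akazemian/audio-html/WavCochV8192/clip.html"
    return f"hf://{HF_DATASET_REPO}/{relpath}"

def from_hub_uri(uri: str) -> tuple[str, str]:
    rest = uri[len("hf://"):]
    org, name, relpath = rest.split("/", 2)  # repo id is the first two segments
    return f"{org}/{name}", relpath

def fetch_html(uri: str) -> str:
    # Mirrors select_row: resolve the URI, lazily download, read the HTML.
    repo_id, relpath = from_hub_uri(uri)
    local = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=relpath)
    return Path(local).read_text(encoding="utf-8")
```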
index.csv ADDED
The diff for this file is too large to render. See raw diff
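The file itself is too large to render, but its schema is pinned down by `_load_hf_index` in app.py and the builder in temp.py. A hypothetical one-row example (values invented for illustration):

```python
# Hypothetical index.csv row; the column set matches what _load_hf_index expects.
import pandas as pd

row = {
    "id": "a1b2c3d4",                     # 8-char uuid4 hex, as in app.py
    "filename": "clip.html",
    "relpath": "WavCochV8192/clip.html",  # "{model_name}/.../file.html"
    "category": "speech",                 # invented example value
    "dataset": "AudioSet",                # invented example value
    "tags": "",
    "keywords": "",
    "notes": "",
    "uploaded_at": "2025-01-01T00:00:00",
}
pd.DataFrame([row]).to_csv("index.csv", index=False)
```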
 
temp.py ADDED
@@ -0,0 +1,74 @@
+# upload_htmls_and_index.py
+import posixpath
+from pathlib import Path
+import pandas as pd
+from huggingface_hub import HfApi
+
+REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
+DATASET_REPO = "akazemian/audio-html"
+
+# --- replace ONLY your upload block with this (keep the rest of the file) ---
+from huggingface_hub import HfApi
+
+api = HfApi()
+REPORTS_ROOT = REPORTS_ROOT.resolve()  # your existing constant
+
+# Upload per model subfolder, but call upload_large_folder on the PARENT
+# (older huggingface_hub versions don't support path_in_repo)
+for sub in sorted([p for p in REPORTS_ROOT.iterdir() if p.is_dir()]):
+    model = sub.name
+    print(f"[HF] upload_large_folder: {REPORTS_ROOT} (include {model}/**/*.html) -> {DATASET_REPO}")
+    api.upload_large_folder(
+        repo_id=DATASET_REPO,
+        repo_type="dataset",
+        folder_path=str(REPORTS_ROOT),          # parent folder
+        allow_patterns=[f"{model}/**/*.html"],  # only this model's files
+    )
+    print(f"✓ uploaded {model}")
+# --- end replacement ---
+
+# (B) Build index.csv from your existing library.csv (no model_name)
+library = pd.read_csv("library.csv")
+
+def ensure_cols(df, cols):
+    for c in cols:
+        if c not in df.columns:
+            df[c] = ""
+    return df
+
+library = ensure_cols(
+    library,
+    ["id","filename","path","tags","keywords","notes","uploaded_at","category","dataset"]
+)
+
+def local_to_relpath(local_path: str) -> str:
+    # Make path relative to REPORTS_ROOT and normalize to POSIX for HF
+    rel = Path(local_path).resolve().relative_to(REPORTS_ROOT)
+    return posixpath.join(*rel.parts)
+
+# Only keep rows that actually point to .html files under REPORTS_ROOT
+keep = library["path"].astype(str).str.endswith(".html", na=False) & \
+       library["path"].astype(str).str.startswith(str(REPORTS_ROOT), na=False)
+idx = library[keep].copy()
+
+# Derive relpath inside the HF dataset from the absolute local path
+idx["relpath"] = idx["path"].apply(local_to_relpath)
+
+index_cols = ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]
+index_df = idx[index_cols].copy()
+index_df.to_csv("index.csv", index=False)
+
+# (C) Upload index.csv to the dataset repo (small, separate commit)
+from huggingface_hub import CommitOperationAdd
+api.create_commit(
+    repo_id=DATASET_REPO,
+    repo_type="dataset",
+    operations=[CommitOperationAdd(path_in_repo="index.csv", path_or_fileobj="index.csv")],
+    commit_message=f"Add/update index.csv ({len(index_df)} rows)"
+)
+
+print("Done: uploaded HTMLs (large-folder) and index.csv")
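
After temp.py runs, the upload and the index should agree; a small sanity check along these lines (a sketch, not part of the commit) can confirm that every indexed relpath exists in the dataset repo:

```python
# Sketch: verify every relpath in index.csv exists in the uploaded dataset repo.
import pandas as pd
from huggingface_hub import HfApi

api = HfApi()
repo_files = set(api.list_repo_files("akazemian/audio-html", repo_type="dataset"))
missing = [r for r in pd.read_csv("index.csv")["relpath"] if r not in repo_files]
print(f"{len(missing)} indexed file(s) missing from the repo")
```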