Spaces:
Running
Running
| """Fetch full recipes for every medium referenced in data/strain_media.parquet. | |
| Sequential, polite — MediaDive is a small public API. ~25 min for ~1,500 media at | |
| 0.3s/call. Outputs: | |
| - data/media_metadata.parquet — one row per medium (name, pH range, source, etc.) | |
| - data/media_recipes.parquet — one row per (medium_id, compound_id) recipe entry | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from microbe_model import config | |
| from microbe_model.data.mediadive import MediaDiveClient, normalize_recipe | |
def _load_or_fetch(client: MediaDiveClient, cache_path, mid) -> dict | None:
    """Return the MediaDive payload for one medium, preferring the disk cache.

    Args:
        client: Client whose ``fetch_medium(mid)`` is called on a cache miss.
        cache_path: Location of the cached JSON for this medium (expected to
            be a ``pathlib.Path``-like object, as produced by
            ``config.DATA / ...``).
        mid: Medium identifier passed through to ``client.fetch_medium``.

    Returns:
        The decoded payload, or ``None`` when ``fetch_medium`` returned
        ``None`` (nothing is cached in that case).
    """
    if cache_path.exists():
        try:
            return json.loads(cache_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            # A truncated/corrupt file (e.g. left by an interrupted run) must
            # not abort the whole sequential job; treat it as a cache miss.
            # tqdm.write keeps the progress bar intact while printing.
            tqdm.write(f"Ignoring unreadable cache file {cache_path}; re-fetching")

    payload = client.fetch_medium(mid)
    if payload is None:
        return None

    # Write to a sibling temp file and move it into place so an interruption
    # mid-write never leaves a half-written JSON under the final name.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_text(json.dumps(payload), encoding="utf-8")
    tmp_path.replace(cache_path)
    return payload


def _metadata_row(mid, payload: dict) -> dict:
    """Build the one-row-per-medium metadata record from a payload."""
    # "medium" may be missing or null in the payload; fall back to an empty
    # dict so every field simply becomes None.
    medium = payload.get("medium") or {}
    return {
        "medium_id": str(mid),
        "name": medium.get("name"),
        "complex_medium": medium.get("complex_medium"),
        "min_pH": medium.get("min_pH"),
        "max_pH": medium.get("max_pH"),
        "source": medium.get("source"),
        "link": medium.get("link"),
        "n_solutions": len(payload.get("solutions") or []),
    }


def main() -> None:
    """Fetch every referenced medium and write metadata/recipe parquet files.

    Reads the unique ``medium_id`` values from ``strain_media.parquet``,
    obtains each medium's payload (from the ``mediadive`` JSON cache when
    available, otherwise via ``MediaDiveClient``), and writes
    ``media_metadata.parquet`` and ``media_recipes.parquet`` under
    ``config.DATA``. A short summary is printed at the end.

    Raises:
        SystemExit: If ``strain_media.parquet`` does not exist.
    """
    links_path = config.DATA / "strain_media.parquet"
    if not links_path.exists():
        raise SystemExit(f"Missing {links_path}. Run scripts/08_extract_strain_media.py first.")

    links = pd.read_parquet(links_path)
    medium_ids = sorted(links["medium_id"].dropna().unique().tolist())
    print(f"Fetching {len(medium_ids):,} unique medium recipes from MediaDive")

    cache_dir = config.DATA / "mediadive"
    cache_dir.mkdir(parents=True, exist_ok=True)

    client = MediaDiveClient()
    metadata_rows = []
    recipe_rows = []
    failed: list[str] = []

    for mid in tqdm(medium_ids, desc="MediaDive", unit="medium"):
        payload = _load_or_fetch(client, cache_dir / f"{mid}.json", mid)
        if payload is None:
            # Record the failure and keep going; one missing medium should
            # not stop the remaining fetches.
            failed.append(mid)
            continue
        metadata_rows.append(_metadata_row(mid, payload))
        recipe_rows.extend(normalize_recipe(payload))

    md = pd.DataFrame(metadata_rows)
    rc = pd.DataFrame(recipe_rows)
    md.to_parquet(config.DATA / "media_metadata.parquet", index=False)
    rc.to_parquet(config.DATA / "media_recipes.parquet", index=False)

    print(f"\nWrote {len(md):,} media to media_metadata.parquet")
    print(f"Wrote {len(rc):,} compound-rows to media_recipes.parquet")
    print(f"Failed to fetch: {len(failed)}")

    # The compound summary below needs the recipe columns, which do not exist
    # on an empty frame.
    if rc.empty:
        return

    print(f"\nUnique compounds: {rc['compound'].nunique():,}")
    print("Top 15 most-used compounds across all recipes:")
    top = (
        rc.groupby("compound")
        .agg(n_media=("medium_id", "nunique"), median_g_l=("g_l", "median"))
        .sort_values("n_media", ascending=False)
        .head(15)
    )
    print(top.to_string())


if __name__ == "__main__":
    main()