# microbe-model/scripts/09_fetch_media_recipes.py
# Miyu Horiuchi
# Phase E scaffolding: MediaDive integration + strain↔medium links
# 3d34be9
"""Fetch full recipes for every medium referenced in data/strain_media.parquet.
Sequential, polite — MediaDive is a small public API. ~25 min for ~1,500 media at
0.3s/call. Outputs:
- data/media_metadata.parquet — one row per medium (name, pH range, source, etc.)
- data/media_recipes.parquet — one row per (medium_id, compound_id) recipe entry
"""
from __future__ import annotations
import json
import pandas as pd
from tqdm import tqdm
from microbe_model import config
from microbe_model.data.mediadive import MediaDiveClient, normalize_recipe
def _read_cached_payload(cache_path) -> dict | None:
    """Return the cached payload stored at ``cache_path``, or ``None`` on a miss.

    A missing file, a file that is not valid UTF-8 JSON (for example one
    truncated by an interrupted earlier run), and a file whose top-level value
    is not a JSON object all count as a miss, so the caller re-fetches instead
    of aborting part-way through a long run.

    Args:
        cache_path: ``pathlib.Path`` of the per-medium JSON cache file.
    """
    try:
        # json.dumps emits ASCII by default, so cache files written by earlier
        # runs (platform-default encoding) still decode correctly as UTF-8.
        cached = json.loads(cache_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        return None
    return cached if isinstance(cached, dict) else None


def _write_cache_atomic(cache_path, payload) -> None:
    """Serialize ``payload`` as JSON to ``cache_path`` without exposing a partial file.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    the target, so an interruption mid-write cannot leave a truncated cache entry.

    Args:
        cache_path: ``pathlib.Path`` of the per-medium JSON cache file.
        payload: JSON-serializable payload returned by the client.
    """
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_text(json.dumps(payload), encoding="utf-8")
    tmp_path.replace(cache_path)


def main() -> None:
    """Fetch the recipe of every medium in strain_media.parquet and write two tables.

    Reads the unique, non-null ``medium_id`` values from
    ``config.DATA / "strain_media.parquet"``, obtains one payload per medium
    (from the JSON cache under ``config.DATA / "mediadive"`` when available,
    otherwise via ``MediaDiveClient.fetch_medium``), and writes:

    - ``media_metadata.parquet``: one row per successfully loaded medium.
    - ``media_recipes.parquet``: the rows produced by ``normalize_recipe``.

    Media for which the client returns ``None`` are counted as failed and
    skipped. A short summary is printed at the end.

    Raises:
        SystemExit: if ``strain_media.parquet`` does not exist.
    """
    links_path = config.DATA / "strain_media.parquet"
    if not links_path.exists():
        raise SystemExit(
            f"Missing {links_path}. Run scripts/08_extract_strain_media.py first."
        )

    links = pd.read_parquet(links_path)
    medium_ids = sorted(links["medium_id"].dropna().unique().tolist())
    print(f"Fetching {len(medium_ids):,} unique medium recipes from MediaDive")

    cache_dir = config.DATA / "mediadive"
    cache_dir.mkdir(parents=True, exist_ok=True)

    client = MediaDiveClient()
    metadata_rows = []
    recipe_rows = []
    failed: list[str] = []

    for mid in tqdm(medium_ids, desc="MediaDive", unit="medium"):
        cache_path = cache_dir / f"{mid}.json"
        payload = _read_cached_payload(cache_path)
        if payload is None:
            # Cache miss (or unreadable cache entry): go to the API.
            payload = client.fetch_medium(mid)
            if payload is None:
                failed.append(str(mid))
                continue
            _write_cache_atomic(cache_path, payload)

        # "medium" may be absent or null in the payload; fall back to an empty
        # dict so the metadata row is still emitted with null fields.
        medium = payload.get("medium") or {}
        metadata_rows.append({
            "medium_id": str(mid),
            "name": medium.get("name"),
            "complex_medium": medium.get("complex_medium"),
            "min_pH": medium.get("min_pH"),
            "max_pH": medium.get("max_pH"),
            "source": medium.get("source"),
            "link": medium.get("link"),
            "n_solutions": len(payload.get("solutions") or []),
        })
        recipe_rows.extend(normalize_recipe(payload))

    md = pd.DataFrame(metadata_rows)
    rc = pd.DataFrame(recipe_rows)
    md.to_parquet(config.DATA / "media_metadata.parquet", index=False)
    rc.to_parquet(config.DATA / "media_recipes.parquet", index=False)

    print(f"\nWrote {len(md):,} media to media_metadata.parquet")
    print(f"Wrote {len(rc):,} compound-rows to media_recipes.parquet")
    print(f"Failed to fetch: {len(failed)}")

    # Without recipe rows there are no columns to aggregate, so skip the summary.
    if rc.empty:
        return

    print(f"\nUnique compounds: {rc['compound'].nunique():,}")
    print("Top 15 most-used compounds across all recipes:")
    top = (
        rc.groupby("compound")
        .agg(n_media=("medium_id", "nunique"), median_g_l=("g_l", "median"))
        .sort_values("n_media", ascending=False)
        .head(15)
    )
    print(top.to_string())
# Run the fetch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()