Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi Claude Opus 4.7 (1M context) committed 29 days ago

Commit

3d34be9

1 Parent(s): 0fbea89

Phase E scaffolding: MediaDive integration + strain↔medium links

Building toward predicting cultivation media composition from genome (the
actual research deliverable, not just summary stats like T_opt).

src/microbe_model/data/mediadive.py:
- parse_strain_media_links: extracts (medium_id, growth) tuples from a cached
BacDive record (BacDive's `Culture and growth conditions → culture medium[]`
embeds inline links to MediaDive, no new API call needed)
- iter_bacdive_strain_media: walks the BacDive cache (100K records on disk)
- MediaDiveClient: polite REST client (0.3s rate limit) for /rest/medium/{id}
- normalize_recipe: flattens a /medium/{id} payload into per-compound rows

scripts/08_extract_strain_media.py: one-shot extraction from local cache.
Result: 38,649 links across 28,704 strains and 1,581 unique media (zero
network calls). All BacDive entries are growth=yes, so we have positive
labels only — negative cultivation results aren't recorded in this field.

scripts/09_fetch_media_recipes.py: fetches each unique medium recipe via
MediaDive REST. Currently running in background, ~14 min ETA. Caches each
medium's JSON to data/mediadive/{id}.json for resumability.

After this lands, we'll have:
- data/strain_media.parquet — strain↔medium adjacency
- data/media_recipes.parquet — per-(medium, compound) rows with amounts
- data/media_metadata.parquet — medium-level (name, pH range, source)

Top-used media are unsurprising: DSMZ Medium 65 (Streptomyces, 3789 strains),
Medium 92 (TSB, 3053), Medium 9 (Myxococcus, 2850). The long tail of niche media
provides the prediction signal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

scripts/08_extract_strain_media.py +37 -0
scripts/09_fetch_media_recipes.py +81 -0
src/microbe_model/data/mediadive.py +148 -0

scripts/08_extract_strain_media.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""Build data/strain_media.parquet by walking the BacDive cache.
+No network calls — pure local extraction from cached records.
+Output: one row per (bacdive_id, medium_id) link.
+"""
+from __future__ import annotations
+import pandas as pd
+from tqdm import tqdm
+from microbe_model import config
+from microbe_model.data.mediadive import iter_bacdive_strain_media
+def main() -> None:
+    """Extract every strain↔medium link from the BacDive cache into a parquet file.
+    Writes ``strain_media.parquet`` under ``config.DATA`` (one row per link) and
+    prints summary counts plus the ten most-used media.
+    """
+    n_files = sum(1 for _ in config.BACDIVE_DIR.glob("*.json"))
+    print(f"Walking {n_files:,} BacDive cached records...")
+    # The iterator yields links, not files, and one record can contribute zero or
+    # several links, so the file count is not a valid progress total; let tqdm
+    # show a running count instead.
+    rows = list(tqdm(iter_bacdive_strain_media(), unit="link"))
+    # Naming the columns keeps the schema, and the summaries below, valid even
+    # when the cache yields no links at all.
+    df = pd.DataFrame(rows, columns=["bacdive_id", "medium_id", "medium_name", "growth"])
+    out = config.DATA / "strain_media.parquet"
+    df.to_parquet(out, index=False)
+    print(f"\nWrote {len(df):,} strain↔medium links to {out}")
+    print(f"  unique strains:  {df['bacdive_id'].nunique():,}")
+    print(f"  unique media:    {df['medium_id'].nunique():,}")
+    growth = df["growth"]
+    print(f"  growth=yes:      {(growth == 'yes').sum():,}")
+    print(f"  growth=no:       {(growth == 'no').sum():,}")
+    print(f"  growth=weak:     {(growth == 'weak').sum():,}")
+    print("\nTop 10 most-used media:")
+    print(df.groupby(['medium_id', 'medium_name']).size().sort_values(ascending=False).head(10))
+if __name__ == "__main__":
+    main()

scripts/09_fetch_media_recipes.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Fetch full recipes for every medium referenced in data/strain_media.parquet.
+Sequential, polite — MediaDive is a small public API. ~25 min for ~1,500 media at
+0.3s/call. Outputs:
+  - data/media_metadata.parquet — one row per medium (name, pH range, source, etc.)
+  - data/media_recipes.parquet  — one row per (medium_id, compound_id) recipe entry
+"""
+from __future__ import annotations
+import json
+import pandas as pd
+from tqdm import tqdm
+from microbe_model import config
+from microbe_model.data.mediadive import MediaDiveClient, normalize_recipe
+def _read_cached_payload(cache_path) -> dict | None:
+    """Return the medium payload cached at *cache_path*, or None if absent or unusable."""
+    try:
+        payload = json.loads(cache_path.read_text(encoding="utf-8"))
+    except (FileNotFoundError, ValueError):
+        # ValueError covers both undecodable bytes and truncated/invalid JSON,
+        # e.g. a file left behind by an interrupted run; treat it as a cache miss.
+        return None
+    return payload if isinstance(payload, dict) else None
+def main() -> None:
+    """Fetch (or load from cache) every referenced medium and write the two parquet tables.
+    Reads the medium ids from ``strain_media.parquet``, caches each fetched payload
+    as ``mediadive/{id}.json`` under ``config.DATA``, then writes
+    ``media_metadata.parquet`` and ``media_recipes.parquet`` and prints a summary.
+    Raises:
+        SystemExit: if ``strain_media.parquet`` has not been produced yet.
+    """
+    links_path = config.DATA / "strain_media.parquet"
+    if not links_path.exists():
+        raise SystemExit(f"Missing {links_path}. Run scripts/08_extract_strain_media.py first.")
+    links = pd.read_parquet(links_path)
+    medium_ids = sorted(links["medium_id"].dropna().unique().tolist())
+    print(f"Fetching {len(medium_ids):,} unique medium recipes from MediaDive")
+    cache_dir = config.DATA / "mediadive"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    client = MediaDiveClient()
+    metadata_rows = []
+    recipe_rows = []
+    failed: list[str] = []
+    for mid in tqdm(medium_ids, desc="MediaDive", unit="medium"):
+        cache_path = cache_dir / f"{mid}.json"
+        payload = _read_cached_payload(cache_path)
+        if payload is None:
+            payload = client.fetch_medium(mid)
+            if not isinstance(payload, dict):
+                failed.append(mid)
+                continue
+            # Write to a sibling temp file and rename, so an interruption can
+            # never leave a half-written cache entry under the final name.
+            tmp_path = cache_path.with_name(cache_path.name + ".tmp")
+            tmp_path.write_text(json.dumps(payload), encoding="utf-8")
+            tmp_path.replace(cache_path)
+        medium = payload.get("medium") or {}
+        metadata_rows.append({
+            "medium_id": str(mid),
+            "name": medium.get("name"),
+            "complex_medium": medium.get("complex_medium"),
+            "min_pH": medium.get("min_pH"),
+            "max_pH": medium.get("max_pH"),
+            "source": medium.get("source"),
+            "link": medium.get("link"),
+            "n_solutions": len(payload.get("solutions") or []),
+        })
+        recipe_rows.extend(normalize_recipe(payload))
+    md = pd.DataFrame(metadata_rows)
+    rc = pd.DataFrame(recipe_rows)
+    md.to_parquet(config.DATA / "media_metadata.parquet", index=False)
+    rc.to_parquet(config.DATA / "media_recipes.parquet", index=False)
+    print(f"\nWrote {len(md):,} media to media_metadata.parquet")
+    print(f"Wrote {len(rc):,} compound-rows to media_recipes.parquet")
+    print(f"Failed to fetch: {len(failed)}")
+    if failed:
+        # Name the ids so a rerun or manual check can target them.
+        print(f"  failed ids: {', '.join(str(mid) for mid in failed)}")
+    if rc.empty:
+        return
+    print(f"\nUnique compounds: {rc['compound'].nunique():,}")
+    print("Top 15 most-used compounds across all recipes:")
+    top = (rc.groupby("compound")
+            .agg(n_media=("medium_id", "nunique"), median_g_l=("g_l", "median"))
+            .sort_values("n_media", ascending=False)
+            .head(15))
+    print(top.to_string())
+if __name__ == "__main__":
+    main()

src/microbe_model/data/mediadive.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""MediaDive (DSMZ) integration — strain↔medium links and full recipes.
+The BacDive v2 records we already cached include inline medium links of the form
+``https://mediadive.dsmz.de/medium/{id}`` plus a `growth: yes/no` flag. So extracting
+strain↔medium pairs needs no new API calls. The medium *recipes* (compound list
+with amounts) do need network access via MediaDive's REST API.
+API documentation observed live on 2026-04-27:
+  - /rest/medium/{id}   → full recipe with solutions[].recipe[] (compound + amount + unit + g_l)
+  - /rest/media         → paginated list of all media (limit + offset)
+  - /rest/medium-strains/{id} → strains linked to a medium (with bacdive_id)
+"""
+from __future__ import annotations
+import json
+import re
+import time
+from collections.abc import Iterator
+from pathlib import Path
+from typing import Any
+import requests
+from microbe_model import config
+# Root of the MediaDive REST API; request paths such as "/medium/{id}" are appended to it.
+BASE_URL = "https://mediadive.dsmz.de/rest"
+# Default pause, in seconds, that the client sleeps before each request.
+RATE_LIMIT_S = 0.3  # be polite to a small public API
+def _extract_medium_id(link: str | None) -> str | None:
+    """Pull the alphanumeric id that follows ``/medium/`` out of *link*; None if there is none."""
+    if link:
+        found = re.search(r"/medium/([A-Za-z0-9]+)", link)
+        if found is not None:
+            return found.group(1)
+    return None
+def parse_strain_media_links(record: dict[str, Any]) -> list[dict[str, Any]]:
+    """Collect one {medium_id, medium_name, growth} dict per linked medium in a BacDive record.
+    Media whose ``link`` yields no medium id, and entries that are not dicts, are left out.
+    """
+    section = record.get("Culture and growth conditions") or {}
+    entries = section.get("culture medium") or []
+    # A lone medium may arrive as a bare dict instead of a list; normalise it.
+    if isinstance(entries, dict):
+        entries = [entries]
+    links: list[dict[str, Any]] = []
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        medium_id = _extract_medium_id(entry.get("link"))
+        if medium_id:
+            raw_growth = entry.get("growth") or ""
+            links.append({
+                "medium_id": str(medium_id),
+                "medium_name": entry.get("name"),
+                # "yes", "no", "weak", or "" when the record gives no flag
+                "growth": raw_growth.strip().lower(),
+            })
+    return links
+def iter_bacdive_strain_media(cache_dir: Path | None = None) -> Iterator[dict[str, Any]]:
+    """Walk the BacDive cache and yield {bacdive_id, medium_id, medium_name, growth} rows.
+    *cache_dir* defaults to ``config.BACDIVE_DIR``. Each ``*.json`` file is one
+    record whose filename stem is the integer BacDive id. Files with a
+    non-integer stem, undecodable or invalid JSON, or a non-object top level
+    are skipped.
+    """
+    if cache_dir is None:
+        cache_dir = config.BACDIVE_DIR
+    # Sorted so the row order (and any file written from it) is reproducible;
+    # glob order is otherwise arbitrary.
+    for path in sorted(cache_dir.glob("*.json")):
+        # Check the filename first: it is far cheaper than parsing the file.
+        try:
+            bid = int(path.stem)
+        except ValueError:
+            continue
+        try:
+            # JSON is UTF-8; do not depend on the platform's locale encoding.
+            record = json.loads(path.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            continue
+        if not isinstance(record, dict):
+            continue
+        for link in parse_strain_media_links(record):
+            yield {
+                "bacdive_id": bid,
+                "medium_id": link["medium_id"],
+                "medium_name": link["medium_name"],
+                "growth": link["growth"],
+            }
+class MediaDiveClient:
+    """Polite REST client for MediaDive — 0.3s sleep between calls by default."""
+    # Statuses treated as transient and worth another attempt.
+    _RETRY_STATUSES = (429, 502, 503)
+    _MAX_ATTEMPTS = 3
+    def __init__(self, *, rate_limit_s: float = RATE_LIMIT_S) -> None:
+        self.session = requests.Session()
+        self.rate_limit_s = rate_limit_s
+    def _get(self, path: str, params: dict | None = None) -> dict[str, Any]:
+        """GET ``BASE_URL + path`` and return the decoded JSON body.
+        Sleeps ``rate_limit_s`` before the call, then makes up to three attempts,
+        backing off 1s and 2s after a transport error or a transient status
+        (429/502/503).
+        Raises:
+            requests.HTTPError: for an error status — at once when it is not
+                transient, or after the last attempt when it is.
+            requests.RequestException: when the last attempt fails in transport.
+        """
+        time.sleep(self.rate_limit_s)
+        url = f"{BASE_URL}{path}"
+        for attempt in range(self._MAX_ATTEMPTS):
+            can_retry = attempt < self._MAX_ATTEMPTS - 1
+            try:
+                resp = self.session.get(url, params=params, timeout=30)
+            except requests.RequestException:
+                if not can_retry:
+                    raise
+                time.sleep(2 ** attempt)
+                continue
+            if can_retry and resp.status_code in self._RETRY_STATUSES:
+                time.sleep(2 ** attempt)
+                continue
+            # Kept outside the try block so a permanent error such as 404 is
+            # raised at once instead of being caught and retried. On the last
+            # attempt this also surfaces a still-transient status as an error
+            # rather than returning an empty body that reads as a valid answer.
+            resp.raise_for_status()
+            return resp.json()
+        raise AssertionError("unreachable: the final attempt always returns or raises")
+    def fetch_medium(self, medium_id: str) -> dict[str, Any] | None:
+        """Return the full medium record, or None if not found / malformed."""
+        try:
+            body = self._get(f"/medium/{medium_id}")
+        except requests.HTTPError:
+            return None
+        # A non-object body has no status/data envelope to inspect.
+        if not isinstance(body, dict) or body.get("status") != 200:
+            return None
+        return body.get("data") or None
+    def list_media(self, *, limit: int = 200, offset: int = 0) -> list[dict[str, Any]]:
+        """Return the ``data`` list of one /media page, or [] when the body has none."""
+        body = self._get("/media", params={"limit": limit, "offset": offset})
+        return body.get("data") or []
+def normalize_recipe(medium_payload: dict[str, Any]) -> list[dict[str, Any]]:
+    """Flatten a /medium/{id} payload into per-compound rows.
+    Each row: {medium_id, solution_name, compound_id, compound, amount, unit, g_l,
+    optional, condition}. Entries without a compound name are skipped; entries
+    that lack amount / g_l are kept, with None in those fields.
+    """
+    medium = medium_payload.get("medium") or {}
+    raw_id = medium.get("id")
+    # A null id must become "", not the literal string "None".
+    medium_id = "" if raw_id is None else str(raw_id)
+    rows: list[dict[str, Any]] = []
+    for solution in medium_payload.get("solutions") or []:
+        # Tolerate malformed solution entries the same way recipe items are.
+        if not isinstance(solution, dict):
+            continue
+        sol_name = solution.get("name", "")
+        for r in solution.get("recipe") or []:
+            if not isinstance(r, dict):
+                continue
+            compound = r.get("compound")
+            if not compound:
+                continue
+            rows.append({
+                "medium_id": medium_id,
+                "solution_name": sol_name,
+                "compound_id": r.get("compound_id"),
+                "compound": compound,
+                "amount": r.get("amount"),
+                "unit": r.get("unit"),
+                "g_l": r.get("g_l"),
+                # A missing or null flag counts as "not optional".
+                "optional": int(r.get("optional", 0) or 0),
+                "condition": r.get("condition"),
+            })
+    return rows