Miyu Horiuchi Claude Opus 4.7 (1M context) committed on
Commit
6c30d74
·
1 Parent(s): 52cf5ab

Rewrite BacDive client for v2 public API (no auth required)

Browse files

BacDive removed registration/Keycloak auth in February 2026 — the v2 REST API
is now fully public. This rewrite:

- Drops OAuth, BACDIVE_USER, BACDIVE_PASSWORD, all Keycloak code
- Uses the new /v2/fetch/{ids} batched endpoint (up to 100 IDs/call,
missing IDs silently dropped)
- Discovers the full BacDive corpus by scanning [1, 200000] in batches —
~2000 calls, ~30 min wall time, no pagination contracts
- Fixes the phenotype extractor to match the real v2 schema:
Sequence information → Genome sequences[].INSDC accession
Culture and growth conditions → culture temp[] (type ∈ {growth, optimum, range})
Properly derives optimum from explicit "optimum" entries first,
falling back to median of positive-growth entries
- Adds 4 schema-aware tests (all passing) using a real /v2/fetch/24493 fixture

Live-tested: scanned 103 real records in ID range 24490-24600 and confirmed
all five prediction targets (T_opt, pH_opt, oxygen, salt, genome accession)
populate correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

.env.example CHANGED
@@ -1,7 +1,5 @@
1
- # BacDive API credentials — register at https://bacdive.dsmz.de/
2
- BACDIVE_USER=
3
- BACDIVE_PASSWORD=
4
-
5
- # NCBI API key — optional, raises rate limit from 3 req/s to 10 req/s
6
  # Get one at https://www.ncbi.nlm.nih.gov/account/settings/
7
  NCBI_API_KEY=
 
 
 
1
+ # NCBI API key — optional, raises rate limit from 3 req/s to 10 req/s.
 
 
 
 
2
  # Get one at https://www.ncbi.nlm.nih.gov/account/settings/
3
  NCBI_API_KEY=
4
+
5
+ # (BacDive no longer requires registration as of February 2026 — the v2 API is public.)
README.md CHANGED
@@ -44,8 +44,9 @@ uv sync --all-extras
44
 
45
  ```bash
46
  # 1. Pull strain metadata + phenotype labels from BacDive
47
- # (requires BACDIVE_USER and BACDIVE_PASSWORD env vars — register at bacdive.dsmz.de)
48
- uv run python scripts/01_fetch_bacdive.py --limit 1000
 
49
 
50
  # 2. Download genomes for strains that have an accession
51
  uv run python scripts/02_fetch_genomes.py
@@ -89,5 +90,6 @@ These are deliberate v0 boundaries. See the project notes for the longer-term pl
89
 
90
  Copy `.env.example` to `.env` and fill in:
91
 
92
- - `BACDIVE_USER`, `BACDIVE_PASSWORD` — required for BacDive API access (free registration).
93
  - `NCBI_API_KEY` — optional, raises NCBI rate limit from 3 req/s to 10 req/s.
 
 
 
44
 
45
  ```bash
46
  # 1. Pull strain metadata + phenotype labels from BacDive
47
+ # (BacDive v2 API is public as of Feb 2026 — no registration needed)
48
+ uv run python scripts/01_fetch_bacdive.py --end 5000 # smoke test, ~5 min
49
+ # uv run python scripts/01_fetch_bacdive.py --end 200000 # full BacDive, ~30 min
50
 
51
  # 2. Download genomes for strains that have an accession
52
  uv run python scripts/02_fetch_genomes.py
 
90
 
91
  Copy `.env.example` to `.env` and fill in:
92
 
 
93
  - `NCBI_API_KEY` — optional, raises NCBI rate limit from 3 req/s to 10 req/s.
94
+
95
+ (BacDive's v2 API was opened to the public in February 2026 — no registration or token needed.)
scripts/01_fetch_bacdive.py CHANGED
@@ -1,10 +1,15 @@
1
- """Pull strain metadata + phenotype labels from BacDive.
2
 
3
- Writes one JSON per strain to data/bacdive/, plus a consolidated parquet table at
4
- data/bacdive_phenotypes.parquet.
 
5
 
6
  Usage:
7
- uv run python scripts/01_fetch_bacdive.py --limit 1000
 
 
 
 
8
  """
9
  from __future__ import annotations
10
 
@@ -15,31 +20,49 @@ from tqdm import tqdm
15
 
16
  from microbe_model import config
17
  from microbe_model.data.bacdive import (
 
 
18
  BacDiveClient,
 
19
  extract_phenotypes,
20
- fetch_with_cache,
21
  )
22
 
23
 
24
  def main() -> None:
25
  parser = argparse.ArgumentParser()
26
- parser.add_argument("--limit", type=int, default=1000, help="Max strains to fetch (None=all).")
 
 
 
27
  args = parser.parse_args()
28
 
29
  client = BacDiveClient()
30
  rows = []
31
- for bacdive_id in tqdm(client.iter_strain_ids(limit=args.limit), desc="BacDive", unit="strain"):
32
- record = fetch_with_cache(client, bacdive_id)
33
- rows.append(extract_phenotypes(record))
 
 
 
 
 
 
 
 
34
 
35
  df = pd.DataFrame(rows)
36
  out = config.DATA / "bacdive_phenotypes.parquet"
37
  df.to_parquet(out, index=False)
38
- print(f"\nWrote {len(df)} rows to {out}")
 
39
  print("Coverage of prediction targets:")
40
  for col in ("optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"):
41
- print(f" {col}: {df[col].notna().sum()} / {len(df)}")
42
- print(f" genome_accession: {df['genome_accession'].notna().sum()} / {len(df)}")
 
 
 
 
43
 
44
 
45
  if __name__ == "__main__":
 
1
+ """Scan BacDive and write strain phenotype labels to data/bacdive_phenotypes.parquet.
2
 
3
+ Uses the v2 public API (no auth). Discovers strain IDs by batch-scanning the
4
+ integer ID range — missing IDs are silently dropped server-side, so the scan
5
+ is complete in one pass over [start, end].
6
 
7
  Usage:
8
+ # Phase 1 smoke test — scan the first ~5K IDs (returns ~3-4K real records)
9
+ uv run python scripts/01_fetch_bacdive.py --end 5000
10
+
11
+ # Full BacDive (~150K live records, ~30 min wall time)
12
+ uv run python scripts/01_fetch_bacdive.py --end 200000
13
  """
14
  from __future__ import annotations
15
 
 
20
 
21
  from microbe_model import config
22
  from microbe_model.data.bacdive import (
23
+ BATCH_SIZE,
24
+ DEFAULT_MAX_ID,
25
  BacDiveClient,
26
+ cache_record,
27
  extract_phenotypes,
 
28
  )
29
 
30
 
def main() -> None:
    """Scan a BacDive ID range and write the phenotype table to parquet.

    Command-line flags:
        --start / --end: inclusive integer ID range to scan.
        --no-cache: do not write the per-strain JSON files.

    Every record yielded by ``BacDiveClient.iter_records`` is optionally cached,
    reduced with ``extract_phenotypes``, and collected into one DataFrame that is
    written to ``config.DATA / "bacdive_phenotypes.parquet"``. A coverage summary
    of the prediction targets is printed afterwards.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--start", type=int, default=1)
    parser.add_argument("--end", type=int, default=DEFAULT_MAX_ID)
    parser.add_argument("--no-cache", action="store_true",
                        help="Skip writing per-strain JSON to disk (saves ~150K small files).")
    args = parser.parse_args()
    if args.end < args.start:
        # An inverted range would scan nothing and give a zero/negative bar total.
        parser.error(f"--end ({args.end}) must be >= --start ({args.start})")

    client = BacDiveClient()
    rows = []
    n_batches = (args.end - args.start) // BATCH_SIZE + 1

    with tqdm(total=n_batches, desc="BacDive batches", unit="batch") as bar:
        for bacdive_id, record in client.iter_records(start=args.start, end=args.end):
            if not args.no_cache:
                cache_record(bacdive_id, record)
            rows.append(extract_phenotypes(record))
            # Seeing a record from batch k proves batches 0..k-1 are finished.
            # Derive k from the ID's offset instead of waiting for an ID that is
            # a multiple of BATCH_SIZE, which may simply not exist.
            finished = (bacdive_id - args.start) // BATCH_SIZE
            if finished > bar.n:
                bar.update(finished - bar.n)
        bar.update(n_batches - bar.n)  # finalize

    df = pd.DataFrame(rows)
    out = config.DATA / "bacdive_phenotypes.parquet"
    df.to_parquet(out, index=False)

    print(f"\nWrote {len(df)} strains to {out}")
    if df.empty:
        # With no rows the frame has no columns, so the summary below would
        # fail with KeyError on the first column lookup.
        print("No records found in the scanned range; skipping coverage summary.")
        return

    total = len(df)
    print("Coverage of prediction targets:")
    for col in ("optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"):
        n = df[col].notna().sum()
        print(f" {col:30s} {n:>6d} / {len(df)} ({100 * n / max(1, total):.1f}%)")
    n_genome = df["genome_accession"].notna().sum()
    print(f" genome_accession {n_genome:>6d} / {len(df)} ({100 * n_genome / max(1, total):.1f}%)")
    n_both = df[df["genome_accession"].notna() & df["optimal_temperature_c"].notna()].shape[0]
    print(f"\n genome + T_opt (training-ready) {n_both:>4d} strains")
66
 
67
 
68
  if __name__ == "__main__":
src/microbe_model/config.py CHANGED
@@ -19,8 +19,6 @@ FEATURE_DIR = DATA / "features"
19
  for _d in (DATA, ARTIFACTS, BACDIVE_DIR, GENOME_DIR, FEATURE_DIR):
20
  _d.mkdir(parents=True, exist_ok=True)
21
 
22
- BACDIVE_USER = os.environ.get("BACDIVE_USER")
23
- BACDIVE_PASSWORD = os.environ.get("BACDIVE_PASSWORD")
24
  NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
25
 
26
  PHENOTYPE_TARGETS = {
 
19
  for _d in (DATA, ARTIFACTS, BACDIVE_DIR, GENOME_DIR, FEATURE_DIR):
20
  _d.mkdir(parents=True, exist_ok=True)
21
 
 
 
22
  NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
23
 
24
  PHENOTYPE_TARGETS = {
src/microbe_model/data/bacdive.py CHANGED
@@ -1,13 +1,12 @@
1
- """BacDive REST API client.
2
 
3
- BacDive (https://bacdive.dsmz.de/) is the largest curated database of bacterial phenotypes.
4
- Free registration is required; credentials are read from BACDIVE_USER / BACDIVE_PASSWORD.
5
 
6
- This client does the minimum needed for v0:
7
- - log in and obtain an OAuth token
8
- - paginate through the strain catalog
9
- - fetch full records by BacDive ID
10
- - extract the phenotype targets we predict (T_opt, pH_opt, oxygen, salt)
11
  """
12
  from __future__ import annotations
13
 
@@ -21,134 +20,107 @@ import requests
21
 
22
  from microbe_model import config
23
 
24
- BASE_URL = "https://api.bacdive.dsmz.de"
25
- TOKEN_URL = "https://sso.dsmz.de/auth/realms/dsmz/protocol/openid-connect/token"
26
-
27
-
28
- class BacDiveAuthError(RuntimeError):
29
- pass
30
 
31
 
32
  class BacDiveClient:
33
- def __init__(self, user: str | None = None, password: str | None = None) -> None:
34
- self.user = user or config.BACDIVE_USER
35
- self.password = password or config.BACDIVE_PASSWORD
36
- if not self.user or not self.password:
37
- raise BacDiveAuthError(
38
- "Set BACDIVE_USER and BACDIVE_PASSWORD in .env (register at bacdive.dsmz.de)."
39
- )
40
- self._token: str | None = None
41
- self._token_expires_at: float = 0.0
42
  self._session = requests.Session()
43
-
44
- def _refresh_token(self) -> None:
45
- resp = self._session.post(
46
- TOKEN_URL,
47
- data={
48
- "grant_type": "password",
49
- "client_id": "api.bacdive.public",
50
- "username": self.user,
51
- "password": self.password,
52
- },
53
- timeout=30,
54
- )
55
- if resp.status_code != 200:
56
- raise BacDiveAuthError(f"BacDive auth failed: {resp.status_code} {resp.text}")
57
- body = resp.json()
58
- self._token = body["access_token"]
59
- self._token_expires_at = time.time() + body.get("expires_in", 300) - 30
60
-
61
- def _headers(self) -> dict[str, str]:
62
- if self._token is None or time.time() >= self._token_expires_at:
63
- self._refresh_token()
64
- return {"Authorization": f"Bearer {self._token}", "Accept": "application/json"}
65
 
66
  def _get(self, path: str, params: dict | None = None) -> dict[str, Any]:
67
  url = f"{BASE_URL}{path}"
68
  for attempt in range(3):
69
- resp = self._session.get(url, headers=self._headers(), params=params, timeout=60)
70
  if resp.status_code == 429:
71
- time.sleep(2 ** attempt)
72
  continue
73
  resp.raise_for_status()
74
  return resp.json()
75
  resp.raise_for_status()
76
  return {}
77
 
78
- def iter_strain_ids(self, limit: int | None = None) -> Iterator[int]:
79
- """Page through the BacDive catalog and yield strain IDs."""
80
- page_url: str | None = "/fetch/"
81
- seen = 0
82
- while page_url:
83
- body = self._get(page_url)
84
- for record in body.get("results", []):
85
- yield int(record["id"])
86
- seen += 1
87
- if limit is not None and seen >= limit:
88
- return
89
- next_url = body.get("next")
90
- if not next_url:
91
- return
92
- page_url = next_url.replace(BASE_URL, "")
93
-
94
- def fetch_record(self, bacdive_id: int) -> dict[str, Any]:
95
- body = self._get(f"/fetch/{bacdive_id}")
96
- results = body.get("results") or {}
97
- if isinstance(results, list):
98
- return results[0] if results else {}
99
- if isinstance(results, dict) and str(bacdive_id) in results:
100
- return results[str(bacdive_id)]
101
- return results
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
105
- """Pull the v0 prediction targets out of a BacDive record.
106
 
107
- BacDive's record schema is deeply nested and field names vary across record versions.
108
- We tolerate missing fields — anything we can't find becomes None and is dropped at training time.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  """
 
 
 
 
 
 
110
  out: dict[str, Any] = {
111
- "bacdive_id": record.get("General", {}).get("BacDive-ID"),
112
- "species": record.get("Name and taxonomic classification", {}).get("species"),
113
- "ncbi_taxon_id": record.get("General", {}).get("NCBI tax id"),
114
- "optimal_temperature_c": None,
115
- "optimal_ph": None,
116
- "oxygen_requirement": None,
117
- "salt_tolerance_pct": None,
118
- "genome_accession": None,
 
 
119
  }
120
-
121
- culture = record.get("Culture and growth conditions", {})
122
- temps = _as_list(culture.get("culture temp"))
123
- for t in temps:
124
- if isinstance(t, dict) and t.get("type", "").lower() in {"optimum", "optimal"}:
125
- out["optimal_temperature_c"] = _to_float(t.get("temperature"))
126
- break
127
-
128
- phs = _as_list(culture.get("culture pH"))
129
- for p in phs:
130
- if isinstance(p, dict) and p.get("type", "").lower() in {"optimum", "optimal"}:
131
- out["optimal_ph"] = _to_float(p.get("pH"))
132
- break
133
-
134
- physio = record.get("Physiology and metabolism", {})
135
- oxygen = _as_list(physio.get("oxygen tolerance"))
136
- if oxygen and isinstance(oxygen[0], dict):
137
- out["oxygen_requirement"] = oxygen[0].get("oxygen tolerance")
138
-
139
- salt = _as_list(physio.get("halophily"))
140
- for s in salt:
141
- if isinstance(s, dict) and "concentration" in s:
142
- out["salt_tolerance_pct"] = _to_float(s.get("concentration"))
143
- break
144
-
145
- seq = record.get("Sequence information", {})
146
- genomes = _as_list(seq.get("genome sequence"))
147
- for g in genomes:
148
- if isinstance(g, dict) and g.get("accession"):
149
- out["genome_accession"] = g["accession"]
150
- break
151
-
152
  return out
153
 
154
 
@@ -163,20 +135,89 @@ def _as_list(x: Any) -> list:
163
  def _to_float(x: Any) -> float | None:
164
  if x is None:
165
  return None
 
 
 
 
 
 
 
 
 
 
 
166
  try:
167
- return float(str(x).split()[0])
168
  except (ValueError, AttributeError):
169
  return None
170
 
171
 
172
- def cache_path(bacdive_id: int) -> Path:
173
- return config.BACDIVE_DIR / f"{bacdive_id}.json"
174
-
175
 
176
- def fetch_with_cache(client: BacDiveClient, bacdive_id: int) -> dict[str, Any]:
177
- path = cache_path(bacdive_id)
178
- if path.exists():
179
- return json.loads(path.read_text())
180
- record = client.fetch_record(bacdive_id)
181
- path.write_text(json.dumps(record))
182
- return record
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """BacDive REST API client (v2, public).
2
 
3
+ The BacDive v2 API is fully open as of February 2026 — no registration, no auth.
4
+ Documentation: https://api.bacdive.dsmz.de/
5
 
6
+ We discover strain IDs by scanning the integer ID space in semicolon-batched fetches
7
+ of up to 100 IDs per call. Missing IDs are silently dropped server-side, so a blind
8
+ scan over [1, MAX_ID] yields every existing record in one pass. At ~150K live IDs
9
+ (as of 2026-04), this takes ~30 minutes single-threaded.
 
10
  """
11
  from __future__ import annotations
12
 
 
20
 
21
  from microbe_model import config
22
 
23
+ BASE_URL = "https://api.bacdive.dsmz.de/v2"
24
+ BATCH_SIZE = 100 # max IDs per /fetch/ call (server limit)
25
+ DEFAULT_MAX_ID = 200_000 # conservative upper bound; live max is ~160K-180K as of 2026-04
 
 
 
26
 
27
 
class BacDiveClient:
    """Minimal client for the public BacDive v2 REST API.

    No credentials are used; all requests go through one shared
    ``requests.Session``.

    Args:
        request_timeout: Timeout in seconds handed to every ``requests`` call.
        retry_sleep_s: Base back-off in seconds. After failed attempt ``k``
            (1-based) the client sleeps ``retry_sleep_s * k`` before retrying.
    """

    # Total tries per request, including the first one.
    _MAX_ATTEMPTS = 3
    # Rate limiting plus gateway/availability errors are treated as transient.
    _TRANSIENT_STATUSES = frozenset({429, 502, 503, 504})

    def __init__(self, *, request_timeout: int = 60, retry_sleep_s: float = 1.0) -> None:
        self._session = requests.Session()
        self.timeout = request_timeout
        self.retry_sleep_s = retry_sleep_s

    def _get(self, path: str, params: dict | None = None) -> dict[str, Any]:
        """GET ``BASE_URL + path`` and return the decoded JSON body.

        Transient failures (statuses in ``_TRANSIENT_STATUSES``, connection
        errors, timeouts) are retried with linear back-off so that a single
        hiccup does not abort a long scan. Once the attempts are used up the
        last failure is raised unchanged.

        Raises:
            requests.HTTPError: Non-success status that is not retried, or a
                transient status on the final attempt.
            requests.ConnectionError, requests.Timeout: Network failure on the
                final attempt.
        """
        url = f"{BASE_URL}{path}"
        for attempt in range(self._MAX_ATTEMPTS):
            out_of_retries = attempt == self._MAX_ATTEMPTS - 1
            try:
                resp = self._session.get(url, params=params, timeout=self.timeout)
            except (requests.ConnectionError, requests.Timeout):
                if out_of_retries:
                    raise
            else:
                if out_of_retries or resp.status_code not in self._TRANSIENT_STATUSES:
                    resp.raise_for_status()
                    return resp.json()
            # Only reached when another attempt follows, so no sleep is wasted
            # after the final failure.
            time.sleep(self.retry_sleep_s * (attempt + 1))
        # Defensive: the final attempt above always returns or raises.
        return {}

    def fetch_batch(self, ids: list[int]) -> dict[int, dict[str, Any]]:
        """Fetch up to BATCH_SIZE strain records in a single call.

        Returns a {bacdive_id: record} mapping built from the ``"results"``
        object of the response. IDs missing from the response are absent from
        the mapping. An empty ``ids`` list returns ``{}`` without a request, and
        a ``"results"`` value that is not a dict also yields ``{}``.

        Raises:
            ValueError: If more than BATCH_SIZE IDs are requested.
        """
        if not ids:
            return {}
        if len(ids) > BATCH_SIZE:
            raise ValueError(f"Batch size {len(ids)} exceeds server limit {BATCH_SIZE}")
        # IDs are joined with ';' into a single path segment.
        path = f"/fetch/{';'.join(str(i) for i in ids)}"
        body = self._get(path)
        results = body.get("results")
        if isinstance(results, dict):
            # Keys are converted with int() so callers get integer IDs.
            return {int(k): v for k, v in results.items()}
        return {}

    def iter_records(
        self,
        *,
        start: int = 1,
        end: int = DEFAULT_MAX_ID,
        batch_size: int = BATCH_SIZE,
    ) -> Iterator[tuple[int, dict[str, Any]]]:
        """Scan the BacDive ID range and yield (id, record) for every existing strain.

        The inclusive range ``[start, end]`` is walked in consecutive chunks of
        ``batch_size`` IDs, and each chunk's records are yielded in ascending ID
        order.
        """
        for batch_start in range(start, end + 1, batch_size):
            # The last chunk is clipped so it never runs past ``end``.
            batch_end = min(batch_start + batch_size - 1, end)
            ids = list(range(batch_start, batch_end + 1))
            records = self.fetch_batch(ids)
            yield from sorted(records.items())
75
 
 
 
76
 
def cache_path(bacdive_id: int) -> Path:
    """Return the JSON cache file location for one BacDive ID inside ``config.BACDIVE_DIR``."""
    filename = f"{bacdive_id}.json"
    return config.BACDIVE_DIR / filename
79
+
80
+
def cache_record(bacdive_id: int, record: dict[str, Any]) -> Path:
    """Write ``record`` as JSON to the strain's cache file and return that file's path."""
    target = cache_path(bacdive_id)
    payload = json.dumps(record)
    target.write_text(payload)
    return target
85
+
86
+
def load_cached(bacdive_id: int) -> dict[str, Any] | None:
    """Return the cached record for ``bacdive_id``, or ``None`` when no cache file exists."""
    source = cache_path(bacdive_id)
    return json.loads(source.read_text()) if source.exists() else None
92
+
93
+
def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
    """Reduce one BacDive v2 record to a flat row of the v0 prediction targets.

    Sections and fields read from the record:
      - "General": "BacDive-ID", "NCBI tax id"
      - "Name and taxonomic classification": "species", "genus", and "family"
        (the nested "LPSN" entry is preferred, the section-level value is the fallback)
      - "Culture and growth conditions": "culture temp", "culture pH"
      - "Physiology and metabolism": "oxygen tolerance", "halophily"
      - "Sequence information": "Genome sequences"

    A missing or empty section is treated as an empty mapping, so an empty record
    produces a row whose values are all ``None``.
    """
    def section(title: str) -> dict[str, Any]:
        # Absent (or falsy) sections collapse to {} so the lookups below are safe.
        return record.get(title) or {}

    general = section("General")
    taxon = section("Name and taxonomic classification")
    culture = section("Culture and growth conditions")
    physio = section("Physiology and metabolism")
    seq = section("Sequence information")

    lpsn = taxon.get("LPSN") or {}
    family = lpsn.get("family") or taxon.get("family")

    temperature_entries = _as_list(culture.get("culture temp"))
    ph_entries = _as_list(culture.get("culture pH"))
    oxygen_entries = _as_list(physio.get("oxygen tolerance"))

    # Key order is kept stable because it becomes the column order downstream.
    return {
        "bacdive_id": general.get("BacDive-ID"),
        "species": taxon.get("species"),
        "genus": taxon.get("genus"),
        "family": family,
        "ncbi_taxon_id": _first_ncbi_tax_id(general.get("NCBI tax id")),
        "optimal_temperature_c": _derive_optimum(temperature_entries, "temperature"),
        "optimal_ph": _derive_optimum(ph_entries, "pH"),
        "oxygen_requirement": _first_value(oxygen_entries, "oxygen tolerance"),
        "salt_tolerance_pct": _derive_salt(physio.get("halophily")),
        "genome_accession": _first_genome_accession(seq.get("Genome sequences")),
    }
125
 
126
 
 
135
  def _to_float(x: Any) -> float | None:
136
  if x is None:
137
  return None
138
+ s = str(x).strip()
139
+ if not s:
140
+ return None
141
+ if "-" in s and not s.startswith("-"):
142
+ # e.g. "5-30" — return midpoint
143
+ parts = s.split("-")
144
+ try:
145
+ lo, hi = float(parts[0]), float(parts[1])
146
+ return (lo + hi) / 2
147
+ except (ValueError, IndexError):
148
+ return None
149
  try:
150
+ return float(s.split()[0])
151
  except (ValueError, AttributeError):
152
  return None
153
 
154
 
def _derive_optimum(entries: list, value_key: str) -> float | None:
    """Pick a single optimum from a list of ``{type, growth, <value_key>}`` entries.

    Resolution order:
      1. the mean of all entries whose ``type`` contains "optim"
      2. otherwise the median of ``type == "growth"`` entries whose ``growth``
         flag is one of "positive", "yes", "+", "true" (case-insensitive)
      3. otherwise ``None``

    Non-dict entries and entries whose value cannot be parsed by ``_to_float``
    are skipped.
    """
    positive_flags = {"positive", "yes", "+", "true"}
    explicit: list = []
    tolerated: list = []

    for item in entries:
        if not isinstance(item, dict):
            continue
        kind = (item.get("type") or "").lower()
        number = _to_float(item.get(value_key))
        if number is None:
            continue
        grows = (item.get("growth") or "").lower() in positive_flags
        if "optim" in kind:
            explicit.append(number)
        elif kind == "growth" and grows:
            tolerated.append(number)

    if explicit:
        return sum(explicit) / len(explicit)
    if not tolerated:
        return None

    tolerated.sort()
    mid, is_odd = divmod(len(tolerated), 2)
    if is_odd:
        return tolerated[mid]
    # Even count: average the two central values.
    return (tolerated[mid - 1] + tolerated[mid]) / 2
184
+
185
+
186
+ def _first_value(entries: list, key: str) -> str | None:
187
+ for entry in entries:
188
+ if isinstance(entry, dict) and entry.get(key):
189
+ return str(entry[key])
190
+ return None
191
+
192
+
def _derive_salt(halophily: Any) -> float | None:
    """Return the first parseable salt value found in the halophily entries.

    Each dict entry is probed, in order, for "concentration",
    "salt concentration" and "tested relation". The first value that
    ``_to_float`` can parse wins. ``None`` is returned when nothing parses.
    """
    candidate_fields = ("concentration", "salt concentration", "tested relation")
    parsed = (
        _to_float(item.get(name))
        for item in _as_list(halophily)
        if isinstance(item, dict)
        for name in candidate_fields
    )
    # The generator is lazy, so probing stops at the first usable number.
    return next((number for number in parsed if number is not None), None)
202
+
203
+
def _first_genome_accession(genome_entries: Any) -> str | None:
    """Return the first non-empty accession among the genome entries, as a string.

    For every dict entry the keys "INSDC accession", "NCBI accession" and
    "accession" are tried in that order. ``None`` is returned when no entry
    carries a truthy accession.
    """
    accession_keys = ("INSDC accession", "NCBI accession", "accession")
    candidates = (
        item.get(name)
        for item in _as_list(genome_entries)
        if isinstance(item, dict)
        for name in accession_keys
    )
    hit = next((value for value in candidates if value), None)
    return None if hit is None else str(hit)
212
+
213
+
def _first_ncbi_tax_id(tax: Any) -> int | None:
    """Return the first "NCBI tax id" value that converts to ``int``, else ``None``.

    Only dict entries are inspected. Entries without the key, or whose value
    cannot be converted, are skipped.
    """
    for item in _as_list(tax):
        raw = item.get("NCBI tax id") if isinstance(item, dict) else None
        if raw is None:
            continue
        try:
            return int(raw)
        except (ValueError, TypeError):
            # Unconvertible value: keep looking at the remaining entries.
            pass
    return None
tests/test_bacdive.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Test BacDive phenotype extraction against a fixture of the real v2 schema."""
2
+ from __future__ import annotations
3
+
4
+ from microbe_model.data.bacdive import _derive_optimum, extract_phenotypes
5
+
6
+ # Trimmed-down version of a real /v2/fetch/24493 response (Phaeobacter gallaeciensis BS 107).
7
+ SAMPLE_RECORD = {
8
+ "General": {
9
+ "BacDive-ID": 24493,
10
+ "NCBI tax id": [
11
+ {"NCBI tax id": 1423144, "Matching level": "strain"},
12
+ {"NCBI tax id": 60890, "Matching level": "species"},
13
+ ],
14
+ },
15
+ "Name and taxonomic classification": {
16
+ "LPSN": {
17
+ "domain": "Bacteria",
18
+ "phylum": "Pseudomonadota",
19
+ "class": "Alphaproteobacteria",
20
+ "order": "Rhodobacterales",
21
+ "family": "Roseobacteraceae",
22
+ "genus": "Phaeobacter",
23
+ "species": "Phaeobacter gallaeciensis",
24
+ },
25
+ "genus": "Phaeobacter",
26
+ "species": "Phaeobacter gallaeciensis",
27
+ },
28
+ "Culture and growth conditions": {
29
+ "culture temp": [
30
+ {"growth": "positive", "type": "growth", "temperature": "25"},
31
+ {"growth": "positive", "type": "growth", "temperature": "22"},
32
+ {"growth": "positive", "type": "growth", "temperature": "5-30"},
33
+ {"growth": "negative", "type": "growth", "temperature": "37"},
34
+ ],
35
+ },
36
+ "Physiology and metabolism": {
37
+ "oxygen tolerance": [{"oxygen tolerance": "obligate aerobe"}],
38
+ },
39
+ "Sequence information": {
40
+ "Genome sequences": [
41
+ {"INSDC accession": "GCA_000511385", "assembly level": "complete"},
42
+ {"INSDC accession": "GCA_000819625", "assembly level": "contig"},
43
+ ],
44
+ },
45
+ }
46
+
47
+
def test_extract_phenotypes_real_schema() -> None:
    """The trimmed real-schema fixture must populate every checked field."""
    result = extract_phenotypes(SAMPLE_RECORD)
    expected = {
        "bacdive_id": 24493,
        "species": "Phaeobacter gallaeciensis",
        "genus": "Phaeobacter",
        "family": "Roseobacteraceae",
        "ncbi_taxon_id": 1423144,
        "genome_accession": "GCA_000511385",  # first listed
        "oxygen_requirement": "obligate aerobe",
        # Positive-growth temps are 25, 22 and midpoint(5-30)=17.5 -> median 22
        "optimal_temperature_c": 22.0,
    }
    for field, want in expected.items():
        assert result[field] == want, field
60
+
61
+
def test_derive_optimum_prefers_explicit_optimum() -> None:
    """An explicit optimum entry wins over surrounding positive-growth entries."""
    growth_before = {"type": "growth", "growth": "positive", "temperature": "30"}
    explicit = {"type": "optimum", "temperature": "37"}
    growth_after = {"type": "growth", "growth": "positive", "temperature": "25"}

    derived = _derive_optimum([growth_before, explicit, growth_after], "temperature")

    assert derived == 37.0
69
+
70
+
def test_derive_optimum_falls_back_to_growth_median() -> None:
    """Without an optimum entry the median of positive-growth values is used."""
    positives = [
        {"type": "growth", "growth": "positive", "temperature": temp}
        for temp in ("20", "30")
    ]
    ignored_negative = {"type": "growth", "growth": "negative", "temperature": "45"}

    derived = _derive_optimum([*positives, ignored_negative], "temperature")

    assert derived == 25.0
78
+
79
+
def test_extract_phenotypes_handles_missing_fields() -> None:
    """An empty record must produce ``None`` for each of the checked fields."""
    blank = extract_phenotypes({})
    for field in ("bacdive_id", "genome_accession", "optimal_temperature_c"):
        assert blank[field] is None, field