"""Streamlit UI for microbe-model — Prototype A "Lab notebook" implementation. Visual design from the Claude Design bundle (microbe-ml/project/prototype-a-*). Warm cream paper, IBM Plex Serif/Sans/Mono, oxidized-iron accent. Three confidence primitives so the same word never reads the same way twice. Run: uv run --extra ui streamlit run app.py """ from __future__ import annotations import json import os import sys from pathlib import Path import pandas as pd import requests import streamlit as st ROOT = Path(__file__).resolve().parent sys.path.insert(0, str(ROOT / "scripts")) from microbe_model import config # noqa: E402 from microbe_model.train.media_recommender import load_models # noqa: E402 from recommend import ( # noqa: E402 _format_recipe_summary, _load_genome_features, _predict_phenotypes, ) # ────────────────────────────────────────────────────────────────────── # Theme tokens (Prototype A — Lab notebook) # ────────────────────────────────────────────────────────────────────── PAPER = "#f5f1e8" PAPER_DEEP = "#ece6d6" INK = "#1f1d18" INK_SOFT = "#5a554a" INK_FAINT = "#94907f" RULE = "#d6cdb6" RULE_SOFT = "#e6dfca" ACCENT = "#a8521a" ACCENT_TINT = "#fdf6e8" POS = "#3f6b3a" WARN = "#a8521a" FOCUSED_STRIP = "#ede4cd" BROAD_STRIP = "#e8e0c8" O2_COLOR = "#3a7d6e" EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" st.set_page_config( page_title="microbe-model — what to grow it in", page_icon="🦠", layout="wide", initial_sidebar_state="collapsed", ) # ────────────────────────────────────────────────────────────────────── # CSS — global typography + paper background + restyled widgets # ────────────────────────────────────────────────────────────────────── st.markdown( f""" """, unsafe_allow_html=True, ) # ────────────────────────────────────────────────────────────────────── # Cached loaders # ────────────────────────────────────────────────────────────────────── @st.cache_data def load_results(): p = config.ARTIFACTS / "baseline_results.json" if not p.exists(): return {} data = json.loads(p.read_text()) data.pop("__meta__", None) return data @st.cache_resource def load_recommender(): return load_models(config.ROOT / "models" / "recommender") @st.cache_data def load_uncultured() -> pd.DataFrame: return pd.read_parquet(config.ARTIFACTS / "uncultured_predictions.parquet") @st.cache_data def load_media_meta() -> pd.DataFrame: return pd.read_parquet(config.DATA / "media_metadata.parquet") @st.cache_data def load_recipes() -> pd.DataFrame: return pd.read_parquet(config.DATA / "media_recipes.parquet") # ────────────────────────────────────────────────────────────────────── # Color helpers + confidence primitives # ────────────────────────────────────────────────────────────────────── def temp_color(t: float) -> str: if t < 15: return "#3b82a6" if t < 30: return "#5b8b9c" if t < 45: return "#7d8470" if t < 60: return "#b06a3b" return "#a04020" def ph_color(p: float) -> str: if p < 6: return "#a04020" if p < 7.5: return "#7d8470" return "#3b82a6" def salt_color(s: float) -> str: if s < 1: return "#7d8470" if s < 5: return "#b89048" return "#8a5e1f" def media_conf_bar(value: float, color: str = ACCENT, height: int = 8) -> str: pct = max(0, min(100, round(value * 100))) return ( f'
' f'
' f'
' f'{pct}%' f'
' ) def oxygen_conf_arc(value: float, size: int = 36, color: str = O2_COLOR) -> str: pct = max(0, min(100, round(value * 100))) r = size / 2 - 3 c = 2 * 3.14159 * r arc_len = c * value return ( f'
' f'' f'' f'' f'' f'
{pct}
' f'
' ) def interval_bar(value: float, lo: float, hi: float, scale_min: float, scale_max: float, color: str, unit: str = "", height: int = 6, show_label: bool = False) -> str: rng = scale_max - scale_min or 1 lo_pct = max(0, min(100, ((lo - scale_min) / rng) * 100)) hi_pct = max(0, min(100, ((hi - scale_min) / rng) * 100)) val_pct = max(0, min(100, ((value - scale_min) / rng) * 100)) inner = ( f'
' f'
' f'
' f'
' ) if show_label: inner += ( f'
' f'{lo}{unit}' f'{value}{unit}' f'{hi}{unit}' f'
' ) return f'
{inner}
' # ────────────────────────────────────────────────────────────────────── # Helpers # ────────────────────────────────────────────────────────────────────── def phylum_from_taxonomy(tax: str | None) -> str: if not isinstance(tax, str): return "—" for part in tax.split(";"): part = part.strip() if part.startswith("p__"): return part[3:] or "—" return "—" def is_accession(s: str) -> bool: s = s.strip().upper() return s.startswith(("GCA_", "GCF_")) @st.cache_data(ttl=3600, show_spinner=False) def search_ncbi_assembly(name: str, retmax: int = 10) -> list[dict]: if not name.strip(): return [] api_key = os.environ.get("NCBI_API_KEY") common = {"api_key": api_key} if api_key else {} try: r = requests.get( f"{EUTILS_BASE}/esearch.fcgi", params={"db": "assembly", "term": f"{name}[Organism] AND latest[filter]", "retmode": "json", "retmax": retmax, **common}, timeout=20, ) r.raise_for_status() ids = r.json().get("esearchresult", {}).get("idlist", []) if not ids: return [] r = requests.get( f"{EUTILS_BASE}/esummary.fcgi", params={"db": "assembly", "id": ",".join(ids), "retmode": "json", **common}, timeout=20, ) r.raise_for_status() result = r.json().get("result", {}) except requests.RequestException as e: st.error(f"NCBI search failed: {e}") return [] out = [] for uid in result.get("uids", []): doc = result.get(uid, {}) out.append({ "accession": str(doc.get("assemblyaccession", "")), "organism": str(doc.get("organism", "")), "level": str(doc.get("assemblystatus", "")), }) rank = {"Complete Genome": 0, "Chromosome": 1, "Scaffold": 2, "Contig": 3} out.sort(key=lambda r: rank.get(r["level"], 99)) return out def _compare_card_html(label, pred, lo, hi, pub, unit, color, sm, smax, ok): badge = (f'✓ in 80% PI' if ok else f'△ outside PI') if pub is not None else "" pub_marker = "" if pub is not None: pub_pct = max(0, min(100, ((pub - sm) / (smax - sm)) * 100)) pub_marker = ( f'
' f'
↑ pub
' ) pub_block = ( f'
published
' f'
' f'{pub}{unit}
' f'
literature
' if pub is not None else '
published
' f'
' ) return f"""
{label}{badge}
predicted
{pred:.1f}{unit}
{lo:.1f}{unit} – {hi:.1f}{unit}
{pub_block}
{interval_bar(pred, lo, hi, sm, smax, color, unit)} {pub_marker}
""" def _oxygen_compare_card(pred, conf, pub): ok = (pub is not None and pred == pub) badge = (f'✓ match' if ok else f'△ mismatch') if pub else "" pub_block = ( f'
published
' f'
{pub}
' if pub else f'
published
' f'
' ) return f"""
Oxygen requirement{badge}
predicted
{oxygen_conf_arc(conf, size=32)}
{pred}
{pub_block}
""" def run_inference(target: str): feats, acc, n_contigs = _load_genome_features(target) feats_series = pd.Series(feats) phenotypes = _predict_phenotypes(feats_series) models, feature_cols = load_recommender() media_meta = load_media_meta() recipes = load_recipes() name_by_id = dict(zip(media_meta["medium_id"].astype(str), media_meta["name"], strict=True)) X_pred = feats_series[feature_cols].to_frame().T recs = [] for medium_id, model in models.items(): proba = float(model.predict_proba(X_pred)[0, 1]) recs.append({ "medium_id": medium_id, "name": name_by_id.get(medium_id, "(unknown)"), "confidence": proba, "recipe": _format_recipe_summary(medium_id, recipes), }) recs.sort(key=lambda r: r["confidence"], reverse=True) return { "accession": acc, "n_contigs": n_contigs, "n_cds": int(feats["n_predicted_cds"]), "gc": float(feats["gc_content"]), "phenotypes": phenotypes, "media": recs, } # ────────────────────────────────────────────────────────────────────── # Header (lab-notebook style) # ────────────────────────────────────────────────────────────────────── st.markdown( f"""

microbe-model

v1.2.0 · trained 2026-03-14

Predicted growth conditions for microbes that have never been cultured. Pick one. Try the medium. Five thousand candidates from GTDB scored against twenty-four DSMZ media.

""", unsafe_allow_html=True, ) tab_catalog, tab_test, tab_about = st.tabs( ["Catalog", "Test on a known genome", "Model accuracy"] ) # ────────────────────────────────────────────────────────────────────── # Tab 1 — Catalog # ────────────────────────────────────────────────────────────────────── with tab_catalog: unc_all = load_uncultured().copy() unc_all["phylum"] = unc_all["gtdb_taxonomy"].map(phylum_from_taxonomy) unc_all["truly_uncultured"] = ( unc_all["ncbi_organism_name"].fillna("").str.lower().str.startswith("uncultured") ) n_focused = int(unc_all["truly_uncultured"].sum()) n_total = len(unc_all) if "mode" not in st.session_state: st.session_state["mode"] = "focused" if "filter" not in st.session_state: st.session_state["filter"] = "all" # Mode strip — two big pills with chrome shift mode = st.session_state["mode"] focused = mode == "focused" mc1, mc2 = st.columns(2, gap="small") with mc1: if st.button( f"1,294 truly never-cultured" + (" · NCBI name starts with \"uncultured\"" if focused else ""), key="mode_focused", type="primary" if focused else "secondary", use_container_width=True, ): st.session_state["mode"] = "focused" st.rerun() with mc2: if st.button( f"5,000 all candidates" + (" · includes 3,706 named-but-absent-from-BacDive" if not focused else ""), key="mode_broad", type="primary" if not focused else "secondary", use_container_width=True, ): st.session_state["mode"] = "broad" st.rerun() # ──────────────── Predict bar ──────────────── st.markdown('
', unsafe_allow_html=True) st.markdown( f'
Predict a medium
', unsafe_allow_html=True, ) pcol1, pcol2, pcol3 = st.columns([5, 2, 2]) with pcol1: query = st.text_input( label="predict query", label_visibility="collapsed", placeholder='Organism name, NCBI accession, or paste FASTA…', key="predict_query", ) with pcol2: upload = st.file_uploader( label="upload", label_visibility="collapsed", type=["fna", "fa", "fasta", "gz"], key="predict_upload", ) with pcol3: submit = st.button("🔎 Predict", type="primary", use_container_width=True) quick = st.columns([1, 1, 1, 6]) with quick[0]: if st.button("Try: Thermus thermophilus", key="qt_thermus"): st.session_state["predict_target"] = "Thermus thermophilus" st.session_state["run_predict"] = True with quick[1]: if st.button("Try: E. coli K-12", key="qt_ecoli"): st.session_state["predict_target"] = "GCF_000005845.2" st.session_state["run_predict"] = True with quick[2]: if st.button("Try: B. subtilis 168", key="qt_bsub"): st.session_state["predict_target"] = "GCF_000009045.1" st.session_state["run_predict"] = True # Run prediction if requested target = None if upload is not None and submit: tmp = ROOT / "data" / "_uploaded" / upload.name tmp.parent.mkdir(parents=True, exist_ok=True) tmp.write_bytes(upload.getbuffer()) target = str(tmp) elif submit and query.strip() and is_accession(query): target = query.strip() elif submit and query.strip(): with st.spinner(f"Searching NCBI for '{query.strip()}'…"): hits = search_ncbi_assembly(query.strip(), retmax=10) if not hits: st.warning(f"No NCBI Assembly hits for '{query.strip()}'.") else: st.session_state["ncbi_hits"] = hits elif st.session_state.pop("run_predict", False): target = st.session_state.pop("predict_target") if not is_accession(target): with st.spinner(f"Searching NCBI for '{target}'…"): hits = search_ncbi_assembly(target, retmax=5) if hits: target = hits[0]["accession"] hits = st.session_state.get("ncbi_hits", []) if hits and not target: st.markdown(f'
{len(hits)} NCBI matches
', unsafe_allow_html=True) labels = [f"{h['accession']} — {h['organism']} · {h['level']}" for h in hits] choice = st.radio("pick", options=list(range(len(hits))), format_func=lambda i: labels[i], label_visibility="collapsed", key="ncbi_choice") if st.button("Run on selected", type="primary"): target = hits[choice]["accession"] st.session_state.pop("ncbi_hits", None) if target: with st.spinner(f"Predicting for {target}…"): try: result = run_inference(target) except SystemExit as e: st.error(str(e)) st.stop() st.session_state["last_result"] = result result = st.session_state.get("last_result") if result: p = result["phenotypes"] top = result["media"][0] if result["media"] else None T = p.get("optimal_temperature_c", {}) pH = p.get("optimal_ph", {}) O2 = p.get("oxygen_requirement", {}) salt = p.get("salt_tolerance_pct", {}) st.markdown( f"""
Prediction · {result['accession']}
Try {top['name'] if top else '—'} {top['medium_id'] if top else ''}
T {T.get('prediction', 0):.0f}°C pH {pH.get('prediction', 0):.1f} O₂ {O2.get('prediction', '—')} salt {salt.get('prediction', 0):.1f}%
""", unsafe_allow_html=True, ) with st.expander("Full prediction · phenotype intervals + ranked media", expanded=False): ic = st.columns(4) for col, (key, label, unit, scale) in zip( ic, [ ("optimal_temperature_c", "T_opt", "°C", (0, 110)), ("optimal_ph", "pH", "", (2, 11)), ("oxygen_requirement", "O₂", "", None), ("salt_tolerance_pct", "salt", "%", (0, 25)), ], strict=True, ): info = p.get(key) or {} with col: if info.get("task") == "regression": v, lo, hi = info["prediction"], info.get("low_80"), info.get("high_80") c = (temp_color(v) if "temp" in key else ph_color(v) if "ph" in key else salt_color(v)) st.markdown( f"""
{label}
{v:.1f}{unit}
{interval_bar(v, lo or v, hi or v, scale[0], scale[1], c, unit, show_label=True)}
""", unsafe_allow_html=True, ) else: st.markdown( f"""
{label}
{oxygen_conf_arc(info.get("confidence", 0), size=40)}
{info.get("prediction", "—")}
""", unsafe_allow_html=True, ) st.markdown('
', unsafe_allow_html=True) st.markdown('
Top media
', unsafe_allow_html=True) for i, r in enumerate(result["media"][:5], 1): st.markdown( f"""
{r['medium_id']} {r['name']} {media_conf_bar(r['confidence'])}
{f'
{r["recipe"]}
' if r['recipe'] else ''}
""", unsafe_allow_html=True, ) if st.button("Clear prediction", key="clear_pred"): st.session_state.pop("last_result", None) st.rerun() st.markdown("
", unsafe_allow_html=True) # close lab-pad # ──────────────── Quick filters ──────────────── filter_opts = [ ("all", "All"), ("thermo", "Thermophiles · >55°C"), ("psychro", "Psychrophiles · <15°C"), ("anaerobe", "Anaerobes"), ("halo", "Halotolerant · >3% NaCl"), ] fcols = st.columns([1, 1.4, 1.4, 1, 1.4, 4]) for i, (key, label) in enumerate(filter_opts): with fcols[i]: if st.button( label, key=f"filter_{key}", type="primary" if st.session_state["filter"] == key else "secondary", use_container_width=True, ): st.session_state["filter"] = key st.rerun() unc = unc_all[unc_all["truly_uncultured"]] if focused else unc_all f = st.session_state["filter"] if f == "thermo": unc = unc[unc["pred_optimal_temperature_c"] > 55] elif f == "psychro": unc = unc[unc["pred_optimal_temperature_c"] < 15] elif f == "anaerobe": unc = unc[unc["pred_oxygen_requirement"].fillna("").str.contains("anaerobe", case=False)] elif f == "halo": unc = unc[unc["pred_salt_tolerance_pct"] > 3] # Search row st.markdown('
', unsafe_allow_html=True) sc1, sc2 = st.columns([4, 1]) with sc1: search = st.text_input( label="search", label_visibility="collapsed", placeholder="⌕ filter by organism name…", key="catalog_search", ) with sc2: st.markdown( f'
' f'showing {len(unc):,} · sorted by confidence
', unsafe_allow_html=True, ) st.markdown("
", unsafe_allow_html=True) if search: unc = unc[unc["ncbi_organism_name"].fillna("").str.contains(search, case=False, na=False)] if "top1_confidence" in unc.columns: unc = unc.sort_values("top1_confidence", ascending=False) # ──────────────── Top picks (cards) ──────────────── featured = unc.head(6) rest = unc.iloc[6:] if len(featured): st.markdown( f'
' f'
Top {len(featured)} picksby media confidence
' f'
', unsafe_allow_html=True, ) cards_html = ['
'] for _, m in featured.iterrows(): T = float(m["pred_optimal_temperature_c"]) ph = float(m["pred_optimal_ph"]) slt = float(m["pred_salt_tolerance_pct"]) o2_lbl = m["pred_oxygen_requirement"] or "—" o2_conf = float(m.get("pred_oxygen_requirement_confidence") or 0) top_id = m["top1_medium_id"] top_name = m["top1_medium_name"] top_conf = float(m["top1_confidence"]) short = (m["ncbi_organism_name"] or m["genome_accession"])[:80] cards_html.append(f"""
{m['genome_accession']} · {m['phylum']}
{short}
T_opt {T:.0f}°C
{interval_bar(T, max(0, T - 5), min(110, T + 5), 0, 110, temp_color(T))}
pH {ph:.1f}
{interval_bar(ph, max(2, ph - 0.5), min(11, ph + 0.5), 2, 11, ph_color(ph))}
salt {slt:.1f}%
{interval_bar(slt, max(0, slt - 1), min(25, slt + 1), 0, 25, salt_color(slt))}
O₂
{oxygen_conf_arc(o2_conf, size=28)}
{o2_lbl}
Try this medium
{top_id} {top_name}
{media_conf_bar(top_conf)}
""") cards_html.append("
") st.markdown("\n".join(cards_html), unsafe_allow_html=True) # ──────────────── Rest as table ──────────────── if len(rest): st.markdown( f'
' f'
Remaining {len(rest):,}
' f'
', unsafe_allow_html=True, ) table_rows = [] for _, m in rest.head(80).iterrows(): T = float(m["pred_optimal_temperature_c"]) ph = float(m["pred_optimal_ph"]) slt = float(m["pred_salt_tolerance_pct"]) o2_lbl = m["pred_oxygen_requirement"] or "—" o2_conf = float(m.get("pred_oxygen_requirement_confidence") or 0) short = (m["ncbi_organism_name"] or "")[:60] table_rows.append(f""" {m['genome_accession']} {short} {m['phylum']} {m['top1_medium_id']} {m['top1_medium_name'][:38]} {media_conf_bar(float(m['top1_confidence']))} {T:.0f}°C {ph:.1f}
{oxygen_conf_arc(o2_conf, size=20)}{o2_lbl}
{slt:.1f}% {float(m['checkm_completeness']):.0f} """) st.markdown( f'
' f'' f'{"".join(f"" for h in ["Accession", "Organism", "Phylum", "Try this medium", "Conf.", "T", "pH", "O₂", "Salt", "CheckM"])}' f'{"".join(table_rows)}
{h}
' f'
' f'showing first 80 of {len(rest):,} remaining · use search and filters to narrow' f'
', unsafe_allow_html=True, ) # ────────────────────────────────────────────────────────────────────── # Tab 2 — Test on a known genome # ────────────────────────────────────────────────────────────────────── SANITY_ORGANISMS = [ { "accession": "GCF_000005845.2", "name": "Escherichia coli K-12 MG1655", "known": {"T_opt": 37.0, "pH": 7.0, "O2": "facultative anaerobe", "salt": 1.0, "medium": "LB (Luria-Bertani)"}, }, { "accession": "GCF_000009045.1", "name": "Bacillus subtilis 168", "known": {"T_opt": 30.0, "pH": 7.0, "O2": "facultative anaerobe", "salt": 2.0, "medium": "LB or Nutrient Broth"}, }, { "accession": "GCF_000091545.1", "name": "Thermus thermophilus HB8", "known": {"T_opt": 70.0, "pH": 7.5, "O2": "aerobe", "salt": 0.5, "medium": "DSMZ 74 Castenholz TYE"}, }, ] with tab_test: st.markdown( f'
' f'
Sanity-check the model on a microbe with published growth conditions.
' f'
', unsafe_allow_html=True, ) st.markdown('
', unsafe_allow_html=True) pcols = st.columns(3) for col, org in zip(pcols, SANITY_ORGANISMS, strict=True): with col: k = org["known"] st.markdown( f"""
{org['name']}
{org['accession']}
{k['T_opt']:.0f}°C pH {k['pH']:.1f} {k['O2']}
""", unsafe_allow_html=True, ) if st.button(f"Predict {org['name'].split()[0]}", key=f"sanity_{org['accession']}", use_container_width=True): st.session_state["test_target"] = org["accession"] st.session_state["test_known"] = org["known"] st.session_state["test_run"] = True st.markdown('
', unsafe_allow_html=True) with st.form("test_form", clear_on_submit=False): tcol1, tcol2 = st.columns([5, 2]) with tcol1: t_query = st.text_input( label="test query", label_visibility="collapsed", placeholder="⌕ organism name or NCBI accession…", value=st.session_state.get("test_target", ""), ) with tcol2: t_upload = st.file_uploader("test upload", type=["fna", "fa", "fasta", "gz"], label_visibility="collapsed") t_submit = st.form_submit_button("Run", type="primary", use_container_width=True) auto = st.session_state.pop("test_run", False) known = st.session_state.pop("test_known", None) t_target = None if t_upload is not None: tmp = ROOT / "data" / "_uploaded" / t_upload.name tmp.parent.mkdir(parents=True, exist_ok=True) tmp.write_bytes(t_upload.getbuffer()) t_target = str(tmp) elif t_submit and t_query.strip() and is_accession(t_query): t_target = t_query.strip() elif t_submit and t_query.strip(): with st.spinner(f"Searching NCBI for '{t_query.strip()}'…"): t_hits = search_ncbi_assembly(t_query.strip(), retmax=5) if t_hits: t_target = t_hits[0]["accession"] else: st.warning(f"No NCBI hits for '{t_query.strip()}'.") elif auto: t_target = st.session_state.get("test_target") if t_target: with st.spinner(f"Predicting {t_target}…"): try: t_result = run_inference(t_target) except SystemExit as e: st.error(str(e)) st.stop() p = t_result["phenotypes"] st.markdown( f'
' f'
Predicted vs published — {t_result["accession"]}
', unsafe_allow_html=True, ) cards = [] # Temperature T = p.get("optimal_temperature_c", {}) if T: v, lo, hi = T["prediction"], T.get("low_80", T["prediction"]), T.get("high_80", T["prediction"]) pub = known["T_opt"] if known else None ok = pub is not None and lo <= pub <= hi cards.append(_compare_card_html("Optimum temperature", v, lo, hi, pub, "°C", temp_color(v), 0, 110, ok)) pH = p.get("optimal_ph", {}) if pH: v, lo, hi = pH["prediction"], pH.get("low_80", pH["prediction"]), pH.get("high_80", pH["prediction"]) pub = known["pH"] if known else None ok = pub is not None and lo <= pub <= hi cards.append(_compare_card_html("Optimum pH", v, lo, hi, pub, "", ph_color(v), 2, 11, ok)) slt = p.get("salt_tolerance_pct", {}) if slt: v, lo, hi = slt["prediction"], slt.get("low_80", slt["prediction"]), slt.get("high_80", slt["prediction"]) pub = known["salt"] if known else None ok = pub is not None and lo <= pub <= hi cards.append(_compare_card_html("Salt tolerance", v, lo, hi, pub, "%", salt_color(v), 0, 25, ok)) O2 = p.get("oxygen_requirement", {}) if O2: cards.append(_oxygen_compare_card(O2.get("prediction", "—"), O2.get("confidence", 0), known["O2"] if known else None)) st.markdown( f'
' f'{"".join(cards)}
', unsafe_allow_html=True, ) st.markdown('
', unsafe_allow_html=True) st.markdown('
Top media to try
', unsafe_allow_html=True) for i, r in enumerate(t_result["media"][:5], 1): st.markdown( f"""
{r['medium_id']} {r['name']} {media_conf_bar(r['confidence'])}
{f'
{r["recipe"]}
' if r['recipe'] else ''}
""", unsafe_allow_html=True, ) st.markdown("
", unsafe_allow_html=True) # ────────────────────────────────────────────────────────────────────── # Tab 3 — Model accuracy # ────────────────────────────────────────────────────────────────────── with tab_about: results = load_results() targets_meta = [ ("optimal_temperature_c", "Temperature optimum", "MAE", "°C", temp_color(45), "Useful — labs incubate in 5°C steps; you'd usually pick the right shelf.", "Model rarely misses by a tube. Trust the median; verify edge cases."), ("optimal_ph", "pH optimum", "MAE", "", ph_color(7), "Marginal — distinguishes acidic / neutral / alkaline, not finer.", "Buffer to predicted ±0.5; don't over-interpret tenths."), ("oxygen_requirement", "Oxygen requirement", "F1", "", O2_COLOR, "Weak — 9 imbalanced classes, frequent aerobe ↔ aerotolerant confusion.", "Treat predicted O₂ as a hint; check obligate vs facultative in a tube."), ("salt_tolerance_pct", "Salt tolerance", "MAE", "%", salt_color(3), "Decent — separates freshwater / marine / halotolerant.", "Reasonable for screening; not for fine-tuning compound concentrations."), ] st.markdown( f"""
The verdict
v1 handcrafted features is the working baseline. Trust temperature and pH; verify oxygen and salt with a tube.
""", unsafe_allow_html=True, ) cards_html = ['
'] for key, label, metric, unit, color, verdict, detail in targets_meta: a = results.get(key, {}) val = a.get("mean_metric", 0) cards_html.append(f"""
{label} 5-fold GroupKFold by family
{metric} {val:.2f} {unit}
"{verdict}"
{detail}
""") cards_html.append("
") st.markdown("\n".join(cards_html), unsafe_allow_html=True) # Confidence legend — three primitives st.markdown( f"""
How confidence is calculated
{media_conf_bar(0.72)}
Media confidence
Per-medium binary classifier predict_proba. Not perfectly calibrated — BacDive only has positive examples.
{oxygen_conf_arc(0.72, size=36)}
Oxygen confidence
Max softmax probability across 9 imbalanced classes. Low values mean the model can't pick between near-neighbour categories.
{interval_bar(37, 32, 43, 0, 80, temp_color(37), "°C", show_label=True)}
Prediction interval
Quantile regression at α=0.1 / 0.9 → 80% PI for T, pH, salt. Wide interval = model uncertain.
""", unsafe_allow_html=True, ) st.markdown( f"""
Trained on 17,047 BacDive strains with growth conditions; uncultured catalog is 5,000 held-out GTDB genomes scored against 24 DSMZ media. Features: 353 handcrafted genome statistics — GC, codon usage, tetranucleotide frequencies, AA composition. XGBoost classifiers for media; quantile regression XGBoost for prediction intervals.
""", unsafe_allow_html=True, )