', unsafe_allow_html=True) st.markdown( f'

Predict a medium

', unsafe_allow_html=True, ) pcol1, pcol2, pcol3 = st.columns([5, 2, 2]) with pcol1: query = st.text_input( label="predict query", label_visibility="collapsed", placeholder='Organism name, NCBI accession, or paste FASTA…', key="predict_query", ) with pcol2: upload = st.file_uploader( label="upload", label_visibility="collapsed", type=["fna", "fa", "fasta", "gz"], key="predict_upload", ) with pcol3: submit = st.button("🔎 Predict", type="primary", use_container_width=True) quick = st.columns([1, 1, 1, 6]) with quick[0]: if st.button("Try: Thermus thermophilus", key="qt_thermus"): st.session_state["predict_target"] = "Thermus thermophilus" st.session_state["run_predict"] = True with quick[1]: if st.button("Try: E. coli K-12", key="qt_ecoli"): st.session_state["predict_target"] = "GCF_000005845.2" st.session_state["run_predict"] = True with quick[2]: if st.button("Try: B. subtilis 168", key="qt_bsub"): st.session_state["predict_target"] = "GCF_000009045.1" st.session_state["run_predict"] = True # Run prediction if requested target = None if upload is not None and submit: tmp = ROOT / "data" / "_uploaded" / upload.name tmp.parent.mkdir(parents=True, exist_ok=True) tmp.write_bytes(upload.getbuffer()) target = str(tmp) elif submit and query.strip() and is_accession(query): target = query.strip() elif submit and query.strip(): with st.spinner(f"Searching NCBI for '{query.strip()}'…"): hits = search_ncbi_assembly(query.strip(), retmax=10) if not hits: st.warning(f"No NCBI Assembly hits for '{query.strip()}'.") else: st.session_state["ncbi_hits"] = hits elif st.session_state.pop("run_predict", False): target = st.session_state.pop("predict_target") if not is_accession(target): with st.spinner(f"Searching NCBI for '{target}'…"): hits = search_ncbi_assembly(target, retmax=5) if hits: target = hits[0]["accession"] hits = st.session_state.get("ncbi_hits", []) if hits and not target: st.markdown(f'

{len(hits)} NCBI matches

', unsafe_allow_html=True) labels = [f"{h['accession']} — {h['organism']} · {h['level']}" for h in hits] choice = st.radio("pick", options=list(range(len(hits))), format_func=lambda i: labels[i], label_visibility="collapsed", key="ncbi_choice") if st.button("Run on selected", type="primary"): target = hits[choice]["accession"] st.session_state.pop("ncbi_hits", None) if target: with st.spinner(f"Predicting for {target}…"): try: result = run_inference(target) except SystemExit as e: st.error(str(e)) st.stop() st.session_state["last_result"] = result result = st.session_state.get("last_result") if result: p = result["phenotypes"] top = result["media"][0] if result["media"] else None T = p.get("optimal_temperature_c", {}) pH = p.get("optimal_ph", {}) O2 = p.get("oxygen_requirement", {}) salt = p.get("salt_tolerance_pct", {}) st.markdown( f""" """, unsafe_allow_html=True, ) with st.expander("Full prediction · phenotype intervals + ranked media", expanded=False): ic = st.columns(4) for col, (key, label, unit, scale) in zip( ic, [ ("optimal_temperature_c", "T_opt", "°C", (0, 110)), ("optimal_ph", "pH", "", (2, 11)), ("oxygen_requirement", "O₂", "", None), ("salt_tolerance_pct", "salt", "%", (0, 25)), ], strict=True, ): info = p.get(key) or {} with col: if info.get("task") == "regression": v, lo, hi = info["prediction"], info.get("low_80"), info.get("high_80") c = (temp_color(v) if "temp" in key else ph_color(v) if "ph" in key else salt_color(v)) st.markdown( f"""

{label}

{v:.1f}{unit}

{interval_bar(v, lo or v, hi or v, scale[0], scale[1], c, unit, show_label=True)}

""", unsafe_allow_html=True, ) else: st.markdown( f"""

{label}

{oxygen_conf_arc(info.get("confidence", 0), size=40)}

{info.get("prediction", "—")}

""", unsafe_allow_html=True, ) st.markdown('

', unsafe_allow_html=True) st.markdown('

Top media

', unsafe_allow_html=True) for i, r in enumerate(result["media"][:5], 1): st.markdown( f"""

{r['medium_id']} {r['name']} {media_conf_bar(r['confidence'])}

{f'

{r["recipe"]}

' if r['recipe'] else ''}

""", unsafe_allow_html=True, ) if st.button("Clear prediction", key="clear_pred"): st.session_state.pop("last_result", None) st.rerun() st.markdown("

', unsafe_allow_html=True) sc1, sc2 = st.columns([4, 1]) with sc1: search = st.text_input( label="search", label_visibility="collapsed", placeholder="⌕ filter by organism name…", key="catalog_search", ) with sc2: st.markdown( f'

' f'showing {len(unc):,} · sorted by confidence

', unsafe_allow_html=True, ) st.markdown("

' f'

Top {len(featured)} picksby media confidence

' f'

'] for _, m in featured.iterrows(): T = float(m["pred_optimal_temperature_c"]) ph = float(m["pred_optimal_ph"]) slt = float(m["pred_salt_tolerance_pct"]) o2_lbl = m["pred_oxygen_requirement"] or "—" o2_conf = float(m.get("pred_oxygen_requirement_confidence") or 0) top_id = m["top1_medium_id"] top_name = m["top1_medium_name"] top_conf = float(m["top1_confidence"]) short = (m["ncbi_organism_name"] or m["genome_accession"])[:80] cards_html.append(f"""

{m['genome_accession']} · {m['phylum']}

{short}

T_opt {T:.0f}°C

{interval_bar(T, max(0, T - 5), min(110, T + 5), 0, 110, temp_color(T))}

pH {ph:.1f}

{interval_bar(ph, max(2, ph - 0.5), min(11, ph + 0.5), 2, 11, ph_color(ph))}

salt {slt:.1f}%

{interval_bar(slt, max(0, slt - 1), min(25, slt + 1), 0, 25, salt_color(slt))}

O₂

{oxygen_conf_arc(o2_conf, size=28)}

{o2_lbl}

Try this medium

{top_id} {top_name}

{media_conf_bar(top_conf)}

""") cards_html.append("

' f'

Remaining {len(rest):,}

' f'

' f'' f'{"".join(f"" for h in ["Accession", "Organism", "Phylum", "Try this medium", "Conf.", "T", "pH", "O₂", "Salt", "CheckM"])}' f'{"".join(table_rows)}

{h}

' f'

' f'showing first 80 of {len(rest):,} remaining · use search and filters to narrow' f'

' f'

Sanity-check the model on a microbe with published growth conditions.

' f'

', unsafe_allow_html=True) pcols = st.columns(3) for col, org in zip(pcols, SANITY_ORGANISMS, strict=True): with col: k = org["known"] st.markdown( f"""

{org['name']}

{org['accession']}

{k['T_opt']:.0f}°C pH {k['pH']:.1f} {k['O2']}

""", unsafe_allow_html=True, ) if st.button(f"Predict {org['name'].split()[0]}", key=f"sanity_{org['accession']}", use_container_width=True): st.session_state["test_target"] = org["accession"] st.session_state["test_known"] = org["known"] st.session_state["test_run"] = True st.markdown('

', unsafe_allow_html=True) with st.form("test_form", clear_on_submit=False): tcol1, tcol2 = st.columns([5, 2]) with tcol1: t_query = st.text_input( label="test query", label_visibility="collapsed", placeholder="⌕ organism name or NCBI accession…", value=st.session_state.get("test_target", ""), ) with tcol2: t_upload = st.file_uploader("test upload", type=["fna", "fa", "fasta", "gz"], label_visibility="collapsed") t_submit = st.form_submit_button("Run", type="primary", use_container_width=True) auto = st.session_state.pop("test_run", False) known = st.session_state.pop("test_known", None) t_target = None if t_upload is not None: tmp = ROOT / "data" / "_uploaded" / t_upload.name tmp.parent.mkdir(parents=True, exist_ok=True) tmp.write_bytes(t_upload.getbuffer()) t_target = str(tmp) elif t_submit and t_query.strip() and is_accession(t_query): t_target = t_query.strip() elif t_submit and t_query.strip(): with st.spinner(f"Searching NCBI for '{t_query.strip()}'…"): t_hits = search_ncbi_assembly(t_query.strip(), retmax=5) if t_hits: t_target = t_hits[0]["accession"] else: st.warning(f"No NCBI hits for '{t_query.strip()}'.") elif auto: t_target = st.session_state.get("test_target") if t_target: with st.spinner(f"Predicting {t_target}…"): try: t_result = run_inference(t_target) except SystemExit as e: st.error(str(e)) st.stop() p = t_result["phenotypes"] st.markdown( f'

' f'

Predicted vs published — {t_result["accession"]}

', unsafe_allow_html=True, ) cards = [] # Temperature T = p.get("optimal_temperature_c", {}) if T: v, lo, hi = T["prediction"], T.get("low_80", T["prediction"]), T.get("high_80", T["prediction"]) pub = known["T_opt"] if known else None ok = pub is not None and lo <= pub <= hi cards.append(_compare_card_html("Optimum temperature", v, lo, hi, pub, "°C", temp_color(v), 0, 110, ok)) pH = p.get("optimal_ph", {}) if pH: v, lo, hi = pH["prediction"], pH.get("low_80", pH["prediction"]), pH.get("high_80", pH["prediction"]) pub = known["pH"] if known else None ok = pub is not None and lo <= pub <= hi cards.append(_compare_card_html("Optimum pH", v, lo, hi, pub, "", ph_color(v), 2, 11, ok)) slt = p.get("salt_tolerance_pct", {}) if slt: v, lo, hi = slt["prediction"], slt.get("low_80", slt["prediction"]), slt.get("high_80", slt["prediction"]) pub = known["salt"] if known else None ok = pub is not None and lo <= pub <= hi cards.append(_compare_card_html("Salt tolerance", v, lo, hi, pub, "%", salt_color(v), 0, 25, ok)) O2 = p.get("oxygen_requirement", {}) if O2: cards.append(_oxygen_compare_card(O2.get("prediction", "—"), O2.get("confidence", 0), known["O2"] if known else None)) st.markdown( f'

' f'{"".join(cards)}

', unsafe_allow_html=True, ) st.markdown('

', unsafe_allow_html=True) st.markdown('

Top media to try

', unsafe_allow_html=True) for i, r in enumerate(t_result["media"][:5], 1): st.markdown( f"""

{r['medium_id']} {r['name']} {media_conf_bar(r['confidence'])}

{f'

{r["recipe"]}

' if r['recipe'] else ''}

""", unsafe_allow_html=True, ) st.markdown("

", unsafe_allow_html=True) # ────────────────────────────────────────────────────────────────────── # Tab 3 — Model accuracy # ────────────────────────────────────────────────────────────────────── with tab_about: results = load_results() targets_meta = [ ("optimal_temperature_c", "Temperature optimum", "MAE", "°C", temp_color(45), "Useful — labs incubate in 5°C steps; you'd usually pick the right shelf.", "Model rarely misses by a tube. Trust the median; verify edge cases."), ("optimal_ph", "pH optimum", "MAE", "", ph_color(7), "Marginal — distinguishes acidic / neutral / alkaline, not finer.", "Buffer to predicted ±0.5; don't over-interpret tenths."), ("oxygen_requirement", "Oxygen requirement", "F1", "", O2_COLOR, "Weak — 9 imbalanced classes, frequent aerobe ↔ aerotolerant confusion.", "Treat predicted O₂ as a hint; check obligate vs facultative in a tube."), ("salt_tolerance_pct", "Salt tolerance", "MAE", "%", salt_color(3), "Decent — separates freshwater / marine / halotolerant.", "Reasonable for screening; not for fine-tuning compound concentrations."), ] st.markdown( f"""

The verdict

v1 handcrafted features is the working baseline. Trust temperature and pH; verify oxygen and salt with a tube.

""", unsafe_allow_html=True, ) cards_html = ['

'] for key, label, metric, unit, color, verdict, detail in targets_meta: a = results.get(key, {}) val = a.get("mean_metric", 0) cards_html.append(f"""

{label} 5-fold GroupKFold by family

{metric} {val:.2f} {unit}

"{verdict}"

{detail}

""") cards_html.append("

") st.markdown("\n".join(cards_html), unsafe_allow_html=True) # Confidence legend — three primitives st.markdown( f"""

How confidence is calculated

{media_conf_bar(0.72)}

Media confidence

Per-medium binary classifier predict_proba. Not perfectly calibrated — BacDive only has positive examples.

{oxygen_conf_arc(0.72, size=36)}

Oxygen confidence

Max softmax probability across 9 imbalanced classes. Low values mean the model can't pick between near-neighbour categories.

{interval_bar(37, 32, 43, 0, 80, temp_color(37), "°C", show_label=True)}

Prediction interval

Quantile regression at α=0.1 / 0.9 → 80% PI for T, pH, salt. Wide interval = model uncertain.

""", unsafe_allow_html=True, ) st.markdown( f"""

Trained on 17,047 BacDive strains with growth conditions; uncultured catalog is 5,000 held-out GTDB genomes scored against 24 DSMZ media. Features: 353 handcrafted genome statistics — GC, codon usage, tetranucleotide frequencies, AA composition. XGBoost classifiers for media; quantile regression XGBoost for prediction intervals.

""", unsafe_allow_html=True, )

microbe-model