Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi Claude Opus 4.7 (1M context) committed 24 days ago

Commit

f0f1d93

1 Parent(s): c3d9a06

Expand training corpus to 46K strains: species-name → NCBI genome + isolation features

Two layered improvements over the 17K-strain genome-only baseline:

1. Isolation-source features (Cat1/Cat2 from BacDive's `isolation source categories`)
- Re-extracted from cached BacDive JSON via scripts/17 (no re-download)
- One-hot encoded in scripts/03 (and mirrored in scripts/14 for the v3 trainer)

2. Species-name → NCBI representative genome resolution
- scripts/18 queries NCBI Datasets v2 /genome/taxon/ for each unique species
name in the gap (entries that had phenotype labels but no genome accession in BacDive).
5,393 of 7,905 species resolved (68%).
- scripts/19 featurizes with deduplication: each unique accession is downloaded once
and its feature dict is replicated to all sibling strains (avoids ~5x redundant downloads).
- 5,254 of 5,283 unique genomes featurized (99.5% success).

Result: 17,054 → 46,058 training-ready strains.

Per-target metrics (vs original v0 genome-only):
optimal_temperature_c MAE 3.28 → 2.94 (-10.4%)
oxygen_requirement F1 0.279 → 0.341 (+22.2%)
optimal_ph MAE 0.52 → 0.51 ( -2.1%, label-limited)
salt_tolerance_pct MAE 2.51 → 2.52 ( +0.3%, label-limited)

pH and salt did not improve because BacDive label coverage for those targets barely
grew (1.1× for pH and 1.3× for salt, versus 2.7× for T_opt and 2.1× for oxygen).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (9) hide show

artifacts/baseline_results.json +210 -145
artifacts/combined_results.json +204 -141
artifacts/eval_report.md +161 -161
scripts/03_train_baseline.py +40 -0
scripts/14_train_combined.py +40 -3
scripts/17_reextract_phenotypes.py +43 -0
scripts/18_resolve_species_to_genome.py +110 -0
scripts/19_featurize_resolved.py +106 -0
src/microbe_model/data/bacdive.py +29 -0

artifacts/baseline_results.json CHANGED Viewed

@@ -1,274 +1,274 @@
 {
   "optimal_temperature_c": {
     "task": "regression",
-    "mean_metric": 3.2796768780969154,
     "folds": [
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.7927897739802856,
-        "n_train": 13605,
-        "n_test": 3402
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.528644174915563,
-        "n_train": 13605,
-        "n_test": 3402
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.460899077440423,
-        "n_train": 13606,
-        "n_test": 3401
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.1300457459483138,
-        "n_train": 13606,
-        "n_test": 3401
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.4860056181999908,
-        "n_train": 13606,
-        "n_test": 3401
       }
     ],
     "top_features": {
-      "ivywrel_frac": 0.19766514897346496,
-      "n_predicted_cds": 0.02504703141748905,
-      "pos_charged_frac": 0.01956883855164051,
-      "aa_frac_E": 0.013097247015684843,
-      "aa_frac_C": 0.011947918310761451,
-      "codon_TTG": 0.009919821098446847,
-      "codon_TGA": 0.00989874340593815,
-      "codon_AGG": 0.009789730794727803,
-      "tetra_GCAA": 0.008400670997798444,
-      "aa_frac_S": 0.008075917325913905,
-      "tetra_CTAA": 0.008004277455620467,
-      "tetra_CTAG": 0.0076253050938248634,
-      "tetra_AGGC": 0.007276883872691542,
-      "mean_isoelectric_point": 0.00710052065551281,
-      "aa_frac_Y": 0.006824395339936018,
-      "aa_frac_L": 0.006632247474044561,
-      "neg_charged_frac": 0.006289067305624485,
-      "aa_frac_Q": 0.005970604927279055,
-      "tetra_TTTA": 0.005658498092088848,
-      "aa_frac_D": 0.005623110197484493
     }
   },
   "optimal_ph": {
     "task": "regression",
-    "mean_metric": 0.5202253475923528,
     "folds": [
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.4754666561983069,
-        "n_train": 3721,
-        "n_test": 931
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.6132477354670441,
-        "n_train": 3721,
-        "n_test": 931
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.4791252064653622,
-        "n_train": 3722,
-        "n_test": 930
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.5286309639510288,
-        "n_train": 3722,
-        "n_test": 930
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.504656175880022,
-        "n_train": 3722,
-        "n_test": 930
       }
     ],
     "top_features": {
-      "neg_charged_frac": 0.021104557998478414,
-      "tetra_TGCT": 0.014129164954647421,
-      "aa_frac_H": 0.01198372757062316,
-      "tetra_CACT": 0.00974916813429445,
-      "tetra_AGAC": 0.008753480762243271,
-      "tetra_GAGA": 0.00805287561379373,
-      "tetra_TCTC": 0.007691349275410176,
-      "ivywrel_frac": 0.0076533622108399865,
-      "aa_frac_E": 0.007637183275073766,
-      "tetra_CTCT": 0.006926810601726174,
-      "codon_AAG": 0.006873661652207375,
-      "n_predicted_cds": 0.006350988755002618,
-      "mean_isoelectric_point": 0.005934224603697658,
-      "tetra_AGGT": 0.005835541151463985,
-      "tetra_GTGA": 0.0058299127034842965,
-      "tetra_CCTG": 0.005803165677934885,
-      "tetra_TGTT": 0.005650706822052598,
-      "tetra_TGGT": 0.005588132468983531,
-      "tetra_ACGA": 0.0055527932243421676,
-      "tetra_TGAG": 0.005398272210732103
     }
   },
   "oxygen_requirement": {
     "task": "classification",
-    "mean_metric": 0.2790834034149142,
     "folds": [
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.203774595737485,
-        "n_train": 8340,
-        "n_test": 2085
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.25451263916216754,
-        "n_train": 8341,
-        "n_test": 2085
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.28578249475572015,
-        "n_train": 8341,
-        "n_test": 2085
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.3554691036453791,
-        "n_train": 8341,
-        "n_test": 2085
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.29587818377381925,
-        "n_train": 8341,
-        "n_test": 2085
       }
     ],
     "top_features": {
-      "codon_ATA": 0.03811337612569332,
-      "aa_frac_C": 0.021676772832870485,
-      "genome_size_nt": 0.019434500206261873,
-      "tetra_CAAA": 0.017337634600698947,
-      "codon_CAA": 0.014458012580871583,
-      "aa_frac_Q": 0.012433571089059115,
-      "tetra_TCAA": 0.01232931949198246,
-      "n_predicted_cds": 0.011694983579218388,
-      "aa_frac_K": 0.009837790858000517,
-      "aa_frac_W": 0.008752449508756399,
-      "codon_TGG": 0.008257251046597958,
-      "aa_frac_M": 0.007389512192457914,
-      "aa_frac_Y": 0.007202606648206711,
-      "codon_ATG": 0.007109537813812494,
-      "aa_frac_L": 0.006909955851733684,
-      "codon_CGT": 0.006404342176392674,
-      "codon_CAT": 0.005392152117565274,
-      "tetra_AGGA": 0.0053115392103791235,
-      "aa_frac_H": 0.005291619151830673,
-      "ivywrel_frac": 0.004459869768470526
     }
   },
   "salt_tolerance_pct": {
     "task": "regression",
-    "mean_metric": 2.5102975998466475,
     "folds": [
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.557428962086115,
-        "n_train": 3834,
-        "n_test": 959
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.1644325020106914,
-        "n_train": 3834,
-        "n_test": 959
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.777298618858686,
-        "n_train": 3834,
-        "n_test": 959
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.65442228433608,
-        "n_train": 3835,
-        "n_test": 958
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.3979056319416676,
-        "n_train": 3835,
-        "n_test": 958
       }
     ],
     "top_features": {
-      "aa_frac_C": 0.04922681804746389,
-      "neg_charged_frac": 0.03649095520377159,
-      "tetra_CGTT": 0.015985671523958446,
-      "tetra_GAAA": 0.012250523548573256,
-      "tetra_GACT": 0.011047882377170026,
-      "tetra_AACC": 0.009099948103539646,
-      "codon_CGT": 0.007261711079627276,
-      "tetra_TGTG": 0.007125534303486347,
-      "tetra_AGGA": 0.006638824963010848,
-      "tetra_GGAG": 0.006525454460643232,
-      "tetra_CACA": 0.0065232118591666225,
-      "tetra_AGTC": 0.0065024693030864,
-      "tetra_GTGG": 0.0063626600429415705,
-      "codon_TGG": 0.006061221100389957,
-      "tetra_GTAT": 0.00588638405315578,
-      "tetra_CGTA": 0.00577605227008462,
-      "tetra_CTGA": 0.005612925114110112,
-      "codon_TGC": 0.005607163021340966,
-      "tetra_TACT": 0.005531361280009151,
-      "codon_GAT": 0.005294737778604031
     }
   },
   "__meta__": {
@@ -625,7 +625,72 @@
       "codon_TTA",
       "codon_TTC",
       "codon_TTG",
-      "codon_TTT"
     ]
   }
 }

 {
   "optimal_temperature_c": {
     "task": "regression",
+    "mean_metric": 2.939444159350111,
     "folds": [
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.103597222252415,
+        "n_train": 36496,
+        "n_test": 9125
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.7356862682357583,
+        "n_train": 36497,
+        "n_test": 9124
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.145843773419164,
+        "n_train": 36497,
+        "n_test": 9124
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.2767152481045656,
+        "n_train": 36497,
+        "n_test": 9124
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.43537828473865,
+        "n_train": 36497,
+        "n_test": 9124
       }
     ],
     "top_features": {
+      "ivywrel_frac": 0.12668818831443787,
+      "iso_cat2_thermophilic_gt45_c": 0.029868930205702783,
+      "n_predicted_cds": 0.025075340643525124,
+      "iso_cat2_human": 0.020858772844076157,
+      "iso_cat1_infection": 0.020640516839921474,
+      "iso_cat2_patient": 0.017751351464539766,
+      "aa_frac_C": 0.015003016591072083,
+      "genome_size_nt": 0.012203263118863106,
+      "aa_frac_D": 0.011290411837399006,
+      "codon_AGG": 0.010900856088846922,
+      "iso_cat1_environmental": 0.010176281817257405,
+      "tetra_GCCT": 0.009658925677649676,
+      "tetra_TAGT": 0.00883282758295536,
+      "aa_frac_Y": 0.008421392692252994,
+      "aa_frac_E": 0.007741594593971968,
+      "tetra_TTCC": 0.007376640872098506,
+      "mean_isoelectric_point": 0.007058459660038352,
+      "tetra_CTAA": 0.0070426638238132,
+      "iso_cat2_built_environment": 0.006164434866514057,
+      "iso_cat2_industrial": 0.005895084328949451
     }
   },
   "optimal_ph": {
     "task": "regression",
+    "mean_metric": 0.5090253015368336,
     "folds": [
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.45639293885487886,
+        "n_train": 4082,
+        "n_test": 1021
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.6262803867911733,
+        "n_train": 4082,
+        "n_test": 1021
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.528334212326513,
+        "n_train": 4082,
+        "n_test": 1021
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.48048674237494376,
+        "n_train": 4083,
+        "n_test": 1020
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.4536322273366591,
+        "n_train": 4083,
+        "n_test": 1020
       }
     ],
     "top_features": {
+      "iso_cat2_acidic": 0.05219607315957546,
+      "iso_cat2_alkaline": 0.043521419167518616,
+      "neg_charged_frac": 0.016875072754919528,
+      "aa_frac_E": 0.008599728252738715,
+      "tetra_CTCT": 0.008368687890470027,
+      "aa_frac_H": 0.008003219496458769,
+      "mean_isoelectric_point": 0.007599354162812233,
+      "tetra_CACT": 0.007427609874866903,
+      "tetra_AGAC": 0.007137532206252217,
+      "tetra_AGGT": 0.005891842069104314,
+      "tetra_GACT": 0.005873983446508646,
+      "tetra_GAGA": 0.005548427533358336,
+      "tetra_GTCT": 0.005475769587792456,
+      "codon_GAA": 0.005408304557204246,
+      "n_predicted_cds": 0.005280579440295696,
+      "iso_cat2_plants": 0.005045945569872856,
+      "tetra_TTGA": 0.004973787232302129,
+      "codon_AAG": 0.0048154488438740374,
+      "tetra_ACGA": 0.004731484339572489,
+      "aa_frac_Y": 0.0046834095381200315
     }
   },
   "oxygen_requirement": {
     "task": "classification",
+    "mean_metric": 0.34127360853732613,
     "folds": [
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.31515576471296236,
+        "n_train": 17311,
+        "n_test": 4328
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.38181774862206597,
+        "n_train": 17311,
+        "n_test": 4326
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.34440677114867413,
+        "n_train": 17311,
+        "n_test": 4328
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.25943178539399836,
+        "n_train": 17311,
+        "n_test": 4328
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.40555597280892947,
+        "n_train": 17312,
+        "n_test": 4327
       }
     ],
     "top_features": {
+      "codon_ATA": 0.0414140235632658,
+      "iso_cat1_host": 0.02601129524409771,
+      "n_predicted_cds": 0.025201210007071494,
+      "aa_frac_C": 0.019132474437355995,
+      "iso_cat1_environmental": 0.01645018421113491,
+      "codon_CGT": 0.014759847987443208,
+      "iso_cat1_engineered": 0.01378793753683567,
+      "genome_size_nt": 0.011305144988000393,
+      "iso_cat2_human": 0.010168002359569073,
+      "codon_TAA": 0.00900037819519639,
+      "aa_frac_V": 0.008459322061389685,
+      "aa_frac_Y": 0.008259046915918588,
+      "aa_frac_L": 0.0072497081011533735,
+      "tetra_CTGG": 0.006922230357304215,
+      "aa_frac_T": 0.006535647064447403,
+      "codon_TGG": 0.006477221753448248,
+      "aa_frac_Q": 0.0063397581689059734,
+      "aa_frac_M": 0.006198597187176347,
+      "tetra_CAAA": 0.006141273584216833,
+      "codon_CAA": 0.00611291266977787
     }
   },
   "salt_tolerance_pct": {
     "task": "regression",
+    "mean_metric": 2.516896605067264,
     "folds": [
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.218365752012856,
+        "n_train": 5064,
+        "n_test": 1266
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.249367568591289,
+        "n_train": 5064,
+        "n_test": 1266
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.8189112452912664,
+        "n_train": 5064,
+        "n_test": 1266
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.3502065964041967,
+        "n_train": 5064,
+        "n_test": 1266
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.947631863036709,
+        "n_train": 5064,
+        "n_test": 1266
       }
     ],
     "top_features": {
+      "aa_frac_C": 0.029796541761606933,
+      "neg_charged_frac": 0.027759117633104326,
+      "tetra_ATCC": 0.018280067457817496,
+      "iso_cat1_environmental": 0.014224943332374096,
+      "tetra_GACT": 0.01211925563402474,
+      "iso_cat2_saline": 0.011419120244681835,
+      "codon_TGC": 0.011161889415234327,
+      "tetra_CGTT": 0.009351400006562472,
+      "codon_CGT": 0.008664370141923427,
+      "iso_cat2_industrial": 0.008528076158836485,
+      "tetra_TAAT": 0.008236682531423867,
+      "iso_cat2_contamination": 0.008197423309320584,
+      "tetra_CGTA": 0.007803171873092651,
+      "tetra_TGTG": 0.007793005835264921,
+      "tetra_TACC": 0.007619049632921815,
+      "codon_CAC": 0.007169742346741259,
+      "tetra_AGTC": 0.006417827447876334,
+      "tetra_CCTG": 0.006371588306501507,
+      "tetra_GGTA": 0.006226122658699751,
+      "tetra_GAAA": 0.006115543586201966
     }
   },
   "__meta__": {
       "codon_TTA",
       "codon_TTC",
       "codon_TTG",
+      "codon_TTT",
+      "iso_cat1_climate",
+      "iso_cat1_condition",
+      "iso_cat1_engineered",
+      "iso_cat1_environmental",
+      "iso_cat1_host",
+      "iso_cat1_host_body_product",
+      "iso_cat1_host_body_site",
+      "iso_cat1_infection",
+      "iso_cat2_acidic",
+      "iso_cat2_agriculture",
+      "iso_cat2_air",
+      "iso_cat2_algae",
+      "iso_cat2_alkaline",
+      "iso_cat2_anoxic_anaerobic",
+      "iso_cat2_aquatic",
+      "iso_cat2_arthropoda",
+      "iso_cat2_biodegradation",
+      "iso_cat2_biofilm",
+      "iso_cat2_bioreactor",
+      "iso_cat2_bioremediation",
+      "iso_cat2_birds",
+      "iso_cat2_built_environment",
+      "iso_cat2_cold",
+      "iso_cat2_contamination",
+      "iso_cat2_disease",
+      "iso_cat2_fishes",
+      "iso_cat2_fluids",
+      "iso_cat2_food_production",
+      "iso_cat2_fungi",
+      "iso_cat2_gastrointestinal_tract",
+      "iso_cat2_hot",
+      "iso_cat2_human",
+      "iso_cat2_humid",
+      "iso_cat2_industrial",
+      "iso_cat2_inflammation",
+      "iso_cat2_invertebrates_other",
+      "iso_cat2_juvenile",
+      "iso_cat2_laboratory",
+      "iso_cat2_limb",
+      "iso_cat2_mammals",
+      "iso_cat2_medical_device",
+      "iso_cat2_medical_environment",
+      "iso_cat2_medical_product",
+      "iso_cat2_microbial",
+      "iso_cat2_microbial_community",
+      "iso_cat2_oral_cavity_and_airways",
+      "iso_cat2_organ",
+      "iso_cat2_other",
+      "iso_cat2_patient",
+      "iso_cat2_plant",
+      "iso_cat2_plant_infections",
+      "iso_cat2_plants",
+      "iso_cat2_protozoa",
+      "iso_cat2_psychrophilic_lt10_c",
+      "iso_cat2_reptilia",
+      "iso_cat2_saline",
+      "iso_cat2_sulfuric",
+      "iso_cat2_temperate",
+      "iso_cat2_terrestrial",
+      "iso_cat2_thermophilic_gt45_c",
+      "iso_cat2_treatment",
+      "iso_cat2_urogenital_tract",
+      "iso_cat2_waste",
+      "iso_cat2_xerophilic",
+      "iso_cat2_yeast"
     ]
   }
 }

artifacts/combined_results.json CHANGED Viewed

@@ -1,274 +1,274 @@
 {
   "optimal_temperature_c": {
     "task": "regression",
-    "mean_metric": 3.2743260321046983,
     "folds": [
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.800930039710605,
-        "n_train": 13577,
-        "n_test": 3395
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.212330534180766,
-        "n_train": 13577,
-        "n_test": 3395
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.8308504694689707,
-        "n_train": 13578,
-        "n_test": 3394
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.055139859704741,
-        "n_train": 13578,
-        "n_test": 3394
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.47237925745841,
-        "n_train": 13578,
-        "n_test": 3394
       }
     ],
     "top_features": {
-      "ivywrel_frac": 0.13554759174585343,
-      "n_predicted_cds": 0.016180129162967204,
-      "pos_charged_frac": 0.015452616102993488,
-      "emb_42": 0.009146808803780004,
-      "aa_frac_C": 0.009010343812406063,
-      "aa_frac_E": 0.008539762906730175,
-      "codon_TGA": 0.008345812978222966,
-      "mean_isoelectric_point": 0.00771149517968297,
-      "emb_43": 0.007495118898805231,
-      "codon_TTG": 0.007401803624816239,
-      "codon_AGG": 0.006896090880036354,
-      "tetra_GCAA": 0.006816221633926034,
-      "emb_103": 0.0065081269480288025,
-      "tetra_AGGC": 0.0060555480304174125,
-      "aa_frac_Q": 0.005575686926022172,
-      "aa_frac_L": 0.00546089680865407,
-      "aa_frac_S": 0.0053946841042488815,
-      "tetra_CAAA": 0.005041174124926329,
-      "tetra_TTTA": 0.005022482725325972,
-      "emb_271": 0.004957860894501209
     }
   },
   "optimal_ph": {
     "task": "regression",
-    "mean_metric": 0.5148330200525308,
     "folds": [
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.45630543930777195,
-        "n_train": 3710,
         "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.5329306995252083,
-        "n_train": 3710,
         "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.5447937661203844,
-        "n_train": 3710,
         "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.5399269027771687,
-        "n_train": 3711,
-        "n_test": 927
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.5002082925321215,
-        "n_train": 3711,
-        "n_test": 927
       }
     ],
     "top_features": {
-      "neg_charged_frac": 0.01469268798828125,
-      "tetra_CACT": 0.010008076298981905,
-      "tetra_TGCT": 0.009113076934590935,
-      "aa_frac_H": 0.008188599767163397,
-      "tetra_GAGA": 0.00789343281649053,
-      "tetra_TCTC": 0.007285104691982269,
-      "tetra_AGAC": 0.006973244273103774,
-      "aa_frac_E": 0.005855221953243017,
-      "codon_AAG": 0.005304977297782898,
-      "tetra_GTCT": 0.004962419020012021,
-      "mean_isoelectric_point": 0.00494197946973145,
-      "tetra_GACT": 0.004903565905988216,
-      "codon_GAA": 0.004646051116287709,
-      "n_predicted_cds": 0.004488307330757379,
-      "tetra_GCGT": 0.004404617205727846,
-      "tetra_ACGA": 0.004399605770595371,
-      "ivywrel_frac": 0.004385623009875416,
-      "codon_TCA": 0.004316977364942432,
-      "tetra_TGGT": 0.004235964128747582,
-      "emb_50": 0.004220196325331926
     }
   },
   "oxygen_requirement": {
     "task": "classification",
-    "mean_metric": 0.2926196414104487,
     "folds": [
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.2568366096040457,
-        "n_train": 8314,
-        "n_test": 2079
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.28331761646078946,
-        "n_train": 8314,
-        "n_test": 2079
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.3003208823814207,
-        "n_train": 8314,
-        "n_test": 2078
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.3245556247911396,
-        "n_train": 8315,
         "n_test": 2078
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.2980674738148481,
-        "n_train": 8315,
-        "n_test": 2078
       }
     ],
     "top_features": {
-      "emb_103": 0.023965770192444325,
-      "tetra_TCAA": 0.014948683325201273,
-      "codon_CAA": 0.014257160667330026,
-      "codon_ATA": 0.013358323276042939,
-      "tetra_CAAA": 0.0109681696863845,
-      "aa_frac_C": 0.010774458199739457,
-      "genome_size_nt": 0.01065667849034071,
-      "aa_frac_Q": 0.0091962531208992,
-      "tetra_TTTG": 0.008046209486201406,
-      "n_predicted_cds": 0.007619006186723709,
-      "codon_TGG": 0.007263739546760917,
-      "aa_frac_M": 0.006333135021850467,
-      "emb_65": 0.0060926306061446665,
-      "emb_12": 0.005908917868509889,
-      "aa_frac_K": 0.005255377222783864,
-      "aa_frac_W": 0.004311660584062338,
-      "aa_frac_L": 0.004282204434275627,
-      "codon_CGT": 0.004179813340306282,
-      "aa_frac_Y": 0.0039975212421268225,
-      "tetra_ATCT": 0.0035881946329027413
     }
   },
   "salt_tolerance_pct": {
     "task": "regression",
-    "mean_metric": 2.500490414750344,
     "folds": [
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.4981446771859317,
-        "n_train": 3826,
-        "n_test": 957
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.1454447971134294,
-        "n_train": 3826,
-        "n_test": 957
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.8626325660039327,
-        "n_train": 3826,
-        "n_test": 957
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.5895608990698675,
-        "n_train": 3827,
-        "n_test": 956
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.4066691343785576,
-        "n_train": 3827,
-        "n_test": 956
       }
     ],
     "top_features": {
-      "aa_frac_C": 0.03929822724312544,
-      "neg_charged_frac": 0.024765344709157942,
-      "tetra_TGTG": 0.010924566886387765,
-      "tetra_GACT": 0.010432313196361065,
-      "emb_84": 0.007880622497759759,
-      "mean_isoelectric_point": 0.006233150523621589,
-      "codon_CGT": 0.006036084098741412,
-      "emb_264": 0.0059552066260948775,
-      "tetra_AGTC": 0.0055462203454226255,
-      "tetra_GTGG": 0.005274477787315846,
-      "tetra_TACT": 0.004922923888079822,
-      "codon_TGT": 0.004755359562113881,
-      "tetra_TACG": 0.004655727057252079,
-      "aa_frac_D": 0.004644650942645967,
-      "tetra_CGTA": 0.004608191270381212,
-      "tetra_CACA": 0.004475983197335154,
-      "codon_GAA": 0.004449379863217473,
-      "tetra_CTAA": 0.004377065307926387,
-      "tetra_AACC": 0.004368867329321802,
-      "tetra_TACC": 0.004245994682423771
     }
   },
   "__meta__": {
@@ -945,7 +945,70 @@
       "emb_316",
       "emb_317",
       "emb_318",
-      "emb_319"
     ]
   }
 }

 {
   "optimal_temperature_c": {
     "task": "regression",
+    "mean_metric": 3.1503700219525386,
     "folds": [
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.738956731512239,
+        "n_train": 13592,
+        "n_test": 3398
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.1457219391587623,
+        "n_train": 13592,
+        "n_test": 3398
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.6463728072293864,
+        "n_train": 13592,
+        "n_test": 3398
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.8936606017051547,
+        "n_train": 13592,
+        "n_test": 3398
       },
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.327138030157151,
+        "n_train": 13592,
+        "n_test": 3398
       }
     ],
     "top_features": {
+      "ivywrel_frac": 0.1272028997540474,
+      "iso_cat2_human": 0.02456752024590969,
+      "iso_cat2_thermophilic_gt45_c": 0.02335200347006321,
+      "iso_cat1_host_body_product": 0.016055696457624436,
+      "pos_charged_frac": 0.014609704539179803,
+      "iso_cat1_infection": 0.01389338243752718,
+      "iso_cat2_mammals": 0.013551290705800056,
+      "n_predicted_cds": 0.012836913019418717,
+      "iso_cat1_environmental": 0.010669851303100586,
+      "tetra_CTAA": 0.009250188246369362,
+      "codon_AGG": 0.008724220609292389,
+      "aa_frac_E": 0.008193913381546736,
+      "aa_frac_C": 0.007445563282817602,
+      "mean_isoelectric_point": 0.0069396811537444595,
+      "codon_TTG": 0.006236870028078556,
+      "emb_42": 0.006092228952911683,
+      "aa_frac_Q": 0.006049463665112853,
+      "tetra_GCAA": 0.005999001779127866,
+      "codon_TGA": 0.005786543060094118,
+      "tetra_TTAG": 0.0056871567387133835
     }
   },
   "optimal_ph": {
     "task": "regression",
+    "mean_metric": 0.5009073096637068,
     "folds": [
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.45625025033950806,
+        "n_train": 3712,
         "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.5460629706958244,
+        "n_train": 3712,
         "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.45289194491402857,
+        "n_train": 3712,
         "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.5152855838783856,
+        "n_train": 3712,
+        "n_test": 928
       },
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.5340457984907874,
+        "n_train": 3712,
+        "n_test": 928
       }
     ],
     "top_features": {
+      "iso_cat2_acidic": 0.03560468032956123,
+      "iso_cat2_alkaline": 0.021716910228133202,
+      "neg_charged_frac": 0.012849669158458709,
+      "tetra_CACT": 0.007109560957178474,
+      "tetra_AGAC": 0.006792897148989141,
+      "aa_frac_E": 0.006575863063335419,
+      "tetra_CTCT": 0.0063951408956199884,
+      "aa_frac_H": 0.006346661783754826,
+      "tetra_GACT": 0.005681420909240842,
+      "codon_TTT": 0.00532103287987411,
+      "tetra_TGCT": 0.005244059395045042,
+      "codon_TGC": 0.004600144876167178,
+      "tetra_GAGA": 0.004556609224528075,
+      "tetra_ACGA": 0.004443949507549405,
+      "codon_AAG": 0.004402201203629374,
+      "mean_isoelectric_point": 0.004346802597865462,
+      "codon_GAA": 0.0043407921912148595,
+      "tetra_GGAT": 0.0042743304162286225,
+      "codon_GAG": 0.004212372726760805,
+      "tetra_AGGT": 0.004194398503750562
     }
   },
   "oxygen_requirement": {
     "task": "classification",
+    "mean_metric": 0.31635288673665096,
     "folds": [
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.2856587696238043,
+        "n_train": 8320,
+        "n_test": 2081
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.3076087339800983,
+        "n_train": 8321,
+        "n_test": 2080
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.30314339231057225,
+        "n_train": 8321,
+        "n_test": 2079
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.37851699811618567,
+        "n_train": 8321,
         "n_test": 2078
       },
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.30683653965259416,
+        "n_train": 8321,
+        "n_test": 2080
       }
     ],
     "top_features": {
+      "emb_103": 0.021533418074250223,
+      "tetra_TCAA": 0.017144506447948515,
+      "codon_CAA": 0.01247407766059041,
+      "genome_size_nt": 0.011898068338632583,
+      "codon_ATA": 0.010783014632761479,
+      "aa_frac_C": 0.010261021554470062,
+      "tetra_CAAA": 0.009564016468357295,
+      "emb_50": 0.00891582112526521,
+      "aa_frac_Q": 0.008642746694386005,
+      "n_predicted_cds": 0.007285213563591242,
+      "aa_frac_K": 0.006437191320583224,
+      "aa_frac_M": 0.006172794941812753,
+      "emb_3": 0.005596142518334091,
+      "emb_12": 0.005483857169747352,
+      "tetra_ATAG": 0.0051641249097883705,
+      "aa_frac_L": 0.004760731570422649,
+      "aa_frac_W": 0.004419099772349,
+      "codon_CGT": 0.004018026869744062,
+      "codon_ATG": 0.00391377778723836,
+      "aa_frac_Y": 0.0038635179633274676
     }
   },
   "salt_tolerance_pct": {
     "task": "regression",
+    "mean_metric": 2.4756113285254835,
     "folds": [
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.2104132311571862,
+        "n_train": 3832,
+        "n_test": 958
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.5142923461258535,
+        "n_train": 3832,
+        "n_test": 958
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.9433706752467503,
+        "n_train": 3832,
+        "n_test": 958
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.352060198972643,
+        "n_train": 3832,
+        "n_test": 958
       },
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.3579201911249834,
+        "n_train": 3832,
+        "n_test": 958
       }
     ],
     "top_features": {
+      "aa_frac_C": 0.03296378441154957,
+      "neg_charged_frac": 0.025052556581795216,
+      "iso_cat2_saline": 0.010697953775525093,
+      "tetra_CACA": 0.008883546688593924,
+      "tetra_GACT": 0.007743655145168305,
+      "emb_84": 0.006587694090558216,
+      "codon_TGG": 0.006308569852262735,
+      "iso_cat2_human": 0.005952583346515894,
+      "tetra_AGTC": 0.005895164678804576,
+      "tetra_GGAA": 0.005703426687978208,
+      "codon_CGT": 0.005462939362041652,
+      "iso_cat1_infection": 0.005432895617559552,
+      "tetra_ACTG": 0.005339212145190686,
+      "tetra_AACC": 0.0049897018994670365,
+      "mean_isoelectric_point": 0.004982588707935065,
+      "iso_cat2_patient": 0.0049561535939574245,
+      "tetra_GAAA": 0.004851629724726081,
+      "aa_frac_D": 0.0045981516130268575,
+      "codon_TGT": 0.004288341873325408,
+      "tetra_CGTA": 0.0041915396694093944
     }
   },
   "__meta__": {
       "emb_316",
       "emb_317",
       "emb_318",
+      "emb_319",
+      "iso_cat1_climate",
+      "iso_cat1_condition",
+      "iso_cat1_engineered",
+      "iso_cat1_environmental",
+      "iso_cat1_host",
+      "iso_cat1_host_body_product",
+      "iso_cat1_host_body_site",
+      "iso_cat1_infection",
+      "iso_cat2_acidic",
+      "iso_cat2_agriculture",
+      "iso_cat2_air",
+      "iso_cat2_algae",
+      "iso_cat2_alkaline",
+      "iso_cat2_anoxic_anaerobic",
+      "iso_cat2_aquatic",
+      "iso_cat2_arthropoda",
+      "iso_cat2_biodegradation",
+      "iso_cat2_biofilm",
+      "iso_cat2_bioreactor",
+      "iso_cat2_bioremediation",
+      "iso_cat2_birds",
+      "iso_cat2_built_environment",
+      "iso_cat2_cold",
+      "iso_cat2_contamination",
+      "iso_cat2_disease",
+      "iso_cat2_fishes",
+      "iso_cat2_fluids",
+      "iso_cat2_food_production",
+      "iso_cat2_fungi",
+      "iso_cat2_gastrointestinal_tract",
+      "iso_cat2_hot",
+      "iso_cat2_human",
+      "iso_cat2_humid",
+      "iso_cat2_industrial",
+      "iso_cat2_inflammation",
+      "iso_cat2_invertebrates_other",
+      "iso_cat2_juvenile",
+      "iso_cat2_laboratory",
+      "iso_cat2_limb",
+      "iso_cat2_mammals",
+      "iso_cat2_medical_device",
+      "iso_cat2_medical_environment",
+      "iso_cat2_microbial",
+      "iso_cat2_microbial_community",
+      "iso_cat2_oral_cavity_and_airways",
+      "iso_cat2_organ",
+      "iso_cat2_other",
+      "iso_cat2_patient",
+      "iso_cat2_plant",
+      "iso_cat2_plant_infections",
+      "iso_cat2_plants",
+      "iso_cat2_protozoa",
+      "iso_cat2_psychrophilic_lt10_c",
+      "iso_cat2_reptilia",
+      "iso_cat2_saline",
+      "iso_cat2_sulfuric",
+      "iso_cat2_terrestrial",
+      "iso_cat2_thermophilic_gt45_c",
+      "iso_cat2_treatment",
+      "iso_cat2_urogenital_tract",
+      "iso_cat2_waste",
+      "iso_cat2_xerophilic",
+      "iso_cat2_yeast"
     ]
   }
 }

artifacts/eval_report.md CHANGED Viewed

@@ -1,40 +1,40 @@
 # microbe-model — v0 baseline eval report
-_Generated: 2026-04-27T11:37:17+00:00_
 ## TL;DR
-- **`optimal_temperature_c`**: MAE = **3.28** (vs always-predict-mean 5.53, **+41%**)
-- **`optimal_ph`**: MAE = **0.52** (vs always-predict-mean 0.55, **+5%**)
-- **`oxygen_requirement`**: macro-F1 = **0.279** (vs always-predict-majority 0.072, **+289%**)
-- **`salt_tolerance_pct`**: MAE = **2.51** (vs always-predict-mean 2.72, **+8%**)
-Trained on **17,047** strains with **353** genome-derived features. Cross-validation: 5-fold GroupKFold by taxonomic family.
 ## Corpus
-- Total strains in feature table: **17,047**
 - Labeled-strain counts by target:
-  - `optimal_temperature_c`: 17,007
-  - `optimal_ph`: 4,652
-  - `oxygen_requirement`: 10,426
-  - `salt_tolerance_pct`: 4,793
 ## Target distributions
-- `optimal_temperature_c`: n=17,007, mean=31.96, std=8.57, p10=25.00, median=30.00, p90=37.00
-- `optimal_ph`: n=4,652, mean=7.19, std=0.83, p10=6.50, median=7.00, p90=8.00
-- `salt_tolerance_pct`: n=4,793, mean=3.56, std=4.11, p10=0.00, median=2.50, p90=8.00
 - `oxygen_requirement`:
-  - `aerobe`: 4,973
-  - `anaerobe`: 2,120
-  - `facultative anaerobe`: 1,226
-  - `obligate aerobe`: 1,027
-  - `microaerophile`: 889
-  - `obligate anaerobe`: 105
-  - `facultative aerobe`: 83
   - `microaerotolerant`: 2
-  - `aerotolerant`: 1
 ## Per-target results (5-fold GroupKFold by family)
@@ -43,102 +43,102 @@ Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-
 | Target | Task | n labeled | Model metric | Baseline | Improvement |
 |---|---|---|---|---|---|
-| `optimal_temperature_c` | regression | 17,007 | MAE=3.280 | MAE=5.531 | +40.7% |
-| `optimal_ph` | regression | 4,652 | MAE=0.520 | MAE=0.547 | +4.8% |
-| `oxygen_requirement` | classification | 10,426 | F1=0.279 | F1=0.072 | +288.9% |
-| `salt_tolerance_pct` | regression | 4,793 | MAE=2.510 | MAE=2.721 | +7.7% |
 ### `optimal_temperature_c` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | mae = 2.793 | n=13,605 | n=3,402 |
-| 2 | mae = 3.529 | n=13,605 | n=3,402 |
-| 3 | mae = 3.461 | n=13,606 | n=3,401 |
-| 4 | mae = 3.130 | n=13,606 | n=3,401 |
-| 5 | mae = 3.486 | n=13,606 | n=3,401 |
 **Top 10 features for `optimal_temperature_c`:**
-- `ivywrel_frac` — 0.1977
-- `n_predicted_cds` — 0.0250
-- `pos_charged_frac` — 0.0196
-- `aa_frac_E` — 0.0131
-- `aa_frac_C` — 0.0119
-- `codon_TTG` — 0.0099
-- `codon_TGA` — 0.0099
-- `codon_AGG` — 0.0098
-- `tetra_GCAA` — 0.0084
-- `aa_frac_S` — 0.0081
 ### `optimal_ph` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | mae = 0.475 | n=3,721 | n=931 |
-| 2 | mae = 0.613 | n=3,721 | n=931 |
-| 3 | mae = 0.479 | n=3,722 | n=930 |
-| 4 | mae = 0.529 | n=3,722 | n=930 |
-| 5 | mae = 0.505 | n=3,722 | n=930 |
 **Top 10 features for `optimal_ph`:**
-- `neg_charged_frac` — 0.0211
-- `tetra_TGCT` — 0.0141
-- `aa_frac_H` — 0.0120
-- `tetra_CACT` — 0.0097
-- `tetra_AGAC` — 0.0088
-- `tetra_GAGA` — 0.0081
-- `tetra_TCTC` — 0.0077
-- `ivywrel_frac` — 0.0077
-- `aa_frac_E` — 0.0076
-- `tetra_CTCT` — 0.0069
 ### `oxygen_requirement` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | f1_macro = 0.204 | n=8,340 | n=2,085 |
-| 2 | f1_macro = 0.255 | n=8,341 | n=2,085 |
-| 3 | f1_macro = 0.286 | n=8,341 | n=2,085 |
-| 4 | f1_macro = 0.355 | n=8,341 | n=2,085 |
-| 5 | f1_macro = 0.296 | n=8,341 | n=2,085 |
 **Top 10 features for `oxygen_requirement`:**
-- `codon_ATA` — 0.0381
-- `aa_frac_C` — 0.0217
-- `genome_size_nt` — 0.0194
-- `tetra_CAAA` — 0.0173
-- `codon_CAA` — 0.0145
-- `aa_frac_Q` — 0.0124
-- `tetra_TCAA` — 0.0123
-- `n_predicted_cds` — 0.0117
-- `aa_frac_K` — 0.0098
-- `aa_frac_W` — 0.0088
 ### `salt_tolerance_pct` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | mae = 2.557 | n=3,834 | n=959 |
-| 2 | mae = 2.164 | n=3,834 | n=959 |
-| 3 | mae = 2.777 | n=3,834 | n=959 |
-| 4 | mae = 2.654 | n=3,835 | n=958 |
-| 5 | mae = 2.398 | n=3,835 | n=958 |
 **Top 10 features for `salt_tolerance_pct`:**
-- `aa_frac_C` — 0.0492
-- `neg_charged_frac` — 0.0365
-- `tetra_CGTT` — 0.0160
-- `tetra_GAAA` — 0.0123
-- `tetra_GACT` — 0.0110
-- `tetra_AACC` — 0.0091
-- `codon_CGT` — 0.0073
-- `tetra_TGTG` — 0.0071
-- `tetra_AGGA` — 0.0066
-- `tetra_GGAG` — 0.0065
 ## Feature ↔ target correlations (Spearman, top 10)
@@ -148,46 +148,46 @@ Sanity-checks the biology — features known to track each target should appear
 | Feature | Spearman ρ | p-value |
 |---|---|---|
-| `genome_size_nt` | -0.405 | 0.0e+00 |
-| `aa_frac_E` | +0.388 | 0.0e+00 |
-| `n_predicted_cds` | -0.386 | 0.0e+00 |
-| `ivywrel_frac` | +0.320 | 0.0e+00 |
-| `aa_frac_Y` | +0.318 | 0.0e+00 |
-| `aa_frac_W` | -0.309 | 0.0e+00 |
-| `codon_TGG` | -0.309 | 0.0e+00 |
-| `tetra_TCTT` | +0.300 | 0.0e+00 |
-| `pos_charged_frac` | +0.299 | 0.0e+00 |
-| `tetra_AAGA` | +0.298 | 0.0e+00 |
 ### `optimal_ph`
 | Feature | Spearman ρ | p-value |
 |---|---|---|
-| `neg_charged_frac` | +0.308 | 4.5e-103 |
-| `mean_isoelectric_point` | -0.276 | 4.4e-82 |
-| `aa_frac_E` | +0.260 | 6.3e-73 |
-| `ivywrel_frac` | +0.166 | 3.8e-30 |
-| `codon_AAG` | -0.163 | 6.2e-29 |
-| `codon_TGC` | -0.149 | 1.9e-24 |
-| `codon_CGA` | +0.149 | 2.2e-24 |
-| `tetra_CACT` | +0.134 | 3.8e-20 |
-| `tetra_AGTG` | +0.133 | 6.4e-20 |
-| `tetra_ACTC` | +0.119 | 4.0e-16 |
 ### `salt_tolerance_pct`
 | Feature | Spearman ρ | p-value |
 |---|---|---|
-| `tetra_AGTC` | +0.232 | 9.5e-60 |
-| `tetra_GACT` | +0.232 | 1.5e-59 |
-| `neg_charged_frac` | +0.227 | 3.2e-57 |
-| `mean_isoelectric_point` | -0.204 | 2.9e-46 |
-| `ivywrel_frac` | +0.196 | 1.4e-42 |
-| `aa_frac_C` | -0.187 | 7.3e-39 |
-| `tetra_ACTC` | +0.176 | 1.4e-34 |
-| `tetra_GAGT` | +0.173 | 2.5e-33 |
-| `tetra_ATGC` | -0.164 | 3.7e-30 |
-| `tetra_TCAC` | +0.163 | 5.0e-30 |
 ## Per-family error breakdown (regression targets)
@@ -197,61 +197,61 @@ Top 15 most-represented families, MAE per family. Highlights where the model is
 | Family | n | MAE |
 |---|---|---|
-| Streptomycetaceae | 798 | 1.311 |
-| Bacillaceae | 643 | 4.423 |
-| Flavobacteriaceae | 631 | 4.303 |
-| Lactobacillaceae | 471 | 3.389 |
-| Enterobacteriaceae | 439 | 3.719 |
-| Microbacteriaceae | 396 | 2.467 |
-| Pseudomonadaceae | 388 | 2.254 |
-| Roseobacteraceae | 341 | 3.054 |
-| Paenibacillaceae | 319 | 3.319 |
-| Pseudonocardiaceae | 306 | 2.325 |
-| Moraxellaceae | 260 | 4.196 |
-| Sphingomonadaceae | 256 | 1.890 |
-| Streptococcaceae | 251 | 3.510 |
-| Clostridiaceae | 247 | 4.372 |
-| Vibrionaceae | 237 | 3.256 |
 ### `optimal_ph`
 | Family | n | MAE |
 |---|---|---|
-| Flavobacteriaceae | 323 | 0.412 |
-| Bacillaceae | 273 | 0.689 |
-| Roseobacteraceae | 192 | 0.389 |
-| Paenibacillaceae | 126 | 0.442 |
-| Microbacteriaceae | 112 | 0.477 |
-| Sphingobacteriaceae | 100 | 0.395 |
-| Sphingomonadaceae | 96 | 0.387 |
-| Streptomycetaceae | 92 | 0.546 |
-| Pseudonocardiaceae | 85 | 0.555 |
-| Halomonadaceae | 81 | 0.566 |
-| Nocardioidaceae | 74 | 0.495 |
-| Paracoccaceae | 71 | 0.577 |
-| Micrococcaceae | 71 | 0.598 |
-| Erythrobacteraceae | 68 | 0.450 |
-| Alteromonadaceae | 68 | 0.365 |
 ### `salt_tolerance_pct`
 | Family | n | MAE |
 |---|---|---|
-| Flavobacteriaceae | 285 | 1.711 |
-| Streptomycetaceae | 283 | 2.141 |
-| Bacillaceae | 246 | 3.508 |
-| Microbacteriaceae | 140 | 2.795 |
-| Pseudonocardiaceae | 134 | 2.345 |
-| Roseobacteraceae | 134 | 1.794 |
-| Paenibacillaceae | 125 | 2.184 |
-| Pseudomonadaceae | 110 | 4.033 |
-| Vibrionaceae | 99 | 2.488 |
-| Sphingomonadaceae | 92 | 1.809 |
-| Micromonosporaceae | 88 | 1.634 |
-| Micrococcaceae | 85 | 3.008 |
-| Nocardiaceae | 84 | 2.674 |
-| Streptococcaceae | 82 | 1.180 |
-| Lactobacillaceae | 78 | 1.852 |
 ## Known limitations

 # microbe-model — v0 baseline eval report
+_Generated: 2026-05-05T06:56:14+00:00_
 ## TL;DR
+- **`optimal_temperature_c`**: MAE = **2.94** (vs always-predict-mean 4.98, **+41%**)
+- **`optimal_ph`**: MAE = **0.51** (vs always-predict-mean 0.55, **+7%**)
+- **`oxygen_requirement`**: macro-F1 = **0.341** (vs always-predict-majority 0.059, **+479%**)
+- **`salt_tolerance_pct`**: MAE = **2.52** (vs always-predict-mean 2.83, **+11%**)
+Trained on **46,029** strains with **418** features (genome-derived features plus one-hot isolation-source categories). Cross-validation: 5-fold GroupKFold by taxonomic family.
 ## Corpus
+- Total strains in feature table: **46,029**
 - Labeled-strain counts by target:
+  - `optimal_temperature_c`: 45,621
+  - `optimal_ph`: 5,103
+  - `oxygen_requirement`: 21,639
+  - `salt_tolerance_pct`: 6,330
 ## Target distributions
+- `optimal_temperature_c`: n=45,621, mean=32.24, std=7.13, p10=27.50, median=30.00, p90=37.00
+- `optimal_ph`: n=5,103, mean=7.19, std=0.82, p10=6.50, median=7.00, p90=8.00
+- `salt_tolerance_pct`: n=6,330, mean=3.93, std=4.03, p10=0.00, median=3.00, p90=8.00
 - `oxygen_requirement`:
+  - `aerobe`: 7,803
+  - `anaerobe`: 4,193
+  - `microaerophile`: 3,804
+  - `facultative anaerobe`: 3,389
+  - `obligate aerobe`: 2,213
+  - `obligate anaerobe`: 136
+  - `facultative aerobe`: 87
+  - `aerotolerant`: 12
   - `microaerotolerant`: 2
 ## Per-target results (5-fold GroupKFold by family)
 | Target | Task | n labeled | Model metric | Baseline | Improvement |
 |---|---|---|---|---|---|
+| `optimal_temperature_c` | regression | 45,621 | MAE=2.939 | MAE=4.981 | +41.0% |
+| `optimal_ph` | regression | 5,103 | MAE=0.509 | MAE=0.546 | +6.8% |
+| `oxygen_requirement` | classification | 21,639 | F1=0.341 | F1=0.059 | +479.5% |
+| `salt_tolerance_pct` | regression | 6,330 | MAE=2.517 | MAE=2.827 | +11.0% |
 ### `optimal_temperature_c` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | mae = 3.104 | n=36,496 | n=9,125 |
+| 2 | mae = 2.736 | n=36,497 | n=9,124 |
+| 3 | mae = 3.146 | n=36,497 | n=9,124 |
+| 4 | mae = 3.277 | n=36,497 | n=9,124 |
+| 5 | mae = 2.435 | n=36,497 | n=9,124 |
 **Top 10 features for `optimal_temperature_c`:**
+- `ivywrel_frac` — 0.1267
+- `iso_cat2_thermophilic_gt45_c` — 0.0299
+- `n_predicted_cds` — 0.0251
+- `iso_cat2_human` — 0.0209
+- `iso_cat1_infection` — 0.0206
+- `iso_cat2_patient` — 0.0178
+- `aa_frac_C` — 0.0150
+- `genome_size_nt` — 0.0122
+- `aa_frac_D` — 0.0113
+- `codon_AGG` — 0.0109
 ### `optimal_ph` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | mae = 0.456 | n=4,082 | n=1,021 |
+| 2 | mae = 0.626 | n=4,082 | n=1,021 |
+| 3 | mae = 0.528 | n=4,082 | n=1,021 |
+| 4 | mae = 0.480 | n=4,083 | n=1,020 |
+| 5 | mae = 0.454 | n=4,083 | n=1,020 |
 **Top 10 features for `optimal_ph`:**
+- `iso_cat2_acidic` — 0.0522
+- `iso_cat2_alkaline` — 0.0435
+- `neg_charged_frac` — 0.0169
+- `aa_frac_E` — 0.0086
+- `tetra_CTCT` — 0.0084
+- `aa_frac_H` — 0.0080
+- `mean_isoelectric_point` — 0.0076
+- `tetra_CACT` — 0.0074
+- `tetra_AGAC` — 0.0071
+- `tetra_AGGT` — 0.0059
 ### `oxygen_requirement` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | f1_macro = 0.315 | n=17,311 | n=4,328 |
+| 2 | f1_macro = 0.382 | n=17,311 | n=4,326 |
+| 3 | f1_macro = 0.344 | n=17,311 | n=4,328 |
+| 4 | f1_macro = 0.259 | n=17,311 | n=4,328 |
+| 5 | f1_macro = 0.406 | n=17,312 | n=4,327 |
 **Top 10 features for `oxygen_requirement`:**
+- `codon_ATA` — 0.0414
+- `iso_cat1_host` — 0.0260
+- `n_predicted_cds` — 0.0252
+- `aa_frac_C` — 0.0191
+- `iso_cat1_environmental` — 0.0165
+- `codon_CGT` — 0.0148
+- `iso_cat1_engineered` — 0.0138
+- `genome_size_nt` — 0.0113
+- `iso_cat2_human` — 0.0102
+- `codon_TAA` — 0.0090
 ### `salt_tolerance_pct` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | mae = 2.218 | n=5,064 | n=1,266 |
+| 2 | mae = 2.249 | n=5,064 | n=1,266 |
+| 3 | mae = 2.819 | n=5,064 | n=1,266 |
+| 4 | mae = 2.350 | n=5,064 | n=1,266 |
+| 5 | mae = 2.948 | n=5,064 | n=1,266 |
 **Top 10 features for `salt_tolerance_pct`:**
+- `aa_frac_C` — 0.0298
+- `neg_charged_frac` — 0.0278
+- `tetra_ATCC` — 0.0183
+- `iso_cat1_environmental` — 0.0142
+- `tetra_GACT` — 0.0121
+- `iso_cat2_saline` — 0.0114
+- `codon_TGC` — 0.0112
+- `tetra_CGTT` — 0.0094
+- `codon_CGT` — 0.0087
+- `iso_cat2_industrial` — 0.0085
 ## Feature ↔ target correlations (Spearman, top 10)
 | Feature | Spearman ρ | p-value |
 |---|---|---|
+| `genome_size_nt` | -0.493 | 0.0e+00 |
+| `n_predicted_cds` | -0.482 | 0.0e+00 |
+| `aa_frac_P` | -0.391 | 0.0e+00 |
+| `aa_frac_Y` | +0.390 | 0.0e+00 |
+| `tetra_TCTT` | +0.383 | 0.0e+00 |
+| `tetra_TATC` | +0.381 | 0.0e+00 |
+| `tetra_GATA` | +0.381 | 0.0e+00 |
+| `tetra_AAGA` | +0.381 | 0.0e+00 |
+| `tetra_CATA` | +0.380 | 0.0e+00 |
+| `tetra_TATG` | +0.379 | 0.0e+00 |
 ### `optimal_ph`
 | Feature | Spearman ρ | p-value |
 |---|---|---|
+| `neg_charged_frac` | +0.304 | 1.6e-109 |
+| `mean_isoelectric_point` | -0.278 | 1.8e-91 |
+| `aa_frac_E` | +0.256 | 4.5e-77 |
+| `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
+| `ivywrel_frac` | +0.159 | 2.4e-30 |
+| `codon_AAG` | -0.154 | 1.7e-28 |
+| `codon_CGA` | +0.153 | 5.8e-28 |
+| `codon_TGC` | -0.151 | 2.6e-27 |
+| `iso_cat2_saline` | +0.137 | 8.9e-23 |
+| `tetra_CACT` | +0.135 | 4.3e-22 |
 ### `salt_tolerance_pct`
 | Feature | Spearman ρ | p-value |
 |---|---|---|
+| `tetra_AGTC` | +0.270 | 4.0e-106 |
+| `tetra_GACT` | +0.268 | 1.4e-104 |
+| `neg_charged_frac` | +0.221 | 3.9e-71 |
+| `ivywrel_frac` | +0.221 | 8.4e-71 |
+| `aa_frac_C` | -0.202 | 4.7e-59 |
+| `iso_cat1_environmental` | -0.193 | 2.6e-54 |
+| `n_contigs` | -0.181 | 1.0e-47 |
+| `mean_cds_aa_length` | -0.177 | 8.2e-46 |
+| `tetra_ACTC` | +0.176 | 4.5e-45 |
+| `tetra_GAGT` | +0.173 | 1.5e-43 |
 ## Per-family error breakdown (regression targets)
 | Family | n | MAE |
 |---|---|---|
+| Enterobacteriaceae | 2662 | 4.086 |
+| Streptomycetaceae | 2212 | 1.919 |
+| Bacillaceae | 1886 | 3.195 |
+| Lactobacillaceae | 1732 | 3.537 |
+| Pseudomonadaceae | 1621 | 2.576 |
+| Myxococcaceae | 1546 | 0.403 |
+| Streptococcaceae | 1170 | 2.367 |
+| Staphylococcaceae | 1068 | 4.288 |
+| Flavobacteriaceae | 981 | 4.202 |
+| Corynebacteriaceae | 900 | 2.231 |
+| Moraxellaceae | 890 | 3.514 |
+| Paenibacillaceae | 760 | 2.967 |
+| Microbacteriaceae | 734 | 2.482 |
+| Micrococcaceae | 719 | 2.991 |
+| Nocardiaceae | 715 | 2.679 |
 ### `optimal_ph`
 | Family | n | MAE |
 |---|---|---|
+| Flavobacteriaceae | 355 | 0.391 |
+| Bacillaceae | 298 | 0.678 |
+| Roseobacteraceae | 204 | 0.400 |
+| Paenibacillaceae | 139 | 0.435 |
+| Microbacteriaceae | 120 | 0.438 |
+| Sphingobacteriaceae | 114 | 0.353 |
+| Sphingomonadaceae | 102 | 0.346 |
+| Streptomycetaceae | 98 | 0.599 |
+| Pseudonocardiaceae | 93 | 0.495 |
+| Halomonadaceae | 82 | 0.603 |
+| Micrococcaceae | 82 | 0.619 |
+| Nocardioidaceae | 80 | 0.490 |
+| Paracoccaceae | 76 | 0.564 |
+| Alteromonadaceae | 71 | 0.349 |
+| Erythrobacteraceae | 68 | 0.423 |
 ### `salt_tolerance_pct`
 | Family | n | MAE |
 |---|---|---|
+| Streptococcaceae | 340 | 0.891 |
+| Flavobacteriaceae | 312 | 1.834 |
+| Bacillaceae | 310 | 3.417 |
+| Streptomycetaceae | 309 | 2.116 |
+| Pseudomonadaceae | 196 | 4.802 |
+| Corynebacteriaceae | 194 | 3.853 |
+| Vibrionaceae | 173 | 2.872 |
+| Microbacteriaceae | 166 | 2.616 |
+| Paenibacillaceae | 150 | 2.096 |
+| Roseobacteraceae | 143 | 1.556 |
+| Pseudonocardiaceae | 142 | 2.400 |
+| Moraxellaceae | 126 | 2.581 |
+| Nocardiaceae | 125 | 2.899 |
+| Enterococcaceae | 111 | 1.723 |
+| Alcaligenaceae | 104 | 4.454 |
 ## Known limitations

scripts/03_train_baseline.py CHANGED Viewed

@@ -25,6 +25,41 @@ def derive_group(row: pd.Series) -> str:
     return "__unknown__"
 def main() -> None:
     t0 = time.time()
     pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
@@ -32,7 +67,12 @@ def main() -> None:
     df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
     df["group"] = df.apply(derive_group, axis=1)
     feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
     print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
     print(f"Distinct groups: {df['group'].nunique():,}")

     return "__unknown__"
def encode_isolation_categories(
    df: pd.DataFrame,
    *,
    min_count: int = 10,
) -> tuple[pd.DataFrame, list[str]]:
    """One-hot encode isolation_cat1/cat2 (pipe-joined multi-labels).

    Each strain's category cell is "Tag1|Tag2|..." (or NaN). Every cell is split
    once, and one ``iso_<level>_<slug>`` column is created per tag that appears in
    at least ``min_count`` rows. Strains without any isolation info get all-zero
    rows for these features (XGBoost treats this as "no signal" rather than
    missing).

    Edge cases:
      * Empty tokens (from ``""`` cells or a stray leading/trailing ``|``) are
        ignored, so they can never become a feature that matches every
        unlabeled row.
      * Cells that are not strings (NaN/None or anything else) contribute no tags.
      * A tag repeated inside one cell counts once toward ``min_count``.
      * Distinct tags whose slugs coincide share a single column, which is 1 when
        any of those tags is present (no tag is silently discarded).

    Args:
        df: Table that may contain ``isolation_cat1`` / ``isolation_cat2`` columns.
            A level whose column is absent is skipped. The frame is modified in
            place (new columns are appended) and also returned.
        min_count: Minimum number of rows a tag must appear in to get a column.

    Returns:
        ``(df, new_cols)`` where ``new_cols`` lists the added column names, grouped
        by level and ordered by sorted tag within each level.
    """
    # Function-scope imports keep this block self-contained without touching the
    # module's import section; they run once per call instead of once per level.
    import re
    from collections import Counter

    non_slug_chars = re.compile(r"[^a-z0-9]+")
    new_cols: list[str] = []
    for level in ("isolation_cat1", "isolation_cat2"):
        if level not in df.columns:
            continue

        # Tokenise each cell a single time; the per-row sets are reused both for
        # counting and for filling every indicator column below.
        row_tags = [
            frozenset(t for t in cell.split("|") if t) if isinstance(cell, str) else frozenset()
            for cell in df[level]
        ]

        tag_counts: Counter[str] = Counter()
        for tags in row_tags:
            tag_counts.update(tags)
        kept = sorted(t for t, n in tag_counts.items() if n >= min_count)

        # Map each output column to every raw tag that slugifies to it. Dict
        # insertion order follows the sorted tags, so column order is stable.
        prefix = f"iso_{level.split('_')[1]}_"
        tags_by_col: dict[str, set[str]] = {}
        for tag in kept:
            slug = tag.lower().replace(">", "gt").replace("<", "lt")
            slug = non_slug_chars.sub("_", slug).strip("_")
            tags_by_col.setdefault(prefix + slug, set()).add(tag)

        for col, tags in tags_by_col.items():
            df[col] = [int(not tags.isdisjoint(present)) for present in row_tags]
            new_cols.append(col)
    return df, new_cols
 def main() -> None:
     t0 = time.time()
     pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
     df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
     df["group"] = df.apply(derive_group, axis=1)
+    df, iso_cols = encode_isolation_categories(df)
+    print(f"Encoded {len(iso_cols)} isolation-category features "
+          f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
     feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
+    feature_cols = feature_cols + iso_cols
     print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
     print(f"Distinct groups: {df['group'].nunique():,}")

scripts/14_train_combined.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Train v3: hand-crafted features (v1) concatenated with ESM-2 embeddings (v2).
 Tests whether embeddings carry complementary signal to the curated features even
 when they lose head-to-head. Same train/test splits and XGBoost hyperparameters
@@ -14,7 +14,9 @@ Writes:
 """
 from __future__ import annotations
 import time
 import pandas as pd
@@ -33,6 +35,37 @@ def derive_group(row: pd.Series) -> str:
     return "__unknown__"
 def main() -> None:
     t0 = time.time()
     pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
@@ -43,12 +76,16 @@ def main() -> None:
     df = df.merge(embeds, on=["bacdive_id", "genome_accession"], how="inner")
     df["group"] = df.apply(derive_group, axis=1)
     v1_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
     v2_cols = [c for c in embeds.columns if c.startswith("emb_")]
-    feature_cols = v1_cols + v2_cols
     print(f"Training table: {len(df):,} strains × {len(feature_cols)} features "
-          f"({len(v1_cols)} hand-crafted + {len(v2_cols)} embedding dims)")
     print(f"Distinct groups: {df['group'].nunique():,}")
     print()

+"""Train v3: hand-crafted features (v1) + ESM-2 embeddings (v2) + isolation tags.
 Tests whether embeddings carry complementary signal to the curated features even
 when they lose head-to-head. Same train/test splits and XGBoost hyperparameters
 """
 from __future__ import annotations
+import re
 import time
+from collections import Counter
 import pandas as pd
     return "__unknown__"
+def encode_isolation_categories(
+    df: pd.DataFrame,
+    *,
+    min_count: int = 10,
+) -> tuple[pd.DataFrame, list[str]]:
+    """One-hot encode isolation_cat1/cat2 (pipe-joined multi-labels).
+    Column names follow the ``iso_<level>_<slug>`` scheme of the encoder in
+    scripts/03_train_baseline.py. Tags whose slugs collide (e.g. "Body-Site" and
+    "Body Site") are OR-ed into one column instead of the later tag being dropped.
+    Args:
+        df: Training table. A level column that is absent is skipped. The input
+            frame is not modified.
+        min_count: Keep only tags carried by at least this many rows.
+    Returns:
+        ``(frame, new_cols)``: ``df`` plus one 0/1 integer column per kept slug
+        (appended in creation order), and the list of those column names. If
+        nothing is encoded, ``df`` itself is returned with an empty list.
+    """
+    encoded: dict[str, pd.Series] = {}
+    for level in ("isolation_cat1", "isolation_cat2"):
+        if level not in df.columns:
+            continue
+        # Split every cell exactly once; missing cells become an empty tag set.
+        tag_sets = df[level].map(
+            lambda v: frozenset(t for t in v.split("|") if t) if isinstance(v, str) else frozenset()
+        )
+        tag_counts: Counter[str] = Counter()
+        for tags in tag_sets:
+            tag_counts.update(tags)
+        prefix = f"iso_{level.split('_')[1]}"
+        # Sorted iteration keeps the column order deterministic across runs.
+        for tag in sorted(t for t, n in tag_counts.items() if n >= min_count):
+            slug = tag.lower().replace(">", "gt").replace("<", "lt")
+            slug = re.sub(r"[^a-z0-9]+", "_", slug).strip("_")
+            col = f"{prefix}_{slug}"
+            hits = tag_sets.map(lambda tags, t=tag: int(t in tags))
+            encoded[col] = (encoded[col] | hits) if col in encoded else hits
+    if not encoded:
+        return df, []
+    new_cols = list(encoded)
+    # Build all indicator columns in one frame and concat once; inserting them one
+    # by one fragments the DataFrame. Plain arrays avoid any index re-alignment.
+    onehot = pd.DataFrame({c: s.to_numpy() for c, s in encoded.items()}, index=df.index)
+    # Replace (rather than duplicate) same-named columns left over from an earlier call.
+    base = df.drop(columns=[c for c in new_cols if c in df.columns])
+    return pd.concat([base, onehot], axis=1), new_cols
 def main() -> None:
     t0 = time.time()
     pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
     df = df.merge(embeds, on=["bacdive_id", "genome_accession"], how="inner")
     df["group"] = df.apply(derive_group, axis=1)
+    df, iso_cols = encode_isolation_categories(df)
+    print(f"Encoded {len(iso_cols)} isolation-category features "
+          f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
     v1_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
     v2_cols = [c for c in embeds.columns if c.startswith("emb_")]
+    feature_cols = v1_cols + v2_cols + iso_cols
     print(f"Training table: {len(df):,} strains × {len(feature_cols)} features "
+          f"({len(v1_cols)} hand-crafted + {len(v2_cols)} embedding dims + {len(iso_cols)} iso tags)")
     print(f"Distinct groups: {df['group'].nunique():,}")
     print()

scripts/17_reextract_phenotypes.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Rebuild data/bacdive_phenotypes.parquet from cached data/bacdive/*.json.
+Use this after extending extract_phenotypes() to add fields without re-running the
+~30-min API scan. Reads every cached JSON, re-applies the extractor, and overwrites
+the parquet.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import pandas as pd
+from tqdm import tqdm
+from microbe_model import config
+from microbe_model.data.bacdive import extract_phenotypes
+def main() -> None:
+    """Re-run extract_phenotypes() over every cached BacDive JSON and rewrite the parquet.
+    Files that cannot be decoded or parsed are skipped and reported. If nothing could
+    be extracted, the existing parquet is left untouched and the script exits non-zero.
+    """
+    files = sorted(Path(config.BACDIVE_DIR).glob("*.json"))
+    print(f"Re-extracting from {len(files):,} cached JSONs in {config.BACDIVE_DIR}")
+    rows = []
+    skipped: list[str] = []
+    for path in tqdm(files, desc="re-extract", unit="strain"):
+        try:
+            # JSON is UTF-8 by spec; do not depend on the platform default encoding.
+            record = json.loads(path.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            skipped.append(path.name)
+            continue
+        rows.append(extract_phenotypes(record))
+    if skipped:
+        preview = ", ".join(skipped[:5]) + (", ..." if len(skipped) > 5 else "")
+        print(f"Skipped {len(skipped):,} unparseable files: {preview}")
+    out = config.DATA / "bacdive_phenotypes.parquet"
+    if not rows:
+        # An empty cache dir (or a wrong BACDIVE_DIR) must not wipe the existing table.
+        raise SystemExit(f"No records extracted; leaving {out} untouched")
+    df = pd.DataFrame(rows)
+    df.to_parquet(out, index=False)
+    print(f"\nWrote {len(df):,} strains to {out}")
+    print("Field coverage:")
+    for col in df.columns:
+        n = df[col].notna().sum()
+        print(f"  {col:30s} {n:>6,} / {len(df):,} ({100 * n / max(1, len(df)):.1f}%)")
+if __name__ == "__main__":
+    main()

scripts/18_resolve_species_to_genome.py ADDED Viewed

	@@ -0,0 +1,110 @@

+"""Resolve BacDive species names → NCBI representative genome accessions.
+Targets the phenotype-labeled strains that lack a `genome_accession` in BacDive.
+Many of those species DO have a sequenced genome — BacDive just doesn't link to it.
+We query NCBI Datasets v2 for one RefSeq assembly per unique species name and write
+the {species: accession} map so the next pipeline step can pull the FASTAs.
+Output: data/species_to_genome.parquet  (species, ncbi_accession, status)
+Resumable: re-runs skip species already present in the output.
+"""
+from __future__ import annotations
+import time
+from urllib.parse import quote
+import pandas as pd
+import requests
+from tqdm import tqdm
+from microbe_model import config
+# NCBI Datasets v2 genome dataset-report endpoint; fetch_one() fills {taxon} with the
+# URL-quoted species name.
+API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
+# Seconds slept before every request; shorter when an NCBI API key is configured.
+# NOTE(review): presumably sized to NCBI's per-second request limits with/without a
+# key — confirm against the current NCBI usage policy.
+RATE_LIMIT_S = 0.11 if config.NCBI_API_KEY else 0.36
+# Output parquet (species, ncbi_accession, status); main() also reads it back to resume.
+OUT_PATH = config.DATA / "species_to_genome.parquet"
+def fetch_one(species: str, session: requests.Session) -> tuple[str | None, str]:
+    """Return (accession, status) for a species. status ∈ {hit, miss, error}.
+    Queries the dataset-report endpoint for one RefSeq assembly of ``species``.
+    Makes up to three attempts, backing off 1 s then 2 s between them.
+    Args:
+        species: Species name; percent-encoded as a single URL path segment.
+        session: Shared ``requests`` session used for the GET.
+    Returns:
+        ``(accession, "hit")`` when the first report carries an accession.
+        ``(None, "miss")`` on HTTP 404, an empty report list, or a report without
+        an accession. ``(None, "error")`` when every attempt failed (429/502/503,
+        another HTTP error, a transport error, or a non-JSON body).
+    """
+    headers: dict[str, str] = {"Accept": "application/json"}
+    if config.NCBI_API_KEY:
+        headers["api-key"] = config.NCBI_API_KEY
+    params = {"filters.assembly_source": "RefSeq", "page_size": 1}
+    # safe="" so a "/" inside the name cannot split the path segment.
+    url = API_URL.format(taxon=quote(species, safe=""))
+    for attempt in range(3):
+        if attempt:
+            # Exponential back-off before a retry only, never after the last attempt.
+            time.sleep(2 ** (attempt - 1))
+        time.sleep(RATE_LIMIT_S)
+        try:
+            resp = session.get(url, headers=headers, params=params, timeout=30)
+            if resp.status_code == 404:
+                return None, "miss"
+            if resp.status_code in (429, 502, 503):
+                continue
+            resp.raise_for_status()
+            reports = resp.json().get("reports", [])
+        except (requests.RequestException, ValueError):
+            # ValueError covers JSON decode failures on requests versions where that
+            # error is not a RequestException subclass.
+            continue
+        acc = reports[0].get("accession") if reports else None
+        return (acc, "hit") if acc else (None, "miss")
+    return None, "error"
+def main() -> None:
+    """Resolve every labeled, genome-less species to an NCBI accession and save the map.
+    Reads bacdive_phenotypes.parquet, selects unique binomial species names that have
+    at least one phenotype label but no genome accession, and calls fetch_one() for
+    each. Results are written to OUT_PATH, with a checkpoint every 200 rows and a
+    final write even on interrupt. On resume, cached "hit"/"miss" rows are reused;
+    cached "error" rows are fetched again because those failures may be transient.
+    """
+    df = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
+    has_label = df[
+        ["optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"]
+    ].notna().any(axis=1)
+    no_genome = df["genome_accession"].isna()
+    # Requiring a space keeps only names with at least two words.
+    valid_species = df["species"].notna() & df["species"].str.contains(" ", na=False)
+    gap_species = sorted(df[has_label & no_genome & valid_species]["species"].unique())
+    print(f"unique species to resolve: {len(gap_species):,}")
+    # Resume from prior partial run
+    done: dict[str, tuple[str | None, str]] = {}
+    n_retry = 0
+    if OUT_PATH.exists():
+        for rec in pd.read_parquet(OUT_PATH).to_dict("records"):
+            if rec["status"] == "error":
+                n_retry += 1
+                continue
+            done[rec["species"]] = (rec["ncbi_accession"], rec["status"])
+        print(f"resuming — {len(done):,} already cached")
+        if n_retry:
+            print(f"retrying {n_retry:,} previously errored species")
+    todo = [s for s in gap_species if s not in done]
+    print(f"to fetch: {len(todo):,}")
+    # Explicit columns keep the schema stable even when there are no rows at all.
+    columns = ["species", "ncbi_accession", "status"]
+    rows: list[dict] = [
+        {"species": sp, "ncbi_accession": acc, "status": st}
+        for sp, (acc, st) in done.items()
+    ]
+    with requests.Session() as session:
+        try:
+            for sp in tqdm(todo, desc="resolving", unit="species"):
+                acc, status = fetch_one(sp, session)
+                rows.append({"species": sp, "ncbi_accession": acc, "status": status})
+                # Periodic checkpoint every 200 species so an interrupt doesn't lose progress
+                if len(rows) % 200 == 0:
+                    pd.DataFrame(rows, columns=columns).to_parquet(OUT_PATH, index=False)
+        finally:
+            out = pd.DataFrame(rows, columns=columns)
+            out.to_parquet(OUT_PATH, index=False)
+    n_hit = int((out["status"] == "hit").sum())
+    print(f"\nwrote {len(out):,} rows to {OUT_PATH}")
+    print(f"  hit:   {n_hit:,} ({100 * n_hit / max(1, len(out)):.0f}%)")
+    print(f"  miss:  {(out['status'] == 'miss').sum():,}")
+    print(f"  error: {(out['status'] == 'error').sum():,}")
+if __name__ == "__main__":
+    main()

scripts/19_featurize_resolved.py ADDED Viewed

	@@ -0,0 +1,106 @@

+"""Deduplicated featurize for species-resolved genomes.
+When many BacDive strains share a single species-level representative genome (the
+common case after scripts/18), naively running scripts/02 re-downloads + re-runs
+pyrodigal on the same FASTA per-strain. This script downloads each unique accession
+once, then replicates the resulting feature dict across all bacdive_ids that share it.
+Resumable via data/features.jsonl (skips bacdive_ids already in the log).
+Usage:
+    uv run python scripts/19_featurize_resolved.py --workers 7
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import time
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+import pandas as pd
+from tqdm import tqdm
+from microbe_model import config
+from microbe_model.pipeline import _load_done_ids, _process_one
+def main() -> None:
+    """Featurize each unique genome accession once and fan the result out to its strains.
+    Selects strains with at least one phenotype label and a genome accession, drops
+    those already logged in data/features.jsonl, and groups the rest by accession.
+    One task per accession goes to a process pool. Each successful feature dict is
+    appended to the log once per sibling bacdive_id. The log is then materialized
+    to data/features.parquet, keeping the last row per bacdive_id.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workers", type=int, default=max(1, (os.cpu_count() or 4) - 1))
+    parser.add_argument("--max-accessions", type=int, default=None,
+                        help="Cap how many unique accessions to process (debug).")
+    args = parser.parse_args()
+    pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
+    # Training-ready pool: any phenotype label + a genome accession
+    label_cols = list(config.PHENOTYPE_TARGETS.keys())
+    has_label = pheno[label_cols].notna().any(axis=1)
+    has_genome = pheno["genome_accession"].notna()
+    ready = pheno[has_label & has_genome].copy()
+    ready["bacdive_id"] = ready["bacdive_id"].astype(int)
+    ready["genome_accession"] = ready["genome_accession"].astype(str)
+    out_path = config.DATA / "features.jsonl"
+    done_ids = _load_done_ids(out_path)
+    todo = ready[~ready["bacdive_id"].isin(done_ids)]
+    print(f"strains in pool: {len(ready):,}")
+    # Count only pool members already in the log; the log may hold ids outside the pool.
+    print(f"  already featurized: {len(ready) - len(todo):,}")
+    print(f"  remaining: {len(todo):,}")
+    # Group remaining strains by accession
+    by_acc = todo.groupby("genome_accession")["bacdive_id"].apply(list).to_dict()
+    accessions = sorted(by_acc)
+    if args.max_accessions:
+        accessions = accessions[: args.max_accessions]
+    print(f"unique accessions to download: {len(accessions):,}")
+    print(f"  avg strains per accession: {sum(len(by_acc[a]) for a in accessions) / max(1, len(accessions)):.1f}")
+    print(f"workers: {args.workers}\n")
+    # Featurize each accession once; the worker tags the result with the *first* bacdive_id
+    # of that accession's strain group. We then replicate the feature dict to all sibling
+    # bacdive_ids before writing.
+    rep_tasks = [(by_acc[acc][0], acc) for acc in accessions]
+    n_success = 0
+    n_failed = 0
+    n_replicated_rows = 0
+    start = time.time()
+    with open(out_path, "a", encoding="utf-8") as fh, \
+         ProcessPoolExecutor(max_workers=args.workers) as pool, \
+         tqdm(total=len(rep_tasks), desc="featurize", unit="genome") as bar:
+        futures = {pool.submit(_process_one, t): t for t in rep_tasks}
+        for fut in as_completed(futures):
+            rep_id, acc = futures[fut]
+            bar.update(1)
+            try:
+                feats = fut.result()
+            except Exception as exc:
+                # Best-effort: one bad genome must not abort the run, but say why it failed.
+                feats = None
+                bar.write(f"featurize failed for {acc} (bacdive_id={rep_id}): {exc!r}")
+            if not feats:
+                n_failed += 1
+                continue
+            n_success += 1
+            for bid in by_acc[acc]:
+                row = dict(feats)
+                row["bacdive_id"] = bid
+                row["genome_accession"] = acc
+                fh.write(json.dumps(row) + "\n")
+                n_replicated_rows += 1
+            # Flush per genome so an interrupt loses at most the in-flight group.
+            fh.flush()
+            bar.set_postfix(genomes_ok=n_success, rows=n_replicated_rows)
+    print(f"\nfinished in {(time.time() - start) / 60:.1f} min")
+    print(f"  unique genomes featurized: {n_success:,}/{len(rep_tasks):,}")
+    print(f"  genomes failed or empty: {n_failed:,}")
+    print(f"  feature rows written: {n_replicated_rows:,}")
+    # Materialize parquet
+    if out_path.stat().st_size == 0:
+        # Nothing has ever been logged: there is no bacdive_id column to dedupe on.
+        print("  feature log is empty; skipping parquet materialization")
+        return
+    df = pd.read_json(out_path, lines=True)
+    df = df.drop_duplicates(subset=["bacdive_id"], keep="last")
+    parquet = config.DATA / "features.parquet"
+    df.to_parquet(parquet, index=False)
+    print(f"  wrote {len(df):,} rows to {parquet}")
+if __name__ == "__main__":
+    main()

src/microbe_model/data/bacdive.py CHANGED Viewed

@@ -102,12 +102,16 @@ def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
       - Physiology and metabolism → oxygen tolerance[]
       - Physiology and metabolism → halophily[]
       - Sequence information → Genome sequences[].INSDC accession
     """
     general = record.get("General") or {}
     taxon = record.get("Name and taxonomic classification") or {}
     culture = record.get("Culture and growth conditions") or {}
     physio = record.get("Physiology and metabolism") or {}
     seq = record.get("Sequence information") or {}
     out: dict[str, Any] = {
         "bacdive_id": general.get("BacDive-ID"),
@@ -120,10 +124,35 @@ def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
         "oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"),
         "salt_tolerance_pct": _derive_salt(physio.get("halophily")),
         "genome_accession": _first_genome_accession(seq.get("Genome sequences")),
     }
     return out
 def _as_list(x: Any) -> list:
     if x is None:
         return []

       - Physiology and metabolism → oxygen tolerance[]
       - Physiology and metabolism → halophily[]
       - Sequence information → Genome sequences[].INSDC accession
+      - Isolation, sampling and environmental information → isolation source categories[].Cat{1,2,3}
     """
     general = record.get("General") or {}
     taxon = record.get("Name and taxonomic classification") or {}
     culture = record.get("Culture and growth conditions") or {}
     physio = record.get("Physiology and metabolism") or {}
     seq = record.get("Sequence information") or {}
+    iso = record.get("Isolation, sampling and environmental information") or {}
+    iso_cats = _collect_isolation_categories(iso.get("isolation source categories"))
     out: dict[str, Any] = {
         "bacdive_id": general.get("BacDive-ID"),
         "oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"),
         "salt_tolerance_pct": _derive_salt(physio.get("halophily")),
         "genome_accession": _first_genome_accession(seq.get("Genome sequences")),
+        "isolation_cat1": iso_cats["cat1"],
+        "isolation_cat2": iso_cats["cat2"],
+        "isolation_cat3": iso_cats["cat3"],
     }
     return out
+def _collect_isolation_categories(raw: Any) -> dict[str, str | None]:
+    """Flatten BacDive's `isolation source categories` into 3 pipe-joined string fields.
+    A strain commonly has multiple parallel category descriptions (e.g., #Host=Human AND
+    #Host Body Product=Blood). All unique values per level are collected into a sorted,
+    pipe-joined string so downstream code can split & one-hot.
+    Args:
+        raw: The `isolation source categories` value; normalized with _as_list().
+            Entries that are not dicts are ignored.
+    Returns:
+        Dict with keys ``cat1``/``cat2``/``cat3``; each is the pipe-joined tags of
+        that level, or None when the level has no tags.
+    """
+    levels = ("Cat1", "Cat2", "Cat3")
+    cats: dict[str, set[str]] = {level: set() for level in levels}
+    for entry in _as_list(raw):
+        if not isinstance(entry, dict):
+            continue
+        for level in levels:
+            value = entry.get(level)
+            if not isinstance(value, str):
+                continue
+            # Strip whitespace first so a leading '#' is removed even after a space.
+            tag = value.strip().lstrip("#").strip()
+            # A bare "#" or blank value must not become an empty tag in the joined field.
+            if tag:
+                cats[level].add(tag)
+    return {level.lower(): "|".join(sorted(tags)) or None for level, tags in cats.items()}
 def _as_list(x: Any) -> list:
     if x is None:
         return []