Miyu Horiuchi Claude Opus 4.7 (1M context) committed on
Commit
f0f1d93
·
1 Parent(s): c3d9a06

Expand training corpus to 46K strains: species-name → NCBI genome + isolation features

Browse files

Two layered improvements over the 17K-strain genome-only baseline:

1. Isolation-source features (Cat1/Cat2 from BacDive's `isolation source categories`)
- Re-extracted from cached BacDive JSON via scripts/17 (no re-download)
- One-hot encoded in scripts/03 (and mirrored in scripts/14 for the v3 trainer)

2. Species-name → NCBI representative genome resolution
- scripts/18 queries NCBI Datasets v2 /genome/taxon/ for each unique species
name in the gap (strains that had phenotype labels but no genome accession in BacDive).
5,393 of 7,905 species resolved (68%).
- scripts/19 performs deduplicated featurization: each unique accession is downloaded once,
and its feature dict is replicated to all sibling strains (avoids ~5x redundant downloads).
- 5,254 of 5,283 unique genomes featurized (99.5% success).

Result: 17,054 → 46,058 training-ready strains.

Per-target metrics (vs original v0 genome-only):
optimal_temperature_c MAE 3.28 → 2.94 (-10.4%)
oxygen_requirement F1 0.279 → 0.341 (+22.3%)
optimal_ph MAE 0.52 → 0.51 ( -2.2%, label-limited)
salt_tolerance_pct MAE 2.51 → 2.52 ( +0.3%, label-limited)

pH and salt did not lift because BacDive label coverage for those targets barely
grew (1.1× and 1.3× vs 2.7× and 2.1× for T_opt and oxygen respectively).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

artifacts/baseline_results.json CHANGED
@@ -1,274 +1,274 @@
1
  {
2
  "optimal_temperature_c": {
3
  "task": "regression",
4
- "mean_metric": 3.2796768780969154,
5
  "folds": [
6
  {
7
  "target": "optimal_temperature_c",
8
  "task": "regression",
9
  "metric_name": "mae",
10
- "value": 2.7927897739802856,
11
- "n_train": 13605,
12
- "n_test": 3402
13
  },
14
  {
15
  "target": "optimal_temperature_c",
16
  "task": "regression",
17
  "metric_name": "mae",
18
- "value": 3.528644174915563,
19
- "n_train": 13605,
20
- "n_test": 3402
21
  },
22
  {
23
  "target": "optimal_temperature_c",
24
  "task": "regression",
25
  "metric_name": "mae",
26
- "value": 3.460899077440423,
27
- "n_train": 13606,
28
- "n_test": 3401
29
  },
30
  {
31
  "target": "optimal_temperature_c",
32
  "task": "regression",
33
  "metric_name": "mae",
34
- "value": 3.1300457459483138,
35
- "n_train": 13606,
36
- "n_test": 3401
37
  },
38
  {
39
  "target": "optimal_temperature_c",
40
  "task": "regression",
41
  "metric_name": "mae",
42
- "value": 3.4860056181999908,
43
- "n_train": 13606,
44
- "n_test": 3401
45
  }
46
  ],
47
  "top_features": {
48
- "ivywrel_frac": 0.19766514897346496,
49
- "n_predicted_cds": 0.02504703141748905,
50
- "pos_charged_frac": 0.01956883855164051,
51
- "aa_frac_E": 0.013097247015684843,
52
- "aa_frac_C": 0.011947918310761451,
53
- "codon_TTG": 0.009919821098446847,
54
- "codon_TGA": 0.00989874340593815,
55
- "codon_AGG": 0.009789730794727803,
56
- "tetra_GCAA": 0.008400670997798444,
57
- "aa_frac_S": 0.008075917325913905,
58
- "tetra_CTAA": 0.008004277455620467,
59
- "tetra_CTAG": 0.0076253050938248634,
60
- "tetra_AGGC": 0.007276883872691542,
61
- "mean_isoelectric_point": 0.00710052065551281,
62
- "aa_frac_Y": 0.006824395339936018,
63
- "aa_frac_L": 0.006632247474044561,
64
- "neg_charged_frac": 0.006289067305624485,
65
- "aa_frac_Q": 0.005970604927279055,
66
- "tetra_TTTA": 0.005658498092088848,
67
- "aa_frac_D": 0.005623110197484493
68
  }
69
  },
70
  "optimal_ph": {
71
  "task": "regression",
72
- "mean_metric": 0.5202253475923528,
73
  "folds": [
74
  {
75
  "target": "optimal_ph",
76
  "task": "regression",
77
  "metric_name": "mae",
78
- "value": 0.4754666561983069,
79
- "n_train": 3721,
80
- "n_test": 931
81
  },
82
  {
83
  "target": "optimal_ph",
84
  "task": "regression",
85
  "metric_name": "mae",
86
- "value": 0.6132477354670441,
87
- "n_train": 3721,
88
- "n_test": 931
89
  },
90
  {
91
  "target": "optimal_ph",
92
  "task": "regression",
93
  "metric_name": "mae",
94
- "value": 0.4791252064653622,
95
- "n_train": 3722,
96
- "n_test": 930
97
  },
98
  {
99
  "target": "optimal_ph",
100
  "task": "regression",
101
  "metric_name": "mae",
102
- "value": 0.5286309639510288,
103
- "n_train": 3722,
104
- "n_test": 930
105
  },
106
  {
107
  "target": "optimal_ph",
108
  "task": "regression",
109
  "metric_name": "mae",
110
- "value": 0.504656175880022,
111
- "n_train": 3722,
112
- "n_test": 930
113
  }
114
  ],
115
  "top_features": {
116
- "neg_charged_frac": 0.021104557998478414,
117
- "tetra_TGCT": 0.014129164954647421,
118
- "aa_frac_H": 0.01198372757062316,
119
- "tetra_CACT": 0.00974916813429445,
120
- "tetra_AGAC": 0.008753480762243271,
121
- "tetra_GAGA": 0.00805287561379373,
122
- "tetra_TCTC": 0.007691349275410176,
123
- "ivywrel_frac": 0.0076533622108399865,
124
- "aa_frac_E": 0.007637183275073766,
125
- "tetra_CTCT": 0.006926810601726174,
126
- "codon_AAG": 0.006873661652207375,
127
- "n_predicted_cds": 0.006350988755002618,
128
- "mean_isoelectric_point": 0.005934224603697658,
129
- "tetra_AGGT": 0.005835541151463985,
130
- "tetra_GTGA": 0.0058299127034842965,
131
- "tetra_CCTG": 0.005803165677934885,
132
- "tetra_TGTT": 0.005650706822052598,
133
- "tetra_TGGT": 0.005588132468983531,
134
- "tetra_ACGA": 0.0055527932243421676,
135
- "tetra_TGAG": 0.005398272210732103
136
  }
137
  },
138
  "oxygen_requirement": {
139
  "task": "classification",
140
- "mean_metric": 0.2790834034149142,
141
  "folds": [
142
  {
143
  "target": "oxygen_requirement",
144
  "task": "classification",
145
  "metric_name": "f1_macro",
146
- "value": 0.203774595737485,
147
- "n_train": 8340,
148
- "n_test": 2085
149
  },
150
  {
151
  "target": "oxygen_requirement",
152
  "task": "classification",
153
  "metric_name": "f1_macro",
154
- "value": 0.25451263916216754,
155
- "n_train": 8341,
156
- "n_test": 2085
157
  },
158
  {
159
  "target": "oxygen_requirement",
160
  "task": "classification",
161
  "metric_name": "f1_macro",
162
- "value": 0.28578249475572015,
163
- "n_train": 8341,
164
- "n_test": 2085
165
  },
166
  {
167
  "target": "oxygen_requirement",
168
  "task": "classification",
169
  "metric_name": "f1_macro",
170
- "value": 0.3554691036453791,
171
- "n_train": 8341,
172
- "n_test": 2085
173
  },
174
  {
175
  "target": "oxygen_requirement",
176
  "task": "classification",
177
  "metric_name": "f1_macro",
178
- "value": 0.29587818377381925,
179
- "n_train": 8341,
180
- "n_test": 2085
181
  }
182
  ],
183
  "top_features": {
184
- "codon_ATA": 0.03811337612569332,
185
- "aa_frac_C": 0.021676772832870485,
186
- "genome_size_nt": 0.019434500206261873,
187
- "tetra_CAAA": 0.017337634600698947,
188
- "codon_CAA": 0.014458012580871583,
189
- "aa_frac_Q": 0.012433571089059115,
190
- "tetra_TCAA": 0.01232931949198246,
191
- "n_predicted_cds": 0.011694983579218388,
192
- "aa_frac_K": 0.009837790858000517,
193
- "aa_frac_W": 0.008752449508756399,
194
- "codon_TGG": 0.008257251046597958,
195
- "aa_frac_M": 0.007389512192457914,
196
- "aa_frac_Y": 0.007202606648206711,
197
- "codon_ATG": 0.007109537813812494,
198
- "aa_frac_L": 0.006909955851733684,
199
- "codon_CGT": 0.006404342176392674,
200
- "codon_CAT": 0.005392152117565274,
201
- "tetra_AGGA": 0.0053115392103791235,
202
- "aa_frac_H": 0.005291619151830673,
203
- "ivywrel_frac": 0.004459869768470526
204
  }
205
  },
206
  "salt_tolerance_pct": {
207
  "task": "regression",
208
- "mean_metric": 2.5102975998466475,
209
  "folds": [
210
  {
211
  "target": "salt_tolerance_pct",
212
  "task": "regression",
213
  "metric_name": "mae",
214
- "value": 2.557428962086115,
215
- "n_train": 3834,
216
- "n_test": 959
217
  },
218
  {
219
  "target": "salt_tolerance_pct",
220
  "task": "regression",
221
  "metric_name": "mae",
222
- "value": 2.1644325020106914,
223
- "n_train": 3834,
224
- "n_test": 959
225
  },
226
  {
227
  "target": "salt_tolerance_pct",
228
  "task": "regression",
229
  "metric_name": "mae",
230
- "value": 2.777298618858686,
231
- "n_train": 3834,
232
- "n_test": 959
233
  },
234
  {
235
  "target": "salt_tolerance_pct",
236
  "task": "regression",
237
  "metric_name": "mae",
238
- "value": 2.65442228433608,
239
- "n_train": 3835,
240
- "n_test": 958
241
  },
242
  {
243
  "target": "salt_tolerance_pct",
244
  "task": "regression",
245
  "metric_name": "mae",
246
- "value": 2.3979056319416676,
247
- "n_train": 3835,
248
- "n_test": 958
249
  }
250
  ],
251
  "top_features": {
252
- "aa_frac_C": 0.04922681804746389,
253
- "neg_charged_frac": 0.03649095520377159,
254
- "tetra_CGTT": 0.015985671523958446,
255
- "tetra_GAAA": 0.012250523548573256,
256
- "tetra_GACT": 0.011047882377170026,
257
- "tetra_AACC": 0.009099948103539646,
258
- "codon_CGT": 0.007261711079627276,
259
- "tetra_TGTG": 0.007125534303486347,
260
- "tetra_AGGA": 0.006638824963010848,
261
- "tetra_GGAG": 0.006525454460643232,
262
- "tetra_CACA": 0.0065232118591666225,
263
- "tetra_AGTC": 0.0065024693030864,
264
- "tetra_GTGG": 0.0063626600429415705,
265
- "codon_TGG": 0.006061221100389957,
266
- "tetra_GTAT": 0.00588638405315578,
267
- "tetra_CGTA": 0.00577605227008462,
268
- "tetra_CTGA": 0.005612925114110112,
269
- "codon_TGC": 0.005607163021340966,
270
- "tetra_TACT": 0.005531361280009151,
271
- "codon_GAT": 0.005294737778604031
272
  }
273
  },
274
  "__meta__": {
@@ -625,7 +625,72 @@
625
  "codon_TTA",
626
  "codon_TTC",
627
  "codon_TTG",
628
- "codon_TTT"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
  ]
630
  }
631
  }
 
1
  {
2
  "optimal_temperature_c": {
3
  "task": "regression",
4
+ "mean_metric": 2.939444159350111,
5
  "folds": [
6
  {
7
  "target": "optimal_temperature_c",
8
  "task": "regression",
9
  "metric_name": "mae",
10
+ "value": 3.103597222252415,
11
+ "n_train": 36496,
12
+ "n_test": 9125
13
  },
14
  {
15
  "target": "optimal_temperature_c",
16
  "task": "regression",
17
  "metric_name": "mae",
18
+ "value": 2.7356862682357583,
19
+ "n_train": 36497,
20
+ "n_test": 9124
21
  },
22
  {
23
  "target": "optimal_temperature_c",
24
  "task": "regression",
25
  "metric_name": "mae",
26
+ "value": 3.145843773419164,
27
+ "n_train": 36497,
28
+ "n_test": 9124
29
  },
30
  {
31
  "target": "optimal_temperature_c",
32
  "task": "regression",
33
  "metric_name": "mae",
34
+ "value": 3.2767152481045656,
35
+ "n_train": 36497,
36
+ "n_test": 9124
37
  },
38
  {
39
  "target": "optimal_temperature_c",
40
  "task": "regression",
41
  "metric_name": "mae",
42
+ "value": 2.43537828473865,
43
+ "n_train": 36497,
44
+ "n_test": 9124
45
  }
46
  ],
47
  "top_features": {
48
+ "ivywrel_frac": 0.12668818831443787,
49
+ "iso_cat2_thermophilic_gt45_c": 0.029868930205702783,
50
+ "n_predicted_cds": 0.025075340643525124,
51
+ "iso_cat2_human": 0.020858772844076157,
52
+ "iso_cat1_infection": 0.020640516839921474,
53
+ "iso_cat2_patient": 0.017751351464539766,
54
+ "aa_frac_C": 0.015003016591072083,
55
+ "genome_size_nt": 0.012203263118863106,
56
+ "aa_frac_D": 0.011290411837399006,
57
+ "codon_AGG": 0.010900856088846922,
58
+ "iso_cat1_environmental": 0.010176281817257405,
59
+ "tetra_GCCT": 0.009658925677649676,
60
+ "tetra_TAGT": 0.00883282758295536,
61
+ "aa_frac_Y": 0.008421392692252994,
62
+ "aa_frac_E": 0.007741594593971968,
63
+ "tetra_TTCC": 0.007376640872098506,
64
+ "mean_isoelectric_point": 0.007058459660038352,
65
+ "tetra_CTAA": 0.0070426638238132,
66
+ "iso_cat2_built_environment": 0.006164434866514057,
67
+ "iso_cat2_industrial": 0.005895084328949451
68
  }
69
  },
70
  "optimal_ph": {
71
  "task": "regression",
72
+ "mean_metric": 0.5090253015368336,
73
  "folds": [
74
  {
75
  "target": "optimal_ph",
76
  "task": "regression",
77
  "metric_name": "mae",
78
+ "value": 0.45639293885487886,
79
+ "n_train": 4082,
80
+ "n_test": 1021
81
  },
82
  {
83
  "target": "optimal_ph",
84
  "task": "regression",
85
  "metric_name": "mae",
86
+ "value": 0.6262803867911733,
87
+ "n_train": 4082,
88
+ "n_test": 1021
89
  },
90
  {
91
  "target": "optimal_ph",
92
  "task": "regression",
93
  "metric_name": "mae",
94
+ "value": 0.528334212326513,
95
+ "n_train": 4082,
96
+ "n_test": 1021
97
  },
98
  {
99
  "target": "optimal_ph",
100
  "task": "regression",
101
  "metric_name": "mae",
102
+ "value": 0.48048674237494376,
103
+ "n_train": 4083,
104
+ "n_test": 1020
105
  },
106
  {
107
  "target": "optimal_ph",
108
  "task": "regression",
109
  "metric_name": "mae",
110
+ "value": 0.4536322273366591,
111
+ "n_train": 4083,
112
+ "n_test": 1020
113
  }
114
  ],
115
  "top_features": {
116
+ "iso_cat2_acidic": 0.05219607315957546,
117
+ "iso_cat2_alkaline": 0.043521419167518616,
118
+ "neg_charged_frac": 0.016875072754919528,
119
+ "aa_frac_E": 0.008599728252738715,
120
+ "tetra_CTCT": 0.008368687890470027,
121
+ "aa_frac_H": 0.008003219496458769,
122
+ "mean_isoelectric_point": 0.007599354162812233,
123
+ "tetra_CACT": 0.007427609874866903,
124
+ "tetra_AGAC": 0.007137532206252217,
125
+ "tetra_AGGT": 0.005891842069104314,
126
+ "tetra_GACT": 0.005873983446508646,
127
+ "tetra_GAGA": 0.005548427533358336,
128
+ "tetra_GTCT": 0.005475769587792456,
129
+ "codon_GAA": 0.005408304557204246,
130
+ "n_predicted_cds": 0.005280579440295696,
131
+ "iso_cat2_plants": 0.005045945569872856,
132
+ "tetra_TTGA": 0.004973787232302129,
133
+ "codon_AAG": 0.0048154488438740374,
134
+ "tetra_ACGA": 0.004731484339572489,
135
+ "aa_frac_Y": 0.0046834095381200315
136
  }
137
  },
138
  "oxygen_requirement": {
139
  "task": "classification",
140
+ "mean_metric": 0.34127360853732613,
141
  "folds": [
142
  {
143
  "target": "oxygen_requirement",
144
  "task": "classification",
145
  "metric_name": "f1_macro",
146
+ "value": 0.31515576471296236,
147
+ "n_train": 17311,
148
+ "n_test": 4328
149
  },
150
  {
151
  "target": "oxygen_requirement",
152
  "task": "classification",
153
  "metric_name": "f1_macro",
154
+ "value": 0.38181774862206597,
155
+ "n_train": 17311,
156
+ "n_test": 4326
157
  },
158
  {
159
  "target": "oxygen_requirement",
160
  "task": "classification",
161
  "metric_name": "f1_macro",
162
+ "value": 0.34440677114867413,
163
+ "n_train": 17311,
164
+ "n_test": 4328
165
  },
166
  {
167
  "target": "oxygen_requirement",
168
  "task": "classification",
169
  "metric_name": "f1_macro",
170
+ "value": 0.25943178539399836,
171
+ "n_train": 17311,
172
+ "n_test": 4328
173
  },
174
  {
175
  "target": "oxygen_requirement",
176
  "task": "classification",
177
  "metric_name": "f1_macro",
178
+ "value": 0.40555597280892947,
179
+ "n_train": 17312,
180
+ "n_test": 4327
181
  }
182
  ],
183
  "top_features": {
184
+ "codon_ATA": 0.0414140235632658,
185
+ "iso_cat1_host": 0.02601129524409771,
186
+ "n_predicted_cds": 0.025201210007071494,
187
+ "aa_frac_C": 0.019132474437355995,
188
+ "iso_cat1_environmental": 0.01645018421113491,
189
+ "codon_CGT": 0.014759847987443208,
190
+ "iso_cat1_engineered": 0.01378793753683567,
191
+ "genome_size_nt": 0.011305144988000393,
192
+ "iso_cat2_human": 0.010168002359569073,
193
+ "codon_TAA": 0.00900037819519639,
194
+ "aa_frac_V": 0.008459322061389685,
195
+ "aa_frac_Y": 0.008259046915918588,
196
+ "aa_frac_L": 0.0072497081011533735,
197
+ "tetra_CTGG": 0.006922230357304215,
198
+ "aa_frac_T": 0.006535647064447403,
199
+ "codon_TGG": 0.006477221753448248,
200
+ "aa_frac_Q": 0.0063397581689059734,
201
+ "aa_frac_M": 0.006198597187176347,
202
+ "tetra_CAAA": 0.006141273584216833,
203
+ "codon_CAA": 0.00611291266977787
204
  }
205
  },
206
  "salt_tolerance_pct": {
207
  "task": "regression",
208
+ "mean_metric": 2.516896605067264,
209
  "folds": [
210
  {
211
  "target": "salt_tolerance_pct",
212
  "task": "regression",
213
  "metric_name": "mae",
214
+ "value": 2.218365752012856,
215
+ "n_train": 5064,
216
+ "n_test": 1266
217
  },
218
  {
219
  "target": "salt_tolerance_pct",
220
  "task": "regression",
221
  "metric_name": "mae",
222
+ "value": 2.249367568591289,
223
+ "n_train": 5064,
224
+ "n_test": 1266
225
  },
226
  {
227
  "target": "salt_tolerance_pct",
228
  "task": "regression",
229
  "metric_name": "mae",
230
+ "value": 2.8189112452912664,
231
+ "n_train": 5064,
232
+ "n_test": 1266
233
  },
234
  {
235
  "target": "salt_tolerance_pct",
236
  "task": "regression",
237
  "metric_name": "mae",
238
+ "value": 2.3502065964041967,
239
+ "n_train": 5064,
240
+ "n_test": 1266
241
  },
242
  {
243
  "target": "salt_tolerance_pct",
244
  "task": "regression",
245
  "metric_name": "mae",
246
+ "value": 2.947631863036709,
247
+ "n_train": 5064,
248
+ "n_test": 1266
249
  }
250
  ],
251
  "top_features": {
252
+ "aa_frac_C": 0.029796541761606933,
253
+ "neg_charged_frac": 0.027759117633104326,
254
+ "tetra_ATCC": 0.018280067457817496,
255
+ "iso_cat1_environmental": 0.014224943332374096,
256
+ "tetra_GACT": 0.01211925563402474,
257
+ "iso_cat2_saline": 0.011419120244681835,
258
+ "codon_TGC": 0.011161889415234327,
259
+ "tetra_CGTT": 0.009351400006562472,
260
+ "codon_CGT": 0.008664370141923427,
261
+ "iso_cat2_industrial": 0.008528076158836485,
262
+ "tetra_TAAT": 0.008236682531423867,
263
+ "iso_cat2_contamination": 0.008197423309320584,
264
+ "tetra_CGTA": 0.007803171873092651,
265
+ "tetra_TGTG": 0.007793005835264921,
266
+ "tetra_TACC": 0.007619049632921815,
267
+ "codon_CAC": 0.007169742346741259,
268
+ "tetra_AGTC": 0.006417827447876334,
269
+ "tetra_CCTG": 0.006371588306501507,
270
+ "tetra_GGTA": 0.006226122658699751,
271
+ "tetra_GAAA": 0.006115543586201966
272
  }
273
  },
274
  "__meta__": {
 
625
  "codon_TTA",
626
  "codon_TTC",
627
  "codon_TTG",
628
+ "codon_TTT",
629
+ "iso_cat1_climate",
630
+ "iso_cat1_condition",
631
+ "iso_cat1_engineered",
632
+ "iso_cat1_environmental",
633
+ "iso_cat1_host",
634
+ "iso_cat1_host_body_product",
635
+ "iso_cat1_host_body_site",
636
+ "iso_cat1_infection",
637
+ "iso_cat2_acidic",
638
+ "iso_cat2_agriculture",
639
+ "iso_cat2_air",
640
+ "iso_cat2_algae",
641
+ "iso_cat2_alkaline",
642
+ "iso_cat2_anoxic_anaerobic",
643
+ "iso_cat2_aquatic",
644
+ "iso_cat2_arthropoda",
645
+ "iso_cat2_biodegradation",
646
+ "iso_cat2_biofilm",
647
+ "iso_cat2_bioreactor",
648
+ "iso_cat2_bioremediation",
649
+ "iso_cat2_birds",
650
+ "iso_cat2_built_environment",
651
+ "iso_cat2_cold",
652
+ "iso_cat2_contamination",
653
+ "iso_cat2_disease",
654
+ "iso_cat2_fishes",
655
+ "iso_cat2_fluids",
656
+ "iso_cat2_food_production",
657
+ "iso_cat2_fungi",
658
+ "iso_cat2_gastrointestinal_tract",
659
+ "iso_cat2_hot",
660
+ "iso_cat2_human",
661
+ "iso_cat2_humid",
662
+ "iso_cat2_industrial",
663
+ "iso_cat2_inflammation",
664
+ "iso_cat2_invertebrates_other",
665
+ "iso_cat2_juvenile",
666
+ "iso_cat2_laboratory",
667
+ "iso_cat2_limb",
668
+ "iso_cat2_mammals",
669
+ "iso_cat2_medical_device",
670
+ "iso_cat2_medical_environment",
671
+ "iso_cat2_medical_product",
672
+ "iso_cat2_microbial",
673
+ "iso_cat2_microbial_community",
674
+ "iso_cat2_oral_cavity_and_airways",
675
+ "iso_cat2_organ",
676
+ "iso_cat2_other",
677
+ "iso_cat2_patient",
678
+ "iso_cat2_plant",
679
+ "iso_cat2_plant_infections",
680
+ "iso_cat2_plants",
681
+ "iso_cat2_protozoa",
682
+ "iso_cat2_psychrophilic_lt10_c",
683
+ "iso_cat2_reptilia",
684
+ "iso_cat2_saline",
685
+ "iso_cat2_sulfuric",
686
+ "iso_cat2_temperate",
687
+ "iso_cat2_terrestrial",
688
+ "iso_cat2_thermophilic_gt45_c",
689
+ "iso_cat2_treatment",
690
+ "iso_cat2_urogenital_tract",
691
+ "iso_cat2_waste",
692
+ "iso_cat2_xerophilic",
693
+ "iso_cat2_yeast"
694
  ]
695
  }
696
  }
artifacts/combined_results.json CHANGED
@@ -1,274 +1,274 @@
1
  {
2
  "optimal_temperature_c": {
3
  "task": "regression",
4
- "mean_metric": 3.2743260321046983,
5
  "folds": [
6
  {
7
  "target": "optimal_temperature_c",
8
  "task": "regression",
9
  "metric_name": "mae",
10
- "value": 2.800930039710605,
11
- "n_train": 13577,
12
- "n_test": 3395
13
  },
14
  {
15
  "target": "optimal_temperature_c",
16
  "task": "regression",
17
  "metric_name": "mae",
18
- "value": 3.212330534180766,
19
- "n_train": 13577,
20
- "n_test": 3395
21
  },
22
  {
23
  "target": "optimal_temperature_c",
24
  "task": "regression",
25
  "metric_name": "mae",
26
- "value": 3.8308504694689707,
27
- "n_train": 13578,
28
- "n_test": 3394
29
  },
30
  {
31
  "target": "optimal_temperature_c",
32
  "task": "regression",
33
  "metric_name": "mae",
34
- "value": 3.055139859704741,
35
- "n_train": 13578,
36
- "n_test": 3394
37
  },
38
  {
39
  "target": "optimal_temperature_c",
40
  "task": "regression",
41
  "metric_name": "mae",
42
- "value": 3.47237925745841,
43
- "n_train": 13578,
44
- "n_test": 3394
45
  }
46
  ],
47
  "top_features": {
48
- "ivywrel_frac": 0.13554759174585343,
49
- "n_predicted_cds": 0.016180129162967204,
50
- "pos_charged_frac": 0.015452616102993488,
51
- "emb_42": 0.009146808803780004,
52
- "aa_frac_C": 0.009010343812406063,
53
- "aa_frac_E": 0.008539762906730175,
54
- "codon_TGA": 0.008345812978222966,
55
- "mean_isoelectric_point": 0.00771149517968297,
56
- "emb_43": 0.007495118898805231,
57
- "codon_TTG": 0.007401803624816239,
58
- "codon_AGG": 0.006896090880036354,
59
- "tetra_GCAA": 0.006816221633926034,
60
- "emb_103": 0.0065081269480288025,
61
- "tetra_AGGC": 0.0060555480304174125,
62
- "aa_frac_Q": 0.005575686926022172,
63
- "aa_frac_L": 0.00546089680865407,
64
- "aa_frac_S": 0.0053946841042488815,
65
- "tetra_CAAA": 0.005041174124926329,
66
- "tetra_TTTA": 0.005022482725325972,
67
- "emb_271": 0.004957860894501209
68
  }
69
  },
70
  "optimal_ph": {
71
  "task": "regression",
72
- "mean_metric": 0.5148330200525308,
73
  "folds": [
74
  {
75
  "target": "optimal_ph",
76
  "task": "regression",
77
  "metric_name": "mae",
78
- "value": 0.45630543930777195,
79
- "n_train": 3710,
80
  "n_test": 928
81
  },
82
  {
83
  "target": "optimal_ph",
84
  "task": "regression",
85
  "metric_name": "mae",
86
- "value": 0.5329306995252083,
87
- "n_train": 3710,
88
  "n_test": 928
89
  },
90
  {
91
  "target": "optimal_ph",
92
  "task": "regression",
93
  "metric_name": "mae",
94
- "value": 0.5447937661203844,
95
- "n_train": 3710,
96
  "n_test": 928
97
  },
98
  {
99
  "target": "optimal_ph",
100
  "task": "regression",
101
  "metric_name": "mae",
102
- "value": 0.5399269027771687,
103
- "n_train": 3711,
104
- "n_test": 927
105
  },
106
  {
107
  "target": "optimal_ph",
108
  "task": "regression",
109
  "metric_name": "mae",
110
- "value": 0.5002082925321215,
111
- "n_train": 3711,
112
- "n_test": 927
113
  }
114
  ],
115
  "top_features": {
116
- "neg_charged_frac": 0.01469268798828125,
117
- "tetra_CACT": 0.010008076298981905,
118
- "tetra_TGCT": 0.009113076934590935,
119
- "aa_frac_H": 0.008188599767163397,
120
- "tetra_GAGA": 0.00789343281649053,
121
- "tetra_TCTC": 0.007285104691982269,
122
- "tetra_AGAC": 0.006973244273103774,
123
- "aa_frac_E": 0.005855221953243017,
124
- "codon_AAG": 0.005304977297782898,
125
- "tetra_GTCT": 0.004962419020012021,
126
- "mean_isoelectric_point": 0.00494197946973145,
127
- "tetra_GACT": 0.004903565905988216,
128
- "codon_GAA": 0.004646051116287709,
129
- "n_predicted_cds": 0.004488307330757379,
130
- "tetra_GCGT": 0.004404617205727846,
131
- "tetra_ACGA": 0.004399605770595371,
132
- "ivywrel_frac": 0.004385623009875416,
133
- "codon_TCA": 0.004316977364942432,
134
- "tetra_TGGT": 0.004235964128747582,
135
- "emb_50": 0.004220196325331926
136
  }
137
  },
138
  "oxygen_requirement": {
139
  "task": "classification",
140
- "mean_metric": 0.2926196414104487,
141
  "folds": [
142
  {
143
  "target": "oxygen_requirement",
144
  "task": "classification",
145
  "metric_name": "f1_macro",
146
- "value": 0.2568366096040457,
147
- "n_train": 8314,
148
- "n_test": 2079
149
  },
150
  {
151
  "target": "oxygen_requirement",
152
  "task": "classification",
153
  "metric_name": "f1_macro",
154
- "value": 0.28331761646078946,
155
- "n_train": 8314,
156
- "n_test": 2079
157
  },
158
  {
159
  "target": "oxygen_requirement",
160
  "task": "classification",
161
  "metric_name": "f1_macro",
162
- "value": 0.3003208823814207,
163
- "n_train": 8314,
164
- "n_test": 2078
165
  },
166
  {
167
  "target": "oxygen_requirement",
168
  "task": "classification",
169
  "metric_name": "f1_macro",
170
- "value": 0.3245556247911396,
171
- "n_train": 8315,
172
  "n_test": 2078
173
  },
174
  {
175
  "target": "oxygen_requirement",
176
  "task": "classification",
177
  "metric_name": "f1_macro",
178
- "value": 0.2980674738148481,
179
- "n_train": 8315,
180
- "n_test": 2078
181
  }
182
  ],
183
  "top_features": {
184
- "emb_103": 0.023965770192444325,
185
- "tetra_TCAA": 0.014948683325201273,
186
- "codon_CAA": 0.014257160667330026,
187
- "codon_ATA": 0.013358323276042939,
188
- "tetra_CAAA": 0.0109681696863845,
189
- "aa_frac_C": 0.010774458199739457,
190
- "genome_size_nt": 0.01065667849034071,
191
- "aa_frac_Q": 0.0091962531208992,
192
- "tetra_TTTG": 0.008046209486201406,
193
- "n_predicted_cds": 0.007619006186723709,
194
- "codon_TGG": 0.007263739546760917,
195
- "aa_frac_M": 0.006333135021850467,
196
- "emb_65": 0.0060926306061446665,
197
- "emb_12": 0.005908917868509889,
198
- "aa_frac_K": 0.005255377222783864,
199
- "aa_frac_W": 0.004311660584062338,
200
- "aa_frac_L": 0.004282204434275627,
201
- "codon_CGT": 0.004179813340306282,
202
- "aa_frac_Y": 0.0039975212421268225,
203
- "tetra_ATCT": 0.0035881946329027413
204
  }
205
  },
206
  "salt_tolerance_pct": {
207
  "task": "regression",
208
- "mean_metric": 2.500490414750344,
209
  "folds": [
210
  {
211
  "target": "salt_tolerance_pct",
212
  "task": "regression",
213
  "metric_name": "mae",
214
- "value": 2.4981446771859317,
215
- "n_train": 3826,
216
- "n_test": 957
217
  },
218
  {
219
  "target": "salt_tolerance_pct",
220
  "task": "regression",
221
  "metric_name": "mae",
222
- "value": 2.1454447971134294,
223
- "n_train": 3826,
224
- "n_test": 957
225
  },
226
  {
227
  "target": "salt_tolerance_pct",
228
  "task": "regression",
229
  "metric_name": "mae",
230
- "value": 2.8626325660039327,
231
- "n_train": 3826,
232
- "n_test": 957
233
  },
234
  {
235
  "target": "salt_tolerance_pct",
236
  "task": "regression",
237
  "metric_name": "mae",
238
- "value": 2.5895608990698675,
239
- "n_train": 3827,
240
- "n_test": 956
241
  },
242
  {
243
  "target": "salt_tolerance_pct",
244
  "task": "regression",
245
  "metric_name": "mae",
246
- "value": 2.4066691343785576,
247
- "n_train": 3827,
248
- "n_test": 956
249
  }
250
  ],
251
  "top_features": {
252
- "aa_frac_C": 0.03929822724312544,
253
- "neg_charged_frac": 0.024765344709157942,
254
- "tetra_TGTG": 0.010924566886387765,
255
- "tetra_GACT": 0.010432313196361065,
256
- "emb_84": 0.007880622497759759,
257
- "mean_isoelectric_point": 0.006233150523621589,
258
- "codon_CGT": 0.006036084098741412,
259
- "emb_264": 0.0059552066260948775,
260
- "tetra_AGTC": 0.0055462203454226255,
261
- "tetra_GTGG": 0.005274477787315846,
262
- "tetra_TACT": 0.004922923888079822,
263
- "codon_TGT": 0.004755359562113881,
264
- "tetra_TACG": 0.004655727057252079,
265
- "aa_frac_D": 0.004644650942645967,
266
- "tetra_CGTA": 0.004608191270381212,
267
- "tetra_CACA": 0.004475983197335154,
268
- "codon_GAA": 0.004449379863217473,
269
- "tetra_CTAA": 0.004377065307926387,
270
- "tetra_AACC": 0.004368867329321802,
271
- "tetra_TACC": 0.004245994682423771
272
  }
273
  },
274
  "__meta__": {
@@ -945,7 +945,70 @@
945
  "emb_316",
946
  "emb_317",
947
  "emb_318",
948
- "emb_319"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
949
  ]
950
  }
951
  }
 
1
  {
2
  "optimal_temperature_c": {
3
  "task": "regression",
4
+ "mean_metric": 3.1503700219525386,
5
  "folds": [
6
  {
7
  "target": "optimal_temperature_c",
8
  "task": "regression",
9
  "metric_name": "mae",
10
+ "value": 2.738956731512239,
11
+ "n_train": 13592,
12
+ "n_test": 3398
13
  },
14
  {
15
  "target": "optimal_temperature_c",
16
  "task": "regression",
17
  "metric_name": "mae",
18
+ "value": 3.1457219391587623,
19
+ "n_train": 13592,
20
+ "n_test": 3398
21
  },
22
  {
23
  "target": "optimal_temperature_c",
24
  "task": "regression",
25
  "metric_name": "mae",
26
+ "value": 3.6463728072293864,
27
+ "n_train": 13592,
28
+ "n_test": 3398
29
  },
30
  {
31
  "target": "optimal_temperature_c",
32
  "task": "regression",
33
  "metric_name": "mae",
34
+ "value": 2.8936606017051547,
35
+ "n_train": 13592,
36
+ "n_test": 3398
37
  },
38
  {
39
  "target": "optimal_temperature_c",
40
  "task": "regression",
41
  "metric_name": "mae",
42
+ "value": 3.327138030157151,
43
+ "n_train": 13592,
44
+ "n_test": 3398
45
  }
46
  ],
47
  "top_features": {
48
+ "ivywrel_frac": 0.1272028997540474,
49
+ "iso_cat2_human": 0.02456752024590969,
50
+ "iso_cat2_thermophilic_gt45_c": 0.02335200347006321,
51
+ "iso_cat1_host_body_product": 0.016055696457624436,
52
+ "pos_charged_frac": 0.014609704539179803,
53
+ "iso_cat1_infection": 0.01389338243752718,
54
+ "iso_cat2_mammals": 0.013551290705800056,
55
+ "n_predicted_cds": 0.012836913019418717,
56
+ "iso_cat1_environmental": 0.010669851303100586,
57
+ "tetra_CTAA": 0.009250188246369362,
58
+ "codon_AGG": 0.008724220609292389,
59
+ "aa_frac_E": 0.008193913381546736,
60
+ "aa_frac_C": 0.007445563282817602,
61
+ "mean_isoelectric_point": 0.0069396811537444595,
62
+ "codon_TTG": 0.006236870028078556,
63
+ "emb_42": 0.006092228952911683,
64
+ "aa_frac_Q": 0.006049463665112853,
65
+ "tetra_GCAA": 0.005999001779127866,
66
+ "codon_TGA": 0.005786543060094118,
67
+ "tetra_TTAG": 0.0056871567387133835
68
  }
69
  },
70
  "optimal_ph": {
71
  "task": "regression",
72
+ "mean_metric": 0.5009073096637068,
73
  "folds": [
74
  {
75
  "target": "optimal_ph",
76
  "task": "regression",
77
  "metric_name": "mae",
78
+ "value": 0.45625025033950806,
79
+ "n_train": 3712,
80
  "n_test": 928
81
  },
82
  {
83
  "target": "optimal_ph",
84
  "task": "regression",
85
  "metric_name": "mae",
86
+ "value": 0.5460629706958244,
87
+ "n_train": 3712,
88
  "n_test": 928
89
  },
90
  {
91
  "target": "optimal_ph",
92
  "task": "regression",
93
  "metric_name": "mae",
94
+ "value": 0.45289194491402857,
95
+ "n_train": 3712,
96
  "n_test": 928
97
  },
98
  {
99
  "target": "optimal_ph",
100
  "task": "regression",
101
  "metric_name": "mae",
102
+ "value": 0.5152855838783856,
103
+ "n_train": 3712,
104
+ "n_test": 928
105
  },
106
  {
107
  "target": "optimal_ph",
108
  "task": "regression",
109
  "metric_name": "mae",
110
+ "value": 0.5340457984907874,
111
+ "n_train": 3712,
112
+ "n_test": 928
113
  }
114
  ],
115
  "top_features": {
116
+ "iso_cat2_acidic": 0.03560468032956123,
117
+ "iso_cat2_alkaline": 0.021716910228133202,
118
+ "neg_charged_frac": 0.012849669158458709,
119
+ "tetra_CACT": 0.007109560957178474,
120
+ "tetra_AGAC": 0.006792897148989141,
121
+ "aa_frac_E": 0.006575863063335419,
122
+ "tetra_CTCT": 0.0063951408956199884,
123
+ "aa_frac_H": 0.006346661783754826,
124
+ "tetra_GACT": 0.005681420909240842,
125
+ "codon_TTT": 0.00532103287987411,
126
+ "tetra_TGCT": 0.005244059395045042,
127
+ "codon_TGC": 0.004600144876167178,
128
+ "tetra_GAGA": 0.004556609224528075,
129
+ "tetra_ACGA": 0.004443949507549405,
130
+ "codon_AAG": 0.004402201203629374,
131
+ "mean_isoelectric_point": 0.004346802597865462,
132
+ "codon_GAA": 0.0043407921912148595,
133
+ "tetra_GGAT": 0.0042743304162286225,
134
+ "codon_GAG": 0.004212372726760805,
135
+ "tetra_AGGT": 0.004194398503750562
136
  }
137
  },
138
  "oxygen_requirement": {
139
  "task": "classification",
140
+ "mean_metric": 0.31635288673665096,
141
  "folds": [
142
  {
143
  "target": "oxygen_requirement",
144
  "task": "classification",
145
  "metric_name": "f1_macro",
146
+ "value": 0.2856587696238043,
147
+ "n_train": 8320,
148
+ "n_test": 2081
149
  },
150
  {
151
  "target": "oxygen_requirement",
152
  "task": "classification",
153
  "metric_name": "f1_macro",
154
+ "value": 0.3076087339800983,
155
+ "n_train": 8321,
156
+ "n_test": 2080
157
  },
158
  {
159
  "target": "oxygen_requirement",
160
  "task": "classification",
161
  "metric_name": "f1_macro",
162
+ "value": 0.30314339231057225,
163
+ "n_train": 8321,
164
+ "n_test": 2079
165
  },
166
  {
167
  "target": "oxygen_requirement",
168
  "task": "classification",
169
  "metric_name": "f1_macro",
170
+ "value": 0.37851699811618567,
171
+ "n_train": 8321,
172
  "n_test": 2078
173
  },
174
  {
175
  "target": "oxygen_requirement",
176
  "task": "classification",
177
  "metric_name": "f1_macro",
178
+ "value": 0.30683653965259416,
179
+ "n_train": 8321,
180
+ "n_test": 2080
181
  }
182
  ],
183
  "top_features": {
184
+ "emb_103": 0.021533418074250223,
185
+ "tetra_TCAA": 0.017144506447948515,
186
+ "codon_CAA": 0.01247407766059041,
187
+ "genome_size_nt": 0.011898068338632583,
188
+ "codon_ATA": 0.010783014632761479,
189
+ "aa_frac_C": 0.010261021554470062,
190
+ "tetra_CAAA": 0.009564016468357295,
191
+ "emb_50": 0.00891582112526521,
192
+ "aa_frac_Q": 0.008642746694386005,
193
+ "n_predicted_cds": 0.007285213563591242,
194
+ "aa_frac_K": 0.006437191320583224,
195
+ "aa_frac_M": 0.006172794941812753,
196
+ "emb_3": 0.005596142518334091,
197
+ "emb_12": 0.005483857169747352,
198
+ "tetra_ATAG": 0.0051641249097883705,
199
+ "aa_frac_L": 0.004760731570422649,
200
+ "aa_frac_W": 0.004419099772349,
201
+ "codon_CGT": 0.004018026869744062,
202
+ "codon_ATG": 0.00391377778723836,
203
+ "aa_frac_Y": 0.0038635179633274676
204
  }
205
  },
206
  "salt_tolerance_pct": {
207
  "task": "regression",
208
+ "mean_metric": 2.4756113285254835,
209
  "folds": [
210
  {
211
  "target": "salt_tolerance_pct",
212
  "task": "regression",
213
  "metric_name": "mae",
214
+ "value": 2.2104132311571862,
215
+ "n_train": 3832,
216
+ "n_test": 958
217
  },
218
  {
219
  "target": "salt_tolerance_pct",
220
  "task": "regression",
221
  "metric_name": "mae",
222
+ "value": 2.5142923461258535,
223
+ "n_train": 3832,
224
+ "n_test": 958
225
  },
226
  {
227
  "target": "salt_tolerance_pct",
228
  "task": "regression",
229
  "metric_name": "mae",
230
+ "value": 2.9433706752467503,
231
+ "n_train": 3832,
232
+ "n_test": 958
233
  },
234
  {
235
  "target": "salt_tolerance_pct",
236
  "task": "regression",
237
  "metric_name": "mae",
238
+ "value": 2.352060198972643,
239
+ "n_train": 3832,
240
+ "n_test": 958
241
  },
242
  {
243
  "target": "salt_tolerance_pct",
244
  "task": "regression",
245
  "metric_name": "mae",
246
+ "value": 2.3579201911249834,
247
+ "n_train": 3832,
248
+ "n_test": 958
249
  }
250
  ],
251
  "top_features": {
252
+ "aa_frac_C": 0.03296378441154957,
253
+ "neg_charged_frac": 0.025052556581795216,
254
+ "iso_cat2_saline": 0.010697953775525093,
255
+ "tetra_CACA": 0.008883546688593924,
256
+ "tetra_GACT": 0.007743655145168305,
257
+ "emb_84": 0.006587694090558216,
258
+ "codon_TGG": 0.006308569852262735,
259
+ "iso_cat2_human": 0.005952583346515894,
260
+ "tetra_AGTC": 0.005895164678804576,
261
+ "tetra_GGAA": 0.005703426687978208,
262
+ "codon_CGT": 0.005462939362041652,
263
+ "iso_cat1_infection": 0.005432895617559552,
264
+ "tetra_ACTG": 0.005339212145190686,
265
+ "tetra_AACC": 0.0049897018994670365,
266
+ "mean_isoelectric_point": 0.004982588707935065,
267
+ "iso_cat2_patient": 0.0049561535939574245,
268
+ "tetra_GAAA": 0.004851629724726081,
269
+ "aa_frac_D": 0.0045981516130268575,
270
+ "codon_TGT": 0.004288341873325408,
271
+ "tetra_CGTA": 0.0041915396694093944
272
  }
273
  },
274
  "__meta__": {
 
945
  "emb_316",
946
  "emb_317",
947
  "emb_318",
948
+ "emb_319",
949
+ "iso_cat1_climate",
950
+ "iso_cat1_condition",
951
+ "iso_cat1_engineered",
952
+ "iso_cat1_environmental",
953
+ "iso_cat1_host",
954
+ "iso_cat1_host_body_product",
955
+ "iso_cat1_host_body_site",
956
+ "iso_cat1_infection",
957
+ "iso_cat2_acidic",
958
+ "iso_cat2_agriculture",
959
+ "iso_cat2_air",
960
+ "iso_cat2_algae",
961
+ "iso_cat2_alkaline",
962
+ "iso_cat2_anoxic_anaerobic",
963
+ "iso_cat2_aquatic",
964
+ "iso_cat2_arthropoda",
965
+ "iso_cat2_biodegradation",
966
+ "iso_cat2_biofilm",
967
+ "iso_cat2_bioreactor",
968
+ "iso_cat2_bioremediation",
969
+ "iso_cat2_birds",
970
+ "iso_cat2_built_environment",
971
+ "iso_cat2_cold",
972
+ "iso_cat2_contamination",
973
+ "iso_cat2_disease",
974
+ "iso_cat2_fishes",
975
+ "iso_cat2_fluids",
976
+ "iso_cat2_food_production",
977
+ "iso_cat2_fungi",
978
+ "iso_cat2_gastrointestinal_tract",
979
+ "iso_cat2_hot",
980
+ "iso_cat2_human",
981
+ "iso_cat2_humid",
982
+ "iso_cat2_industrial",
983
+ "iso_cat2_inflammation",
984
+ "iso_cat2_invertebrates_other",
985
+ "iso_cat2_juvenile",
986
+ "iso_cat2_laboratory",
987
+ "iso_cat2_limb",
988
+ "iso_cat2_mammals",
989
+ "iso_cat2_medical_device",
990
+ "iso_cat2_medical_environment",
991
+ "iso_cat2_microbial",
992
+ "iso_cat2_microbial_community",
993
+ "iso_cat2_oral_cavity_and_airways",
994
+ "iso_cat2_organ",
995
+ "iso_cat2_other",
996
+ "iso_cat2_patient",
997
+ "iso_cat2_plant",
998
+ "iso_cat2_plant_infections",
999
+ "iso_cat2_plants",
1000
+ "iso_cat2_protozoa",
1001
+ "iso_cat2_psychrophilic_lt10_c",
1002
+ "iso_cat2_reptilia",
1003
+ "iso_cat2_saline",
1004
+ "iso_cat2_sulfuric",
1005
+ "iso_cat2_terrestrial",
1006
+ "iso_cat2_thermophilic_gt45_c",
1007
+ "iso_cat2_treatment",
1008
+ "iso_cat2_urogenital_tract",
1009
+ "iso_cat2_waste",
1010
+ "iso_cat2_xerophilic",
1011
+ "iso_cat2_yeast"
1012
  ]
1013
  }
1014
  }
artifacts/eval_report.md CHANGED
@@ -1,40 +1,40 @@
1
  # microbe-model — v0 baseline eval report
2
 
3
- _Generated: 2026-04-27T11:37:17+00:00_
4
 
5
  ## TL;DR
6
 
7
- - **`optimal_temperature_c`**: MAE = **3.28** (vs always-predict-mean 5.53, **+41%**)
8
- - **`optimal_ph`**: MAE = **0.52** (vs always-predict-mean 0.55, **+5%**)
9
- - **`oxygen_requirement`**: macro-F1 = **0.279** (vs always-predict-majority 0.072, **+289%**)
10
- - **`salt_tolerance_pct`**: MAE = **2.51** (vs always-predict-mean 2.72, **+8%**)
11
 
12
- Trained on **17,047** strains with **353** genome-derived features. Cross-validation: 5-fold GroupKFold by taxonomic family.
13
 
14
  ## Corpus
15
 
16
- - Total strains in feature table: **17,047**
17
  - Labeled-strain counts by target:
18
- - `optimal_temperature_c`: 17,007
19
- - `optimal_ph`: 4,652
20
- - `oxygen_requirement`: 10,426
21
- - `salt_tolerance_pct`: 4,793
22
 
23
  ## Target distributions
24
 
25
- - `optimal_temperature_c`: n=17,007, mean=31.96, std=8.57, p10=25.00, median=30.00, p90=37.00
26
- - `optimal_ph`: n=4,652, mean=7.19, std=0.83, p10=6.50, median=7.00, p90=8.00
27
- - `salt_tolerance_pct`: n=4,793, mean=3.56, std=4.11, p10=0.00, median=2.50, p90=8.00
28
  - `oxygen_requirement`:
29
- - `aerobe`: 4,973
30
- - `anaerobe`: 2,120
31
- - `facultative anaerobe`: 1,226
32
- - `obligate aerobe`: 1,027
33
- - `microaerophile`: 889
34
- - `obligate anaerobe`: 105
35
- - `facultative aerobe`: 83
 
36
  - `microaerotolerant`: 2
37
- - `aerotolerant`: 1
38
 
39
  ## Per-target results (5-fold GroupKFold by family)
40
 
@@ -43,102 +43,102 @@ Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-
43
 
44
  | Target | Task | n labeled | Model metric | Baseline | Improvement |
45
  |---|---|---|---|---|---|
46
- | `optimal_temperature_c` | regression | 17,007 | MAE=3.280 | MAE=5.531 | +40.7% |
47
- | `optimal_ph` | regression | 4,652 | MAE=0.520 | MAE=0.547 | +4.8% |
48
- | `oxygen_requirement` | classification | 10,426 | F1=0.279 | F1=0.072 | +288.9% |
49
- | `salt_tolerance_pct` | regression | 4,793 | MAE=2.510 | MAE=2.721 | +7.7% |
50
 
51
  ### `optimal_temperature_c` — fold-by-fold
52
 
53
  | Fold | Metric | Train | Test |
54
  |---|---|---|---|
55
- | 1 | mae = 2.793 | n=13,605 | n=3,402 |
56
- | 2 | mae = 3.529 | n=13,605 | n=3,402 |
57
- | 3 | mae = 3.461 | n=13,606 | n=3,401 |
58
- | 4 | mae = 3.130 | n=13,606 | n=3,401 |
59
- | 5 | mae = 3.486 | n=13,606 | n=3,401 |
60
 
61
  **Top 10 features for `optimal_temperature_c`:**
62
 
63
- - `ivywrel_frac` — 0.1977
64
- - `n_predicted_cds` — 0.0250
65
- - `pos_charged_frac` — 0.0196
66
- - `aa_frac_E` — 0.0131
67
- - `aa_frac_C` — 0.0119
68
- - `codon_TTG` — 0.0099
69
- - `codon_TGA` — 0.0099
70
- - `codon_AGG` — 0.0098
71
- - `tetra_GCAA` — 0.0084
72
- - `aa_frac_S` — 0.0081
73
 
74
  ### `optimal_ph` — fold-by-fold
75
 
76
  | Fold | Metric | Train | Test |
77
  |---|---|---|---|
78
- | 1 | mae = 0.475 | n=3,721 | n=931 |
79
- | 2 | mae = 0.613 | n=3,721 | n=931 |
80
- | 3 | mae = 0.479 | n=3,722 | n=930 |
81
- | 4 | mae = 0.529 | n=3,722 | n=930 |
82
- | 5 | mae = 0.505 | n=3,722 | n=930 |
83
 
84
  **Top 10 features for `optimal_ph`:**
85
 
86
- - `neg_charged_frac` — 0.0211
87
- - `tetra_TGCT` — 0.0141
88
- - `aa_frac_H` — 0.0120
89
- - `tetra_CACT` — 0.0097
90
- - `tetra_AGAC` — 0.0088
91
- - `tetra_GAGA` — 0.0081
92
- - `tetra_TCTC` — 0.0077
93
- - `ivywrel_frac` — 0.0077
94
- - `aa_frac_E` — 0.0076
95
- - `tetra_CTCT` — 0.0069
96
 
97
  ### `oxygen_requirement` — fold-by-fold
98
 
99
  | Fold | Metric | Train | Test |
100
  |---|---|---|---|
101
- | 1 | f1_macro = 0.204 | n=8,340 | n=2,085 |
102
- | 2 | f1_macro = 0.255 | n=8,341 | n=2,085 |
103
- | 3 | f1_macro = 0.286 | n=8,341 | n=2,085 |
104
- | 4 | f1_macro = 0.355 | n=8,341 | n=2,085 |
105
- | 5 | f1_macro = 0.296 | n=8,341 | n=2,085 |
106
 
107
  **Top 10 features for `oxygen_requirement`:**
108
 
109
- - `codon_ATA` — 0.0381
110
- - `aa_frac_C` — 0.0217
111
- - `genome_size_nt` — 0.0194
112
- - `tetra_CAAA` — 0.0173
113
- - `codon_CAA` — 0.0145
114
- - `aa_frac_Q` — 0.0124
115
- - `tetra_TCAA` — 0.0123
116
- - `n_predicted_cds` — 0.0117
117
- - `aa_frac_K` — 0.0098
118
- - `aa_frac_W` — 0.0088
119
 
120
  ### `salt_tolerance_pct` — fold-by-fold
121
 
122
  | Fold | Metric | Train | Test |
123
  |---|---|---|---|
124
- | 1 | mae = 2.557 | n=3,834 | n=959 |
125
- | 2 | mae = 2.164 | n=3,834 | n=959 |
126
- | 3 | mae = 2.777 | n=3,834 | n=959 |
127
- | 4 | mae = 2.654 | n=3,835 | n=958 |
128
- | 5 | mae = 2.398 | n=3,835 | n=958 |
129
 
130
  **Top 10 features for `salt_tolerance_pct`:**
131
 
132
- - `aa_frac_C` — 0.0492
133
- - `neg_charged_frac` — 0.0365
134
- - `tetra_CGTT` — 0.0160
135
- - `tetra_GAAA` — 0.0123
136
- - `tetra_GACT` — 0.0110
137
- - `tetra_AACC` — 0.0091
138
- - `codon_CGT` — 0.0073
139
- - `tetra_TGTG` — 0.0071
140
- - `tetra_AGGA` — 0.0066
141
- - `tetra_GGAG` — 0.0065
142
 
143
  ## Feature ↔ target correlations (Spearman, top 10)
144
 
@@ -148,46 +148,46 @@ Sanity-checks the biology — features known to track each target should appear
148
 
149
  | Feature | Spearman ρ | p-value |
150
  |---|---|---|
151
- | `genome_size_nt` | -0.405 | 0.0e+00 |
152
- | `aa_frac_E` | +0.388 | 0.0e+00 |
153
- | `n_predicted_cds` | -0.386 | 0.0e+00 |
154
- | `ivywrel_frac` | +0.320 | 0.0e+00 |
155
- | `aa_frac_Y` | +0.318 | 0.0e+00 |
156
- | `aa_frac_W` | -0.309 | 0.0e+00 |
157
- | `codon_TGG` | -0.309 | 0.0e+00 |
158
- | `tetra_TCTT` | +0.300 | 0.0e+00 |
159
- | `pos_charged_frac` | +0.299 | 0.0e+00 |
160
- | `tetra_AAGA` | +0.298 | 0.0e+00 |
161
 
162
  ### `optimal_ph`
163
 
164
  | Feature | Spearman ρ | p-value |
165
  |---|---|---|
166
- | `neg_charged_frac` | +0.308 | 4.5e-103 |
167
- | `mean_isoelectric_point` | -0.276 | 4.4e-82 |
168
- | `aa_frac_E` | +0.260 | 6.3e-73 |
169
- | `ivywrel_frac` | +0.166 | 3.8e-30 |
170
- | `codon_AAG` | -0.163 | 6.2e-29 |
171
- | `codon_TGC` | -0.149 | 1.9e-24 |
172
- | `codon_CGA` | +0.149 | 2.2e-24 |
173
- | `tetra_CACT` | +0.134 | 3.8e-20 |
174
- | `tetra_AGTG` | +0.133 | 6.4e-20 |
175
- | `tetra_ACTC` | +0.119 | 4.0e-16 |
176
 
177
  ### `salt_tolerance_pct`
178
 
179
  | Feature | Spearman ρ | p-value |
180
  |---|---|---|
181
- | `tetra_AGTC` | +0.232 | 9.5e-60 |
182
- | `tetra_GACT` | +0.232 | 1.5e-59 |
183
- | `neg_charged_frac` | +0.227 | 3.2e-57 |
184
- | `mean_isoelectric_point` | -0.204 | 2.9e-46 |
185
- | `ivywrel_frac` | +0.196 | 1.4e-42 |
186
- | `aa_frac_C` | -0.187 | 7.3e-39 |
187
- | `tetra_ACTC` | +0.176 | 1.4e-34 |
188
- | `tetra_GAGT` | +0.173 | 2.5e-33 |
189
- | `tetra_ATGC` | -0.164 | 3.7e-30 |
190
- | `tetra_TCAC` | +0.163 | 5.0e-30 |
191
 
192
  ## Per-family error breakdown (regression targets)
193
 
@@ -197,61 +197,61 @@ Top 15 most-represented families, MAE per family. Highlights where the model is
197
 
198
  | Family | n | MAE |
199
  |---|---|---|
200
- | Streptomycetaceae | 798 | 1.311 |
201
- | Bacillaceae | 643 | 4.423 |
202
- | Flavobacteriaceae | 631 | 4.303 |
203
- | Lactobacillaceae | 471 | 3.389 |
204
- | Enterobacteriaceae | 439 | 3.719 |
205
- | Microbacteriaceae | 396 | 2.467 |
206
- | Pseudomonadaceae | 388 | 2.254 |
207
- | Roseobacteraceae | 341 | 3.054 |
208
- | Paenibacillaceae | 319 | 3.319 |
209
- | Pseudonocardiaceae | 306 | 2.325 |
210
- | Moraxellaceae | 260 | 4.196 |
211
- | Sphingomonadaceae | 256 | 1.890 |
212
- | Streptococcaceae | 251 | 3.510 |
213
- | Clostridiaceae | 247 | 4.372 |
214
- | Vibrionaceae | 237 | 3.256 |
215
 
216
  ### `optimal_ph`
217
 
218
  | Family | n | MAE |
219
  |---|---|---|
220
- | Flavobacteriaceae | 323 | 0.412 |
221
- | Bacillaceae | 273 | 0.689 |
222
- | Roseobacteraceae | 192 | 0.389 |
223
- | Paenibacillaceae | 126 | 0.442 |
224
- | Microbacteriaceae | 112 | 0.477 |
225
- | Sphingobacteriaceae | 100 | 0.395 |
226
- | Sphingomonadaceae | 96 | 0.387 |
227
- | Streptomycetaceae | 92 | 0.546 |
228
- | Pseudonocardiaceae | 85 | 0.555 |
229
- | Halomonadaceae | 81 | 0.566 |
230
- | Nocardioidaceae | 74 | 0.495 |
231
- | Paracoccaceae | 71 | 0.577 |
232
- | Micrococcaceae | 71 | 0.598 |
233
- | Erythrobacteraceae | 68 | 0.450 |
234
- | Alteromonadaceae | 68 | 0.365 |
235
 
236
  ### `salt_tolerance_pct`
237
 
238
  | Family | n | MAE |
239
  |---|---|---|
240
- | Flavobacteriaceae | 285 | 1.711 |
241
- | Streptomycetaceae | 283 | 2.141 |
242
- | Bacillaceae | 246 | 3.508 |
243
- | Microbacteriaceae | 140 | 2.795 |
244
- | Pseudonocardiaceae | 134 | 2.345 |
245
- | Roseobacteraceae | 134 | 1.794 |
246
- | Paenibacillaceae | 125 | 2.184 |
247
- | Pseudomonadaceae | 110 | 4.033 |
248
- | Vibrionaceae | 99 | 2.488 |
249
- | Sphingomonadaceae | 92 | 1.809 |
250
- | Micromonosporaceae | 88 | 1.634 |
251
- | Micrococcaceae | 85 | 3.008 |
252
- | Nocardiaceae | 84 | 2.674 |
253
- | Streptococcaceae | 82 | 1.180 |
254
- | Lactobacillaceae | 78 | 1.852 |
255
 
256
  ## Known limitations
257
 
 
1
  # microbe-model — v0 baseline eval report
2
 
3
+ _Generated: 2026-05-05T06:56:14+00:00_
4
 
5
  ## TL;DR
6
 
7
+ - **`optimal_temperature_c`**: MAE = **2.94** (vs always-predict-mean 4.98, **+41%**)
8
+ - **`optimal_ph`**: MAE = **0.51** (vs always-predict-mean 0.55, **+7%**)
9
+ - **`oxygen_requirement`**: macro-F1 = **0.341** (vs always-predict-majority 0.059, **+479%**)
10
+ - **`salt_tolerance_pct`**: MAE = **2.52** (vs always-predict-mean 2.83, **+11%**)
11
 
12
+ Trained on **46,029** strains with **418** features (genome-derived features plus one-hot isolation-source tags). Cross-validation: 5-fold GroupKFold by taxonomic family.
13
 
14
  ## Corpus
15
 
16
+ - Total strains in feature table: **46,029**
17
  - Labeled-strain counts by target:
18
+ - `optimal_temperature_c`: 45,621
19
+ - `optimal_ph`: 5,103
20
+ - `oxygen_requirement`: 21,639
21
+ - `salt_tolerance_pct`: 6,330
22
 
23
  ## Target distributions
24
 
25
+ - `optimal_temperature_c`: n=45,621, mean=32.24, std=7.13, p10=27.50, median=30.00, p90=37.00
26
+ - `optimal_ph`: n=5,103, mean=7.19, std=0.82, p10=6.50, median=7.00, p90=8.00
27
+ - `salt_tolerance_pct`: n=6,330, mean=3.93, std=4.03, p10=0.00, median=3.00, p90=8.00
28
  - `oxygen_requirement`:
29
+ - `aerobe`: 7,803
30
+ - `anaerobe`: 4,193
31
+ - `microaerophile`: 3,804
32
+ - `facultative anaerobe`: 3,389
33
+ - `obligate aerobe`: 2,213
34
+ - `obligate anaerobe`: 136
35
+ - `facultative aerobe`: 87
36
+ - `aerotolerant`: 12
37
  - `microaerotolerant`: 2
 
38
 
39
  ## Per-target results (5-fold GroupKFold by family)
40
 
 
43
 
44
  | Target | Task | n labeled | Model metric | Baseline | Improvement |
45
  |---|---|---|---|---|---|
46
+ | `optimal_temperature_c` | regression | 45,621 | MAE=2.939 | MAE=4.981 | +41.0% |
47
+ | `optimal_ph` | regression | 5,103 | MAE=0.509 | MAE=0.546 | +6.8% |
48
+ | `oxygen_requirement` | classification | 21,639 | F1=0.341 | F1=0.059 | +479.5% |
49
+ | `salt_tolerance_pct` | regression | 6,330 | MAE=2.517 | MAE=2.827 | +11.0% |
50
 
51
  ### `optimal_temperature_c` — fold-by-fold
52
 
53
  | Fold | Metric | Train | Test |
54
  |---|---|---|---|
55
+ | 1 | mae = 3.104 | n=36,496 | n=9,125 |
56
+ | 2 | mae = 2.736 | n=36,497 | n=9,124 |
57
+ | 3 | mae = 3.146 | n=36,497 | n=9,124 |
58
+ | 4 | mae = 3.277 | n=36,497 | n=9,124 |
59
+ | 5 | mae = 2.435 | n=36,497 | n=9,124 |
60
 
61
  **Top 10 features for `optimal_temperature_c`:**
62
 
63
+ - `ivywrel_frac` — 0.1267
64
+ - `iso_cat2_thermophilic_gt45_c` — 0.0299
65
+ - `n_predicted_cds` — 0.0251
66
+ - `iso_cat2_human` — 0.0209
67
+ - `iso_cat1_infection` — 0.0206
68
+ - `iso_cat2_patient` — 0.0178
69
+ - `aa_frac_C` — 0.0150
70
+ - `genome_size_nt` — 0.0122
71
+ - `aa_frac_D` — 0.0113
72
+ - `codon_AGG` — 0.0109
73
 
74
  ### `optimal_ph` — fold-by-fold
75
 
76
  | Fold | Metric | Train | Test |
77
  |---|---|---|---|
78
+ | 1 | mae = 0.456 | n=4,082 | n=1,021 |
79
+ | 2 | mae = 0.626 | n=4,082 | n=1,021 |
80
+ | 3 | mae = 0.528 | n=4,082 | n=1,021 |
81
+ | 4 | mae = 0.480 | n=4,083 | n=1,020 |
82
+ | 5 | mae = 0.454 | n=4,083 | n=1,020 |
83
 
84
  **Top 10 features for `optimal_ph`:**
85
 
86
+ - `iso_cat2_acidic` — 0.0522
87
+ - `iso_cat2_alkaline` — 0.0435
88
+ - `neg_charged_frac` — 0.0169
89
+ - `aa_frac_E` — 0.0086
90
+ - `tetra_CTCT` — 0.0084
91
+ - `aa_frac_H` — 0.0080
92
+ - `mean_isoelectric_point` — 0.0076
93
+ - `tetra_CACT` — 0.0074
94
+ - `tetra_AGAC` — 0.0071
95
+ - `tetra_AGGT` — 0.0059
96
 
97
  ### `oxygen_requirement` — fold-by-fold
98
 
99
  | Fold | Metric | Train | Test |
100
  |---|---|---|---|
101
+ | 1 | f1_macro = 0.315 | n=17,311 | n=4,328 |
102
+ | 2 | f1_macro = 0.382 | n=17,311 | n=4,326 |
103
+ | 3 | f1_macro = 0.344 | n=17,311 | n=4,328 |
104
+ | 4 | f1_macro = 0.259 | n=17,311 | n=4,328 |
105
+ | 5 | f1_macro = 0.406 | n=17,312 | n=4,327 |
106
 
107
  **Top 10 features for `oxygen_requirement`:**
108
 
109
+ - `codon_ATA` — 0.0414
110
+ - `iso_cat1_host` — 0.0260
111
+ - `n_predicted_cds` — 0.0252
112
+ - `aa_frac_C` — 0.0191
113
+ - `iso_cat1_environmental` — 0.0165
114
+ - `codon_CGT` — 0.0148
115
+ - `iso_cat1_engineered` — 0.0138
116
+ - `genome_size_nt` — 0.0113
117
+ - `iso_cat2_human` — 0.0102
118
+ - `codon_TAA` — 0.0090
119
 
120
  ### `salt_tolerance_pct` — fold-by-fold
121
 
122
  | Fold | Metric | Train | Test |
123
  |---|---|---|---|
124
+ | 1 | mae = 2.218 | n=5,064 | n=1,266 |
125
+ | 2 | mae = 2.249 | n=5,064 | n=1,266 |
126
+ | 3 | mae = 2.819 | n=5,064 | n=1,266 |
127
+ | 4 | mae = 2.350 | n=5,064 | n=1,266 |
128
+ | 5 | mae = 2.948 | n=5,064 | n=1,266 |
129
 
130
  **Top 10 features for `salt_tolerance_pct`:**
131
 
132
+ - `aa_frac_C` — 0.0298
133
+ - `neg_charged_frac` — 0.0278
134
+ - `tetra_ATCC` — 0.0183
135
+ - `iso_cat1_environmental` — 0.0142
136
+ - `tetra_GACT` — 0.0121
137
+ - `iso_cat2_saline` — 0.0114
138
+ - `codon_TGC` — 0.0112
139
+ - `tetra_CGTT` — 0.0094
140
+ - `codon_CGT` — 0.0087
141
+ - `iso_cat2_industrial` — 0.0085
142
 
143
  ## Feature ↔ target correlations (Spearman, top 10)
144
 
 
148
 
149
  | Feature | Spearman ρ | p-value |
150
  |---|---|---|
151
+ | `genome_size_nt` | -0.493 | 0.0e+00 |
152
+ | `n_predicted_cds` | -0.482 | 0.0e+00 |
153
+ | `aa_frac_P` | -0.391 | 0.0e+00 |
154
+ | `aa_frac_Y` | +0.390 | 0.0e+00 |
155
+ | `tetra_TCTT` | +0.383 | 0.0e+00 |
156
+ | `tetra_TATC` | +0.381 | 0.0e+00 |
157
+ | `tetra_GATA` | +0.381 | 0.0e+00 |
158
+ | `tetra_AAGA` | +0.381 | 0.0e+00 |
159
+ | `tetra_CATA` | +0.380 | 0.0e+00 |
160
+ | `tetra_TATG` | +0.379 | 0.0e+00 |
161
 
162
  ### `optimal_ph`
163
 
164
  | Feature | Spearman ρ | p-value |
165
  |---|---|---|
166
+ | `neg_charged_frac` | +0.304 | 1.6e-109 |
167
+ | `mean_isoelectric_point` | -0.278 | 1.8e-91 |
168
+ | `aa_frac_E` | +0.256 | 4.5e-77 |
169
+ | `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
170
+ | `ivywrel_frac` | +0.159 | 2.4e-30 |
171
+ | `codon_AAG` | -0.154 | 1.7e-28 |
172
+ | `codon_CGA` | +0.153 | 5.8e-28 |
173
+ | `codon_TGC` | -0.151 | 2.6e-27 |
174
+ | `iso_cat2_saline` | +0.137 | 8.9e-23 |
175
+ | `tetra_CACT` | +0.135 | 4.3e-22 |
176
 
177
  ### `salt_tolerance_pct`
178
 
179
  | Feature | Spearman ρ | p-value |
180
  |---|---|---|
181
+ | `tetra_AGTC` | +0.270 | 4.0e-106 |
182
+ | `tetra_GACT` | +0.268 | 1.4e-104 |
183
+ | `neg_charged_frac` | +0.221 | 3.9e-71 |
184
+ | `ivywrel_frac` | +0.221 | 8.4e-71 |
185
+ | `aa_frac_C` | -0.202 | 4.7e-59 |
186
+ | `iso_cat1_environmental` | -0.193 | 2.6e-54 |
187
+ | `n_contigs` | -0.181 | 1.0e-47 |
188
+ | `mean_cds_aa_length` | -0.177 | 8.2e-46 |
189
+ | `tetra_ACTC` | +0.176 | 4.5e-45 |
190
+ | `tetra_GAGT` | +0.173 | 1.5e-43 |
191
 
192
  ## Per-family error breakdown (regression targets)
193
 
 
197
 
198
  | Family | n | MAE |
199
  |---|---|---|
200
+ | Enterobacteriaceae | 2662 | 4.086 |
201
+ | Streptomycetaceae | 2212 | 1.919 |
202
+ | Bacillaceae | 1886 | 3.195 |
203
+ | Lactobacillaceae | 1732 | 3.537 |
204
+ | Pseudomonadaceae | 1621 | 2.576 |
205
+ | Myxococcaceae | 1546 | 0.403 |
206
+ | Streptococcaceae | 1170 | 2.367 |
207
+ | Staphylococcaceae | 1068 | 4.288 |
208
+ | Flavobacteriaceae | 981 | 4.202 |
209
+ | Corynebacteriaceae | 900 | 2.231 |
210
+ | Moraxellaceae | 890 | 3.514 |
211
+ | Paenibacillaceae | 760 | 2.967 |
212
+ | Microbacteriaceae | 734 | 2.482 |
213
+ | Micrococcaceae | 719 | 2.991 |
214
+ | Nocardiaceae | 715 | 2.679 |
215
 
216
  ### `optimal_ph`
217
 
218
  | Family | n | MAE |
219
  |---|---|---|
220
+ | Flavobacteriaceae | 355 | 0.391 |
221
+ | Bacillaceae | 298 | 0.678 |
222
+ | Roseobacteraceae | 204 | 0.400 |
223
+ | Paenibacillaceae | 139 | 0.435 |
224
+ | Microbacteriaceae | 120 | 0.438 |
225
+ | Sphingobacteriaceae | 114 | 0.353 |
226
+ | Sphingomonadaceae | 102 | 0.346 |
227
+ | Streptomycetaceae | 98 | 0.599 |
228
+ | Pseudonocardiaceae | 93 | 0.495 |
229
+ | Halomonadaceae | 82 | 0.603 |
230
+ | Micrococcaceae | 82 | 0.619 |
231
+ | Nocardioidaceae | 80 | 0.490 |
232
+ | Paracoccaceae | 76 | 0.564 |
233
+ | Alteromonadaceae | 71 | 0.349 |
234
+ | Erythrobacteraceae | 68 | 0.423 |
235
 
236
  ### `salt_tolerance_pct`
237
 
238
  | Family | n | MAE |
239
  |---|---|---|
240
+ | Streptococcaceae | 340 | 0.891 |
241
+ | Flavobacteriaceae | 312 | 1.834 |
242
+ | Bacillaceae | 310 | 3.417 |
243
+ | Streptomycetaceae | 309 | 2.116 |
244
+ | Pseudomonadaceae | 196 | 4.802 |
245
+ | Corynebacteriaceae | 194 | 3.853 |
246
+ | Vibrionaceae | 173 | 2.872 |
247
+ | Microbacteriaceae | 166 | 2.616 |
248
+ | Paenibacillaceae | 150 | 2.096 |
249
+ | Roseobacteraceae | 143 | 1.556 |
250
+ | Pseudonocardiaceae | 142 | 2.400 |
251
+ | Moraxellaceae | 126 | 2.581 |
252
+ | Nocardiaceae | 125 | 2.899 |
253
+ | Enterococcaceae | 111 | 1.723 |
254
+ | Alcaligenaceae | 104 | 4.454 |
255
 
256
  ## Known limitations
257
 
scripts/03_train_baseline.py CHANGED
@@ -25,6 +25,41 @@ def derive_group(row: pd.Series) -> str:
25
  return "__unknown__"
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def main() -> None:
29
  t0 = time.time()
30
  pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
@@ -32,7 +67,12 @@ def main() -> None:
32
  df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
33
  df["group"] = df.apply(derive_group, axis=1)
34
 
 
 
 
 
35
  feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
 
36
 
37
  print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
38
  print(f"Distinct groups: {df['group'].nunique():,}")
 
25
  return "__unknown__"
26
 
27
 
28
def encode_isolation_categories(
    df: pd.DataFrame,
    *,
    min_count: int = 10,
) -> tuple[pd.DataFrame, list[str]]:
    """One-hot encode isolation_cat1/cat2 (pipe-joined multi-labels).

    Each strain's category cell is "Tag1|Tag2|..." (or NaN). Every cell is split
    once into a set of tags, and one ``iso_<level>_<slug>`` column is created per
    tag that appears in at least ``min_count`` rows. Strains without any
    isolation info get all-zero rows for these features (encoded as "no tag",
    not as missing values).

    Args:
        df: Training table. Columns ``isolation_cat1`` / ``isolation_cat2`` are
            read if present; a missing level is skipped. The frame is modified
            in place (new columns are added) and also returned.
        min_count: Minimum number of rows a tag must appear in to get a column.

    Returns:
        ``(df, new_cols)`` where ``new_cols`` lists the added column names, in
        level order and, within a level, in sorted-tag order.

    Notes:
        * Empty tags (from ``""`` cells or stray ``|`` separators) and tags
          whose slug is empty are ignored, so no ``iso_<level>_`` column is made.
        * A tag repeated inside one cell counts once toward ``min_count``.
        * If several kept tags map to the same slug, they share one column that
          is 1 when the row carries any of them.
    """
    # Function-scope imports, hoisted out of the per-level loop.
    import re
    from collections import Counter

    new_cols: list[str] = []
    for level in ("isolation_cat1", "isolation_cat2"):
        if level not in df.columns:
            continue
        prefix = f"iso_{level.split('_')[1]}_"

        # Parse every cell exactly once; NaN becomes "" and yields an empty set.
        row_tags = df[level].fillna("").map(
            lambda cell: frozenset(t for t in cell.split("|") if t)
        )

        # Row-level frequency: each tag is counted at most once per strain.
        tag_counts: Counter[str] = Counter()
        for tags in row_tags:
            tag_counts.update(tags)

        # Group kept tags by output column. Dict insertion order follows the
        # sorted raw tags, which fixes the order of the generated columns.
        col_tags: dict[str, set[str]] = {}
        for tag in sorted(t for t, n in tag_counts.items() if n >= min_count):
            slug = tag.lower().replace(">", "gt").replace("<", "lt")
            slug = re.sub(r"[^a-z0-9]+", "_", slug).strip("_")
            if not slug:
                continue  # tag made only of punctuation: no usable column name
            col_tags.setdefault(prefix + slug, set()).add(tag)

        for col, wanted in col_tags.items():
            df[col] = row_tags.map(
                lambda present, w=frozenset(wanted): int(not w.isdisjoint(present))
            )
            new_cols.append(col)
    return df, new_cols
61
+
62
+
63
  def main() -> None:
64
  t0 = time.time()
65
  pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
 
67
  df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
68
  df["group"] = df.apply(derive_group, axis=1)
69
 
70
+ df, iso_cols = encode_isolation_categories(df)
71
+ print(f"Encoded {len(iso_cols)} isolation-category features "
72
+ f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
73
+
74
  feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
75
+ feature_cols = feature_cols + iso_cols
76
 
77
  print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
78
  print(f"Distinct groups: {df['group'].nunique():,}")
scripts/14_train_combined.py CHANGED
@@ -1,4 +1,4 @@
1
- """Train v3: hand-crafted features (v1) concatenated with ESM-2 embeddings (v2).
2
 
3
  Tests whether embeddings carry complementary signal to the curated features even
4
  when they lose head-to-head. Same train/test splits and XGBoost hyperparameters
@@ -14,7 +14,9 @@ Writes:
14
  """
15
  from __future__ import annotations
16
 
 
17
  import time
 
18
 
19
  import pandas as pd
20
 
@@ -33,6 +35,37 @@ def derive_group(row: pd.Series) -> str:
33
  return "__unknown__"
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def main() -> None:
37
  t0 = time.time()
38
  pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
@@ -43,12 +76,16 @@ def main() -> None:
43
  df = df.merge(embeds, on=["bacdive_id", "genome_accession"], how="inner")
44
  df["group"] = df.apply(derive_group, axis=1)
45
 
 
 
 
 
46
  v1_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
47
  v2_cols = [c for c in embeds.columns if c.startswith("emb_")]
48
- feature_cols = v1_cols + v2_cols
49
 
50
  print(f"Training table: {len(df):,} strains × {len(feature_cols)} features "
51
- f"({len(v1_cols)} hand-crafted + {len(v2_cols)} embedding dims)")
52
  print(f"Distinct groups: {df['group'].nunique():,}")
53
  print()
54
 
 
1
+ """Train v3: hand-crafted features (v1) + ESM-2 embeddings (v2) + isolation tags.
2
 
3
  Tests whether embeddings carry complementary signal to the curated features even
4
  when they lose head-to-head. Same train/test splits and XGBoost hyperparameters
 
14
  """
15
  from __future__ import annotations
16
 
17
+ import re
18
  import time
19
+ from collections import Counter
20
 
21
  import pandas as pd
22
 
 
35
  return "__unknown__"
36
 
37
 
38
def encode_isolation_categories(
    df: pd.DataFrame,
    *,
    min_count: int = 10,
) -> tuple[pd.DataFrame, list[str]]:
    """One-hot encode isolation_cat1/cat2 (pipe-joined multi-labels).

    Intended to stay in sync with the encoder in scripts/03_train_baseline.py so
    v3 sees the same isolation-tag vocabulary as v1.

    Each cell is "Tag1|Tag2|..." (or NaN). Every cell is split once into a set of
    tags, and one ``iso_<level>_<slug>`` column is created per tag that appears
    in at least ``min_count`` rows. Rows without isolation info get zeros.

    Args:
        df: Training table. Columns ``isolation_cat1`` / ``isolation_cat2`` are
            read if present; a missing level is skipped. The frame is modified
            in place (new columns are added) and also returned.
        min_count: Minimum number of rows a tag must appear in to get a column.

    Returns:
        ``(df, new_cols)`` where ``new_cols`` lists the added column names, in
        level order and, within a level, in sorted-tag order.

    Notes:
        * Empty tags (from ``""`` cells or stray ``|`` separators) and tags
          whose slug is empty are ignored, so no ``iso_<level>_`` column is made.
        * A tag repeated inside one cell counts once toward ``min_count``.
        * If several kept tags map to the same slug, they share one column that
          is 1 when the row carries any of them.
    """
    new_cols: list[str] = []
    for level in ("isolation_cat1", "isolation_cat2"):
        if level not in df.columns:
            continue
        prefix = f"iso_{level.split('_')[1]}_"

        # Parse every cell exactly once; NaN becomes "" and yields an empty set.
        row_tags = df[level].fillna("").map(
            lambda cell: frozenset(t for t in cell.split("|") if t)
        )

        # Row-level frequency: each tag is counted at most once per strain.
        tag_counts: Counter[str] = Counter()
        for tags in row_tags:
            tag_counts.update(tags)

        # Group kept tags by output column. Dict insertion order follows the
        # sorted raw tags, which fixes the order of the generated columns.
        col_tags: dict[str, set[str]] = {}
        for tag in sorted(t for t, n in tag_counts.items() if n >= min_count):
            slug = tag.lower().replace(">", "gt").replace("<", "lt")
            slug = re.sub(r"[^a-z0-9]+", "_", slug).strip("_")
            if not slug:
                continue  # tag made only of punctuation: no usable column name
            col_tags.setdefault(prefix + slug, set()).add(tag)

        for col, wanted in col_tags.items():
            df[col] = row_tags.map(
                lambda present, w=frozenset(wanted): int(not w.isdisjoint(present))
            )
            new_cols.append(col)
    return df, new_cols
67
+
68
+
69
  def main() -> None:
70
  t0 = time.time()
71
  pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
 
76
  df = df.merge(embeds, on=["bacdive_id", "genome_accession"], how="inner")
77
  df["group"] = df.apply(derive_group, axis=1)
78
 
79
+ df, iso_cols = encode_isolation_categories(df)
80
+ print(f"Encoded {len(iso_cols)} isolation-category features "
81
+ f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
82
+
83
  v1_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
84
  v2_cols = [c for c in embeds.columns if c.startswith("emb_")]
85
+ feature_cols = v1_cols + v2_cols + iso_cols
86
 
87
  print(f"Training table: {len(df):,} strains × {len(feature_cols)} features "
88
+ f"({len(v1_cols)} hand-crafted + {len(v2_cols)} embedding dims + {len(iso_cols)} iso tags)")
89
  print(f"Distinct groups: {df['group'].nunique():,}")
90
  print()
91
 
scripts/17_reextract_phenotypes.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rebuild data/bacdive_phenotypes.parquet from cached data/bacdive/*.json.
2
+
3
+ Use this after extending extract_phenotypes() to add fields without re-running the
4
+ ~30-min API scan. Reads every cached JSON, re-applies the extractor, and overwrites
5
+ the parquet.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+
15
+ from microbe_model import config
16
+ from microbe_model.data.bacdive import extract_phenotypes
17
+
18
+
19
def main() -> None:
    """Rebuild the phenotype parquet from the cached BacDive JSON records.

    Reads every ``*.json`` file under ``config.BACDIVE_DIR`` in sorted order,
    applies ``extract_phenotypes`` to each parsed record, writes the resulting
    table to ``config.DATA / "bacdive_phenotypes.parquet"`` (overwriting it),
    and prints per-column non-null coverage.

    Files that cannot be decoded or parsed are skipped and reported at the end.
    If no record could be extracted at all, the script exits without touching
    the existing parquet.

    Raises:
        SystemExit: when no phenotype rows were extracted.
    """
    files = sorted(Path(config.BACDIVE_DIR).glob("*.json"))
    print(f"Re-extracting from {len(files):,} cached JSONs in {config.BACDIVE_DIR}")

    rows = []
    skipped: list[Path] = []
    for path in tqdm(files, desc="re-extract", unit="strain"):
        try:
            # Explicit UTF-8 so parsing does not depend on the platform's
            # default locale encoding.
            record = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Best-effort: a truncated or corrupt cache entry must not abort
            # the rebuild, but it is recorded so it can be reported below.
            skipped.append(path)
            continue
        rows.append(extract_phenotypes(record))

    if skipped:
        print(f"Skipped {len(skipped):,} unreadable JSON file(s), e.g. {skipped[0].name}")

    out = config.DATA / "bacdive_phenotypes.parquet"
    if not rows:
        # Never replace an existing phenotype table with an empty one.
        raise SystemExit(f"No phenotype rows extracted; leaving {out} untouched")

    df = pd.DataFrame(rows)
    df.to_parquet(out, index=False)

    print(f"\nWrote {len(df):,} strains to {out}")
    print("Field coverage:")
    for col in df.columns:
        n = df[col].notna().sum()
        print(f"  {col:30s} {n:>6,} / {len(df):,} ({100 * n / max(1, len(df)):.1f}%)")
40
+
41
+
42
+ if __name__ == "__main__":
43
+ main()
scripts/18_resolve_species_to_genome.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Resolve BacDive species names → NCBI representative genome accessions.
2
+
3
+ Targets the phenotype-labeled strains that lack a `genome_accession` in BacDive.
4
+ Many of those species DO have a sequenced genome — BacDive just doesn't link to it.
5
+ We query NCBI Datasets v2 for one RefSeq assembly per unique species name and write
6
+ the {species: accession} map so the next pipeline step can pull the FASTAs.
7
+
8
+ Output: data/species_to_genome.parquet (species, ncbi_accession, status)
9
+
10
+ Resumable: re-runs skip species already present in the output.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import time
15
+ from urllib.parse import quote
16
+
17
+ import pandas as pd
18
+ import requests
19
+ from tqdm import tqdm
20
+
21
+ from microbe_model import config
22
+
23
+ API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
24
+ RATE_LIMIT_S = 0.11 if config.NCBI_API_KEY else 0.36
25
+ OUT_PATH = config.DATA / "species_to_genome.parquet"
26
+
27
+
28
def fetch_one(species: str, session: requests.Session) -> tuple[str | None, str]:
    """Return (accession, status) for a species. status ∈ {hit, miss, error}.

    Sends one dataset_report request for ``species`` (URL-quoted into ``API_URL``),
    asking for a single RefSeq report, and makes up to three attempts.

    Returns:
        ``(accession, "hit")`` when the first report carries an accession;
        ``(None, "miss")`` on HTTP 404 or when no usable report comes back;
        ``(None, "error")`` when every attempt was throttled (429/502/503), failed at
        the transport/HTTP level, or returned a body that is not valid JSON.
    """
    headers: dict[str, str] = {"Accept": "application/json"}
    if config.NCBI_API_KEY:
        headers["api-key"] = config.NCBI_API_KEY
    params = {"filters.assembly_source": "RefSeq", "page_size": 1}
    url = API_URL.format(taxon=quote(species))
    max_attempts = 3

    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        try:
            # Pace every request, including retries.
            time.sleep(RATE_LIMIT_S)
            resp = session.get(url, headers=headers, params=params, timeout=30)
            if resp.status_code == 404:
                return None, "miss"
            if resp.status_code in (429, 502, 503):
                # Exponential backoff, but do not sleep when no retry will follow.
                if not is_last:
                    time.sleep(2 ** attempt)
                continue
            resp.raise_for_status()
            payload = resp.json()
            reports = payload.get("reports") if isinstance(payload, dict) else None
            if reports and isinstance(reports[0], dict):
                acc = reports[0].get("accession")
                return (acc, "hit") if acc else (None, "miss")
            return None, "miss"
        except (requests.RequestException, ValueError):
            # ValueError covers a non-JSON body on `requests` versions whose decode
            # error is not a RequestException subclass, so one bad response cannot
            # abort the whole resolution run.
            if is_last:
                return None, "error"
            time.sleep(2 ** attempt)
    return None, "error"
60
+
61
+
62
def main() -> None:
    """Resolve every labeled, genome-less species to an accession and persist the map.

    Selects species that have at least one phenotype label, no ``genome_accession`` and
    a name containing a space, then calls ``fetch_one`` for each one not already
    resolved. Results are written to ``OUT_PATH`` at periodic checkpoints and again on
    exit (including on interrupt).

    Resume behavior: prior ``hit`` and ``miss`` rows are reused as-is. Prior ``error``
    rows are NOT treated as done, so transient failures are retried on the next run
    instead of becoming permanent gaps.
    """
    columns = ["species", "ncbi_accession", "status"]

    df = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
    has_label = df[
        ["optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"]
    ].notna().any(axis=1)
    no_genome = df["genome_accession"].isna()
    valid_species = df["species"].notna() & df["species"].str.contains(" ", na=False)
    gap_species = sorted(df[has_label & no_genome & valid_species]["species"].unique())
    print(f"unique species to resolve: {len(gap_species):,}")

    # Resume from prior partial run
    done: dict[str, tuple[str | None, str]] = {}
    if OUT_PATH.exists():
        n_retry = 0
        for rec in pd.read_parquet(OUT_PATH).to_dict("records"):
            if rec["status"] == "error":
                n_retry += 1
                continue
            acc = rec["ncbi_accession"]
            # Normalise null markers (None/NaN) back to None.
            done[rec["species"]] = (acc if isinstance(acc, str) else None, rec["status"])
        print(f"resuming — {len(done):,} already cached")
        if n_retry:
            print(f"retrying {n_retry:,} species that errored in a prior run")

    todo = [s for s in gap_species if s not in done]
    print(f"to fetch: {len(todo):,}")

    rows: list[dict] = [
        {"species": sp, "ncbi_accession": acc, "status": st}
        for sp, (acc, st) in done.items()
    ]

    with requests.Session() as session:
        try:
            for sp in tqdm(todo, desc="resolving", unit="species"):
                acc, status = fetch_one(sp, session)
                rows.append({"species": sp, "ncbi_accession": acc, "status": status})
                # Periodic checkpoint every 200 species so an interrupt doesn't lose progress
                if len(rows) % 200 == 0:
                    pd.DataFrame(rows, columns=columns).to_parquet(OUT_PATH, index=False)
        finally:
            pd.DataFrame(rows, columns=columns).to_parquet(OUT_PATH, index=False)

    # Explicit columns keep the summary below working even when there are zero rows.
    out = pd.DataFrame(rows, columns=columns)
    print(f"\nwrote {len(out):,} rows to {OUT_PATH}")
    print(f" hit: {(out['status'] == 'hit').sum():,} ({100 * (out['status'] == 'hit').mean():.0f}%)")
    print(f" miss: {(out['status'] == 'miss').sum():,}")
    print(f" error: {(out['status'] == 'error').sum():,}")
107
+
108
+
109
+ if __name__ == "__main__":
110
+ main()
scripts/19_featurize_resolved.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deduplicated featurize for species-resolved genomes.
2
+
3
+ When many BacDive strains share a single species-level representative genome (the
4
+ common case after scripts/18), naively running scripts/02 re-downloads + re-runs
5
+ pyrodigal on the same FASTA per-strain. This script downloads each unique accession
6
+ once, then replicates the resulting feature dict across all bacdive_ids that share it.
7
+
8
+ Resumable via data/features.jsonl (skips bacdive_ids already in the log).
9
+
10
+ Usage:
11
+ uv run python scripts/19_featurize_resolved.py --workers 7
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import os
18
+ import time
19
+ from concurrent.futures import ProcessPoolExecutor, as_completed
20
+ from pathlib import Path
21
+
22
+ import pandas as pd
23
+ from tqdm import tqdm
24
+
25
+ from microbe_model import config
26
+ from microbe_model.pipeline import _load_done_ids, _process_one
27
+
28
+
29
def main() -> None:
    """Featurize each unique genome accession once and fan the result out to its strains.

    Strains with at least one phenotype label and a ``genome_accession`` form the pool;
    those whose ``bacdive_id`` is already in ``data/features.jsonl`` are skipped. The
    remaining strains are grouped by accession, one task per accession is submitted to
    a process pool, and each successful feature dict is written once per sibling
    ``bacdive_id``. Accessions whose worker raised or returned nothing are counted and
    listed at the end. Finally the JSONL log is de-duplicated by ``bacdive_id`` (last
    row wins) and materialized as ``data/features.parquet``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=max(1, (os.cpu_count() or 4) - 1))
    parser.add_argument("--max-accessions", type=int, default=None,
                        help="Cap how many unique accessions to process (debug).")
    args = parser.parse_args()

    pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")

    # Training-ready pool: any phenotype label + a genome accession
    label_cols = list(config.PHENOTYPE_TARGETS.keys())
    has_label = pheno[label_cols].notna().any(axis=1)
    has_genome = pheno["genome_accession"].notna()
    ready = pheno[has_label & has_genome].copy()
    ready["bacdive_id"] = ready["bacdive_id"].astype(int)
    ready["genome_accession"] = ready["genome_accession"].astype(str)

    out_path = config.DATA / "features.jsonl"
    done_ids = _load_done_ids(out_path)
    todo = ready[~ready["bacdive_id"].isin(done_ids)]
    print(f"strains in pool: {len(ready):,}")
    print(f" already featurized: {len(done_ids):,}")
    print(f" remaining: {len(todo):,}")

    # Group remaining strains by accession
    by_acc = todo.groupby("genome_accession")["bacdive_id"].apply(list).to_dict()
    accessions = sorted(by_acc.keys())
    if args.max_accessions:
        accessions = accessions[: args.max_accessions]
    print(f"unique accessions to download: {len(accessions):,}")
    print(f" avg strains per accession: {sum(len(by_acc[a]) for a in accessions) / max(1, len(accessions)):.1f}")
    print(f"workers: {args.workers}\n")

    # Featurize each accession once; the worker tags the result with the *first* bacdive_id
    # of that accession's strain group. We then replicate the feature dict to all sibling
    # bacdive_ids before writing.
    rep_tasks = [(by_acc[acc][0], acc) for acc in accessions]

    n_success = 0
    n_replicated_rows = 0
    failures: list[tuple[str, str]] = []  # (accession, reason), reported after the run
    start = time.time()
    with open(out_path, "a") as fh, \
            ProcessPoolExecutor(max_workers=args.workers) as pool, \
            tqdm(total=len(rep_tasks), desc="featurize", unit="genome") as bar:
        futures = {pool.submit(_process_one, t): t for t in rep_tasks}
        for fut in as_completed(futures):
            _rep_id, acc = futures[fut]
            bar.update(1)
            try:
                feats = fut.result()
            except Exception as exc:
                # Best-effort: one bad genome must not stop the batch, but keep the reason.
                failures.append((acc, f"{type(exc).__name__}: {exc}"))
                continue
            if not feats:
                failures.append((acc, "no features returned"))
                continue
            n_success += 1
            for bid in by_acc[acc]:
                row = dict(feats)
                row["bacdive_id"] = bid
                row["genome_accession"] = acc
                fh.write(json.dumps(row) + "\n")
                n_replicated_rows += 1
            # Flush per accession so an interrupted run can resume from the log.
            fh.flush()
            bar.set_postfix(genomes_ok=n_success, rows=n_replicated_rows)

    print(f"\nfinished in {(time.time() - start) / 60:.1f} min")
    print(f" unique genomes featurized: {n_success:,}/{len(rep_tasks):,}")
    print(f" feature rows written: {n_replicated_rows:,}")
    if failures:
        print(f" failed accessions: {len(failures):,}")
        for acc, reason in failures[:10]:
            print(f"   {acc}: {reason}")
        if len(failures) > 10:
            print(f"   ... and {len(failures) - 10:,} more")

    # Materialize parquet
    if out_path.stat().st_size == 0:
        # No feature rows exist yet, so there is no bacdive_id column to de-duplicate on.
        print(f" {out_path} is empty; skipping parquet materialization")
        return
    df = pd.read_json(out_path, lines=True)
    df = df.drop_duplicates(subset=["bacdive_id"], keep="last")
    parquet = config.DATA / "features.parquet"
    df.to_parquet(parquet, index=False)
    print(f" wrote {len(df):,} rows to {parquet}")
103
+
104
+
105
+ if __name__ == "__main__":
106
+ main()
src/microbe_model/data/bacdive.py CHANGED
@@ -102,12 +102,16 @@ def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
102
  - Physiology and metabolism → oxygen tolerance[]
103
  - Physiology and metabolism → halophily[]
104
  - Sequence information → Genome sequences[].INSDC accession
 
105
  """
106
  general = record.get("General") or {}
107
  taxon = record.get("Name and taxonomic classification") or {}
108
  culture = record.get("Culture and growth conditions") or {}
109
  physio = record.get("Physiology and metabolism") or {}
110
  seq = record.get("Sequence information") or {}
 
 
 
111
 
112
  out: dict[str, Any] = {
113
  "bacdive_id": general.get("BacDive-ID"),
@@ -120,10 +124,35 @@ def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
120
  "oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"),
121
  "salt_tolerance_pct": _derive_salt(physio.get("halophily")),
122
  "genome_accession": _first_genome_accession(seq.get("Genome sequences")),
 
 
 
123
  }
124
  return out
125
 
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def _as_list(x: Any) -> list:
128
  if x is None:
129
  return []
 
102
  - Physiology and metabolism → oxygen tolerance[]
103
  - Physiology and metabolism → halophily[]
104
  - Sequence information → Genome sequences[].INSDC accession
105
+ - Isolation, sampling and environmental information → isolation source categories[].Cat{1,2,3}
106
  """
107
  general = record.get("General") or {}
108
  taxon = record.get("Name and taxonomic classification") or {}
109
  culture = record.get("Culture and growth conditions") or {}
110
  physio = record.get("Physiology and metabolism") or {}
111
  seq = record.get("Sequence information") or {}
112
+ iso = record.get("Isolation, sampling and environmental information") or {}
113
+
114
+ iso_cats = _collect_isolation_categories(iso.get("isolation source categories"))
115
 
116
  out: dict[str, Any] = {
117
  "bacdive_id": general.get("BacDive-ID"),
 
124
  "oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"),
125
  "salt_tolerance_pct": _derive_salt(physio.get("halophily")),
126
  "genome_accession": _first_genome_accession(seq.get("Genome sequences")),
127
+ "isolation_cat1": iso_cats["cat1"],
128
+ "isolation_cat2": iso_cats["cat2"],
129
+ "isolation_cat3": iso_cats["cat3"],
130
  }
131
  return out
132
 
133
 
134
def _collect_isolation_categories(raw: Any) -> dict[str, str | None]:
    """Flatten BacDive's `isolation source categories` into 3 pipe-joined string fields.

    A strain commonly has multiple parallel category descriptions (e.g., #Host=Human AND
    #Host Body Product=Blood). All unique values per level (``Cat1``/``Cat2``/``Cat3``)
    are collected into a sorted, pipe-joined string so downstream code can split &
    one-hot. Surrounding whitespace and the leading '#' are stripped; values that are
    empty after stripping are ignored so they cannot produce an empty tag.

    Args:
        raw: The `isolation source categories` value; it is normalised with
            ``_as_list`` and entries that are not dicts are ignored.

    Returns:
        A dict with keys ``cat1``, ``cat2``, ``cat3``; each is the joined string, or
        ``None`` when no value was found for that level.
    """
    cats: dict[str, set[str]] = {"Cat1": set(), "Cat2": set(), "Cat3": set()}
    for entry in _as_list(raw):
        if not isinstance(entry, dict):
            continue
        for level, seen in cats.items():
            value = entry.get(level)
            if not isinstance(value, str):
                continue
            # Clean first, test for emptiness second: "#", "# " or "  #Host" must not
            # end up as "" or "#Host" in the tag set.
            cleaned = value.strip().lstrip("#").strip()
            if cleaned:
                seen.add(cleaned)
    return {
        level.lower(): "|".join(sorted(values)) or None
        for level, values in cats.items()
    }
154
+
155
+
156
  def _as_list(x: Any) -> list:
157
  if x is None:
158
  return []