Spaces:
Running
Expand training corpus to 46K strains: species-name → NCBI genome + isolation features
Browse files
Two layered improvements over the 17K-strain genome-only baseline:
1. Isolation-source features (Cat1/Cat2 from BacDive's `isolation source categories`)
- Re-extracted from cached BacDive JSON via scripts/17 (no re-download)
- One-hot encoded in scripts/03 (and mirrored in scripts/14 for the v3 trainer)
2. Species-name → NCBI representative genome resolution
- scripts/18 queries NCBI Datasets v2 /genome/taxon/ for each unique species
name in the gap (strains that had phenotype labels but no genome accession in BacDive).
5,393 of 7,905 species resolved (68%).
- scripts/19 featurizes the resolved genomes with deduplication: each unique accession is downloaded once
and its feature dict is replicated to all sibling strains (avoids ~5x redundant downloads).
- 5,254 of 5,283 unique genomes featurized (99.5% success).
Result: 17,054 → 46,058 training-ready strains.
Per-target metrics (vs original v0 genome-only):
optimal_temperature_c MAE 3.28 → 2.94 (-10.4%)
oxygen_requirement F1 0.279 → 0.341 (+22.2%)
optimal_ph MAE 0.52 → 0.51 ( -2.1%, label-limited)
salt_tolerance_pct MAE 2.51 → 2.52 ( +0.3%, label-limited)
pH and salt did not improve because BacDive label coverage for those targets barely
grew (1.1× for pH and 1.3× for salt, vs 2.7× for T_opt and 2.1× for oxygen).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- artifacts/baseline_results.json +210 -145
- artifacts/combined_results.json +204 -141
- artifacts/eval_report.md +161 -161
- scripts/03_train_baseline.py +40 -0
- scripts/14_train_combined.py +40 -3
- scripts/17_reextract_phenotypes.py +43 -0
- scripts/18_resolve_species_to_genome.py +110 -0
- scripts/19_featurize_resolved.py +106 -0
- src/microbe_model/data/bacdive.py +29 -0
|
@@ -1,274 +1,274 @@
|
|
| 1 |
{
|
| 2 |
"optimal_temperature_c": {
|
| 3 |
"task": "regression",
|
| 4 |
-
"mean_metric":
|
| 5 |
"folds": [
|
| 6 |
{
|
| 7 |
"target": "optimal_temperature_c",
|
| 8 |
"task": "regression",
|
| 9 |
"metric_name": "mae",
|
| 10 |
-
"value":
|
| 11 |
-
"n_train":
|
| 12 |
-
"n_test":
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"target": "optimal_temperature_c",
|
| 16 |
"task": "regression",
|
| 17 |
"metric_name": "mae",
|
| 18 |
-
"value":
|
| 19 |
-
"n_train":
|
| 20 |
-
"n_test":
|
| 21 |
},
|
| 22 |
{
|
| 23 |
"target": "optimal_temperature_c",
|
| 24 |
"task": "regression",
|
| 25 |
"metric_name": "mae",
|
| 26 |
-
"value": 3.
|
| 27 |
-
"n_train":
|
| 28 |
-
"n_test":
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"target": "optimal_temperature_c",
|
| 32 |
"task": "regression",
|
| 33 |
"metric_name": "mae",
|
| 34 |
-
"value": 3.
|
| 35 |
-
"n_train":
|
| 36 |
-
"n_test":
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"target": "optimal_temperature_c",
|
| 40 |
"task": "regression",
|
| 41 |
"metric_name": "mae",
|
| 42 |
-
"value":
|
| 43 |
-
"n_train":
|
| 44 |
-
"n_test":
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"top_features": {
|
| 48 |
-
"ivywrel_frac": 0.
|
| 49 |
-
"
|
| 50 |
-
"
|
| 51 |
-
"
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
-
"
|
| 55 |
-
"
|
| 56 |
-
"
|
| 57 |
-
"
|
| 58 |
-
"
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
}
|
| 69 |
},
|
| 70 |
"optimal_ph": {
|
| 71 |
"task": "regression",
|
| 72 |
-
"mean_metric": 0.
|
| 73 |
"folds": [
|
| 74 |
{
|
| 75 |
"target": "optimal_ph",
|
| 76 |
"task": "regression",
|
| 77 |
"metric_name": "mae",
|
| 78 |
-
"value": 0.
|
| 79 |
-
"n_train":
|
| 80 |
-
"n_test":
|
| 81 |
},
|
| 82 |
{
|
| 83 |
"target": "optimal_ph",
|
| 84 |
"task": "regression",
|
| 85 |
"metric_name": "mae",
|
| 86 |
-
"value": 0.
|
| 87 |
-
"n_train":
|
| 88 |
-
"n_test":
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"target": "optimal_ph",
|
| 92 |
"task": "regression",
|
| 93 |
"metric_name": "mae",
|
| 94 |
-
"value": 0.
|
| 95 |
-
"n_train":
|
| 96 |
-
"n_test":
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"target": "optimal_ph",
|
| 100 |
"task": "regression",
|
| 101 |
"metric_name": "mae",
|
| 102 |
-
"value": 0.
|
| 103 |
-
"n_train":
|
| 104 |
-
"n_test":
|
| 105 |
},
|
| 106 |
{
|
| 107 |
"target": "optimal_ph",
|
| 108 |
"task": "regression",
|
| 109 |
"metric_name": "mae",
|
| 110 |
-
"value": 0.
|
| 111 |
-
"n_train":
|
| 112 |
-
"n_test":
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"top_features": {
|
| 116 |
-
"
|
| 117 |
-
"
|
| 118 |
-
"
|
| 119 |
-
"
|
| 120 |
-
"
|
| 121 |
-
"
|
| 122 |
-
"
|
| 123 |
-
"
|
| 124 |
-
"
|
| 125 |
-
"
|
| 126 |
-
"
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
-
"
|
| 130 |
-
"
|
| 131 |
-
"
|
| 132 |
-
"
|
| 133 |
-
"
|
| 134 |
-
"tetra_ACGA": 0.
|
| 135 |
-
"
|
| 136 |
}
|
| 137 |
},
|
| 138 |
"oxygen_requirement": {
|
| 139 |
"task": "classification",
|
| 140 |
-
"mean_metric": 0.
|
| 141 |
"folds": [
|
| 142 |
{
|
| 143 |
"target": "oxygen_requirement",
|
| 144 |
"task": "classification",
|
| 145 |
"metric_name": "f1_macro",
|
| 146 |
-
"value": 0.
|
| 147 |
-
"n_train":
|
| 148 |
-
"n_test":
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"target": "oxygen_requirement",
|
| 152 |
"task": "classification",
|
| 153 |
"metric_name": "f1_macro",
|
| 154 |
-
"value": 0.
|
| 155 |
-
"n_train":
|
| 156 |
-
"n_test":
|
| 157 |
},
|
| 158 |
{
|
| 159 |
"target": "oxygen_requirement",
|
| 160 |
"task": "classification",
|
| 161 |
"metric_name": "f1_macro",
|
| 162 |
-
"value": 0.
|
| 163 |
-
"n_train":
|
| 164 |
-
"n_test":
|
| 165 |
},
|
| 166 |
{
|
| 167 |
"target": "oxygen_requirement",
|
| 168 |
"task": "classification",
|
| 169 |
"metric_name": "f1_macro",
|
| 170 |
-
"value": 0.
|
| 171 |
-
"n_train":
|
| 172 |
-
"n_test":
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"target": "oxygen_requirement",
|
| 176 |
"task": "classification",
|
| 177 |
"metric_name": "f1_macro",
|
| 178 |
-
"value": 0.
|
| 179 |
-
"n_train":
|
| 180 |
-
"n_test":
|
| 181 |
}
|
| 182 |
],
|
| 183 |
"top_features": {
|
| 184 |
-
"codon_ATA": 0.
|
| 185 |
-
"
|
| 186 |
-
"
|
| 187 |
-
"
|
| 188 |
-
"
|
| 189 |
-
"
|
| 190 |
-
"
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
-
"
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
-
"
|
| 197 |
-
"
|
| 198 |
-
"
|
| 199 |
-
"
|
| 200 |
-
"
|
| 201 |
-
"
|
| 202 |
-
"
|
| 203 |
-
"
|
| 204 |
}
|
| 205 |
},
|
| 206 |
"salt_tolerance_pct": {
|
| 207 |
"task": "regression",
|
| 208 |
-
"mean_metric": 2.
|
| 209 |
"folds": [
|
| 210 |
{
|
| 211 |
"target": "salt_tolerance_pct",
|
| 212 |
"task": "regression",
|
| 213 |
"metric_name": "mae",
|
| 214 |
-
"value": 2.
|
| 215 |
-
"n_train":
|
| 216 |
-
"n_test":
|
| 217 |
},
|
| 218 |
{
|
| 219 |
"target": "salt_tolerance_pct",
|
| 220 |
"task": "regression",
|
| 221 |
"metric_name": "mae",
|
| 222 |
-
"value": 2.
|
| 223 |
-
"n_train":
|
| 224 |
-
"n_test":
|
| 225 |
},
|
| 226 |
{
|
| 227 |
"target": "salt_tolerance_pct",
|
| 228 |
"task": "regression",
|
| 229 |
"metric_name": "mae",
|
| 230 |
-
"value": 2.
|
| 231 |
-
"n_train":
|
| 232 |
-
"n_test":
|
| 233 |
},
|
| 234 |
{
|
| 235 |
"target": "salt_tolerance_pct",
|
| 236 |
"task": "regression",
|
| 237 |
"metric_name": "mae",
|
| 238 |
-
"value": 2.
|
| 239 |
-
"n_train":
|
| 240 |
-
"n_test":
|
| 241 |
},
|
| 242 |
{
|
| 243 |
"target": "salt_tolerance_pct",
|
| 244 |
"task": "regression",
|
| 245 |
"metric_name": "mae",
|
| 246 |
-
"value": 2.
|
| 247 |
-
"n_train":
|
| 248 |
-
"n_test":
|
| 249 |
}
|
| 250 |
],
|
| 251 |
"top_features": {
|
| 252 |
-
"aa_frac_C": 0.
|
| 253 |
-
"neg_charged_frac": 0.
|
| 254 |
-
"
|
| 255 |
-
"
|
| 256 |
-
"tetra_GACT": 0.
|
| 257 |
-
"
|
| 258 |
-
"
|
| 259 |
-
"
|
| 260 |
-
"
|
| 261 |
-
"
|
| 262 |
-
"
|
| 263 |
-
"
|
| 264 |
-
"
|
| 265 |
-
"
|
| 266 |
-
"
|
| 267 |
-
"
|
| 268 |
-
"
|
| 269 |
-
"
|
| 270 |
-
"
|
| 271 |
-
"
|
| 272 |
}
|
| 273 |
},
|
| 274 |
"__meta__": {
|
|
@@ -625,7 +625,72 @@
|
|
| 625 |
"codon_TTA",
|
| 626 |
"codon_TTC",
|
| 627 |
"codon_TTG",
|
| 628 |
-
"codon_TTT"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
]
|
| 630 |
}
|
| 631 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"optimal_temperature_c": {
|
| 3 |
"task": "regression",
|
| 4 |
+
"mean_metric": 2.939444159350111,
|
| 5 |
"folds": [
|
| 6 |
{
|
| 7 |
"target": "optimal_temperature_c",
|
| 8 |
"task": "regression",
|
| 9 |
"metric_name": "mae",
|
| 10 |
+
"value": 3.103597222252415,
|
| 11 |
+
"n_train": 36496,
|
| 12 |
+
"n_test": 9125
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"target": "optimal_temperature_c",
|
| 16 |
"task": "regression",
|
| 17 |
"metric_name": "mae",
|
| 18 |
+
"value": 2.7356862682357583,
|
| 19 |
+
"n_train": 36497,
|
| 20 |
+
"n_test": 9124
|
| 21 |
},
|
| 22 |
{
|
| 23 |
"target": "optimal_temperature_c",
|
| 24 |
"task": "regression",
|
| 25 |
"metric_name": "mae",
|
| 26 |
+
"value": 3.145843773419164,
|
| 27 |
+
"n_train": 36497,
|
| 28 |
+
"n_test": 9124
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"target": "optimal_temperature_c",
|
| 32 |
"task": "regression",
|
| 33 |
"metric_name": "mae",
|
| 34 |
+
"value": 3.2767152481045656,
|
| 35 |
+
"n_train": 36497,
|
| 36 |
+
"n_test": 9124
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"target": "optimal_temperature_c",
|
| 40 |
"task": "regression",
|
| 41 |
"metric_name": "mae",
|
| 42 |
+
"value": 2.43537828473865,
|
| 43 |
+
"n_train": 36497,
|
| 44 |
+
"n_test": 9124
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"top_features": {
|
| 48 |
+
"ivywrel_frac": 0.12668818831443787,
|
| 49 |
+
"iso_cat2_thermophilic_gt45_c": 0.029868930205702783,
|
| 50 |
+
"n_predicted_cds": 0.025075340643525124,
|
| 51 |
+
"iso_cat2_human": 0.020858772844076157,
|
| 52 |
+
"iso_cat1_infection": 0.020640516839921474,
|
| 53 |
+
"iso_cat2_patient": 0.017751351464539766,
|
| 54 |
+
"aa_frac_C": 0.015003016591072083,
|
| 55 |
+
"genome_size_nt": 0.012203263118863106,
|
| 56 |
+
"aa_frac_D": 0.011290411837399006,
|
| 57 |
+
"codon_AGG": 0.010900856088846922,
|
| 58 |
+
"iso_cat1_environmental": 0.010176281817257405,
|
| 59 |
+
"tetra_GCCT": 0.009658925677649676,
|
| 60 |
+
"tetra_TAGT": 0.00883282758295536,
|
| 61 |
+
"aa_frac_Y": 0.008421392692252994,
|
| 62 |
+
"aa_frac_E": 0.007741594593971968,
|
| 63 |
+
"tetra_TTCC": 0.007376640872098506,
|
| 64 |
+
"mean_isoelectric_point": 0.007058459660038352,
|
| 65 |
+
"tetra_CTAA": 0.0070426638238132,
|
| 66 |
+
"iso_cat2_built_environment": 0.006164434866514057,
|
| 67 |
+
"iso_cat2_industrial": 0.005895084328949451
|
| 68 |
}
|
| 69 |
},
|
| 70 |
"optimal_ph": {
|
| 71 |
"task": "regression",
|
| 72 |
+
"mean_metric": 0.5090253015368336,
|
| 73 |
"folds": [
|
| 74 |
{
|
| 75 |
"target": "optimal_ph",
|
| 76 |
"task": "regression",
|
| 77 |
"metric_name": "mae",
|
| 78 |
+
"value": 0.45639293885487886,
|
| 79 |
+
"n_train": 4082,
|
| 80 |
+
"n_test": 1021
|
| 81 |
},
|
| 82 |
{
|
| 83 |
"target": "optimal_ph",
|
| 84 |
"task": "regression",
|
| 85 |
"metric_name": "mae",
|
| 86 |
+
"value": 0.6262803867911733,
|
| 87 |
+
"n_train": 4082,
|
| 88 |
+
"n_test": 1021
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"target": "optimal_ph",
|
| 92 |
"task": "regression",
|
| 93 |
"metric_name": "mae",
|
| 94 |
+
"value": 0.528334212326513,
|
| 95 |
+
"n_train": 4082,
|
| 96 |
+
"n_test": 1021
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"target": "optimal_ph",
|
| 100 |
"task": "regression",
|
| 101 |
"metric_name": "mae",
|
| 102 |
+
"value": 0.48048674237494376,
|
| 103 |
+
"n_train": 4083,
|
| 104 |
+
"n_test": 1020
|
| 105 |
},
|
| 106 |
{
|
| 107 |
"target": "optimal_ph",
|
| 108 |
"task": "regression",
|
| 109 |
"metric_name": "mae",
|
| 110 |
+
"value": 0.4536322273366591,
|
| 111 |
+
"n_train": 4083,
|
| 112 |
+
"n_test": 1020
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"top_features": {
|
| 116 |
+
"iso_cat2_acidic": 0.05219607315957546,
|
| 117 |
+
"iso_cat2_alkaline": 0.043521419167518616,
|
| 118 |
+
"neg_charged_frac": 0.016875072754919528,
|
| 119 |
+
"aa_frac_E": 0.008599728252738715,
|
| 120 |
+
"tetra_CTCT": 0.008368687890470027,
|
| 121 |
+
"aa_frac_H": 0.008003219496458769,
|
| 122 |
+
"mean_isoelectric_point": 0.007599354162812233,
|
| 123 |
+
"tetra_CACT": 0.007427609874866903,
|
| 124 |
+
"tetra_AGAC": 0.007137532206252217,
|
| 125 |
+
"tetra_AGGT": 0.005891842069104314,
|
| 126 |
+
"tetra_GACT": 0.005873983446508646,
|
| 127 |
+
"tetra_GAGA": 0.005548427533358336,
|
| 128 |
+
"tetra_GTCT": 0.005475769587792456,
|
| 129 |
+
"codon_GAA": 0.005408304557204246,
|
| 130 |
+
"n_predicted_cds": 0.005280579440295696,
|
| 131 |
+
"iso_cat2_plants": 0.005045945569872856,
|
| 132 |
+
"tetra_TTGA": 0.004973787232302129,
|
| 133 |
+
"codon_AAG": 0.0048154488438740374,
|
| 134 |
+
"tetra_ACGA": 0.004731484339572489,
|
| 135 |
+
"aa_frac_Y": 0.0046834095381200315
|
| 136 |
}
|
| 137 |
},
|
| 138 |
"oxygen_requirement": {
|
| 139 |
"task": "classification",
|
| 140 |
+
"mean_metric": 0.34127360853732613,
|
| 141 |
"folds": [
|
| 142 |
{
|
| 143 |
"target": "oxygen_requirement",
|
| 144 |
"task": "classification",
|
| 145 |
"metric_name": "f1_macro",
|
| 146 |
+
"value": 0.31515576471296236,
|
| 147 |
+
"n_train": 17311,
|
| 148 |
+
"n_test": 4328
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"target": "oxygen_requirement",
|
| 152 |
"task": "classification",
|
| 153 |
"metric_name": "f1_macro",
|
| 154 |
+
"value": 0.38181774862206597,
|
| 155 |
+
"n_train": 17311,
|
| 156 |
+
"n_test": 4326
|
| 157 |
},
|
| 158 |
{
|
| 159 |
"target": "oxygen_requirement",
|
| 160 |
"task": "classification",
|
| 161 |
"metric_name": "f1_macro",
|
| 162 |
+
"value": 0.34440677114867413,
|
| 163 |
+
"n_train": 17311,
|
| 164 |
+
"n_test": 4328
|
| 165 |
},
|
| 166 |
{
|
| 167 |
"target": "oxygen_requirement",
|
| 168 |
"task": "classification",
|
| 169 |
"metric_name": "f1_macro",
|
| 170 |
+
"value": 0.25943178539399836,
|
| 171 |
+
"n_train": 17311,
|
| 172 |
+
"n_test": 4328
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"target": "oxygen_requirement",
|
| 176 |
"task": "classification",
|
| 177 |
"metric_name": "f1_macro",
|
| 178 |
+
"value": 0.40555597280892947,
|
| 179 |
+
"n_train": 17312,
|
| 180 |
+
"n_test": 4327
|
| 181 |
}
|
| 182 |
],
|
| 183 |
"top_features": {
|
| 184 |
+
"codon_ATA": 0.0414140235632658,
|
| 185 |
+
"iso_cat1_host": 0.02601129524409771,
|
| 186 |
+
"n_predicted_cds": 0.025201210007071494,
|
| 187 |
+
"aa_frac_C": 0.019132474437355995,
|
| 188 |
+
"iso_cat1_environmental": 0.01645018421113491,
|
| 189 |
+
"codon_CGT": 0.014759847987443208,
|
| 190 |
+
"iso_cat1_engineered": 0.01378793753683567,
|
| 191 |
+
"genome_size_nt": 0.011305144988000393,
|
| 192 |
+
"iso_cat2_human": 0.010168002359569073,
|
| 193 |
+
"codon_TAA": 0.00900037819519639,
|
| 194 |
+
"aa_frac_V": 0.008459322061389685,
|
| 195 |
+
"aa_frac_Y": 0.008259046915918588,
|
| 196 |
+
"aa_frac_L": 0.0072497081011533735,
|
| 197 |
+
"tetra_CTGG": 0.006922230357304215,
|
| 198 |
+
"aa_frac_T": 0.006535647064447403,
|
| 199 |
+
"codon_TGG": 0.006477221753448248,
|
| 200 |
+
"aa_frac_Q": 0.0063397581689059734,
|
| 201 |
+
"aa_frac_M": 0.006198597187176347,
|
| 202 |
+
"tetra_CAAA": 0.006141273584216833,
|
| 203 |
+
"codon_CAA": 0.00611291266977787
|
| 204 |
}
|
| 205 |
},
|
| 206 |
"salt_tolerance_pct": {
|
| 207 |
"task": "regression",
|
| 208 |
+
"mean_metric": 2.516896605067264,
|
| 209 |
"folds": [
|
| 210 |
{
|
| 211 |
"target": "salt_tolerance_pct",
|
| 212 |
"task": "regression",
|
| 213 |
"metric_name": "mae",
|
| 214 |
+
"value": 2.218365752012856,
|
| 215 |
+
"n_train": 5064,
|
| 216 |
+
"n_test": 1266
|
| 217 |
},
|
| 218 |
{
|
| 219 |
"target": "salt_tolerance_pct",
|
| 220 |
"task": "regression",
|
| 221 |
"metric_name": "mae",
|
| 222 |
+
"value": 2.249367568591289,
|
| 223 |
+
"n_train": 5064,
|
| 224 |
+
"n_test": 1266
|
| 225 |
},
|
| 226 |
{
|
| 227 |
"target": "salt_tolerance_pct",
|
| 228 |
"task": "regression",
|
| 229 |
"metric_name": "mae",
|
| 230 |
+
"value": 2.8189112452912664,
|
| 231 |
+
"n_train": 5064,
|
| 232 |
+
"n_test": 1266
|
| 233 |
},
|
| 234 |
{
|
| 235 |
"target": "salt_tolerance_pct",
|
| 236 |
"task": "regression",
|
| 237 |
"metric_name": "mae",
|
| 238 |
+
"value": 2.3502065964041967,
|
| 239 |
+
"n_train": 5064,
|
| 240 |
+
"n_test": 1266
|
| 241 |
},
|
| 242 |
{
|
| 243 |
"target": "salt_tolerance_pct",
|
| 244 |
"task": "regression",
|
| 245 |
"metric_name": "mae",
|
| 246 |
+
"value": 2.947631863036709,
|
| 247 |
+
"n_train": 5064,
|
| 248 |
+
"n_test": 1266
|
| 249 |
}
|
| 250 |
],
|
| 251 |
"top_features": {
|
| 252 |
+
"aa_frac_C": 0.029796541761606933,
|
| 253 |
+
"neg_charged_frac": 0.027759117633104326,
|
| 254 |
+
"tetra_ATCC": 0.018280067457817496,
|
| 255 |
+
"iso_cat1_environmental": 0.014224943332374096,
|
| 256 |
+
"tetra_GACT": 0.01211925563402474,
|
| 257 |
+
"iso_cat2_saline": 0.011419120244681835,
|
| 258 |
+
"codon_TGC": 0.011161889415234327,
|
| 259 |
+
"tetra_CGTT": 0.009351400006562472,
|
| 260 |
+
"codon_CGT": 0.008664370141923427,
|
| 261 |
+
"iso_cat2_industrial": 0.008528076158836485,
|
| 262 |
+
"tetra_TAAT": 0.008236682531423867,
|
| 263 |
+
"iso_cat2_contamination": 0.008197423309320584,
|
| 264 |
+
"tetra_CGTA": 0.007803171873092651,
|
| 265 |
+
"tetra_TGTG": 0.007793005835264921,
|
| 266 |
+
"tetra_TACC": 0.007619049632921815,
|
| 267 |
+
"codon_CAC": 0.007169742346741259,
|
| 268 |
+
"tetra_AGTC": 0.006417827447876334,
|
| 269 |
+
"tetra_CCTG": 0.006371588306501507,
|
| 270 |
+
"tetra_GGTA": 0.006226122658699751,
|
| 271 |
+
"tetra_GAAA": 0.006115543586201966
|
| 272 |
}
|
| 273 |
},
|
| 274 |
"__meta__": {
|
|
|
|
| 625 |
"codon_TTA",
|
| 626 |
"codon_TTC",
|
| 627 |
"codon_TTG",
|
| 628 |
+
"codon_TTT",
|
| 629 |
+
"iso_cat1_climate",
|
| 630 |
+
"iso_cat1_condition",
|
| 631 |
+
"iso_cat1_engineered",
|
| 632 |
+
"iso_cat1_environmental",
|
| 633 |
+
"iso_cat1_host",
|
| 634 |
+
"iso_cat1_host_body_product",
|
| 635 |
+
"iso_cat1_host_body_site",
|
| 636 |
+
"iso_cat1_infection",
|
| 637 |
+
"iso_cat2_acidic",
|
| 638 |
+
"iso_cat2_agriculture",
|
| 639 |
+
"iso_cat2_air",
|
| 640 |
+
"iso_cat2_algae",
|
| 641 |
+
"iso_cat2_alkaline",
|
| 642 |
+
"iso_cat2_anoxic_anaerobic",
|
| 643 |
+
"iso_cat2_aquatic",
|
| 644 |
+
"iso_cat2_arthropoda",
|
| 645 |
+
"iso_cat2_biodegradation",
|
| 646 |
+
"iso_cat2_biofilm",
|
| 647 |
+
"iso_cat2_bioreactor",
|
| 648 |
+
"iso_cat2_bioremediation",
|
| 649 |
+
"iso_cat2_birds",
|
| 650 |
+
"iso_cat2_built_environment",
|
| 651 |
+
"iso_cat2_cold",
|
| 652 |
+
"iso_cat2_contamination",
|
| 653 |
+
"iso_cat2_disease",
|
| 654 |
+
"iso_cat2_fishes",
|
| 655 |
+
"iso_cat2_fluids",
|
| 656 |
+
"iso_cat2_food_production",
|
| 657 |
+
"iso_cat2_fungi",
|
| 658 |
+
"iso_cat2_gastrointestinal_tract",
|
| 659 |
+
"iso_cat2_hot",
|
| 660 |
+
"iso_cat2_human",
|
| 661 |
+
"iso_cat2_humid",
|
| 662 |
+
"iso_cat2_industrial",
|
| 663 |
+
"iso_cat2_inflammation",
|
| 664 |
+
"iso_cat2_invertebrates_other",
|
| 665 |
+
"iso_cat2_juvenile",
|
| 666 |
+
"iso_cat2_laboratory",
|
| 667 |
+
"iso_cat2_limb",
|
| 668 |
+
"iso_cat2_mammals",
|
| 669 |
+
"iso_cat2_medical_device",
|
| 670 |
+
"iso_cat2_medical_environment",
|
| 671 |
+
"iso_cat2_medical_product",
|
| 672 |
+
"iso_cat2_microbial",
|
| 673 |
+
"iso_cat2_microbial_community",
|
| 674 |
+
"iso_cat2_oral_cavity_and_airways",
|
| 675 |
+
"iso_cat2_organ",
|
| 676 |
+
"iso_cat2_other",
|
| 677 |
+
"iso_cat2_patient",
|
| 678 |
+
"iso_cat2_plant",
|
| 679 |
+
"iso_cat2_plant_infections",
|
| 680 |
+
"iso_cat2_plants",
|
| 681 |
+
"iso_cat2_protozoa",
|
| 682 |
+
"iso_cat2_psychrophilic_lt10_c",
|
| 683 |
+
"iso_cat2_reptilia",
|
| 684 |
+
"iso_cat2_saline",
|
| 685 |
+
"iso_cat2_sulfuric",
|
| 686 |
+
"iso_cat2_temperate",
|
| 687 |
+
"iso_cat2_terrestrial",
|
| 688 |
+
"iso_cat2_thermophilic_gt45_c",
|
| 689 |
+
"iso_cat2_treatment",
|
| 690 |
+
"iso_cat2_urogenital_tract",
|
| 691 |
+
"iso_cat2_waste",
|
| 692 |
+
"iso_cat2_xerophilic",
|
| 693 |
+
"iso_cat2_yeast"
|
| 694 |
]
|
| 695 |
}
|
| 696 |
}
|
|
@@ -1,274 +1,274 @@
|
|
| 1 |
{
|
| 2 |
"optimal_temperature_c": {
|
| 3 |
"task": "regression",
|
| 4 |
-
"mean_metric": 3.
|
| 5 |
"folds": [
|
| 6 |
{
|
| 7 |
"target": "optimal_temperature_c",
|
| 8 |
"task": "regression",
|
| 9 |
"metric_name": "mae",
|
| 10 |
-
"value": 2.
|
| 11 |
-
"n_train":
|
| 12 |
-
"n_test":
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"target": "optimal_temperature_c",
|
| 16 |
"task": "regression",
|
| 17 |
"metric_name": "mae",
|
| 18 |
-
"value": 3.
|
| 19 |
-
"n_train":
|
| 20 |
-
"n_test":
|
| 21 |
},
|
| 22 |
{
|
| 23 |
"target": "optimal_temperature_c",
|
| 24 |
"task": "regression",
|
| 25 |
"metric_name": "mae",
|
| 26 |
-
"value": 3.
|
| 27 |
-
"n_train":
|
| 28 |
-
"n_test":
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"target": "optimal_temperature_c",
|
| 32 |
"task": "regression",
|
| 33 |
"metric_name": "mae",
|
| 34 |
-
"value":
|
| 35 |
-
"n_train":
|
| 36 |
-
"n_test":
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"target": "optimal_temperature_c",
|
| 40 |
"task": "regression",
|
| 41 |
"metric_name": "mae",
|
| 42 |
-
"value": 3.
|
| 43 |
-
"n_train":
|
| 44 |
-
"n_test":
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"top_features": {
|
| 48 |
-
"ivywrel_frac": 0.
|
| 49 |
-
"
|
| 50 |
-
"
|
| 51 |
-
"
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
-
"
|
| 55 |
-
"
|
| 56 |
-
"
|
| 57 |
-
"
|
| 58 |
-
"codon_AGG": 0.
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
}
|
| 69 |
},
|
| 70 |
"optimal_ph": {
|
| 71 |
"task": "regression",
|
| 72 |
-
"mean_metric": 0.
|
| 73 |
"folds": [
|
| 74 |
{
|
| 75 |
"target": "optimal_ph",
|
| 76 |
"task": "regression",
|
| 77 |
"metric_name": "mae",
|
| 78 |
-
"value": 0.
|
| 79 |
-
"n_train":
|
| 80 |
"n_test": 928
|
| 81 |
},
|
| 82 |
{
|
| 83 |
"target": "optimal_ph",
|
| 84 |
"task": "regression",
|
| 85 |
"metric_name": "mae",
|
| 86 |
-
"value": 0.
|
| 87 |
-
"n_train":
|
| 88 |
"n_test": 928
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"target": "optimal_ph",
|
| 92 |
"task": "regression",
|
| 93 |
"metric_name": "mae",
|
| 94 |
-
"value": 0.
|
| 95 |
-
"n_train":
|
| 96 |
"n_test": 928
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"target": "optimal_ph",
|
| 100 |
"task": "regression",
|
| 101 |
"metric_name": "mae",
|
| 102 |
-
"value": 0.
|
| 103 |
-
"n_train":
|
| 104 |
-
"n_test":
|
| 105 |
},
|
| 106 |
{
|
| 107 |
"target": "optimal_ph",
|
| 108 |
"task": "regression",
|
| 109 |
"metric_name": "mae",
|
| 110 |
-
"value": 0.
|
| 111 |
-
"n_train":
|
| 112 |
-
"n_test":
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"top_features": {
|
| 116 |
-
"
|
| 117 |
-
"
|
| 118 |
-
"
|
| 119 |
-
"
|
| 120 |
-
"
|
| 121 |
-
"
|
| 122 |
-
"
|
| 123 |
-
"
|
| 124 |
-
"
|
| 125 |
-
"
|
| 126 |
-
"
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
-
"
|
| 130 |
-
"
|
| 131 |
-
"
|
| 132 |
-
"
|
| 133 |
-
"
|
| 134 |
-
"
|
| 135 |
-
"
|
| 136 |
}
|
| 137 |
},
|
| 138 |
"oxygen_requirement": {
|
| 139 |
"task": "classification",
|
| 140 |
-
"mean_metric": 0.
|
| 141 |
"folds": [
|
| 142 |
{
|
| 143 |
"target": "oxygen_requirement",
|
| 144 |
"task": "classification",
|
| 145 |
"metric_name": "f1_macro",
|
| 146 |
-
"value": 0.
|
| 147 |
-
"n_train":
|
| 148 |
-
"n_test":
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"target": "oxygen_requirement",
|
| 152 |
"task": "classification",
|
| 153 |
"metric_name": "f1_macro",
|
| 154 |
-
"value": 0.
|
| 155 |
-
"n_train":
|
| 156 |
-
"n_test":
|
| 157 |
},
|
| 158 |
{
|
| 159 |
"target": "oxygen_requirement",
|
| 160 |
"task": "classification",
|
| 161 |
"metric_name": "f1_macro",
|
| 162 |
-
"value": 0.
|
| 163 |
-
"n_train":
|
| 164 |
-
"n_test":
|
| 165 |
},
|
| 166 |
{
|
| 167 |
"target": "oxygen_requirement",
|
| 168 |
"task": "classification",
|
| 169 |
"metric_name": "f1_macro",
|
| 170 |
-
"value": 0.
|
| 171 |
-
"n_train":
|
| 172 |
"n_test": 2078
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"target": "oxygen_requirement",
|
| 176 |
"task": "classification",
|
| 177 |
"metric_name": "f1_macro",
|
| 178 |
-
"value": 0.
|
| 179 |
-
"n_train":
|
| 180 |
-
"n_test":
|
| 181 |
}
|
| 182 |
],
|
| 183 |
"top_features": {
|
| 184 |
-
"emb_103": 0.
|
| 185 |
-
"tetra_TCAA": 0.
|
| 186 |
-
"codon_CAA": 0.
|
| 187 |
-
"
|
| 188 |
-
"
|
| 189 |
-
"aa_frac_C": 0.
|
| 190 |
-
"
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
-
"n_predicted_cds": 0.
|
| 194 |
-
"
|
| 195 |
-
"aa_frac_M": 0.
|
| 196 |
-
"
|
| 197 |
-
"emb_12": 0.
|
| 198 |
-
"
|
| 199 |
-
"
|
| 200 |
-
"
|
| 201 |
-
"codon_CGT": 0.
|
| 202 |
-
"
|
| 203 |
-
"
|
| 204 |
}
|
| 205 |
},
|
| 206 |
"salt_tolerance_pct": {
|
| 207 |
"task": "regression",
|
| 208 |
-
"mean_metric": 2.
|
| 209 |
"folds": [
|
| 210 |
{
|
| 211 |
"target": "salt_tolerance_pct",
|
| 212 |
"task": "regression",
|
| 213 |
"metric_name": "mae",
|
| 214 |
-
"value": 2.
|
| 215 |
-
"n_train":
|
| 216 |
-
"n_test":
|
| 217 |
},
|
| 218 |
{
|
| 219 |
"target": "salt_tolerance_pct",
|
| 220 |
"task": "regression",
|
| 221 |
"metric_name": "mae",
|
| 222 |
-
"value": 2.
|
| 223 |
-
"n_train":
|
| 224 |
-
"n_test":
|
| 225 |
},
|
| 226 |
{
|
| 227 |
"target": "salt_tolerance_pct",
|
| 228 |
"task": "regression",
|
| 229 |
"metric_name": "mae",
|
| 230 |
-
"value": 2.
|
| 231 |
-
"n_train":
|
| 232 |
-
"n_test":
|
| 233 |
},
|
| 234 |
{
|
| 235 |
"target": "salt_tolerance_pct",
|
| 236 |
"task": "regression",
|
| 237 |
"metric_name": "mae",
|
| 238 |
-
"value": 2.
|
| 239 |
-
"n_train":
|
| 240 |
-
"n_test":
|
| 241 |
},
|
| 242 |
{
|
| 243 |
"target": "salt_tolerance_pct",
|
| 244 |
"task": "regression",
|
| 245 |
"metric_name": "mae",
|
| 246 |
-
"value": 2.
|
| 247 |
-
"n_train":
|
| 248 |
-
"n_test":
|
| 249 |
}
|
| 250 |
],
|
| 251 |
"top_features": {
|
| 252 |
-
"aa_frac_C": 0.
|
| 253 |
-
"neg_charged_frac": 0.
|
| 254 |
-
"
|
| 255 |
-
"
|
| 256 |
-
"
|
| 257 |
-
"
|
| 258 |
-
"
|
| 259 |
-
"
|
| 260 |
-
"tetra_AGTC": 0.
|
| 261 |
-
"
|
| 262 |
-
"
|
| 263 |
-
"
|
| 264 |
-
"
|
| 265 |
-
"
|
| 266 |
-
"
|
| 267 |
-
"
|
| 268 |
-
"
|
| 269 |
-
"
|
| 270 |
-
"
|
| 271 |
-
"
|
| 272 |
}
|
| 273 |
},
|
| 274 |
"__meta__": {
|
|
@@ -945,7 +945,70 @@
|
|
| 945 |
"emb_316",
|
| 946 |
"emb_317",
|
| 947 |
"emb_318",
|
| 948 |
-
"emb_319"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
]
|
| 950 |
}
|
| 951 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"optimal_temperature_c": {
|
| 3 |
"task": "regression",
|
| 4 |
+
"mean_metric": 3.1503700219525386,
|
| 5 |
"folds": [
|
| 6 |
{
|
| 7 |
"target": "optimal_temperature_c",
|
| 8 |
"task": "regression",
|
| 9 |
"metric_name": "mae",
|
| 10 |
+
"value": 2.738956731512239,
|
| 11 |
+
"n_train": 13592,
|
| 12 |
+
"n_test": 3398
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"target": "optimal_temperature_c",
|
| 16 |
"task": "regression",
|
| 17 |
"metric_name": "mae",
|
| 18 |
+
"value": 3.1457219391587623,
|
| 19 |
+
"n_train": 13592,
|
| 20 |
+
"n_test": 3398
|
| 21 |
},
|
| 22 |
{
|
| 23 |
"target": "optimal_temperature_c",
|
| 24 |
"task": "regression",
|
| 25 |
"metric_name": "mae",
|
| 26 |
+
"value": 3.6463728072293864,
|
| 27 |
+
"n_train": 13592,
|
| 28 |
+
"n_test": 3398
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"target": "optimal_temperature_c",
|
| 32 |
"task": "regression",
|
| 33 |
"metric_name": "mae",
|
| 34 |
+
"value": 2.8936606017051547,
|
| 35 |
+
"n_train": 13592,
|
| 36 |
+
"n_test": 3398
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"target": "optimal_temperature_c",
|
| 40 |
"task": "regression",
|
| 41 |
"metric_name": "mae",
|
| 42 |
+
"value": 3.327138030157151,
|
| 43 |
+
"n_train": 13592,
|
| 44 |
+
"n_test": 3398
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"top_features": {
|
| 48 |
+
"ivywrel_frac": 0.1272028997540474,
|
| 49 |
+
"iso_cat2_human": 0.02456752024590969,
|
| 50 |
+
"iso_cat2_thermophilic_gt45_c": 0.02335200347006321,
|
| 51 |
+
"iso_cat1_host_body_product": 0.016055696457624436,
|
| 52 |
+
"pos_charged_frac": 0.014609704539179803,
|
| 53 |
+
"iso_cat1_infection": 0.01389338243752718,
|
| 54 |
+
"iso_cat2_mammals": 0.013551290705800056,
|
| 55 |
+
"n_predicted_cds": 0.012836913019418717,
|
| 56 |
+
"iso_cat1_environmental": 0.010669851303100586,
|
| 57 |
+
"tetra_CTAA": 0.009250188246369362,
|
| 58 |
+
"codon_AGG": 0.008724220609292389,
|
| 59 |
+
"aa_frac_E": 0.008193913381546736,
|
| 60 |
+
"aa_frac_C": 0.007445563282817602,
|
| 61 |
+
"mean_isoelectric_point": 0.0069396811537444595,
|
| 62 |
+
"codon_TTG": 0.006236870028078556,
|
| 63 |
+
"emb_42": 0.006092228952911683,
|
| 64 |
+
"aa_frac_Q": 0.006049463665112853,
|
| 65 |
+
"tetra_GCAA": 0.005999001779127866,
|
| 66 |
+
"codon_TGA": 0.005786543060094118,
|
| 67 |
+
"tetra_TTAG": 0.0056871567387133835
|
| 68 |
}
|
| 69 |
},
|
| 70 |
"optimal_ph": {
|
| 71 |
"task": "regression",
|
| 72 |
+
"mean_metric": 0.5009073096637068,
|
| 73 |
"folds": [
|
| 74 |
{
|
| 75 |
"target": "optimal_ph",
|
| 76 |
"task": "regression",
|
| 77 |
"metric_name": "mae",
|
| 78 |
+
"value": 0.45625025033950806,
|
| 79 |
+
"n_train": 3712,
|
| 80 |
"n_test": 928
|
| 81 |
},
|
| 82 |
{
|
| 83 |
"target": "optimal_ph",
|
| 84 |
"task": "regression",
|
| 85 |
"metric_name": "mae",
|
| 86 |
+
"value": 0.5460629706958244,
|
| 87 |
+
"n_train": 3712,
|
| 88 |
"n_test": 928
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"target": "optimal_ph",
|
| 92 |
"task": "regression",
|
| 93 |
"metric_name": "mae",
|
| 94 |
+
"value": 0.45289194491402857,
|
| 95 |
+
"n_train": 3712,
|
| 96 |
"n_test": 928
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"target": "optimal_ph",
|
| 100 |
"task": "regression",
|
| 101 |
"metric_name": "mae",
|
| 102 |
+
"value": 0.5152855838783856,
|
| 103 |
+
"n_train": 3712,
|
| 104 |
+
"n_test": 928
|
| 105 |
},
|
| 106 |
{
|
| 107 |
"target": "optimal_ph",
|
| 108 |
"task": "regression",
|
| 109 |
"metric_name": "mae",
|
| 110 |
+
"value": 0.5340457984907874,
|
| 111 |
+
"n_train": 3712,
|
| 112 |
+
"n_test": 928
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"top_features": {
|
| 116 |
+
"iso_cat2_acidic": 0.03560468032956123,
|
| 117 |
+
"iso_cat2_alkaline": 0.021716910228133202,
|
| 118 |
+
"neg_charged_frac": 0.012849669158458709,
|
| 119 |
+
"tetra_CACT": 0.007109560957178474,
|
| 120 |
+
"tetra_AGAC": 0.006792897148989141,
|
| 121 |
+
"aa_frac_E": 0.006575863063335419,
|
| 122 |
+
"tetra_CTCT": 0.0063951408956199884,
|
| 123 |
+
"aa_frac_H": 0.006346661783754826,
|
| 124 |
+
"tetra_GACT": 0.005681420909240842,
|
| 125 |
+
"codon_TTT": 0.00532103287987411,
|
| 126 |
+
"tetra_TGCT": 0.005244059395045042,
|
| 127 |
+
"codon_TGC": 0.004600144876167178,
|
| 128 |
+
"tetra_GAGA": 0.004556609224528075,
|
| 129 |
+
"tetra_ACGA": 0.004443949507549405,
|
| 130 |
+
"codon_AAG": 0.004402201203629374,
|
| 131 |
+
"mean_isoelectric_point": 0.004346802597865462,
|
| 132 |
+
"codon_GAA": 0.0043407921912148595,
|
| 133 |
+
"tetra_GGAT": 0.0042743304162286225,
|
| 134 |
+
"codon_GAG": 0.004212372726760805,
|
| 135 |
+
"tetra_AGGT": 0.004194398503750562
|
| 136 |
}
|
| 137 |
},
|
| 138 |
"oxygen_requirement": {
|
| 139 |
"task": "classification",
|
| 140 |
+
"mean_metric": 0.31635288673665096,
|
| 141 |
"folds": [
|
| 142 |
{
|
| 143 |
"target": "oxygen_requirement",
|
| 144 |
"task": "classification",
|
| 145 |
"metric_name": "f1_macro",
|
| 146 |
+
"value": 0.2856587696238043,
|
| 147 |
+
"n_train": 8320,
|
| 148 |
+
"n_test": 2081
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"target": "oxygen_requirement",
|
| 152 |
"task": "classification",
|
| 153 |
"metric_name": "f1_macro",
|
| 154 |
+
"value": 0.3076087339800983,
|
| 155 |
+
"n_train": 8321,
|
| 156 |
+
"n_test": 2080
|
| 157 |
},
|
| 158 |
{
|
| 159 |
"target": "oxygen_requirement",
|
| 160 |
"task": "classification",
|
| 161 |
"metric_name": "f1_macro",
|
| 162 |
+
"value": 0.30314339231057225,
|
| 163 |
+
"n_train": 8321,
|
| 164 |
+
"n_test": 2079
|
| 165 |
},
|
| 166 |
{
|
| 167 |
"target": "oxygen_requirement",
|
| 168 |
"task": "classification",
|
| 169 |
"metric_name": "f1_macro",
|
| 170 |
+
"value": 0.37851699811618567,
|
| 171 |
+
"n_train": 8321,
|
| 172 |
"n_test": 2078
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"target": "oxygen_requirement",
|
| 176 |
"task": "classification",
|
| 177 |
"metric_name": "f1_macro",
|
| 178 |
+
"value": 0.30683653965259416,
|
| 179 |
+
"n_train": 8321,
|
| 180 |
+
"n_test": 2080
|
| 181 |
}
|
| 182 |
],
|
| 183 |
"top_features": {
|
| 184 |
+
"emb_103": 0.021533418074250223,
|
| 185 |
+
"tetra_TCAA": 0.017144506447948515,
|
| 186 |
+
"codon_CAA": 0.01247407766059041,
|
| 187 |
+
"genome_size_nt": 0.011898068338632583,
|
| 188 |
+
"codon_ATA": 0.010783014632761479,
|
| 189 |
+
"aa_frac_C": 0.010261021554470062,
|
| 190 |
+
"tetra_CAAA": 0.009564016468357295,
|
| 191 |
+
"emb_50": 0.00891582112526521,
|
| 192 |
+
"aa_frac_Q": 0.008642746694386005,
|
| 193 |
+
"n_predicted_cds": 0.007285213563591242,
|
| 194 |
+
"aa_frac_K": 0.006437191320583224,
|
| 195 |
+
"aa_frac_M": 0.006172794941812753,
|
| 196 |
+
"emb_3": 0.005596142518334091,
|
| 197 |
+
"emb_12": 0.005483857169747352,
|
| 198 |
+
"tetra_ATAG": 0.0051641249097883705,
|
| 199 |
+
"aa_frac_L": 0.004760731570422649,
|
| 200 |
+
"aa_frac_W": 0.004419099772349,
|
| 201 |
+
"codon_CGT": 0.004018026869744062,
|
| 202 |
+
"codon_ATG": 0.00391377778723836,
|
| 203 |
+
"aa_frac_Y": 0.0038635179633274676
|
| 204 |
}
|
| 205 |
},
|
| 206 |
"salt_tolerance_pct": {
|
| 207 |
"task": "regression",
|
| 208 |
+
"mean_metric": 2.4756113285254835,
|
| 209 |
"folds": [
|
| 210 |
{
|
| 211 |
"target": "salt_tolerance_pct",
|
| 212 |
"task": "regression",
|
| 213 |
"metric_name": "mae",
|
| 214 |
+
"value": 2.2104132311571862,
|
| 215 |
+
"n_train": 3832,
|
| 216 |
+
"n_test": 958
|
| 217 |
},
|
| 218 |
{
|
| 219 |
"target": "salt_tolerance_pct",
|
| 220 |
"task": "regression",
|
| 221 |
"metric_name": "mae",
|
| 222 |
+
"value": 2.5142923461258535,
|
| 223 |
+
"n_train": 3832,
|
| 224 |
+
"n_test": 958
|
| 225 |
},
|
| 226 |
{
|
| 227 |
"target": "salt_tolerance_pct",
|
| 228 |
"task": "regression",
|
| 229 |
"metric_name": "mae",
|
| 230 |
+
"value": 2.9433706752467503,
|
| 231 |
+
"n_train": 3832,
|
| 232 |
+
"n_test": 958
|
| 233 |
},
|
| 234 |
{
|
| 235 |
"target": "salt_tolerance_pct",
|
| 236 |
"task": "regression",
|
| 237 |
"metric_name": "mae",
|
| 238 |
+
"value": 2.352060198972643,
|
| 239 |
+
"n_train": 3832,
|
| 240 |
+
"n_test": 958
|
| 241 |
},
|
| 242 |
{
|
| 243 |
"target": "salt_tolerance_pct",
|
| 244 |
"task": "regression",
|
| 245 |
"metric_name": "mae",
|
| 246 |
+
"value": 2.3579201911249834,
|
| 247 |
+
"n_train": 3832,
|
| 248 |
+
"n_test": 958
|
| 249 |
}
|
| 250 |
],
|
| 251 |
"top_features": {
|
| 252 |
+
"aa_frac_C": 0.03296378441154957,
|
| 253 |
+
"neg_charged_frac": 0.025052556581795216,
|
| 254 |
+
"iso_cat2_saline": 0.010697953775525093,
|
| 255 |
+
"tetra_CACA": 0.008883546688593924,
|
| 256 |
+
"tetra_GACT": 0.007743655145168305,
|
| 257 |
+
"emb_84": 0.006587694090558216,
|
| 258 |
+
"codon_TGG": 0.006308569852262735,
|
| 259 |
+
"iso_cat2_human": 0.005952583346515894,
|
| 260 |
+
"tetra_AGTC": 0.005895164678804576,
|
| 261 |
+
"tetra_GGAA": 0.005703426687978208,
|
| 262 |
+
"codon_CGT": 0.005462939362041652,
|
| 263 |
+
"iso_cat1_infection": 0.005432895617559552,
|
| 264 |
+
"tetra_ACTG": 0.005339212145190686,
|
| 265 |
+
"tetra_AACC": 0.0049897018994670365,
|
| 266 |
+
"mean_isoelectric_point": 0.004982588707935065,
|
| 267 |
+
"iso_cat2_patient": 0.0049561535939574245,
|
| 268 |
+
"tetra_GAAA": 0.004851629724726081,
|
| 269 |
+
"aa_frac_D": 0.0045981516130268575,
|
| 270 |
+
"codon_TGT": 0.004288341873325408,
|
| 271 |
+
"tetra_CGTA": 0.0041915396694093944
|
| 272 |
}
|
| 273 |
},
|
| 274 |
"__meta__": {
|
|
|
|
| 945 |
"emb_316",
|
| 946 |
"emb_317",
|
| 947 |
"emb_318",
|
| 948 |
+
"emb_319",
|
| 949 |
+
"iso_cat1_climate",
|
| 950 |
+
"iso_cat1_condition",
|
| 951 |
+
"iso_cat1_engineered",
|
| 952 |
+
"iso_cat1_environmental",
|
| 953 |
+
"iso_cat1_host",
|
| 954 |
+
"iso_cat1_host_body_product",
|
| 955 |
+
"iso_cat1_host_body_site",
|
| 956 |
+
"iso_cat1_infection",
|
| 957 |
+
"iso_cat2_acidic",
|
| 958 |
+
"iso_cat2_agriculture",
|
| 959 |
+
"iso_cat2_air",
|
| 960 |
+
"iso_cat2_algae",
|
| 961 |
+
"iso_cat2_alkaline",
|
| 962 |
+
"iso_cat2_anoxic_anaerobic",
|
| 963 |
+
"iso_cat2_aquatic",
|
| 964 |
+
"iso_cat2_arthropoda",
|
| 965 |
+
"iso_cat2_biodegradation",
|
| 966 |
+
"iso_cat2_biofilm",
|
| 967 |
+
"iso_cat2_bioreactor",
|
| 968 |
+
"iso_cat2_bioremediation",
|
| 969 |
+
"iso_cat2_birds",
|
| 970 |
+
"iso_cat2_built_environment",
|
| 971 |
+
"iso_cat2_cold",
|
| 972 |
+
"iso_cat2_contamination",
|
| 973 |
+
"iso_cat2_disease",
|
| 974 |
+
"iso_cat2_fishes",
|
| 975 |
+
"iso_cat2_fluids",
|
| 976 |
+
"iso_cat2_food_production",
|
| 977 |
+
"iso_cat2_fungi",
|
| 978 |
+
"iso_cat2_gastrointestinal_tract",
|
| 979 |
+
"iso_cat2_hot",
|
| 980 |
+
"iso_cat2_human",
|
| 981 |
+
"iso_cat2_humid",
|
| 982 |
+
"iso_cat2_industrial",
|
| 983 |
+
"iso_cat2_inflammation",
|
| 984 |
+
"iso_cat2_invertebrates_other",
|
| 985 |
+
"iso_cat2_juvenile",
|
| 986 |
+
"iso_cat2_laboratory",
|
| 987 |
+
"iso_cat2_limb",
|
| 988 |
+
"iso_cat2_mammals",
|
| 989 |
+
"iso_cat2_medical_device",
|
| 990 |
+
"iso_cat2_medical_environment",
|
| 991 |
+
"iso_cat2_microbial",
|
| 992 |
+
"iso_cat2_microbial_community",
|
| 993 |
+
"iso_cat2_oral_cavity_and_airways",
|
| 994 |
+
"iso_cat2_organ",
|
| 995 |
+
"iso_cat2_other",
|
| 996 |
+
"iso_cat2_patient",
|
| 997 |
+
"iso_cat2_plant",
|
| 998 |
+
"iso_cat2_plant_infections",
|
| 999 |
+
"iso_cat2_plants",
|
| 1000 |
+
"iso_cat2_protozoa",
|
| 1001 |
+
"iso_cat2_psychrophilic_lt10_c",
|
| 1002 |
+
"iso_cat2_reptilia",
|
| 1003 |
+
"iso_cat2_saline",
|
| 1004 |
+
"iso_cat2_sulfuric",
|
| 1005 |
+
"iso_cat2_terrestrial",
|
| 1006 |
+
"iso_cat2_thermophilic_gt45_c",
|
| 1007 |
+
"iso_cat2_treatment",
|
| 1008 |
+
"iso_cat2_urogenital_tract",
|
| 1009 |
+
"iso_cat2_waste",
|
| 1010 |
+
"iso_cat2_xerophilic",
|
| 1011 |
+
"iso_cat2_yeast"
|
| 1012 |
]
|
| 1013 |
}
|
| 1014 |
}
|
|
@@ -1,40 +1,40 @@
|
|
| 1 |
# microbe-model — v0 baseline eval report
|
| 2 |
|
| 3 |
-
_Generated: 2026-
|
| 4 |
|
| 5 |
## TL;DR
|
| 6 |
|
| 7 |
-
- **`optimal_temperature_c`**: MAE = **
|
| 8 |
-
- **`optimal_ph`**: MAE = **0.
|
| 9 |
-
- **`oxygen_requirement`**: macro-F1 = **0.
|
| 10 |
-
- **`salt_tolerance_pct`**: MAE = **2.
|
| 11 |
|
| 12 |
-
Trained on **
|
| 13 |
|
| 14 |
## Corpus
|
| 15 |
|
| 16 |
-
- Total strains in feature table: **
|
| 17 |
- Labeled-strain counts by target:
|
| 18 |
-
- `optimal_temperature_c`:
|
| 19 |
-
- `optimal_ph`:
|
| 20 |
-
- `oxygen_requirement`:
|
| 21 |
-
- `salt_tolerance_pct`:
|
| 22 |
|
| 23 |
## Target distributions
|
| 24 |
|
| 25 |
-
- `optimal_temperature_c`: n=
|
| 26 |
-
- `optimal_ph`: n=
|
| 27 |
-
- `salt_tolerance_pct`: n=
|
| 28 |
- `oxygen_requirement`:
|
| 29 |
-
- `aerobe`:
|
| 30 |
-
- `anaerobe`:
|
| 31 |
-
- `
|
| 32 |
-
- `
|
| 33 |
-
- `
|
| 34 |
-
- `obligate anaerobe`:
|
| 35 |
-
- `facultative aerobe`:
|
|
|
|
| 36 |
- `microaerotolerant`: 2
|
| 37 |
-
- `aerotolerant`: 1
|
| 38 |
|
| 39 |
## Per-target results (5-fold GroupKFold by family)
|
| 40 |
|
|
@@ -43,102 +43,102 @@ Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-
|
|
| 43 |
|
| 44 |
| Target | Task | n labeled | Model metric | Baseline | Improvement |
|
| 45 |
|---|---|---|---|---|---|
|
| 46 |
-
| `optimal_temperature_c` | regression |
|
| 47 |
-
| `optimal_ph` | regression |
|
| 48 |
-
| `oxygen_requirement` | classification |
|
| 49 |
-
| `salt_tolerance_pct` | regression |
|
| 50 |
|
| 51 |
### `optimal_temperature_c` — fold-by-fold
|
| 52 |
|
| 53 |
| Fold | Metric | Train | Test |
|
| 54 |
|---|---|---|---|
|
| 55 |
-
| 1 | mae =
|
| 56 |
-
| 2 | mae =
|
| 57 |
-
| 3 | mae = 3.
|
| 58 |
-
| 4 | mae = 3.
|
| 59 |
-
| 5 | mae =
|
| 60 |
|
| 61 |
**Top 10 features for `optimal_temperature_c`:**
|
| 62 |
|
| 63 |
-
- `ivywrel_frac` — 0.
|
| 64 |
-
- `
|
| 65 |
-
- `
|
| 66 |
-
- `
|
| 67 |
-
- `
|
| 68 |
-
- `
|
| 69 |
-
- `
|
| 70 |
-
- `
|
| 71 |
-
- `
|
| 72 |
-
- `
|
| 73 |
|
| 74 |
### `optimal_ph` — fold-by-fold
|
| 75 |
|
| 76 |
| Fold | Metric | Train | Test |
|
| 77 |
|---|---|---|---|
|
| 78 |
-
| 1 | mae = 0.
|
| 79 |
-
| 2 | mae = 0.
|
| 80 |
-
| 3 | mae = 0.
|
| 81 |
-
| 4 | mae = 0.
|
| 82 |
-
| 5 | mae = 0.
|
| 83 |
|
| 84 |
**Top 10 features for `optimal_ph`:**
|
| 85 |
|
| 86 |
-
- `
|
| 87 |
-
- `
|
| 88 |
-
- `
|
| 89 |
-
- `
|
| 90 |
-
- `
|
| 91 |
-
- `
|
| 92 |
-
- `
|
| 93 |
-
- `
|
| 94 |
-
- `
|
| 95 |
-
- `
|
| 96 |
|
| 97 |
### `oxygen_requirement` — fold-by-fold
|
| 98 |
|
| 99 |
| Fold | Metric | Train | Test |
|
| 100 |
|---|---|---|---|
|
| 101 |
-
| 1 | f1_macro = 0.
|
| 102 |
-
| 2 | f1_macro = 0.
|
| 103 |
-
| 3 | f1_macro = 0.
|
| 104 |
-
| 4 | f1_macro = 0.
|
| 105 |
-
| 5 | f1_macro = 0.
|
| 106 |
|
| 107 |
**Top 10 features for `oxygen_requirement`:**
|
| 108 |
|
| 109 |
-
- `codon_ATA` — 0.
|
| 110 |
-
- `
|
| 111 |
-
- `
|
| 112 |
-
- `
|
| 113 |
-
- `
|
| 114 |
-
- `
|
| 115 |
-
- `
|
| 116 |
-
- `
|
| 117 |
-
- `
|
| 118 |
-
- `
|
| 119 |
|
| 120 |
### `salt_tolerance_pct` — fold-by-fold
|
| 121 |
|
| 122 |
| Fold | Metric | Train | Test |
|
| 123 |
|---|---|---|---|
|
| 124 |
-
| 1 | mae = 2.
|
| 125 |
-
| 2 | mae = 2.
|
| 126 |
-
| 3 | mae = 2.
|
| 127 |
-
| 4 | mae = 2.
|
| 128 |
-
| 5 | mae = 2.
|
| 129 |
|
| 130 |
**Top 10 features for `salt_tolerance_pct`:**
|
| 131 |
|
| 132 |
-
- `aa_frac_C` — 0.
|
| 133 |
-
- `neg_charged_frac` — 0.
|
| 134 |
-
- `
|
| 135 |
-
- `
|
| 136 |
-
- `tetra_GACT` — 0.
|
| 137 |
-
- `
|
| 138 |
-
- `
|
| 139 |
-
- `
|
| 140 |
-
- `
|
| 141 |
-
- `
|
| 142 |
|
| 143 |
## Feature ↔ target correlations (Spearman, top 10)
|
| 144 |
|
|
@@ -148,46 +148,46 @@ Sanity-checks the biology — features known to track each target should appear
|
|
| 148 |
|
| 149 |
| Feature | Spearman ρ | p-value |
|
| 150 |
|---|---|---|
|
| 151 |
-
| `genome_size_nt` | -0.
|
| 152 |
-
| `
|
| 153 |
-
| `
|
| 154 |
-
| `
|
| 155 |
-
| `
|
| 156 |
-
| `
|
| 157 |
-
| `
|
| 158 |
-
| `
|
| 159 |
-
| `
|
| 160 |
-
| `
|
| 161 |
|
| 162 |
### `optimal_ph`
|
| 163 |
|
| 164 |
| Feature | Spearman ρ | p-value |
|
| 165 |
|---|---|---|
|
| 166 |
-
| `neg_charged_frac` | +0.
|
| 167 |
-
| `mean_isoelectric_point` | -0.
|
| 168 |
-
| `aa_frac_E` | +0.
|
| 169 |
-
| `
|
| 170 |
-
| `
|
| 171 |
-
| `
|
| 172 |
-
| `codon_CGA` | +0.
|
| 173 |
-
| `
|
| 174 |
-
| `
|
| 175 |
-
| `
|
| 176 |
|
| 177 |
### `salt_tolerance_pct`
|
| 178 |
|
| 179 |
| Feature | Spearman ρ | p-value |
|
| 180 |
|---|---|---|
|
| 181 |
-
| `tetra_AGTC` | +0.
|
| 182 |
-
| `tetra_GACT` | +0.
|
| 183 |
-
| `neg_charged_frac` | +0.
|
| 184 |
-
| `
|
| 185 |
-
| `
|
| 186 |
-
| `
|
| 187 |
-
| `
|
| 188 |
-
| `
|
| 189 |
-
| `
|
| 190 |
-
| `
|
| 191 |
|
| 192 |
## Per-family error breakdown (regression targets)
|
| 193 |
|
|
@@ -197,61 +197,61 @@ Top 15 most-represented families, MAE per family. Highlights where the model is
|
|
| 197 |
|
| 198 |
| Family | n | MAE |
|
| 199 |
|---|---|---|
|
| 200 |
-
|
|
| 201 |
-
|
|
| 202 |
-
|
|
| 203 |
-
| Lactobacillaceae |
|
| 204 |
-
|
|
| 205 |
-
|
|
| 206 |
-
|
|
| 207 |
-
|
|
| 208 |
-
|
|
| 209 |
-
|
|
| 210 |
-
| Moraxellaceae |
|
| 211 |
-
|
|
| 212 |
-
|
|
| 213 |
-
|
|
| 214 |
-
|
|
| 215 |
|
| 216 |
### `optimal_ph`
|
| 217 |
|
| 218 |
| Family | n | MAE |
|
| 219 |
|---|---|---|
|
| 220 |
-
| Flavobacteriaceae |
|
| 221 |
-
| Bacillaceae |
|
| 222 |
-
| Roseobacteraceae |
|
| 223 |
-
| Paenibacillaceae |
|
| 224 |
-
| Microbacteriaceae |
|
| 225 |
-
| Sphingobacteriaceae |
|
| 226 |
-
| Sphingomonadaceae |
|
| 227 |
-
| Streptomycetaceae |
|
| 228 |
-
| Pseudonocardiaceae |
|
| 229 |
-
| Halomonadaceae |
|
| 230 |
-
|
|
| 231 |
-
|
|
| 232 |
-
|
|
| 233 |
-
|
|
| 234 |
-
|
|
| 235 |
|
| 236 |
### `salt_tolerance_pct`
|
| 237 |
|
| 238 |
| Family | n | MAE |
|
| 239 |
|---|---|---|
|
| 240 |
-
|
|
| 241 |
-
|
|
| 242 |
-
| Bacillaceae |
|
| 243 |
-
|
|
| 244 |
-
|
|
| 245 |
-
|
|
| 246 |
-
|
|
| 247 |
-
|
|
| 248 |
-
|
|
| 249 |
-
|
|
| 250 |
-
|
|
| 251 |
-
|
|
| 252 |
-
| Nocardiaceae |
|
| 253 |
-
|
|
| 254 |
-
|
|
| 255 |
|
| 256 |
## Known limitations
|
| 257 |
|
|
|
|
| 1 |
# microbe-model — v0 baseline eval report
|
| 2 |
|
| 3 |
+
_Generated: 2026-05-05T06:56:14+00:00_
|
| 4 |
|
| 5 |
## TL;DR
|
| 6 |
|
| 7 |
+
- **`optimal_temperature_c`**: MAE = **2.94** (vs always-predict-mean 4.98, **+41%**)
|
| 8 |
+
- **`optimal_ph`**: MAE = **0.51** (vs always-predict-mean 0.55, **+7%**)
|
| 9 |
+
- **`oxygen_requirement`**: macro-F1 = **0.341** (vs always-predict-majority 0.059, **+479%**)
|
| 10 |
+
- **`salt_tolerance_pct`**: MAE = **2.52** (vs always-predict-mean 2.83, **+11%**)
|
| 11 |
|
| 12 |
+
Trained on **46,029** strains with **418** features (genome-derived features plus one-hot isolation-source tags). Cross-validation: 5-fold GroupKFold by taxonomic family.
|
| 13 |
|
| 14 |
## Corpus
|
| 15 |
|
| 16 |
+
- Total strains in feature table: **46,029**
|
| 17 |
- Labeled-strain counts by target:
|
| 18 |
+
- `optimal_temperature_c`: 45,621
|
| 19 |
+
- `optimal_ph`: 5,103
|
| 20 |
+
- `oxygen_requirement`: 21,639
|
| 21 |
+
- `salt_tolerance_pct`: 6,330
|
| 22 |
|
| 23 |
## Target distributions
|
| 24 |
|
| 25 |
+
- `optimal_temperature_c`: n=45,621, mean=32.24, std=7.13, p10=27.50, median=30.00, p90=37.00
|
| 26 |
+
- `optimal_ph`: n=5,103, mean=7.19, std=0.82, p10=6.50, median=7.00, p90=8.00
|
| 27 |
+
- `salt_tolerance_pct`: n=6,330, mean=3.93, std=4.03, p10=0.00, median=3.00, p90=8.00
|
| 28 |
- `oxygen_requirement`:
|
| 29 |
+
- `aerobe`: 7,803
|
| 30 |
+
- `anaerobe`: 4,193
|
| 31 |
+
- `microaerophile`: 3,804
|
| 32 |
+
- `facultative anaerobe`: 3,389
|
| 33 |
+
- `obligate aerobe`: 2,213
|
| 34 |
+
- `obligate anaerobe`: 136
|
| 35 |
+
- `facultative aerobe`: 87
|
| 36 |
+
- `aerotolerant`: 12
|
| 37 |
- `microaerotolerant`: 2
|
|
|
|
| 38 |
|
| 39 |
## Per-target results (5-fold GroupKFold by family)
|
| 40 |
|
|
|
|
| 43 |
|
| 44 |
| Target | Task | n labeled | Model metric | Baseline | Improvement |
|
| 45 |
|---|---|---|---|---|---|
|
| 46 |
+
| `optimal_temperature_c` | regression | 45,621 | MAE=2.939 | MAE=4.981 | +41.0% |
|
| 47 |
+
| `optimal_ph` | regression | 5,103 | MAE=0.509 | MAE=0.546 | +6.8% |
|
| 48 |
+
| `oxygen_requirement` | classification | 21,639 | F1=0.341 | F1=0.059 | +479.5% |
|
| 49 |
+
| `salt_tolerance_pct` | regression | 6,330 | MAE=2.517 | MAE=2.827 | +11.0% |
|
| 50 |
|
| 51 |
### `optimal_temperature_c` — fold-by-fold
|
| 52 |
|
| 53 |
| Fold | Metric | Train | Test |
|
| 54 |
|---|---|---|---|
|
| 55 |
+
| 1 | mae = 3.104 | n=36,496 | n=9,125 |
|
| 56 |
+
| 2 | mae = 2.736 | n=36,497 | n=9,124 |
|
| 57 |
+
| 3 | mae = 3.146 | n=36,497 | n=9,124 |
|
| 58 |
+
| 4 | mae = 3.277 | n=36,497 | n=9,124 |
|
| 59 |
+
| 5 | mae = 2.435 | n=36,497 | n=9,124 |
|
| 60 |
|
| 61 |
**Top 10 features for `optimal_temperature_c`:**
|
| 62 |
|
| 63 |
+
- `ivywrel_frac` — 0.1267
|
| 64 |
+
- `iso_cat2_thermophilic_gt45_c` — 0.0299
|
| 65 |
+
- `n_predicted_cds` — 0.0251
|
| 66 |
+
- `iso_cat2_human` — 0.0209
|
| 67 |
+
- `iso_cat1_infection` — 0.0206
|
| 68 |
+
- `iso_cat2_patient` — 0.0178
|
| 69 |
+
- `aa_frac_C` — 0.0150
|
| 70 |
+
- `genome_size_nt` — 0.0122
|
| 71 |
+
- `aa_frac_D` — 0.0113
|
| 72 |
+
- `codon_AGG` — 0.0109
|
| 73 |
|
| 74 |
### `optimal_ph` — fold-by-fold
|
| 75 |
|
| 76 |
| Fold | Metric | Train | Test |
|
| 77 |
|---|---|---|---|
|
| 78 |
+
| 1 | mae = 0.456 | n=4,082 | n=1,021 |
|
| 79 |
+
| 2 | mae = 0.626 | n=4,082 | n=1,021 |
|
| 80 |
+
| 3 | mae = 0.528 | n=4,082 | n=1,021 |
|
| 81 |
+
| 4 | mae = 0.480 | n=4,083 | n=1,020 |
|
| 82 |
+
| 5 | mae = 0.454 | n=4,083 | n=1,020 |
|
| 83 |
|
| 84 |
**Top 10 features for `optimal_ph`:**
|
| 85 |
|
| 86 |
+
- `iso_cat2_acidic` — 0.0522
|
| 87 |
+
- `iso_cat2_alkaline` — 0.0435
|
| 88 |
+
- `neg_charged_frac` — 0.0169
|
| 89 |
+
- `aa_frac_E` — 0.0086
|
| 90 |
+
- `tetra_CTCT` — 0.0084
|
| 91 |
+
- `aa_frac_H` — 0.0080
|
| 92 |
+
- `mean_isoelectric_point` — 0.0076
|
| 93 |
+
- `tetra_CACT` — 0.0074
|
| 94 |
+
- `tetra_AGAC` — 0.0071
|
| 95 |
+
- `tetra_AGGT` — 0.0059
|
| 96 |
|
| 97 |
### `oxygen_requirement` — fold-by-fold
|
| 98 |
|
| 99 |
| Fold | Metric | Train | Test |
|
| 100 |
|---|---|---|---|
|
| 101 |
+
| 1 | f1_macro = 0.315 | n=17,311 | n=4,328 |
|
| 102 |
+
| 2 | f1_macro = 0.382 | n=17,311 | n=4,326 |
|
| 103 |
+
| 3 | f1_macro = 0.344 | n=17,311 | n=4,328 |
|
| 104 |
+
| 4 | f1_macro = 0.259 | n=17,311 | n=4,328 |
|
| 105 |
+
| 5 | f1_macro = 0.406 | n=17,312 | n=4,327 |
|
| 106 |
|
| 107 |
**Top 10 features for `oxygen_requirement`:**
|
| 108 |
|
| 109 |
+
- `codon_ATA` — 0.0414
|
| 110 |
+
- `iso_cat1_host` — 0.0260
|
| 111 |
+
- `n_predicted_cds` — 0.0252
|
| 112 |
+
- `aa_frac_C` — 0.0191
|
| 113 |
+
- `iso_cat1_environmental` — 0.0165
|
| 114 |
+
- `codon_CGT` — 0.0148
|
| 115 |
+
- `iso_cat1_engineered` — 0.0138
|
| 116 |
+
- `genome_size_nt` — 0.0113
|
| 117 |
+
- `iso_cat2_human` — 0.0102
|
| 118 |
+
- `codon_TAA` — 0.0090
|
| 119 |
|
| 120 |
### `salt_tolerance_pct` — fold-by-fold
|
| 121 |
|
| 122 |
| Fold | Metric | Train | Test |
|
| 123 |
|---|---|---|---|
|
| 124 |
+
| 1 | mae = 2.218 | n=5,064 | n=1,266 |
|
| 125 |
+
| 2 | mae = 2.249 | n=5,064 | n=1,266 |
|
| 126 |
+
| 3 | mae = 2.819 | n=5,064 | n=1,266 |
|
| 127 |
+
| 4 | mae = 2.350 | n=5,064 | n=1,266 |
|
| 128 |
+
| 5 | mae = 2.948 | n=5,064 | n=1,266 |
|
| 129 |
|
| 130 |
**Top 10 features for `salt_tolerance_pct`:**
|
| 131 |
|
| 132 |
+
- `aa_frac_C` — 0.0298
|
| 133 |
+
- `neg_charged_frac` — 0.0278
|
| 134 |
+
- `tetra_ATCC` — 0.0183
|
| 135 |
+
- `iso_cat1_environmental` — 0.0142
|
| 136 |
+
- `tetra_GACT` — 0.0121
|
| 137 |
+
- `iso_cat2_saline` — 0.0114
|
| 138 |
+
- `codon_TGC` — 0.0112
|
| 139 |
+
- `tetra_CGTT` — 0.0094
|
| 140 |
+
- `codon_CGT` — 0.0087
|
| 141 |
+
- `iso_cat2_industrial` — 0.0085
|
| 142 |
|
| 143 |
## Feature ↔ target correlations (Spearman, top 10)
|
| 144 |
|
|
|
|
| 148 |
|
| 149 |
| Feature | Spearman ρ | p-value |
|
| 150 |
|---|---|---|
|
| 151 |
+
| `genome_size_nt` | -0.493 | 0.0e+00 |
|
| 152 |
+
| `n_predicted_cds` | -0.482 | 0.0e+00 |
|
| 153 |
+
| `aa_frac_P` | -0.391 | 0.0e+00 |
|
| 154 |
+
| `aa_frac_Y` | +0.390 | 0.0e+00 |
|
| 155 |
+
| `tetra_TCTT` | +0.383 | 0.0e+00 |
|
| 156 |
+
| `tetra_TATC` | +0.381 | 0.0e+00 |
|
| 157 |
+
| `tetra_GATA` | +0.381 | 0.0e+00 |
|
| 158 |
+
| `tetra_AAGA` | +0.381 | 0.0e+00 |
|
| 159 |
+
| `tetra_CATA` | +0.380 | 0.0e+00 |
|
| 160 |
+
| `tetra_TATG` | +0.379 | 0.0e+00 |
|
| 161 |
|
| 162 |
### `optimal_ph`
|
| 163 |
|
| 164 |
| Feature | Spearman ρ | p-value |
|
| 165 |
|---|---|---|
|
| 166 |
+
| `neg_charged_frac` | +0.304 | 1.6e-109 |
|
| 167 |
+
| `mean_isoelectric_point` | -0.278 | 1.8e-91 |
|
| 168 |
+
| `aa_frac_E` | +0.256 | 4.5e-77 |
|
| 169 |
+
| `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
|
| 170 |
+
| `ivywrel_frac` | +0.159 | 2.4e-30 |
|
| 171 |
+
| `codon_AAG` | -0.154 | 1.7e-28 |
|
| 172 |
+
| `codon_CGA` | +0.153 | 5.8e-28 |
|
| 173 |
+
| `codon_TGC` | -0.151 | 2.6e-27 |
|
| 174 |
+
| `iso_cat2_saline` | +0.137 | 8.9e-23 |
|
| 175 |
+
| `tetra_CACT` | +0.135 | 4.3e-22 |
|
| 176 |
|
| 177 |
### `salt_tolerance_pct`
|
| 178 |
|
| 179 |
| Feature | Spearman ρ | p-value |
|
| 180 |
|---|---|---|
|
| 181 |
+
| `tetra_AGTC` | +0.270 | 4.0e-106 |
|
| 182 |
+
| `tetra_GACT` | +0.268 | 1.4e-104 |
|
| 183 |
+
| `neg_charged_frac` | +0.221 | 3.9e-71 |
|
| 184 |
+
| `ivywrel_frac` | +0.221 | 8.4e-71 |
|
| 185 |
+
| `aa_frac_C` | -0.202 | 4.7e-59 |
|
| 186 |
+
| `iso_cat1_environmental` | -0.193 | 2.6e-54 |
|
| 187 |
+
| `n_contigs` | -0.181 | 1.0e-47 |
|
| 188 |
+
| `mean_cds_aa_length` | -0.177 | 8.2e-46 |
|
| 189 |
+
| `tetra_ACTC` | +0.176 | 4.5e-45 |
|
| 190 |
+
| `tetra_GAGT` | +0.173 | 1.5e-43 |
|
| 191 |
|
| 192 |
## Per-family error breakdown (regression targets)
|
| 193 |
|
|
|
|
| 197 |
|
| 198 |
| Family | n | MAE |
|
| 199 |
|---|---|---|
|
| 200 |
+
| Enterobacteriaceae | 2662 | 4.086 |
|
| 201 |
+
| Streptomycetaceae | 2212 | 1.919 |
|
| 202 |
+
| Bacillaceae | 1886 | 3.195 |
|
| 203 |
+
| Lactobacillaceae | 1732 | 3.537 |
|
| 204 |
+
| Pseudomonadaceae | 1621 | 2.576 |
|
| 205 |
+
| Myxococcaceae | 1546 | 0.403 |
|
| 206 |
+
| Streptococcaceae | 1170 | 2.367 |
|
| 207 |
+
| Staphylococcaceae | 1068 | 4.288 |
|
| 208 |
+
| Flavobacteriaceae | 981 | 4.202 |
|
| 209 |
+
| Corynebacteriaceae | 900 | 2.231 |
|
| 210 |
+
| Moraxellaceae | 890 | 3.514 |
|
| 211 |
+
| Paenibacillaceae | 760 | 2.967 |
|
| 212 |
+
| Microbacteriaceae | 734 | 2.482 |
|
| 213 |
+
| Micrococcaceae | 719 | 2.991 |
|
| 214 |
+
| Nocardiaceae | 715 | 2.679 |
|
| 215 |
|
| 216 |
### `optimal_ph`
|
| 217 |
|
| 218 |
| Family | n | MAE |
|
| 219 |
|---|---|---|
|
| 220 |
+
| Flavobacteriaceae | 355 | 0.391 |
|
| 221 |
+
| Bacillaceae | 298 | 0.678 |
|
| 222 |
+
| Roseobacteraceae | 204 | 0.400 |
|
| 223 |
+
| Paenibacillaceae | 139 | 0.435 |
|
| 224 |
+
| Microbacteriaceae | 120 | 0.438 |
|
| 225 |
+
| Sphingobacteriaceae | 114 | 0.353 |
|
| 226 |
+
| Sphingomonadaceae | 102 | 0.346 |
|
| 227 |
+
| Streptomycetaceae | 98 | 0.599 |
|
| 228 |
+
| Pseudonocardiaceae | 93 | 0.495 |
|
| 229 |
+
| Halomonadaceae | 82 | 0.603 |
|
| 230 |
+
| Micrococcaceae | 82 | 0.619 |
|
| 231 |
+
| Nocardioidaceae | 80 | 0.490 |
|
| 232 |
+
| Paracoccaceae | 76 | 0.564 |
|
| 233 |
+
| Alteromonadaceae | 71 | 0.349 |
|
| 234 |
+
| Erythrobacteraceae | 68 | 0.423 |
|
| 235 |
|
| 236 |
### `salt_tolerance_pct`
|
| 237 |
|
| 238 |
| Family | n | MAE |
|
| 239 |
|---|---|---|
|
| 240 |
+
| Streptococcaceae | 340 | 0.891 |
|
| 241 |
+
| Flavobacteriaceae | 312 | 1.834 |
|
| 242 |
+
| Bacillaceae | 310 | 3.417 |
|
| 243 |
+
| Streptomycetaceae | 309 | 2.116 |
|
| 244 |
+
| Pseudomonadaceae | 196 | 4.802 |
|
| 245 |
+
| Corynebacteriaceae | 194 | 3.853 |
|
| 246 |
+
| Vibrionaceae | 173 | 2.872 |
|
| 247 |
+
| Microbacteriaceae | 166 | 2.616 |
|
| 248 |
+
| Paenibacillaceae | 150 | 2.096 |
|
| 249 |
+
| Roseobacteraceae | 143 | 1.556 |
|
| 250 |
+
| Pseudonocardiaceae | 142 | 2.400 |
|
| 251 |
+
| Moraxellaceae | 126 | 2.581 |
|
| 252 |
+
| Nocardiaceae | 125 | 2.899 |
|
| 253 |
+
| Enterococcaceae | 111 | 1.723 |
|
| 254 |
+
| Alcaligenaceae | 104 | 4.454 |
|
| 255 |
|
| 256 |
## Known limitations
|
| 257 |
|
|
@@ -25,6 +25,41 @@ def derive_group(row: pd.Series) -> str:
|
|
| 25 |
return "__unknown__"
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def main() -> None:
|
| 29 |
t0 = time.time()
|
| 30 |
pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
|
|
@@ -32,7 +67,12 @@ def main() -> None:
|
|
| 32 |
df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
|
| 33 |
df["group"] = df.apply(derive_group, axis=1)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
|
|
|
|
| 36 |
|
| 37 |
print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
|
| 38 |
print(f"Distinct groups: {df['group'].nunique():,}")
|
|
|
|
| 25 |
return "__unknown__"
|
| 26 |
|
| 27 |
|
| 28 |
+
def encode_isolation_categories(
    df: pd.DataFrame,
    *,
    min_count: int = 10,
) -> tuple[pd.DataFrame, list[str]]:
    """One-hot encode isolation_cat1/cat2 (pipe-joined multi-labels).

    Each strain's category cell is "Tag1|Tag2|..." (or NaN). We split, then create one
    iso_<level>_<tag> column per tag that appears in ≥min_count training rows. Strains
    without any isolation info get all-zero rows for these features (XGBoost treats this
    as "no signal" rather than missing).

    Empty fragments (an empty cell, "A||B", or a trailing "|") are never treated as a
    tag; otherwise they would yield a column that fires on every row with missing
    isolation info. A tag repeated inside one cell counts once toward ``min_count``.

    Args:
        df: Training table. New indicator columns are added to it in place.
        min_count: Minimum number of rows a tag must appear in to get a column.

    Returns:
        ``(df, new_cols)``: the same frame that was passed in, and the names of the
        columns that were added, in creation order (cat1 before cat2, tags sorted).
    """
    # Function-scoped imports, executed once per call instead of once per level.
    import re
    from collections import Counter

    new_cols: list[str] = []
    for level in ("isolation_cat1", "isolation_cat2"):
        if level not in df.columns:
            continue

        # Split every cell exactly once; NaN becomes an empty set and empty
        # fragments are discarded so they cannot masquerade as a tag.
        tag_sets = df[level].fillna("").map(
            lambda cell: frozenset(t for t in cell.split("|") if t)
        )

        # Sets give per-row counts, matching the "appears in N rows" contract.
        tag_counts: Counter[str] = Counter()
        for tags in tag_sets:
            tag_counts.update(tags)
        kept = sorted(t for t, n in tag_counts.items() if n >= min_count)

        prefix = level.split("_")[1]  # "isolation_cat1" -> "cat1"
        seen_cols: set[str] = set()
        for tag in kept:
            # ">" / "<" carry meaning (e.g. temperature thresholds), so spell them
            # out before collapsing every other non-alphanumeric run to "_".
            slug = tag.lower().replace(">", "gt").replace("<", "lt")
            slug = re.sub(r"[^a-z0-9]+", "_", slug).strip("_")
            col = f"iso_{prefix}_{slug}"
            if col in seen_cols:
                # Two tags slugified to the same name: the first (in sorted
                # order) wins, later ones are skipped.
                continue
            seen_cols.add(col)
            df[col] = tag_sets.map(lambda tags, t=tag: int(t in tags))
            new_cols.append(col)
    return df, new_cols
|
| 61 |
+
|
| 62 |
+
|
| 63 |
def main() -> None:
|
| 64 |
t0 = time.time()
|
| 65 |
pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
|
|
|
|
| 67 |
df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
|
| 68 |
df["group"] = df.apply(derive_group, axis=1)
|
| 69 |
|
| 70 |
+
df, iso_cols = encode_isolation_categories(df)
|
| 71 |
+
print(f"Encoded {len(iso_cols)} isolation-category features "
|
| 72 |
+
f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
|
| 73 |
+
|
| 74 |
feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
|
| 75 |
+
feature_cols = feature_cols + iso_cols
|
| 76 |
|
| 77 |
print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
|
| 78 |
print(f"Distinct groups: {df['group'].nunique():,}")
|
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Train v3: hand-crafted features (v1)
|
| 2 |
|
| 3 |
Tests whether embeddings carry complementary signal to the curated features even
|
| 4 |
when they lose head-to-head. Same train/test splits and XGBoost hyperparameters
|
|
@@ -14,7 +14,9 @@ Writes:
|
|
| 14 |
"""
|
| 15 |
from __future__ import annotations
|
| 16 |
|
|
|
|
| 17 |
import time
|
|
|
|
| 18 |
|
| 19 |
import pandas as pd
|
| 20 |
|
|
@@ -33,6 +35,37 @@ def derive_group(row: pd.Series) -> str:
|
|
| 33 |
return "__unknown__"
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def main() -> None:
|
| 37 |
t0 = time.time()
|
| 38 |
pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
|
|
@@ -43,12 +76,16 @@ def main() -> None:
|
|
| 43 |
df = df.merge(embeds, on=["bacdive_id", "genome_accession"], how="inner")
|
| 44 |
df["group"] = df.apply(derive_group, axis=1)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
v1_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
|
| 47 |
v2_cols = [c for c in embeds.columns if c.startswith("emb_")]
|
| 48 |
-
feature_cols = v1_cols + v2_cols
|
| 49 |
|
| 50 |
print(f"Training table: {len(df):,} strains × {len(feature_cols)} features "
|
| 51 |
-
f"({len(v1_cols)} hand-crafted + {len(v2_cols)} embedding dims)")
|
| 52 |
print(f"Distinct groups: {df['group'].nunique():,}")
|
| 53 |
print()
|
| 54 |
|
|
|
|
| 1 |
+
"""Train v3: hand-crafted features (v1) + ESM-2 embeddings (v2) + isolation tags.
|
| 2 |
|
| 3 |
Tests whether embeddings carry complementary signal to the curated features even
|
| 4 |
when they lose head-to-head. Same train/test splits and XGBoost hyperparameters
|
|
|
|
| 14 |
"""
|
| 15 |
from __future__ import annotations
|
| 16 |
|
| 17 |
+
import re
|
| 18 |
import time
|
| 19 |
+
from collections import Counter
|
| 20 |
|
| 21 |
import pandas as pd
|
| 22 |
|
|
|
|
| 35 |
return "__unknown__"
|
| 36 |
|
| 37 |
|
| 38 |
+
def encode_isolation_categories(
    df: pd.DataFrame,
    *,
    min_count: int = 10,
) -> tuple[pd.DataFrame, list[str]]:
    """One-hot encode isolation_cat1/cat2 (pipe-joined multi-labels).

    Follows the same tag-to-column naming scheme as the encoder in
    scripts/03_train_baseline.py so v3 uses the same style of isolation-tag
    vocabulary as v1.

    Each cell is "Tag1|Tag2|..." (or NaN). One iso_<level>_<tag> column is created
    per tag that appears in at least ``min_count`` rows. Empty fragments (an empty
    cell, "A||B", or a trailing "|") are never treated as a tag; otherwise they would
    yield a column that fires on every row with missing isolation info. A tag
    repeated inside one cell counts once toward ``min_count``.

    Args:
        df: Training table. New indicator columns are added to it in place.
        min_count: Minimum number of rows a tag must appear in to get a column.

    Returns:
        ``(df, new_cols)``: the same frame that was passed in, and the names of the
        columns that were added, in creation order (cat1 before cat2, tags sorted).
    """
    new_cols: list[str] = []
    for level in ("isolation_cat1", "isolation_cat2"):
        if level not in df.columns:
            continue

        # Split every cell exactly once; NaN becomes an empty set and empty
        # fragments are discarded so they cannot masquerade as a tag.
        tag_sets = df[level].fillna("").map(
            lambda cell: frozenset(t for t in cell.split("|") if t)
        )

        # Sets give per-row counts rather than per-occurrence counts.
        tag_counts: Counter[str] = Counter()
        for tags in tag_sets:
            tag_counts.update(tags)
        kept = sorted(t for t, n in tag_counts.items() if n >= min_count)

        prefix = level.split("_")[1]  # "isolation_cat1" -> "cat1"
        seen_cols: set[str] = set()
        for tag in kept:
            # ">" / "<" carry meaning (e.g. temperature thresholds), so spell them
            # out before collapsing every other non-alphanumeric run to "_".
            slug = tag.lower().replace(">", "gt").replace("<", "lt")
            slug = re.sub(r"[^a-z0-9]+", "_", slug).strip("_")
            col = f"iso_{prefix}_{slug}"
            if col in seen_cols:
                # Two tags slugified to the same name: the first (in sorted
                # order) wins, later ones are skipped.
                continue
            seen_cols.add(col)
            df[col] = tag_sets.map(lambda tags, t=tag: int(t in tags))
            new_cols.append(col)
    return df, new_cols
|
| 67 |
+
|
| 68 |
+
|
| 69 |
def main() -> None:
|
| 70 |
t0 = time.time()
|
| 71 |
pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
|
|
|
|
| 76 |
df = df.merge(embeds, on=["bacdive_id", "genome_accession"], how="inner")
|
| 77 |
df["group"] = df.apply(derive_group, axis=1)
|
| 78 |
|
| 79 |
+
df, iso_cols = encode_isolation_categories(df)
|
| 80 |
+
print(f"Encoded {len(iso_cols)} isolation-category features "
|
| 81 |
+
f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
|
| 82 |
+
|
| 83 |
v1_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
|
| 84 |
v2_cols = [c for c in embeds.columns if c.startswith("emb_")]
|
| 85 |
+
feature_cols = v1_cols + v2_cols + iso_cols
|
| 86 |
|
| 87 |
print(f"Training table: {len(df):,} strains × {len(feature_cols)} features "
|
| 88 |
+
f"({len(v1_cols)} hand-crafted + {len(v2_cols)} embedding dims + {len(iso_cols)} iso tags)")
|
| 89 |
print(f"Distinct groups: {df['group'].nunique():,}")
|
| 90 |
print()
|
| 91 |
|
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Rebuild data/bacdive_phenotypes.parquet from cached data/bacdive/*.json.
|
| 2 |
+
|
| 3 |
+
Use this after extending extract_phenotypes() to add fields without re-running the
|
| 4 |
+
~30-min API scan. Reads every cached JSON, re-applies the extractor, and overwrites
|
| 5 |
+
the parquet.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
from microbe_model import config
|
| 16 |
+
from microbe_model.data.bacdive import extract_phenotypes
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def main() -> None:
    """Re-run the phenotype extractor over the cached BacDive JSONs and rewrite the parquet.

    Reads every ``*.json`` under ``config.BACDIVE_DIR``, applies
    ``extract_phenotypes`` to each parsed record, writes the rows to
    ``config.DATA / "bacdive_phenotypes.parquet"``, and prints per-column
    non-null coverage.

    Files that fail to parse as JSON are skipped, and the number skipped is
    reported. If nothing could be extracted at all (e.g. the cache directory is
    empty or misconfigured) the script aborts *before* writing, so an existing
    parquet is not replaced by an empty table.

    Raises:
        SystemExit: if no records were extracted.
    """
    files = sorted(Path(config.BACDIVE_DIR).glob("*.json"))
    print(f"Re-extracting from {len(files):,} cached JSONs in {config.BACDIVE_DIR}")

    rows = []
    skipped: list[Path] = []
    for path in tqdm(files, desc="re-extract", unit="strain"):
        try:
            # Explicit encoding: do not depend on the platform default.
            record = json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            skipped.append(path)
            continue
        rows.append(extract_phenotypes(record))

    if skipped:
        print(f"Skipped {len(skipped):,} unparseable JSON file(s), e.g. {skipped[0].name}")

    out = config.DATA / "bacdive_phenotypes.parquet"
    if not rows:
        # Refuse to clobber the existing phenotype table with an empty one.
        raise SystemExit(
            f"No records extracted from {config.BACDIVE_DIR}; leaving {out} untouched"
        )

    df = pd.DataFrame(rows)
    df.to_parquet(out, index=False)

    print(f"\nWrote {len(df):,} strains to {out}")
    print("Field coverage:")
    total = len(df)
    for col in df.columns:
        n = df[col].notna().sum()
        print(f" {col:30s} {n:>6,} / {total:,} ({100 * n / max(1, total):.1f}%)")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
if __name__ == "__main__":
|
| 43 |
+
main()
|
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Resolve BacDive species names → NCBI representative genome accessions.
|
| 2 |
+
|
| 3 |
+
Targets the phenotype-labeled strains that lack a `genome_accession` in BacDive.
|
| 4 |
+
Many of those species DO have a sequenced genome — BacDive just doesn't link to it.
|
| 5 |
+
We query NCBI Datasets v2 for one RefSeq assembly per unique species name and write
|
| 6 |
+
the {species: accession} map so the next pipeline step can pull the FASTAs.
|
| 7 |
+
|
| 8 |
+
Output: data/species_to_genome.parquet (species, ncbi_accession, status)
|
| 9 |
+
|
| 10 |
+
Resumable: re-runs skip species already present in the output.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import time
|
| 15 |
+
from urllib.parse import quote
|
| 16 |
+
|
| 17 |
+
import pandas as pd
|
| 18 |
+
import requests
|
| 19 |
+
from tqdm import tqdm
|
| 20 |
+
|
| 21 |
+
from microbe_model import config
|
| 22 |
+
|
| 23 |
+
API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
|
| 24 |
+
RATE_LIMIT_S = 0.11 if config.NCBI_API_KEY else 0.36
|
| 25 |
+
OUT_PATH = config.DATA / "species_to_genome.parquet"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def fetch_one(species: str, session: requests.Session) -> tuple[str | None, str]:
    """Return (accession, status) for a species. status ∈ {hit, miss, error}.

    Issues a GET against ``API_URL`` for ``species`` asking for a single RefSeq
    report (``page_size=1``) and returns the ``accession`` of the first report.

    Args:
        species: Species name; it is URL-quoted into the request path.
        session: ``requests`` session reused across calls.

    Returns:
        ``(accession, "hit")`` when the first report carries an accession,
        ``(None, "miss")`` on HTTP 404, an empty report list, or a report
        without an accession, and ``(None, "error")`` when all attempts fail.
    """
    headers: dict[str, str] = {"Accept": "application/json"}
    if config.NCBI_API_KEY:
        headers["api-key"] = config.NCBI_API_KEY
    params = {"filters.assembly_source": "RefSeq", "page_size": 1}
    url = API_URL.format(taxon=quote(species))
    max_attempts = 3

    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        try:
            # Throttle every request, including retries.
            time.sleep(RATE_LIMIT_S)
            resp = session.get(url, headers=headers, params=params, timeout=30)
            if resp.status_code == 404:
                return None, "miss"
            if resp.status_code in (429, 502, 503):
                # Throttled / upstream hiccup: back off and retry. Sleeping
                # after the final attempt would only delay the "error" result.
                if not is_last:
                    time.sleep(2 ** attempt)
                continue
            resp.raise_for_status()
            reports = resp.json().get("reports", [])
            if reports:
                acc = reports[0].get("accession")
                return (acc, "hit") if acc else (None, "miss")
            return None, "miss"
        except (requests.RequestException, ValueError):
            # ValueError covers a malformed JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            if is_last:
                return None, "error"
            time.sleep(2 ** attempt)
    return None, "error"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def main() -> None:
    """Resolve every unlabeled-genome species to an NCBI accession and save the map.

    Selects species from ``bacdive_phenotypes.parquet`` that have at least one
    phenotype label, no ``genome_accession``, and a name containing a space, then
    calls ``fetch_one`` for each species not already resolved in ``OUT_PATH``.

    Resume semantics: rows from a previous run with status ``hit`` or ``miss``
    are reused as-is. Rows with status ``error`` are NOT treated as done — they
    are dropped from the carried-over set and fetched again, so transient
    network failures do not become permanent gaps.

    The output is checkpointed whenever the row count is a multiple of 200 and
    always written once more on exit (including on interrupt).
    """
    columns = ["species", "ncbi_accession", "status"]

    df = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
    has_label = df[
        ["optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"]
    ].notna().any(axis=1)
    no_genome = df["genome_accession"].isna()
    valid_species = df["species"].notna() & df["species"].str.contains(" ", na=False)
    gap_species = sorted(df[has_label & no_genome & valid_species]["species"].unique())
    print(f"unique species to resolve: {len(gap_species):,}")

    # Resume from prior partial run
    done: dict[str, tuple[str | None, str]] = {}
    n_retry = 0
    if OUT_PATH.exists():
        prev = pd.read_parquet(OUT_PATH)
        for _, row in prev.iterrows():
            if row["status"] == "error":
                # Failed lookups are retried rather than cached forever.
                n_retry += 1
                continue
            done[row["species"]] = (row["ncbi_accession"], row["status"])
        print(f"resuming — {len(done):,} already cached")
        if n_retry:
            print(f"retrying {n_retry:,} species that previously errored")

    todo = [s for s in gap_species if s not in done]
    print(f"to fetch: {len(todo):,}")

    session = requests.Session()
    rows: list[dict] = [
        {"species": sp, "ncbi_accession": acc, "status": st}
        for sp, (acc, st) in done.items()
    ]

    try:
        for sp in tqdm(todo, desc="resolving", unit="species"):
            acc, status = fetch_one(sp, session)
            rows.append({"species": sp, "ncbi_accession": acc, "status": status})
            # Periodic checkpoint every 200 species so an interrupt doesn't lose progress
            if len(rows) % 200 == 0:
                pd.DataFrame(rows, columns=columns).to_parquet(OUT_PATH, index=False)
    finally:
        pd.DataFrame(rows, columns=columns).to_parquet(OUT_PATH, index=False)

    # Explicit columns keep the summary working even when there are zero rows.
    out = pd.DataFrame(rows, columns=columns)
    is_hit = out["status"] == "hit"
    print(f"\nwrote {len(out):,} rows to {OUT_PATH}")
    print(f" hit: {is_hit.sum():,} ({100 * is_hit.mean():.0f}%)")
    print(f" miss: {(out['status'] == 'miss').sum():,}")
    print(f" error: {(out['status'] == 'error').sum():,}")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
main()
|
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deduplicated featurize for species-resolved genomes.
|
| 2 |
+
|
| 3 |
+
When many BacDive strains share a single species-level representative genome (the
|
| 4 |
+
common case after scripts/18), naively running scripts/02 re-downloads + re-runs
|
| 5 |
+
pyrodigal on the same FASTA per-strain. This script downloads each unique accession
|
| 6 |
+
once, then replicates the resulting feature dict across all bacdive_ids that share it.
|
| 7 |
+
|
| 8 |
+
Resumable via data/features.jsonl (skips bacdive_ids already in the log).
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
uv run python scripts/19_featurize_resolved.py --workers 7
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import time
|
| 19 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
import pandas as pd
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
|
| 25 |
+
from microbe_model import config
|
| 26 |
+
from microbe_model.pipeline import _load_done_ids, _process_one
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def main() -> None:
    """Featurize each unique genome accession once and fan the result out per strain.

    Strains with at least one phenotype label and a ``genome_accession`` form the
    pool; those whose ``bacdive_id`` is already in ``features.jsonl`` are skipped.
    Remaining strains are grouped by accession, one representative
    ``(bacdive_id, accession)`` task per group is sent to ``_process_one`` in a
    process pool, and the returned feature dict is written once per sibling
    ``bacdive_id``. Finally ``features.jsonl`` is de-duplicated on ``bacdive_id``
    (last row wins) and written to ``features.parquet``.

    A failing genome never aborts the run, but the failure is now counted and
    reported instead of being dropped silently.

    Command-line flags:
        --workers: process-pool size (default: CPU count minus one, at least 1).
        --max-accessions: cap on unique accessions to process (debug).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=max(1, (os.cpu_count() or 4) - 1))
    parser.add_argument("--max-accessions", type=int, default=None,
                        help="Cap how many unique accessions to process (debug).")
    args = parser.parse_args()

    pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")

    # Training-ready pool: any phenotype label + a genome accession
    label_cols = list(config.PHENOTYPE_TARGETS.keys())
    has_label = pheno[label_cols].notna().any(axis=1)
    has_genome = pheno["genome_accession"].notna()
    ready = pheno[has_label & has_genome].copy()
    ready["bacdive_id"] = ready["bacdive_id"].astype(int)
    ready["genome_accession"] = ready["genome_accession"].astype(str)

    out_path = config.DATA / "features.jsonl"
    done_ids = _load_done_ids(out_path)
    todo = ready[~ready["bacdive_id"].isin(done_ids)]
    print(f"strains in pool: {len(ready):,}")
    print(f" already featurized: {len(done_ids):,}")
    print(f" remaining: {len(todo):,}")

    # Group remaining strains by accession
    by_acc = todo.groupby("genome_accession")["bacdive_id"].apply(list).to_dict()
    accessions = sorted(by_acc.keys())
    if args.max_accessions:
        accessions = accessions[: args.max_accessions]
    print(f"unique accessions to download: {len(accessions):,}")
    print(f" avg strains per accession: {sum(len(by_acc[a]) for a in accessions) / max(1, len(accessions)):.1f}")
    print(f"workers: {args.workers}\n")

    # Featurize each accession once; the worker tags the result with the *first* bacdive_id
    # of that accession's strain group. We then replicate the feature dict to all sibling
    # bacdive_ids before writing.
    rep_tasks = [(by_acc[acc][0], acc) for acc in accessions]

    n_success = 0
    n_failed = 0
    n_replicated_rows = 0
    start = time.time()
    with open(out_path, "a") as fh, \
            ProcessPoolExecutor(max_workers=args.workers) as pool, \
            tqdm(total=len(rep_tasks), desc="featurize", unit="genome") as bar:
        futures = {pool.submit(_process_one, t): t for t in rep_tasks}
        for fut in as_completed(futures):
            rep_id, acc = futures[fut]
            bar.update(1)
            try:
                feats = fut.result()
            except Exception as exc:
                # Best-effort: one bad genome must not stop the batch, but the
                # reason is surfaced so systematic failures are visible.
                n_failed += 1
                tqdm.write(f"featurize failed for {acc} (bacdive_id={rep_id}): {exc!r}")
                continue
            if not feats:
                # Worker returned nothing usable for this accession.
                n_failed += 1
                continue
            n_success += 1
            for bid in by_acc[acc]:
                row = dict(feats)
                row["bacdive_id"] = bid
                row["genome_accession"] = acc
                fh.write(json.dumps(row) + "\n")
                n_replicated_rows += 1
            # Flush per accession so an interrupted run can resume from the log.
            fh.flush()
            bar.set_postfix(genomes_ok=n_success, rows=n_replicated_rows)

    print(f"\nfinished in {(time.time() - start) / 60:.1f} min")
    print(f" unique genomes featurized: {n_success:,}/{len(rep_tasks):,}")
    if n_failed:
        print(f" genomes failed: {n_failed:,}")
    print(f" feature rows written: {n_replicated_rows:,}")

    # Materialize parquet
    if os.path.getsize(out_path) == 0:
        # Nothing has ever been featurized: there is no bacdive_id column to
        # de-duplicate on, so skip the parquet step instead of crashing.
        print(f" {out_path} is empty; skipping parquet materialization")
        return
    df = pd.read_json(out_path, lines=True)
    df = df.drop_duplicates(subset=["bacdive_id"], keep="last")
    parquet = config.DATA / "features.parquet"
    df.to_parquet(parquet, index=False)
    print(f" wrote {len(df):,} rows to {parquet}")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
if __name__ == "__main__":
|
| 106 |
+
main()
|
|
@@ -102,12 +102,16 @@ def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
|
|
| 102 |
- Physiology and metabolism → oxygen tolerance[]
|
| 103 |
- Physiology and metabolism → halophily[]
|
| 104 |
- Sequence information → Genome sequences[].INSDC accession
|
|
|
|
| 105 |
"""
|
| 106 |
general = record.get("General") or {}
|
| 107 |
taxon = record.get("Name and taxonomic classification") or {}
|
| 108 |
culture = record.get("Culture and growth conditions") or {}
|
| 109 |
physio = record.get("Physiology and metabolism") or {}
|
| 110 |
seq = record.get("Sequence information") or {}
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
out: dict[str, Any] = {
|
| 113 |
"bacdive_id": general.get("BacDive-ID"),
|
|
@@ -120,10 +124,35 @@ def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]:
|
|
| 120 |
"oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"),
|
| 121 |
"salt_tolerance_pct": _derive_salt(physio.get("halophily")),
|
| 122 |
"genome_accession": _first_genome_accession(seq.get("Genome sequences")),
|
|
|
|
|
|
|
|
|
|
| 123 |
}
|
| 124 |
return out
|
| 125 |
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
def _as_list(x: Any) -> list:
|
| 128 |
if x is None:
|
| 129 |
return []
|
|
|
|
| 102 |
- Physiology and metabolism → oxygen tolerance[]
|
| 103 |
- Physiology and metabolism → halophily[]
|
| 104 |
- Sequence information → Genome sequences[].INSDC accession
|
| 105 |
+
- Isolation, sampling and environmental information → isolation source categories[].Cat{1,2,3}
|
| 106 |
"""
|
| 107 |
general = record.get("General") or {}
|
| 108 |
taxon = record.get("Name and taxonomic classification") or {}
|
| 109 |
culture = record.get("Culture and growth conditions") or {}
|
| 110 |
physio = record.get("Physiology and metabolism") or {}
|
| 111 |
seq = record.get("Sequence information") or {}
|
| 112 |
+
iso = record.get("Isolation, sampling and environmental information") or {}
|
| 113 |
+
|
| 114 |
+
iso_cats = _collect_isolation_categories(iso.get("isolation source categories"))
|
| 115 |
|
| 116 |
out: dict[str, Any] = {
|
| 117 |
"bacdive_id": general.get("BacDive-ID"),
|
|
|
|
| 124 |
"oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"),
|
| 125 |
"salt_tolerance_pct": _derive_salt(physio.get("halophily")),
|
| 126 |
"genome_accession": _first_genome_accession(seq.get("Genome sequences")),
|
| 127 |
+
"isolation_cat1": iso_cats["cat1"],
|
| 128 |
+
"isolation_cat2": iso_cats["cat2"],
|
| 129 |
+
"isolation_cat3": iso_cats["cat3"],
|
| 130 |
}
|
| 131 |
return out
|
| 132 |
|
| 133 |
|
| 134 |
+
def _collect_isolation_categories(raw: Any) -> dict[str, str | None]:
    """Flatten BacDive's `isolation source categories` into 3 pipe-joined string fields.

    A strain commonly has multiple parallel category descriptions (e.g., #Host=Human AND
    #Host Body Product=Blood). We collect *all* unique values per level into a sorted,
    pipe-joined string so downstream code can split & one-hot. The leading '#' is stripped.

    Args:
        raw: Value of the record's ``isolation source categories`` field; it is
            passed through ``_as_list`` and non-dict entries are ignored.

    Returns:
        Dict with keys ``cat1``, ``cat2``, ``cat3``. Each value is the sorted,
        ``|``-joined set of cleaned ``Cat1``/``Cat2``/``Cat3`` strings, or
        ``None`` when that level has no usable value.
    """
    cats: dict[str, set[str]] = {"Cat1": set(), "Cat2": set(), "Cat3": set()}
    for entry in _as_list(raw):
        if not isinstance(entry, dict):
            continue
        for level, seen in cats.items():
            value = entry.get(level)
            if not isinstance(value, str):
                continue
            # Trim whitespace first so a value like " #Host" still loses its '#',
            # then trim again for any space that followed the '#'.
            cleaned = value.strip().lstrip("#").strip()
            # Skip values that are only '#'/whitespace; an empty token would
            # otherwise yield strings such as "|Host" after joining.
            if cleaned:
                seen.add(cleaned)
    return {
        "cat1": "|".join(sorted(cats["Cat1"])) or None,
        "cat2": "|".join(sorted(cats["Cat2"])) or None,
        "cat3": "|".join(sorted(cats["Cat3"])) or None,
    }
|
| 154 |
+
|
| 155 |
+
|
| 156 |
def _as_list(x: Any) -> list:
|
| 157 |
if x is None:
|
| 158 |
return []
|