clean
Browse files- results.json +0 -148
results.json
CHANGED
|
@@ -5785,43 +5785,6 @@
|
|
| 5785 |
"accuracy": 0.5723270440251572
|
| 5786 |
}
|
| 5787 |
},
|
| 5788 |
-
"data/ifeval_best_of_k/Qwen2.5-7B-RM-09192024.json": {
|
| 5789 |
-
"all": {
|
| 5790 |
-
"accuracy": 0.5726562500000001,
|
| 5791 |
-
"area_under_curve": 0.5536947485942088,
|
| 5792 |
-
"loss": 0.1320898115634918,
|
| 5793 |
-
"mean_max_score": 0.58177734375,
|
| 5794 |
-
"mean_end_score": 0.565234375
|
| 5795 |
-
},
|
| 5796 |
-
"gemma-2-9b-it": {
|
| 5797 |
-
"accuracy": 0.5796875,
|
| 5798 |
-
"area_under_curve": 0.5556161705870118,
|
| 5799 |
-
"loss": 0.11947629928588867,
|
| 5800 |
-
"mean_max_score": 0.62125,
|
| 5801 |
-
"mean_end_score": 0.58953125
|
| 5802 |
-
},
|
| 5803 |
-
"gpt-4o-mini-2024-07-18": {
|
| 5804 |
-
"accuracy": 0.61875,
|
| 5805 |
-
"area_under_curve": 0.5427620686093066,
|
| 5806 |
-
"loss": 0.1205466079711914,
|
| 5807 |
-
"mean_max_score": 0.631328125,
|
| 5808 |
-
"mean_end_score": 0.605390625
|
| 5809 |
-
},
|
| 5810 |
-
"Meta-Llama-3-8B-Instruct": {
|
| 5811 |
-
"accuracy": 0.58125,
|
| 5812 |
-
"area_under_curve": 0.5553819052297072,
|
| 5813 |
-
"loss": 0.14538890838623048,
|
| 5814 |
-
"mean_max_score": 0.60421875,
|
| 5815 |
-
"mean_end_score": 0.55078125
|
| 5816 |
-
},
|
| 5817 |
-
"claude-3-haiku-20240307": {
|
| 5818 |
-
"accuracy": 0.5109375,
|
| 5819 |
-
"area_under_curve": 0.5500825653737205,
|
| 5820 |
-
"loss": 0.15359460830688476,
|
| 5821 |
-
"mean_max_score": 0.54703125,
|
| 5822 |
-
"mean_end_score": 0.515625
|
| 5823 |
-
}
|
| 5824 |
-
},
|
| 5825 |
"data/ifeval_best_of_k/internlm2-1_8b-reward.json": {
|
| 5826 |
"all": {
|
| 5827 |
"accuracy": 0.5386718749999999,
|
|
@@ -6189,43 +6152,6 @@
|
|
| 6189 |
"mean_end_score": 0.606015625
|
| 6190 |
}
|
| 6191 |
},
|
| 6192 |
-
"data/ifeval_best_of_k/Qwen2.5-72B-RM-09242024.json": {
|
| 6193 |
-
"all": {
|
| 6194 |
-
"accuracy": 0.591796875,
|
| 6195 |
-
"area_under_curve": 0.5682828198209156,
|
| 6196 |
-
"loss": 0.11081918239593506,
|
| 6197 |
-
"mean_max_score": 0.61642578125,
|
| 6198 |
-
"mean_end_score": 0.61126953125
|
| 6199 |
-
},
|
| 6200 |
-
"gemma-2-9b-it": {
|
| 6201 |
-
"accuracy": 0.615625,
|
| 6202 |
-
"area_under_curve": 0.5724099728839697,
|
| 6203 |
-
"loss": 0.08845264434814454,
|
| 6204 |
-
"mean_max_score": 0.667265625,
|
| 6205 |
-
"mean_end_score": 0.6484375
|
| 6206 |
-
},
|
| 6207 |
-
"gpt-4o-mini-2024-07-18": {
|
| 6208 |
-
"accuracy": 0.6046875,
|
| 6209 |
-
"area_under_curve": 0.5579399685462639,
|
| 6210 |
-
"loss": 0.1147395133972168,
|
| 6211 |
-
"mean_max_score": 0.644296875,
|
| 6212 |
-
"mean_end_score": 0.6171875
|
| 6213 |
-
},
|
| 6214 |
-
"Meta-Llama-3-8B-Instruct": {
|
| 6215 |
-
"accuracy": 0.6140625,
|
| 6216 |
-
"area_under_curve": 0.576997247648311,
|
| 6217 |
-
"loss": 0.11122642517089844,
|
| 6218 |
-
"mean_max_score": 0.64453125,
|
| 6219 |
-
"mean_end_score": 0.636640625
|
| 6220 |
-
},
|
| 6221 |
-
"claude-3-haiku-20240307": {
|
| 6222 |
-
"accuracy": 0.5328125,
|
| 6223 |
-
"area_under_curve": 0.5592622087343524,
|
| 6224 |
-
"loss": 0.13447074890136718,
|
| 6225 |
-
"mean_max_score": 0.5696875,
|
| 6226 |
-
"mean_end_score": 0.543359375
|
| 6227 |
-
}
|
| 6228 |
-
},
|
| 6229 |
"data/ifeval_best_of_k/nemotron-4-340b-reward.json": {
|
| 6230 |
"all": {
|
| 6231 |
"accuracy": 0.6265624999999999,
|
|
@@ -6300,43 +6226,6 @@
|
|
| 6300 |
"mean_end_score": 0.578125
|
| 6301 |
}
|
| 6302 |
},
|
| 6303 |
-
"data/ifeval_best_of_k/Llama-3.1-8B-Instruct-RM-Test.json": {
|
| 6304 |
-
"all": {
|
| 6305 |
-
"accuracy": 0.5953124999999999,
|
| 6306 |
-
"area_under_curve": 0.5659010925349728,
|
| 6307 |
-
"loss": 0.11261327266693115,
|
| 6308 |
-
"mean_max_score": 0.61439453125,
|
| 6309 |
-
"mean_end_score": 0.60623046875
|
| 6310 |
-
},
|
| 6311 |
-
"gemma-2-9b-it": {
|
| 6312 |
-
"accuracy": 0.6,
|
| 6313 |
-
"area_under_curve": 0.5742011950437376,
|
| 6314 |
-
"loss": 0.08687259674072266,
|
| 6315 |
-
"mean_max_score": 0.67890625,
|
| 6316 |
-
"mean_end_score": 0.6640625
|
| 6317 |
-
},
|
| 6318 |
-
"gpt-4o-mini-2024-07-18": {
|
| 6319 |
-
"accuracy": 0.5984375,
|
| 6320 |
-
"area_under_curve": 0.5628933527191842,
|
| 6321 |
-
"loss": 0.10282745361328124,
|
| 6322 |
-
"mean_max_score": 0.655625,
|
| 6323 |
-
"mean_end_score": 0.644375
|
| 6324 |
-
},
|
| 6325 |
-
"Meta-Llama-3-8B-Instruct": {
|
| 6326 |
-
"accuracy": 0.603125,
|
| 6327 |
-
"area_under_curve": 0.5555893773327166,
|
| 6328 |
-
"loss": 0.12582313537597656,
|
| 6329 |
-
"mean_max_score": 0.618515625,
|
| 6330 |
-
"mean_end_score": 0.578125
|
| 6331 |
-
},
|
| 6332 |
-
"claude-3-haiku-20240307": {
|
| 6333 |
-
"accuracy": 0.5796874999999999,
|
| 6334 |
-
"area_under_curve": 0.5637145211028964,
|
| 6335 |
-
"loss": 0.13854501724243165,
|
| 6336 |
-
"mean_max_score": 0.564296875,
|
| 6337 |
-
"mean_end_score": 0.5390625
|
| 6338 |
-
}
|
| 6339 |
-
},
|
| 6340 |
"data/ifeval_best_of_k/Starling-RM-7B-alpha.json": {
|
| 6341 |
"all": {
|
| 6342 |
"accuracy": 0.5406249999999999,
|
|
@@ -6411,43 +6300,6 @@
|
|
| 6411 |
"mean_end_score": 0.484375
|
| 6412 |
}
|
| 6413 |
},
|
| 6414 |
-
"data/ifeval_best_of_k/Llama-3.1-70B-RM-09172024.json": {
|
| 6415 |
-
"all": {
|
| 6416 |
-
"accuracy": 0.630078125,
|
| 6417 |
-
"area_under_curve": 0.5902905300669057,
|
| 6418 |
-
"loss": 0.09440629482269287,
|
| 6419 |
-
"mean_max_score": 0.64310546875,
|
| 6420 |
-
"mean_end_score": 0.62984375
|
| 6421 |
-
},
|
| 6422 |
-
"gemma-2-9b-it": {
|
| 6423 |
-
"accuracy": 0.6375,
|
| 6424 |
-
"area_under_curve": 0.6064561485832756,
|
| 6425 |
-
"loss": 0.07111602783203125,
|
| 6426 |
-
"mean_max_score": 0.709375,
|
| 6427 |
-
"mean_end_score": 0.6953125
|
| 6428 |
-
},
|
| 6429 |
-
"gpt-4o-mini-2024-07-18": {
|
| 6430 |
-
"accuracy": 0.6359374999999999,
|
| 6431 |
-
"area_under_curve": 0.5804507982664724,
|
| 6432 |
-
"loss": 0.08310569763183594,
|
| 6433 |
-
"mean_max_score": 0.693203125,
|
| 6434 |
-
"mean_end_score": 0.6759375
|
| 6435 |
-
},
|
| 6436 |
-
"Meta-Llama-3-8B-Instruct": {
|
| 6437 |
-
"accuracy": 0.6468750000000001,
|
| 6438 |
-
"area_under_curve": 0.5893750619966321,
|
| 6439 |
-
"loss": 0.10088687896728515,
|
| 6440 |
-
"mean_max_score": 0.653359375,
|
| 6441 |
-
"mean_end_score": 0.6171875
|
| 6442 |
-
},
|
| 6443 |
-
"claude-3-haiku-20240307": {
|
| 6444 |
-
"accuracy": 0.6000000000000001,
|
| 6445 |
-
"area_under_curve": 0.585711467200442,
|
| 6446 |
-
"loss": 0.12550268173217774,
|
| 6447 |
-
"mean_max_score": 0.588984375,
|
| 6448 |
-
"mean_end_score": 0.53125
|
| 6449 |
-
}
|
| 6450 |
-
},
|
| 6451 |
"data/ifeval_best_of_k/Skywork-Reward-Gemma-2-27B.json": {
|
| 6452 |
"all": {
|
| 6453 |
"accuracy": 0.537890625,
|
|
|
|
| 5785 |
"accuracy": 0.5723270440251572
|
| 5786 |
}
|
| 5787 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5788 |
"data/ifeval_best_of_k/internlm2-1_8b-reward.json": {
|
| 5789 |
"all": {
|
| 5790 |
"accuracy": 0.5386718749999999,
|
|
|
|
| 6152 |
"mean_end_score": 0.606015625
|
| 6153 |
}
|
| 6154 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6155 |
"data/ifeval_best_of_k/nemotron-4-340b-reward.json": {
|
| 6156 |
"all": {
|
| 6157 |
"accuracy": 0.6265624999999999,
|
|
|
|
| 6226 |
"mean_end_score": 0.578125
|
| 6227 |
}
|
| 6228 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6229 |
"data/ifeval_best_of_k/Starling-RM-7B-alpha.json": {
|
| 6230 |
"all": {
|
| 6231 |
"accuracy": 0.5406249999999999,
|
|
|
|
| 6300 |
"mean_end_score": 0.484375
|
| 6301 |
}
|
| 6302 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6303 |
"data/ifeval_best_of_k/Skywork-Reward-Gemma-2-27B.json": {
|
| 6304 |
"all": {
|
| 6305 |
"accuracy": 0.537890625,
|