zhimin-z committed
Commit f670436 · 1 parent: af464aa
remove 5/5
msr.py
CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 import time
-import tempfile
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -24,7 +23,6 @@ load_dotenv()
 # =============================================================================
 
 AGENTS_REPO = "SWE-Arena/bot_metadata"
-REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
 LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
 LEADERBOARD_TIME_FRAME_DAYS = 180
 GHARCHIVE_DATA_DIR = "../gharchive/data"
@@ -453,128 +451,6 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
 # HUGGINGFACE STORAGE FUNCTIONS
 # =============================================================================
 
-def group_metadata_by_date(metadata_list):
-    """Group review metadata by date for daily storage."""
-    grouped = defaultdict(list)
-
-    for review_meta in metadata_list:
-        reviewed_at = review_meta.get('reviewed_at')
-        if not reviewed_at:
-            continue
-
-        try:
-            dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
-            key = (dt.year, dt.month, dt.day)
-            grouped[key].append(review_meta)
-        except Exception as e:
-            print(f"Warning: Could not parse date '{reviewed_at}': {e}")
-
-    return dict(grouped)
-
-
-def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
-    """Upload a single file with exponential backoff retry logic."""
-    for attempt in range(max_retries):
-        try:
-            upload_file_with_backoff(
-                api=api,
-                path_or_fileobj=local_path,
-                path_in_repo=repo_path,
-                repo_id=repo_id,
-                repo_type=repo_type,
-                commit_message=commit_message
-            )
-            return True
-        except Exception as e:
-            print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {UPLOAD_MAX_BACKOFF}s...")
-            time.sleep(UPLOAD_MAX_BACKOFF)
-    return False
-
-
-def batch_upload_review_metadata(all_metadata):
-    """Upload review metadata for all agents with time gaps between uploads."""
-    try:
-        token = get_hf_token()
-        if not token:
-            raise Exception("No HuggingFace token found")
-
-        api = HfApi(token=token)
-
-        success_count = 0
-        error_count = 0
-        total_files = 0
-
-        for agent_identifier, metadata_list in all_metadata.items():
-            if metadata_list:
-                grouped = group_metadata_by_date(metadata_list)
-                total_files += len(grouped)
-
-        print(f"Uploading {total_files} files for {len(all_metadata)} agents...")
-
-        file_count = 0
-
-        for agent_idx, (agent_identifier, metadata_list) in enumerate(all_metadata.items(), 1):
-            if not metadata_list:
-                continue
-
-            grouped = group_metadata_by_date(metadata_list)
-
-            agent_temp_dir = tempfile.mkdtemp()
-
-            try:
-                local_files = []
-                for (review_year, month, day), day_metadata in grouped.items():
-                    filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-                    local_path = os.path.join(agent_temp_dir, filename)
-                    repo_path = f"{agent_identifier}/{filename}"
-
-                    day_metadata.sort(key=lambda x: x.get('reviewed_at', ''), reverse=True)
-                    save_jsonl(local_path, day_metadata)
-                    local_files.append((local_path, repo_path, len(day_metadata)))
-
-                agent_success = 0
-                agent_error = 0
-
-                for file_idx, (local_path, repo_path, review_count) in enumerate(local_files, 1):
-                    file_count += 1
-
-                    if upload_single_file_with_retry(
-                        api=api,
-                        local_path=local_path,
-                        repo_path=repo_path,
-                        repo_id=REVIEW_METADATA_REPO,
-                        repo_type="dataset",
-                        commit_message=f"Update {repo_path}",
-                        max_retries=MAX_RETRIES
-                    ):
-                        agent_success += 1
-                        success_count += 1
-                    else:
-                        agent_error += 1
-                        error_count += 1
-
-                    if file_idx < len(local_files):
-                        time.sleep(UPLOAD_DELAY_SECONDS)
-
-            finally:
-                if os.path.exists(agent_temp_dir):
-                    import shutil
-                    shutil.rmtree(agent_temp_dir)
-
-        if error_count > 0:
-            print(f"Upload complete: {success_count}/{total_files} succeeded, {error_count} errors")
-        else:
-            print(f"Upload complete: {success_count}/{total_files} files")
-
-        return success_count, error_count
-
-    except Exception as e:
-        print(f"Error during batch upload: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return 0, total_files if 'total_files' in locals() else 0
-
-
 def load_agents_from_hf():
     """Load all agent metadata JSON files from HuggingFace dataset."""
     try:
@@ -803,12 +679,12 @@ def mine_all_agents():
     Mine review metadata for all agents using STREAMING batch processing.
     Downloads GHArchive data, then uses BATCH-based DuckDB queries.
     """
-    print(f"\n[1/5] Downloading GHArchive data...")
+    print(f"\n[1/4] Downloading GHArchive data...")
 
     if not download_all_gharchive_data():
         print("Warning: Download had errors, continuing with available data...")
 
-    print(f"\n[2/5] Loading agent metadata...")
+    print(f"\n[2/4] Loading agent metadata...")
 
     agents = load_agents_from_hf()
     if not agents:
@@ -820,7 +696,7 @@ def mine_all_agents():
         print("Error: No valid agent identifiers found")
         return
 
-    print(f"\n[3/5] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
+    print(f"\n[3/4] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
     try:
         conn = get_duckdb_connection()
@@ -846,7 +722,7 @@ def mine_all_agents():
     finally:
         conn.close()
 
-    print(f"\n[4/5] Saving leaderboard...")
+    print(f"\n[4/4] Saving leaderboard...")
 
     try:
         leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
@@ -858,12 +734,6 @@ def mine_all_agents():
         import traceback
         traceback.print_exc()
 
-    print(f"\n[5/5] Uploading review metadata...")
-
-    success_count, error_count = batch_upload_review_metadata(all_metadata)
-
-    print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
-
 
 # =============================================================================
 # SCHEDULER SETUP