zhimin-z committed on
Commit f670436 · 1 Parent(s): af464aa

remove 5/5

Files changed (1)
  1. msr.py +4 -134
msr.py CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 import time
-import tempfile
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -24,7 +23,6 @@ load_dotenv()
 # =============================================================================
 
 AGENTS_REPO = "SWE-Arena/bot_metadata"
-REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
 LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
 LEADERBOARD_TIME_FRAME_DAYS = 180
 GHARCHIVE_DATA_DIR = "../gharchive/data"
@@ -453,128 +451,6 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
 # HUGGINGFACE STORAGE FUNCTIONS
 # =============================================================================
 
-def group_metadata_by_date(metadata_list):
-    """Group review metadata by date for daily storage."""
-    grouped = defaultdict(list)
-
-    for review_meta in metadata_list:
-        reviewed_at = review_meta.get('reviewed_at')
-        if not reviewed_at:
-            continue
-
-        try:
-            dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
-            key = (dt.year, dt.month, dt.day)
-            grouped[key].append(review_meta)
-        except Exception as e:
-            print(f"Warning: Could not parse date '{reviewed_at}': {e}")
-
-    return dict(grouped)
-
-
-def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
-    """Upload a single file with exponential backoff retry logic."""
-    for attempt in range(max_retries):
-        try:
-            upload_file_with_backoff(
-                api=api,
-                path_or_fileobj=local_path,
-                path_in_repo=repo_path,
-                repo_id=repo_id,
-                repo_type=repo_type,
-                commit_message=commit_message
-            )
-            return True
-        except Exception as e:
-            print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {UPLOAD_MAX_BACKOFF}s...")
-            time.sleep(UPLOAD_MAX_BACKOFF)
-    return False
-
-
-def batch_upload_review_metadata(all_metadata):
-    """Upload review metadata for all agents with time gaps between uploads."""
-    try:
-        token = get_hf_token()
-        if not token:
-            raise Exception("No HuggingFace token found")
-
-        api = HfApi(token=token)
-
-        success_count = 0
-        error_count = 0
-        total_files = 0
-
-        for agent_identifier, metadata_list in all_metadata.items():
-            if metadata_list:
-                grouped = group_metadata_by_date(metadata_list)
-                total_files += len(grouped)
-
-        print(f"Uploading {total_files} files for {len(all_metadata)} agents...")
-
-        file_count = 0
-
-        for agent_idx, (agent_identifier, metadata_list) in enumerate(all_metadata.items(), 1):
-            if not metadata_list:
-                continue
-
-            grouped = group_metadata_by_date(metadata_list)
-
-            agent_temp_dir = tempfile.mkdtemp()
-
-            try:
-                local_files = []
-                for (review_year, month, day), day_metadata in grouped.items():
-                    filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-                    local_path = os.path.join(agent_temp_dir, filename)
-                    repo_path = f"{agent_identifier}/{filename}"
-
-                    day_metadata.sort(key=lambda x: x.get('reviewed_at', ''), reverse=True)
-                    save_jsonl(local_path, day_metadata)
-                    local_files.append((local_path, repo_path, len(day_metadata)))
-
-                agent_success = 0
-                agent_error = 0
-
-                for file_idx, (local_path, repo_path, review_count) in enumerate(local_files, 1):
-                    file_count += 1
-
-                    if upload_single_file_with_retry(
-                        api=api,
-                        local_path=local_path,
-                        repo_path=repo_path,
-                        repo_id=REVIEW_METADATA_REPO,
-                        repo_type="dataset",
-                        commit_message=f"Update {repo_path}",
-                        max_retries=MAX_RETRIES
-                    ):
-                        agent_success += 1
-                        success_count += 1
-                    else:
-                        agent_error += 1
-                        error_count += 1
-
-                    if file_idx < len(local_files):
-                        time.sleep(UPLOAD_DELAY_SECONDS)
-
-            finally:
-                if os.path.exists(agent_temp_dir):
-                    import shutil
-                    shutil.rmtree(agent_temp_dir)
-
-        if error_count > 0:
-            print(f"Upload complete: {success_count}/{total_files} succeeded, {error_count} errors")
-        else:
-            print(f"Upload complete: {success_count}/{total_files} files")
-
-        return success_count, error_count
-
-    except Exception as e:
-        print(f"Error during batch upload: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return 0, total_files if 'total_files' in locals() else 0
-
-
 def load_agents_from_hf():
     """Load all agent metadata JSON files from HuggingFace dataset."""
     try:
@@ -803,12 +679,12 @@ def mine_all_agents():
     Mine review metadata for all agents using STREAMING batch processing.
     Downloads GHArchive data, then uses BATCH-based DuckDB queries.
     """
-    print(f"\n[1/5] Downloading GHArchive data...")
+    print(f"\n[1/4] Downloading GHArchive data...")
 
     if not download_all_gharchive_data():
         print("Warning: Download had errors, continuing with available data...")
 
-    print(f"\n[2/5] Loading agent metadata...")
+    print(f"\n[2/4] Loading agent metadata...")
 
     agents = load_agents_from_hf()
     if not agents:
@@ -820,7 +696,7 @@ def mine_all_agents():
         print("Error: No valid agent identifiers found")
         return
 
-    print(f"\n[3/5] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
+    print(f"\n[3/4] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
     try:
         conn = get_duckdb_connection()
@@ -846,7 +722,7 @@ def mine_all_agents():
     finally:
         conn.close()
 
-    print(f"\n[4/5] Saving leaderboard...")
+    print(f"\n[4/4] Saving leaderboard...")
 
     try:
         leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
@@ -858,12 +734,6 @@ def mine_all_agents():
         import traceback
         traceback.print_exc()
 
-    print(f"\n[5/5] Uploading review metadata...")
-
-    success_count, error_count = batch_upload_review_metadata(all_metadata)
-
-    print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
-
 
 # =============================================================================
 # SCHEDULER SETUP
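
For reference, the removed group_metadata_by_date helper bucketed review records by the calendar day of their reviewed_at timestamp before one JSONL file per day was written. Below is a minimal, self-contained sketch of that grouping behavior; the sample records are hypothetical and not part of the repository.

from collections import defaultdict
from datetime import datetime

def group_by_day(records):
    # Bucket records by the (year, month, day) of their 'reviewed_at' ISO timestamp,
    # mirroring the helper removed in this commit; records without a timestamp are skipped.
    grouped = defaultdict(list)
    for rec in records:
        reviewed_at = rec.get('reviewed_at')
        if not reviewed_at:
            continue
        dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
        grouped[(dt.year, dt.month, dt.day)].append(rec)
    return dict(grouped)

# Hypothetical sample input, purely for illustration.
sample = [
    {'reviewed_at': '2024-06-01T12:34:56Z', 'pr': 1},
    {'reviewed_at': '2024-06-01T23:59:59Z', 'pr': 2},
    {'reviewed_at': '2024-06-02T00:00:01Z', 'pr': 3},
]
print(group_by_day(sample))
# {(2024, 6, 1): [<pr 1>, <pr 2>], (2024, 6, 2): [<pr 3>]}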
 
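The removed upload path also retried each HuggingFace upload with a fixed wait between attempts (upload_single_file_with_retry). The sketch below shows that retry pattern in generic form; the stand-in upload callable and the constant values are assumptions for illustration, replacing the script's upload_file_with_backoff, MAX_RETRIES, and UPLOAD_MAX_BACKOFF.

import time

MAX_RETRIES = 3        # assumed value for illustration; the script defines its own constant
BACKOFF_SECONDS = 30   # assumed value for illustration; the script uses UPLOAD_MAX_BACKOFF

def upload_with_retry(upload, *args, max_retries=MAX_RETRIES, backoff=BACKOFF_SECONDS, **kwargs):
    # Call `upload`; on any exception, wait `backoff` seconds and try again,
    # up to `max_retries` attempts, returning True on success and False otherwise.
    for attempt in range(1, max_retries + 1):
        try:
            upload(*args, **kwargs)
            return True
        except Exception as exc:
            print(f"{exc} on attempt {attempt}/{max_retries}; retrying in {backoff}s...")
            time.sleep(backoff)
    return False

# Usage with a hypothetical uploader that fails once and then succeeds.
calls = {'count': 0}

def flaky_upload(path):
    calls['count'] += 1
    if calls['count'] < 2:
        raise RuntimeError("transient network error")

print(upload_with_retry(flaky_upload, "2024.06.01.jsonl", backoff=0))  # prints True after one retry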