zhimin-z committed
Commit f670436 · 1 parent: af464aa
remove 5/5
msr.py
CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 import time
-import tempfile
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -24,7 +23,6 @@ load_dotenv()
 # =============================================================================
 
 AGENTS_REPO = "SWE-Arena/bot_metadata"
-REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
 LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
 LEADERBOARD_TIME_FRAME_DAYS = 180
 GHARCHIVE_DATA_DIR = "../gharchive/data"
@@ -453,128 +451,6 @@ def fetch_all_review_metadata_streaming(conn, identifiers, start_date, end_date)
 # HUGGINGFACE STORAGE FUNCTIONS
 # =============================================================================
 
-def group_metadata_by_date(metadata_list):
-    """Group review metadata by date for daily storage."""
-    grouped = defaultdict(list)
-
-    for review_meta in metadata_list:
-        reviewed_at = review_meta.get('reviewed_at')
-        if not reviewed_at:
-            continue
-
-        try:
-            dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
-            key = (dt.year, dt.month, dt.day)
-            grouped[key].append(review_meta)
-        except Exception as e:
-            print(f"Warning: Could not parse date '{reviewed_at}': {e}")
-
-    return dict(grouped)
-
-
-def upload_single_file_with_retry(api, local_path, repo_path, repo_id, repo_type, commit_message, max_retries=MAX_RETRIES):
-    """Upload a single file with exponential backoff retry logic."""
-    for attempt in range(max_retries):
-        try:
-            upload_file_with_backoff(
-                api=api,
-                path_or_fileobj=local_path,
-                path_in_repo=repo_path,
-                repo_id=repo_id,
-                repo_type=repo_type,
-                commit_message=commit_message
-            )
-            return True
-        except Exception as e:
-            print(f" {e} error on attempt {attempt + 1}/{max_retries}. Retrying in {UPLOAD_MAX_BACKOFF}s...")
-            time.sleep(UPLOAD_MAX_BACKOFF)
-    return False
-
-
-def batch_upload_review_metadata(all_metadata):
-    """Upload review metadata for all agents with time gaps between uploads."""
-    try:
-        token = get_hf_token()
-        if not token:
-            raise Exception("No HuggingFace token found")
-
-        api = HfApi(token=token)
-
-        success_count = 0
-        error_count = 0
-        total_files = 0
-
-        for agent_identifier, metadata_list in all_metadata.items():
-            if metadata_list:
-                grouped = group_metadata_by_date(metadata_list)
-                total_files += len(grouped)
-
-        print(f"Uploading {total_files} files for {len(all_metadata)} agents...")
-
-        file_count = 0
-
-        for agent_idx, (agent_identifier, metadata_list) in enumerate(all_metadata.items(), 1):
-            if not metadata_list:
-                continue
-
-            grouped = group_metadata_by_date(metadata_list)
-
-            agent_temp_dir = tempfile.mkdtemp()
-
-            try:
-                local_files = []
-                for (review_year, month, day), day_metadata in grouped.items():
-                    filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-                    local_path = os.path.join(agent_temp_dir, filename)
-                    repo_path = f"{agent_identifier}/{filename}"
-
-                    day_metadata.sort(key=lambda x: x.get('reviewed_at', ''), reverse=True)
-                    save_jsonl(local_path, day_metadata)
-                    local_files.append((local_path, repo_path, len(day_metadata)))
-
-                agent_success = 0
-                agent_error = 0
-
-                for file_idx, (local_path, repo_path, review_count) in enumerate(local_files, 1):
-                    file_count += 1
-
-                    if upload_single_file_with_retry(
-                        api=api,
-                        local_path=local_path,
-                        repo_path=repo_path,
-                        repo_id=REVIEW_METADATA_REPO,
-                        repo_type="dataset",
-                        commit_message=f"Update {repo_path}",
-                        max_retries=MAX_RETRIES
-                    ):
-                        agent_success += 1
-                        success_count += 1
-                    else:
-                        agent_error += 1
-                        error_count += 1
-
-                    if file_idx < len(local_files):
-                        time.sleep(UPLOAD_DELAY_SECONDS)
-
-            finally:
-                if os.path.exists(agent_temp_dir):
-                    import shutil
-                    shutil.rmtree(agent_temp_dir)
-
-        if error_count > 0:
-            print(f"Upload complete: {success_count}/{total_files} succeeded, {error_count} errors")
-        else:
-            print(f"Upload complete: {success_count}/{total_files} files")
-
-        return success_count, error_count
-
-    except Exception as e:
-        print(f"Error during batch upload: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return 0, total_files if 'total_files' in locals() else 0
-
-
 def load_agents_from_hf():
     """Load all agent metadata JSON files from HuggingFace dataset."""
     try:
@@ -803,12 +679,12 @@ def mine_all_agents():
     Mine review metadata for all agents using STREAMING batch processing.
     Downloads GHArchive data, then uses BATCH-based DuckDB queries.
     """
-    print(f"\n[1/5] Downloading GHArchive data...")
+    print(f"\n[1/4] Downloading GHArchive data...")
 
     if not download_all_gharchive_data():
         print("Warning: Download had errors, continuing with available data...")
 
-    print(f"\n[2/5] Loading agent metadata...")
+    print(f"\n[2/4] Loading agent metadata...")
 
     agents = load_agents_from_hf()
     if not agents:
@@ -820,7 +696,7 @@ def mine_all_agents():
         print("Error: No valid agent identifiers found")
         return
 
-    print(f"\n[3/5] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
+    print(f"\n[3/4] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
     try:
         conn = get_duckdb_connection()
@@ -846,7 +722,7 @@ def mine_all_agents():
     finally:
         conn.close()
 
-    print(f"\n[4/5] Saving leaderboard...")
+    print(f"\n[4/4] Saving leaderboard...")
 
     try:
         leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
@@ -858,12 +734,6 @@ def mine_all_agents():
         import traceback
         traceback.print_exc()
 
-    print(f"\n[5/5] Uploading review metadata...")
-
-    success_count, error_count = batch_upload_review_metadata(all_metadata)
-
-    print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
-
 
 # =============================================================================
 # SCHEDULER SETUP