zhiminy committed
Commit 0fd77fe · 1 Parent(s): d00dcbf

refine workflow

Files changed (2):
  1. app.py +39 -55
  2. msr.py +38 -57
app.py CHANGED
@@ -208,7 +208,7 @@ def get_bigquery_client():
         raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
 
 
-def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large queries.
@@ -219,12 +219,19 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
         batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch to HuggingFace immediately after processing (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of PR metadata
     """
     print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
 
+    # Log upload mode
+    if upload_immediately:
+        print(f"  📤 Upload mode: IMMEDIATE (upload after each batch)")
+    else:
+        print(f"  📤 Upload mode: DEFERRED (upload after all batches complete)")
+
     # Split identifiers into batches
     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
     total_batches = len(batches)
@@ -241,6 +248,7 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
 
         try:
             # Query this batch - process each agent in the batch
+            batch_results = {}
             for identifier in batch_identifiers:
                 review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)
 
@@ -258,10 +266,26 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
 
                 if metadata_list:
                     all_metadata[identifier] = metadata_list
+                    batch_results[identifier] = metadata_list
 
             successful_batches += 1
             print(f"  ✓ Batch {batch_num}/{total_batches} complete: {len(batch_identifiers)} agents processed")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n  📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_review_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f"  ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             failed_batches += 1
             print(f"  ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
@@ -2086,68 +2110,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
         all_metadata = fetch_all_pr_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f"  Total agents: {len(agents)}")
+        print(f"  Agents with data: {agents_with_data}")
+        print(f"  Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f"  💾 Saving {len(metadata)} review records...")
-                if save_review_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f"  No reviews found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f"  ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f"  Total agents: {len(agents)}")
-    print(f"  Successfully saved: {success_count}")
-    print(f"  No data (skipped): {no_data_count}")
-    print(f"  Errors: {error_count}")
-    print(f"  BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # After mining is complete, save leaderboard and metrics to HuggingFace
     print(f"📤 Uploading leaderboard and metrics data...")
     if save_leaderboard_and_metrics_to_hf():
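
For orientation, the control flow this commit moves into fetch_all_pr_metadata_batched can be sketched in isolation. This is a minimal, self-contained sketch, not the repository code: fetch_one and upload_one are hypothetical stand-ins for fetch_reviews_from_bigquery and save_review_metadata_to_hf.

# Sketch of the batched fetch + per-batch immediate upload introduced above.
# fetch_one/upload_one are hypothetical stand-ins for the repo's
# fetch_reviews_from_bigquery and save_review_metadata_to_hf.
def fetch_one(identifier):
    return [{"agent": identifier, "pr": 1}]  # pretend one review per agent

def upload_one(metadata_list, identifier):
    return True  # pretend the HuggingFace upload succeeds

def fetch_all_batched(identifiers, batch_size=50, upload_immediately=True):
    all_metadata = {}
    # Same slicing as the diff: splits identifiers into ceil(n/batch_size) batches.
    batches = [identifiers[i:i + batch_size]
               for i in range(0, len(identifiers), batch_size)]
    for batch_num, batch in enumerate(batches, 1):
        batch_results = {}
        for identifier in batch:
            metadata_list = fetch_one(identifier)
            if metadata_list:
                all_metadata[identifier] = metadata_list
                batch_results[identifier] = metadata_list
        # Uploading per batch (instead of once at the end) preserves
        # partial progress if a later batch fails.
        if upload_immediately:
            for identifier, metadata_list in batch_results.items():
                upload_one(metadata_list, identifier)
        print(f"batch {batch_num}/{len(batches)}: {len(batch)} agents done")
    return all_metadata

if __name__ == "__main__":
    fetch_all_batched([f"agent-{n}" for n in range(120)])  # 3 batches: 50+50+20
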
msr.py CHANGED
@@ -222,7 +222,7 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large queries.
@@ -232,13 +232,20 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents to process per batch (default: 100)
+        batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch to HuggingFace immediately after processing (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of PR metadata (same format as single query)
     """
     print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
 
+    # Log upload mode
+    if upload_immediately:
+        print(f"  📤 Upload mode: IMMEDIATE (upload after each batch)")
+    else:
+        print(f"  📤 Upload mode: DEFERRED (upload after all batches complete)")
+
     # Split identifiers into batches
     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
     total_batches = len(batches)
@@ -269,6 +276,21 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         successful_batches += 1
         print(f"  ✓ Batch {batch_num}/{total_batches} complete: {len(batch_results)} agents processed")
 
+        # Upload immediately after this batch if enabled
+        if upload_immediately and batch_results:
+            print(f"\n  📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+            upload_success = 0
+            upload_errors = 0
+
+            for identifier, metadata_list in batch_results.items():
+                if metadata_list:
+                    if save_review_metadata_to_hf(metadata_list, identifier):
+                        upload_success += 1
+                    else:
+                        upload_errors += 1
+
+            print(f"  ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
     except Exception as e:
         failed_batches += 1
         print(f"  ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
@@ -1004,68 +1026,27 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_all_pr_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f"  Total agents: {len(agents)}")
+        print(f"  Agents with data: {agents_with_data}")
+        print(f"  Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
-
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f"  💾 Saving {len(metadata)} review records...")
-                if save_review_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f"  No reviews found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f"  ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    total_identifiers = len(identifiers)
-    batch_size = 50
-    num_batches = (total_identifiers + batch_size - 1) // batch_size  # Ceiling division
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f"  Total agents: {len(agents)}")
-    print(f"  Successfully saved: {success_count}")
-    print(f"  No data (skipped): {no_data_count}")
-    print(f"  Errors: {error_count}")
-    print(f"  BigQuery batches executed: {num_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
 
     # Construct and save leaderboard data
     print(f"\n{'='*80}")