refine workflow
app.py
CHANGED
@@ -208,7 +208,7 @@ def get_bigquery_client():
         raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
 
 
-def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large queries.
@@ -219,12 +219,19 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
         batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch to HuggingFace immediately after processing (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of PR metadata
     """
     print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
 
+    # Log upload mode
+    if upload_immediately:
+        print(f"   📤 Upload mode: IMMEDIATE (upload after each batch)")
+    else:
+        print(f"   📤 Upload mode: DEFERRED (upload after all batches complete)")
+
     # Split identifiers into batches
     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
     total_batches = len(batches)
@@ -241,6 +248,7 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
 
         try:
             # Query this batch - process each agent in the batch
+            batch_results = {}
             for identifier in batch_identifiers:
                 review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)
 
@@ -258,10 +266,26 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
 
                 if metadata_list:
                     all_metadata[identifier] = metadata_list
+                    batch_results[identifier] = metadata_list
 
             successful_batches += 1
             print(f"   ✅ Batch {batch_num}/{total_batches} complete: {len(batch_identifiers)} agents processed")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n   📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_review_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f"   ✅ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             failed_batches += 1
             print(f"   ❌ Batch {batch_num}/{total_batches} failed: {str(e)}")
@@ -2086,68 +2110,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_all_pr_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
        )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f"   Total agents: {len(agents)}")
+        print(f"   Agents with data: {agents_with_data}")
+        print(f"   Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"❌ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f"   💾 Saving {len(metadata)} review records...")
-                if save_review_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f"   No reviews found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f"   ❌ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f"   Total agents: {len(agents)}")
-    print(f"   Successfully saved: {success_count}")
-    print(f"   No data (skipped): {no_data_count}")
-    print(f"   Errors: {error_count}")
-    print(f"   BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # After mining is complete, save leaderboard and metrics to HuggingFace
     print(f"📤 Uploading leaderboard and metrics data...")
     if save_leaderboard_and_metrics_to_hf():
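The heart of the app.py change is the batch-then-upload loop: identifiers are sliced into fixed-size batches, each batch is queried, and, when upload_immediately is enabled, that batch's results are pushed to HuggingFace before the next batch starts, so an interrupted run loses at most one batch of uploads. The following is a minimal sketch of that control flow under stated assumptions: fetch_one and upload_one are hypothetical stand-ins for fetch_reviews_from_bigquery and save_review_metadata_to_hf, and the in-memory fakes at the bottom exist only to make the sketch runnable.

# Minimal sketch of the batch-then-upload pattern added in this diff.
# fetch_one / upload_one are hypothetical stand-ins for the real
# BigQuery fetch and HuggingFace upload helpers used in app.py.
from typing import Callable, Dict, List

def fetch_in_batches(identifiers: List[str],
                     fetch_one: Callable[[str], List[dict]],
                     upload_one: Callable[[List[dict], str], bool],
                     batch_size: int = 50,
                     upload_immediately: bool = True) -> Dict[str, List[dict]]:
    # Same slicing as the diff: ceil(len(identifiers) / batch_size) batches
    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
    all_metadata: Dict[str, List[dict]] = {}

    for batch_num, batch in enumerate(batches, 1):
        batch_results: Dict[str, List[dict]] = {}
        for identifier in batch:
            rows = fetch_one(identifier)
            if rows:
                all_metadata[identifier] = rows
                batch_results[identifier] = rows

        # Upload this batch right away so a later crash loses at most one batch
        if upload_immediately and batch_results:
            uploaded = sum(1 for ident, rows in batch_results.items() if upload_one(rows, ident))
            print(f"batch {batch_num}/{len(batches)}: uploaded {uploaded}/{len(batch_results)} agents")

    return all_metadata

if __name__ == "__main__":
    fake_store = {f"agent-{i}": [{"pr": i}] for i in range(7)}
    result = fetch_in_batches(list(fake_store), fake_store.get, lambda rows, ident: True, batch_size=3)
    assert len(result) == 7  # 3 batches of 3 + 3 + 1 agents

The trade-off versus the removed post-mining save loop is visible in the summary output: upload errors are now reported per batch rather than tallied into success/no-data/error counts at the end of the run.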
msr.py
CHANGED
@@ -222,7 +222,7 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large queries.
@@ -232,13 +232,20 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents to process per batch (default: 50)
+        batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch to HuggingFace immediately after processing (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of PR metadata (same format as single query)
     """
     print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
 
+    # Log upload mode
+    if upload_immediately:
+        print(f"   📤 Upload mode: IMMEDIATE (upload after each batch)")
+    else:
+        print(f"   📤 Upload mode: DEFERRED (upload after all batches complete)")
+
     # Split identifiers into batches
     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
     total_batches = len(batches)
@@ -269,6 +276,21 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
             successful_batches += 1
             print(f"   ✅ Batch {batch_num}/{total_batches} complete: {len(batch_results)} agents processed")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n   📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_review_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f"   ✅ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             failed_batches += 1
             print(f"   ❌ Batch {batch_num}/{total_batches} failed: {str(e)}")
@@ -1004,68 +1026,27 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_all_pr_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
        )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f"   Total agents: {len(agents)}")
+        print(f"   Agents with data: {agents_with_data}")
+        print(f"   Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"❌ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
-
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f"   💾 Saving {len(metadata)} review records...")
-                if save_review_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f"   No reviews found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f"   ❌ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    total_identifiers = len(identifiers)
-    batch_size = 50
-    num_batches = (total_identifiers + batch_size - 1) // batch_size  # Ceiling division
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f"   Total agents: {len(agents)}")
-    print(f"   Successfully saved: {success_count}")
-    print(f"   No data (skipped): {no_data_count}")
-    print(f"   Errors: {error_count}")
-    print(f"   BigQuery batches executed: {num_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
 
     # Construct and save leaderboard data
     print(f"\n{'='*80}")