zhiminy committed
Commit 0fd77fe · 1 Parent(s): d00dcbf

refine workflow

Files changed (2):
  1. app.py +39 -55
  2. msr.py +38 -57
app.py CHANGED
@@ -208,7 +208,7 @@ def get_bigquery_client():
         raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
 
 
-def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large queries.
@@ -219,12 +219,19 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
         batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch to HuggingFace immediately after processing (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of PR metadata
     """
     print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
 
+    # Log upload mode
+    if upload_immediately:
+        print(f"  📤 Upload mode: IMMEDIATE (upload after each batch)")
+    else:
+        print(f"  📤 Upload mode: DEFERRED (upload after all batches complete)")
+
     # Split identifiers into batches
     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
     total_batches = len(batches)
@@ -241,6 +248,7 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
 
         try:
             # Query this batch - process each agent in the batch
+            batch_results = {}
             for identifier in batch_identifiers:
                 review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)
 
@@ -258,10 +266,26 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
 
                 if metadata_list:
                     all_metadata[identifier] = metadata_list
+                    batch_results[identifier] = metadata_list
 
             successful_batches += 1
             print(f"  ✓ Batch {batch_num}/{total_batches} complete: {len(batch_identifiers)} agents processed")
 
+            # Upload immediately after this batch if enabled
+            if upload_immediately and batch_results:
+                print(f"\n  📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+                upload_success = 0
+                upload_errors = 0
+
+                for identifier, metadata_list in batch_results.items():
+                    if metadata_list:
+                        if save_review_metadata_to_hf(metadata_list, identifier):
+                            upload_success += 1
+                        else:
+                            upload_errors += 1
+
+                print(f"  ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
         except Exception as e:
             failed_batches += 1
             print(f"  ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
@@ -2086,68 +2110,28 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
         all_metadata = fetch_all_pr_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f"  Total agents: {len(agents)}")
+        print(f"  Agents with data: {agents_with_data}")
+        print(f"  Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
 
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f"  💾 Saving {len(metadata)} review records...")
-                if save_review_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f"  No reviews found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f"  ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    batch_size = 50
-    total_batches = (len(identifiers) + batch_size - 1) // batch_size
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f"  Total agents: {len(agents)}")
-    print(f"  Successfully saved: {success_count}")
-    print(f"  No data (skipped): {no_data_count}")
-    print(f"  Errors: {error_count}")
-    print(f"  BigQuery batches executed: {total_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
-
     # After mining is complete, save leaderboard and metrics to HuggingFace
     print(f"📤 Uploading leaderboard and metrics data...")
     if save_leaderboard_and_metrics_to_hf():
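
For orientation, the control flow this commit moves into fetch_all_pr_metadata_batched can be sketched in isolation. This is a minimal, self-contained sketch, not the repository code: fetch_one and upload_one are hypothetical stand-ins for fetch_reviews_from_bigquery and save_review_metadata_to_hf.

# Sketch of the batched fetch + per-batch immediate upload introduced above.
# fetch_one/upload_one are hypothetical stand-ins for the repo's
# fetch_reviews_from_bigquery and save_review_metadata_to_hf.
def fetch_one(identifier):
    return [{"agent": identifier, "pr": 1}]  # pretend one review per agent

def upload_one(metadata_list, identifier):
    return True  # pretend the HuggingFace upload succeeds

def fetch_all_batched(identifiers, batch_size=50, upload_immediately=True):
    all_metadata = {}
    # Same slicing as the diff: splits identifiers into ceil(n/batch_size) batches.
    batches = [identifiers[i:i + batch_size]
               for i in range(0, len(identifiers), batch_size)]
    for batch_num, batch in enumerate(batches, 1):
        batch_results = {}
        for identifier in batch:
            metadata_list = fetch_one(identifier)
            if metadata_list:
                all_metadata[identifier] = metadata_list
                batch_results[identifier] = metadata_list
        # Uploading per batch (instead of once at the end) preserves
        # partial progress if a later batch fails.
        if upload_immediately:
            for identifier, metadata_list in batch_results.items():
                upload_one(metadata_list, identifier)
        print(f"batch {batch_num}/{len(batches)}: {len(batch)} agents done")
    return all_metadata

if __name__ == "__main__":
    fetch_all_batched([f"agent-{n}" for n in range(120)])  # 3 batches: 50+50+20
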
msr.py CHANGED
@@ -222,7 +222,7 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
     """
     Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
     Splits agents into smaller batches to avoid performance issues with large queries.
@@ -232,13 +232,20 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents to process per batch (default: 100)
+        batch_size: Number of agents to process per batch (default: 50)
+        upload_immediately: If True, upload each batch to HuggingFace immediately after processing (default: True)
 
     Returns:
         Dictionary mapping agent identifier to list of PR metadata (same format as single query)
     """
     print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
 
+    # Log upload mode
+    if upload_immediately:
+        print(f"  📤 Upload mode: IMMEDIATE (upload after each batch)")
+    else:
+        print(f"  📤 Upload mode: DEFERRED (upload after all batches complete)")
+
     # Split identifiers into batches
     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
     total_batches = len(batches)
@@ -269,6 +276,21 @@ def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, bat
         successful_batches += 1
         print(f"  ✓ Batch {batch_num}/{total_batches} complete: {len(batch_results)} agents processed")
 
+        # Upload immediately after this batch if enabled
+        if upload_immediately and batch_results:
+            print(f"\n  📤 Uploading batch {batch_num}/{total_batches} results to HuggingFace...")
+            upload_success = 0
+            upload_errors = 0
+
+            for identifier, metadata_list in batch_results.items():
+                if metadata_list:
+                    if save_review_metadata_to_hf(metadata_list, identifier):
+                        upload_success += 1
+                    else:
+                        upload_errors += 1
+
+            print(f"  ✓ Batch {batch_num}/{total_batches} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
+
     except Exception as e:
         failed_batches += 1
         print(f"  ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
@@ -1004,68 +1026,27 @@ def mine_all_agents():
 
     try:
         # Use batched approach for better performance
+        # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
        all_metadata = fetch_all_pr_metadata_batched(
-            client, identifiers, start_date, end_date, batch_size=50
+            client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
         )
+
+        # Calculate summary statistics
+        total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
+        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
+
+        print(f"\n{'='*80}")
+        print(f"✅ BigQuery mining and upload complete!")
+        print(f"  Total agents: {len(agents)}")
+        print(f"  Agents with data: {agents_with_data}")
+        print(f"  Total PRs found: {total_prs}")
+        print(f"{'='*80}\n")
+
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
         import traceback
         traceback.print_exc()
         return
-
-    # Save results for each agent
-    print(f"\n{'='*80}")
-    print(f"💾 Saving results to HuggingFace for each agent...")
-    print(f"{'='*80}\n")
-
-    success_count = 0
-    error_count = 0
-    no_data_count = 0
-
-    for i, agent in enumerate(agents, 1):
-        identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', 'Unknown')
-
-        if not identifier:
-            print(f"[{i}/{len(agents)}] Skipping agent without identifier")
-            error_count += 1
-            continue
-
-        metadata = all_metadata.get(identifier, [])
-
-        print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
-
-        try:
-            if metadata:
-                print(f"  💾 Saving {len(metadata)} review records...")
-                if save_review_metadata_to_hf(metadata, identifier):
-                    success_count += 1
-                else:
-                    error_count += 1
-            else:
-                print(f"  No reviews found")
-                no_data_count += 1
-
-        except Exception as e:
-            print(f"  ✗ Error saving {identifier}: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            error_count += 1
-            continue
-
-    # Calculate number of batches
-    total_identifiers = len(identifiers)
-    batch_size = 50
-    num_batches = (total_identifiers + batch_size - 1) // batch_size  # Ceiling division
-
-    print(f"\n{'='*80}")
-    print(f"✅ Mining complete!")
-    print(f"  Total agents: {len(agents)}")
-    print(f"  Successfully saved: {success_count}")
-    print(f"  No data (skipped): {no_data_count}")
-    print(f"  Errors: {error_count}")
-    print(f"  BigQuery batches executed: {num_batches} (batch size: {batch_size})")
-    print(f"{'='*80}\n")
 
     # Construct and save leaderboard data
     print(f"\n{'='*80}")