zhiminy committed on
Commit
0d658c0
·
1 Parent(s): ad9701d

reduce expansion

Browse files
Files changed (2) hide show
  1. app.py +44 -18
  2. msr.py +21 -10
app.py CHANGED
@@ -215,6 +215,41 @@ def get_bigquery_client():
215
  raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
219
  """
220
  Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
@@ -315,7 +350,7 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
315
  NOTE: This function is designed for querying a single agent at a time.
316
  For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
317
 
318
- Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
319
  actor.login matches the agent identifier, and joins with PR status.
320
 
321
  Args:
@@ -330,24 +365,12 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
330
  print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
331
  print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
332
 
333
- # Generate list of table names for each day in the range
334
- review_tables = []
335
- current_date = start_date
336
- while current_date < end_date:
337
- table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
338
- review_tables.append(f"SELECT * FROM {table_name}")
339
- current_date += timedelta(days=1)
340
- review_union = " UNION ALL ".join(review_tables)
341
 
342
- # Generate status tables (lookback for PR status)
343
  status_start = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
344
- status_tables = []
345
- current_date = status_start
346
- while current_date < end_date:
347
- table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
348
- status_tables.append(f"SELECT * FROM {table_name}")
349
- current_date += timedelta(days=1)
350
- status_union = " UNION ALL ".join(status_tables)
351
 
352
  # Build comprehensive query with CTEs for PR status
353
  query = f"""
@@ -400,7 +423,10 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
400
  ]
401
  )
402
 
403
- print(f" Querying {len(review_tables)} review tables and {len(status_tables)} status tables...")
 
 
 
404
 
405
  try:
406
  query_job = client.query(query, job_config=job_config)
 
215
  raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
216
 
217
 
218
def generate_table_union_statements(start_date, end_date):
    """
    Generate UNION ALL statements for githubarchive.month tables in date range.
    Uses monthly tables instead of daily to drastically reduce query size.

    Every calendar month from start_date's month through end_date's month
    (both inclusive) contributes one table. Only the year and month of the
    arguments are considered; the day and any time-of-day are ignored.

    Args:
        start_date: Start datetime
        end_date: End datetime

    Returns:
        String with UNION ALL SELECT statements for all monthly tables in range
        (an empty string when start_date's month is after end_date's month)
    """
    # Compare (year, month) pairs rather than datetimes: replace(day=1) keeps
    # the time-of-day, so a start time later in the day than the end time
    # would otherwise make the final month fail the <= check and be dropped.
    year, month = start_date.year, start_date.month
    last_month = (end_date.year, end_date.month)

    table_names = []
    while (year, month) <= last_month:
        table_names.append(f"`githubarchive.month.{year:04d}{month:02d}`")

        # Move to next month, rolling over into January of the next year
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1

    # Create UNION ALL chain
    return " UNION ALL ".join(f"SELECT * FROM {table}" for table in table_names)
251
+
252
+
253
  def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
254
  """
255
  Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
 
350
  NOTE: This function is designed for querying a single agent at a time.
351
  For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
352
 
353
+ Queries githubarchive.month.YYYYMM tables for PullRequestReviewEvent where
354
  actor.login matches the agent identifier, and joins with PR status.
355
 
356
  Args:
 
365
  print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
366
  print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
367
 
368
+ # Generate monthly table UNION statements for review period
369
+ review_union = generate_table_union_statements(start_date, end_date)
 
 
 
 
 
 
370
 
371
+ # Generate monthly table UNION statements for PR status (lookback)
372
  status_start = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
373
+ status_union = generate_table_union_statements(status_start, end_date)
 
 
 
 
 
 
374
 
375
  # Build comprehensive query with CTEs for PR status
376
  query = f"""
 
423
  ]
424
  )
425
 
426
+ # Calculate months for logging
427
+ review_months = ((end_date.year - start_date.year) * 12 + end_date.month - start_date.month + 1)
428
+ status_months = ((end_date.year - status_start.year) * 12 + end_date.month - status_start.month + 1)
429
+ print(f" Querying {review_months} monthly review tables and {status_months} monthly status tables...")
430
 
431
  try:
432
  query_job = client.query(query, job_config=job_config)
msr.py CHANGED
@@ -204,23 +204,34 @@ def get_bigquery_client():
204
 
205
  def generate_table_union_statements(start_date, end_date):
206
  """
207
- Generate UNION ALL statements for githubarchive.day tables in date range.
208
-
 
209
  Args:
210
  start_date: Start datetime
211
  end_date: End datetime
212
-
213
  Returns:
214
- String with UNION ALL SELECT statements for all tables in range
215
  """
216
  table_names = []
217
- current_date = start_date
218
-
219
- while current_date < end_date:
220
- table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
 
 
 
 
 
221
  table_names.append(table_name)
222
- current_date += timedelta(days=1)
223
-
 
 
 
 
 
224
  # Create UNION ALL chain
225
  union_parts = [f"SELECT * FROM {table}" for table in table_names]
226
  return " UNION ALL ".join(union_parts)
 
204
 
205
def generate_table_union_statements(start_date, end_date):
    """
    Generate UNION ALL statements for githubarchive.month tables in date range.
    Uses monthly tables instead of daily to drastically reduce query size.

    One table is emitted per calendar month, from the month containing
    start_date through the month containing end_date (inclusive). The day
    and time-of-day parts of both arguments do not influence the result.

    Args:
        start_date: Start datetime
        end_date: End datetime

    Returns:
        String with UNION ALL SELECT statements for all monthly tables in range
        (an empty string when start_date's month is after end_date's month)
    """
    table_names = []

    # Track the month as plain integers. Truncating with replace(day=1) would
    # leave hours/minutes in place, and a start time-of-day later than the end
    # time-of-day would then exclude the last month from the range.
    current_year, current_month = start_date.year, start_date.month
    end_year, end_month = end_date.year, end_date.month

    while (current_year, current_month) <= (end_year, end_month):
        table_name = f"`githubarchive.month.{current_year:04d}{current_month:02d}`"
        table_names.append(table_name)

        # Move to next month (December wraps to January of the following year)
        if current_month == 12:
            current_year += 1
            current_month = 1
        else:
            current_month += 1

    # Create UNION ALL chain
    union_parts = [f"SELECT * FROM {table}" for table in table_names]
    return " UNION ALL ".join(union_parts)