Spaces:
Running
Running
reduce expansion
Browse files
app.py
CHANGED
|
@@ -215,6 +215,41 @@ def get_bigquery_client():
|
|
| 215 |
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
|
| 216 |
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
|
| 219 |
"""
|
| 220 |
Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
|
|
@@ -315,7 +350,7 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
|
|
| 315 |
NOTE: This function is designed for querying a single agent at a time.
|
| 316 |
For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
|
| 317 |
|
| 318 |
-
Queries githubarchive.
|
| 319 |
actor.login matches the agent identifier, and joins with PR status.
|
| 320 |
|
| 321 |
Args:
|
|
@@ -330,24 +365,12 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
|
|
| 330 |
print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
|
| 331 |
print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
|
| 332 |
|
| 333 |
-
# Generate
|
| 334 |
-
|
| 335 |
-
current_date = start_date
|
| 336 |
-
while current_date < end_date:
|
| 337 |
-
table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
|
| 338 |
-
review_tables.append(f"SELECT * FROM {table_name}")
|
| 339 |
-
current_date += timedelta(days=1)
|
| 340 |
-
review_union = " UNION ALL ".join(review_tables)
|
| 341 |
|
| 342 |
-
# Generate
|
| 343 |
status_start = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
|
| 344 |
-
|
| 345 |
-
current_date = status_start
|
| 346 |
-
while current_date < end_date:
|
| 347 |
-
table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
|
| 348 |
-
status_tables.append(f"SELECT * FROM {table_name}")
|
| 349 |
-
current_date += timedelta(days=1)
|
| 350 |
-
status_union = " UNION ALL ".join(status_tables)
|
| 351 |
|
| 352 |
# Build comprehensive query with CTEs for PR status
|
| 353 |
query = f"""
|
|
@@ -400,7 +423,10 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
|
|
| 400 |
]
|
| 401 |
)
|
| 402 |
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
try:
|
| 406 |
query_job = client.query(query, job_config=job_config)
|
|
|
|
| 215 |
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
|
| 216 |
|
| 217 |
|
| 218 |
+
def generate_table_union_statements(start_date, end_date):
    """
    Generate UNION ALL statements for githubarchive.month tables in date range.

    Uses monthly tables instead of daily to drastically reduce query size.
    Every calendar month touched by the range is included, from the month of
    start_date through the month of end_date (both inclusive).

    Only the year and month of each argument are used. Day and time-of-day
    are ignored, so the month of end_date is never dropped because start_date
    happens to carry a later clock time than end_date.

    Args:
        start_date: Start datetime (any object with .year and .month)
        end_date: End datetime (any object with .year and .month)

    Returns:
        String with UNION ALL SELECT statements for all monthly tables in
        range. Empty string when start_date's month is after end_date's month.
    """
    # Flatten (year, month) into one integer so stepping to the next month,
    # including the December -> January rollover, is a plain +1.
    first_month = start_date.year * 12 + (start_date.month - 1)
    last_month = end_date.year * 12 + (end_date.month - 1)

    union_parts = []
    for month_index in range(first_month, last_month + 1):
        year, month_offset = divmod(month_index, 12)
        table_name = f"`githubarchive.month.{year:04d}{month_offset + 1:02d}`"
        union_parts.append(f"SELECT * FROM {table_name}")

    # Create UNION ALL chain
    return " UNION ALL ".join(union_parts)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
|
| 254 |
"""
|
| 255 |
Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
|
|
|
|
| 350 |
NOTE: This function is designed for querying a single agent at a time.
|
| 351 |
For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
|
| 352 |
|
| 353 |
+
Queries githubarchive.month.YYYYMM tables for PullRequestReviewEvent where
|
| 354 |
actor.login matches the agent identifier, and joins with PR status.
|
| 355 |
|
| 356 |
Args:
|
|
|
|
| 365 |
print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
|
| 366 |
print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
|
| 367 |
|
| 368 |
+
# Generate monthly table UNION statements for review period
|
| 369 |
+
review_union = generate_table_union_statements(start_date, end_date)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
+
# Generate monthly table UNION statements for PR status (lookback)
|
| 372 |
status_start = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
|
| 373 |
+
status_union = generate_table_union_statements(status_start, end_date)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
# Build comprehensive query with CTEs for PR status
|
| 376 |
query = f"""
|
|
|
|
| 423 |
]
|
| 424 |
)
|
| 425 |
|
| 426 |
+
# Calculate months for logging
|
| 427 |
+
review_months = ((end_date.year - start_date.year) * 12 + end_date.month - start_date.month + 1)
|
| 428 |
+
status_months = ((end_date.year - status_start.year) * 12 + end_date.month - status_start.month + 1)
|
| 429 |
+
print(f" Querying {review_months} monthly review tables and {status_months} monthly status tables...")
|
| 430 |
|
| 431 |
try:
|
| 432 |
query_job = client.query(query, job_config=job_config)
|
msr.py
CHANGED
|
@@ -204,23 +204,34 @@ def get_bigquery_client():
|
|
| 204 |
|
| 205 |
def generate_table_union_statements(start_date, end_date):
|
| 206 |
"""
|
| 207 |
-
Generate UNION ALL statements for githubarchive.
|
| 208 |
-
|
|
|
|
| 209 |
Args:
|
| 210 |
start_date: Start datetime
|
| 211 |
end_date: End datetime
|
| 212 |
-
|
| 213 |
Returns:
|
| 214 |
-
String with UNION ALL SELECT statements for all tables in range
|
| 215 |
"""
|
| 216 |
table_names = []
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
table_names.append(table_name)
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
# Create UNION ALL chain
|
| 225 |
union_parts = [f"SELECT * FROM {table}" for table in table_names]
|
| 226 |
return " UNION ALL ".join(union_parts)
|
|
|
|
| 204 |
|
| 205 |
def generate_table_union_statements(start_date, end_date):
    """
    Generate UNION ALL statements for githubarchive.month tables in date range.

    Uses monthly tables instead of daily to drastically reduce query size.
    Every calendar month touched by the range is included, from the month of
    start_date through the month of end_date (both inclusive).

    Only the year and month of each argument are used. Day and time-of-day
    are ignored, so the month of end_date is never dropped because start_date
    happens to carry a later clock time than end_date.

    Args:
        start_date: Start datetime (any object with .year and .month)
        end_date: End datetime (any object with .year and .month)

    Returns:
        String with UNION ALL SELECT statements for all monthly tables in
        range. Empty string when start_date's month is after end_date's month.
    """
    # Flatten (year, month) into one integer so stepping to the next month,
    # including the December -> January rollover, is a plain +1.
    first_month = start_date.year * 12 + (start_date.month - 1)
    last_month = end_date.year * 12 + (end_date.month - 1)

    union_parts = []
    for month_index in range(first_month, last_month + 1):
        year, month_offset = divmod(month_index, 12)
        table_name = f"`githubarchive.month.{year:04d}{month_offset + 1:02d}`"
        union_parts.append(f"SELECT * FROM {table_name}")

    # Create UNION ALL chain
    return " UNION ALL ".join(union_parts)
|