zhimin-z committed
Commit c1b8cab · Parent: fc294eb
Files changed (1)
  1. msr.py +59 -48
msr.py CHANGED
@@ -217,6 +217,7 @@ def get_duckdb_connection():
 def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
     """
     Generate file path patterns for GHArchive data in date range.
+    Only includes files that actually exist on disk.
 
     Args:
         start_date: Start datetime
@@ -224,21 +225,35 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
         data_dir: Directory containing GHArchive data files
 
     Returns:
-        List of file path patterns (one per day)
+        List of file path patterns (hourly JSON.gz files) that exist
     """
     file_patterns = []
+    missing_dates = set()
 
     current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
     end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
 
     while current_date <= end_day:
-        # Pattern for daily parquet file: 2024-11-15.parquet
-        pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}.parquet")
-        file_patterns.append(pattern)
+        # Pattern for hourly JSON.gz files: 2024-11-15-{0..23}.json.gz
+        date_has_files = False
+        for hour in range(24):
+            pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
+            # Only add pattern if file exists
+            if os.path.exists(pattern):
+                file_patterns.append(pattern)
+                date_has_files = True
+
+        # Track missing dates
+        if not date_has_files:
+            missing_dates.add(current_date.strftime('%Y-%m-%d'))
 
         # Move to next day
         current_date += timedelta(days=1)
 
+    # Print warning about missing dates
+    if missing_dates:
+        print(f"Warning: Skipping {len(missing_dates)} date(s) with no data files: {', '.join(sorted(missing_dates))}")
+
     return file_patterns
 
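For reference, a minimal sketch of what the reworked loop produces; not part of the commit. It fabricates a data directory holding two hourly files for 2024-11-15 and none for 2024-11-16, then mirrors the loop above:

    import os
    import tempfile
    from datetime import datetime, timedelta

    # Fabricated layout: two hourly files exist for 2024-11-15, none for 2024-11-16.
    data_dir = tempfile.mkdtemp()
    for name in ("2024-11-15-0.json.gz", "2024-11-15-1.json.gz"):
        open(os.path.join(data_dir, name), "w").close()

    patterns, missing_dates = [], set()
    current_date, end_day = datetime(2024, 11, 15), datetime(2024, 11, 16)
    while current_date <= end_day:
        date_has_files = False
        for hour in range(24):
            pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
            if os.path.exists(pattern):  # only existing files are kept
                patterns.append(pattern)
                date_has_files = True
        if not date_has_files:
            missing_dates.add(current_date.strftime('%Y-%m-%d'))
        current_date += timedelta(days=1)

    print(len(patterns), sorted(missing_dates))  # -> 2 ['2024-11-16']

The existence filter matters because DuckDB raises an IO error if any explicitly listed file is missing, so one absent hourly dump would otherwise fail the entire scan.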
@@ -288,72 +303,72 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     # Build identifier list for IN clause
     identifier_list = ', '.join([f"'{id}'" for id in identifiers])
 
-    # Build comprehensive query with CTEs using parameterized file lists (Parquet optimized)
+    # Build comprehensive query with CTEs using parameterized file lists (JSON.gz format)
     query = f"""
     WITH review_events AS (
         -- Get all review events for ALL agents
         SELECT
-            payload.pull_request.html_url as url,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
             COALESCE(
-                payload.review.submitted_at,
-                CAST(created_at AS VARCHAR)
+                TRY_CAST(json_extract_string(payload, '$.review.submitted_at') AS VARCHAR),
+                TRY_CAST(created_at AS VARCHAR)
             ) as reviewed_at,
-            actor.login as reviewer,
-            repo.name as repo_name,
-            CAST(payload.pull_request.number AS INTEGER) as pr_number
-        FROM read_parquet($review_patterns, union_by_name=true, filename=true)
+            TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
+            TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
+        FROM read_json($review_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'PullRequestReviewEvent'
-            AND actor.login IN ({identifier_list})
-            AND payload.pull_request.html_url IS NOT NULL
+            TRY_CAST(type AS VARCHAR) = 'PullRequestReviewEvent'
+            AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
+            AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
 
         UNION ALL
 
         -- Get PR comments (IssueCommentEvent on PRs)
         SELECT
-            payload.issue.html_url as url,
-            CAST(created_at AS VARCHAR) as reviewed_at,
-            actor.login as reviewer,
-            repo.name as repo_name,
-            CAST(payload.issue.number AS INTEGER) as pr_number
-        FROM read_parquet($review_patterns, union_by_name=true, filename=true)
+            TRY_CAST(json_extract_string(payload, '$.issue.html_url') AS VARCHAR) as url,
+            TRY_CAST(created_at AS VARCHAR) as reviewed_at,
+            TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
+            TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
+            TRY_CAST(json_extract_string(payload, '$.issue.number') AS INTEGER) as pr_number
+        FROM read_json($review_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'IssueCommentEvent'
-            AND actor.login IN ({identifier_list})
-            AND payload.issue.pull_request.url IS NOT NULL
-            AND payload.issue.html_url IS NOT NULL
+            TRY_CAST(type AS VARCHAR) = 'IssueCommentEvent'
+            AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
+            AND json_extract_string(payload, '$.issue.pull_request.url') IS NOT NULL
+            AND json_extract_string(payload, '$.issue.html_url') IS NOT NULL
 
         UNION ALL
 
         -- Get review comments (PullRequestReviewCommentEvent)
         SELECT
-            payload.pull_request.html_url as url,
-            CAST(created_at AS VARCHAR) as reviewed_at,
-            actor.login as reviewer,
-            repo.name as repo_name,
-            CAST(payload.pull_request.number AS INTEGER) as pr_number
-        FROM read_parquet($review_patterns, union_by_name=true, filename=true)
+            TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
+            TRY_CAST(created_at AS VARCHAR) as reviewed_at,
+            TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
+            TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
+        FROM read_json($review_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'PullRequestReviewCommentEvent'
-            AND actor.login IN ({identifier_list})
-            AND payload.pull_request.html_url IS NOT NULL
+            TRY_CAST(type AS VARCHAR) = 'PullRequestReviewCommentEvent'
+            AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
+            AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
     ),
 
     pr_status AS (
         -- Get merge/close status for those PRs
         SELECT
-            payload.pull_request.html_url as url,
-            CAST(payload.pull_request.merged AS BOOLEAN) as is_merged,
-            payload.pull_request.merged_at as merged_at,
-            payload.pull_request.closed_at as closed_at,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.merged_at') AS VARCHAR) as merged_at,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
             created_at,
-            ROW_NUMBER() OVER (PARTITION BY payload.pull_request.html_url ORDER BY created_at DESC) as rn
-        FROM read_parquet($status_patterns, union_by_name=true, filename=true)
+            ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
+        FROM read_json($status_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
        WHERE
-            type = 'PullRequestEvent'
-            AND payload.action = 'closed'
-            AND payload.pull_request.html_url IS NOT NULL
-            AND payload.pull_request.html_url IN (
+            TRY_CAST(type AS VARCHAR) = 'PullRequestEvent'
+            AND TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) = 'closed'
+            AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
+            AND json_extract_string(payload, '$.pull_request.html_url') IN (
                SELECT DISTINCT url FROM review_events
            )
    )
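The hunk above switches from read_parquet to DuckDB's read_json over gzipped, newline-delimited GHArchive dumps, pulling fields out of raw JSON with json_extract_string and wrapping casts in TRY_CAST so malformed values become NULLs instead of errors (paired with ignore_errors=true at the scan level). Below is a minimal, self-contained sketch of the same approach; the event shape, file path, and explicit columns= typing are fabricated for illustration (the production query instead relies on schema auto-detection with union_by_name=true, filename=true, and a 2 GiB maximum_object_size). It also shows the Python-side named-parameter binding that a $review_patterns-style placeholder implies:

    import gzip
    import json
    import os
    import tempfile

    import duckdb

    # One fabricated GHArchive-style event in a gzipped NDJSON file.
    event = {
        "type": "PullRequestReviewEvent",
        "actor": {"login": "review-bot"},
        "repo": {"name": "octo/demo"},
        "payload": {"pull_request": {"html_url": "https://github.com/octo/demo/pull/1", "number": 1}},
        "created_at": "2024-11-15T00:30:00Z",
    }
    path = os.path.join(tempfile.mkdtemp(), "2024-11-15-0.json.gz")
    with gzip.open(path, "wt") as f:
        f.write(json.dumps(event) + "\n")

    con = duckdb.connect()
    rows = con.execute("""
        SELECT
            json_extract_string(payload, '$.pull_request.html_url') AS url,
            TRY_CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) AS pr_number,
            json_extract_string(actor, '$.login') AS reviewer
        FROM read_json($files, format='newline_delimited', compression='gzip',
                       columns={'type': 'VARCHAR', 'actor': 'JSON',
                                'repo': 'JSON', 'payload': 'JSON',
                                'created_at': 'VARCHAR'})
        WHERE type = 'PullRequestReviewEvent'
    """, {"files": [path]}).fetchall()  # $files bound from Python, like $review_patterns above
    print(rows)  # [('https://github.com/octo/demo/pull/1', 1, 'review-bot')]

Binding the file list as a prepared-statement parameter (supported for table functions in recent DuckDB versions, which the production query already depends on) keeps the SQL text constant across runs; note that identifier_list is still interpolated directly into the query string, so it remains the one injection-sensitive piece.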
@@ -603,11 +618,9 @@ def batch_upload_review_metadata(all_metadata):
             commit_message=f"Update {repo_path}",
             max_retries=MAX_RETRIES
         ):
-            print(" ")
             agent_success += 1
             success_count += 1
         else:
-            print(" ")
             agent_error += 1
             error_count += 1
 
@@ -927,7 +940,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
                 repo_id=LEADERBOARD_REPO,
                 repo_type="dataset"
             )
-            print(" ")
             print(f"Saved leaderboard data to HuggingFace: {filename}")
             return True
         finally:
@@ -936,7 +948,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
                 os.remove(filename)
 
     except Exception as e:
-        print(f" ")
         print(f"Error saving leaderboard data: {str(e)}")
         import traceback
         traceback.print_exc()
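One more technique in the large query hunk worth calling out: pr_status ranks each PR's 'closed' events newest-first with ROW_NUMBER() OVER (PARTITION BY ... ORDER BY created_at DESC), presumably so a later part of the query (outside this hunk) keeps only rn = 1, i.e. the final merge/close state per PR. A tiny standalone illustration of that dedup pattern on fabricated data:

    import duckdb

    con = duckdb.connect()
    # Two 'closed' events for the same PR; only the most recent should win.
    con.execute("""
        CREATE TABLE events AS
        SELECT * FROM (VALUES
            ('https://github.com/octo/demo/pull/1', false, TIMESTAMP '2024-11-15 01:00:00'),
            ('https://github.com/octo/demo/pull/1', true,  TIMESTAMP '2024-11-15 02:00:00')
        ) t(url, is_merged, created_at)
    """)
    latest = con.execute("""
        SELECT url, is_merged
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY url ORDER BY created_at DESC) AS rn
            FROM events
        )
        WHERE rn = 1
    """).fetchall()
    print(latest)  # [('https://github.com/octo/demo/pull/1', True)]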