zhimin-z committed on
Commit d34dfc3 · 1 Parent(s): 9a4a0ec
Files changed (1)
  1. msr.py +77 -43
msr.py CHANGED
@@ -33,6 +33,7 @@ REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
 LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
 LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
 GHARCHIVE_DATA_DIR = "../gharchive/data" # Local GHArchive data directory
+DUCKDB_CACHE_FILE = "gharchive_cache.duckdb" # Persistent DuckDB database for caching
 
 # Upload configuration
 UPLOAD_DELAY_SECONDS = 2 # Delay between individual file uploads to avoid rate limits
@@ -205,16 +206,13 @@ def upload_folder_with_backoff(api, **kwargs):
 
 def get_duckdb_connection():
     """
-    Initialize DuckDB connection with JSON support and optimized parallelization.
+    Initialize DuckDB connection with persistent database and optimized parallelization.
 
     Returns:
         DuckDB connection object
     """
-    conn = duckdb.connect(':memory:')
-
-    # Enable JSON extension if needed
-    conn.execute("INSTALL json;")
-    conn.execute("LOAD json;")
+    # Use persistent database for caching results
+    conn = duckdb.connect(DUCKDB_CACHE_FILE)
 
     # Optimize for 96-core CPU parallelization with 754GB RAM
     conn.execute("SET threads TO 48;") # Use all available cores
@@ -245,8 +243,8 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR
     end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
 
     while current_date <= end_day:
-        # Pattern for all hours in this day: 2024-11-15-*.json.gz
-        pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-*.json.gz")
+        # Pattern for all hours in this day: 2024-11-15-*.parquet
+        pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-*.parquet")
         file_patterns.append(pattern)
 
         # Move to next day
@@ -301,72 +299,72 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     # Build identifier list for IN clause
     identifier_list = ', '.join([f"'{id}'" for id in identifiers])
 
-    # Build comprehensive query with CTEs using parameterized file lists
+    # Build comprehensive query with CTEs using parameterized file lists (Parquet optimized)
     query = f"""
         WITH review_events AS (
             -- Get all review events for ALL agents
             SELECT
-                json_extract_string(payload, '$.pull_request.html_url') as url,
+                payload.pull_request.html_url as url,
                 COALESCE(
-                    json_extract_string(payload, '$.review.submitted_at'),
+                    payload.review.submitted_at,
                     CAST(created_at AS VARCHAR)
                 ) as reviewed_at,
-                json_extract_string(actor, '$.login') as reviewer,
-                json_extract_string(repo, '$.name') as repo_name,
-                CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
-            FROM read_json_auto($review_patterns, ignore_errors=true, union_by_name=true)
+                actor.login as reviewer,
+                repo.name as repo_name,
+                CAST(payload.pull_request.number AS INTEGER) as pr_number
+            FROM read_parquet($review_patterns, union_by_name=true, filename=true)
             WHERE
                 type = 'PullRequestReviewEvent'
-                AND json_extract_string(actor, '$.login') IN ({identifier_list})
-                AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
+                AND actor.login IN ({identifier_list})
+                AND payload.pull_request.html_url IS NOT NULL
 
             UNION ALL
 
            -- Get PR comments (IssueCommentEvent on PRs)
            SELECT
-                json_extract_string(payload, '$.issue.html_url') as url,
+                payload.issue.html_url as url,
                CAST(created_at AS VARCHAR) as reviewed_at,
-                json_extract_string(actor, '$.login') as reviewer,
-                json_extract_string(repo, '$.name') as repo_name,
-                CAST(json_extract_string(payload, '$.issue.number') AS INTEGER) as pr_number
-            FROM read_json_auto($review_patterns, ignore_errors=true, union_by_name=true)
+                actor.login as reviewer,
+                repo.name as repo_name,
+                CAST(payload.issue.number AS INTEGER) as pr_number
+            FROM read_parquet($review_patterns, union_by_name=true, filename=true)
            WHERE
                type = 'IssueCommentEvent'
-                AND json_extract_string(actor, '$.login') IN ({identifier_list})
-                AND json_extract_string(payload, '$.issue.pull_request.url') IS NOT NULL
-                AND json_extract_string(payload, '$.issue.html_url') IS NOT NULL
+                AND actor.login IN ({identifier_list})
+                AND payload.issue.pull_request.url IS NOT NULL
+                AND payload.issue.html_url IS NOT NULL
 
            UNION ALL
 
           -- Get review comments (PullRequestReviewCommentEvent)
           SELECT
-                json_extract_string(payload, '$.pull_request.html_url') as url,
+                payload.pull_request.html_url as url,
               CAST(created_at AS VARCHAR) as reviewed_at,
-                json_extract_string(actor, '$.login') as reviewer,
-                json_extract_string(repo, '$.name') as repo_name,
-                CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
-            FROM read_json_auto($review_patterns, ignore_errors=true, union_by_name=true)
+                actor.login as reviewer,
+                repo.name as repo_name,
+                CAST(payload.pull_request.number AS INTEGER) as pr_number
+            FROM read_parquet($review_patterns, union_by_name=true, filename=true)
           WHERE
              type = 'PullRequestReviewCommentEvent'
-                AND json_extract_string(actor, '$.login') IN ({identifier_list})
-                AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
+                AND actor.login IN ({identifier_list})
+                AND payload.pull_request.html_url IS NOT NULL
        ),
 
        pr_status AS (
            -- Get merge/close status for those PRs
            SELECT
-                json_extract_string(payload, '$.pull_request.html_url') as url,
-                CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
-                json_extract_string(payload, '$.pull_request.merged_at') as merged_at,
-                json_extract_string(payload, '$.pull_request.closed_at') as closed_at,
+                payload.pull_request.html_url as url,
+                CAST(payload.pull_request.merged AS BOOLEAN) as is_merged,
+                payload.pull_request.merged_at as merged_at,
+                payload.pull_request.closed_at as closed_at,
                created_at,
-                ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
-            FROM read_json_auto($status_patterns, ignore_errors=true, union_by_name=true)
+                ROW_NUMBER() OVER (PARTITION BY payload.pull_request.html_url ORDER BY created_at DESC) as rn
+            FROM read_parquet($status_patterns, union_by_name=true, filename=true)
            WHERE
               type = 'PullRequestEvent'
-                AND json_extract_string(payload, '$.action') = 'closed'
-                AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
-                AND json_extract_string(payload, '$.pull_request.html_url') IN (
+                AND payload.action = 'closed'
+                AND payload.pull_request.html_url IS NOT NULL
+                AND payload.pull_request.html_url IN (
                    SELECT DISTINCT url FROM review_events
                )
        )
@@ -391,8 +389,44 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
 
     try:
-        # Execute query with parameters
-        results = conn.execute(query, {'review_patterns': review_patterns, 'status_patterns': status_patterns}).fetchall()
+        # Create cache table name based on date range
+        cache_table_name = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
+
+        # Check if cache exists and is valid
+        cache_exists = conn.execute(f"""
+            SELECT COUNT(*) FROM information_schema.tables
+            WHERE table_name = '{cache_table_name}'
+        """).fetchone()[0] > 0
+
+        if cache_exists:
+            print(f" Using cached results from table {cache_table_name}")
+            results = conn.execute(f"""
+                SELECT reviewer, url, reviewed_at, merged_at, closed_at
+                FROM {cache_table_name}
+                WHERE reviewer IN ({identifier_list})
+            """).fetchall()
+        else:
+            print(f" Cache miss - executing full query and caching to {cache_table_name}")
+            # Execute query with parameters
+            results = conn.execute(query, {'review_patterns': review_patterns, 'status_patterns': status_patterns}).fetchall()
+
+            # Cache the complete results for all future queries in this date range
+            if len(results) > 0:
+                conn.execute(f"""
+                    CREATE TABLE {cache_table_name} AS
+                    SELECT * FROM (
+                        SELECT UNNEST($1) as reviewer, UNNEST($2) as url,
+                               UNNEST($3) as reviewed_at, UNNEST($4) as merged_at,
+                               UNNEST($5) as closed_at
+                    )
+                """, [
+                    [r[0] for r in results],
+                    [r[1] for r in results],
+                    [r[2] for r in results],
+                    [r[3] for r in results],
+                    [r[4] for r in results]
+                ])
+                print(f" Cached {len(results)} results to {cache_table_name}")
 
         print(f" Found {len(results)} total PR review records across all agents")
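
A note on the struct-style field access introduced in the rewritten query: DuckDB exposes nested Parquet columns as STRUCT values, so dot notation such as payload.pull_request.html_url takes the place of json_extract_string. Below is a minimal, self-contained sketch of that pattern; the inline struct literals merely stand in for a GHArchive Parquet file, and all names are illustrative.

import duckdb

# Tiny stand-in table with the same nesting as a GHArchive event record.
conn = duckdb.connect()  # in-memory connection, for illustration only
conn.execute("""
    CREATE TABLE events AS SELECT
        'PullRequestReviewEvent' AS type,
        {'login': 'review-bot'} AS actor,
        {'pull_request': {'html_url': 'https://github.com/org/repo/pull/42',
                          'number': 42}} AS payload
""")

# Struct fields are addressed with dot notation, mirroring the query above.
row = conn.execute("""
    SELECT payload.pull_request.html_url AS url,
           CAST(payload.pull_request.number AS INTEGER) AS pr_number
    FROM events
    WHERE type = 'PullRequestReviewEvent'
      AND actor.login = 'review-bot'
""").fetchone()
print(row)  # ('https://github.com/org/repo/pull/42', 42)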
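
The caching branch added in the last hunk materializes the query results by binding Python lists as parameters and unnesting them side by side, one output row per element position. A small sketch of that behaviour, assuming equal-length lists and made-up values:

import duckdb

conn = duckdb.connect()  # in-memory connection, for illustration only

# Parallel UNNEST calls over equal-length list parameters are zipped into rows.
rows = conn.execute(
    "SELECT UNNEST($1) AS reviewer, UNNEST($2) AS url",
    [
        ["alice-bot", "bob-bot"],
        ["https://github.com/org/repo/pull/1", "https://github.com/org/repo/pull/2"],
    ],
).fetchall()
print(rows)
# [('alice-bot', 'https://github.com/org/repo/pull/1'),
#  ('bob-bot', 'https://github.com/org/repo/pull/2')]

In the committed code the five lists are all built from the same result tuples, which keeps the cached columns aligned row by row.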