zhimin-z committed · commit c1b8cab · 1 parent: fc294eb
refine
msr.py CHANGED
@@ -217,6 +217,7 @@ def get_duckdb_connection():
 def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
     """
     Generate file path patterns for GHArchive data in date range.
+    Only includes files that actually exist on disk.
 
     Args:
         start_date: Start datetime
@@ -224,21 +225,35 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR):
         data_dir: Directory containing GHArchive data files
 
     Returns:
-        List of file path patterns (
+        List of file path patterns (hourly JSON.gz files) that exist
     """
     file_patterns = []
+    missing_dates = set()
 
     current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
     end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
 
     while current_date <= end_day:
-        # Pattern for
-
-
+        # Pattern for hourly JSON.gz files: 2024-11-15-{0..23}.json.gz
+        date_has_files = False
+        for hour in range(24):
+            pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
+            # Only add pattern if file exists
+            if os.path.exists(pattern):
+                file_patterns.append(pattern)
+                date_has_files = True
+
+        # Track missing dates
+        if not date_has_files:
+            missing_dates.add(current_date.strftime('%Y-%m-%d'))
 
         # Move to next day
         current_date += timedelta(days=1)
 
+    # Print warning about missing dates
+    if missing_dates:
+        print(f" Warning: Skipping {len(missing_dates)} date(s) with no data files: {', '.join(sorted(missing_dates))}")
+
     return file_patterns
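For orientation, a minimal usage sketch of the rewritten helper (not part of the commit): it assumes the default GHARCHIVE_DATA_DIR from msr.py points at a local directory of hourly GHArchive dumps named like 2024-11-15-0.json.gz, and the variable names below are illustrative only.

    from datetime import datetime

    # Hypothetical date window; only hourly .json.gz files that actually exist are returned.
    start = datetime(2024, 11, 15)
    end = datetime(2024, 11, 16)
    patterns = generate_file_path_patterns(start, end)
    # Days with no files on disk only trigger the "Warning: Skipping ..." message.
    print(f"{len(patterns)} hourly files found")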
@@ -288,72 +303,72 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     # Build identifier list for IN clause
     identifier_list = ', '.join([f"'{id}'" for id in identifiers])
 
-    # Build comprehensive query with CTEs using parameterized file lists (
+    # Build comprehensive query with CTEs using parameterized file lists (JSON.gz format)
     query = f"""
     WITH review_events AS (
         -- Get all review events for ALL agents
         SELECT
-            payload
+            TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
             COALESCE(
-                payload
-
+                TRY_CAST(json_extract_string(payload, '$.review.submitted_at') AS VARCHAR),
+                TRY_CAST(created_at AS VARCHAR)
             ) as reviewed_at,
-            actor
-            repo
-
-        FROM
+            TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
+            TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
+        FROM read_json($review_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'PullRequestReviewEvent'
-            AND actor
-            AND payload
+            TRY_CAST(type AS VARCHAR) = 'PullRequestReviewEvent'
+            AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
+            AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
 
         UNION ALL
 
         -- Get PR comments (IssueCommentEvent on PRs)
         SELECT
-            payload
-
-            actor
-            repo
-
-        FROM
+            TRY_CAST(json_extract_string(payload, '$.issue.html_url') AS VARCHAR) as url,
+            TRY_CAST(created_at AS VARCHAR) as reviewed_at,
+            TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
+            TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
+            TRY_CAST(json_extract_string(payload, '$.issue.number') AS INTEGER) as pr_number
+        FROM read_json($review_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'IssueCommentEvent'
-            AND actor
-            AND payload
-            AND payload
+            TRY_CAST(type AS VARCHAR) = 'IssueCommentEvent'
+            AND TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) IN ({identifier_list})
+            AND json_extract_string(payload, '$.issue.pull_request.url') IS NOT NULL
+            AND json_extract_string(payload, '$.issue.html_url') IS NOT NULL
 
         UNION ALL
 
         -- Get review comments (PullRequestReviewCommentEvent)
         SELECT
-            payload
-
-            actor
-            repo
-
-        FROM
+            TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
+            TRY_CAST(created_at AS VARCHAR) as reviewed_at,
+            TRY_CAST(json_extract_string(actor, '$.login') AS VARCHAR) as reviewer,
+            TRY_CAST(json_extract_string(repo, '$.name') AS VARCHAR) as repo_name,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
+        FROM read_json($review_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'PullRequestReviewCommentEvent'
-            AND actor
-            AND payload
     ),
 
     pr_status AS (
         -- Get merge/close status for those PRs
         SELECT
-            payload
-
-            payload
-            payload
+            TRY_CAST(json_extract_string(payload, '$.pull_request.html_url') AS VARCHAR) as url,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.merged_at') AS VARCHAR) as merged_at,
+            TRY_CAST(json_extract_string(payload, '$.pull_request.closed_at') AS VARCHAR) as closed_at,
             created_at,
-            ROW_NUMBER() OVER (PARTITION BY payload
-        FROM
+            ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
+        FROM read_json($status_patterns, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
         WHERE
-            type = 'PullRequestEvent'
-            AND payload
-            AND payload
-            AND payload
+            TRY_CAST(type AS VARCHAR) = 'PullRequestEvent'
+            AND TRY_CAST(json_extract_string(payload, '$.action') AS VARCHAR) = 'closed'
+            AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
+            AND json_extract_string(payload, '$.pull_request.html_url') IN (
                 SELECT DISTINCT url FROM review_events
             )
     )
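A hedged sketch (not part of the commit) of how the $review_patterns and $status_patterns placeholders above might be bound when the query runs through DuckDB's Python client, assuming a DuckDB build that accepts named prepared-statement parameters inside read_json. conn, start_date and end_date are the arguments of fetch_all_pr_metadata_single_query; review_files and status_files are illustrative local names. The rn column from pr_status is presumably filtered to rn = 1 further down the query so that only the most recent closed event per PR survives.

    # Sketch only: bind the named parameters to the lists of existing .json.gz paths.
    review_files = generate_file_path_patterns(start_date, end_date)
    status_files = generate_file_path_patterns(start_date, end_date)  # the status window may differ in msr.py
    rows = conn.execute(query, {
        "review_patterns": review_files,
        "status_patterns": status_files,
    }).fetchall()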
@@ -603,11 +618,9 @@ def batch_upload_review_metadata(all_metadata):
                 commit_message=f"Update {repo_path}",
                 max_retries=MAX_RETRIES
             ):
-                print(" ")
                 agent_success += 1
                 success_count += 1
             else:
-                print(" ")
                 agent_error += 1
                 error_count += 1
 
@@ -927,7 +940,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
                 repo_id=LEADERBOARD_REPO,
                 repo_type="dataset"
             )
-            print(" ")
             print(f"Saved leaderboard data to HuggingFace: {filename}")
             return True
         finally:
@@ -936,7 +948,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
             os.remove(filename)
 
     except Exception as e:
-        print(f" ")
         print(f"Error saving leaderboard data: {str(e)}")
         import traceback
         traceback.print_exc()
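The last three hunks simply drop blank progress prints from batch_upload_review_metadata and save_leaderboard_data_to_hf. For readers without the full file, here is a hypothetical sketch of the save-upload-cleanup pattern the excerpt suggests, using huggingface_hub's standard upload_file call; LEADERBOARD_REPO comes from msr.py, while the function name and filename below are illustrative, not the actual implementation.

    import json
    import os
    import traceback
    from huggingface_hub import HfApi

    def save_leaderboard_sketch(leaderboard_dict, filename="leaderboard.json"):
        try:
            # Write the leaderboard to a temporary local file
            with open(filename, "w") as f:
                json.dump(leaderboard_dict, f)
            try:
                # Push the file to the HF dataset repo
                HfApi().upload_file(
                    path_or_fileobj=filename,
                    path_in_repo=filename,
                    repo_id=LEADERBOARD_REPO,
                    repo_type="dataset",
                )
                print(f"Saved leaderboard data to HuggingFace: {filename}")
                return True
            finally:
                # Always remove the local temp file, whether or not the upload succeeded
                if os.path.exists(filename):
                    os.remove(filename)
        except Exception as e:
            print(f"Error saving leaderboard data: {str(e)}")
            traceback.print_exc()
            return False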
|