backoff

- app.py +147 -33
- msr.py +79 -8
- requirements.txt +1 -0
app.py
CHANGED
@@ -8,8 +8,10 @@ import requests
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
 from datasets import load_dataset, Dataset
 import threading
+import backoff
 from dotenv import load_dotenv
 import pandas as pd
 import random
@@ -105,6 +107,73 @@ def normalize_date_format(date_string):
     return date_string


+# =============================================================================
+# HUGGINGFACE API WRAPPERS WITH BACKOFF
+# =============================================================================
+
+def is_rate_limit_error(e):
+    """Check if exception is a HuggingFace rate limit error (429)."""
+    if isinstance(e, HfHubHTTPError):
+        return e.response.status_code == 429
+    return False
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def upload_large_folder_with_backoff(api, **kwargs):
+    """Wrapper for api.upload_large_folder() with exponential backoff for rate limits."""
+    return api.upload_large_folder(**kwargs)
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def list_repo_files_with_backoff(api, **kwargs):
+    """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
+    return api.list_repo_files(**kwargs)
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def hf_hub_download_with_backoff(**kwargs):
+    """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
+    return hf_hub_download(**kwargs)
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def upload_file_with_backoff(api, **kwargs):
+    """Wrapper for api.upload_file() with exponential backoff for rate limits."""
+    return api.upload_file(**kwargs)
+
+
 # =============================================================================
 # BIGQUERY FUNCTIONS
 # =============================================================================
@@ -216,7 +285,7 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
     For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.

     Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
-    actor.login matches the agent identifier.
+    actor.login matches the agent identifier, and joins with PR status.

     Args:
         client: BigQuery client instance
@@ -225,37 +294,74 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
         end_date: End datetime (timezone-aware)

     Returns:
-        List of review event rows with PR information
+        List of review event rows with PR information including merged_at and closed_at
     """
     print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
     print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

     # Generate list of table names for each day in the range
-    ...
+    review_tables = []
     current_date = start_date
     while current_date < end_date:
-        table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
-        ...
+        table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
+        review_tables.append(f"SELECT * FROM {table_name}")
+        current_date += timedelta(days=1)
+    review_union = " UNION ALL ".join(review_tables)
+
+    # Generate status tables (lookback for PR status)
+    status_start = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
+    status_tables = []
+    current_date = status_start
+    while current_date < end_date:
+        table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
+        status_tables.append(f"SELECT * FROM {table_name}")
         current_date += timedelta(days=1)
+    status_union = " UNION ALL ".join(status_tables)

-    # Build ...
-    ...
-    union_parts.append(f"""
+    # Build comprehensive query with CTEs for PR status
+    query = f"""
+    WITH review_events AS (
         SELECT
-            ...
+            JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as url,
+            COALESCE(
+                JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at'),
+                CAST(created_at AS STRING)
+            ) as reviewed_at,
+            actor.login as reviewer,
             created_at
-        FROM ...
+        FROM (
+            {review_union}
+        )
         WHERE type = 'PullRequestReviewEvent'
         AND actor.login = @identifier
-        AND JSON_EXTRACT_SCALAR(payload, '$.pull_request. ...
-        ...
+        AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
+    ),
+    pr_status AS (
+        SELECT
+            JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as url,
+            JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
+            JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
+            created_at
+        FROM (
+            {status_union}
+        )
+        WHERE type = 'PullRequestEvent'
+        AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'closed'
+        AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IN (
+            SELECT DISTINCT url FROM review_events
+        )
+        QUALIFY ROW_NUMBER() OVER (PARTITION BY url ORDER BY created_at DESC) = 1
+    )
+    SELECT DISTINCT
+        re.url,
+        re.reviewed_at,
+        re.created_at,
+        ps.merged_at,
+        ps.closed_at
+    FROM review_events re
+    LEFT JOIN pr_status ps ON re.url = ps.url
+    ORDER BY re.reviewed_at DESC
+    """

     job_config = bigquery.QueryJobConfig(
         query_parameters=[
@@ -263,7 +369,7 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
         ]
     )

-    print(f"   Querying {len( ...
+    print(f"   Querying {len(review_tables)} review tables and {len(status_tables)} status tables...")

     try:
         query_job = client.query(query, job_config=job_config)
@@ -1233,7 +1339,8 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
         # Upload entire folder using upload_large_folder (optimized for large files)
         # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
         print(f"📤 Uploading {len(grouped)} files...")
-        api.upload_large_folder(
+        upload_large_folder_with_backoff(
+            api=api,
             folder_path=temp_dir,
             repo_id=REVIEW_METADATA_REPO,
             repo_type="dataset"
@@ -1273,7 +1380,7 @@ def load_review_metadata():
     token = get_hf_token()

     # List all files in the repository
-    files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")

     # Filter for files matching the pattern: [agent_identifier]/YYYY.MM.DD.jsonl
     # AND within the time frame (parse date from filename)
@@ -1315,7 +1422,7 @@ def load_review_metadata():
             agent_identifier = parts[0]
             agent_identifiers_found.add(agent_identifier)

-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=REVIEW_METADATA_REPO,
                 filename=filename,
                 repo_type="dataset",
@@ -1371,7 +1478,7 @@ def get_latest_review_date_for_agent(agent_identifier):
     token = get_hf_token()

     # List all files in the repository
-    files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")

     # Filter for files in this agent's folder
     # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
@@ -1385,7 +1492,7 @@ def get_latest_review_date_for_agent(agent_identifier):
     latest_date = None
     for filename in agent_files:
         try:
-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=REVIEW_METADATA_REPO,
                 filename=filename,
                 repo_type="dataset",
@@ -1430,7 +1537,7 @@ def get_daily_files_last_time_frame(agent_identifier):
     cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

     # List all files in the repository
-    files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")

     # Filter for files in this agent's folder
     agent_pattern = f"{agent_identifier}/"
@@ -1639,7 +1746,7 @@ def load_agents_from_hf():
     agents = []

     # List all files in the repository
-    files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")

     # Filter for JSON files only
     json_files = [f for f in files if f.endswith('.json')]
@@ -1647,7 +1754,7 @@ def load_agents_from_hf():
     # Download and parse each JSON file
     for json_file in json_files:
         try:
-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=AGENTS_REPO,
                 filename=json_file,
                 repo_type="dataset"
@@ -1656,6 +1763,11 @@ def load_agents_from_hf():
             with open(file_path, 'r') as f:
                 agent_data = json.load(f)

+            # Only process agents with status == "public"
+            if agent_data.get('status') != 'public':
+                print(f"Skipping {json_file}: status is not 'public'")
+                continue
+
             # Extract github_identifier from filename (e.g., "claude[bot].json" -> "claude[bot]")
             filename_identifier = json_file.replace('.json', '')

@@ -1839,7 +1951,7 @@ def load_leaderboard_data_from_hf():
         filename = "swe-review.json"

         # Download file
-        file_path = hf_hub_download(
+        file_path = hf_hub_download_with_backoff(
            repo_id=LEADERBOARD_REPO,
            filename=filename,
            repo_type="dataset",
@@ -1910,7 +2022,8 @@ def save_leaderboard_and_metrics_to_hf():

         # Upload to HuggingFace (will overwrite if exists)
         print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
-        api.upload_file(
+        upload_file_with_backoff(
+            api=api,
             path_or_fileobj=file_like_object,
             path_in_repo="swe-review.json",
             repo_id=LEADERBOARD_REPO,
@@ -2019,7 +2132,8 @@ def construct_leaderboard_from_metadata():
         stats = calculate_review_stats_from_metadata(agent_metadata)

         cache_dict[identifier] = {
-            ' ...
+            'name': agent_name,
+            'name': agent_name,  # Store both for compatibility
             'website': agent.get('website', 'N/A'),
             'github_identifier': identifier,
             **stats
@@ -2232,7 +2346,7 @@ def get_leaderboard_dataframe():

         # Only include display-relevant fields
         rows.append([
-            data.get(' ...
+            data.get('name', 'Unknown'),
             data.get('website', 'N/A'),
             total_reviews,
             data.get('merged_prs', 0),
@@ -2297,7 +2411,7 @@ def submit_agent(identifier, agent_name, developer, website):

     # Create submission
     submission = {
-        ' ...
+        'name': agent_name,
         'developer': developer,
        'github_identifier': identifier,
        'website': website,
msr.py
CHANGED
@@ -9,8 +9,10 @@ import tempfile
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
 from dotenv import load_dotenv
 from google.cloud import bigquery
+import backoff

 # Load environment variables
 load_dotenv()
@@ -95,6 +97,73 @@ def get_hf_token():
     return token


+# =============================================================================
+# HUGGINGFACE API WRAPPERS WITH BACKOFF
+# =============================================================================
+
+def is_rate_limit_error(e):
+    """Check if exception is a HuggingFace rate limit error (429)."""
+    if isinstance(e, HfHubHTTPError):
+        return e.response.status_code == 429
+    return False
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def upload_large_folder_with_backoff(api, **kwargs):
+    """Wrapper for api.upload_large_folder() with exponential backoff for rate limits."""
+    return api.upload_large_folder(**kwargs)
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def list_repo_files_with_backoff(api, **kwargs):
+    """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
+    return api.list_repo_files(**kwargs)
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def hf_hub_download_with_backoff(**kwargs):
+    """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
+    return hf_hub_download(**kwargs)
+
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=8,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
+    )
+)
+def upload_file_with_backoff(api, **kwargs):
+    """Wrapper for api.upload_file() with exponential backoff for rate limits."""
+    return api.upload_file(**kwargs)
+
+
 def get_bigquery_client():
     """
     Initialize BigQuery client using credentials from environment variable.
@@ -490,7 +559,8 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
         # Upload entire folder using upload_large_folder (optimized for large files)
         # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
         print(f"   📤 Uploading {len(grouped)} files ({len(metadata_list)} total reviews)...")
-        api.upload_large_folder(
+        upload_large_folder_with_backoff(
+            api=api,
             folder_path=temp_dir,
             repo_id=REVIEW_METADATA_REPO,
             repo_type="dataset"
@@ -522,7 +592,7 @@ def load_agents_from_hf():
     agents = []

     # List all files in the repository
-    files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")

     # Filter for JSON files only
     json_files = [f for f in files if f.endswith('.json')]
@@ -532,7 +602,7 @@ def load_agents_from_hf():
     # Download and parse each JSON file
     for json_file in json_files:
         try:
-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=AGENTS_REPO,
                 filename=json_file,
                 repo_type="dataset"
@@ -580,7 +650,7 @@ def load_review_metadata():
     token = get_hf_token()

     # List all files in the repository
-    files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")

     # Filter for JSONL files matching pattern: [agent_identifier]/YYYY.MM.DD.jsonl
     time_frame_files = []
@@ -616,7 +686,7 @@ def load_review_metadata():

             agent_identifier = parts[0]

-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=REVIEW_METADATA_REPO,
                 filename=filename,
                 repo_type="dataset",
@@ -825,7 +895,7 @@ def construct_leaderboard_from_metadata():
         stats = calculate_review_stats_from_metadata(agent_metadata)

         cache_dict[identifier] = {
-            ' ...
+            'name': agent_name,
             'name': agent_name,
             'website': agent.get('website', 'N/A'),
             'github_identifier': identifier,
@@ -872,7 +942,8 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):

     try:
         # Upload to HuggingFace
-        api.upload_file(
+        upload_file_with_backoff(
+            api=api,
             path_or_fileobj=filename,
             path_in_repo=filename,
             repo_id=LEADERBOARD_REPO,
@@ -953,7 +1024,7 @@ def mine_all_agents():

     for i, agent in enumerate(agents, 1):
         identifier = agent.get('github_identifier')
-        agent_name = agent.get('name', ...
+        agent_name = agent.get('name', 'Unknown')

         if not identifier:
             print(f"[{i}/{len(agents)}] Skipping agent without identifier")
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 APScheduler
+backoff
 datasets
 db-dtypes
 google-cloud-bigquery
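Aside: requirements.txt now pulls in the backoff library that the new *_with_backoff wrappers in app.py and msr.py are built on. A self-contained sketch of the retry behaviour those decorator arguments produce; FakeRateLimit stands in for HfHubHTTPError so the example runs offline, and the attempt counter only exists to make the demo terminate:

```python
import backoff

class FakeRateLimit(Exception):
    """Stand-in for an HTTP 429 rate-limit error (illustration only)."""

def is_rate_limit_error(e):
    # The real wrappers check e.response.status_code == 429 on HfHubHTTPError.
    return isinstance(e, FakeRateLimit)

attempts = {"n": 0}

@backoff.on_exception(
    backoff.expo,                                  # exponential wait (with jitter) between tries
    FakeRateLimit,
    max_tries=8,                                   # stop after 8 attempts total
    giveup=lambda e: not is_rate_limit_error(e),   # only retry rate-limit errors
    on_backoff=lambda d: print(f"retrying in {d['wait']:.1f}s (attempt {d['tries']}/8)"),
)
def flaky_call():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise FakeRateLimit("429 Too Many Requests")
    return "ok"

print(flaky_call())  # prints two retry messages, then "ok"
```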