zhiminy committed
Commit f1f4f5c · 1 Parent(s): 313696f
Files changed (3)
  1. app.py +147 -33
  2. msr.py +79 -8
  3. requirements.txt +1 -0
app.py CHANGED
@@ -8,8 +8,10 @@ import requests
8
  from datetime import datetime, timezone, timedelta
9
  from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
 
11
  from datasets import load_dataset, Dataset
12
  import threading
 
13
  from dotenv import load_dotenv
14
  import pandas as pd
15
  import random
@@ -105,6 +107,73 @@ def normalize_date_format(date_string):
105
  return date_string
106

107

108
  # =============================================================================
109
  # BIGQUERY FUNCTIONS
110
  # =============================================================================
@@ -216,7 +285,7 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
216
  For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
217
 
218
  Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
219
- actor.login matches the agent identifier.
220
 
221
  Args:
222
  client: BigQuery client instance
@@ -225,37 +294,74 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
225
  end_date: End datetime (timezone-aware)
226
 
227
  Returns:
228
- List of review event rows with PR information
229
  """
230
  print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
231
  print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
232
 
233
  # Generate list of table names for each day in the range
234
- table_refs = []
235
  current_date = start_date
236
  while current_date < end_date:
237
- table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
238
- table_refs.append(table_name)
239
  current_date += timedelta(days=1)
 
240
 
241
- # Build UNION ALL query for all daily tables
242
- union_parts = []
243
- for table_name in table_refs:
244
- union_parts.append(f"""
245
  SELECT
246
- repo.name as repo_name,
247
- actor.login as actor_login,
248
- JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') as url,
249
- CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) as pr_number,
250
- JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at') as reviewed_at,
 
251
  created_at
252
- FROM `{table_name}`
 
 
253
  WHERE type = 'PullRequestReviewEvent'
254
  AND actor.login = @identifier
255
- AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') IS NOT NULL
256
- """)
257
-
258
- query = " UNION ALL ".join(union_parts)
259
 
260
  job_config = bigquery.QueryJobConfig(
261
  query_parameters=[
@@ -263,7 +369,7 @@ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
263
  ]
264
  )
265
 
266
- print(f" Querying {len(table_refs)} daily tables...")
267
 
268
  try:
269
  query_job = client.query(query, job_config=job_config)
@@ -1233,7 +1339,8 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
1233
  # Upload entire folder using upload_large_folder (optimized for large files)
1234
  # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
1235
  print(f"📤 Uploading {len(grouped)} files...")
1236
- api.upload_large_folder(
 
1237
  folder_path=temp_dir,
1238
  repo_id=REVIEW_METADATA_REPO,
1239
  repo_type="dataset"
@@ -1273,7 +1380,7 @@ def load_review_metadata():
1273
  token = get_hf_token()
1274
 
1275
  # List all files in the repository
1276
- files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
1277
 
1278
  # Filter for files matching the pattern: [agent_identifier]/YYYY.MM.DD.jsonl
1279
  # AND within the time frame (parse date from filename)
@@ -1315,7 +1422,7 @@ def load_review_metadata():
1315
  agent_identifier = parts[0]
1316
  agent_identifiers_found.add(agent_identifier)
1317
 
1318
- file_path = hf_hub_download(
1319
  repo_id=REVIEW_METADATA_REPO,
1320
  filename=filename,
1321
  repo_type="dataset",
@@ -1371,7 +1478,7 @@ def get_latest_review_date_for_agent(agent_identifier):
1371
  token = get_hf_token()
1372
 
1373
  # List all files in the repository
1374
- files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
1375
 
1376
  # Filter for files in this agent's folder
1377
  # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
@@ -1385,7 +1492,7 @@ def get_latest_review_date_for_agent(agent_identifier):
1385
  latest_date = None
1386
  for filename in agent_files:
1387
  try:
1388
- file_path = hf_hub_download(
1389
  repo_id=REVIEW_METADATA_REPO,
1390
  filename=filename,
1391
  repo_type="dataset",
@@ -1430,7 +1537,7 @@ def get_daily_files_last_time_frame(agent_identifier):
1430
  cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1431
 
1432
  # List all files in the repository
1433
- files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
1434
 
1435
  # Filter for files in this agent's folder
1436
  agent_pattern = f"{agent_identifier}/"
@@ -1639,7 +1746,7 @@ def load_agents_from_hf():
1639
  agents = []
1640
 
1641
  # List all files in the repository
1642
- files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
1643
 
1644
  # Filter for JSON files only
1645
  json_files = [f for f in files if f.endswith('.json')]
@@ -1647,7 +1754,7 @@ def load_agents_from_hf():
1647
  # Download and parse each JSON file
1648
  for json_file in json_files:
1649
  try:
1650
- file_path = hf_hub_download(
1651
  repo_id=AGENTS_REPO,
1652
  filename=json_file,
1653
  repo_type="dataset"
@@ -1656,6 +1763,11 @@ def load_agents_from_hf():
1656
  with open(file_path, 'r') as f:
1657
  agent_data = json.load(f)
1658

1659
  # Extract github_identifier from filename (e.g., "claude[bot].json" -> "claude[bot]")
1660
  filename_identifier = json_file.replace('.json', '')
1661
 
@@ -1839,7 +1951,7 @@ def load_leaderboard_data_from_hf():
1839
  filename = "swe-review.json"
1840
 
1841
  # Download file
1842
- file_path = hf_hub_download(
1843
  repo_id=LEADERBOARD_REPO,
1844
  filename=filename,
1845
  repo_type="dataset",
@@ -1910,7 +2022,8 @@ def save_leaderboard_and_metrics_to_hf():
1910
 
1911
  # Upload to HuggingFace (will overwrite if exists)
1912
  print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
1913
- api.upload_file(
 
1914
  path_or_fileobj=file_like_object,
1915
  path_in_repo="swe-review.json",
1916
  repo_id=LEADERBOARD_REPO,
@@ -2019,7 +2132,8 @@ def construct_leaderboard_from_metadata():
2019
  stats = calculate_review_stats_from_metadata(agent_metadata)
2020
 
2021
  cache_dict[identifier] = {
2022
- 'agent_name': agent_name,
 
2023
  'website': agent.get('website', 'N/A'),
2024
  'github_identifier': identifier,
2025
  **stats
@@ -2232,7 +2346,7 @@ def get_leaderboard_dataframe():
2232
 
2233
  # Only include display-relevant fields
2234
  rows.append([
2235
- data.get('agent_name', 'Unknown'),
2236
  data.get('website', 'N/A'),
2237
  total_reviews,
2238
  data.get('merged_prs', 0),
@@ -2297,7 +2411,7 @@ def submit_agent(identifier, agent_name, developer, website):
2297
 
2298
  # Create submission
2299
  submission = {
2300
- 'agent_name': agent_name,
2301
  'developer': developer,
2302
  'github_identifier': identifier,
2303
  'website': website,
 
8
  from datetime import datetime, timezone, timedelta
9
  from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
11
+ from huggingface_hub.errors import HfHubHTTPError
12
  from datasets import load_dataset, Dataset
13
  import threading
14
+ import backoff
15
  from dotenv import load_dotenv
16
  import pandas as pd
17
  import random
 
107
  return date_string
108
 
109
 
110
+ # =============================================================================
111
+ # HUGGINGFACE API WRAPPERS WITH BACKOFF
112
+ # =============================================================================
113
+
114
+ def is_rate_limit_error(e):
115
+ """Check if exception is a HuggingFace rate limit error (429)."""
116
+ if isinstance(e, HfHubHTTPError):
117
+ return e.response.status_code == 429
118
+ return False
119
+
120
+
121
+ @backoff.on_exception(
122
+ backoff.expo,
123
+ HfHubHTTPError,
124
+ max_tries=8,
125
+ giveup=lambda e: not is_rate_limit_error(e),
126
+ on_backoff=lambda details: print(
127
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
128
+ )
129
+ )
130
+ def upload_large_folder_with_backoff(api, **kwargs):
131
+ """Wrapper for api.upload_large_folder() with exponential backoff for rate limits."""
132
+ return api.upload_large_folder(**kwargs)
133
+
134
+
135
+ @backoff.on_exception(
136
+ backoff.expo,
137
+ HfHubHTTPError,
138
+ max_tries=8,
139
+ giveup=lambda e: not is_rate_limit_error(e),
140
+ on_backoff=lambda details: print(
141
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
142
+ )
143
+ )
144
+ def list_repo_files_with_backoff(api, **kwargs):
145
+ """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
146
+ return api.list_repo_files(**kwargs)
147
+
148
+
149
+ @backoff.on_exception(
150
+ backoff.expo,
151
+ HfHubHTTPError,
152
+ max_tries=8,
153
+ giveup=lambda e: not is_rate_limit_error(e),
154
+ on_backoff=lambda details: print(
155
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
156
+ )
157
+ )
158
+ def hf_hub_download_with_backoff(**kwargs):
159
+ """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
160
+ return hf_hub_download(**kwargs)
161
+
162
+
163
+ @backoff.on_exception(
164
+ backoff.expo,
165
+ HfHubHTTPError,
166
+ max_tries=8,
167
+ giveup=lambda e: not is_rate_limit_error(e),
168
+ on_backoff=lambda details: print(
169
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
170
+ )
171
+ )
172
+ def upload_file_with_backoff(api, **kwargs):
173
+ """Wrapper for api.upload_file() with exponential backoff for rate limits."""
174
+ return api.upload_file(**kwargs)
175
+
176
+
177
  # =============================================================================
178
  # BIGQUERY FUNCTIONS
179
  # =============================================================================
 
285
  For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
286
 
287
  Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
288
+ actor.login matches the agent identifier, and joins with PR status.
289
 
290
  Args:
291
  client: BigQuery client instance
 
294
  end_date: End datetime (timezone-aware)
295
 
296
  Returns:
297
+ List of review event rows with PR information including merged_at and closed_at
298
  """
299
  print(f"\n🔍 Querying BigQuery for reviews by {identifier}")
300
  print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
301
 
302
  # Generate list of table names for each day in the range
303
+ review_tables = []
304
  current_date = start_date
305
  while current_date < end_date:
306
+ table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
307
+ review_tables.append(f"SELECT * FROM {table_name}")
308
+ current_date += timedelta(days=1)
309
+ review_union = " UNION ALL ".join(review_tables)
310
+
311
+ # Generate status tables (lookback for PR status)
312
+ status_start = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
313
+ status_tables = []
314
+ current_date = status_start
315
+ while current_date < end_date:
316
+ table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
317
+ status_tables.append(f"SELECT * FROM {table_name}")
318
  current_date += timedelta(days=1)
319
+ status_union = " UNION ALL ".join(status_tables)
320
 
321
+ # Build comprehensive query with CTEs for PR status
322
+ query = f"""
323
+ WITH review_events AS (
 
324
  SELECT
325
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as url,
326
+ COALESCE(
327
+ JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at'),
328
+ CAST(created_at AS STRING)
329
+ ) as reviewed_at,
330
+ actor.login as reviewer,
331
  created_at
332
+ FROM (
333
+ {review_union}
334
+ )
335
  WHERE type = 'PullRequestReviewEvent'
336
  AND actor.login = @identifier
337
+ AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IS NOT NULL
338
+ ),
339
+ pr_status AS (
340
+ SELECT
341
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as url,
342
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
343
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
344
+ created_at
345
+ FROM (
346
+ {status_union}
347
+ )
348
+ WHERE type = 'PullRequestEvent'
349
+ AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'closed'
350
+ AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') IN (
351
+ SELECT DISTINCT url FROM review_events
352
+ )
353
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY url ORDER BY created_at DESC) = 1
354
+ )
355
+ SELECT DISTINCT
356
+ re.url,
357
+ re.reviewed_at,
358
+ re.created_at,
359
+ ps.merged_at,
360
+ ps.closed_at
361
+ FROM review_events re
362
+ LEFT JOIN pr_status ps ON re.url = ps.url
363
+ ORDER BY re.reviewed_at DESC
364
+ """
365
 
366
  job_config = bigquery.QueryJobConfig(
367
  query_parameters=[
 
369
  ]
370
  )
371
 
372
+ print(f" Querying {len(review_tables)} review tables and {len(status_tables)} status tables...")
373
 
374
  try:
375
  query_job = client.query(query, job_config=job_config)
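Because the rewritten query UNION ALLs one githubarchive.day table per day (plus a LEADERBOARD_TIME_FRAME_DAYS lookback window for PR status), the bytes scanned grow with the date range. A small sketch of estimating that cost with a BigQuery dry run before executing; estimate_query_bytes is an illustrative helper, not part of this commit:

    from google.cloud import bigquery

    def estimate_query_bytes(client, query, identifier):
        # Illustrative helper: dry-run jobs validate the SQL and report the bytes
        # the query would scan, without running it or incurring query cost.
        job_config = bigquery.QueryJobConfig(
            dry_run=True,
            use_query_cache=False,
            query_parameters=[
                bigquery.ScalarQueryParameter("identifier", "STRING", identifier)
            ],
        )
        job = client.query(query, job_config=job_config)
        print(f"Estimated scan: {job.total_bytes_processed / 1e9:.2f} GB")
        return job.total_bytes_processed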
 
1339
  # Upload entire folder using upload_large_folder (optimized for large files)
1340
  # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
1341
  print(f"📤 Uploading {len(grouped)} files...")
1342
+ upload_large_folder_with_backoff(
1343
+ api=api,
1344
  folder_path=temp_dir,
1345
  repo_id=REVIEW_METADATA_REPO,
1346
  repo_type="dataset"
 
1380
  token = get_hf_token()
1381
 
1382
  # List all files in the repository
1383
+ files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
1384
 
1385
  # Filter for files matching the pattern: [agent_identifier]/YYYY.MM.DD.jsonl
1386
  # AND within the time frame (parse date from filename)
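The [agent_identifier]/YYYY.MM.DD.jsonl naming convention makes the time-frame filter a pure string operation on the file path. An illustrative parser for that pattern (parse_review_filename is not part of the commit):

    from datetime import datetime, timezone

    def parse_review_filename(path):
        # Illustrative: "claude[bot]/2025.01.31.jsonl" -> ("claude[bot]", 2025-01-31 UTC)
        agent, fname = path.split("/", 1)
        day = datetime.strptime(fname.replace(".jsonl", ""), "%Y.%m.%d")
        return agent, day.replace(tzinfo=timezone.utc)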
 
1422
  agent_identifier = parts[0]
1423
  agent_identifiers_found.add(agent_identifier)
1424
 
1425
+ file_path = hf_hub_download_with_backoff(
1426
  repo_id=REVIEW_METADATA_REPO,
1427
  filename=filename,
1428
  repo_type="dataset",
 
1478
  token = get_hf_token()
1479
 
1480
  # List all files in the repository
1481
+ files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
1482
 
1483
  # Filter for files in this agent's folder
1484
  # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
 
1492
  latest_date = None
1493
  for filename in agent_files:
1494
  try:
1495
+ file_path = hf_hub_download_with_backoff(
1496
  repo_id=REVIEW_METADATA_REPO,
1497
  filename=filename,
1498
  repo_type="dataset",
 
1537
  cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1538
 
1539
  # List all files in the repository
1540
+ files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
1541
 
1542
  # Filter for files in this agent's folder
1543
  agent_pattern = f"{agent_identifier}/"
 
1746
  agents = []
1747
 
1748
  # List all files in the repository
1749
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
1750
 
1751
  # Filter for JSON files only
1752
  json_files = [f for f in files if f.endswith('.json')]
 
1754
  # Download and parse each JSON file
1755
  for json_file in json_files:
1756
  try:
1757
+ file_path = hf_hub_download_with_backoff(
1758
  repo_id=AGENTS_REPO,
1759
  filename=json_file,
1760
  repo_type="dataset"
 
1763
  with open(file_path, 'r') as f:
1764
  agent_data = json.load(f)
1765
 
1766
+ # Only process agents with status == "public"
1767
+ if agent_data.get('status') != 'public':
1768
+ print(f"Skipping {json_file}: status is not 'public'")
1769
+ continue
1770
+
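The new status check assumes each per-agent JSON file carries a status field alongside the submission fields used elsewhere in this commit (name, developer, github_identifier, website). An illustrative file, with placeholder values, as load_agents_from_hf() would read it:

    # Illustrative contents of e.g. "claude[bot].json" (all values are placeholders)
    agent_data = {
        "name": "Example Agent",
        "developer": "Example Developer",
        "github_identifier": "claude[bot]",
        "website": "https://example.com",
        "status": "public",  # anything other than "public" is now skipped
    }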
1771
  # Extract github_identifier from filename (e.g., "claude[bot].json" -> "claude[bot]")
1772
  filename_identifier = json_file.replace('.json', '')
1773
 
 
1951
  filename = "swe-review.json"
1952
 
1953
  # Download file
1954
+ file_path = hf_hub_download_with_backoff(
1955
  repo_id=LEADERBOARD_REPO,
1956
  filename=filename,
1957
  repo_type="dataset",
 
2022
 
2023
  # Upload to HuggingFace (will overwrite if exists)
2024
  print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
2025
+ upload_file_with_backoff(
2026
+ api=api,
2027
  path_or_fileobj=file_like_object,
2028
  path_in_repo="swe-review.json",
2029
  repo_id=LEADERBOARD_REPO,
 
2132
  stats = calculate_review_stats_from_metadata(agent_metadata)
2133
 
2134
  cache_dict[identifier] = {
2135
+ 'agent_name': agent_name,
2136
+ 'name': agent_name, # Store both for compatibility
2137
  'website': agent.get('website', 'N/A'),
2138
  'github_identifier': identifier,
2139
  **stats
 
2346
 
2347
  # Only include display-relevant fields
2348
  rows.append([
2349
+ data.get('name', 'Unknown'),
2350
  data.get('website', 'N/A'),
2351
  total_reviews,
2352
  data.get('merged_prs', 0),
 
2411
 
2412
  # Create submission
2413
  submission = {
2414
+ 'name': agent_name,
2415
  'developer': developer,
2416
  'github_identifier': identifier,
2417
  'website': website,
msr.py CHANGED
@@ -9,8 +9,10 @@ import tempfile
9
  from datetime import datetime, timezone, timedelta
10
  from collections import defaultdict
11
  from huggingface_hub import HfApi, hf_hub_download
 
12
  from dotenv import load_dotenv
13
  from google.cloud import bigquery
 
14
 
15
  # Load environment variables
16
  load_dotenv()
@@ -95,6 +97,73 @@ def get_hf_token():
95
  return token
96

97

98
  def get_bigquery_client():
99
  """
100
  Initialize BigQuery client using credentials from environment variable.
@@ -490,7 +559,8 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
490
  # Upload entire folder using upload_large_folder (optimized for large files)
491
  # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
492
  print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total reviews)...")
493
- api.upload_large_folder(
 
494
  folder_path=temp_dir,
495
  repo_id=REVIEW_METADATA_REPO,
496
  repo_type="dataset"
@@ -522,7 +592,7 @@ def load_agents_from_hf():
522
  agents = []
523
 
524
  # List all files in the repository
525
- files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
526
 
527
  # Filter for JSON files only
528
  json_files = [f for f in files if f.endswith('.json')]
@@ -532,7 +602,7 @@ def load_agents_from_hf():
532
  # Download and parse each JSON file
533
  for json_file in json_files:
534
  try:
535
- file_path = hf_hub_download(
536
  repo_id=AGENTS_REPO,
537
  filename=json_file,
538
  repo_type="dataset"
@@ -580,7 +650,7 @@ def load_review_metadata():
580
  token = get_hf_token()
581
 
582
  # List all files in the repository
583
- files = api.list_repo_files(repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
584
 
585
  # Filter for JSONL files matching pattern: [agent_identifier]/YYYY.MM.DD.jsonl
586
  time_frame_files = []
@@ -616,7 +686,7 @@ def load_review_metadata():
616
 
617
  agent_identifier = parts[0]
618
 
619
- file_path = hf_hub_download(
620
  repo_id=REVIEW_METADATA_REPO,
621
  filename=filename,
622
  repo_type="dataset",
@@ -825,7 +895,7 @@ def construct_leaderboard_from_metadata():
825
  stats = calculate_review_stats_from_metadata(agent_metadata)
826
 
827
  cache_dict[identifier] = {
828
- 'agent_name': agent_name,
829
  'name': agent_name,
830
  'website': agent.get('website', 'N/A'),
831
  'github_identifier': identifier,
@@ -872,7 +942,8 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
872
 
873
  try:
874
  # Upload to HuggingFace
875
- api.upload_file(
 
876
  path_or_fileobj=filename,
877
  path_in_repo=filename,
878
  repo_id=LEADERBOARD_REPO,
@@ -953,7 +1024,7 @@ def mine_all_agents():
953
 
954
  for i, agent in enumerate(agents, 1):
955
  identifier = agent.get('github_identifier')
956
- agent_name = agent.get('name', agent.get('agent_name', 'Unknown'))
957
 
958
  if not identifier:
959
  print(f"[{i}/{len(agents)}] Skipping agent without identifier")
 
9
  from datetime import datetime, timezone, timedelta
10
  from collections import defaultdict
11
  from huggingface_hub import HfApi, hf_hub_download
12
+ from huggingface_hub.errors import HfHubHTTPError
13
  from dotenv import load_dotenv
14
  from google.cloud import bigquery
15
+ import backoff
16
 
17
  # Load environment variables
18
  load_dotenv()
 
97
  return token
98
 
99
 
100
+ # =============================================================================
101
+ # HUGGINGFACE API WRAPPERS WITH BACKOFF
102
+ # =============================================================================
103
+
104
+ def is_rate_limit_error(e):
105
+ """Check if exception is a HuggingFace rate limit error (429)."""
106
+ if isinstance(e, HfHubHTTPError):
107
+ return e.response.status_code == 429
108
+ return False
109
+
110
+
111
+ @backoff.on_exception(
112
+ backoff.expo,
113
+ HfHubHTTPError,
114
+ max_tries=8,
115
+ giveup=lambda e: not is_rate_limit_error(e),
116
+ on_backoff=lambda details: print(
117
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
118
+ )
119
+ )
120
+ def upload_large_folder_with_backoff(api, **kwargs):
121
+ """Wrapper for api.upload_large_folder() with exponential backoff for rate limits."""
122
+ return api.upload_large_folder(**kwargs)
123
+
124
+
125
+ @backoff.on_exception(
126
+ backoff.expo,
127
+ HfHubHTTPError,
128
+ max_tries=8,
129
+ giveup=lambda e: not is_rate_limit_error(e),
130
+ on_backoff=lambda details: print(
131
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
132
+ )
133
+ )
134
+ def list_repo_files_with_backoff(api, **kwargs):
135
+ """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
136
+ return api.list_repo_files(**kwargs)
137
+
138
+
139
+ @backoff.on_exception(
140
+ backoff.expo,
141
+ HfHubHTTPError,
142
+ max_tries=8,
143
+ giveup=lambda e: not is_rate_limit_error(e),
144
+ on_backoff=lambda details: print(
145
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
146
+ )
147
+ )
148
+ def hf_hub_download_with_backoff(**kwargs):
149
+ """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
150
+ return hf_hub_download(**kwargs)
151
+
152
+
153
+ @backoff.on_exception(
154
+ backoff.expo,
155
+ HfHubHTTPError,
156
+ max_tries=8,
157
+ giveup=lambda e: not is_rate_limit_error(e),
158
+ on_backoff=lambda details: print(
159
+ f"⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/8)..."
160
+ )
161
+ )
162
+ def upload_file_with_backoff(api, **kwargs):
163
+ """Wrapper for api.upload_file() with exponential backoff for rate limits."""
164
+ return api.upload_file(**kwargs)
165
+
166
+
167
  def get_bigquery_client():
168
  """
169
  Initialize BigQuery client using credentials from environment variable.
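The body of get_bigquery_client() sits outside this hunk. A sketch of one common way to satisfy this docstring, assuming the service-account JSON is stored in an environment variable; the variable name GOOGLE_CREDENTIALS_JSON and the function name below are hypothetical, not taken from the commit:

    import json
    import os

    from google.cloud import bigquery
    from google.oauth2 import service_account

    def get_bigquery_client_sketch():
        # Hypothetical env var name; parse the service-account JSON and build a client.
        info = json.loads(os.environ["GOOGLE_CREDENTIALS_JSON"])
        credentials = service_account.Credentials.from_service_account_info(info)
        return bigquery.Client(credentials=credentials, project=credentials.project_id)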
 
559
  # Upload entire folder using upload_large_folder (optimized for large files)
560
  # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
561
  print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total reviews)...")
562
+ upload_large_folder_with_backoff(
563
+ api=api,
564
  folder_path=temp_dir,
565
  repo_id=REVIEW_METADATA_REPO,
566
  repo_type="dataset"
 
592
  agents = []
593
 
594
  # List all files in the repository
595
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
596
 
597
  # Filter for JSON files only
598
  json_files = [f for f in files if f.endswith('.json')]
 
602
  # Download and parse each JSON file
603
  for json_file in json_files:
604
  try:
605
+ file_path = hf_hub_download_with_backoff(
606
  repo_id=AGENTS_REPO,
607
  filename=json_file,
608
  repo_type="dataset"
 
650
  token = get_hf_token()
651
 
652
  # List all files in the repository
653
+ files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
654
 
655
  # Filter for JSONL files matching pattern: [agent_identifier]/YYYY.MM.DD.jsonl
656
  time_frame_files = []
 
686
 
687
  agent_identifier = parts[0]
688
 
689
+ file_path = hf_hub_download_with_backoff(
690
  repo_id=REVIEW_METADATA_REPO,
691
  filename=filename,
692
  repo_type="dataset",
 
895
  stats = calculate_review_stats_from_metadata(agent_metadata)
896
 
897
  cache_dict[identifier] = {
898
+ 'name': agent_name,
899
  'name': agent_name,
900
  'website': agent.get('website', 'N/A'),
901
  'github_identifier': identifier,
 
942
 
943
  try:
944
  # Upload to HuggingFace
945
+ upload_file_with_backoff(
946
+ api=api,
947
  path_or_fileobj=filename,
948
  path_in_repo=filename,
949
  repo_id=LEADERBOARD_REPO,
 
1024
 
1025
  for i, agent in enumerate(agents, 1):
1026
  identifier = agent.get('github_identifier')
1027
+ agent_name = agent.get('name', 'Unknown')
1028
 
1029
  if not identifier:
1030
  print(f"[{i}/{len(agents)}] Skipping agent without identifier")
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  APScheduler
 
2
  datasets
3
  db-dtypes
4
  google-cloud-bigquery
 
1
  APScheduler
2
+ backoff
3
  datasets
4
  db-dtypes
5
  google-cloud-bigquery