zhimin-z committed · Commit 0118014 · Parent(s): 73d682a

refin

Files changed:
- Dockerfile +0 -7
- docker-compose.yml +2 -4
- msr.py +26 -122
Dockerfile
CHANGED
@@ -9,16 +9,9 @@ RUN apt-get update && apt-get install -y \
     g++ \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements first to leverage Docker cache
-COPY requirements.txt .
-
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy application code
-COPY msr.py .
-COPY .env .env
-
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 
docker-compose.yml
CHANGED
@@ -10,12 +10,10 @@ services:
     env_file:
       - .env
     volumes:
+      # Mount entire workspace for live code updates
+      - .:/app
       # Mount gharchive data directory
       - ../gharchive/data:/gharchive/data:ro
-      # Persist DuckDB cache
-      - ./gharchive_cache.duckdb:/app/gharchive_cache.duckdb
-      # Persist logs
-      - ./logs:/app/logs
     environment:
       - PYTHONUNBUFFERED=1
     logging:
msr.py
CHANGED
@@ -137,7 +137,6 @@ def download_file(url):
 
     # Skip if json.gz already exists
     if os.path.exists(filepath):
-        print(f" ✓ {filename} (already exists)")
         return True
 
     # Download with retry logic
@@ -147,7 +146,6 @@ def download_file(url):
             response.raise_for_status()
             with open(filepath, "wb") as f:
                 f.write(response.content)
-            print(f" ✓ {filename} (downloaded)")
             return True
 
         except requests.exceptions.HTTPError as e:
@@ -194,10 +192,6 @@ def download_all_gharchive_data():
     Returns:
         bool: True if all downloads completed (some may have failed), False if critical error
     """
-    print(f"\n{'='*80}")
-    print(f"DOWNLOADING GHARCHIVE DATA")
-    print(f"{'='*80}")
-
     # Create data directory if it doesn't exist
     os.makedirs(GHARCHIVE_DATA_DIR, exist_ok=True)
 
@@ -215,11 +209,6 @@ def download_all_gharchive_data():
         urls.append(url)
         current_date += timedelta(days=1)
 
-    print(f"Downloading {len(urls)} files ({len(urls)//24} days × 24 hours)")
-    print(f"Workers: {DOWNLOAD_WORKERS}")
-    print(f"Target directory: {GHARCHIVE_DATA_DIR}")
-    print(f"{'='*80}\n")
-
     downloads_processed = 0
 
     try:
@@ -230,18 +219,12 @@ def download_all_gharchive_data():
             # Wait for downloads to complete
             for future in as_completed(futures):
                 downloads_processed += 1
-                if downloads_processed % 100 == 0:
-                    print(f" Progress: {downloads_processed}/{len(urls)} files processed ({downloads_processed*100//len(urls)}%)")
 
-        print(f"\n{'='*80}")
-        print(f"Download complete: {downloads_processed}/{len(urls)} files processed")
-        print(f"{'='*80}\n")
+        print(f"Download complete: {downloads_processed} files")
         return True
 
     except Exception as e:
-        print(f"\n{'='*80}")
         print(f"Error during download: {str(e)}")
-        print(f"{'='*80}\n")
         import traceback
         traceback.print_exc()
         return False
@@ -435,9 +418,6 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
         ...
     }
     """
-    print(f"Querying DuckDB for ALL {len(identifiers)} agents in ONE QUERY")
-    print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
-
     # Generate file path patterns for review period
     review_patterns = generate_file_path_patterns(start_date, end_date)
 
@@ -530,13 +510,6 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
         ORDER BY re.reviewer, re.reviewed_at DESC
     """
 
-    # Calculate number of days for reporting
-    review_days = (end_date - start_date).days
-    status_days = (end_date - status_start_date).days
-
-    print(f" Querying {review_days} days for reviews, {status_days} days for PR status...")
-    print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
-
     try:
         # Create cache table name based on date range
         cache_table_name = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
@@ -548,14 +521,12 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
         """).fetchone()[0] > 0
 
         if cache_exists:
-            print(f" Using cached results from table {cache_table_name}")
             results = conn.execute(f"""
                 SELECT reviewer, url, reviewed_at, merged_at, closed_at
                 FROM {cache_table_name}
                 WHERE reviewer IN ({identifier_list})
             """).fetchall()
         else:
-            print(f" Cache miss - executing full query and caching to {cache_table_name}")
             # Execute query with parameters
             results = conn.execute(query, {'review_patterns': review_patterns, 'status_patterns': status_patterns}).fetchall()
 
@@ -575,9 +546,6 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
                 [r[3] for r in results],
                 [r[4] for r in results]
             ])
-            print(f" Cached {len(results)} results to {cache_table_name}")
-
-        print(f" Found {len(results)} total PR review records across all agents")
 
         # Group results by agent
         metadata_by_agent = defaultdict(list)
@@ -596,22 +564,11 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
                 'closed_at': closed_at,
             })
 
-        # Print breakdown by agent
-        print(f"Results breakdown by agent:")
-        for identifier in identifiers:
-            count = len(metadata_by_agent.get(identifier, []))
-            if count > 0:
-                metadata = metadata_by_agent[identifier]
-                merged_count = sum(1 for m in metadata if m['merged_at'] is not None)
-                closed_count = sum(1 for m in metadata if m['closed_at'] is not None and m['merged_at'] is None)
-                open_count = count - merged_count - closed_count
-                print(f" {identifier}: {count} PRs ({merged_count} merged, {closed_count} closed, {open_count} open)")
-
         # Convert defaultdict to regular dict
         return dict(metadata_by_agent)
 
     except Exception as e:
-        print(f"
+        print(f"DuckDB error: {str(e)}")
         import traceback
         traceback.print_exc()
         return {}
@@ -710,23 +667,17 @@ def batch_upload_review_metadata(all_metadata):
         grouped = group_metadata_by_date(metadata_list)
         total_files += len(grouped)
 
-    print(f"\n{'='*80}")
-    print(f"Starting batch upload: {len(all_metadata)} agents, {total_files} total files")
-    print(f"Upload delay: {UPLOAD_DELAY_SECONDS}s between files")
-    print(f"{'='*80}\n")
+    print(f"Uploading {total_files} files for {len(all_metadata)} agents...")
 
     file_count = 0
 
     for agent_idx, (agent_identifier, metadata_list) in enumerate(all_metadata.items(), 1):
         if not metadata_list:
-            print(f"[{agent_idx}/{len(all_metadata)}] Skipping {agent_identifier} (no data)")
             continue
 
         # Group by date
         grouped = group_metadata_by_date(metadata_list)
 
-        print(f"[{agent_idx}/{len(all_metadata)}] Uploading {len(grouped)} files for {agent_identifier}...")
-
         # Create temporary files for this agent
         agent_temp_dir = tempfile.mkdtemp()
 
@@ -752,8 +703,6 @@ def batch_upload_review_metadata(all_metadata):
             for file_idx, (local_path, repo_path, review_count) in enumerate(local_files, 1):
                 file_count += 1
 
-                print(f" [{file_count}/{total_files}] Uploading {repo_path} ({review_count} reviews)...", end='')
-
                 if upload_single_file_with_retry(
                     api=api,
                     local_path=local_path,
@@ -773,20 +722,16 @@ def batch_upload_review_metadata(all_metadata):
                     if file_idx < len(local_files):
                         time.sleep(UPLOAD_DELAY_SECONDS)
 
-            print(f" Agent {agent_identifier}: {agent_success} uploaded, {agent_error} errors\n")
-
         finally:
             # Clean up temp directory
             if os.path.exists(agent_temp_dir):
                 import shutil
                 shutil.rmtree(agent_temp_dir)
 
-
-
-
-
-    print(f" Errors: {error_count}")
-    print(f"{'='*80}\n")
+    if error_count > 0:
+        print(f"Upload complete: {success_count}/{total_files} succeeded, {error_count} errors")
+    else:
+        print(f"Upload complete: {success_count}/{total_files} files")
 
     return success_count, error_count
 
@@ -813,8 +758,6 @@ def load_agents_from_hf():
         # Filter for JSON files only
         json_files = [f for f in files if f.endswith('.json')]
 
-        print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
-
         # Download and parse each JSON file
         for json_file in json_files:
             try:
@@ -838,10 +781,9 @@ def load_agents_from_hf():
                 agents.append(agent_data)
 
             except Exception as e:
-                print(f"
+                print(f"Error loading {json_file}: {str(e)}")
                 continue
 
-        print(f"Loaded {len(agents)} agents from HuggingFace")
         return agents
 
     except Exception as e:
@@ -1010,14 +952,10 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
     Returns:
         Dictionary of agent stats.
     """
-    print("Constructing leaderboard from review metadata...")
-
     if not agents:
-        print("No agents found")
+        print("Error: No agents found")
         return {}
 
-    print(f"Processing {len(agents)} agents")
-
     cache_dict = {}
 
     for agent in agents:
@@ -1037,8 +975,6 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
             **stats
         }
 
-    print(f"Constructed cache with {len(cache_dict)} agent entries")
-
     return cache_dict
 
 
@@ -1077,7 +1013,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
 
     try:
         # Upload to HuggingFace with retry logic
-        print(f"Uploading leaderboard data...", end='')
         upload_file_with_backoff(
             api=api,
             path_or_fileobj=filename,
@@ -1085,7 +1020,6 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
             repo_id=LEADERBOARD_REPO,
             repo_type="dataset"
         )
-        print(f"Saved leaderboard data to HuggingFace: {filename}")
         return True
     finally:
         # Always clean up local file
@@ -1109,35 +1043,26 @@ def mine_all_agents():
     Downloads GHArchive data first, then uses ONE DuckDB query for ALL agents, then batch uploads with time gaps.
     """
     # Step 1: Download GHArchive data
-    print(f"\n{'='*80}")
-    print(f"STEP 1: DOWNLOADING GHARCHIVE DATA")
-    print(f"{'='*80}\n")
+    print(f"\n[1/5] Downloading GHArchive data...")
 
     if not download_all_gharchive_data():
-        print("Warning: Download had errors,
+        print("Warning: Download had errors, continuing with available data...")
 
     # Step 2: Load agent metadata from HuggingFace
-    print(f"\n{'='*80}")
-    print(f"STEP 2: LOADING AGENT METADATA")
-    print(f"{'='*80}\n")
+    print(f"\n[2/5] Loading agent metadata...")
 
     agents = load_agents_from_hf()
     if not agents:
-        print("No agents found
+        print("Error: No agents found")
        return
 
     # Extract all identifiers
     identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
     if not identifiers:
-        print("No valid agent identifiers found")
+        print("Error: No valid agent identifiers found")
        return
 
-    print(f"\n{'='*80}")
-    print(f"STEP 3: MINING REVIEW METADATA")
-    print(f"{'='*80}")
-    print(f"Agents: {len(identifiers)}")
-    print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"{'='*80}")
+    print(f"\n[3/5] Mining review metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
     # Initialize DuckDB connection
     try:
@@ -1161,12 +1086,7 @@ def mine_all_agents():
         total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
         agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
 
-        print(f"\n{'='*80}")
-        print(f"DuckDB query complete!")
-        print(f" Total agents: {len(agents)}")
-        print(f" Agents with data: {agents_with_data}")
-        print(f" Total PRs found: {total_prs}")
-        print(f"{'='*80}")
+        print(f"Query complete: {total_prs} PRs found for {agents_with_data}/{len(agents)} agents")
 
     except Exception as e:
         print(f"Error during DuckDB fetch: {str(e)}")
@@ -1178,39 +1098,27 @@ def mine_all_agents():
         conn.close()
 
     # Step 4: Batch upload review metadata with time gaps
-    print(f"\n{'='*80}")
-    print(f"STEP 4: UPLOADING REVIEW METADATA")
-    print(f"{'='*80}\n")
+    print(f"\n[4/5] Uploading review metadata...")
 
     success_count, error_count = batch_upload_review_metadata(all_metadata)
 
     # Step 5: Construct and save leaderboard data
-    print(f"\n{'='*80}")
-    print(f"STEP 5: CONSTRUCTING AND SAVING LEADERBOARD")
-    print(f"{'='*80}\n")
+    print(f"\n[5/5] Saving leaderboard...")
 
     try:
         # Construct leaderboard from in-memory data
         leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
 
         # Calculate monthly metrics from in-memory data
-        print(f"Calculating monthly metrics...")
         monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
 
         # Save to HuggingFace
-        print(f"Saving leaderboard data to HuggingFace...")
         save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
 
-        print(f"\n{'='*80}")
-        print(f"ALL TASKS COMPLETE!")
-        print(f" Review metadata: {success_count} files uploaded, {error_count} errors")
-        print(f" Leaderboard entries: {len(leaderboard_dict)}")
-        print(f" Monthly data points: {len(monthly_metrics.get('months', []))} months")
-        print(f" Saved to: {LEADERBOARD_REPO}/swe-review.json")
-        print(f"{'='*80}")
+        print(f"\nCOMPLETE: {success_count} files uploaded" + (f", {error_count} errors" if error_count > 0 else ""))
 
     except Exception as e:
-        print(f"
+        print(f"Error saving leaderboard: {str(e)}")
         import traceback
         traceback.print_exc()
 
@@ -1258,21 +1166,17 @@ def setup_scheduler():
     )
 
     # Print schedule information
-
-
-    print(f"{
-    print(f"
-    print(f"Next run: {scheduler.get_jobs()[0].next_run_time}")
-    print(f"{'='*80}\n")
+    from datetime import datetime
+    next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
+    print(f"Scheduler: Monthly on day {SCHEDULE_DAY_OF_MONTH} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
+    print(f"Next run: {next_run}\n")
 
     # Run immediately on startup
-    print("Running initial mining job
+    print("Running initial mining job...")
     mine_all_agents()
 
     # Start scheduler (blocking call)
-    print(f"\n{'='*80}")
-    print("Starting scheduler... (Press Ctrl+C to exit)")
-    print(f"{'='*80}\n")
+    print(f"\nScheduler started (Press Ctrl+C to exit)")
 
     try:
         scheduler.start()
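
Note on the caching logic whose behavior is unchanged here (only its prints are removed): fetch_all_pr_metadata_single_query materializes the query results into a per-date-range DuckDB table named pr_cache_<start>_<end> and reuses that table on later runs. Below is a minimal sketch of the same pattern, assuming a plain duckdb connection; run_query is a hypothetical stand-in for the repository's single large GHArchive query.

# Sketch only: mirrors the pr_cache_<start>_<end> reuse-or-populate pattern seen in the diff.
from datetime import date

import duckdb


def fetch_with_cache(conn, start_date, end_date, run_query):
    table = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"

    # DuckDB exposes information_schema, so table existence is a cheap lookup.
    cache_exists = conn.execute(
        "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = ?",
        [table],
    ).fetchone()[0] > 0

    if not cache_exists:
        # Cache miss: run the expensive query once and materialize the rows.
        rows = run_query()  # [(reviewer, url, reviewed_at, merged_at, closed_at), ...]
        conn.execute(
            f"CREATE TABLE {table} (reviewer VARCHAR, url VARCHAR, "
            "reviewed_at TIMESTAMP, merged_at TIMESTAMP, closed_at TIMESTAMP)"
        )
        if rows:
            conn.executemany(f"INSERT INTO {table} VALUES (?, ?, ?, ?, ?)", rows)

    return conn.execute(f"SELECT * FROM {table}").fetchall()


if __name__ == "__main__":
    con = duckdb.connect()  # in-memory database for the demo
    demo_rows = [("some-bot", "https://github.com/o/r/pull/1", None, None, None)]
    out = fetch_with_cache(con, date(2025, 1, 1), date(2025, 1, 31), lambda: demo_rows)
    print(len(out), "cached rows")

With this shape, rerunning the miner over the same window reads the cached table instead of rescanning the raw GHArchive files.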
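
The setup_scheduler() hunk replaces the old banner output and the scheduler.get_jobs()[0].next_run_time lookup with trigger.get_next_fire_time(None, datetime.now(trigger.timezone)). The following is a self-contained sketch of that pattern, assuming placeholder values for the SCHEDULE_* constants and a stub job() in place of mine_all_agents().

# Sketch only: same "ask the trigger for its next fire time" pattern this commit adds.
from datetime import datetime

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger

# Placeholder values standing in for the SCHEDULE_* constants referenced in the diff.
SCHEDULE_DAY_OF_MONTH, SCHEDULE_HOUR, SCHEDULE_MINUTE, SCHEDULE_TIMEZONE = 1, 0, 0, "UTC"


def job():
    print("mining run would happen here")  # stand-in for mine_all_agents()


trigger = CronTrigger(
    day=SCHEDULE_DAY_OF_MONTH,
    hour=SCHEDULE_HOUR,
    minute=SCHEDULE_MINUTE,
    timezone=SCHEDULE_TIMEZONE,
)
scheduler = BlockingScheduler()
scheduler.add_job(job, trigger)

# Computed from the trigger itself rather than from scheduler.get_jobs().
next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
print(f"Scheduler: monthly on day {SCHEDULE_DAY_OF_MONTH} "
      f"at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
print(f"Next run: {next_run}")

job()              # run once at startup, as setup_scheduler() does
scheduler.start()  # blocking call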