zhimin-z committed on
Commit d34dfc3 · 1 Parent(s): 9a4a0ec
Files changed (1)
  1. msr.py +77 -43
msr.py CHANGED
@@ -33,6 +33,7 @@ REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
 LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard data
 LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
 GHARCHIVE_DATA_DIR = "../gharchive/data" # Local GHArchive data directory
+DUCKDB_CACHE_FILE = "gharchive_cache.duckdb" # Persistent DuckDB database for caching
 
 # Upload configuration
 UPLOAD_DELAY_SECONDS = 2 # Delay between individual file uploads to avoid rate limits
@@ -205,16 +206,13 @@ def upload_folder_with_backoff(api, **kwargs):
 
 def get_duckdb_connection():
     """
-    Initialize DuckDB connection with JSON support and optimized parallelization.
+    Initialize DuckDB connection with persistent database and optimized parallelization.
 
     Returns:
         DuckDB connection object
     """
-    conn = duckdb.connect(':memory:')
-
-    # Enable JSON extension if needed
-    conn.execute("INSTALL json;")
-    conn.execute("LOAD json;")
+    # Use persistent database for caching results
+    conn = duckdb.connect(DUCKDB_CACHE_FILE)
 
     # Optimize for 96-core CPU parallelization with 754GB RAM
     conn.execute("SET threads TO 48;") # Use all available cores
@@ -245,8 +243,8 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_DIR
     end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
 
     while current_date <= end_day:
-        # Pattern for all hours in this day: 2024-11-15-*.json.gz
-        pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-*.json.gz")
+        # Pattern for all hours in this day: 2024-11-15-*.parquet
+        pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-*.parquet")
         file_patterns.append(pattern)
 
         # Move to next day
@@ -301,72 +299,72 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     # Build identifier list for IN clause
     identifier_list = ', '.join([f"'{id}'" for id in identifiers])
 
-    # Build comprehensive query with CTEs using parameterized file lists
+    # Build comprehensive query with CTEs using parameterized file lists (Parquet optimized)
     query = f"""
         WITH review_events AS (
             -- Get all review events for ALL agents
             SELECT
-                json_extract_string(payload, '$.pull_request.html_url') as url,
+                payload.pull_request.html_url as url,
                 COALESCE(
-                    json_extract_string(payload, '$.review.submitted_at'),
+                    payload.review.submitted_at,
                     CAST(created_at AS VARCHAR)
                 ) as reviewed_at,
-                json_extract_string(actor, '$.login') as reviewer,
-                json_extract_string(repo, '$.name') as repo_name,
-                CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
-            FROM read_json_auto($review_patterns, ignore_errors=true, union_by_name=true)
+                actor.login as reviewer,
+                repo.name as repo_name,
+                CAST(payload.pull_request.number AS INTEGER) as pr_number
+            FROM read_parquet($review_patterns, union_by_name=true, filename=true)
             WHERE
                 type = 'PullRequestReviewEvent'
-                AND json_extract_string(actor, '$.login') IN ({identifier_list})
-                AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
+                AND actor.login IN ({identifier_list})
+                AND payload.pull_request.html_url IS NOT NULL
 
             UNION ALL
 
            -- Get PR comments (IssueCommentEvent on PRs)
            SELECT
-                json_extract_string(payload, '$.issue.html_url') as url,
+                payload.issue.html_url as url,
                CAST(created_at AS VARCHAR) as reviewed_at,
-                json_extract_string(actor, '$.login') as reviewer,
-                json_extract_string(repo, '$.name') as repo_name,
-                CAST(json_extract_string(payload, '$.issue.number') AS INTEGER) as pr_number
-            FROM read_json_auto($review_patterns, ignore_errors=true, union_by_name=true)
+                actor.login as reviewer,
+                repo.name as repo_name,
+                CAST(payload.issue.number AS INTEGER) as pr_number
+            FROM read_parquet($review_patterns, union_by_name=true, filename=true)
            WHERE
                type = 'IssueCommentEvent'
-                AND json_extract_string(actor, '$.login') IN ({identifier_list})
-                AND json_extract_string(payload, '$.issue.pull_request.url') IS NOT NULL
-                AND json_extract_string(payload, '$.issue.html_url') IS NOT NULL
+                AND actor.login IN ({identifier_list})
+                AND payload.issue.pull_request.url IS NOT NULL
+                AND payload.issue.html_url IS NOT NULL
 
            UNION ALL
 
           -- Get review comments (PullRequestReviewCommentEvent)
           SELECT
-                json_extract_string(payload, '$.pull_request.html_url') as url,
+                payload.pull_request.html_url as url,
               CAST(created_at AS VARCHAR) as reviewed_at,
-                json_extract_string(actor, '$.login') as reviewer,
-                json_extract_string(repo, '$.name') as repo_name,
-                CAST(json_extract_string(payload, '$.pull_request.number') AS INTEGER) as pr_number
-            FROM read_json_auto($review_patterns, ignore_errors=true, union_by_name=true)
+                actor.login as reviewer,
+                repo.name as repo_name,
+                CAST(payload.pull_request.number AS INTEGER) as pr_number
+            FROM read_parquet($review_patterns, union_by_name=true, filename=true)
           WHERE
              type = 'PullRequestReviewCommentEvent'
-                AND json_extract_string(actor, '$.login') IN ({identifier_list})
-                AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
+                AND actor.login IN ({identifier_list})
+                AND payload.pull_request.html_url IS NOT NULL
        ),
 
        pr_status AS (
            -- Get merge/close status for those PRs
            SELECT
-                json_extract_string(payload, '$.pull_request.html_url') as url,
-                CAST(json_extract_string(payload, '$.pull_request.merged') AS BOOLEAN) as is_merged,
-                json_extract_string(payload, '$.pull_request.merged_at') as merged_at,
-                json_extract_string(payload, '$.pull_request.closed_at') as closed_at,
+                payload.pull_request.html_url as url,
+                CAST(payload.pull_request.merged AS BOOLEAN) as is_merged,
+                payload.pull_request.merged_at as merged_at,
+                payload.pull_request.closed_at as closed_at,
                created_at,
-                ROW_NUMBER() OVER (PARTITION BY json_extract_string(payload, '$.pull_request.html_url') ORDER BY created_at DESC) as rn
-            FROM read_json_auto($status_patterns, ignore_errors=true, union_by_name=true)
+                ROW_NUMBER() OVER (PARTITION BY payload.pull_request.html_url ORDER BY created_at DESC) as rn
+            FROM read_parquet($status_patterns, union_by_name=true, filename=true)
            WHERE
               type = 'PullRequestEvent'
-                AND json_extract_string(payload, '$.action') = 'closed'
-                AND json_extract_string(payload, '$.pull_request.html_url') IS NOT NULL
-                AND json_extract_string(payload, '$.pull_request.html_url') IN (
+                AND payload.action = 'closed'
+                AND payload.pull_request.html_url IS NOT NULL
+                AND payload.pull_request.html_url IN (
                    SELECT DISTINCT url FROM review_events
                )
        )
@@ -391,8 +389,44 @@ def fetch_all_pr_metadata_single_query(conn, identifiers, start_date, end_date):
     print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
 
     try:
-        # Execute query with parameters
-        results = conn.execute(query, {'review_patterns': review_patterns, 'status_patterns': status_patterns}).fetchall()
+        # Create cache table name based on date range
+        cache_table_name = f"pr_cache_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
+
+        # Check if cache exists and is valid
+        cache_exists = conn.execute(f"""
+            SELECT COUNT(*) FROM information_schema.tables
+            WHERE table_name = '{cache_table_name}'
+        """).fetchone()[0] > 0
+
+        if cache_exists:
+            print(f" Using cached results from table {cache_table_name}")
+            results = conn.execute(f"""
+                SELECT reviewer, url, reviewed_at, merged_at, closed_at
+                FROM {cache_table_name}
+                WHERE reviewer IN ({identifier_list})
+            """).fetchall()
+        else:
+            print(f" Cache miss - executing full query and caching to {cache_table_name}")
+            # Execute query with parameters
+            results = conn.execute(query, {'review_patterns': review_patterns, 'status_patterns': status_patterns}).fetchall()
+
+            # Cache the complete results for all future queries in this date range
+            if len(results) > 0:
+                conn.execute(f"""
+                    CREATE TABLE {cache_table_name} AS
+                    SELECT * FROM (
+                        SELECT UNNEST($1) as reviewer, UNNEST($2) as url,
+                               UNNEST($3) as reviewed_at, UNNEST($4) as merged_at,
+                               UNNEST($5) as closed_at
+                    )
+                """, [
+                    [r[0] for r in results],
+                    [r[1] for r in results],
+                    [r[2] for r in results],
+                    [r[3] for r in results],
+                    [r[4] for r in results]
+                ])
+                print(f" Cached {len(results)} results to {cache_table_name}")
 
         print(f" Found {len(results)} total PR review records across all agents")
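
A note on the struct-style field access introduced in the rewritten query: DuckDB exposes nested Parquet columns as STRUCT values, so dot notation such as payload.pull_request.html_url takes the place of json_extract_string. Below is a minimal, self-contained sketch of that pattern; the inline struct literals merely stand in for a GHArchive Parquet file, and all names are illustrative.

import duckdb

# Tiny stand-in table with the same nesting as a GHArchive event record.
conn = duckdb.connect()  # in-memory connection, for illustration only
conn.execute("""
    CREATE TABLE events AS SELECT
        'PullRequestReviewEvent' AS type,
        {'login': 'review-bot'} AS actor,
        {'pull_request': {'html_url': 'https://github.com/org/repo/pull/42',
                          'number': 42}} AS payload
""")

# Struct fields are addressed with dot notation, mirroring the query above.
row = conn.execute("""
    SELECT payload.pull_request.html_url AS url,
           CAST(payload.pull_request.number AS INTEGER) AS pr_number
    FROM events
    WHERE type = 'PullRequestReviewEvent'
      AND actor.login = 'review-bot'
""").fetchone()
print(row)  # ('https://github.com/org/repo/pull/42', 42)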
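
The caching branch added in the last hunk materializes the query results by binding Python lists as parameters and unnesting them side by side, one output row per element position. A small sketch of that behaviour, assuming equal-length lists and made-up values:

import duckdb

conn = duckdb.connect()  # in-memory connection, for illustration only

# Parallel UNNEST calls over equal-length list parameters are zipped into rows.
rows = conn.execute(
    "SELECT UNNEST($1) AS reviewer, UNNEST($2) AS url",
    [
        ["alice-bot", "bob-bot"],
        ["https://github.com/org/repo/pull/1", "https://github.com/org/repo/pull/2"],
    ],
).fetchall()
print(rows)
# [('alice-bot', 'https://github.com/org/repo/pull/1'),
#  ('bob-bot', 'https://github.com/org/repo/pull/2')]

In the committed code the five lists are all built from the same result tuples, which keeps the cached columns aligned row by row.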