zhimin-z committed on
Commit
4f8012e
·
1 Parent(s): f94ac6f
Files changed (1) hide show
  1. msr.py +34 -113
msr.py CHANGED
@@ -691,82 +691,6 @@ def load_agents_from_hf():
691
  return []
692
 
693
 
694
- def load_review_metadata():
695
- """
696
- Load all review metadata from HuggingFace dataset within LEADERBOARD_TIME_FRAME_DAYS.
697
-
698
- Returns:
699
- List of dictionaries with 'agent_identifier' added to each review metadata.
700
- """
701
- # Calculate cutoff date
702
- current_time = datetime.now(timezone.utc)
703
- cutoff_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
704
-
705
- try:
706
- api = HfApi()
707
- token = get_hf_token()
708
-
709
- # List all files in the repository
710
- files = list_repo_files_with_backoff(api=api, repo_id=REVIEW_METADATA_REPO, repo_type="dataset")
711
-
712
- # Filter for JSONL files matching pattern: [agent_identifier]/YYYY.MM.DD.jsonl
713
- time_frame_files = []
714
- for f in files:
715
- if f.endswith('.jsonl'):
716
- parts = f.split('/')
717
- if len(parts) == 2:
718
- filename = parts[1]
719
- # Parse date from filename: YYYY.MM.DD.jsonl
720
- try:
721
- date_part = filename.replace('.jsonl', '')
722
- date_components = date_part.split('.')
723
- if len(date_components) == 3:
724
- file_year, file_month, file_day = map(int, date_components)
725
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
726
-
727
- # Only include files within time frame
728
- if file_date >= cutoff_date:
729
- time_frame_files.append(f)
730
- except Exception:
731
- continue
732
-
733
- print(f"Loading review metadata from last {LEADERBOARD_TIME_FRAME_DAYS} days ({len(time_frame_files)} daily files)...")
734
-
735
- all_metadata = []
736
-
737
- for filename in time_frame_files:
738
- try:
739
- # Extract agent_identifier from path
740
- parts = filename.split('/')
741
- if len(parts) != 2:
742
- continue
743
-
744
- agent_identifier = parts[0]
745
-
746
- file_path = hf_hub_download_with_backoff(
747
- repo_id=REVIEW_METADATA_REPO,
748
- filename=filename,
749
- repo_type="dataset",
750
- token=token
751
- )
752
- day_metadata = load_jsonl(file_path)
753
-
754
- # Add agent_identifier to each review
755
- for review_meta in day_metadata:
756
- review_meta['agent_identifier'] = agent_identifier
757
- all_metadata.append(review_meta)
758
-
759
- except Exception as e:
760
- print(f" Warning: Could not load {filename}: {str(e)}")
761
-
762
- print(f"Loaded {len(all_metadata)} total reviews from last {LEADERBOARD_TIME_FRAME_DAYS} days")
763
- return all_metadata
764
-
765
- except Exception as e:
766
- print(f"Error loading review metadata: {str(e)}")
767
- return []
768
-
769
-
770
  def get_pr_status_from_metadata(review_meta):
771
  """
772
  Derive PR status from merged_at and closed_at fields.
@@ -818,10 +742,14 @@ def calculate_review_stats_from_metadata(metadata_list):
818
  }
819
 
820
 
821
- def calculate_monthly_metrics_by_agent():
822
  """
823
  Calculate monthly metrics for all agents for visualization.
824
 
 
 
 
 
825
  Returns:
826
  dict: {
827
  'agents': list of agent names,
@@ -835,38 +763,33 @@ def calculate_monthly_metrics_by_agent():
835
  }
836
  }
837
  """
838
- # Load agents
839
- agents = load_agents_from_hf()
840
-
841
  # Create mapping from agent_identifier to agent_name
842
  identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
843
 
844
- # Load all review metadata
845
- all_metadata = load_review_metadata()
846
-
847
- if not all_metadata:
848
  return {'agents': [], 'months': [], 'data': {}}
849
 
850
  # Group by agent and month
851
  agent_month_data = defaultdict(lambda: defaultdict(list))
852
 
853
- for review_meta in all_metadata:
854
- agent_identifier = review_meta.get('agent_identifier')
855
- reviewed_at = review_meta.get('reviewed_at')
 
856
 
857
- if not agent_identifier or not reviewed_at:
858
- continue
859
 
860
- # Get agent_name from identifier
861
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
862
 
863
- try:
864
- dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
865
- month_key = f"{dt.year}-{dt.month:02d}"
866
- agent_month_data[agent_name][month_key].append(review_meta)
867
- except Exception as e:
868
- print(f"Warning: Could not parse date '{reviewed_at}': {e}")
869
- continue
870
 
871
  # Get all unique months and sort them
872
  all_months = set()
@@ -918,26 +841,24 @@ def calculate_monthly_metrics_by_agent():
918
  }
919
 
920
 
921
- def construct_leaderboard_from_metadata():
922
  """
923
- Construct leaderboard from stored review metadata.
 
 
 
 
924
 
925
  Returns:
926
  Dictionary of agent stats.
927
  """
928
  print("Constructing leaderboard from review metadata...")
929
 
930
- # Load agents
931
- agents = load_agents_from_hf()
932
  if not agents:
933
  print("No agents found")
934
  return {}
935
 
936
- print(f"Loaded {len(agents)} agents")
937
-
938
- # Load all review metadata
939
- all_metadata = load_review_metadata()
940
- print(f"Loaded {len(all_metadata)} review metadata entries")
941
 
942
  cache_dict = {}
943
 
@@ -945,8 +866,8 @@ def construct_leaderboard_from_metadata():
945
  identifier = agent.get('github_identifier')
946
  agent_name = agent.get('name', 'Unknown')
947
 
948
- # Filter metadata for this agent
949
- bot_metadata = [review for review in all_metadata if review.get("agent_identifier") == identifier]
950
 
951
  # Calculate stats
952
  stats = calculate_review_stats_from_metadata(bot_metadata)
@@ -1095,12 +1016,12 @@ def mine_all_agents():
1095
  print(f"{'='*80}\n")
1096
 
1097
  try:
1098
- # Construct leaderboard
1099
- leaderboard_dict = construct_leaderboard_from_metadata()
1100
 
1101
- # Calculate monthly metrics
1102
  print(f"Calculating monthly metrics...")
1103
- monthly_metrics = calculate_monthly_metrics_by_agent()
1104
 
1105
  # Save to HuggingFace
1106
  print(f"Saving leaderboard data to HuggingFace...")
 
691
  return []
692
 
693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
  def get_pr_status_from_metadata(review_meta):
695
  """
696
  Derive PR status from merged_at and closed_at fields.
 
742
  }
743
 
744
 
745
+ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
746
  """
747
  Calculate monthly metrics for all agents for visualization.
748
 
749
+ Args:
750
+ all_metadata_dict: Dictionary mapping agent identifier (github_identifier) to a list of review metadata dicts
751
+ agents: List of agent dictionaries with metadata
752
+
753
  Returns:
754
  dict: {
755
  'agents': list of agent names,
 
763
  }
764
  }
765
  """
 
 
 
766
  # Create mapping from agent_identifier to agent_name
767
  identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
768
 
769
+ if not all_metadata_dict:
 
 
 
770
  return {'agents': [], 'months': [], 'data': {}}
771
 
772
  # Group by agent and month
773
  agent_month_data = defaultdict(lambda: defaultdict(list))
774
 
775
+ # Iterate each agent's review list directly; the dict key supplies the agent_identifier
776
+ for agent_identifier, metadata_list in all_metadata_dict.items():
777
+ for review_meta in metadata_list:
778
+ reviewed_at = review_meta.get('reviewed_at')
779
 
780
+ if not reviewed_at:
781
+ continue
782
 
783
+ # Get agent_name from identifier
784
+ agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
785
 
786
+ try:
787
+ dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
788
+ month_key = f"{dt.year}-{dt.month:02d}"
789
+ agent_month_data[agent_name][month_key].append(review_meta)
790
+ except Exception as e:
791
+ print(f"Warning: Could not parse date '{reviewed_at}': {e}")
792
+ continue
793
 
794
  # Get all unique months and sort them
795
  all_months = set()
 
841
  }
842
 
843
 
844
+ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
845
  """
846
+ Construct leaderboard from in-memory review metadata.
847
+
848
+ Args:
849
+ all_metadata_dict: Dictionary mapping agent identifier (github_identifier) to a list of review metadata dicts
850
+ agents: List of agent dictionaries with metadata
851
 
852
  Returns:
853
  Dictionary of agent stats.
854
  """
855
  print("Constructing leaderboard from review metadata...")
856
 
 
 
857
  if not agents:
858
  print("No agents found")
859
  return {}
860
 
861
+ print(f"Processing {len(agents)} agents")
 
 
 
 
862
 
863
  cache_dict = {}
864
 
 
866
  identifier = agent.get('github_identifier')
867
  agent_name = agent.get('name', 'Unknown')
868
 
869
+ # Get metadata for this agent from the dictionary
870
+ bot_metadata = all_metadata_dict.get(identifier, [])
871
 
872
  # Calculate stats
873
  stats = calculate_review_stats_from_metadata(bot_metadata)
 
1016
  print(f"{'='*80}\n")
1017
 
1018
  try:
1019
+ # Construct leaderboard from in-memory data
1020
+ leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
1021
 
1022
+ # Calculate monthly metrics from in-memory data
1023
  print(f"Calculating monthly metrics...")
1024
+ monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
1025
 
1026
  # Save to HuggingFace
1027
  print(f"Saving leaderboard data to HuggingFace...")