Simon9 committed
Commit 1fec87c · verified · Parent(s): 301aff9

Update pipeline_full.py

Files changed (1):
  1. pipeline_full.py +196 -139

pipeline_full.py CHANGED
@@ -2,7 +2,7 @@
 import os
 import base64
 from io import BytesIO
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any
 from collections import deque, defaultdict
 
 import numpy as np
@@ -10,6 +10,7 @@ import cv2
 import torch
 from more_itertools import chunked
 from PIL import Image
+from tqdm import tqdm
 
 import supervision as sv
 from inference import get_model
@@ -25,77 +26,106 @@ from sports.annotators.soccer import (
     draw_pitch,
     draw_points_on_pitch,
     draw_pitch_voronoi_diagram,
-    draw_paths_on_pitch
+    draw_paths_on_pitch,
 )
 
-# ------------------------------------
-# Global config and models
-# ------------------------------------
+# ------------------------------------------------------------------
+# Globals will be initialized lazily so build/startup doesn't crash
+# ------------------------------------------------------------------
 
-ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY")
-if not ROBOFLOW_API_KEY:
-    raise RuntimeError("ROBOFLOW_API_KEY must be set in Space secrets.")
-
-PLAYER_DETECTION_MODEL_ID = "football-players-detection-3zvbc/11"
-FIELD_DETECTION_MODEL_ID = "football-field-detection-f07vi/14"
+PLAYER_DETECTION_MODEL = None
+FIELD_DETECTION_MODEL = None
+EMBEDDINGS_MODEL = None
+EMBEDDINGS_PROCESSOR = None
+TEAM_CLASSIFIER = None
+PITCH_CONFIG = None
 
 BALL_ID = 0
 GOALKEEPER_ID = 1
 PLAYER_ID = 2
 REFEREE_ID = 3
 
-PLAYER_DETECTION_MODEL = get_model(
-    model_id=PLAYER_DETECTION_MODEL_ID,
-    api_key=ROBOFLOW_API_KEY
-)
-FIELD_DETECTION_MODEL = get_model(
-    model_id=FIELD_DETECTION_MODEL_ID,
-    api_key=ROBOFLOW_API_KEY
-)
-
-SIGLIP_MODEL_PATH = "google/siglip-base-patch16-224"
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-EMBEDDINGS_MODEL = SiglipVisionModel.from_pretrained(SIGLIP_MODEL_PATH).to(DEVICE)
-EMBEDDINGS_PROCESSOR = AutoProcessor.from_pretrained(SIGLIP_MODEL_PATH)
-
-PITCH_CONFIG = SoccerPitchConfiguration()
-
-TEAM_CLASSIFIER = TeamClassifier(device="cuda")
-
-# ------------------------------------
-# Utility for saving images
-# ------------------------------------
+MODELS_READY = False
+
+
+def ensure_models_loaded():
+    """
+    Lazily load all heavy models and config.
+    Called at the start of run_full_pipeline().
+    """
+    global PLAYER_DETECTION_MODEL, FIELD_DETECTION_MODEL
+    global EMBEDDINGS_MODEL, EMBEDDINGS_PROCESSOR
+    global TEAM_CLASSIFIER, PITCH_CONFIG, MODELS_READY
+
+    if MODELS_READY:
+        return
+
+    roboflow_api_key = os.environ.get("ROBOFLOW_API_KEY")
+    if not roboflow_api_key:
+        raise RuntimeError(
+            "ROBOFLOW_API_KEY env var must be set in the Space secrets "
+            "(Settings → Variables and secrets)."
+        )
+
+    # Roboflow models
+    PLAYER_DETECTION_MODEL_ID = "football-players-detection-3zvbc/11"
+    FIELD_DETECTION_MODEL_ID = "football-field-detection-f07vi/14"
+
+    PLAYER_DETECTION_MODEL = get_model(
+        model_id=PLAYER_DETECTION_MODEL_ID, api_key=roboflow_api_key
+    )
+    FIELD_DETECTION_MODEL = get_model(
+        model_id=FIELD_DETECTION_MODEL_ID, api_key=roboflow_api_key
+    )
+
+    # SigLIP embeddings
+    SIGLIP_MODEL_PATH = "google/siglip-base-patch16-224"
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    EMBEDDINGS_MODEL = SiglipVisionModel.from_pretrained(SIGLIP_MODEL_PATH).to(device)
+    EMBEDDINGS_PROCESSOR = AutoProcessor.from_pretrained(SIGLIP_MODEL_PATH)
+
+    # Pitch + TeamClassifier
+    PITCH_CONFIG = SoccerPitchConfiguration()
+    TEAM_CLASSIFIER = TeamClassifier(device="cuda" if torch.cuda.is_available() else "cpu")
+
+    MODELS_READY = True
+
+
+def get_device():
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
+# -------------------- utility for saving images --------------------
 
 def save_image(path: str, img: np.ndarray) -> None:
     os.makedirs(os.path.dirname(path), exist_ok=True)
-    # supervision uses BGR/ RGB interchangeably; assume RGB here
     if img.ndim == 3 and img.shape[2] == 3:
-        # convert RGB to BGR for cv2
         img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
     else:
         img_bgr = img
     cv2.imwrite(path, img_bgr)
 
-# ------------------------------------
-# 1. Basic frame + detection views
-# ------------------------------------
+
+# -------------------- 1. basic frames & detections --------------------
+
 
 def step_basic_frames(video_path: str, out_dir: str) -> Dict[str, str]:
+    ensure_models_loaded()
+
     frame_generator = sv.get_video_frames_generator(video_path)
     frame = next(frame_generator)
 
-    # Raw frame
     raw_path = os.path.join(out_dir, "frame_raw.png")
     save_image(raw_path, frame)
 
-    # boxes + labels
     box_annotator = sv.BoxAnnotator(
-        color=sv.ColorPalette.from_hex(['#FF8C00', '#00BFFF', '#FF1493', '#FFD700']),
-        thickness=2
+        color=sv.ColorPalette.from_hex(["#FF8C00", "#00BFFF", "#FF1493", "#FFD700"]),
+        thickness=2,
     )
     label_annotator = sv.LabelAnnotator(
-        color=sv.ColorPalette.from_hex(['#FF8C00', '#00BFFF', '#FF1493', '#FFD700']),
-        text_color=sv.Color.from_hex('#000000')
+        color=sv.ColorPalette.from_hex(["#FF8C00", "#00BFFF", "#FF1493", "#FFD700"]),
+        text_color=sv.Color.from_hex("#000000"),
    )
 
     result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
@@ -103,8 +133,7 @@ def step_basic_frames(video_path: str, out_dir: str) -> Dict[str, str]:
 
     labels = [
         f"{class_name} {confidence:.2f}"
-        for class_name, confidence
-        in zip(detections["class_name"], detections.confidence)
+        for class_name, confidence in zip(detections["class_name"], detections.confidence)
     ]
 
     annotated = frame.copy()
@@ -114,16 +143,15 @@ def step_basic_frames(video_path: str, out_dir: str) -> Dict[str, str]:
     boxes_path = os.path.join(out_dir, "frame_boxes_labels.png")
     save_image(boxes_path, annotated)
 
-    # ball vs players using ellipse/triangle
     ellipse_annotator = sv.EllipseAnnotator(
-        color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
-        thickness=2
+        color=sv.ColorPalette.from_hex(["#00BFFF", "#FF1493", "#FFD700"]),
+        thickness=2,
    )
     triangle_annotator = sv.TriangleAnnotator(
-        color=sv.Color.from_hex('#FFD700'),
+        color=sv.Color.from_hex("#FFD700"),
         base=25,
         height=21,
-        outline_thickness=1
+        outline_thickness=1,
     )
 
     result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
@@ -149,21 +177,17 @@ def step_basic_frames(video_path: str, out_dir: str) -> Dict[str, str]:
         "ball_players": ball_players_path,
     }
 
-# ------------------------------------
-# 2. SigLIP embeddings + UMAP + KMeans + Plotly HTML
-# ------------------------------------
+
+# -------------------- 2. SigLIP + UMAP + KMeans + HTML --------------------
+
 
 def step_siglip_clustering(video_path: str, out_dir: str) -> Dict[str, str]:
-    SOURCE_VIDEO_PATH = video_path
-    PLAYER_ID = PLAYER_ID
-    STRIDE = 30
+    ensure_models_loaded()
 
-    frame_generator = sv.get_video_frames_generator(
-        source_path=SOURCE_VIDEO_PATH, stride=STRIDE
-    )
+    stride = 30
+    frame_generator = sv.get_video_frames_generator(source_path=video_path, stride=stride)
 
     crops = []
-    from tqdm import tqdm
     for frame in tqdm(frame_generator, desc="collecting crops (SigLIP)"):
         result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
         detections = sv.Detections.from_inference(result)
@@ -180,9 +204,10 @@ def step_siglip_clustering(video_path: str, out_dir: str) -> Dict[str, str]:
     BATCH_SIZE = 32
     batches = chunked(crops_pil, BATCH_SIZE)
     data = []
+    device = get_device()
     with torch.no_grad():
         for batch in tqdm(batches, desc="embedding extraction"):
-            inputs = EMBEDDINGS_PROCESSOR(images=batch, return_tensors="pt").to(DEVICE)
+            inputs = EMBEDDINGS_PROCESSOR(images=batch, return_tensors="pt").to(device)
             outputs = EMBEDDINGS_MODEL(**inputs)
             embeddings = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
             data.append(embeddings)
@@ -190,27 +215,24 @@ def step_siglip_clustering(video_path: str, out_dir: str) -> Dict[str, str]:
     data = np.concatenate(data)
 
     REDUCER = umap.UMAP(n_components=3)
-    CLUSTERING_MODEL = KMeans(n_clusters=2)
+    CLUSTERING_MODEL = KMeans(n_clusters=2, n_init="auto")
 
     projections = REDUCER.fit_transform(data)
     clusters = CLUSTERING_MODEL.fit_predict(projections)
 
-    # build Plotly 3D + JS same as in notebook
     def pil_image_to_data_uri(image: Image.Image) -> str:
         buffered = BytesIO()
         image.save(buffered, format="PNG")
         img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
         return f"data:image/png;base64,{img_str}"
 
-    image_data_uris = {
-        f"image_{i}": pil_image_to_data_uri(image) for i, image in enumerate(crops_pil)
-    }
+    image_data_uris = {f"image_{i}": pil_image_to_data_uri(img) for i, img in enumerate(crops_pil)}
     image_ids = np.array([f"image_{i}" for i in range(len(crops_pil))])
 
     traces = []
     unique_labels = np.unique(clusters)
-    for unique_label in unique_labels:
-        mask = clusters == unique_label
+    for lbl in unique_labels:
+        mask = clusters == lbl
         customdata_masked = image_ids[mask]
         trace = go.Scatter3d(
             x=projections[mask][:, 0],
@@ -219,11 +241,9 @@ def step_siglip_clustering(video_path: str, out_dir: str) -> Dict[str, str]:
             mode="markers+text",
             text=clusters[mask],
             customdata=customdata_masked,
-            name=str(unique_label),
+            name=str(lbl),
             marker=dict(size=8),
-            hovertemplate=(
-                "<b>class: %{text}</b><br>image ID: %{customdata}<extra></extra>"
-            ),
+            hovertemplate="<b>class: %{text}</b><br>image ID: %{customdata}<extra></extra>",
         )
         traces.append(trace)
 
@@ -307,22 +327,22 @@ def step_siglip_clustering(video_path: str, out_dir: str) -> Dict[str, str]:
 </html>
     """
 
+    os.makedirs(out_dir, exist_ok=True)
     html_path = os.path.join(out_dir, "siglip_clusters.html")
     with open(html_path, "w", encoding="utf-8") as f:
         f.write(html_template)
 
     return {"plot_html": html_path}
 
-# ------------------------------------
-# 3. TeamClassifier training (same logic)
-# ------------------------------------
+
+# -------------------- 3. TeamClassifier training --------------------
+
 
 def train_team_classifier_on_video(video_path: str, stride: int = 30) -> None:
-    frame_generator = sv.get_video_frames_generator(
-        source_path=video_path, stride=stride
-    )
+    ensure_models_loaded()
+
+    frame_generator = sv.get_video_frames_generator(source_path=video_path, stride=stride)
     crops = []
-    from tqdm import tqdm
     for frame in tqdm(frame_generator, desc="collecting crops (TeamClassifier)"):
         result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
         detections = sv.Detections.from_inference(result)
@@ -333,13 +353,11 @@ def train_team_classifier_on_video(video_path: str, stride: int = 30) -> None:
     if crops:
         TEAM_CLASSIFIER.fit(crops)
 
-# ------------------------------------
-# 4. resolve_goalkeepers_team_id – your function
-# ------------------------------------
+
+# -------------------- 4. goalkeeper team resolution --------------------
+
 
-def resolve_goalkeepers_team_id(
-    players: sv.Detections, goalkeepers: sv.Detections
-) -> np.ndarray:
+def resolve_goalkeepers_team_id(players: sv.Detections, goalkeepers: sv.Detections) -> np.ndarray:
     goalkeepers_xy = goalkeepers.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
     players_xy = players.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
     team_0_centroid = players_xy[players.class_id == 0].mean(axis=0)
@@ -351,21 +369,79 @@ def resolve_goalkeepers_team_id(
         goalkeepers_team_id.append(0 if dist_0 < dist_1 else 1)
     return np.array(goalkeepers_team_id)
 
-# ------------------------------------
-# 5. One-frame full annotation + radar + Voronoi etc.
-# ------------------------------------
+
+# -------------------- 5. Voronoi blend helper (your function) --------------------
+
+
+def draw_pitch_voronoi_diagram_2(
+    config: SoccerPitchConfiguration,
+    team_1_xy: np.ndarray,
+    team_2_xy: np.ndarray,
+    team_1_color: sv.Color = sv.Color.RED,
+    team_2_color: sv.Color = sv.Color.WHITE,
+    opacity: float = 0.5,
+    padding: int = 50,
+    scale: float = 0.1,
+    pitch: np.ndarray | None = None,
+) -> np.ndarray:
+    if pitch is None:
+        pitch = draw_pitch(config=config, padding=padding, scale=scale)
+
+    scaled_width = int(config.width * scale)
+    scaled_length = int(config.length * scale)
+
+    voronoi = np.zeros_like(pitch, dtype=np.uint8)
+
+    team_1_color_bgr = np.array(team_1_color.as_bgr(), dtype=np.uint8)
+    team_2_color_bgr = np.array(team_2_color.as_bgr(), dtype=np.uint8)
+
+    y_coordinates, x_coordinates = np.indices((scaled_width + 2 * padding, scaled_length + 2 * padding))
+    y_coordinates -= padding
+    x_coordinates -= padding
+
+    def calculate_distances(xy, x_coordinates, y_coordinates):
+        return np.sqrt(
+            (xy[:, 0][:, None, None] * scale - x_coordinates) ** 2
+            + (xy[:, 1][:, None, None] * scale - y_coordinates) ** 2
+        )
+
+    distances_team_1 = calculate_distances(team_1_xy, x_coordinates, y_coordinates)
+    distances_team_2 = calculate_distances(team_2_xy, x_coordinates, y_coordinates)
+
+    min_distances_team_1 = np.min(distances_team_1, axis=0)
+    min_distances_team_2 = np.min(distances_team_2, axis=0)
+
+    steepness = 15
+    distance_ratio = min_distances_team_2 / np.clip(
+        min_distances_team_1 + min_distances_team_2, a_min=1e-5, a_max=None
+    )
+    blend_factor = np.tanh((distance_ratio - 0.5) * steepness) * 0.5 + 0.5
+
+    for c in range(3):
+        voronoi[:, :, c] = (
+            blend_factor * team_1_color_bgr[c] + (1 - blend_factor) * team_2_color_bgr[c]
+        ).astype(np.uint8)
+
+    overlay = cv2.addWeighted(voronoi, opacity, pitch, 1 - opacity, 0)
+    return overlay
+
+
+# -------------------- 6. single-frame advanced views --------------------
+
 
 def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
+    ensure_models_loaded()
+
     frame_generator = sv.get_video_frames_generator(video_path)
     frame = next(frame_generator)
 
     ellipse_annotator = sv.EllipseAnnotator(
-        color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
+        color=sv.ColorPalette.from_hex(["#00BFFF", "#FF1493", "#FFD700"]),
         thickness=2,
     )
     label_annotator = sv.LabelAnnotator(
-        color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
-        text_color=sv.Color.from_hex('#000000'),
+        color=sv.ColorPalette.from_hex(["#00BFFF", "#FF1493", "#FFD700"]),
+        text_color=sv.Color.from_hex("#000000"),
         text_position=sv.Position.BOTTOM_CENTER,
     )
     triangle_annotator = sv.TriangleAnnotator(
@@ -375,7 +451,6 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     tracker = sv.ByteTrack()
     tracker.reset()
 
-    # detect ball, goalkeeper, player, referee
     result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
     detections = sv.Detections.from_inference(result)
 
@@ -391,9 +466,10 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     referees_detections = all_detections[all_detections.class_id == REFEREE_ID]
 
     players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
-    players_detections.class_id = TEAM_CLASSIFIER.predict(players_crops)
+    if players_crops:
+        players_detections.class_id = TEAM_CLASSIFIER.predict(players_crops)
 
-    if len(goalkeepers_detections) > 0:
+    if len(goalkeepers_detections) > 0 and len(players_detections) > 0:
         goalkeepers_detections.class_id = resolve_goalkeepers_team_id(
             players_detections, goalkeepers_detections
         )
@@ -408,9 +484,7 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     all_detections2.class_id = all_detections2.class_id.astype(int)
 
     annotated_frame = frame.copy()
-    annotated_frame = ellipse_annotator.annotate(
-        scene=annotated_frame, detections=all_detections2
-    )
+    annotated_frame = ellipse_annotator.annotate(scene=annotated_frame, detections=all_detections2)
     annotated_frame = label_annotator.annotate(
         scene=annotated_frame, detections=all_detections2, labels=labels
     )
@@ -418,10 +492,11 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
         scene=annotated_frame, detections=ball_detections
     )
 
+    os.makedirs(out_dir, exist_ok=True)
     annotated_path = os.path.join(out_dir, "frame_advanced.png")
     save_image(annotated_path, annotated_frame)
 
-    # Pitch keypoints, radar, Voronoi, etc. – same as notebook logic
+    # Pitch + radar + Voronoi
     result = FIELD_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
     key_points = sv.KeyPoints.from_inference(result)
 
@@ -429,9 +504,7 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     frame_reference_points = key_points.xy[0][filt]
     pitch_reference_points = np.array(PITCH_CONFIG.vertices)[filt]
 
-    transformer = ViewTransformer(
-        source=frame_reference_points, target=pitch_reference_points
-    )
+    transformer = ViewTransformer(source=frame_reference_points, target=pitch_reference_points)
 
     frame_ball_xy = ball_detections.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
     pitch_ball_xy = transformer.transform_points(points=frame_ball_xy)
@@ -442,7 +515,6 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     referees_xy = referees_detections.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
     pitch_referees_xy = transformer.transform_points(points=referees_xy)
 
-    # radar view
     radar = draw_pitch(PITCH_CONFIG)
     radar = draw_points_on_pitch(
         config=PITCH_CONFIG,
@@ -479,7 +551,6 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     radar_path = os.path.join(out_dir, "radar_view.png")
     save_image(radar_path, radar)
 
-    # Voronoi classic
     vor = draw_pitch(PITCH_CONFIG)
     vor = draw_pitch_voronoi_diagram(
         config=PITCH_CONFIG,
@@ -492,11 +563,8 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
     vor_path = os.path.join(out_dir, "voronoi.png")
     save_image(vor_path, vor)
 
-    # Blended Voronoi (your custom function)
     blended = draw_pitch(
-        config=PITCH_CONFIG,
-        background_color=sv.Color.WHITE,
-        line_color=sv.Color.BLACK,
+        config=PITCH_CONFIG, background_color=sv.Color.WHITE, line_color=sv.Color.BLACK
     )
     blended = draw_pitch_voronoi_diagram_2(
         config=PITCH_CONFIG,
@@ -543,16 +611,12 @@ def step_single_frame_advanced(video_path: str, out_dir: str) -> Dict[str, str]:
         "voronoi_blended": blended_path,
     }
 
-# ------------------------------------
-# 6. Ball path & outlier cleaning – same logic
-# ------------------------------------
+
+# -------------------- 7. ball path & cleaning --------------------
 
-def replace_outliers_based_on_distance(
-    positions: List[np.ndarray], distance_threshold: float
-) -> List[np.ndarray]:
-    from typing import Union
 
-    last_valid_position: Union[np.ndarray, None] = None
+def replace_outliers_based_on_distance(positions: List[np.ndarray], distance_threshold: float) -> List[np.ndarray]:
+    last_valid_position = None
     cleaned_positions: List[np.ndarray] = []
 
     for position in positions:
@@ -572,7 +636,10 @@ def replace_outliers_based_on_distance(
 
     return cleaned_positions
 
+
 def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
+    ensure_models_loaded()
+
     MAXLEN = 5
     MAX_DISTANCE_THRESHOLD = 500
 
@@ -582,8 +649,7 @@ def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
     path_raw: List[np.ndarray] = []
     M = deque(maxlen=MAXLEN)
 
-    from tqdm import tqdm
-    for frame in tqdm(frame_generator, total=video_info.total_frames):
+    for frame in tqdm(frame_generator, total=video_info.total_frames, desc="ball path"):
         result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
         detections = sv.Detections.from_inference(result)
 
@@ -603,9 +669,7 @@ def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
         M.append(transformer.m)
         transformer.m = np.mean(np.array(M), axis=0)
 
-        frame_ball_xy = ball_detections.get_anchors_coordinates(
-            sv.Position.BOTTOM_CENTER
-        )
+        frame_ball_xy = ball_detections.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
         pitch_ball_xy = transformer.transform_points(points=frame_ball_xy)
 
         path_raw.append(pitch_ball_xy)
@@ -618,7 +682,6 @@ def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
 
     path_clean = replace_outliers_based_on_distance(path, MAX_DISTANCE_THRESHOLD)
 
-    # draw raw
     raw_pitch = draw_pitch(PITCH_CONFIG)
     raw_pitch = draw_paths_on_pitch(
         config=PITCH_CONFIG, paths=[path], color=sv.Color.WHITE, pitch=raw_pitch
@@ -626,7 +689,6 @@ def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
     raw_path_img = os.path.join(out_dir, "ball_path_raw.png")
    save_image(raw_path_img, raw_pitch)
 
-    # draw cleaned
     clean_pitch = draw_pitch(PITCH_CONFIG)
     clean_pitch = draw_paths_on_pitch(
         config=PITCH_CONFIG, paths=[path_clean], color=sv.Color.WHITE, pitch=clean_pitch
@@ -634,7 +696,6 @@ def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
     cleaned_path_img = os.path.join(out_dir, "ball_path_cleaned.png")
     save_image(cleaned_path_img, clean_pitch)
 
-    # return coords as simple list for JSON
     coords_clean = [
         coords.tolist() if len(coords) > 0 else [] for coords in path_clean
     ]
@@ -645,11 +706,13 @@ def step_ball_path(video_path: str, out_dir: str) -> Dict[str, Any]:
         "ball_path_cleaned_coords": coords_clean,
     }
 
-# ------------------------------------
-# 7. Stats-only process_video (like your FastAPI helper)
-# ------------------------------------
+
+# -------------------- 8. stats-only process_video --------------------
+
 
 def process_video_stats(video_path: str) -> Dict[str, Any]:
+    ensure_models_loaded()
+
     tracker = sv.ByteTrack()
     tracker.reset()
     stats = {
@@ -675,29 +738,24 @@ def process_video_stats(video_path: str) -> Dict[str, Any]:
     stats["distance_covered"] = dict(stats["distance_covered"])
     return stats
 
-# ------------------------------------
-# 8. Entry point: run full pipeline on a video
-# ------------------------------------
+
+# -------------------- 9. full pipeline entrypoint --------------------
+
 
 def run_full_pipeline(video_path: str, job_dir: str) -> Dict[str, Any]:
+    """
+    Run the full notebook-equivalent pipeline on a video and save all artifacts
+    into job_dir. Returns paths + stats for the FastAPI app.
+    """
+    ensure_models_loaded()
     os.makedirs(job_dir, exist_ok=True)
 
-    # 1) SigLIP & TeamClassifier training as in NB
-    step_siglip_clustering(video_path, os.path.join(job_dir, "siglip"))
+    siglip_out = step_siglip_clustering(video_path, os.path.join(job_dir, "siglip"))
     train_team_classifier_on_video(video_path)
 
-    # 2) Basic visualizations
     basic_paths = step_basic_frames(video_path, os.path.join(job_dir, "frames"))
-
-    # 3) Advanced one-frame analytics (radar, Voronoi, etc.)
-    adv_paths = step_single_frame_advanced(
-        video_path, os.path.join(job_dir, "advanced")
-    )
-
-    # 4) Ball path & heatmap
+    adv_paths = step_single_frame_advanced(video_path, os.path.join(job_dir, "advanced"))
     ball_paths = step_ball_path(video_path, os.path.join(job_dir, "ball_path"))
-
-    # 5) Stats
     stats = process_video_stats(video_path)
 
     return {
@@ -705,6 +763,5 @@ def run_full_pipeline(video_path: str, job_dir: str) -> Dict[str, Any]:
         "advanced": adv_paths,
         "ball": ball_paths,
         "stats": stats,
-        # SigLIP HTML path known: job_dir/siglip/siglip_clusters.html
-        "siglip_html": os.path.join(job_dir, "siglip", "siglip_clusters.html"),
+        "siglip_html": siglip_out["plot_html"],
     }
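
The core change in this commit is that nothing heavy is constructed at import time anymore: models, the SigLIP processor, the pitch config, and the team classifier are created on first use via ensure_models_loaded(). A minimal sketch of how a caller might drive the updated module, assuming pipeline_full.py is importable and ROBOFLOW_API_KEY is set; the handler name, job-id scheme, and "jobs/" directory below are illustrative and not part of the commit:

```python
# Hypothetical caller sketch, not part of pipeline_full.py.
# Assumes: pipeline_full is on PYTHONPATH and ROBOFLOW_API_KEY is configured.
import os
import uuid

import pipeline_full  # cheap to import now; models load lazily on first use


def handle_upload(video_path: str) -> dict:
    # run_full_pipeline() calls ensure_models_loaded() internally, so the first
    # request pays the model-loading cost and later requests reuse the globals.
    job_dir = os.path.join("jobs", uuid.uuid4().hex)  # hypothetical layout
    result = pipeline_full.run_full_pipeline(video_path, job_dir)
    # Per the diff, result includes at least "advanced", "ball", "stats",
    # and "siglip_html"; the values are artifact paths plus a stats dict.
    return result


if __name__ == "__main__":
    print(handle_upload("sample_match.mp4"))  # placeholder video path
```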