Commit 2a623ac (parent ab96cfe)

error message handling
Changed files:
- .kiro/specs/hf-eda-mcp-server/tasks.md        +2   -2
- src/hf_eda_mcp/error_handling.py              +445 -0
- src/hf_eda_mcp/integrations/hf_client.py      +62  -12
- src/hf_eda_mcp/services/dataset_service.py    +110 -16
- src/hf_eda_mcp/tools/__init__.py              +7   -5
- src/hf_eda_mcp/tools/analysis.py              +48  -63
- src/hf_eda_mcp/tools/metadata.py              +45  -53
- src/hf_eda_mcp/tools/sampling.py              +58  -79
- src/hf_eda_mcp/validation.py                  +358 -0
.kiro/specs/hf-eda-mcp-server/tasks.md CHANGED

@@ -58,13 +58,13 @@
     - _Requirements: 4.1, 4.2, 4.4_

 - [ ] 5. Implement error handling and validation

-  - [ ] 5.1 Add input validation for all tools
+  - [x] 5.1 Add input validation for all tools
     - Validate dataset identifiers and configuration names
     - Check split names and sample size parameters
     - Provide helpful error messages for invalid inputs
     - _Requirements: 1.2, 2.1_

-  - [ ] 5.2 Implement comprehensive error handling
+  - [x] 5.2 Implement comprehensive error handling
     - Handle dataset not found errors with suggestions
     - Manage authentication errors for private datasets
     - Add retry logic for network and API failures
src/hf_eda_mcp/error_handling.py ADDED

@@ -0,0 +1,445 @@
"""
Comprehensive error handling utilities for hf-eda-mcp.

This module provides error handling utilities including retry logic,
error suggestions, and formatted error responses for better user experience.
"""

import logging
import time
import functools
from typing import Optional, Callable, Any, List, Dict, TypeVar, cast
from requests.exceptions import RequestException, ConnectionError, Timeout, HTTPError

logger = logging.getLogger(__name__)

# Type variable for generic function return types
T = TypeVar('T')


class RetryConfig:
    """Configuration for retry logic."""

    def __init__(
        self,
        max_attempts: int = 3,
        initial_delay: float = 1.0,
        max_delay: float = 30.0,
        exponential_base: float = 2.0,
        jitter: bool = True
    ):
        """
        Initialize retry configuration.

        Args:
            max_attempts: Maximum number of retry attempts
            initial_delay: Initial delay between retries in seconds
            max_delay: Maximum delay between retries in seconds
            exponential_base: Base for exponential backoff
            jitter: Whether to add random jitter to delays
        """
        self.max_attempts = max_attempts
        self.initial_delay = initial_delay
        self.max_delay = max_delay
        self.exponential_base = exponential_base
        self.jitter = jitter


# Default retry configuration
DEFAULT_RETRY_CONFIG = RetryConfig(
    max_attempts=3,
    initial_delay=1.0,
    max_delay=30.0,
    exponential_base=2.0,
    jitter=True
)


def calculate_retry_delay(attempt: int, config: RetryConfig) -> float:
    """
    Calculate delay for retry attempt using exponential backoff.

    Args:
        attempt: Current attempt number (0-indexed)
        config: Retry configuration

    Returns:
        Delay in seconds
    """
    delay = min(
        config.initial_delay * (config.exponential_base ** attempt),
        config.max_delay
    )

    # Add jitter to prevent thundering herd
    if config.jitter:
        import random
        delay = delay * (0.5 + random.random())

    return delay


def should_retry_error(error: Exception) -> bool:
    """
    Determine if an error should trigger a retry.

    Args:
        error: Exception to check

    Returns:
        True if error is retryable, False otherwise
    """
    # Network errors are retryable
    if isinstance(error, (ConnectionError, Timeout)):
        return True

    # HTTP errors with specific status codes are retryable
    if isinstance(error, HTTPError):
        # Retry on 5xx server errors and 429 rate limiting
        if hasattr(error, 'response') and error.response is not None:
            status_code = error.response.status_code
            return status_code >= 500 or status_code == 429

    # Generic request exceptions might be retryable
    if isinstance(error, RequestException):
        # Check if it's a connection-related issue
        error_str = str(error).lower()
        retryable_keywords = ['timeout', 'connection', 'network', 'temporary']
        return any(keyword in error_str for keyword in retryable_keywords)

    # Don't retry other errors by default
    return False


def retry_with_backoff(
    func: Optional[Callable[..., T]] = None,
    *,
    config: Optional[RetryConfig] = None,
    retryable_exceptions: Optional[tuple] = None
) -> Callable[..., T]:
    """
    Decorator to retry a function with exponential backoff.

    Args:
        func: Function to decorate (when used without arguments)
        config: Retry configuration (uses default if not provided)
        retryable_exceptions: Tuple of exception types to retry on

    Returns:
        Decorated function with retry logic

    Example:
        @retry_with_backoff
        def fetch_data():
            # ... network call ...
            pass

        @retry_with_backoff(config=RetryConfig(max_attempts=5))
        def fetch_with_custom_config():
            # ... network call ...
            pass
    """
    if config is None:
        config = DEFAULT_RETRY_CONFIG

    if retryable_exceptions is None:
        retryable_exceptions = (ConnectionError, Timeout, RequestException)

    def decorator(f: Callable[..., T]) -> Callable[..., T]:
        @functools.wraps(f)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            last_exception: Optional[Exception] = None

            for attempt in range(config.max_attempts):
                try:
                    return f(*args, **kwargs)

                except retryable_exceptions as e:
                    last_exception = e

                    # Check if we should retry this specific error
                    if not should_retry_error(e):
                        logger.warning(f"Error is not retryable: {e}")
                        raise

                    # Don't sleep after the last attempt
                    if attempt < config.max_attempts - 1:
                        delay = calculate_retry_delay(attempt, config)
                        logger.warning(
                            f"Attempt {attempt + 1}/{config.max_attempts} failed: {e}. "
                            f"Retrying in {delay:.2f}s..."
                        )
                        time.sleep(delay)
                    else:
                        logger.error(
                            f"All {config.max_attempts} attempts failed. Last error: {e}"
                        )

                except Exception as e:
                    # Non-retryable exception, raise immediately
                    logger.error(f"Non-retryable error occurred: {e}")
                    raise

            # If we get here, all retries failed
            if last_exception:
                raise last_exception
            else:
                raise RuntimeError("Retry logic failed without capturing exception")

        return cast(Callable[..., T], wrapper)

    # Support both @retry_with_backoff and @retry_with_backoff()
    if func is None:
        return decorator
    else:
        return decorator(func)


def get_dataset_suggestions(dataset_id: str) -> List[str]:
    """
    Generate helpful suggestions for dataset not found errors.

    Args:
        dataset_id: The dataset identifier that was not found

    Returns:
        List of suggestion strings
    """
    suggestions = []

    # Check for common typos or formatting issues
    if " " in dataset_id:
        suggestions.append(
            f"Dataset ID contains spaces. Try: '{dataset_id.replace(' ', '-')}' or '{dataset_id.replace(' ', '_')}'"
        )

    if dataset_id.isupper():
        suggestions.append(
            f"Dataset ID is all uppercase. Try lowercase: '{dataset_id.lower()}'"
        )

    # Check if it looks like it might be missing organization prefix
    if "/" not in dataset_id:
        suggestions.append(
            f"Dataset might need an organization prefix. Try searching for: 'organization/{dataset_id}'"
        )

    # General suggestions
    suggestions.extend([
        "Verify the dataset exists on HuggingFace Hub: https://huggingface.co/datasets",
        f"Search for similar datasets: https://huggingface.co/datasets?search={dataset_id}",
        "Check if the dataset name is spelled correctly",
        "Ensure you have access if the dataset is private or gated"
    ])

    return suggestions


def format_authentication_error(
    dataset_id: str,
    is_gated: bool = False,
    has_token: bool = False
) -> Dict[str, Any]:
    """
    Format authentication error with helpful guidance.

    Args:
        dataset_id: The dataset identifier
        is_gated: Whether the dataset is gated (requires approval)
        has_token: Whether a token was provided

    Returns:
        Dictionary with error details and suggestions
    """
    error_details = {
        "error_type": "authentication_error",
        "dataset_id": dataset_id,
        "is_gated": is_gated,
        "has_token": has_token,
        "message": "",
        "suggestions": []
    }

    if is_gated:
        error_details["message"] = (
            f"Dataset '{dataset_id}' is gated and requires approval to access."
        )
        error_details["suggestions"] = [
            f"Request access to the dataset: https://huggingface.co/datasets/{dataset_id}",
            "Wait for approval from the dataset owner",
            "Provide a valid HuggingFace token after receiving access",
            "Check your HuggingFace account for access status"
        ]
    elif not has_token:
        error_details["message"] = (
            f"Dataset '{dataset_id}' is private and requires authentication."
        )
        error_details["suggestions"] = [
            "Provide a HuggingFace authentication token",
            "Create a token at: https://huggingface.co/settings/tokens",
            "Set the token in your environment: HF_TOKEN=your_token",
            "Ensure the token has read access to datasets"
        ]
    else:
        error_details["message"] = (
            f"Authentication failed for dataset '{dataset_id}'. "
            "Your token may not have access to this dataset."
        )
        error_details["suggestions"] = [
            "Verify your token is valid and not expired",
            "Check if your token has the required permissions",
            "Ensure you have been granted access to this private dataset",
            "Try regenerating your token at: https://huggingface.co/settings/tokens"
        ]

    return error_details


def format_network_error(
    error: Exception,
    operation: str = "operation"
) -> Dict[str, Any]:
    """
    Format network error with helpful guidance.

    Args:
        error: The network exception
        operation: Description of the operation that failed

    Returns:
        Dictionary with error details and suggestions
    """
    error_details = {
        "error_type": "network_error",
        "operation": operation,
        "message": f"Network error during {operation}: {str(error)}",
        "suggestions": []
    }

    # Determine specific error type and provide targeted suggestions
    if isinstance(error, Timeout):
        error_details["error_subtype"] = "timeout"
        error_details["suggestions"] = [
            "The request timed out. Try again in a moment",
            "Check your internet connection",
            "The HuggingFace Hub might be experiencing high load",
            "Try with a smaller sample size or different dataset"
        ]
    elif isinstance(error, ConnectionError):
        error_details["error_subtype"] = "connection"
        error_details["suggestions"] = [
            "Unable to connect to HuggingFace Hub",
            "Check your internet connection",
            "Verify you can access https://huggingface.co",
            "Check if you're behind a firewall or proxy",
            "Try again in a few moments"
        ]
    else:
        error_details["error_subtype"] = "general"
        error_details["suggestions"] = [
            "A network error occurred. Please try again",
            "Check your internet connection",
            "The HuggingFace Hub might be temporarily unavailable",
            "Try again in a few moments"
        ]

    return error_details


def format_error_response(
    error: Exception,
    context: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Format any error into a structured response with helpful information.

    Args:
        error: The exception to format
        context: Optional context information (dataset_id, operation, etc.)

    Returns:
        Dictionary with formatted error information
    """
    from hf_eda_mcp.integrations.hf_client import (
        DatasetNotFoundError,
        AuthenticationError,
        NetworkError
    )

    context = context or {}

    # Handle specific error types
    if isinstance(error, DatasetNotFoundError):
        dataset_id = context.get("dataset_id", "unknown")
        return {
            "error_type": "dataset_not_found",
            "message": str(error),
            "dataset_id": dataset_id,
            "suggestions": get_dataset_suggestions(dataset_id)
        }

    elif isinstance(error, AuthenticationError):
        dataset_id = context.get("dataset_id", "unknown")
        is_gated = "gated" in str(error).lower()
        has_token = context.get("has_token", False)
        return format_authentication_error(dataset_id, is_gated, has_token)

    elif isinstance(error, NetworkError):
        operation = context.get("operation", "operation")
        # Extract the original exception if available
        original_error = error.__cause__ or error
        return format_network_error(original_error, operation)

    elif isinstance(error, (ConnectionError, Timeout, RequestException)):
        operation = context.get("operation", "operation")
        return format_network_error(error, operation)

    elif isinstance(error, ValueError):
        return {
            "error_type": "validation_error",
            "message": str(error),
            "suggestions": [
                "Check that all input parameters are valid",
                "Refer to the tool documentation for parameter requirements"
            ]
        }

    else:
        # Generic error
        return {
            "error_type": "unknown_error",
            "message": f"An unexpected error occurred: {str(error)}",
            "error_class": type(error).__name__,
            "suggestions": [
                "Try the operation again",
                "Check the logs for more details",
                "If the problem persists, report it as an issue"
            ]
        }


def log_error_with_context(
    error: Exception,
    context: Optional[Dict[str, Any]] = None,
    level: int = logging.ERROR
) -> None:
    """
    Log an error with contextual information.

    Args:
        error: The exception to log
        context: Optional context information
        level: Logging level (default: ERROR)
    """
    context = context or {}

    # Build context string
    context_parts = [f"{k}={v}" for k, v in context.items()]
    context_str = ", ".join(context_parts) if context_parts else "no context"

    # Log with full details
    logger.log(
        level,
        f"Error occurred: {type(error).__name__}: {str(error)} | Context: {context_str}",
        exc_info=True
    )
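As a quick sanity check of the backoff schedule and the two decorator forms above, here is a minimal usage sketch; the `flaky_call` function is hypothetical and the URL is illustrative only:

import requests

from hf_eda_mcp.error_handling import (
    RetryConfig,
    calculate_retry_delay,
    retry_with_backoff,
)

# Un-jittered, delays grow as initial_delay * exponential_base ** attempt,
# capped at max_delay: attempts 0..5 give 1s, 2s, 4s, 8s, 16s, 30s.
cfg = RetryConfig(max_attempts=4, jitter=False)
print([calculate_retry_delay(a, cfg) for a in range(4)])  # [1.0, 2.0, 4.0, 8.0]

@retry_with_backoff(config=cfg)
def flaky_call() -> int:
    # A transient ConnectionError/Timeout raised here is retried up to 4 times.
    # A 404 raises HTTPError (a RequestException subclass), which
    # should_retry_error classifies as non-retryable, so it fails fast.
    resp = requests.get("https://huggingface.co/api/datasets/squad", timeout=5)
    resp.raise_for_status()
    return resp.status_code

With jitter enabled (the default), each delay is scaled by a random factor in [0.5, 1.5), which spreads out concurrent retries.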
src/hf_eda_mcp/integrations/hf_client.py CHANGED

@@ -11,6 +11,14 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
 from requests.exceptions import RequestException, ConnectionError, Timeout

+from hf_eda_mcp.error_handling import (
+    retry_with_backoff,
+    RetryConfig,
+    format_error_response,
+    log_error_with_context,
+    get_dataset_suggestions
+)
+
 logger = logging.getLogger()


@@ -80,11 +88,15 @@ class HfClient:
                 f"Failed to authenticate with HuggingFace Hub: {str(e)}"
             )

+    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
     def get_dataset_info(
         self, dataset_id: str, config_name: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         Retrieve comprehensive dataset information from HuggingFace Hub.
+
+        This method includes automatic retry logic with exponential backoff
+        for transient network errors.

         Args:
             dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')

@@ -102,6 +114,8 @@ class HfClient:
             AuthenticationError: If dataset is private and authentication fails
             NetworkError: If network request fails
         """
+        context = {"dataset_id": dataset_id, "config_name": config_name, "operation": "get_dataset_info"}
+
         try:
             # Get dataset info from HuggingFace Hub
             dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")

@@ -188,24 +202,60 @@ class HfClient:

             return metadata

-        except RepositoryNotFoundError:
+        except RepositoryNotFoundError as e:
+            log_error_with_context(e, context, level=logging.WARNING)
+            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
+            suggestions = get_dataset_suggestions(dataset_id)
+            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
+            raise DatasetNotFoundError(error_msg)
+
+        except GatedRepoError as e:
+            log_error_with_context(e, context, level=logging.WARNING)
+            is_gated = True
+            has_token = self.token is not None
+
+            if is_gated:
+                error_msg = (
+                    f"Dataset '{dataset_id}' is gated and requires approval. "
+                    f"Request access at: https://huggingface.co/datasets/{dataset_id}"
+                )
+            else:
+                error_msg = (
+                    f"Dataset '{dataset_id}' is private. "
+                    "Please provide a valid authentication token."
+                )
+
+            logger.info(f"Authentication required for '{dataset_id}': has_token={has_token}, is_gated={is_gated}")
+            raise AuthenticationError(error_msg)
+
         except (ConnectionError, Timeout) as e:
+            log_error_with_context(e, context)
+            # Let retry decorator handle these - if we get here, all retries failed
+            raise NetworkError(
+                f"Network error while fetching dataset info after retries: {str(e)}"
+            ) from e
+
         except RequestException as e:
+            log_error_with_context(e, context)
+            # Check if it's a retryable error
+            if hasattr(e, 'response') and e.response is not None:
+                status_code = e.response.status_code
+                if status_code == 429:
+                    raise NetworkError(
+                        "Rate limit exceeded. Please try again later."
+                    ) from e
+                elif status_code >= 500:
+                    raise NetworkError(
+                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
+                    ) from e
+            raise NetworkError(f"Request failed: {str(e)}") from e
+
         except Exception as e:
+            log_error_with_context(e, context)
             logger.error(
                 f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
             )
-            raise HfClientError(f"Failed to get dataset info: {str(e)}")
+            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

     def list_dataset_configs(self, dataset_id: str) -> List[str]:
         """
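For reference, a minimal sketch of how a caller might consume the typed exceptions raised by get_dataset_info above; HfClient construction arguments are elided and the dataset id and printed fields are illustrative assumptions:

from hf_eda_mcp.integrations.hf_client import (
    AuthenticationError,
    DatasetNotFoundError,
    HfClient,
    NetworkError,
)

client = HfClient()
try:
    info = client.get_dataset_info("squad")
    print(info.get("description", ""))
except DatasetNotFoundError as e:
    # Suggestions were already logged via get_dataset_suggestions
    print(f"Not found: {e}")
except AuthenticationError as e:
    # Gated vs. private guidance is baked into the message
    print(f"Access denied: {e}")
except NetworkError as e:
    # Raised only after the retry decorator has exhausted its attempts
    print(f"Network failure: {e}")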
src/hf_eda_mcp/services/dataset_service.py CHANGED

@@ -14,7 +14,19 @@ from pathlib import Path
 from datasets import load_dataset
 from datasets.utils.logging import disable_progress_bar

-from hf_eda_mcp.integrations.hf_client import
+from hf_eda_mcp.integrations.hf_client import (
+    HfClient,
+    HfClientError,
+    DatasetNotFoundError,
+    AuthenticationError,
+    NetworkError
+)
+from hf_eda_mcp.error_handling import (
+    retry_with_backoff,
+    RetryConfig,
+    log_error_with_context,
+    format_error_response
+)

 logger = logging.getLogger(__name__)

@@ -128,6 +140,9 @@ class DatasetService:
         """
         Load dataset information from HuggingFace Hub with caching.

+        Includes automatic retry logic for transient failures and comprehensive
+        error handling with helpful suggestions.
+
         Args:
             dataset_id: HuggingFace dataset identifier
             config_name: Optional configuration name

@@ -138,7 +153,14 @@ class DatasetService:
         Raises:
             DatasetNotFoundError: If dataset doesn't exist
             AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network operations fail after retries
         """
+        context = {
+            "dataset_id": dataset_id,
+            "config_name": config_name,
+            "operation": "load_dataset_info"
+        }
+
         cache_key = self._get_cache_key(dataset_id, config_name)
         cache_file = self.metadata_cache_dir / f"{cache_key}.json"

@@ -148,7 +170,7 @@ class DatasetService:
             logger.debug(f"Using cached metadata for {dataset_id}")
             return cached_data

-        # Fetch from HuggingFace Hub
+        # Fetch from HuggingFace Hub with retry logic
         try:
             logger.info(f"Fetching metadata for dataset: {dataset_id}")
             metadata = self.hf_client.get_dataset_info(dataset_id, config_name)

@@ -156,15 +178,29 @@ class DatasetService:
             # Add cache timestamp
             metadata['_cached_at'] = time.time()

-            # Save to cache
+            # Save to cache (don't fail if caching fails)
+            try:
+                self._save_to_cache(cache_file, metadata)
+            except CacheError as e:
+                logger.warning(f"Failed to cache metadata, continuing anyway: {e}")

             return metadata

-        except
-            # Re-raise
+        except (DatasetNotFoundError, AuthenticationError, NetworkError):
+            # Re-raise these specific errors with context
+            log_error_with_context(
+                Exception(f"Failed to load dataset info for {dataset_id}"),
+                context,
+                level=logging.WARNING
+            )
             raise
+
+        except Exception as e:
+            # Unexpected error
+            log_error_with_context(e, context)
+            raise DatasetServiceError(f"Unexpected error loading dataset info: {str(e)}") from e

+    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
     def load_dataset_sample(
         self,
         dataset_id: str,

@@ -176,6 +212,9 @@ class DatasetService:
         """
         Load samples from the specified dataset with caching.

+        Includes automatic retry logic for transient failures and comprehensive
+        error handling.
+
         Args:
             dataset_id: HuggingFace dataset identifier
             split: Dataset split to sample from

@@ -188,8 +227,18 @@ class DatasetService:

         Raises:
             DatasetNotFoundError: If dataset or split doesn't exist
+            AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network operations fail after retries
+            DatasetServiceError: If sampling fails for other reasons
         """
+        context = {
+            "dataset_id": dataset_id,
+            "split": split,
+            "num_samples": num_samples,
+            "config_name": config_name,
+            "operation": "load_dataset_sample"
+        }
+
         # For small samples, check cache first
         if num_samples <= 100:  # Only cache small samples
             cache_key = self._get_sample_cache_key(dataset_id, split, num_samples, config_name)

@@ -208,7 +257,8 @@ class DatasetService:
                 dataset_id,
                 name=config_name,
                 split=split,
-                streaming=streaming
+                streaming=streaming,
+                token=self.hf_client.token
             )

             # Take the requested number of samples

@@ -240,21 +290,65 @@ class DatasetService:
                 '_sampled_at': time.time()
             }

-            # Cache small samples
+            # Cache small samples (don't fail if caching fails)
             if num_samples <= 100:
                 try:
                     self._save_to_cache(cache_file, sample_data)
-                except CacheError:
-                    pass
+                except CacheError as e:
+                    logger.warning(f"Failed to cache sample, continuing anyway: {e}")

             return sample_data

+        except DatasetNotFoundError:
+            # Re-raise as-is
+            log_error_with_context(
+                Exception(f"Dataset or split not found: {dataset_id}/{split}"),
+                context,
+                level=logging.WARNING
+            )
+            raise
+
+        except AuthenticationError:
+            # Re-raise as-is
+            log_error_with_context(
+                Exception(f"Authentication failed for dataset: {dataset_id}"),
+                context,
+                level=logging.WARNING
+            )
+            raise
+
         except Exception as e:
+            log_error_with_context(e, context)
+
+            # Try to provide more specific error messages
+            error_str = str(e).lower()
+
+            if "not found" in error_str or "doesn't exist" in error_str:
+                if "split" in error_str or split in error_str:
+                    raise DatasetNotFoundError(
+                        f"Split '{split}' not found in dataset '{dataset_id}'. "
+                        f"Available splits may be different."
+                    ) from e
+                else:
+                    raise DatasetNotFoundError(
+                        f"Dataset '{dataset_id}' not found on HuggingFace Hub."
+                    ) from e
+
+            elif "gated" in error_str or "private" in error_str or "authentication" in error_str:
+                raise AuthenticationError(
+                    f"Authentication required for dataset '{dataset_id}'. "
+                    "Please provide a valid HuggingFace token."
+                ) from e
+
+            elif "timeout" in error_str or "connection" in error_str:
+                raise NetworkError(
+                    f"Network error while loading dataset sample: {str(e)}"
+                ) from e
+
+            else:
+                raise DatasetServiceError(
+                    f"Failed to load dataset sample: {str(e)}"
+                ) from e

     def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
         """
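The structured responses produced by format_error_response, which the tools below log as guidance, are plain dicts. A sketch of the shape for a not-found error, derived from the handler in error_handling.py (the misspelled dataset id is illustrative, and constructing DatasetNotFoundError directly from a message assumes the usual Exception subclass):

from hf_eda_mcp.error_handling import format_error_response
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError

err = DatasetNotFoundError("Dataset 'sqaud' not found on HuggingFace Hub.")
response = format_error_response(err, context={"dataset_id": "sqaud"})
# response == {
#     "error_type": "dataset_not_found",
#     "message": "Dataset 'sqaud' not found on HuggingFace Hub.",
#     "dataset_id": "sqaud",
#     "suggestions": [...],  # from get_dataset_suggestions("sqaud")
# }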
src/hf_eda_mcp/tools/__init__.py CHANGED

@@ -4,14 +4,17 @@ EDA tools module for HuggingFace datasets.
 This package contains individual EDA functions that will be exposed as MCP tools.
 """

-from .metadata import get_dataset_metadata
-from .sampling import
+from .metadata import get_dataset_metadata
+from .sampling import (
+    get_dataset_sample,
+    get_dataset_sample_with_indices,
+    get_available_splits,
+)
+from .analysis import analyze_dataset_features

 __all__ = [
     # Metadata tools
     'get_dataset_metadata',
-    'validate_dataset_metadata_inputs',

     # Sampling tools
     'get_dataset_sample',

@@ -20,5 +23,4 @@ __all__ = [

     # Analysis tools
     'analyze_dataset_features',
-    'validate_analysis_inputs'
 ]
src/hf_eda_mcp/tools/analysis.py CHANGED

@@ -11,7 +11,16 @@ from typing import Optional, Dict, Any, List
 from collections import Counter
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    validate_split_name,
+    validate_sample_size,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context

 logger = logging.getLogger(__name__)

@@ -84,9 +93,24 @@ def analyze_dataset_features(
     >>> quality = analysis['data_quality']
     >>> print(f"Overall quality score: {quality['quality_score']:.2f}")
     """
-    # Input validation
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+        split = validate_split_name(split)
+        sample_size = validate_sample_size(sample_size, "sample_size")
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "split": split,
+        "sample_size": sample_size,
+        "config_name": config_name,
+        "operation": "analyze_dataset_features"
+    }
+
     logger.info(
         f"Analyzing features for dataset: {dataset_id}, split: {split}, "
         f"sample_size: {sample_size}"

@@ -153,12 +177,28 @@ def analyze_dataset_features(
         )
         return analysis_result

-    except
+    except DatasetNotFoundError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset/split not found suggestions: {error_response.get('suggestions', [])}")
+        raise
+
+    except AuthenticationError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
         raise
+
     except Exception as e:
-        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e


 def _analyze_single_feature(

@@ -495,58 +535,3 @@ def _generate_analysis_summary(
         summary_parts.append(f"Avg missing: {avg_missing:.1f}%")

     return " | ".join(summary_parts)
-
-
-def validate_analysis_inputs(
-    dataset_id: str, split: str, sample_size: int, config_name: Optional[str] = None
-) -> None:
-    """
-    Validate inputs for dataset analysis.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        split: Split name to validate
-        sample_size: Sample size to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If any input is invalid
-    """
-    # Validate dataset_id
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Validate split
-    if not split or not isinstance(split, str):
-        raise ValueError("split must be a non-empty string")
-
-    split = split.strip()
-    if not split:
-        raise ValueError("split cannot be empty or whitespace")
-
-    # Validate sample_size
-    if not isinstance(sample_size, int):
-        raise ValueError("sample_size must be an integer")
-
-    if sample_size <= 0:
-        raise ValueError("sample_size must be positive")
-
-    # Get max sample size from config
-    config = get_config()
-    max_sample_size = config.max_sample_size
-
-    if sample_size > max_sample_size:
-        raise ValueError(f"sample_size cannot exceed {max_sample_size}")
-
-    # Validate config_name
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
src/hf_eda_mcp/tools/metadata.py CHANGED

@@ -9,7 +9,14 @@ import logging
 from typing import Optional, Dict, Any
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context

 logger = logging.getLogger(__name__)

@@ -72,18 +79,19 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     >>> metadata = get_dataset_metadata("glue", config_name="cola")
     >>> print(f"Config: {metadata.get('config_name', 'default')}")
     """
-    # Input validation
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "config_name": config_name,
+        "operation": "get_dataset_metadata"
+    }

     logger.info(f"Retrieving metadata for dataset: {dataset_id}" +
                 (f", config: {config_name}" if config_name else ""))

@@ -115,12 +123,31 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
         logger.info(f"Successfully retrieved metadata for {dataset_id}")
         return metadata

-    except
-        #
+    except DatasetNotFoundError as e:
+        # Add helpful context to the error
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset not found suggestions: {error_response.get('suggestions', [])}")
         raise
+
+    except AuthenticationError as e:
+        # Add helpful context to the error
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        # Network errors after retries
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
+        raise
+
     except Exception as e:
-        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e


 def _format_bytes(size_bytes: int) -> str:

@@ -184,39 +211,4 @@ def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
     if features:
         summary_parts.append(f"Features: {len(features)} columns")

-    return " | ".join(summary_parts)
-
-
-def validate_dataset_metadata_inputs(dataset_id: str, config_name: Optional[str] = None) -> None:
-    """
-    Validate inputs for dataset metadata retrieval.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If inputs are invalid
-    """
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Basic format validation for dataset_id
-    if not all(c.isalnum() or c in '-_/.@' for c in dataset_id):
-        raise ValueError("dataset_id contains invalid characters")
-
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
-
-        # Basic format validation for config_name
-        if not all(c.isalnum() or c in '-_.' for c in config_name):
-            raise ValueError("config_name contains invalid characters")
+    return " | ".join(summary_parts)
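The centralized validators above come from the new src/hf_eda_mcp/validation.py, whose body is not shown in this view. A sketch of the fail-fast behavior they give the tools, assuming validate_dataset_id rejects characters outside the allowed set as the removed inline checks did:

from hf_eda_mcp.tools.metadata import get_dataset_metadata

# Invalid identifiers are rejected before any network call is made;
# ValidationError is converted to a plain ValueError at the tool boundary.
try:
    get_dataset_metadata("not a valid//id!!")
except ValueError as e:
    print(f"Rejected: {e}")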
src/hf_eda_mcp/tools/sampling.py
CHANGED
|
@@ -9,7 +9,17 @@ import logging
|
|
| 9 |
from typing import Optional, Dict, Any, List
|
| 10 |
from hf_eda_mcp.config import get_config
|
| 11 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 12 |
-
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
|
@@ -82,9 +92,24 @@ def get_dataset_sample(
|
|
| 82 |
... num_samples=3, config_name="cola")
|
| 83 |
>>> print(f"Schema: {sample['schema']}")
|
| 84 |
"""
|
| 85 |
-
# Input validation
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
logger.info(
|
| 89 |
f"Sampling {num_samples} rows from dataset: {dataset_id}, "
|
| 90 |
f"split: {split}" + (f", config: {config_name}" if config_name else "")
|
|
@@ -132,12 +157,28 @@ def get_dataset_sample(
|
|
| 132 |
)
|
| 133 |
return sample_data
|
| 134 |
|
| 135 |
-
except
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
raise
|
|
|
|
| 138 |
except Exception as e:
|
| 139 |
-
|
| 140 |
-
raise DatasetServiceError(f"Failed to sample dataset: {str(e)}")
|
| 141 |
|
| 142 |
|
| 143 |
def get_dataset_sample_with_indices(
|
|
@@ -165,20 +206,15 @@ def get_dataset_sample_with_indices(
|
|
| 165 |
ValueError: If inputs are invalid
|
| 166 |
DatasetServiceError: If sampling fails
|
| 167 |
"""
|
| 168 |
-
# Input validation
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
raise ValueError(
|
| 178 |
-
f"Too many indices requested. Maximum: {config.max_sample_size}"
|
| 179 |
-
)
|
| 180 |
-
|
| 181 |
-
validate_sampling_inputs(dataset_id, split, len(indices), config_name)
|
| 182 |
|
| 183 |
logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
|
| 184 |
|
|
@@ -229,63 +265,6 @@ def get_dataset_sample_with_indices(
        raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")


-def validate_sampling_inputs(
-    dataset_id: str, split: str, num_samples: int, config_name: Optional[str] = None
-) -> None:
-    """
-    Validate inputs for dataset sampling.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        split: Split name to validate
-        num_samples: Number of samples to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If any input is invalid
-    """
-    # Validate dataset_id
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Validate split
-    if not split or not isinstance(split, str):
-        raise ValueError("split must be a non-empty string")
-
-    split = split.strip().lower()
-    if not split:
-        raise ValueError("split cannot be empty or whitespace")
-
-    # Note: We don't strictly enforce VALID_SPLITS as datasets may have custom split names
-
-    # Validate num_samples
-    if not isinstance(num_samples, int):
-        raise ValueError("num_samples must be an integer")
-
-    if num_samples <= 0:
-        raise ValueError("num_samples must be positive")
-
-    # Get max sample size from config
-    config = get_config()
-    max_sample_size = config.max_sample_size
-
-    if num_samples > max_sample_size:
-        raise ValueError(f"num_samples cannot exceed {max_sample_size}")
-
-    # Validate config_name
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
-
-
def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
    """Generate a human-readable summary of the sample data."""
    summary_parts = []
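A short usage sketch of the fail-fast behavior these hunks add (the tool module path is assumed from the repository layout, and the dataset id is deliberately malformed):

```python
# Hypothetical sketch: malformed inputs are rejected locally by the
# centralized validators, before any HuggingFace Hub request is made.
from hf_eda_mcp.tools.sampling import get_dataset_sample

try:
    get_dataset_sample(dataset_id="/imdb", split="train", num_samples=10)
except ValueError as exc:
    # Raised from the ValidationError path; the text includes a
    # "Suggestions:" block assembled by format_validation_error.
    print(exc)
```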
src/hf_eda_mcp/validation.py
ADDED
@@ -0,0 +1,358 @@
+"""
+Input validation utilities for HF EDA MCP Server.
+
+This module provides centralized validation functions for all tool inputs,
+ensuring consistent error messages and validation logic across the application.
+"""
+
+import re
+from typing import Optional, List
+from hf_eda_mcp.config import get_config
+
+
+class ValidationError(ValueError):
+    """Custom exception for validation errors with helpful messages."""
+
+    def __init__(self, message: str, suggestions: Optional[List[str]] = None):
+        super().__init__(message)
+        self.suggestions = suggestions or []
+
+
+def validate_dataset_id(dataset_id: str) -> str:
+    """
+    Validate and normalize a HuggingFace dataset identifier.
+
+    Args:
+        dataset_id: Dataset identifier to validate
+
+    Returns:
+        Normalized dataset_id (stripped of whitespace)
+
+    Raises:
+        ValidationError: If dataset_id is invalid with helpful error message
+    """
+    if not dataset_id:
+        raise ValidationError(
+            "dataset_id is required and cannot be empty",
+            suggestions=[
+                "Provide a valid HuggingFace dataset identifier",
+                "Examples: 'imdb', 'squad', 'glue', 'username/dataset-name'",
+            ],
+        )
+
+    if not isinstance(dataset_id, str):
+        raise ValidationError(
+            f"dataset_id must be a string, got {type(dataset_id).__name__}",
+            suggestions=["Ensure dataset_id is passed as a string value"],
+        )
+
+    dataset_id = dataset_id.strip()
+
+    if not dataset_id:
+        raise ValidationError(
+            "dataset_id cannot be empty or contain only whitespace",
+            suggestions=["Provide a non-empty dataset identifier"],
+        )
+
+    # Validate format: alphanumeric, hyphens, underscores, slashes, dots, @
+    # Pattern: optional username/ followed by dataset name
+    pattern = r"^[a-zA-Z0-9][\w\-\.@]*(/[\w\-\.]+)?$"
+    if not re.match(pattern, dataset_id):
+        raise ValidationError(
+            f"Invalid dataset_id format: '{dataset_id}'",
+            suggestions=[
+                "Dataset IDs should contain only letters, numbers, hyphens, underscores, dots, and slashes",
+                "Valid formats: 'dataset-name' or 'username/dataset-name'",
+                "Examples: 'imdb', 'squad', 'huggingface/dataset-name'",
+            ],
+        )
+
+    # Check for common mistakes
+    if dataset_id.startswith("/") or dataset_id.endswith("/"):
+        raise ValidationError(
+            f"Invalid dataset_id: '{dataset_id}' - cannot start or end with '/'",
+            suggestions=["Remove leading or trailing slashes from the dataset_id"],
+        )
+
+    if "//" in dataset_id:
+        raise ValidationError(
+            f"Invalid dataset_id: '{dataset_id}' - contains consecutive slashes",
+            suggestions=["Use single slashes to separate username from dataset name"],
+        )
+
+    # Warn about very long dataset IDs (likely an error)
+    if len(dataset_id) > 100:
+        raise ValidationError(
+            f"dataset_id is unusually long ({len(dataset_id)} characters)",
+            suggestions=[
+                "Check if the dataset_id is correct",
+                "Dataset IDs are typically shorter than 100 characters",
+            ],
+        )
+
+    return dataset_id
+
+
+def validate_config_name(config_name: Optional[str]) -> Optional[str]:
+    """
+    Validate and normalize a dataset configuration name.
+
+    Args:
+        config_name: Configuration name to validate (can be None)
+
+    Returns:
+        Normalized config_name or None
+
+    Raises:
+        ValidationError: If config_name is invalid
+    """
+    if config_name is None:
+        return None
+
+    if not isinstance(config_name, str):
+        raise ValidationError(
+            f"config_name must be a string or None, got {type(config_name).__name__}",
+            suggestions=["Pass config_name as a string or omit it for default configuration"],
+        )
+
+    config_name = config_name.strip()
+
+    if not config_name:
+        raise ValidationError(
+            "config_name cannot be empty or contain only whitespace",
+            suggestions=[
+                "Provide a valid configuration name or omit the parameter",
+                "Use None or don't specify config_name for default configuration",
+            ],
+        )
+
+    # Validate format: alphanumeric, hyphens, underscores, dots
+    pattern = r"^[a-zA-Z0-9][\w\-\.]*$"
+    if not re.match(pattern, config_name):
+        raise ValidationError(
+            f"Invalid config_name format: '{config_name}'",
+            suggestions=[
+                "Configuration names should contain only letters, numbers, hyphens, underscores, and dots",
+                "Examples: 'cola', 'sst2', 'plain_text'",
+            ],
+        )
+
+    if len(config_name) > 50:
+        raise ValidationError(
+            f"config_name is unusually long ({len(config_name)} characters)",
+            suggestions=[
+                "Check if the config_name is correct",
+                "Configuration names are typically shorter than 50 characters",
+            ],
+        )
+
+    return config_name
+
+
+def validate_split_name(split: str) -> str:
+    """
+    Validate and normalize a dataset split name.
+
+    Args:
+        split: Split name to validate
+
+    Returns:
+        Normalized split name (lowercase, stripped)
+
+    Raises:
+        ValidationError: If split is invalid
+    """
+    if not split:
+        raise ValidationError(
+            "split is required and cannot be empty",
+            suggestions=[
+                "Provide a valid split name",
+                "Common splits: 'train', 'validation', 'test'",
+            ],
+        )
+
+    if not isinstance(split, str):
+        raise ValidationError(
+            f"split must be a string, got {type(split).__name__}",
+            suggestions=["Ensure split is passed as a string value"],
+        )
+
+    split = split.strip().lower()
+
+    if not split:
+        raise ValidationError(
+            "split cannot be empty or contain only whitespace",
+            suggestions=["Provide a non-empty split name"],
+        )
+
+    # Validate format: alphanumeric, hyphens, underscores
+    pattern = r"^[a-zA-Z0-9][\w\-]*$"
+    if not re.match(pattern, split):
+        raise ValidationError(
+            f"Invalid split name format: '{split}'",
+            suggestions=[
+                "Split names should contain only letters, numbers, hyphens, and underscores",
+                "Common splits: 'train', 'validation', 'test', 'dev'",
+            ],
+        )
+
+    # Note: We don't enforce a specific set of split names as datasets can have custom splits
+    # Common splits for reference
+    common_splits = {"train", "validation", "test", "dev", "val"}
+
+    if split not in common_splits and len(split) > 20:
+        raise ValidationError(
+            f"Unusual split name: '{split}' (length: {len(split)})",
+            suggestions=[
+                "Check if the split name is correct",
+                f"Common splits are: {', '.join(sorted(common_splits))}",
+                "Some datasets may have custom split names",
+            ],
+        )
+
+    return split
+
+
+def validate_sample_size(num_samples: int, parameter_name: str = "num_samples") -> int:
+    """
+    Validate sample size parameter.
+
+    Args:
+        num_samples: Number of samples to validate
+        parameter_name: Name of the parameter (for error messages)
+
+    Returns:
+        Validated num_samples
+
+    Raises:
+        ValidationError: If num_samples is invalid
+    """
+    if not isinstance(num_samples, int):
+        # Check if it's a float that's actually an integer
+        if isinstance(num_samples, float) and num_samples.is_integer():
+            num_samples = int(num_samples)
+        else:
+            raise ValidationError(
+                f"{parameter_name} must be an integer, got {type(num_samples).__name__}",
+                suggestions=[
+                    f"Provide {parameter_name} as an integer value",
+                    "Example: num_samples=100",
+                ],
+            )
+
+    if num_samples <= 0:
+        raise ValidationError(
+            f"{parameter_name} must be positive, got {num_samples}",
+            suggestions=[
+                f"Provide a positive integer for {parameter_name}",
+                "Example: num_samples=10 or num_samples=1000",
+            ],
+        )
+
+    # Get max sample size from config
+    config = get_config()
+    max_sample_size = config.max_sample_size
+
+    if num_samples > max_sample_size:
+        raise ValidationError(
+            f"{parameter_name} ({num_samples}) exceeds maximum allowed ({max_sample_size})",
+            suggestions=[
+                f"Reduce {parameter_name} to {max_sample_size} or less",
+                f"Current maximum is configured as {max_sample_size}",
+                "For larger samples, consider using streaming or batch processing",
+            ],
+        )
+
+    # Warn about very small samples (might not be useful)
+    if num_samples < 5:
+        # This is just a soft warning, not an error
+        pass
+
+    return num_samples
+
+
+def validate_indices(indices: List[int]) -> List[int]:
+    """
+    Validate a list of indices for sampling.
+
+    Args:
+        indices: List of indices to validate
+
+    Returns:
+        Validated indices list
+
+    Raises:
+        ValidationError: If indices are invalid
+    """
+    if not indices:
+        raise ValidationError(
+            "indices list is required and cannot be empty",
+            suggestions=[
+                "Provide a non-empty list of indices",
+                "Example: indices=[0, 1, 2, 10, 20]",
+            ],
+        )
+
+    if not isinstance(indices, list):
+        raise ValidationError(
+            f"indices must be a list, got {type(indices).__name__}",
+            suggestions=[
+                "Provide indices as a list of integers",
+                "Example: indices=[0, 1, 2]",
+            ],
+        )
+
+    # Validate each index
+    for i, idx in enumerate(indices):
+        if not isinstance(idx, int):
+            raise ValidationError(
+                f"All indices must be integers, got {type(idx).__name__} at position {i}",
+                suggestions=[
+                    "Ensure all indices are integer values",
+                    "Example: indices=[0, 1, 2] (not [0.5, 1.2])",
+                ],
+            )
+
+        if idx < 0:
+            raise ValidationError(
+                f"All indices must be non-negative, got {idx} at position {i}",
+                suggestions=[
+                    "Provide only non-negative indices (0 or greater)",
+                    "Example: indices=[0, 1, 2, 10]",
+                ],
+            )
+
+    # Check for reasonable list size
+    config = get_config()
+    max_sample_size = config.max_sample_size
+
+    if len(indices) > max_sample_size:
+        raise ValidationError(
+            f"Too many indices requested ({len(indices)}), maximum is {max_sample_size}",
+            suggestions=[
+                f"Reduce the number of indices to {max_sample_size} or less",
+                "Consider using regular sampling instead of specific indices",
+            ],
+        )
+
+    return indices
+
+
+def format_validation_error(error: ValidationError) -> str:
+    """
+    Format a validation error with suggestions into a user-friendly message.
+
+    Args:
+        error: ValidationError to format
+
+    Returns:
+        Formatted error message with suggestions
+    """
+    message = str(error)
+
+    if error.suggestions:
+        message += "\n\nSuggestions:"
+        for suggestion in error.suggestions:
+            message += f"\n  - {suggestion}"
+
+    return message
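To illustrate the message shape this module produces, a minimal sketch; the output lines are assembled exactly as `format_validation_error` above defines them:

```python
from hf_eda_mcp.validation import (
    ValidationError,
    format_validation_error,
    validate_sample_size,
)

try:
    validate_sample_size(-5)
except ValidationError as exc:
    print(format_validation_error(exc))

# Prints:
# num_samples must be positive, got -5
#
# Suggestions:
#   - Provide a positive integer for num_samples
#   - Example: num_samples=10 or num_samples=1000
```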