KhalilGuetari committed on
Commit 2a623ac · 1 Parent(s): ab96cfe

error message handling
.kiro/specs/hf-eda-mcp-server/tasks.md CHANGED
@@ -58,13 +58,13 @@
   - _Requirements: 4.1, 4.2, 4.4_
 
 - [ ] 5. Implement error handling and validation
-- [ ] 5.1 Add input validation for all tools
+- [x] 5.1 Add input validation for all tools
   - Validate dataset identifiers and configuration names
   - Check split names and sample size parameters
   - Provide helpful error messages for invalid inputs
   - _Requirements: 1.2, 2.1_
 
-- [ ] 5.2 Implement comprehensive error handling
+- [x] 5.2 Implement comprehensive error handling
   - Handle dataset not found errors with suggestions
   - Manage authentication errors for private datasets
   - Add retry logic for network and API failures
src/hf_eda_mcp/error_handling.py ADDED
@@ -0,0 +1,445 @@
+"""
+Comprehensive error handling utilities for hf-eda-mcp.
+
+This module provides error handling utilities including retry logic,
+error suggestions, and formatted error responses for better user experience.
+"""
+
+import logging
+import time
+import functools
+from typing import Optional, Callable, Any, List, Dict, TypeVar, cast
+from requests.exceptions import RequestException, ConnectionError, Timeout, HTTPError
+
+logger = logging.getLogger(__name__)
+
+# Type variable for generic function return types
+T = TypeVar('T')
+
+
+class RetryConfig:
+    """Configuration for retry logic."""
+
+    def __init__(
+        self,
+        max_attempts: int = 3,
+        initial_delay: float = 1.0,
+        max_delay: float = 30.0,
+        exponential_base: float = 2.0,
+        jitter: bool = True
+    ):
+        """
+        Initialize retry configuration.
+
+        Args:
+            max_attempts: Maximum number of retry attempts
+            initial_delay: Initial delay between retries in seconds
+            max_delay: Maximum delay between retries in seconds
+            exponential_base: Base for exponential backoff
+            jitter: Whether to add random jitter to delays
+        """
+        self.max_attempts = max_attempts
+        self.initial_delay = initial_delay
+        self.max_delay = max_delay
+        self.exponential_base = exponential_base
+        self.jitter = jitter
+
+
+# Default retry configuration
+DEFAULT_RETRY_CONFIG = RetryConfig(
+    max_attempts=3,
+    initial_delay=1.0,
+    max_delay=30.0,
+    exponential_base=2.0,
+    jitter=True
+)
+
+
+def calculate_retry_delay(attempt: int, config: RetryConfig) -> float:
+    """
+    Calculate delay for retry attempt using exponential backoff.
+
+    Args:
+        attempt: Current attempt number (0-indexed)
+        config: Retry configuration
+
+    Returns:
+        Delay in seconds
+    """
+    delay = min(
+        config.initial_delay * (config.exponential_base ** attempt),
+        config.max_delay
+    )
+
+    # Add jitter to prevent thundering herd
+    if config.jitter:
+        import random
+        delay = delay * (0.5 + random.random())
+
+    return delay
+
+
+def should_retry_error(error: Exception) -> bool:
+    """
+    Determine if an error should trigger a retry.
+
+    Args:
+        error: Exception to check
+
+    Returns:
+        True if error is retryable, False otherwise
+    """
+    # Network errors are retryable
+    if isinstance(error, (ConnectionError, Timeout)):
+        return True
+
+    # HTTP errors with specific status codes are retryable
+    if isinstance(error, HTTPError):
+        # Retry on 5xx server errors and 429 rate limiting
+        if hasattr(error, 'response') and error.response is not None:
+            status_code = error.response.status_code
+            return status_code >= 500 or status_code == 429
+
+    # Generic request exceptions might be retryable
+    if isinstance(error, RequestException):
+        # Check if it's a connection-related issue
+        error_str = str(error).lower()
+        retryable_keywords = ['timeout', 'connection', 'network', 'temporary']
+        return any(keyword in error_str for keyword in retryable_keywords)
+
+    # Don't retry other errors by default
+    return False
+
+
+def retry_with_backoff(
+    func: Optional[Callable[..., T]] = None,
+    *,
+    config: Optional[RetryConfig] = None,
+    retryable_exceptions: Optional[tuple] = None
+) -> Callable[..., T]:
+    """
+    Decorator to retry a function with exponential backoff.
+
+    Args:
+        func: Function to decorate (when used without arguments)
+        config: Retry configuration (uses default if not provided)
+        retryable_exceptions: Tuple of exception types to retry on
+
+    Returns:
+        Decorated function with retry logic
+
+    Example:
+        @retry_with_backoff
+        def fetch_data():
+            # ... network call ...
+            pass
+
+        @retry_with_backoff(config=RetryConfig(max_attempts=5))
+        def fetch_with_custom_config():
+            # ... network call ...
+            pass
+    """
+    if config is None:
+        config = DEFAULT_RETRY_CONFIG
+
+    if retryable_exceptions is None:
+        retryable_exceptions = (ConnectionError, Timeout, RequestException)
+
+    def decorator(f: Callable[..., T]) -> Callable[..., T]:
+        @functools.wraps(f)
+        def wrapper(*args: Any, **kwargs: Any) -> T:
+            last_exception: Optional[Exception] = None
+
+            for attempt in range(config.max_attempts):
+                try:
+                    return f(*args, **kwargs)
+
+                except retryable_exceptions as e:
+                    last_exception = e
+
+                    # Check if we should retry this specific error
+                    if not should_retry_error(e):
+                        logger.warning(f"Error is not retryable: {e}")
+                        raise
+
+                    # Don't sleep after the last attempt
+                    if attempt < config.max_attempts - 1:
+                        delay = calculate_retry_delay(attempt, config)
+                        logger.warning(
+                            f"Attempt {attempt + 1}/{config.max_attempts} failed: {e}. "
+                            f"Retrying in {delay:.2f}s..."
+                        )
+                        time.sleep(delay)
+                    else:
+                        logger.error(
+                            f"All {config.max_attempts} attempts failed. Last error: {e}"
+                        )
+
+                except Exception as e:
+                    # Non-retryable exception, raise immediately
+                    logger.error(f"Non-retryable error occurred: {e}")
+                    raise
+
+            # If we get here, all retries failed
+            if last_exception:
+                raise last_exception
+            else:
+                raise RuntimeError("Retry logic failed without capturing exception")
+
+        return cast(Callable[..., T], wrapper)
+
+    # Support both @retry_with_backoff and @retry_with_backoff()
+    if func is None:
+        return decorator
+    else:
+        return decorator(func)
+
+
+def get_dataset_suggestions(dataset_id: str) -> List[str]:
+    """
+    Generate helpful suggestions for dataset not found errors.
+
+    Args:
+        dataset_id: The dataset identifier that was not found
+
+    Returns:
+        List of suggestion strings
+    """
+    suggestions = []
+
+    # Check for common typos or formatting issues
+    if " " in dataset_id:
+        suggestions.append(
+            f"Dataset ID contains spaces. Try: '{dataset_id.replace(' ', '-')}' or '{dataset_id.replace(' ', '_')}'"
+        )
+
+    if dataset_id.isupper():
+        suggestions.append(
+            f"Dataset ID is all uppercase. Try lowercase: '{dataset_id.lower()}'"
+        )
+
+    # Check if it looks like it might be missing organization prefix
+    if "/" not in dataset_id:
+        suggestions.append(
+            f"Dataset might need an organization prefix. Try searching for: 'organization/{dataset_id}'"
+        )
+
+    # General suggestions
+    suggestions.extend([
+        "Verify the dataset exists on HuggingFace Hub: https://huggingface.co/datasets",
+        f"Search for similar datasets: https://huggingface.co/datasets?search={dataset_id}",
+        "Check if the dataset name is spelled correctly",
+        "Ensure you have access if the dataset is private or gated"
+    ])
+
+    return suggestions
+
+
+def format_authentication_error(
+    dataset_id: str,
+    is_gated: bool = False,
+    has_token: bool = False
+) -> Dict[str, Any]:
+    """
+    Format authentication error with helpful guidance.
+
+    Args:
+        dataset_id: The dataset identifier
+        is_gated: Whether the dataset is gated (requires approval)
+        has_token: Whether a token was provided
+
+    Returns:
+        Dictionary with error details and suggestions
+    """
+    error_details = {
+        "error_type": "authentication_error",
+        "dataset_id": dataset_id,
+        "is_gated": is_gated,
+        "has_token": has_token,
+        "message": "",
+        "suggestions": []
+    }
+
+    if is_gated:
+        error_details["message"] = (
+            f"Dataset '{dataset_id}' is gated and requires approval to access."
+        )
+        error_details["suggestions"] = [
+            f"Request access to the dataset: https://huggingface.co/datasets/{dataset_id}",
+            "Wait for approval from the dataset owner",
+            "Provide a valid HuggingFace token after receiving access",
+            "Check your HuggingFace account for access status"
+        ]
+    elif not has_token:
+        error_details["message"] = (
+            f"Dataset '{dataset_id}' is private and requires authentication."
+        )
+        error_details["suggestions"] = [
+            "Provide a HuggingFace authentication token",
+            "Create a token at: https://huggingface.co/settings/tokens",
+            "Set the token in your environment: HF_TOKEN=your_token",
+            "Ensure the token has read access to datasets"
+        ]
+    else:
+        error_details["message"] = (
+            f"Authentication failed for dataset '{dataset_id}'. "
+            "Your token may not have access to this dataset."
+        )
+        error_details["suggestions"] = [
+            "Verify your token is valid and not expired",
+            "Check if your token has the required permissions",
+            "Ensure you have been granted access to this private dataset",
+            "Try regenerating your token at: https://huggingface.co/settings/tokens"
+        ]
+
+    return error_details
+
+
+def format_network_error(
+    error: Exception,
+    operation: str = "operation"
+) -> Dict[str, Any]:
+    """
+    Format network error with helpful guidance.
+
+    Args:
+        error: The network exception
+        operation: Description of the operation that failed
+
+    Returns:
+        Dictionary with error details and suggestions
+    """
+    error_details = {
+        "error_type": "network_error",
+        "operation": operation,
+        "message": f"Network error during {operation}: {str(error)}",
+        "suggestions": []
+    }
+
+    # Determine specific error type and provide targeted suggestions
+    if isinstance(error, Timeout):
+        error_details["error_subtype"] = "timeout"
+        error_details["suggestions"] = [
+            "The request timed out. Try again in a moment",
+            "Check your internet connection",
+            "The HuggingFace Hub might be experiencing high load",
+            "Try with a smaller sample size or different dataset"
+        ]
+    elif isinstance(error, ConnectionError):
+        error_details["error_subtype"] = "connection"
+        error_details["suggestions"] = [
+            "Unable to connect to HuggingFace Hub",
+            "Check your internet connection",
+            "Verify you can access https://huggingface.co",
+            "Check if you're behind a firewall or proxy",
+            "Try again in a few moments"
+        ]
+    else:
+        error_details["error_subtype"] = "general"
+        error_details["suggestions"] = [
+            "A network error occurred. Please try again",
+            "Check your internet connection",
+            "The HuggingFace Hub might be temporarily unavailable",
+            "Try again in a few moments"
+        ]
+
+    return error_details
+
+
+def format_error_response(
+    error: Exception,
+    context: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any]:
+    """
+    Format any error into a structured response with helpful information.
+
+    Args:
+        error: The exception to format
+        context: Optional context information (dataset_id, operation, etc.)
+
+    Returns:
+        Dictionary with formatted error information
+    """
+    from hf_eda_mcp.integrations.hf_client import (
+        DatasetNotFoundError,
+        AuthenticationError,
+        NetworkError
+    )
+
+    context = context or {}
+
+    # Handle specific error types
+    if isinstance(error, DatasetNotFoundError):
+        dataset_id = context.get("dataset_id", "unknown")
+        return {
+            "error_type": "dataset_not_found",
+            "message": str(error),
+            "dataset_id": dataset_id,
+            "suggestions": get_dataset_suggestions(dataset_id)
+        }
+
+    elif isinstance(error, AuthenticationError):
+        dataset_id = context.get("dataset_id", "unknown")
+        is_gated = "gated" in str(error).lower()
+        has_token = context.get("has_token", False)
+        return format_authentication_error(dataset_id, is_gated, has_token)
+
+    elif isinstance(error, NetworkError):
+        operation = context.get("operation", "operation")
+        # Extract the original exception if available
+        original_error = error.__cause__ or error
+        return format_network_error(original_error, operation)
+
+    elif isinstance(error, (ConnectionError, Timeout, RequestException)):
+        operation = context.get("operation", "operation")
+        return format_network_error(error, operation)
+
+    elif isinstance(error, ValueError):
+        return {
+            "error_type": "validation_error",
+            "message": str(error),
+            "suggestions": [
+                "Check that all input parameters are valid",
+                "Refer to the tool documentation for parameter requirements"
+            ]
+        }
+
+    else:
+        # Generic error
+        return {
+            "error_type": "unknown_error",
+            "message": f"An unexpected error occurred: {str(error)}",
+            "error_class": type(error).__name__,
+            "suggestions": [
+                "Try the operation again",
+                "Check the logs for more details",
+                "If the problem persists, report it as an issue"
+            ]
+        }
+
+
+def log_error_with_context(
+    error: Exception,
+    context: Optional[Dict[str, Any]] = None,
+    level: int = logging.ERROR
+) -> None:
+    """
+    Log an error with contextual information.
+
+    Args:
+        error: The exception to log
+        context: Optional context information
+        level: Logging level (default: ERROR)
+    """
+    context = context or {}
+
+    # Build context string
+    context_parts = [f"{k}={v}" for k, v in context.items()]
+    context_str = ", ".join(context_parts) if context_parts else "no context"
+
+    # Log with full details
+    logger.log(
+        level,
+        f"Error occurred: {type(error).__name__}: {str(error)} | Context: {context_str}",
+        exc_info=True
+    )
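Taken together, the helpers in this new module compose as follows. A minimal usage sketch — `flaky_fetch` and the endpoint are illustrative only; everything imported is defined in the file above:

```python
# Hypothetical usage sketch for the helpers added in error_handling.py.
import requests

from hf_eda_mcp.error_handling import (
    RetryConfig,
    retry_with_backoff,
    format_error_response,
    log_error_with_context,
)


@retry_with_backoff(config=RetryConfig(max_attempts=5, initial_delay=0.5))
def flaky_fetch(url: str) -> dict:
    # raise_for_status() raises requests.exceptions.HTTPError on 4xx/5xx;
    # should_retry_error() retries only 5xx and 429, so a 404 fails fast.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()


try:
    info = flaky_fetch("https://huggingface.co/api/datasets/imdb")
except Exception as exc:
    context = {"dataset_id": "imdb", "operation": "fetch_dataset_info"}
    log_error_with_context(exc, context)
    # Structured payload with "error_type", "message", and "suggestions".
    print(format_error_response(exc, context))
```

Note that `should_retry_error()` gates the decorator's retry loop: a 404 raised by `raise_for_status()` is re-raised immediately, while a 503 is retried with jittered exponential backoff capped at `max_delay`.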
src/hf_eda_mcp/integrations/hf_client.py CHANGED
@@ -11,6 +11,14 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
 from requests.exceptions import RequestException, ConnectionError, Timeout
 
+from hf_eda_mcp.error_handling import (
+    retry_with_backoff,
+    RetryConfig,
+    format_error_response,
+    log_error_with_context,
+    get_dataset_suggestions
+)
+
 logger = logging.getLogger()
 
 
@@ -80,11 +88,15 @@ class HfClient:
                 f"Failed to authenticate with HuggingFace Hub: {str(e)}"
             )
 
+    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
     def get_dataset_info(
         self, dataset_id: str, config_name: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         Retrieve comprehensive dataset information from HuggingFace Hub.
+
+        This method includes automatic retry logic with exponential backoff
+        for transient network errors.
 
         Args:
             dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
@@ -102,6 +114,8 @@ class HfClient:
             AuthenticationError: If dataset is private and authentication fails
             NetworkError: If network request fails
         """
+        context = {"dataset_id": dataset_id, "config_name": config_name, "operation": "get_dataset_info"}
+
         try:
             # Get dataset info from HuggingFace Hub
             dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")
@@ -188,24 +202,60 @@ class HfClient:
 
             return metadata
 
-        except RepositoryNotFoundError:
-            raise DatasetNotFoundError(
-                f"Dataset '{dataset_id}' not found on HuggingFace Hub"
-            )
-        except GatedRepoError:
-            raise AuthenticationError(
-                f"Dataset '{dataset_id}' is private or gated. "
-                "Please provide a valid authentication token or request access."
-            )
+        except RepositoryNotFoundError as e:
+            log_error_with_context(e, context, level=logging.WARNING)
+            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
+            suggestions = get_dataset_suggestions(dataset_id)
+            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
+            raise DatasetNotFoundError(error_msg)
+
+        except GatedRepoError as e:
+            log_error_with_context(e, context, level=logging.WARNING)
+            is_gated = True
+            has_token = self.token is not None
+
+            if is_gated:
+                error_msg = (
+                    f"Dataset '{dataset_id}' is gated and requires approval. "
+                    f"Request access at: https://huggingface.co/datasets/{dataset_id}"
+                )
+            else:
+                error_msg = (
+                    f"Dataset '{dataset_id}' is private. "
+                    "Please provide a valid authentication token."
+                )
+
+            logger.info(f"Authentication required for '{dataset_id}': has_token={has_token}, is_gated={is_gated}")
+            raise AuthenticationError(error_msg)
+
         except (ConnectionError, Timeout) as e:
-            raise NetworkError(f"Network error while fetching dataset info: {str(e)}")
+            log_error_with_context(e, context)
+            # Let retry decorator handle these - if we get here, all retries failed
+            raise NetworkError(
+                f"Network error while fetching dataset info after retries: {str(e)}"
+            ) from e
+
         except RequestException as e:
-            raise NetworkError(f"Request failed: {str(e)}")
+            log_error_with_context(e, context)
+            # Check if it's a retryable error
+            if hasattr(e, 'response') and e.response is not None:
+                status_code = e.response.status_code
+                if status_code == 429:
+                    raise NetworkError(
+                        "Rate limit exceeded. Please try again later."
+                    ) from e
+                elif status_code >= 500:
+                    raise NetworkError(
+                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
+                    ) from e
+            raise NetworkError(f"Request failed: {str(e)}") from e
+
         except Exception as e:
+            log_error_with_context(e, context)
            logger.error(
                 f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
             )
-            raise HfClientError(f"Failed to get dataset info: {str(e)}")
+            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e
 
     def list_dataset_configs(self, dataset_id: str) -> List[str]:
         """
src/hf_eda_mcp/services/dataset_service.py CHANGED
@@ -14,7 +14,19 @@ from pathlib import Path
 from datasets import load_dataset
 from datasets.utils.logging import disable_progress_bar
 
-from hf_eda_mcp.integrations.hf_client import HfClient, HfClientError, DatasetNotFoundError
+from hf_eda_mcp.integrations.hf_client import (
+    HfClient,
+    HfClientError,
+    DatasetNotFoundError,
+    AuthenticationError,
+    NetworkError
+)
+from hf_eda_mcp.error_handling import (
+    retry_with_backoff,
+    RetryConfig,
+    log_error_with_context,
+    format_error_response
+)
 
 logger = logging.getLogger(__name__)
 
@@ -128,6 +140,9 @@
         """
         Load dataset information from HuggingFace Hub with caching.
 
+        Includes automatic retry logic for transient failures and comprehensive
+        error handling with helpful suggestions.
+
         Args:
             dataset_id: HuggingFace dataset identifier
             config_name: Optional configuration name
@@ -138,7 +153,14 @@
         Raises:
             DatasetNotFoundError: If dataset doesn't exist
             AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network operations fail after retries
         """
+        context = {
+            "dataset_id": dataset_id,
+            "config_name": config_name,
+            "operation": "load_dataset_info"
+        }
+
         cache_key = self._get_cache_key(dataset_id, config_name)
         cache_file = self.metadata_cache_dir / f"{cache_key}.json"
 
@@ -148,7 +170,7 @@
             logger.debug(f"Using cached metadata for {dataset_id}")
             return cached_data
 
-        # Fetch from HuggingFace Hub
+        # Fetch from HuggingFace Hub with retry logic
        try:
             logger.info(f"Fetching metadata for dataset: {dataset_id}")
             metadata = self.hf_client.get_dataset_info(dataset_id, config_name)
@@ -156,15 +178,29 @@
             # Add cache timestamp
             metadata['_cached_at'] = time.time()
 
-            # Save to cache
-            self._save_to_cache(cache_file, metadata)
+            # Save to cache (don't fail if caching fails)
+            try:
+                self._save_to_cache(cache_file, metadata)
+            except CacheError as e:
+                logger.warning(f"Failed to cache metadata, continuing anyway: {e}")
 
             return metadata
 
-        except HfClientError:
-            # Re-raise HfClient errors as-is
+        except (DatasetNotFoundError, AuthenticationError, NetworkError):
+            # Re-raise these specific errors with context
+            log_error_with_context(
+                Exception(f"Failed to load dataset info for {dataset_id}"),
+                context,
+                level=logging.WARNING
+            )
             raise
+
+        except Exception as e:
+            # Unexpected error
+            log_error_with_context(e, context)
+            raise DatasetServiceError(f"Unexpected error loading dataset info: {str(e)}") from e
 
+    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
     def load_dataset_sample(
         self,
         dataset_id: str,
@@ -176,6 +212,9 @@
         """
         Load samples from the specified dataset with caching.
 
+        Includes automatic retry logic for transient failures and comprehensive
+        error handling.
+
         Args:
             dataset_id: HuggingFace dataset identifier
             split: Dataset split to sample from
@@ -188,8 +227,18 @@
 
         Raises:
             DatasetNotFoundError: If dataset or split doesn't exist
-            DatasetServiceError: If sampling fails
+            AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network operations fail after retries
+            DatasetServiceError: If sampling fails for other reasons
         """
+        context = {
+            "dataset_id": dataset_id,
+            "split": split,
+            "num_samples": num_samples,
+            "config_name": config_name,
+            "operation": "load_dataset_sample"
+        }
+
         # For small samples, check cache first
         if num_samples <= 100:  # Only cache small samples
             cache_key = self._get_sample_cache_key(dataset_id, split, num_samples, config_name)
@@ -208,7 +257,8 @@
                 dataset_id,
                 name=config_name,
                 split=split,
-                streaming=streaming
+                streaming=streaming,
+                token=self.hf_client.token
             )
 
             # Take the requested number of samples
@@ -240,21 +290,65 @@
                 '_sampled_at': time.time()
             }
 
-            # Cache small samples
+            # Cache small samples (don't fail if caching fails)
             if num_samples <= 100:
                 try:
                     self._save_to_cache(cache_file, sample_data)
-                except CacheError:
-                    # Don't fail if caching fails
-                    pass
+                except CacheError as e:
+                    logger.warning(f"Failed to cache sample, continuing anyway: {e}")
 
             return sample_data
 
+        except DatasetNotFoundError:
+            # Re-raise as-is
+            log_error_with_context(
+                Exception(f"Dataset or split not found: {dataset_id}/{split}"),
+                context,
+                level=logging.WARNING
+            )
+            raise
+
+        except AuthenticationError:
+            # Re-raise as-is
+            log_error_with_context(
+                Exception(f"Authentication failed for dataset: {dataset_id}"),
+                context,
+                level=logging.WARNING
+            )
+            raise
+
         except Exception as e:
-            logger.error(f"Failed to load dataset sample: {e}")
-            if "not found" in str(e).lower():
-                raise DatasetNotFoundError(f"Dataset '{dataset_id}' or split '{split}' not found")
-            raise DatasetServiceError(f"Failed to load dataset sample: {e}")
+            log_error_with_context(e, context)
+
+            # Try to provide more specific error messages
+            error_str = str(e).lower()
+
+            if "not found" in error_str or "doesn't exist" in error_str:
+                if "split" in error_str or split in error_str:
+                    raise DatasetNotFoundError(
+                        f"Split '{split}' not found in dataset '{dataset_id}'. "
+                        f"Available splits may be different."
+                    ) from e
+                else:
+                    raise DatasetNotFoundError(
+                        f"Dataset '{dataset_id}' not found on HuggingFace Hub."
+                    ) from e
+
+            elif "gated" in error_str or "private" in error_str or "authentication" in error_str:
+                raise AuthenticationError(
+                    f"Authentication required for dataset '{dataset_id}'. "
+                    "Please provide a valid HuggingFace token."
+                ) from e
+
+            elif "timeout" in error_str or "connection" in error_str:
+                raise NetworkError(
+                    f"Network error while loading dataset sample: {str(e)}"
+                ) from e
+
+            else:
+                raise DatasetServiceError(
+                    f"Failed to load dataset sample: {str(e)}"
+                ) from e
 
     def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
         """
src/hf_eda_mcp/tools/__init__.py CHANGED
@@ -4,14 +4,17 @@ EDA tools module for HuggingFace datasets.
 This package contains individual EDA functions that will be exposed as MCP tools.
 """
 
-from .metadata import get_dataset_metadata, validate_dataset_metadata_inputs
-from .sampling import get_dataset_sample, get_dataset_sample_with_indices, get_available_splits
-from .analysis import analyze_dataset_features, validate_analysis_inputs
+from .metadata import get_dataset_metadata
+from .sampling import (
+    get_dataset_sample,
+    get_dataset_sample_with_indices,
+    get_available_splits,
+)
+from .analysis import analyze_dataset_features
 
 __all__ = [
     # Metadata tools
     'get_dataset_metadata',
-    'validate_dataset_metadata_inputs',
 
     # Sampling tools
     'get_dataset_sample',
@@ -20,5 +23,4 @@ __all__ = [
 
     # Analysis tools
     'analyze_dataset_features',
-    'validate_analysis_inputs'
 ]
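After this change, the per-tool `validate_*_inputs` helpers are gone from the package's public surface; per the imports above, callers pull in only the tools themselves:

```python
# Public surface of hf_eda_mcp.tools after this commit.
from hf_eda_mcp.tools import (
    get_dataset_metadata,
    get_dataset_sample,
    get_dataset_sample_with_indices,
    get_available_splits,
    analyze_dataset_features,
)
```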
src/hf_eda_mcp/tools/analysis.py CHANGED
@@ -11,7 +11,16 @@ from typing import Optional, Dict, Any, List
 from collections import Counter
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    validate_split_name,
+    validate_sample_size,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
 
 logger = logging.getLogger(__name__)
 
@@ -84,9 +93,24 @@ def analyze_dataset_features(
     >>> quality = analysis['data_quality']
     >>> print(f"Overall quality score: {quality['quality_score']:.2f}")
     """
-    # Input validation
-    validate_analysis_inputs(dataset_id, split, sample_size, config_name)
-
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+        split = validate_split_name(split)
+        sample_size = validate_sample_size(sample_size, "sample_size")
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "split": split,
+        "sample_size": sample_size,
+        "config_name": config_name,
+        "operation": "analyze_dataset_features"
+    }
+
     logger.info(
         f"Analyzing features for dataset: {dataset_id}, split: {split}, "
         f"sample_size: {sample_size}"
@@ -153,12 +177,28 @@
         )
         return analysis_result
 
-    except (DatasetNotFoundError, AuthenticationError):
-        # Re-raise these specific errors as-is
+    except DatasetNotFoundError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset/split not found suggestions: {error_response.get('suggestions', [])}")
+        raise
+
+    except AuthenticationError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
         raise
+
     except Exception as e:
-        logger.error(f"Failed to analyze dataset {dataset_id}: {str(e)}")
-        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e
 
 
 def _analyze_single_feature(
@@ -495,58 +535,3 @@ def _generate_analysis_summary(
         summary_parts.append(f"Avg missing: {avg_missing:.1f}%")
 
     return " | ".join(summary_parts)
-
-
-def validate_analysis_inputs(
-    dataset_id: str, split: str, sample_size: int, config_name: Optional[str] = None
-) -> None:
-    """
-    Validate inputs for dataset analysis.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        split: Split name to validate
-        sample_size: Sample size to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If any input is invalid
-    """
-    # Validate dataset_id
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Validate split
-    if not split or not isinstance(split, str):
-        raise ValueError("split must be a non-empty string")
-
-    split = split.strip()
-    if not split:
-        raise ValueError("split cannot be empty or whitespace")
-
-    # Validate sample_size
-    if not isinstance(sample_size, int):
-        raise ValueError("sample_size must be an integer")
-
-    if sample_size <= 0:
-        raise ValueError("sample_size must be positive")
-
-    # Get max sample size from config
-    config = get_config()
-    max_sample_size = config.max_sample_size
-
-    if sample_size > max_sample_size:
-        raise ValueError(f"sample_size cannot exceed {max_sample_size}")
-
-    # Validate config_name
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
src/hf_eda_mcp/tools/metadata.py CHANGED
@@ -9,7 +9,14 @@ import logging
 from typing import Optional, Dict, Any
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
 
 logger = logging.getLogger(__name__)
 
@@ -72,18 +79,19 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     >>> metadata = get_dataset_metadata("glue", config_name="cola")
     >>> print(f"Config: {metadata.get('config_name', 'default')}")
     """
-    # Input validation
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    if config_name is not None:
-        config_name = config_name.strip()
-        if not config_name:
-            config_name = None
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "config_name": config_name,
+        "operation": "get_dataset_metadata"
+    }
 
     logger.info(f"Retrieving metadata for dataset: {dataset_id}" +
                 (f", config: {config_name}" if config_name else ""))
@@ -115,12 +123,31 @@
         logger.info(f"Successfully retrieved metadata for {dataset_id}")
         return metadata
 
-    except (DatasetNotFoundError, AuthenticationError):
-        # Re-raise these specific errors as-is
+    except DatasetNotFoundError as e:
+        # Add helpful context to the error
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset not found suggestions: {error_response.get('suggestions', [])}")
         raise
+
+    except AuthenticationError as e:
+        # Add helpful context to the error
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        # Network errors after retries
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
+        raise
+
     except Exception as e:
-        logger.error(f"Failed to retrieve metadata for {dataset_id}: {str(e)}")
-        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e
 
 
 def _format_bytes(size_bytes: int) -> str:
@@ -184,39 +211,4 @@ def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
     if features:
         summary_parts.append(f"Features: {len(features)} columns")
 
-    return " | ".join(summary_parts)
-
-
-def validate_dataset_metadata_inputs(dataset_id: str, config_name: Optional[str] = None) -> None:
-    """
-    Validate inputs for dataset metadata retrieval.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If inputs are invalid
-    """
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Basic format validation for dataset_id
-    if not all(c.isalnum() or c in '-_/.@' for c in dataset_id):
-        raise ValueError("dataset_id contains invalid characters")
-
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
-
-        # Basic format validation for config_name
-        if not all(c.isalnum() or c in '-_.' for c in config_name):
-            raise ValueError("config_name contains invalid characters")
+    return " | ".join(summary_parts)
src/hf_eda_mcp/tools/sampling.py CHANGED
@@ -9,7 +9,17 @@ import logging
 from typing import Optional, Dict, Any, List
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    validate_split_name,
+    validate_sample_size,
+    validate_indices,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
 
 logger = logging.getLogger(__name__)
 
@@ -82,9 +92,24 @@ def get_dataset_sample(
     ...     num_samples=3, config_name="cola")
     >>> print(f"Schema: {sample['schema']}")
     """
-    # Input validation
-    validate_sampling_inputs(dataset_id, split, num_samples, config_name)
-
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+        split = validate_split_name(split)
+        num_samples = validate_sample_size(num_samples, "num_samples")
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "split": split,
+        "num_samples": num_samples,
+        "config_name": config_name,
+        "operation": "get_dataset_sample"
+    }
+
     logger.info(
         f"Sampling {num_samples} rows from dataset: {dataset_id}, "
         f"split: {split}" + (f", config: {config_name}" if config_name else "")
@@ -132,12 +157,28 @@
         )
         return sample_data
 
-    except (DatasetNotFoundError, AuthenticationError):
-        # Re-raise these specific errors as-is
+    except DatasetNotFoundError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset/split not found suggestions: {error_response.get('suggestions', [])}")
+        raise
+
+    except AuthenticationError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
        raise
+
     except Exception as e:
-        logger.error(f"Failed to sample from dataset {dataset_id}: {str(e)}")
-        raise DatasetServiceError(f"Failed to sample dataset: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
 
 
 def get_dataset_sample_with_indices(
@@ -165,20 +206,15 @@
         ValueError: If inputs are invalid
         DatasetServiceError: If sampling fails
     """
-    # Input validation
-    if not indices or not isinstance(indices, list):
-        raise ValueError("indices must be a non-empty list")
-
-    if not all(isinstance(i, int) and i >= 0 for i in indices):
-        raise ValueError("All indices must be non-negative integers")
-
-    config = get_config()
-    if len(indices) > config.max_sample_size:
-        raise ValueError(
-            f"Too many indices requested. Maximum: {config.max_sample_size}"
-        )
-
-    validate_sampling_inputs(dataset_id, split, len(indices), config_name)
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+        split = validate_split_name(split)
+        indices = validate_indices(indices)
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
 
     logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
 
@@ -229,63 +265,6 @@
         raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
 
 
-def validate_sampling_inputs(
-    dataset_id: str, split: str, num_samples: int, config_name: Optional[str] = None
-) -> None:
-    """
-    Validate inputs for dataset sampling.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        split: Split name to validate
-        num_samples: Number of samples to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If any input is invalid
-    """
-    # Validate dataset_id
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Validate split
-    if not split or not isinstance(split, str):
-        raise ValueError("split must be a non-empty string")
-
-    split = split.strip().lower()
-    if not split:
-        raise ValueError("split cannot be empty or whitespace")
-
-    # Note: We don't strictly enforce VALID_SPLITS as datasets may have custom split names
-
-    # Validate num_samples
-    if not isinstance(num_samples, int):
-        raise ValueError("num_samples must be an integer")
-
-    if num_samples <= 0:
-        raise ValueError("num_samples must be positive")
-
-    # Get max sample size from config
-    config = get_config()
-    max_sample_size = config.max_sample_size
-
-    if num_samples > max_sample_size:
-        raise ValueError(f"num_samples cannot exceed {max_sample_size}")
-
-    # Validate config_name
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
-
-
 def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
     """Generate a human-readable summary of the sample data."""
     summary_parts = []
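The by-index variant goes through the same validators plus `validate_indices` (defined in validation.py, beyond the portion shown below). A hypothetical call, with argument names inferred from the validation calls above:

```python
# Hypothetical usage of index-based sampling after this change.
from hf_eda_mcp.tools.sampling import get_dataset_sample_with_indices

rows = get_dataset_sample_with_indices(
    "imdb",
    split="train",
    indices=[0, 5, 42],  # checked by validate_indices()
)
```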
src/hf_eda_mcp/validation.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Input validation utilities for HF EDA MCP Server.
3
+
4
+ This module provides centralized validation functions for all tool inputs,
5
+ ensuring consistent error messages and validation logic across the application.
6
+ """
7
+
8
+ import re
9
+ from typing import Optional, List
10
+ from hf_eda_mcp.config import get_config
11
+
12
+
13
+ class ValidationError(ValueError):
14
+ """Custom exception for validation errors with helpful messages."""
15
+
16
+ def __init__(self, message: str, suggestions: Optional[List[str]] = None):
17
+ super().__init__(message)
18
+ self.suggestions = suggestions or []
19
+
20
+
21
+ def validate_dataset_id(dataset_id: str) -> str:
22
+ """
23
+ Validate and normalize a HuggingFace dataset identifier.
24
+
25
+ Args:
26
+ dataset_id: Dataset identifier to validate
27
+
28
+ Returns:
29
+ Normalized dataset_id (stripped of whitespace)
30
+
31
+ Raises:
32
+ ValidationError: If dataset_id is invalid with helpful error message
33
+ """
34
+ if not dataset_id:
35
+ raise ValidationError(
36
+ "dataset_id is required and cannot be empty",
37
+ suggestions=[
38
+ "Provide a valid HuggingFace dataset identifier",
39
+ "Examples: 'imdb', 'squad', 'glue', 'username/dataset-name'",
40
+ ],
41
+ )
42
+
43
+ if not isinstance(dataset_id, str):
44
+ raise ValidationError(
45
+ f"dataset_id must be a string, got {type(dataset_id).__name__}",
46
+ suggestions=["Ensure dataset_id is passed as a string value"],
47
+ )
48
+
49
+ dataset_id = dataset_id.strip()
50
+
51
+ if not dataset_id:
52
+ raise ValidationError(
53
+ "dataset_id cannot be empty or contain only whitespace",
54
+ suggestions=["Provide a non-empty dataset identifier"],
55
+ )
56
+
57
+ # Validate format: alphanumeric, hyphens, underscores, slashes, dots, @
58
+ # Pattern: optional username/ followed by dataset name
59
+ pattern = r"^[a-zA-Z0-9][\w\-\.@]*(/[\w\-\.]+)?$"
60
+ if not re.match(pattern, dataset_id):
61
+ raise ValidationError(
62
+ f"Invalid dataset_id format: '{dataset_id}'",
63
+ suggestions=[
64
+ "Dataset IDs should contain only letters, numbers, hyphens, underscores, dots, and slashes",
65
+ "Valid formats: 'dataset-name' or 'username/dataset-name'",
66
+ "Examples: 'imdb', 'squad', 'huggingface/dataset-name'",
67
+ ],
68
+ )
69
+
70
+ # Check for common mistakes
71
+ if dataset_id.startswith("/") or dataset_id.endswith("/"):
72
+ raise ValidationError(
73
+ f"Invalid dataset_id: '{dataset_id}' - cannot start or end with '/'",
74
+ suggestions=["Remove leading or trailing slashes from the dataset_id"],
75
+ )
76
+
77
+ if "//" in dataset_id:
78
+ raise ValidationError(
79
+ f"Invalid dataset_id: '{dataset_id}' - contains consecutive slashes",
80
+ suggestions=["Use single slashes to separate username from dataset name"],
81
+ )
82
+
83
+ # Warn about very long dataset IDs (likely an error)
84
+ if len(dataset_id) > 100:
85
+ raise ValidationError(
86
+ f"dataset_id is unusually long ({len(dataset_id)} characters)",
87
+ suggestions=[
88
+ "Check if the dataset_id is correct",
89
+ "Dataset IDs are typically shorter than 100 characters",
90
+ ],
91
+ )
92
+
93
+ return dataset_id
94
+
95
+
96
+ def validate_config_name(config_name: Optional[str]) -> Optional[str]:
+     """
+     Validate and normalize a dataset configuration name.
+
+     Args:
+         config_name: Configuration name to validate (can be None)
+
+     Returns:
+         Normalized config_name, or None
+
+     Raises:
+         ValidationError: If config_name is invalid
+     """
+     if config_name is None:
+         return None
+
+     if not isinstance(config_name, str):
+         raise ValidationError(
+             f"config_name must be a string or None, got {type(config_name).__name__}",
+             suggestions=["Pass config_name as a string or omit it for the default configuration"],
+         )
+
+     config_name = config_name.strip()
+
+     if not config_name:
+         raise ValidationError(
+             "config_name cannot be empty or contain only whitespace",
+             suggestions=[
+                 "Provide a valid configuration name or omit the parameter",
+                 "Use None or don't specify config_name for the default configuration",
+             ],
+         )
+
+     # Validate format: alphanumeric start, then letters, numbers, hyphens, underscores, dots
+     pattern = r"^[a-zA-Z0-9][\w\-\.]*$"
+     if not re.match(pattern, config_name):
+         raise ValidationError(
+             f"Invalid config_name format: '{config_name}'",
+             suggestions=[
+                 "Configuration names should contain only letters, numbers, hyphens, underscores, and dots",
+                 "Examples: 'cola', 'sst2', 'plain_text'",
+             ],
+         )
+
+     if len(config_name) > 50:
+         raise ValidationError(
+             f"config_name is unusually long ({len(config_name)} characters)",
+             suggestions=[
+                 "Check if the config_name is correct",
+                 "Configuration names are typically shorter than 50 characters",
+             ],
+         )
+
+     return config_name
+
+
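+ # Usage sketch (illustrative; inputs are hypothetical examples):
+ # >>> validate_config_name(None) is None
+ # True
+ # >>> validate_config_name(" plain_text ")
+ # 'plain_text'
+
+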
+ def validate_split_name(split: str) -> str:
+     """
+     Validate and normalize a dataset split name.
+
+     Args:
+         split: Split name to validate
+
+     Returns:
+         Normalized split name (lowercase, stripped)
+
+     Raises:
+         ValidationError: If split is invalid
+     """
+     if not isinstance(split, str):
+         if split is None:
+             raise ValidationError(
+                 "split is required and cannot be empty",
+                 suggestions=[
+                     "Provide a valid split name",
+                     "Common splits: 'train', 'validation', 'test'",
+                 ],
+             )
+         raise ValidationError(
+             f"split must be a string, got {type(split).__name__}",
+             suggestions=["Ensure split is passed as a string value"],
+         )
+
+     split = split.strip().lower()
+
+     if not split:
+         raise ValidationError(
+             "split cannot be empty or contain only whitespace",
+             suggestions=["Provide a non-empty split name"],
+         )
+
+     # Validate format: alphanumeric start, then letters, numbers, hyphens, underscores
+     pattern = r"^[a-zA-Z0-9][\w\-]*$"
+     if not re.match(pattern, split):
+         raise ValidationError(
+             f"Invalid split name format: '{split}'",
+             suggestions=[
+                 "Split names should contain only letters, numbers, hyphens, and underscores",
+                 "Common splits: 'train', 'validation', 'test', 'dev'",
+             ],
+         )
+
+     # We don't enforce a fixed set of split names, since datasets can define
+     # custom splits; the common names below are used only for error messages
+     # and for a sanity check on long, unrecognized names.
+     common_splits = {"train", "validation", "test", "dev", "val"}
+
+     if split not in common_splits and len(split) > 20:
+         raise ValidationError(
+             f"Unusual split name: '{split}' (length: {len(split)})",
+             suggestions=[
+                 "Check if the split name is correct",
+                 f"Common splits are: {', '.join(sorted(common_splits))}",
+                 "Some datasets may have custom split names",
+             ],
+         )
+
+     return split
+
+
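+ # Usage sketch (illustrative; note that split names are lowercased):
+ # >>> validate_split_name("  Train ")
+ # 'train'
+ # >>> validate_split_name("train[:100]")
+ # ... raises ValidationError: slice syntax fails the format check
+
+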
+ def validate_sample_size(num_samples: int, parameter_name: str = "num_samples") -> int:
+     """
+     Validate a sample size parameter.
+
+     Args:
+         num_samples: Number of samples to validate
+         parameter_name: Name of the parameter (for error messages)
+
+     Returns:
+         Validated num_samples
+
+     Raises:
+         ValidationError: If num_samples is invalid
+     """
+     # bool is a subclass of int, so reject it explicitly before the int check
+     if isinstance(num_samples, bool) or not isinstance(num_samples, int):
+         # Accept floats that represent whole numbers (e.g. 100.0)
+         if isinstance(num_samples, float) and num_samples.is_integer():
+             num_samples = int(num_samples)
+         else:
+             raise ValidationError(
+                 f"{parameter_name} must be an integer, got {type(num_samples).__name__}",
+                 suggestions=[
+                     f"Provide {parameter_name} as an integer value",
+                     "Example: num_samples=100",
+                 ],
+             )
+
+     if num_samples <= 0:
+         raise ValidationError(
+             f"{parameter_name} must be positive, got {num_samples}",
+             suggestions=[
+                 f"Provide a positive integer for {parameter_name}",
+                 "Example: num_samples=10 or num_samples=1000",
+             ],
+         )
+
+     # Get the maximum sample size from configuration
+     config = get_config()
+     max_sample_size = config.max_sample_size
+
+     if num_samples > max_sample_size:
+         raise ValidationError(
+             f"{parameter_name} ({num_samples}) exceeds maximum allowed ({max_sample_size})",
+             suggestions=[
+                 f"Reduce {parameter_name} to {max_sample_size} or less",
+                 f"Current maximum is configured as {max_sample_size}",
+                 "For larger samples, consider using streaming or batch processing",
+             ],
+         )
+
+     # Very small samples (fewer than 5) are allowed; they may simply be less
+     # informative, so we deliberately don't raise for them.
+     return num_samples
+
+
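+ # Usage sketch (illustrative; assumes the configured max_sample_size >= 100):
+ # >>> validate_sample_size(100.0)  # whole-number floats are coerced
+ # 100
+ # >>> validate_sample_size(0)
+ # ... raises ValidationError: num_samples must be positive
+
+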
+ def validate_indices(indices: List[int]) -> List[int]:
+     """
+     Validate a list of indices for sampling.
+
+     Args:
+         indices: List of indices to validate
+
+     Returns:
+         Validated indices list
+
+     Raises:
+         ValidationError: If indices are invalid
+     """
+     if not isinstance(indices, list):
+         raise ValidationError(
+             f"indices must be a list, got {type(indices).__name__}",
+             suggestions=[
+                 "Provide indices as a list of integers",
+                 "Example: indices=[0, 1, 2]",
+             ],
+         )
+
+     if not indices:
+         raise ValidationError(
+             "indices list is required and cannot be empty",
+             suggestions=[
+                 "Provide a non-empty list of indices",
+                 "Example: indices=[0, 1, 2, 10, 20]",
+             ],
+         )
+
+     # Validate each index (bool is excluded because it is a subclass of int)
+     for i, idx in enumerate(indices):
+         if isinstance(idx, bool) or not isinstance(idx, int):
+             raise ValidationError(
+                 f"All indices must be integers, got {type(idx).__name__} at position {i}",
+                 suggestions=[
+                     "Ensure all indices are integer values",
+                     "Example: indices=[0, 1, 2] (not [0.5, 1.2])",
+                 ],
+             )
+
+         if idx < 0:
+             raise ValidationError(
+                 f"All indices must be non-negative, got {idx} at position {i}",
+                 suggestions=[
+                     "Provide only non-negative indices (0 or greater)",
+                     "Example: indices=[0, 1, 2, 10]",
+                 ],
+             )
+
+     # Cap the number of indices at the configured maximum sample size
+     config = get_config()
+     max_sample_size = config.max_sample_size
+
+     if len(indices) > max_sample_size:
+         raise ValidationError(
+             f"Too many indices requested ({len(indices)}), maximum is {max_sample_size}",
+             suggestions=[
+                 f"Reduce the number of indices to {max_sample_size} or less",
+                 "Consider using regular sampling instead of specific indices",
+             ],
+         )
+
+     return indices
+
+
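+ # Usage sketch (illustrative; assumes max_sample_size is large enough):
+ # >>> validate_indices([0, 5, 2])
+ # [0, 5, 2]
+ # >>> validate_indices([0, -1])
+ # ... raises ValidationError: indices must be non-negative
+
+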
+ def format_validation_error(error: ValidationError) -> str:
+     """
+     Format a validation error with suggestions into a user-friendly message.
+
+     Args:
+         error: ValidationError to format
+
+     Returns:
+         Formatted error message with suggestions
+     """
+     message = str(error)
+
+     if error.suggestions:
+         message += "\n\nSuggestions:"
+         for suggestion in error.suggestions:
+             message += f"\n - {suggestion}"
+
+     return message
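+
+
+ # End-to-end sketch (illustrative): catching a validation failure and
+ # rendering it for the user; the empty dataset_id below is a hypothetical input.
+ # try:
+ #     validate_dataset_id("")
+ # except ValidationError as exc:
+ #     print(format_validation_error(exc))  # message plus a "Suggestions:" list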