Commit 2a623ac (parent ab96cfe)

error message handling
Changed files:
- .kiro/specs/hf-eda-mcp-server/tasks.md        +2   -2
- src/hf_eda_mcp/error_handling.py              +445 -0
- src/hf_eda_mcp/integrations/hf_client.py      +62  -12
- src/hf_eda_mcp/services/dataset_service.py    +110 -16
- src/hf_eda_mcp/tools/__init__.py              +7   -5
- src/hf_eda_mcp/tools/analysis.py              +48  -63
- src/hf_eda_mcp/tools/metadata.py              +45  -53
- src/hf_eda_mcp/tools/sampling.py              +58  -79
- src/hf_eda_mcp/validation.py                  +358 -0
.kiro/specs/hf-eda-mcp-server/tasks.md CHANGED

@@ -58,13 +58,13 @@
     - _Requirements: 4.1, 4.2, 4.4_

 - [ ] 5. Implement error handling and validation

-  - [ ] 5.1 Add input validation for all tools
+  - [x] 5.1 Add input validation for all tools
     - Validate dataset identifiers and configuration names
     - Check split names and sample size parameters
     - Provide helpful error messages for invalid inputs
     - _Requirements: 1.2, 2.1_

-  - [ ] 5.2 Implement comprehensive error handling
+  - [x] 5.2 Implement comprehensive error handling
     - Handle dataset not found errors with suggestions
     - Manage authentication errors for private datasets
     - Add retry logic for network and API failures
src/hf_eda_mcp/error_handling.py ADDED

@@ -0,0 +1,445 @@
"""
Comprehensive error handling utilities for hf-eda-mcp.

This module provides error handling utilities including retry logic,
error suggestions, and formatted error responses for better user experience.
"""

import logging
import time
import functools
from typing import Optional, Callable, Any, List, Dict, TypeVar, cast
from requests.exceptions import RequestException, ConnectionError, Timeout, HTTPError

logger = logging.getLogger(__name__)

# Type variable for generic function return types
T = TypeVar('T')


class RetryConfig:
    """Configuration for retry logic."""

    def __init__(
        self,
        max_attempts: int = 3,
        initial_delay: float = 1.0,
        max_delay: float = 30.0,
        exponential_base: float = 2.0,
        jitter: bool = True
    ):
        """
        Initialize retry configuration.

        Args:
            max_attempts: Maximum number of retry attempts
            initial_delay: Initial delay between retries in seconds
            max_delay: Maximum delay between retries in seconds
            exponential_base: Base for exponential backoff
            jitter: Whether to add random jitter to delays
        """
        self.max_attempts = max_attempts
        self.initial_delay = initial_delay
        self.max_delay = max_delay
        self.exponential_base = exponential_base
        self.jitter = jitter


# Default retry configuration
DEFAULT_RETRY_CONFIG = RetryConfig(
    max_attempts=3,
    initial_delay=1.0,
    max_delay=30.0,
    exponential_base=2.0,
    jitter=True
)


def calculate_retry_delay(attempt: int, config: RetryConfig) -> float:
    """
    Calculate delay for retry attempt using exponential backoff.

    Args:
        attempt: Current attempt number (0-indexed)
        config: Retry configuration

    Returns:
        Delay in seconds
    """
    delay = min(
        config.initial_delay * (config.exponential_base ** attempt),
        config.max_delay
    )

    # Add jitter to prevent thundering herd
    if config.jitter:
        import random
        delay = delay * (0.5 + random.random())

    return delay


def should_retry_error(error: Exception) -> bool:
    """
    Determine if an error should trigger a retry.

    Args:
        error: Exception to check

    Returns:
        True if error is retryable, False otherwise
    """
    # Network errors are retryable
    if isinstance(error, (ConnectionError, Timeout)):
        return True

    # HTTP errors with specific status codes are retryable
    if isinstance(error, HTTPError):
        # Retry on 5xx server errors and 429 rate limiting
        if hasattr(error, 'response') and error.response is not None:
            status_code = error.response.status_code
            return status_code >= 500 or status_code == 429

    # Generic request exceptions might be retryable
    if isinstance(error, RequestException):
        # Check if it's a connection-related issue
        error_str = str(error).lower()
        retryable_keywords = ['timeout', 'connection', 'network', 'temporary']
        return any(keyword in error_str for keyword in retryable_keywords)

    # Don't retry other errors by default
    return False


def retry_with_backoff(
    func: Optional[Callable[..., T]] = None,
    *,
    config: Optional[RetryConfig] = None,
    retryable_exceptions: Optional[tuple] = None
) -> Callable[..., T]:
    """
    Decorator to retry a function with exponential backoff.

    Args:
        func: Function to decorate (when used without arguments)
        config: Retry configuration (uses default if not provided)
        retryable_exceptions: Tuple of exception types to retry on

    Returns:
        Decorated function with retry logic

    Example:
        @retry_with_backoff
        def fetch_data():
            # ... network call ...
            pass

        @retry_with_backoff(config=RetryConfig(max_attempts=5))
        def fetch_with_custom_config():
            # ... network call ...
            pass
    """
    if config is None:
        config = DEFAULT_RETRY_CONFIG

    if retryable_exceptions is None:
        retryable_exceptions = (ConnectionError, Timeout, RequestException)

    def decorator(f: Callable[..., T]) -> Callable[..., T]:
        @functools.wraps(f)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            last_exception: Optional[Exception] = None

            for attempt in range(config.max_attempts):
                try:
                    return f(*args, **kwargs)

                except retryable_exceptions as e:
                    last_exception = e

                    # Check if we should retry this specific error
                    if not should_retry_error(e):
                        logger.warning(f"Error is not retryable: {e}")
                        raise

                    # Don't sleep after the last attempt
                    if attempt < config.max_attempts - 1:
                        delay = calculate_retry_delay(attempt, config)
                        logger.warning(
                            f"Attempt {attempt + 1}/{config.max_attempts} failed: {e}. "
                            f"Retrying in {delay:.2f}s..."
                        )
                        time.sleep(delay)
                    else:
                        logger.error(
                            f"All {config.max_attempts} attempts failed. Last error: {e}"
                        )

                except Exception as e:
                    # Non-retryable exception, raise immediately
                    logger.error(f"Non-retryable error occurred: {e}")
                    raise

            # If we get here, all retries failed
            if last_exception:
                raise last_exception
            else:
                raise RuntimeError("Retry logic failed without capturing exception")

        return cast(Callable[..., T], wrapper)

    # Support both @retry_with_backoff and @retry_with_backoff()
    if func is None:
        return decorator
    else:
        return decorator(func)


def get_dataset_suggestions(dataset_id: str) -> List[str]:
    """
    Generate helpful suggestions for dataset not found errors.

    Args:
        dataset_id: The dataset identifier that was not found

    Returns:
        List of suggestion strings
    """
    suggestions = []

    # Check for common typos or formatting issues
    if " " in dataset_id:
        suggestions.append(
            f"Dataset ID contains spaces. Try: '{dataset_id.replace(' ', '-')}' or '{dataset_id.replace(' ', '_')}'"
        )

    if dataset_id.isupper():
        suggestions.append(
            f"Dataset ID is all uppercase. Try lowercase: '{dataset_id.lower()}'"
        )

    # Check if it looks like it might be missing organization prefix
    if "/" not in dataset_id:
        suggestions.append(
            f"Dataset might need an organization prefix. Try searching for: 'organization/{dataset_id}'"
        )

    # General suggestions
    suggestions.extend([
        "Verify the dataset exists on HuggingFace Hub: https://huggingface.co/datasets",
        f"Search for similar datasets: https://huggingface.co/datasets?search={dataset_id}",
        "Check if the dataset name is spelled correctly",
        "Ensure you have access if the dataset is private or gated"
    ])

    return suggestions


def format_authentication_error(
    dataset_id: str,
    is_gated: bool = False,
    has_token: bool = False
) -> Dict[str, Any]:
    """
    Format authentication error with helpful guidance.

    Args:
        dataset_id: The dataset identifier
        is_gated: Whether the dataset is gated (requires approval)
        has_token: Whether a token was provided

    Returns:
        Dictionary with error details and suggestions
    """
    error_details = {
        "error_type": "authentication_error",
        "dataset_id": dataset_id,
        "is_gated": is_gated,
        "has_token": has_token,
        "message": "",
        "suggestions": []
    }

    if is_gated:
        error_details["message"] = (
            f"Dataset '{dataset_id}' is gated and requires approval to access."
        )
        error_details["suggestions"] = [
            f"Request access to the dataset: https://huggingface.co/datasets/{dataset_id}",
            "Wait for approval from the dataset owner",
            "Provide a valid HuggingFace token after receiving access",
            "Check your HuggingFace account for access status"
        ]
    elif not has_token:
        error_details["message"] = (
            f"Dataset '{dataset_id}' is private and requires authentication."
        )
        error_details["suggestions"] = [
            "Provide a HuggingFace authentication token",
            "Create a token at: https://huggingface.co/settings/tokens",
            "Set the token in your environment: HF_TOKEN=your_token",
            "Ensure the token has read access to datasets"
        ]
    else:
        error_details["message"] = (
            f"Authentication failed for dataset '{dataset_id}'. "
            "Your token may not have access to this dataset."
        )
        error_details["suggestions"] = [
            "Verify your token is valid and not expired",
            "Check if your token has the required permissions",
            "Ensure you have been granted access to this private dataset",
            "Try regenerating your token at: https://huggingface.co/settings/tokens"
        ]

    return error_details


def format_network_error(
    error: Exception,
    operation: str = "operation"
) -> Dict[str, Any]:
    """
    Format network error with helpful guidance.

    Args:
        error: The network exception
        operation: Description of the operation that failed

    Returns:
        Dictionary with error details and suggestions
    """
    error_details = {
        "error_type": "network_error",
        "operation": operation,
        "message": f"Network error during {operation}: {str(error)}",
        "suggestions": []
    }

    # Determine specific error type and provide targeted suggestions
    if isinstance(error, Timeout):
        error_details["error_subtype"] = "timeout"
        error_details["suggestions"] = [
            "The request timed out. Try again in a moment",
            "Check your internet connection",
            "The HuggingFace Hub might be experiencing high load",
            "Try with a smaller sample size or different dataset"
        ]
    elif isinstance(error, ConnectionError):
        error_details["error_subtype"] = "connection"
        error_details["suggestions"] = [
            "Unable to connect to HuggingFace Hub",
            "Check your internet connection",
            "Verify you can access https://huggingface.co",
            "Check if you're behind a firewall or proxy",
            "Try again in a few moments"
        ]
    else:
        error_details["error_subtype"] = "general"
        error_details["suggestions"] = [
            "A network error occurred. Please try again",
            "Check your internet connection",
            "The HuggingFace Hub might be temporarily unavailable",
            "Try again in a few moments"
        ]

    return error_details


def format_error_response(
    error: Exception,
    context: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Format any error into a structured response with helpful information.

    Args:
        error: The exception to format
        context: Optional context information (dataset_id, operation, etc.)

    Returns:
        Dictionary with formatted error information
    """
    from hf_eda_mcp.integrations.hf_client import (
        DatasetNotFoundError,
        AuthenticationError,
        NetworkError
    )

    context = context or {}

    # Handle specific error types
    if isinstance(error, DatasetNotFoundError):
        dataset_id = context.get("dataset_id", "unknown")
        return {
            "error_type": "dataset_not_found",
            "message": str(error),
            "dataset_id": dataset_id,
            "suggestions": get_dataset_suggestions(dataset_id)
        }

    elif isinstance(error, AuthenticationError):
        dataset_id = context.get("dataset_id", "unknown")
        is_gated = "gated" in str(error).lower()
        has_token = context.get("has_token", False)
        return format_authentication_error(dataset_id, is_gated, has_token)

    elif isinstance(error, NetworkError):
        operation = context.get("operation", "operation")
        # Extract the original exception if available
        original_error = error.__cause__ or error
        return format_network_error(original_error, operation)

    elif isinstance(error, (ConnectionError, Timeout, RequestException)):
        operation = context.get("operation", "operation")
        return format_network_error(error, operation)

    elif isinstance(error, ValueError):
        return {
            "error_type": "validation_error",
            "message": str(error),
            "suggestions": [
                "Check that all input parameters are valid",
                "Refer to the tool documentation for parameter requirements"
            ]
        }

    else:
        # Generic error
        return {
            "error_type": "unknown_error",
            "message": f"An unexpected error occurred: {str(error)}",
            "error_class": type(error).__name__,
            "suggestions": [
                "Try the operation again",
                "Check the logs for more details",
                "If the problem persists, report it as an issue"
            ]
        }


def log_error_with_context(
    error: Exception,
    context: Optional[Dict[str, Any]] = None,
    level: int = logging.ERROR
) -> None:
    """
    Log an error with contextual information.

    Args:
        error: The exception to log
        context: Optional context information
        level: Logging level (default: ERROR)
    """
    context = context or {}

    # Build context string
    context_parts = [f"{k}={v}" for k, v in context.items()]
    context_str = ", ".join(context_parts) if context_parts else "no context"

    # Log with full details
    logger.log(
        level,
        f"Error occurred: {type(error).__name__}: {str(error)} | Context: {context_str}",
        exc_info=True
    )
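As a quick sanity check of the backoff schedule and the two decorator forms above, here is a minimal usage sketch; the `flaky_call` function is hypothetical and the URL is illustrative only:

import requests

from hf_eda_mcp.error_handling import (
    RetryConfig,
    calculate_retry_delay,
    retry_with_backoff,
)

# Un-jittered, delays grow as initial_delay * exponential_base ** attempt,
# capped at max_delay: attempts 0..5 give 1s, 2s, 4s, 8s, 16s, 30s.
cfg = RetryConfig(max_attempts=4, jitter=False)
print([calculate_retry_delay(a, cfg) for a in range(4)])  # [1.0, 2.0, 4.0, 8.0]

@retry_with_backoff(config=cfg)
def flaky_call() -> int:
    # A transient ConnectionError/Timeout raised here is retried up to 4 times.
    # A 404 raises HTTPError (a RequestException subclass), which
    # should_retry_error classifies as non-retryable, so it fails fast.
    resp = requests.get("https://huggingface.co/api/datasets/squad", timeout=5)
    resp.raise_for_status()
    return resp.status_code

With jitter enabled (the default), each delay is scaled by a random factor in [0.5, 1.5), which spreads out concurrent retries.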
src/hf_eda_mcp/integrations/hf_client.py CHANGED

@@ -11,6 +11,14 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
 from requests.exceptions import RequestException, ConnectionError, Timeout

+from hf_eda_mcp.error_handling import (
+    retry_with_backoff,
+    RetryConfig,
+    format_error_response,
+    log_error_with_context,
+    get_dataset_suggestions
+)
+
 logger = logging.getLogger()


@@ -80,11 +88,15 @@ class HfClient:
                 f"Failed to authenticate with HuggingFace Hub: {str(e)}"
             )

+    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
     def get_dataset_info(
         self, dataset_id: str, config_name: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         Retrieve comprehensive dataset information from HuggingFace Hub.
+
+        This method includes automatic retry logic with exponential backoff
+        for transient network errors.

         Args:
             dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')

@@ -102,6 +114,8 @@ class HfClient:
             AuthenticationError: If dataset is private and authentication fails
             NetworkError: If network request fails
         """
+        context = {"dataset_id": dataset_id, "config_name": config_name, "operation": "get_dataset_info"}
+
         try:
             # Get dataset info from HuggingFace Hub
             dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")

@@ -188,24 +202,60 @@ class HfClient:

             return metadata

-        except RepositoryNotFoundError:
+        except RepositoryNotFoundError as e:
+            log_error_with_context(e, context, level=logging.WARNING)
+            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
+            suggestions = get_dataset_suggestions(dataset_id)
+            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
+            raise DatasetNotFoundError(error_msg)
+
+        except GatedRepoError as e:
+            log_error_with_context(e, context, level=logging.WARNING)
+            is_gated = True
+            has_token = self.token is not None
+
+            if is_gated:
+                error_msg = (
+                    f"Dataset '{dataset_id}' is gated and requires approval. "
+                    f"Request access at: https://huggingface.co/datasets/{dataset_id}"
+                )
+            else:
+                error_msg = (
+                    f"Dataset '{dataset_id}' is private. "
+                    "Please provide a valid authentication token."
+                )
+
+            logger.info(f"Authentication required for '{dataset_id}': has_token={has_token}, is_gated={is_gated}")
+            raise AuthenticationError(error_msg)
+
         except (ConnectionError, Timeout) as e:
+            log_error_with_context(e, context)
+            # Let retry decorator handle these - if we get here, all retries failed
+            raise NetworkError(
+                f"Network error while fetching dataset info after retries: {str(e)}"
+            ) from e
+
         except RequestException as e:
+            log_error_with_context(e, context)
+            # Check if it's a retryable error
+            if hasattr(e, 'response') and e.response is not None:
+                status_code = e.response.status_code
+                if status_code == 429:
+                    raise NetworkError(
+                        "Rate limit exceeded. Please try again later."
+                    ) from e
+                elif status_code >= 500:
+                    raise NetworkError(
+                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
+                    ) from e
+            raise NetworkError(f"Request failed: {str(e)}") from e
+
         except Exception as e:
+            log_error_with_context(e, context)
             logger.error(
                 f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
             )
-            raise HfClientError(f"Failed to get dataset info: {str(e)}")
+            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

     def list_dataset_configs(self, dataset_id: str) -> List[str]:
         """
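For reference, a minimal sketch of how a caller might consume the typed exceptions raised by get_dataset_info above; HfClient construction arguments are elided and the dataset id and printed fields are illustrative assumptions:

from hf_eda_mcp.integrations.hf_client import (
    AuthenticationError,
    DatasetNotFoundError,
    HfClient,
    NetworkError,
)

client = HfClient()
try:
    info = client.get_dataset_info("squad")
    print(info.get("description", ""))
except DatasetNotFoundError as e:
    # Suggestions were already logged via get_dataset_suggestions
    print(f"Not found: {e}")
except AuthenticationError as e:
    # Gated vs. private guidance is baked into the message
    print(f"Access denied: {e}")
except NetworkError as e:
    # Raised only after the retry decorator has exhausted its attempts
    print(f"Network failure: {e}")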
src/hf_eda_mcp/services/dataset_service.py CHANGED

@@ -14,7 +14,19 @@ from pathlib import Path
 from datasets import load_dataset
 from datasets.utils.logging import disable_progress_bar

-from hf_eda_mcp.integrations.hf_client import
+from hf_eda_mcp.integrations.hf_client import (
+    HfClient,
+    HfClientError,
+    DatasetNotFoundError,
+    AuthenticationError,
+    NetworkError
+)
+from hf_eda_mcp.error_handling import (
+    retry_with_backoff,
+    RetryConfig,
+    log_error_with_context,
+    format_error_response
+)

 logger = logging.getLogger(__name__)

@@ -128,6 +140,9 @@ class DatasetService:
         """
         Load dataset information from HuggingFace Hub with caching.

+        Includes automatic retry logic for transient failures and comprehensive
+        error handling with helpful suggestions.
+
         Args:
             dataset_id: HuggingFace dataset identifier
             config_name: Optional configuration name

@@ -138,7 +153,14 @@ class DatasetService:
         Raises:
             DatasetNotFoundError: If dataset doesn't exist
             AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network operations fail after retries
         """
+        context = {
+            "dataset_id": dataset_id,
+            "config_name": config_name,
+            "operation": "load_dataset_info"
+        }
+
         cache_key = self._get_cache_key(dataset_id, config_name)
         cache_file = self.metadata_cache_dir / f"{cache_key}.json"

@@ -148,7 +170,7 @@ class DatasetService:
             logger.debug(f"Using cached metadata for {dataset_id}")
             return cached_data

-        # Fetch from HuggingFace Hub
+        # Fetch from HuggingFace Hub with retry logic
         try:
             logger.info(f"Fetching metadata for dataset: {dataset_id}")
             metadata = self.hf_client.get_dataset_info(dataset_id, config_name)

@@ -156,15 +178,29 @@ class DatasetService:
             # Add cache timestamp
             metadata['_cached_at'] = time.time()

-            # Save to cache
+            # Save to cache (don't fail if caching fails)
+            try:
+                self._save_to_cache(cache_file, metadata)
+            except CacheError as e:
+                logger.warning(f"Failed to cache metadata, continuing anyway: {e}")

             return metadata

-        except
-            # Re-raise
+        except (DatasetNotFoundError, AuthenticationError, NetworkError):
+            # Re-raise these specific errors with context
+            log_error_with_context(
+                Exception(f"Failed to load dataset info for {dataset_id}"),
+                context,
+                level=logging.WARNING
+            )
             raise
+
+        except Exception as e:
+            # Unexpected error
+            log_error_with_context(e, context)
+            raise DatasetServiceError(f"Unexpected error loading dataset info: {str(e)}") from e

+    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
     def load_dataset_sample(
         self,
         dataset_id: str,

@@ -176,6 +212,9 @@ class DatasetService:
         """
         Load samples from the specified dataset with caching.

+        Includes automatic retry logic for transient failures and comprehensive
+        error handling.
+
         Args:
             dataset_id: HuggingFace dataset identifier
             split: Dataset split to sample from

@@ -188,8 +227,18 @@ class DatasetService:

         Raises:
             DatasetNotFoundError: If dataset or split doesn't exist
+            AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network operations fail after retries
+            DatasetServiceError: If sampling fails for other reasons
         """
+        context = {
+            "dataset_id": dataset_id,
+            "split": split,
+            "num_samples": num_samples,
+            "config_name": config_name,
+            "operation": "load_dataset_sample"
+        }
+
         # For small samples, check cache first
         if num_samples <= 100:  # Only cache small samples
             cache_key = self._get_sample_cache_key(dataset_id, split, num_samples, config_name)

@@ -208,7 +257,8 @@ class DatasetService:
                 dataset_id,
                 name=config_name,
                 split=split,
-                streaming=streaming
+                streaming=streaming,
+                token=self.hf_client.token
             )

             # Take the requested number of samples

@@ -240,21 +290,65 @@ class DatasetService:
                 '_sampled_at': time.time()
             }

-            # Cache small samples
+            # Cache small samples (don't fail if caching fails)
             if num_samples <= 100:
                 try:
                     self._save_to_cache(cache_file, sample_data)
-                except CacheError:
-                    pass
+                except CacheError as e:
+                    logger.warning(f"Failed to cache sample, continuing anyway: {e}")

             return sample_data

+        except DatasetNotFoundError:
+            # Re-raise as-is
+            log_error_with_context(
+                Exception(f"Dataset or split not found: {dataset_id}/{split}"),
+                context,
+                level=logging.WARNING
+            )
+            raise
+
+        except AuthenticationError:
+            # Re-raise as-is
+            log_error_with_context(
+                Exception(f"Authentication failed for dataset: {dataset_id}"),
+                context,
+                level=logging.WARNING
+            )
+            raise
+
         except Exception as e:
+            log_error_with_context(e, context)
+
+            # Try to provide more specific error messages
+            error_str = str(e).lower()
+
+            if "not found" in error_str or "doesn't exist" in error_str:
+                if "split" in error_str or split in error_str:
+                    raise DatasetNotFoundError(
+                        f"Split '{split}' not found in dataset '{dataset_id}'. "
+                        f"Available splits may be different."
+                    ) from e
+                else:
+                    raise DatasetNotFoundError(
+                        f"Dataset '{dataset_id}' not found on HuggingFace Hub."
+                    ) from e
+
+            elif "gated" in error_str or "private" in error_str or "authentication" in error_str:
+                raise AuthenticationError(
+                    f"Authentication required for dataset '{dataset_id}'. "
+                    "Please provide a valid HuggingFace token."
+                ) from e
+
+            elif "timeout" in error_str or "connection" in error_str:
+                raise NetworkError(
+                    f"Network error while loading dataset sample: {str(e)}"
+                ) from e
+
+            else:
+                raise DatasetServiceError(
+                    f"Failed to load dataset sample: {str(e)}"
+                ) from e

     def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
         """
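The structured responses produced by format_error_response, which the tools below log as guidance, are plain dicts. A sketch of the shape for a not-found error, derived from the handler in error_handling.py (the misspelled dataset id is illustrative, and constructing DatasetNotFoundError directly from a message assumes the usual Exception subclass):

from hf_eda_mcp.error_handling import format_error_response
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError

err = DatasetNotFoundError("Dataset 'sqaud' not found on HuggingFace Hub.")
response = format_error_response(err, context={"dataset_id": "sqaud"})
# response == {
#     "error_type": "dataset_not_found",
#     "message": "Dataset 'sqaud' not found on HuggingFace Hub.",
#     "dataset_id": "sqaud",
#     "suggestions": [...],  # from get_dataset_suggestions("sqaud")
# }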
src/hf_eda_mcp/tools/__init__.py CHANGED

@@ -4,14 +4,17 @@ EDA tools module for HuggingFace datasets.
 This package contains individual EDA functions that will be exposed as MCP tools.
 """

-from .metadata import get_dataset_metadata
-from .sampling import
+from .metadata import get_dataset_metadata
+from .sampling import (
+    get_dataset_sample,
+    get_dataset_sample_with_indices,
+    get_available_splits,
+)
+from .analysis import analyze_dataset_features

 __all__ = [
     # Metadata tools
     'get_dataset_metadata',
-    'validate_dataset_metadata_inputs',

     # Sampling tools
     'get_dataset_sample',

@@ -20,5 +23,4 @@ __all__ = [

     # Analysis tools
     'analyze_dataset_features',
-    'validate_analysis_inputs'
 ]
src/hf_eda_mcp/tools/analysis.py CHANGED

@@ -11,7 +11,16 @@ from typing import Optional, Dict, Any, List
 from collections import Counter
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    validate_split_name,
+    validate_sample_size,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context

 logger = logging.getLogger(__name__)

@@ -84,9 +93,24 @@ def analyze_dataset_features(
     >>> quality = analysis['data_quality']
     >>> print(f"Overall quality score: {quality['quality_score']:.2f}")
     """
-    # Input validation
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+        split = validate_split_name(split)
+        sample_size = validate_sample_size(sample_size, "sample_size")
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "split": split,
+        "sample_size": sample_size,
+        "config_name": config_name,
+        "operation": "analyze_dataset_features"
+    }
+
     logger.info(
         f"Analyzing features for dataset: {dataset_id}, split: {split}, "
         f"sample_size: {sample_size}"

@@ -153,12 +177,28 @@ def analyze_dataset_features(
         )
         return analysis_result

-    except
+    except DatasetNotFoundError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset/split not found suggestions: {error_response.get('suggestions', [])}")
+        raise
+
+    except AuthenticationError as e:
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
         raise
+
     except Exception as e:
-        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e


 def _analyze_single_feature(

@@ -495,58 +535,3 @@ def _generate_analysis_summary(
         summary_parts.append(f"Avg missing: {avg_missing:.1f}%")

     return " | ".join(summary_parts)
-
-
-def validate_analysis_inputs(
-    dataset_id: str, split: str, sample_size: int, config_name: Optional[str] = None
-) -> None:
-    """
-    Validate inputs for dataset analysis.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        split: Split name to validate
-        sample_size: Sample size to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If any input is invalid
-    """
-    # Validate dataset_id
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Validate split
-    if not split or not isinstance(split, str):
-        raise ValueError("split must be a non-empty string")
-
-    split = split.strip()
-    if not split:
-        raise ValueError("split cannot be empty or whitespace")
-
-    # Validate sample_size
-    if not isinstance(sample_size, int):
-        raise ValueError("sample_size must be an integer")
-
-    if sample_size <= 0:
-        raise ValueError("sample_size must be positive")
-
-    # Get max sample size from config
-    config = get_config()
-    max_sample_size = config.max_sample_size
-
-    if sample_size > max_sample_size:
-        raise ValueError(f"sample_size cannot exceed {max_sample_size}")
-
-    # Validate config_name
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
src/hf_eda_mcp/tools/metadata.py CHANGED

@@ -9,7 +9,14 @@ import logging
 from typing import Optional, Dict, Any
 from hf_eda_mcp.config import get_config
 from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
-from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
+from hf_eda_mcp.validation import (
+    validate_dataset_id,
+    validate_config_name,
+    ValidationError,
+    format_validation_error,
+)
+from hf_eda_mcp.error_handling import format_error_response, log_error_with_context

 logger = logging.getLogger(__name__)

@@ -72,18 +79,19 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     >>> metadata = get_dataset_metadata("glue", config_name="cola")
     >>> print(f"Config: {metadata.get('config_name', 'default')}")
     """
-    # Input validation
+    # Input validation using centralized validation
+    try:
+        dataset_id = validate_dataset_id(dataset_id)
+        config_name = validate_config_name(config_name)
+    except ValidationError as e:
+        logger.error(f"Validation error: {format_validation_error(e)}")
+        raise ValueError(format_validation_error(e))
+
+    context = {
+        "dataset_id": dataset_id,
+        "config_name": config_name,
+        "operation": "get_dataset_metadata"
+    }

     logger.info(f"Retrieving metadata for dataset: {dataset_id}" +
                 (f", config: {config_name}" if config_name else ""))

@@ -115,12 +123,31 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
         logger.info(f"Successfully retrieved metadata for {dataset_id}")
         return metadata

-    except
-        #
+    except DatasetNotFoundError as e:
+        # Add helpful context to the error
+        log_error_with_context(e, context, level=logging.WARNING)
+        error_response = format_error_response(e, context)
+        logger.info(f"Dataset not found suggestions: {error_response.get('suggestions', [])}")
         raise
+
+    except AuthenticationError as e:
+        # Add helpful context to the error
+        log_error_with_context(e, context, level=logging.WARNING)
+        context["has_token"] = get_dataset_service().is_authenticated
+        error_response = format_error_response(e, context)
+        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
+        raise
+
+    except NetworkError as e:
+        # Network errors after retries
+        log_error_with_context(e, context)
+        error_response = format_error_response(e, context)
+        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
+        raise
+
     except Exception as e:
-        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}")
+        log_error_with_context(e, context)
+        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e


 def _format_bytes(size_bytes: int) -> str:

@@ -184,39 +211,4 @@ def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
     if features:
         summary_parts.append(f"Features: {len(features)} columns")

-    return " | ".join(summary_parts)
-
-
-def validate_dataset_metadata_inputs(dataset_id: str, config_name: Optional[str] = None) -> None:
-    """
-    Validate inputs for dataset metadata retrieval.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If inputs are invalid
-    """
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Basic format validation for dataset_id
-    if not all(c.isalnum() or c in '-_/.@' for c in dataset_id):
-        raise ValueError("dataset_id contains invalid characters")
-
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
-
-        # Basic format validation for config_name
-        if not all(c.isalnum() or c in '-_.' for c in config_name):
-            raise ValueError("config_name contains invalid characters")
+    return " | ".join(summary_parts)
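The centralized validators above come from the new src/hf_eda_mcp/validation.py, whose body is not shown in this view. A sketch of the fail-fast behavior they give the tools, assuming validate_dataset_id rejects characters outside the allowed set as the removed inline checks did:

from hf_eda_mcp.tools.metadata import get_dataset_metadata

# Invalid identifiers are rejected before any network call is made;
# ValidationError is converted to a plain ValueError at the tool boundary.
try:
    get_dataset_metadata("not a valid//id!!")
except ValueError as e:
    print(f"Rejected: {e}")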
src/hf_eda_mcp/tools/sampling.py
CHANGED
|
@@ -9,7 +9,17 @@ import logging
|
|
| 9 |
from typing import Optional, Dict, Any, List
|
| 10 |
from hf_eda_mcp.config import get_config
|
| 11 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 12 |
-
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
|
@@ -82,9 +92,24 @@ def get_dataset_sample(
|
|
| 82 |
... num_samples=3, config_name="cola")
|
| 83 |
>>> print(f"Schema: {sample['schema']}")
|
| 84 |
"""
|
| 85 |
-
# Input validation
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
logger.info(
|
| 89 |
f"Sampling {num_samples} rows from dataset: {dataset_id}, "
|
| 90 |
f"split: {split}" + (f", config: {config_name}" if config_name else "")
|
|
@@ -132,12 +157,28 @@ def get_dataset_sample(
|
|
| 132 |
)
|
| 133 |
return sample_data
|
| 134 |
|
| 135 |
-
except
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
raise
|
|
|
|
| 138 |
except Exception as e:
|
| 139 |
-
|
| 140 |
-
raise DatasetServiceError(f"Failed to sample dataset: {str(e)}")
|
| 141 |
|
| 142 |
|
| 143 |
def get_dataset_sample_with_indices(
|
|
@@ -165,20 +206,15 @@ def get_dataset_sample_with_indices(
|
|
| 165 |
ValueError: If inputs are invalid
|
| 166 |
DatasetServiceError: If sampling fails
|
| 167 |
"""
|
| 168 |
-
# Input validation
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
raise ValueError(
|
| 178 |
-
f"Too many indices requested. Maximum: {config.max_sample_size}"
|
| 179 |
-
)
|
| 180 |
-
|
| 181 |
-
validate_sampling_inputs(dataset_id, split, len(indices), config_name)
|
| 182 |
|
| 183 |
logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
|
| 184 |
|
|
@@ -229,63 +265,6 @@ def get_dataset_sample_with_indices(
        raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")


-def validate_sampling_inputs(
-    dataset_id: str, split: str, num_samples: int, config_name: Optional[str] = None
-) -> None:
-    """
-    Validate inputs for dataset sampling.
-
-    Args:
-        dataset_id: Dataset identifier to validate
-        split: Split name to validate
-        num_samples: Number of samples to validate
-        config_name: Optional configuration name to validate
-
-    Raises:
-        ValueError: If any input is invalid
-    """
-    # Validate dataset_id
-    if not dataset_id or not isinstance(dataset_id, str):
-        raise ValueError("dataset_id must be a non-empty string")
-
-    dataset_id = dataset_id.strip()
-    if not dataset_id:
-        raise ValueError("dataset_id cannot be empty or whitespace")
-
-    # Validate split
-    if not split or not isinstance(split, str):
-        raise ValueError("split must be a non-empty string")
-
-    split = split.strip().lower()
-    if not split:
-        raise ValueError("split cannot be empty or whitespace")
-
-    # Note: We don't strictly enforce VALID_SPLITS as datasets may have custom split names
-
-    # Validate num_samples
-    if not isinstance(num_samples, int):
-        raise ValueError("num_samples must be an integer")
-
-    if num_samples <= 0:
-        raise ValueError("num_samples must be positive")
-
-    # Get max sample size from config
-    config = get_config()
-    max_sample_size = config.max_sample_size
-
-    if num_samples > max_sample_size:
-        raise ValueError(f"num_samples cannot exceed {max_sample_size}")
-
-    # Validate config_name
-    if config_name is not None:
-        if not isinstance(config_name, str):
-            raise ValueError("config_name must be a string")
-
-        config_name = config_name.strip()
-        if not config_name:
-            raise ValueError("config_name cannot be empty or whitespace")
-
-
def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
    """Generate a human-readable summary of the sample data."""
    summary_parts = []
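A short usage sketch of the fail-fast behavior these hunks add (the tool module path is assumed from the repository layout, and the dataset id is deliberately malformed):

```python
# Hypothetical sketch: malformed inputs are rejected locally by the
# centralized validators, before any HuggingFace Hub request is made.
from hf_eda_mcp.tools.sampling import get_dataset_sample

try:
    get_dataset_sample(dataset_id="/imdb", split="train", num_samples=10)
except ValueError as exc:
    # Raised from the ValidationError path; the text includes a
    # "Suggestions:" block assembled by format_validation_error.
    print(exc)
```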
src/hf_eda_mcp/validation.py
ADDED
@@ -0,0 +1,358 @@
+"""
+Input validation utilities for HF EDA MCP Server.
+
+This module provides centralized validation functions for all tool inputs,
+ensuring consistent error messages and validation logic across the application.
+"""
+
+import re
+from typing import Optional, List
+from hf_eda_mcp.config import get_config
+
+
+class ValidationError(ValueError):
+    """Custom exception for validation errors with helpful messages."""
+
+    def __init__(self, message: str, suggestions: Optional[List[str]] = None):
+        super().__init__(message)
+        self.suggestions = suggestions or []
+
+
+def validate_dataset_id(dataset_id: str) -> str:
+    """
+    Validate and normalize a HuggingFace dataset identifier.
+
+    Args:
+        dataset_id: Dataset identifier to validate
+
+    Returns:
+        Normalized dataset_id (stripped of whitespace)
+
+    Raises:
+        ValidationError: If dataset_id is invalid with helpful error message
+    """
+    if not dataset_id:
+        raise ValidationError(
+            "dataset_id is required and cannot be empty",
+            suggestions=[
+                "Provide a valid HuggingFace dataset identifier",
+                "Examples: 'imdb', 'squad', 'glue', 'username/dataset-name'",
+            ],
+        )
+
+    if not isinstance(dataset_id, str):
+        raise ValidationError(
+            f"dataset_id must be a string, got {type(dataset_id).__name__}",
+            suggestions=["Ensure dataset_id is passed as a string value"],
+        )
+
+    dataset_id = dataset_id.strip()
+
+    if not dataset_id:
+        raise ValidationError(
+            "dataset_id cannot be empty or contain only whitespace",
+            suggestions=["Provide a non-empty dataset identifier"],
+        )
+
+    # Validate format: alphanumeric, hyphens, underscores, slashes, dots, @
+    # Pattern: optional username/ followed by dataset name
+    pattern = r"^[a-zA-Z0-9][\w\-\.@]*(/[\w\-\.]+)?$"
+    if not re.match(pattern, dataset_id):
+        raise ValidationError(
+            f"Invalid dataset_id format: '{dataset_id}'",
+            suggestions=[
+                "Dataset IDs should contain only letters, numbers, hyphens, underscores, dots, and slashes",
+                "Valid formats: 'dataset-name' or 'username/dataset-name'",
+                "Examples: 'imdb', 'squad', 'huggingface/dataset-name'",
+            ],
+        )
+
+    # Check for common mistakes
+    if dataset_id.startswith("/") or dataset_id.endswith("/"):
+        raise ValidationError(
+            f"Invalid dataset_id: '{dataset_id}' - cannot start or end with '/'",
+            suggestions=["Remove leading or trailing slashes from the dataset_id"],
+        )
+
+    if "//" in dataset_id:
+        raise ValidationError(
+            f"Invalid dataset_id: '{dataset_id}' - contains consecutive slashes",
+            suggestions=["Use single slashes to separate username from dataset name"],
+        )
+
+    # Warn about very long dataset IDs (likely an error)
+    if len(dataset_id) > 100:
+        raise ValidationError(
+            f"dataset_id is unusually long ({len(dataset_id)} characters)",
+            suggestions=[
+                "Check if the dataset_id is correct",
+                "Dataset IDs are typically shorter than 100 characters",
+            ],
+        )
+
+    return dataset_id
+
+
+def validate_config_name(config_name: Optional[str]) -> Optional[str]:
+    """
+    Validate and normalize a dataset configuration name.
+
+    Args:
+        config_name: Configuration name to validate (can be None)
+
+    Returns:
+        Normalized config_name or None
+
+    Raises:
+        ValidationError: If config_name is invalid
+    """
+    if config_name is None:
+        return None
+
+    if not isinstance(config_name, str):
+        raise ValidationError(
+            f"config_name must be a string or None, got {type(config_name).__name__}",
+            suggestions=["Pass config_name as a string or omit it for default configuration"],
+        )
+
+    config_name = config_name.strip()
+
+    if not config_name:
+        raise ValidationError(
+            "config_name cannot be empty or contain only whitespace",
+            suggestions=[
+                "Provide a valid configuration name or omit the parameter",
+                "Use None or don't specify config_name for default configuration",
+            ],
+        )
+
+    # Validate format: alphanumeric, hyphens, underscores, dots
+    pattern = r"^[a-zA-Z0-9][\w\-\.]*$"
+    if not re.match(pattern, config_name):
+        raise ValidationError(
+            f"Invalid config_name format: '{config_name}'",
+            suggestions=[
+                "Configuration names should contain only letters, numbers, hyphens, underscores, and dots",
+                "Examples: 'cola', 'sst2', 'plain_text'",
+            ],
+        )
+
+    if len(config_name) > 50:
+        raise ValidationError(
+            f"config_name is unusually long ({len(config_name)} characters)",
+            suggestions=[
+                "Check if the config_name is correct",
+                "Configuration names are typically shorter than 50 characters",
+            ],
+        )
+
+    return config_name
+
+
+def validate_split_name(split: str) -> str:
+    """
+    Validate and normalize a dataset split name.
+
+    Args:
+        split: Split name to validate
+
+    Returns:
+        Normalized split name (lowercase, stripped)
+
+    Raises:
+        ValidationError: If split is invalid
+    """
+    if not split:
+        raise ValidationError(
+            "split is required and cannot be empty",
+            suggestions=[
+                "Provide a valid split name",
+                "Common splits: 'train', 'validation', 'test'",
+            ],
+        )
+
+    if not isinstance(split, str):
+        raise ValidationError(
+            f"split must be a string, got {type(split).__name__}",
+            suggestions=["Ensure split is passed as a string value"],
+        )
+
+    split = split.strip().lower()
+
+    if not split:
+        raise ValidationError(
+            "split cannot be empty or contain only whitespace",
+            suggestions=["Provide a non-empty split name"],
+        )
+
+    # Validate format: alphanumeric, hyphens, underscores
+    pattern = r"^[a-zA-Z0-9][\w\-]*$"
+    if not re.match(pattern, split):
+        raise ValidationError(
+            f"Invalid split name format: '{split}'",
+            suggestions=[
+                "Split names should contain only letters, numbers, hyphens, and underscores",
+                "Common splits: 'train', 'validation', 'test', 'dev'",
+            ],
+        )
+
+    # Note: We don't enforce a specific set of split names as datasets can have custom splits
+    # Common splits for reference
+    common_splits = {"train", "validation", "test", "dev", "val"}
+
+    if split not in common_splits and len(split) > 20:
+        raise ValidationError(
+            f"Unusual split name: '{split}' (length: {len(split)})",
+            suggestions=[
+                "Check if the split name is correct",
+                f"Common splits are: {', '.join(sorted(common_splits))}",
+                "Some datasets may have custom split names",
+            ],
+        )
+
+    return split
+
+
+def validate_sample_size(num_samples: int, parameter_name: str = "num_samples") -> int:
+    """
+    Validate sample size parameter.
+
+    Args:
+        num_samples: Number of samples to validate
+        parameter_name: Name of the parameter (for error messages)
+
+    Returns:
+        Validated num_samples
+
+    Raises:
+        ValidationError: If num_samples is invalid
+    """
+    if not isinstance(num_samples, int):
+        # Check if it's a float that's actually an integer
+        if isinstance(num_samples, float) and num_samples.is_integer():
+            num_samples = int(num_samples)
+        else:
+            raise ValidationError(
+                f"{parameter_name} must be an integer, got {type(num_samples).__name__}",
+                suggestions=[
+                    f"Provide {parameter_name} as an integer value",
+                    "Example: num_samples=100",
+                ],
+            )
+
+    if num_samples <= 0:
+        raise ValidationError(
+            f"{parameter_name} must be positive, got {num_samples}",
+            suggestions=[
+                f"Provide a positive integer for {parameter_name}",
+                "Example: num_samples=10 or num_samples=1000",
+            ],
+        )
+
+    # Get max sample size from config
+    config = get_config()
+    max_sample_size = config.max_sample_size
+
+    if num_samples > max_sample_size:
+        raise ValidationError(
+            f"{parameter_name} ({num_samples}) exceeds maximum allowed ({max_sample_size})",
+            suggestions=[
+                f"Reduce {parameter_name} to {max_sample_size} or less",
+                f"Current maximum is configured as {max_sample_size}",
+                "For larger samples, consider using streaming or batch processing",
+            ],
+        )
+
+    # Warn about very small samples (might not be useful)
+    if num_samples < 5:
+        # This is just a soft warning, not an error
+        pass
+
+    return num_samples
+
+
+def validate_indices(indices: List[int]) -> List[int]:
+    """
+    Validate a list of indices for sampling.
+
+    Args:
+        indices: List of indices to validate
+
+    Returns:
+        Validated indices list
+
+    Raises:
+        ValidationError: If indices are invalid
+    """
+    if not indices:
+        raise ValidationError(
+            "indices list is required and cannot be empty",
+            suggestions=[
+                "Provide a non-empty list of indices",
+                "Example: indices=[0, 1, 2, 10, 20]",
+            ],
+        )
+
+    if not isinstance(indices, list):
+        raise ValidationError(
+            f"indices must be a list, got {type(indices).__name__}",
+            suggestions=[
+                "Provide indices as a list of integers",
+                "Example: indices=[0, 1, 2]",
+            ],
+        )
+
+    # Validate each index
+    for i, idx in enumerate(indices):
+        if not isinstance(idx, int):
+            raise ValidationError(
+                f"All indices must be integers, got {type(idx).__name__} at position {i}",
+                suggestions=[
+                    "Ensure all indices are integer values",
+                    "Example: indices=[0, 1, 2] (not [0.5, 1.2])",
+                ],
+            )
+
+        if idx < 0:
+            raise ValidationError(
+                f"All indices must be non-negative, got {idx} at position {i}",
+                suggestions=[
+                    "Provide only non-negative indices (0 or greater)",
+                    "Example: indices=[0, 1, 2, 10]",
+                ],
+            )
+
+    # Check for reasonable list size
+    config = get_config()
+    max_sample_size = config.max_sample_size
+
+    if len(indices) > max_sample_size:
+        raise ValidationError(
+            f"Too many indices requested ({len(indices)}), maximum is {max_sample_size}",
+            suggestions=[
+                f"Reduce the number of indices to {max_sample_size} or less",
+                "Consider using regular sampling instead of specific indices",
+            ],
+        )
+
+    return indices
+
+
+def format_validation_error(error: ValidationError) -> str:
+    """
+    Format a validation error with suggestions into a user-friendly message.
+
+    Args:
+        error: ValidationError to format
+
+    Returns:
+        Formatted error message with suggestions
+    """
+    message = str(error)
+
+    if error.suggestions:
+        message += "\n\nSuggestions:"
+        for suggestion in error.suggestions:
+            message += f"\n  - {suggestion}"
+
+    return message
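To illustrate the message shape this module produces, a minimal sketch; the output lines are assembled exactly as `format_validation_error` above defines them:

```python
from hf_eda_mcp.validation import (
    ValidationError,
    format_validation_error,
    validate_sample_size,
)

try:
    validate_sample_size(-5)
except ValidationError as exc:
    print(format_validation_error(exc))

# Prints:
# num_samples must be positive, got -5
#
# Suggestions:
#   - Provide a positive integer for num_samples
#   - Example: num_samples=10 or num_samples=1000
```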