Source code for evalml.data_checks.data_checks

"""A collection of data checks."""
import inspect

from evalml.data_checks import DataCheck
from evalml.exceptions import DataCheckInitError
from evalml.utils import infer_feature_types


def _has_defaults_for_all_args(init):
    """Test whether the init method has defaults for all arguments."""
    signature = inspect.getfullargspec(init)
    n_default_args = 0 if not signature.defaults else len(signature.defaults)
    n_args = (
        len(signature.args) - 1 if "self" in signature.args else len(signature.args)
    )
    return n_args == n_default_args


class DataChecks:
    """A collection of data checks.

    Args:
        data_checks (list (DataCheck)): List of DataCheck classes (not instances)
            to instantiate and run.
        data_check_params (dict): Parameters for passed DataCheck classes, keyed by
            each class's ``name`` attribute and mapping to a dictionary of keyword
            arguments for that class.
    """

    @staticmethod
    def _validate_data_checks(data_check_classes, params):
        """Validate a list of DataCheck classes and their corresponding params.

        Args:
            data_check_classes (list): Candidate DataCheck classes.
            params (dict or None): Parameters keyed by DataCheck class name.

        Raises:
            ValueError: If data_check_classes is not a list, if any element is not
                a DataCheck subclass, or if params is not a dictionary.
            DataCheckInitError: If a key of params matches no class name, or if a
                class without defaults for all of its arguments has no entry in
                params.
        """
        if not isinstance(data_check_classes, list):
            raise ValueError(
                f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}.",
            )
        if not all(
            inspect.isclass(check) and issubclass(check, DataCheck)
            for check in data_check_classes
        ):
            raise ValueError(
                "All elements of parameter data_checks must be an instance of DataCheck "
                "or a DataCheck class with any desired parameters specified in the "
                "data_check_params dictionary.",
            )
        params = params or {}
        if not isinstance(params, dict):
            raise ValueError(f"Params must be a dictionary. Received {params}")

        name_to_class = {c.name: c for c in data_check_classes}
        in_params = set(params)
        in_classes = set(name_to_class)
        extraneous = in_params.difference(in_classes)
        missing = in_classes.difference(in_params)

        # Any single unmatched key is an error; report the first one encountered.
        if extraneous:
            extraneous_class = next(iter(extraneous))
            raise DataCheckInitError(
                f"Class {extraneous_class} was provided in params dictionary but it does not match any name "
                "in the data_check_classes list. Make sure every key of the params dictionary matches the name "
                "attribute of a corresponding DataCheck class.",
            )
        # A class may be omitted from params only if it can be built with no arguments.
        for missing_class_name in missing:
            if not _has_defaults_for_all_args(name_to_class[missing_class_name]):
                raise DataCheckInitError(
                    f"Class {missing_class_name} was provided in the data_checks_classes list but it does not have "
                    "an entry in the parameters dictionary.",
                )

    @staticmethod
    def _init_data_checks(data_check_classes, params):
        """Instantiate each DataCheck class with its parameters.

        Args:
            data_check_classes (list): Classes to instantiate, in order.
            params (dict): Maps a class's ``name`` attribute to a dictionary of
                keyword arguments; classes without an entry are built with none.

        Returns:
            list: One instance per class, in the order given.

        Raises:
            DataCheckInitError: If a class's parameters are not a dictionary, or if
                instantiating the class raises a TypeError.
        """
        data_check_instances = []
        for data_check_class in data_check_classes:
            class_params = params.get(data_check_class.name, {})
            if not isinstance(class_params, dict):
                raise DataCheckInitError(
                    f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}.",
                )
            try:
                data_check_instances.append(data_check_class(**class_params))
            except TypeError as e:
                # Chain the original error so the failing constructor call stays in the traceback.
                raise DataCheckInitError(
                    f"Encountered the following error while initializing {data_check_class.name}: {e}",
                ) from e
        return data_check_instances

    def __init__(self, data_checks=None, data_check_params=None):
        data_check_params = data_check_params or {}
        self._validate_data_checks(data_checks, data_check_params)
        data_check_instances = self._init_data_checks(data_checks, data_check_params)
        self.data_checks = data_check_instances

    def validate(self, X, y=None):
        """Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable.

        Args:
            X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target data of length [n_samples]

        Returns:
            list: The messages returned by each data check's ``validate`` call,
                concatenated in the order the data checks are stored.
        """
        messages = []
        # NOTE(review): X is accessed through ``X.ww`` without first calling
        # infer_feature_types, so it presumably must already carry a Woodwork
        # schema -- confirm against callers.
        existing_schema = X.ww.schema
        X.ww.init(schema=existing_schema, already_sorted=True)
        # Drop the columns Woodwork selects for "index" before running the checks.
        X = X.ww.drop(list(X.ww.select("index", return_schema=True).columns))
        if y is not None:
            y = infer_feature_types(y)

        for data_check in self.data_checks:
            messages_new = data_check.validate(X, y)
            messages.extend(messages_new)
        return messages