# Source code for evalml.pipelines.components.transformers.imputers.target_imputer

"""Component that imputes missing target data according to a specified imputation strategy."""

from functools import wraps

import pandas as pd
import woodwork as ww
from sklearn.impute import SimpleImputer as SkImputer

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
from evalml.utils.nullable_type_utils import _get_new_logical_types_for_imputed_data


class TargetImputerMeta(ComponentBaseMeta):
    """A version of the ComponentBaseMeta class which handles when input features is None."""

    @classmethod
    def check_for_fit(cls, method):
        """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`.

        The returned wrapper accepts ``X`` and ``y`` (both defaulting to ``None``)
        and always forwards both of them positionally to the wrapped method, so a
        caller may omit the features entirely.

        Args:
            method (callable): Method to wrap.

        Raises:
            ComponentNotYetFittedError: If component is not fitted.

        Returns:
            The wrapped input method.
        """

        @wraps(method)
        def _check_for_fit(self, X=None, y=None):
            # Only components that declare they need fitting are rejected
            # while unfitted; everything else is passed straight through.
            if not self._is_fitted and self.needs_fitting:
                klass = type(self).__name__
                raise ComponentNotYetFittedError(
                    f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.",
                )
            return method(self, X, y)

        return _check_for_fit


class TargetImputer(Transformer, metaclass=TargetImputerMeta):
    """Imputes missing target data according to a specified imputation strategy.

    Args:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
           numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent".
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
           Defaults to None which uses 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Target Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""
    modifies_features = False
    modifies_target = True

    def __init__(
        self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs
    ):
        # Extra keyword arguments are both recorded as parameters and forwarded
        # to the underlying scikit-learn imputer.
        parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value}
        parameters.update(kwargs)
        imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs)
        super().__init__(
            parameters=parameters,
            component_obj=imputer,
            random_seed=random_seed,
        )

    def fit(self, X, y):
        """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            TypeError: If target is filled with all null values.
        """
        if y is None:
            return self
        y = infer_feature_types(y)
        if y.isnull().all():
            raise TypeError("Provided target full of nulls.")
        y = y.to_frame()

        # Return early if all the columns are bool dtype, which will never have null values.
        # The return value must be the component itself (not the target frame):
        # `fit_transform` chains `.transform(X, y)` onto whatever `fit` returns.
        if (y.dtypes == bool).all():
            return self

        self._component_obj.fit(y)
        return self

    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        if X is not None:
            X = infer_feature_types(X)
        if y is None:
            return X, None
        y_ww = infer_feature_types(y)
        y_df = y_ww.ww.to_frame()

        # Return early if all the columns are bool dtype, which will never have null values
        if (y_df.dtypes == bool).all():
            return X, y_ww

        # The underlying imputer works on a single-column frame; take that one
        # column back out and re-attach the index of the input target.
        transformed = self._component_obj.transform(y_df)
        y_t = pd.Series(transformed[:, 0], index=y_ww.index)

        # Determine logical type to use - should match input data where possible
        new_logical_type_dict = _get_new_logical_types_for_imputed_data(
            self.parameters["impute_strategy"],
            y_df.ww.schema,
        )
        new_logical_type = list(new_logical_type_dict.values())[0]

        return X, ww.init_series(y_t, logical_type=new_logical_type)

    def fit_transform(self, X, y):
        """Fits on and transforms the input target data.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        return self.fit(X, y).transform(X, y)