Source code for evalml.pipelines.components.transformers.imputers.target_imputer

"""Component that imputes missing target data according to a specified imputation strategy."""

from functools import wraps

import pandas as pd
import woodwork as ww
from sklearn.impute import SimpleImputer as SkImputer

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
from evalml.utils.nullable_type_utils import _get_new_logical_types_for_imputed_data


[docs]class TargetImputerMeta(ComponentBaseMeta):
    """A version of the ComponentBaseMeta class which handles when input features is None."""

[docs]    @classmethod
    def check_for_fit(cls, method):
        """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`.

        Args:
            method (callable): Method to wrap.

        Raises:
            ComponentNotYetFittedError: If component is not fitted.

        Returns:
            The wrapped input method.
        """

        @wraps(method)
        def _check_for_fit(self, X=None, y=None):
            klass = type(self).__name__
            if not self._is_fitted and self.needs_fitting:
                raise ComponentNotYetFittedError(
                    f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.",
                )
            else:
                return method(self, X, y)

        return _check_for_fit
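# A minimal sketch (illustrative, not part of the evalml source) of the guard this
# metaclass installs, assuming ComponentBaseMeta applies `check_for_fit` to
# `transform`: calling `transform` on an unfitted TargetImputer raises
# ComponentNotYetFittedError. The helper name below is hypothetical.
def _example_check_for_fit_guard():
    y = pd.Series([1.0, 2.0, None, 4.0])
    imputer = TargetImputer(impute_strategy="mean")
    try:
        imputer.transform(None, y)  # not fitted yet, so the wrapper raises
    except ComponentNotYetFittedError as err:
        print(err)  # "This TargetImputer is not fitted yet. You must fit TargetImputer ..."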
[docs]class TargetImputer(Transformer, metaclass=TargetImputerMeta):
    """Imputes missing target data according to a specified imputation strategy.

    Args:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for numerical data,
            and "most_frequent", "constant" for object data types. Defaults to "most_frequent".
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
            Defaults to None which uses 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Target Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""
    modifies_features = False
    modifies_target = True

    def __init__(
        self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs
    ):
        parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value}
        parameters.update(kwargs)
        imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs)
        super().__init__(
            parameters=parameters,
            component_obj=imputer,
            random_seed=random_seed,
        )
[docs]    def fit(self, X, y):
        """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            TypeError: If target is filled with all null values.
        """
        if y is None:
            return self
        y = infer_feature_types(y)
        if all(y.isnull()):
            raise TypeError("Provided target full of nulls.")
        y = y.to_frame()

        # Return early if all the columns are bool dtype, which will never have null values
        if (y.dtypes == bool).all():
            return self

        self._component_obj.fit(y)
        return self
[docs]    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        if X is not None:
            X = infer_feature_types(X)
        if y is None:
            return X, None
        y_ww = infer_feature_types(y)
        y_df = y_ww.ww.to_frame()

        # Return early if all the columns are bool dtype, which will never have null values
        if (y_df.dtypes == bool).all():
            return X, y_ww

        transformed = self._component_obj.transform(y_df)
        y_t = pd.Series(transformed[:, 0], index=y_ww.index)

        # Determine the logical type to use - should match the input data where possible
        new_logical_type_dict = _get_new_logical_types_for_imputed_data(
            self.parameters["impute_strategy"],
            y_df.ww.schema,
        )
        new_logical_type = list(new_logical_type_dict.values())[0]

        return X, ww.init_series(y_t, logical_type=new_logical_type)
[docs]    def fit_transform(self, X, y):
        """Fits on and transforms the input target data.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        return self.fit(X, y).transform(X, y)
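# A minimal usage sketch (illustrative data, not part of the evalml source): the
# imputer fills missing target values according to `impute_strategy` and returns
# the untouched features alongside the imputed target.
if __name__ == "__main__":
    X = pd.DataFrame({"feature": [1, 2, 3, 4]})
    y = pd.Series([1.0, 2.0, None, 4.0])

    imputer = TargetImputer(impute_strategy="mean")
    X_out, y_out = imputer.fit_transform(X, y)
    # The missing value is replaced with the mean of the observed targets,
    # (1 + 2 + 4) / 3, roughly 2.33.
    print(y_out.tolist())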