Source code for evalml.pipelines.components.transformers.imputers.simple_imputer
"""Component that imputes missing data according to a specified imputation strategy."""importpandasaspdimportwoodworkfromsklearn.imputeimportSimpleImputerasSkImputerfromevalml.pipelines.components.transformersimportTransformerfromevalml.utilsimportinfer_feature_typesfromevalml.utils.nullable_type_utilsimport_get_new_logical_types_for_imputed_data
class SimpleImputer(Transformer):
    """Imputes missing data according to a specified imputation strategy. Natural language columns are ignored.

    Args:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
            numerical data, and "most_frequent", "constant" for object data types.
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
            Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Simple Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""

    def __init__(
        self,
        impute_strategy="most_frequent",
        fill_value=None,
        random_seed=0,
        **kwargs,
    ):
        parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value}
        parameters.update(kwargs)
        self.impute_strategy = impute_strategy
        imputer = SkImputer(
            strategy=impute_strategy,
            fill_value=fill_value,
            missing_values=pd.NA,
            **kwargs,
        )
        self._all_null_cols = None
        super().__init__(
            parameters=parameters,
            component_obj=imputer,
            random_seed=random_seed,
        )
    def fit(self, X, y=None):
        """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If the SimpleImputer receives a dataframe with both Boolean and Categorical data.
        """
        X = infer_feature_types(X)
        if set([lt.type_string for lt in X.ww.logical_types.values()]) == {
            "boolean",
            "categorical",
        }:
            raise ValueError(
                "SimpleImputer cannot handle dataframes with both boolean and categorical features. Use Imputer instead.",
            )

        nan_ratio = X.isna().sum() / X.shape[0]

        # Keep track of the different types of data in X
        self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()
        self._natural_language_cols = list(
            X.ww.select(
                "NaturalLanguage",
                return_schema=True,
            ).columns.keys(),
        )

        # Only impute data that is not natural language columns or fully null
        self._cols_to_impute = [
            col
            for col in X.columns
            if col not in self._natural_language_cols
            and col not in self._all_null_cols
        ]

        # If there are no columns to impute, return early
        if not self._cols_to_impute:
            return self

        X = X[self._cols_to_impute]
        if (X.dtypes == bool).all():
            # Ensure that _component_obj still gets fit so that if any of the dtypes are different
            # at transform, we've fit the component. This is needed because sklearn doesn't allow
            # data with only bool dtype to be passed in.
            X = X.astype("boolean")

        self._component_obj.fit(X, y)
        return self
    def transform(self, X, y=None):
        """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X.
        """
        # Record original data
        X = infer_feature_types(X)
        original_schema = X.ww.schema
        original_index = X.index

        # Separate out just the columns we are imputing
        X_t = X[self._cols_to_impute]
        if not self._cols_to_impute or (X_t.dtypes == bool).all():
            # If there are no columns to impute, or all columns to impute are bool dtype
            # (which will never have null values), return the original data without any
            # fully null columns
            not_all_null_cols = [
                col for col in X.columns if col not in self._all_null_cols
            ]
            return X.ww[not_all_null_cols]

        # Transform the data
        X_t = self._component_obj.transform(X_t)
        X_t = pd.DataFrame(X_t, columns=self._cols_to_impute)

        # Reinit woodwork, maintaining original types where possible
        imputed_schema = original_schema.get_subset_schema(self._cols_to_impute)
        new_logical_types = _get_new_logical_types_for_imputed_data(
            impute_strategy=self.impute_strategy,
            original_schema=imputed_schema,
        )
        X_t.ww.init(schema=imputed_schema, logical_types=new_logical_types)

        # Add back in the unchanged original natural language columns that we want to keep
        if len(self._natural_language_cols) > 0:
            X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]])

        # Reorder columns to match the original
        X_t = X_t.ww[[col for col in original_schema.columns if col in X_t.columns]]

        if self._cols_to_impute:
            X_t.index = original_index
        return X_t
    def fit_transform(self, X, y=None):
        """Fits on X and transforms X.

        Args:
            X (pd.DataFrame): Data to fit and transform.
            y (pd.Series, optional): Target data.

        Returns:
            pd.DataFrame: Transformed X.
        """
        return self.fit(X, y).transform(X, y)
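The listing above covers the full fit/transform flow. As a quick orientation, here is a minimal usage sketch, not part of the module source: it assumes an installed evalml where SimpleImputer is importable from evalml.pipelines.components, and the column names and values are invented for illustration.

# Minimal usage sketch (illustrative, not part of the module above).
# Assumes an evalml installation; the DataFrame contents are made up.
import pandas as pd

from evalml.pipelines.components import SimpleImputer

df = pd.DataFrame(
    {
        "a": [1.0, 2.0, None, 4.0],  # one missing value
        "b": [1.0, 0.0, 1.0, 1.0],  # no missing values
    },
)

imputer = SimpleImputer(impute_strategy="mean")
X_t = imputer.fit_transform(df)
# The missing entry in "a" is replaced by the column mean, (1.0 + 2.0 + 4.0) / 3.
print(X_t)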