Source code for evalml.pipelines.components.transformers.preprocessing.drop_null_columns
"""Transformer to drop features whose percentage of NaN values exceeds a specified threshold."""
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
[docs]class DropNullColumns(Transformer):
    """Transformer to drop features whose percentage of NaN values exceeds a specified threshold.
    Args:
        pct_null_threshold(float): The percentage of NaN values in an input feature to drop.
            Must be a value between [0, 1] inclusive. If equal to 0.0, will drop columns with any null values.
            If equal to 1.0, will drop columns with all null values. Defaults to 0.95.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """
    name = "Drop Null Columns Transformer"
    hyperparameter_ranges = {}
    """{}"""
    def __init__(self, pct_null_threshold=1.0, random_seed=0, **kwargs):
        if pct_null_threshold < 0 or pct_null_threshold > 1:
            raise ValueError(
                "pct_null_threshold must be a float between 0 and 1, inclusive.",
            )
        parameters = {"pct_null_threshold": pct_null_threshold}
        parameters.update(kwargs)
        self._cols_to_drop = None
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )
[docs]    def fit(self, X, y=None):
        """Fits component to data.
        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].
        Returns:
            self
        """
        pct_null_threshold = self.parameters["pct_null_threshold"]
        X_t = infer_feature_types(X)
        percent_null = X_t.isnull().mean()
        if pct_null_threshold == 0.0:
            null_cols = percent_null[percent_null > 0]
        else:
            null_cols = percent_null[percent_null >= pct_null_threshold]
        self._cols_to_drop = list(null_cols.index)
        return self