Source code for evalml.pipelines.components.transformers.preprocessing.drop_rows_transformer

"""Transformer to drop rows specified by row indices."""

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types


class DropRowsTransformer(Transformer):
    """Transformer to drop rows specified by row indices.

    The index labels to remove are fixed at construction time. ``fit`` only
    checks that every label is present in the data it receives; ``transform``
    removes those labels from the features and, when supplied, from the target.

    Args:
        indices_to_drop (list): List of indices to drop in the input data. Defaults to None.
        random_seed (int): Seed for the random number generator. Is not used by this component. Defaults to 0.

    Raises:
        ValueError: If ``indices_to_drop`` contains duplicate entries.
    """

    name = "Drop Rows Transformer"
    # NOTE(review): these two flags are read outside this file; presumably they
    # tell the pipeline that transform() also returns a changed target and that
    # the component is applied only while training -- confirm against the base class.
    modifies_target = True
    training_only = True
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, indices_to_drop=None, random_seed=0):
        # Duplicates are detected by comparing the size of the de-duplicated
        # set with the size of the input. len() is used (rather than
        # truthiness) so sized array-like inputs are handled as well.
        if indices_to_drop is not None:
            if len(set(indices_to_drop)) != len(indices_to_drop):
                raise ValueError("All input indices must be unique.")
        self.indices_to_drop = indices_to_drop
        super().__init__(
            parameters={"indices_to_drop": indices_to_drop},
            component_obj=None,
            random_seed=random_seed,
        )

    def fit(self, X, y=None):
        """Fits component to data.

        Nothing is learned here: the method only validates that every index
        scheduled for removal is present in ``X`` (and in ``y`` when given).

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If indices to drop do not exist in input features or target.
        """
        X_t = infer_feature_types(X)
        y_t = infer_feature_types(y) if y is not None else None

        # Nothing configured to drop, so there is nothing to validate.
        if self.indices_to_drop is None:
            return self

        requested = set(self.indices_to_drop)

        # Features are checked first; a failure here takes precedence over
        # any problem with the target.
        absent_from_X = requested.difference(set(X_t.index))
        if len(absent_from_X):
            raise ValueError(
                "Indices [{}] do not exist in input features".format(
                    list(absent_from_X),
                ),
            )

        if y_t is not None:
            absent_from_y = requested.difference(set(y_t.index))
            if len(absent_from_y):
                raise ValueError(
                    "Indices [{}] do not exist in input target".format(
                        list(absent_from_y),
                    ),
                )
        return self

    def transform(self, X, y=None):
        """Transforms data using fitted component.

        Args:
            X (pd.DataFrame): Features.
            y (pd.Series, optional): Target data.

        Returns:
            (pd.DataFrame, pd.Series): Data with row indices dropped. The
            second element is None when no target was passed in.

        Raises:
            KeyError: Propagated from pandas if an index to drop is not
                present in the data being transformed.
        """
        X_t = infer_feature_types(X)
        y_t = infer_feature_types(y) if y is not None else None

        # Explicit None/len check instead of truthiness so that array-like
        # collections of indices do not raise on boolean evaluation.
        if self.indices_to_drop is None or len(self.indices_to_drop) == 0:
            return X_t, y_t

        # Capture the Woodwork schema before the pandas drop and re-apply it
        # afterwards so the returned frame carries the same typing information.
        original_schema = X_t.ww.schema
        X_t = X_t.drop(self.indices_to_drop, axis=0)
        X_t.ww.init(schema=original_schema)

        if y_t is not None:
            # The target is dropped through the Woodwork accessor.
            y_t = y_t.ww.drop(self.indices_to_drop)
        return X_t, y_t