Source code for evalml.pipelines.utils

import pandas as pd

from .binary_classification_pipeline import BinaryClassificationPipeline
from .multiclass_classification_pipeline import (
    MulticlassClassificationPipeline
)
from .regression_pipeline import RegressionPipeline

from evalml.model_family import ModelFamily
from evalml.pipelines.components import (
    CatBoostClassifier,
    CatBoostRegressor,
    DateTimeFeaturizer,
    DropNullColumns,
    Estimator,
    Imputer,
    OneHotEncoder,
    StandardScaler
)
from evalml.pipelines.components.utils import get_estimators
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import get_logger
from evalml.utils.gen_utils import categorical_dtypes, datetime_dtypes

logger = get_logger(__file__)


def _get_preprocessing_components(X, y, problem_type, estimator_class):
    """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data.

    Arguments:
        X (pd.DataFrame): the input data of shape [n_samples, n_features]
        y (pd.Series): the target labels of length [n_samples]
        problem_type (ProblemTypes or str): problem type
        estimator_class (class):A class which subclasses Estimator estimator for pipeline

    Returns:
        list[Transformer]: a list of applicable preprocessing components to use with the estimator
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    pp_components = []
    all_null_cols = X.columns[X.isnull().all()]
    if len(all_null_cols) > 0:
        pp_components.append(DropNullColumns)

    pp_components.append(Imputer)

    datetime_cols = X.select_dtypes(include=datetime_dtypes)
    add_datetime_featurizer = len(datetime_cols.columns) > 0
    if add_datetime_featurizer:
        pp_components.append(DateTimeFeaturizer)

    # DateTimeFeaturizer can create categorical columns
    categorical_cols = X.select_dtypes(include=categorical_dtypes)
    if (add_datetime_featurizer or len(categorical_cols.columns) > 0) and estimator_class not in {CatBoostClassifier, CatBoostRegressor}:
        pp_components.append(OneHotEncoder)

    if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
        pp_components.append(StandardScaler)
    return pp_components


def _get_pipeline_base_class(problem_type):
    """Returns pipeline base class for problem_type"""
    if problem_type == ProblemTypes.BINARY:
        return BinaryClassificationPipeline
    elif problem_type == ProblemTypes.MULTICLASS:
        return MulticlassClassificationPipeline
    elif problem_type == ProblemTypes.REGRESSION:
        return RegressionPipeline


[docs]def make_pipeline(X, y, estimator, problem_type):
    """Given input data, target data, an estimator class and the problem type,
        generates a pipeline class with a preprocessing chain which was recommended based on the inputs.
        The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type.

   Arguments:
        X (pd.DataFrame): the input data of shape [n_samples, n_features]
        y (pd.Series): the target labels of length [n_samples]
        estimator (Estimator): estimator for pipeline
        problem_type (ProblemTypes or str): problem type for pipeline to generate

    Returns:
        class: PipelineBase subclass with dynamically generated preprocessing components and specified estimator

    """
    problem_type = handle_problem_types(problem_type)
    if estimator not in get_estimators(problem_type):
        raise ValueError(f"{estimator.name} is not a valid estimator for problem type")
    preprocessing_components = _get_preprocessing_components(X, y, problem_type, estimator)
    complete_component_graph = preprocessing_components + [estimator]

    hyperparameters = None
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    base_class = _get_pipeline_base_class(problem_type)

    class GeneratedPipeline(base_class):
        custom_name = f"{estimator.name} w/ {' + '.join([component.name for component in preprocessing_components])}"
        component_graph = complete_component_graph
        custom_hyperparameters = hyperparameters

    return GeneratedPipeline


def make_pipeline_from_components(component_instances, problem_type, custom_name=None):
    """Given a list of component instances and the problem type, a pipeline instance is generated with the component instances.
    The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. A custom name for
    the pipeline can optionally be specified; otherwise the default pipeline name will be 'Templated Pipeline'.

   Arguments:
        component_instances (list): a list of all of the components to include in the pipeline
        problem_type (str or ProblemTypes): problem type for the pipeline to generate
        custom_name (string): a name for the new pipeline

    Returns:
        Pipeline instance with component instances and specified estimator

    """
    if not isinstance(component_instances[-1], Estimator):
        raise ValueError("Pipeline needs to have an estimator at the last position of the component list")

    pipeline_name = custom_name
    problem_type = handle_problem_types(problem_type)

    class TemplatedPipeline(_get_pipeline_base_class(problem_type)):
        custom_name = pipeline_name
        component_graph = [c.__class__ for c in component_instances]

    pipeline_instance = TemplatedPipeline({})
    pipeline_instance.component_graph = component_instances
    return pipeline_instance