# Source code for evalml.pipelines.components.transformers.preprocessing.lsa
"""Transformer to calculate the Latent Semantic Analysis Values of text input."""
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from evalml.pipelines.components.transformers.preprocessing import TextTransformer
from evalml.utils import infer_feature_types
class LSA(TextTransformer):
    """Transformer to calculate the Latent Semantic Analysis Values of text input.

    Internally this chains a ``TfidfVectorizer`` and a ``TruncatedSVD`` into a
    single scikit-learn pipeline, which ``fit`` trains on the text columns of
    the input data.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
        **kwargs: Additional keyword arguments forwarded to the parent
            constructor.
    """

    name = "LSA Transformer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, random_seed=0, **kwargs):
        # The seed is handed to TruncatedSVD as its random_state.
        self._lsa_pipeline = make_pipeline(
            TfidfVectorizer(),
            TruncatedSVD(random_state=random_seed),
        )
        # Returned as-is by _get_feature_provenance; starts out empty.
        self._provenance = {}
        super().__init__(random_seed=random_seed, **kwargs)

    def fit(self, X, y=None):
        """Fit the TF-IDF / truncated SVD pipeline on the text columns of X.

        The values of every text column are flattened into one corpus, so a
        single pipeline is fitted across all text columns. If X has no text
        columns, nothing is fitted.

        Args:
            X (pd.DataFrame): The data to fit on.
            y (pd.Series, optional): Ignored.

        Returns:
            self
        """
        X = infer_feature_types(X)
        self._text_columns = self._get_text_columns(X)
        if len(self._text_columns) == 0:
            # Nothing to learn from: leave the pipeline unfitted.
            return self

        corpus = X[self._text_columns].values.flatten()
        # we assume non-str values will have been filtered out prior to calling LSA.fit. this is a safeguard.
        corpus = corpus.astype(str)
        self._lsa_pipeline.fit(corpus)
        return self

    def _get_feature_provenance(self):
        """Return the feature provenance mapping stored on this component."""
        return self._provenance