# Source code for kwx.topic_model

"""
topic_model
-----------

The unsupervised learning topic model for keyword extraction.

Contents:
    TopicModel Class:
        _vectorize,
        fit
"""

import inspect
import logging
import os
import warnings
from datetime import datetime

# Process-wide noise suppression. These statements sit above the third-party
# imports that follow, so they are already in effect when those modules load.

# Suppress all logging calls of severity WARNING and below.
logging.disable(logging.WARNING)
# Ignore FutureWarnings whose message starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)
# NOTE(review): presumably lowers TensorFlow's C++ log verbosity for the
# sentence-transformer backend - confirm which consumer reads this variable.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import numpy as np
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from sklearn.cluster import KMeans


class TopicModel:
    """
    The topic model class to fit and predict given an unsupervised learning technique.

    Two modelling methods are supported: "lda" fits a gensim LdaMulticore model
    on a bag-of-words corpus, while "bert" clusters the vectors returned by
    ``bert_model.encode``.
    """

    # The modelling methods accepted by the constructor and by `_vectorize`.
    _MODELING_METHODS = ("lda", "bert")

    def __init__(self, num_topics=10, method="lda", bert_model=None):
        """
        Parameters
        ----------
            num_topics : int (default=10)
                The number of categories for LDA and BERT based approaches.

            method : str (default=lda)
                The modelling method ("lda" or "bert", case-insensitive).

            bert_model : sentence_transformers.SentenceTransformer.SentenceTransformer
                A sentence transformer model.

        Raises
        ------
            ValueError
                If `method` is not one of the supported modelling methods.
        """
        # Normalize before validating so that spellings such as "LDA", which
        # were always lower-cased on assignment, remain accepted.
        normalized_method = method.lower()
        if normalized_method not in self._MODELING_METHODS:
            raise ValueError(
                "The indicated method is invalid. Please choose from {}.".format(
                    list(self._MODELING_METHODS)
                )
            )

        self.num_topics = num_topics
        self.bert_model = bert_model
        self.dirichlet_dict = None
        self.bow_corpus = None
        self.text_corpus = None
        self.cluster_model = None
        self.lda_model = None
        self.vec = {}
        self.gamma = 15  # parameter for relative importance of LDA
        self.method = normalized_method
        # The id keeps the method exactly as the caller spelled it.
        self.id = method + "_" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    @staticmethod
    def _filter_kwargs(func, kwargs):
        """
        Keep only the keyword arguments that `func` names in its signature.

        Parameters
        ----------
            func : callable
                The callable whose positional/keyword argument names are accepted.

            kwargs : dict
                The candidate keyword arguments.

        Returns
        -------
            filtered : dict
                The entries of `kwargs` whose keys are argument names of `func`.
        """
        if not kwargs:
            # Nothing to filter, so the signature is never inspected.
            return {}

        accepted = set(inspect.getfullargspec(func)[0])
        return {k: v for k, v in kwargs.items() if k in accepted}

    def _build_corpus(self, text_corpus):
        """
        Build the gensim dictionary and bag-of-words corpus for the given texts.

        Each text is tokenized by splitting on single spaces. The results are
        stored in `self.dirichlet_dict` and `self.bow_corpus`.

        Parameters
        ----------
            text_corpus : iterable of str
                The texts to tokenize.
        """
        token_corpus = [t.split(" ") for t in text_corpus]
        self.dirichlet_dict = corpora.Dictionary(token_corpus)
        self.bow_corpus = [self.dirichlet_dict.doc2bow(text) for text in token_corpus]

    def _ensure_lda_model(self, **kwargs):
        """
        Train `self.lda_model` on the current bag-of-words corpus if it is not set yet.

        Parameters
        ----------
            **kwargs : keyword arguments
                Candidate arguments for gensim.models.ldamulticore.LdaMulticore;
                names that are not in its signature are dropped.
        """
        if self.lda_model:
            return

        lda_kwargs = self._filter_kwargs(LdaMulticore, kwargs)
        self.lda_model = LdaMulticore(
            corpus=self.bow_corpus,
            num_topics=self.num_topics,
            id2word=self.dirichlet_dict,
            **lda_kwargs,
        )

    def _vectorize(self, text_corpus, method=None, **kwargs):
        """
        Get vector representations from selected methods.

        The dictionary and bag-of-words corpus are always rebuilt from
        `text_corpus`, whichever method is used.

        Parameters
        ----------
            text_corpus : iterable of str
                The text corpus over which analysis should be done. Each text is
                tokenized by splitting on single spaces.

            method : str
                The modeling technique to use (defaults to `self.method`).

            **kwargs : keyword arguments
                Keyword arguments corresponding to sentence_transformers.SentenceTransformer.encode or gensim.models.ldamulticore.LdaMulticore.

        Returns
        -------
            vec : np.array
                An array of text vectorizations. For "lda" this is an
                (n_doc * n_topic) array of topic probabilities per document.

        Raises
        ------
            ValueError
                If `method` is not one of the supported modelling methods.
        """
        if method is None:
            method = self.method

        self.text_corpus = text_corpus
        self._build_corpus(text_corpus)

        if method == "lda":
            self._ensure_lda_model(**kwargs)

            vec_lda = np.zeros((len(self.bow_corpus), self.num_topics))
            for i, bow in enumerate(self.bow_corpus):
                # minimum_probability=0 asks gensim not to drop low-probability
                # topics from the i-th document's distribution.
                for topic, prob in self.lda_model.get_document_topics(
                    bow=bow, minimum_probability=0
                ):
                    vec_lda[i, topic] = prob

            return vec_lda

        if method == "bert":
            encode_kwargs = self._filter_kwargs(self.bert_model.encode, kwargs)
            return np.array(
                self.bert_model.encode(sentences=self.text_corpus, **encode_kwargs)
            )

        # Fail here instead of handing `None` to the clustering model.
        raise ValueError(
            "The indicated method is invalid. Please choose from {}.".format(
                list(self._MODELING_METHODS)
            )
        )

    def fit(self, text_corpus, method=None, m_clustering=None, **kwargs):
        """
        Fit the topic model for selected method given the preprocessed data.

        Parameters
        ----------
            text_corpus : iterable of str
                The text corpus over which analysis should be done. Each text is
                tokenized by splitting on single spaces.

            method : str
                The modeling technique to use (defaults to `self.method`).

            m_clustering : sklearn.cluster.object
                The method that should be used to cluster (defaults to KMeans).
                It is instantiated with `num_topics` as its only argument.

            **kwargs : keyword arguments
                Keyword arguments corresponding to sentence_transformers.SentenceTransformer.encode or gensim.models.ldamulticore.LdaMulticore.

        Returns
        -------
            self : TopicModel
                The fitted model; the LDA model is stored in `self.lda_model`
                and the cluster model in `self.cluster_model`.

        Raises
        ------
            ValueError
                If a clustering based method is used and `num_topics` is larger
                than the number of texts, or if `method` is not supported.
        """
        if method is None:
            method = self.method

        self.text_corpus = text_corpus
        # An existing dictionary (e.g. from an earlier fit) is reused as is.
        if not self.dirichlet_dict:
            self._build_corpus(text_corpus)

        if method == "lda":
            self._ensure_lda_model(**kwargs)
            return self

        if len(self.text_corpus) < self.num_topics:
            raise ValueError(
                "`num_topics` cannot be larger than the size of `text_corpus` - consider lowering the desired number of topics"
            )

        if m_clustering is None:
            m_clustering = KMeans

        self.cluster_model = m_clustering(self.num_topics)
        self.vec[method] = self._vectorize(
            text_corpus=self.text_corpus, method=method, **kwargs,
        )
        self.cluster_model.fit(X=self.vec[method])

        return self