Source code for kwx.topic_model

"""
topic_model
-----------

The unsupervised learning topic model for keyword extraction.

Contents:
    TopicModel Class:
        _vectorize,
        fit
"""

import inspect
import logging
import os
import warnings
from datetime import datetime

# Quieten third-party output. These statements sit between the standard-library
# imports and the numpy/gensim/sklearn imports that follow them.
# Disable all logging calls of severity WARNING and below, process-wide.
logging.disable(logging.WARNING)
# Ignore FutureWarnings whose message starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)
# NOTE(review): presumably lowers TensorFlow's native log verbosity for a
# TensorFlow-backed dependency - confirm which import actually reads this.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import numpy as np
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from sklearn.cluster import KMeans


class TopicModel:
    """
    The topic model class to fit and predict given an unsupervised learning technique.
    """

    # Modelling methods understood by this class.
    _MODELING_METHODS = ("lda", "bert")

    def __init__(self, num_topics=10, method="lda", bert_model=None):
        """
        Parameters
        ----------
            num_topics : int (default=10)
                The number of categories for LDA and BERT based approaches.

            method : str (default=lda)
                The modelling method ("lda" or "bert", case-insensitive).

            bert_model : sentence_transformers.SentenceTransformer.SentenceTransformer
                A sentence transformer model.

        Raises
        ------
            ValueError
                If `method` is not one of the supported modelling methods.
        """
        modeling_methods = list(self._MODELING_METHODS)
        # Normalise before validating so that e.g. "LDA" keeps being accepted.
        normalized_method = method.lower()
        if normalized_method not in modeling_methods:
            raise ValueError(
                "The indicated method is invalid. Please choose from {}.".format(
                    modeling_methods
                )
            )

        self.num_topics = num_topics
        self.bert_model = bert_model
        self.dirichlet_dict = None
        self.bow_corpus = None
        self.text_corpus = None
        self.cluster_model = None
        self.lda_model = None
        self.vec = {}
        self.gamma = 15  # parameter for relative importance of LDA
        self.method = normalized_method
        # The id keeps the caller's original spelling of `method`.
        self.id = method + "_" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    @staticmethod
    def _filter_kwargs(func, kwargs):
        """
        Keep only the keyword arguments that `func` names as positional parameters.

        The signature of `func` is inspected once, and only when there is
        something to filter.
        """
        if not kwargs:
            return {}

        accepted_args = inspect.getfullargspec(func)[0]
        return {k: v for k, v in kwargs.items() if k in accepted_args}

    def _build_corpus(self, text_corpus):
        """
        Build the dictionary and bag-of-words corpus for `text_corpus`.

        Each element of `text_corpus` is split on single spaces, so the
        elements are expected to be strings.
        """
        token_corpus = [t.split(" ") for t in text_corpus]
        self.dirichlet_dict = corpora.Dictionary(token_corpus)
        self.bow_corpus = [self.dirichlet_dict.doc2bow(text) for text in token_corpus]

    def _ensure_lda_model(self, **kwargs):
        """
        Train the LDA model on the current bag-of-words corpus if none exists yet.

        Keyword arguments that LdaMulticore does not accept are dropped.
        """
        if self.lda_model:
            return

        lda_kwargs = self._filter_kwargs(LdaMulticore, kwargs)
        self.lda_model = LdaMulticore(
            corpus=self.bow_corpus,
            num_topics=self.num_topics,
            id2word=self.dirichlet_dict,
            **lda_kwargs,
        )

    def _get_vec_lda(self):
        """
        Get the LDA vector representation.

        Returns
        -------
            vec_lda : np.array (n_doc * n_topic)
                The probabilistic topic assignments for all documents.
        """
        vec_lda = np.zeros((len(self.bow_corpus), self.num_topics))
        for i, bow in enumerate(self.bow_corpus):
            # Get the distribution for the i-th document in bow_corpus.
            for topic, prob in self.lda_model.get_document_topics(
                bow=bow, minimum_probability=0
            ):
                vec_lda[i, topic] = prob

        return vec_lda

    def _vectorize(self, text_corpus, method=None, **kwargs):
        """
        Get vector representations from selected methods.

        Parameters
        ----------
            text_corpus : list, list of lists, or str
                The text corpus over which analysis should be done.

            method : str
                The modeling technique to use (defaults to the instance's method).

            **kwargs : keyword arguments
                Keyword arguments corresponding to sentence_transformers.SentenceTransformer.encode or gensim.models.ldamulticore.LdaMulticore.

        Returns
        -------
            vec : np.array
                An array of text vectorizations.

        Raises
        ------
            ValueError
                If `method` is neither "lda" nor "bert".
        """
        if method is None:
            method = self.method

        # The dictionary and bag-of-words corpus are always rebuilt here.
        self.text_corpus = text_corpus
        self._build_corpus(text_corpus)

        if method == "lda":
            self._ensure_lda_model(**kwargs)
            return self._get_vec_lda()

        if method == "bert":
            encode_kwargs = self._filter_kwargs(self.bert_model.encode, kwargs)
            return np.array(
                self.bert_model.encode(sentences=self.text_corpus, **encode_kwargs)
            )

        # Fail loudly instead of implicitly returning None.
        raise ValueError(
            "The indicated method is invalid. Please choose from {}.".format(
                list(self._MODELING_METHODS)
            )
        )

    def fit(self, text_corpus, method=None, m_clustering=None, **kwargs):
        """
        Fit the topic model for selected method given the preprocessed data.

        Parameters
        ----------
            text_corpus : list, list of lists, or str
                The text corpus over which analysis should be done.

            method : str
                The modeling technique to use (defaults to the instance's method).

            m_clustering : sklearn.cluster.object
                The method that should be used to cluster (defaults to KMeans).

            **kwargs : keyword arguments
                Keyword arguments corresponding to sentence_transformers.SentenceTransformer.encode or gensim.models.ldamulticore.LdaMulticore.

        Returns
        -------
            self : TopicModel
                This instance, holding the fitted LDA or cluster model.

        Raises
        ------
            ValueError
                If a clustering method is used and `num_topics` exceeds the
                number of items in `text_corpus`.
        """
        if method is None:
            method = self.method

        if m_clustering is None:
            m_clustering = KMeans

        self.text_corpus = text_corpus
        # An existing, non-empty dictionary is reused rather than rebuilt.
        if not self.dirichlet_dict:
            self._build_corpus(text_corpus)

        if method == "lda":
            self._ensure_lda_model(**kwargs)
            return self

        # Every non-LDA method clusters the vectorized documents.
        if len(self.text_corpus) < self.num_topics:
            raise ValueError(
                "`num_topics` cannot be larger than the size of `text_corpus` - consider lowering the desired number of topics"
            )

        self.cluster_model = m_clustering(self.num_topics)
        self.vec[method] = self._vectorize(
            text_corpus=self.text_corpus,
            method=method,
            **kwargs,
        )
        self.cluster_model.fit(X=self.vec[method])

        return self