Source code for messageanalyzer.topic_modeling

from typing import List, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

[docs]
def topic_modeling(messages: List[str], n_topics: int = 5, n_words: int = 10, random_state: int = 123) -> Dict[str, List[str]]:
    """
    Perform topic modeling using Non-negative Matrix Factorization (NMF).

    The messages are vectorized with TF-IDF (English stop words removed),
    the resulting matrix is factorized with NMF, and for every NMF component
    the highest-weighted vocabulary terms are reported as that topic's words.

    Parameters
    ----------
    messages : List[str]
        List of messages for topic modeling.
    n_topics : int, optional
        Number of topics to extract, by default 5.
    n_words : int, optional
        Maximum number of top words to return per topic, by default 10.
        Must be non-negative; 0 yields an empty word list for every topic.
    random_state : int, optional
        Random seed for reproducibility, by default 123.

    Returns
    -------
    Dict[str, List[str]]
        A dictionary where each key is a topic label (e.g., "Topic 1") and each value is a list of the most representative words for that topic,
        ordered from highest to lowest weight. A list holds fewer than
        `n_words` entries when the vocabulary is smaller than `n_words`.

    Raises
    ------
    TypeError
        If `messages` is not a list of strings.
    ValueError
        If `n_words` is negative.

    Notes
    -----
    Exceptions raised by scikit-learn while vectorizing or fitting are not
    caught here and propagate to the caller unchanged.

    Examples
    --------
    >>> messages = ["Learning Data science at MDS is amazing!", "I prefer to work with Python than R"]
    >>> topic_modeling(messages, n_topics = 3, n_words = 3)
    {'Topic 1': ['mds', 'science', 'learning'], 'Topic 2': ['work', 'python', 'prefer'], 'Topic 3': ['amazing', 'data', 'learning']}
    """
    # Validate everything up front so bad input fails before any model work.
    if not isinstance(messages, list) or not all(isinstance(doc, str) for doc in messages):
        raise TypeError("Input messages should be a list of strings.")
    # A negative count would otherwise be silently reinterpreted by slicing
    # and return an arbitrary portion of the vocabulary.
    if n_words < 0:
        raise ValueError("n_words must be a non-negative integer.")

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(messages)

    # Only the topic-term matrix (components_) is needed, so fit without
    # keeping the document-topic matrix.
    nmf_model = NMF(n_components=n_topics, random_state=random_state, init='random')
    nmf_model.fit(tfidf_matrix)

    feature_names = vectorizer.get_feature_names_out()
    topics = {}

    for topic_idx, topic_weights in enumerate(nmf_model.components_):
        # argsort is ascending: reverse for highest weight first, then truncate.
        top_word_indices = topic_weights.argsort()[::-1][:n_words]
        topics[f"Topic {topic_idx + 1}"] = [feature_names[i] for i in top_word_indices]
    return topics