# Source code for messageanalyzer.extract_keywords

from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer


# [docs]
def extract_keywords(messages: List[str], num_keywords: int = 5) -> List[List[str]]:
    """
    Extract the top keywords from each text message using TF-IDF.

    A single TF-IDF (Term Frequency-Inverse Document Frequency) model is
    fitted on all of ``messages`` with English stop words removed. For every
    message, the terms that actually occur in it are ranked by TF-IDF score
    and the best ``num_keywords`` of them are returned.

    Parameters
    ----------
    messages : List[str]
        A list of text messages from which to extract keywords.

    num_keywords : int, default = 5
        The maximum number of keywords to extract from each message. A
        message with fewer scored terms yields a shorter list; it is never
        padded with words that only appear in other messages.

    Raises
    ------
    TypeError
        If `messages` is not a list or contains non-string elements.
    ValueError
        If `num_keywords` is negative.

    Returns
    -------
    List[List[str]]
        One sublist per message, in input order, holding that message's
        keywords from highest to lowest TF-IDF score. Terms with equal
        scores are ordered reverse-alphabetically. The result is ``[]`` for
        an empty `messages` list, and a message without any scored term
        (for example one made only of stop words) maps to ``[]``.

    Examples
    --------
    >>> messages = ["Learning Data Science at MDS is amazing!", "I prefer to work with Python than R"]
    >>> extract_keywords(messages, num_keywords=3)
    [['science', 'mds', 'learning'], ['work', 'python', 'prefer']]

    """
    if not isinstance(messages, list) or not all(isinstance(msg, str) for msg in messages):
        raise TypeError("messages must be a list of strings")

    # A negative bound would silently drop keywords from the *end* of the
    # ranking when used as a slice limit, so reject it explicitly. ``None``
    # is left alone: as a slice bound it simply selects every keyword.
    if num_keywords is not None and num_keywords < 0:
        raise ValueError("num_keywords must be non-negative")

    # Nothing to fit on; avoid scikit-learn's "empty vocabulary" error.
    if not messages:
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf = vectorizer.fit_transform(messages)
    except ValueError as err:
        # scikit-learn reports a corpus without any usable term (e.g. only
        # stop words) as ValueError("empty vocabulary; ..."). Such messages
        # simply have no keywords; any other ValueError is a real problem.
        if "empty vocabulary" not in str(err):
            raise
        return [[] for _ in messages]

    tfidf = tfidf.tocsr()
    feature_names = vectorizer.get_feature_names_out()

    top_keywords = []
    # Walk the CSR rows directly: only terms stored for a message are
    # considered, so zero-score vocabulary entries from other messages can
    # never be returned and no dense row has to be built or sorted.
    for start, stop in zip(tfidf.indptr[:-1], tfidf.indptr[1:]):
        scores = tfidf.data[start:stop]
        words = feature_names[tfidf.indices[start:stop]]
        ranked = sorted(
            ((score, word) for score, word in zip(scores, words) if score > 0),
            reverse=True,
        )
        top_keywords.append([word for _, word in ranked[:num_keywords]])

    return top_keywords