Source code for messageanalyzer.extract_keywords
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
[docs]
def extract_keywords(messages: List[str], num_keywords: int = 5) -> List[List[str]]:
"""
Extracts the top keywords from a list of text messages using TF-IDF (Term Frequency-Inverse Document Frequency).
This function applies TF-IDF to determine the most important words in each message based on their relative
importance in the given text corpus. Stop words are automatically removed.
Parameters
----------
messages : List[str]
A list of text messages from which to extract keywords.
num_keywords : int, default = 5
The number of top keywords to extract from each message.
Raises
------
TypeError
If `messages` is not a list or contains non-string elements.
Returns
-------
List[List[str]]
A list where each sublist contains the top extracted keywords from the corresponding message.
Examples
--------
>>> messages = ["Learning Data Science at MDS is amazing!", "I prefer to work with Python than R"]
>>> extract_keywords(messages, num_keywords=3)
[['data', 'science', 'amazing'], ['python', 'prefer', 'work']]
"""
if not isinstance(messages, list) or not all(isinstance(msg, str) for msg in messages):
raise TypeError("messages must be a list of strings")
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
tf_idf_vector = tf_idf_vectorizer.fit_transform(messages)
feature_names = tf_idf_vectorizer.get_feature_names_out()
top_keywords = []
for i in range(len(messages)):
msg_vector = tf_idf_vector[i].toarray().flatten()
keywords = sorted(zip(msg_vector, feature_names), reverse= True)
n_keywords = [word for _, word in keywords[:num_keywords]]
top_keywords.append(n_keywords)
return top_keywords