import math
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# 示例函数:移除停用词并进行词干提取
def preprocess_text(text):
    """Flatten tokenized sentences, drop stop words, and stem what remains.

    Args:
        text: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A flat list of Porter stems, in input order, for every purely
        alphabetic token that is not an English stop word.
    """
    stop_words = set(stopwords.words('english'))
    # Build the stemmer locally instead of reading a module-level global, so
    # the function works regardless of where (or whether) a caller defines one.
    stemmer = PorterStemmer()
    return [
        stemmer.stem(word)
        for sentence in text
        for word in sentence
        # The stop-word list is lowercase, so compare case-insensitively.
        if word.isalpha() and word.lower() not in stop_words
    ]
# 示例函数:计算词频
def calculate_term_frequency(words):
    """Count how many times each word occurs in ``words``.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A plain dict mapping each distinct word to its raw occurrence count,
        with keys in first-seen order. An empty input yields an empty dict.
    """
    # Counter does the tallying; convert back so callers still get a plain dict.
    return dict(Counter(words))
# 示例函数:计算逆文档频率
def calculate_inverse_document_frequency(words, documents):
    """Compute a smoothed inverse document frequency for each distinct word.

    The weight is ``log(N / (1 + df))``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing the word.
    Because of the ``1 +`` smoothing, a word present in every document gets
    a negative weight.

    Args:
        words: Iterable of tokens to score; duplicates are scored once.
        documents: Sized collection of documents. Each document is an
            iterable of tokens; nested lists/tuples/sets inside a document
            are flattened before lookup.

    Returns:
        A dict mapping each distinct word (first-seen order) to its float
        IDF weight. With an empty ``documents`` collection every word maps
        to ``0.0``, since the logarithm would otherwise be undefined.
    """
    num_docs = len(documents)
    if num_docs == 0:
        # No corpus means no information; avoid taking log(0).
        return {word: 0.0 for word in words}

    # One token set per document gives O(1) membership tests below.
    doc_vocabularies = [set(_iter_tokens(document)) for document in documents]

    idf_values = {}
    for word in words:
        if word in idf_values:
            continue
        # Document frequency: how many documents contain the word at all,
        # not how many times the word occurs overall.
        doc_count = sum(word in vocabulary for vocabulary in doc_vocabularies)
        idf_values[word] = math.log(num_docs / (1 + doc_count))
    return idf_values


def _iter_tokens(document):
    """Yield each token of ``document``, descending into nested containers."""
    for item in document:
        if isinstance(item, (list, tuple, set, frozenset)):
            yield from _iter_tokens(item)
        else:
            yield item
# 示例函数:计算TF-IDF值
def calculate_tf_idf(words, documents, word_frequencies, idf_values):
    """Multiply each word's frequency by its IDF weight.

    Args:
        words: Accepted as part of the signature; not read here.
        documents: Accepted as part of the signature; not read here.
        word_frequencies: Mapping of word -> term frequency.
        idf_values: Mapping of word -> inverse document frequency.

    Returns:
        A dict mapping every word in ``word_frequencies`` to the product of
        its frequency and its IDF weight.

    Raises:
        KeyError: If a word in ``word_frequencies`` is missing from
            ``idf_values``.
    """
    return {
        term: count * idf_values[term]
        for term, count in word_frequencies.items()
    }
# 示例文本和文档集合
# Sample corpus: each inner list is one already-tokenized document.
text = [
    ['machine', 'learning', 'is', 'fun'],
    ['deep', 'learning', 'is', 'even', 'more', 'fun'],
]
# Module-level stemmer, created before any preprocessing call.
stemmer = PorterStemmer()
# One flat list of preprocessed tokens per document, so document-frequency
# lookups see the same normalized form as the words being scored.
documents = [preprocess_text([document]) for document in text]
# Preprocess the whole corpus into a single flat word list.
preprocessed_text = preprocess_text(text)
# Compute term frequencies.
term_frequencies = calculate_term_frequency(preprocessed_text)
# Compute inverse document frequencies.
idf_values = calculate_inverse_document_frequency(preprocessed_text, documents)
# Compute TF-IDF scores.
tf_idf_values = calculate_tf_idf(preprocessed_text, documents, term_frequencies, idf_values)
# Print the result.
print(tf_idf_values)
这个代码示例展示了如何使用NLTK库进行文本预处理,包括移除停用词和进行词干提取,计算词频,计算逆文档频率,以及计算TF-IDF值。这是自然语言处理中一个常见的工作流程,对于理解文本分析的原理很有帮助。