【情感提取+情感计算+词频统计】python情感分析
import jieba.analyse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Load the raw review data from disk.
def load_data(pos_file, neg_file):
    """Read the positive and negative review files.

    Each file is expected to hold one review per line, UTF-8 encoded.
    Returns a pair ``(positive_lines, negative_lines)`` of lists of raw
    lines (trailing newlines preserved, as ``readlines`` leaves them).
    """
    with open(pos_file, 'r', encoding='utf-8') as pos_fh:
        positive_lines = pos_fh.readlines()
    with open(neg_file, 'r', encoding='utf-8') as neg_fh:
        negative_lines = neg_fh.readlines()
    return positive_lines, negative_lines
# Tokenize with jieba and build a per-document TF-IDF matrix.
def generate_word_freq(data, cut_all=False):
    """Tokenize Chinese sentences and compute their TF-IDF term matrix.

    Parameters
    ----------
    data : list[str]
        Raw sentences; each sentence is treated as one document.
    cut_all : bool
        jieba full-mode switch (default: accurate mode).

    Returns
    -------
    (X, vocab)
        ``X`` is the dense TF-IDF matrix of shape (n_docs, n_terms);
        ``vocab`` is the fitted term -> column-index mapping.

    Fixes over the original: the original accessed
    ``vectorizer.vocabulary_`` *before* ``fit`` (AttributeError), fed the
    vectorizer a flat list of single words instead of documents (making
    TF-IDF meaningless), and used ``stop_words='english'`` on Chinese text.
    """
    # Join jieba tokens with spaces so TfidfVectorizer's whitespace-based
    # tokenization works on Chinese text.
    docs = [' '.join(jieba.cut(sentence, cut_all=cut_all)) for sentence in data]
    # Default token_pattern drops 1-character tokens, which are common and
    # meaningful in Chinese — accept any word of length >= 1 instead.
    vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
    X = vectorizer.fit_transform(docs).toarray()
    return X, vectorizer.vocabulary_
# Train the sentiment classifier.
def train_model(pos_data, neg_data):
    """Train a Multinomial Naive Bayes sentiment classifier.

    NOTE: despite the parameter names (kept for backward compatibility),
    the arguments are *file paths* — see the call in ``__main__``, which
    passes ``pos_file`` and ``neg_file``.

    Returns
    -------
    (classifier, vectorizer)
        The fitted ``MultinomialNB`` model and the single shared
        ``TfidfVectorizer``; use ``vectorizer.transform`` on new,
        jieba-tokenized-and-space-joined text before predicting.

    Fixes over the original: it read undefined globals ``pos_file`` /
    ``neg_file`` instead of its parameters, referenced an undefined
    ``data`` variable, and built two independently-fitted vectorizers
    whose merged vocabulary had overlapping, non-contiguous indices.
    """
    pos_lines, neg_lines = load_data(pos_data, neg_data)
    # Tokenize every review and rebuild it as a space-separated string so
    # one vectorizer can define a single, consistent feature space.
    docs = [' '.join(jieba.cut(line)) for line in pos_lines + neg_lines]
    # Labels: 1 = positive, 0 = negative, aligned with the doc order above.
    y = np.concatenate([np.ones(len(pos_lines)), np.zeros(len(neg_lines))])
    # Accept single-character tokens (default pattern drops them; they are
    # common in Chinese).
    vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
    X_train = vectorizer.fit_transform(docs).toarray()
    classifier = MultinomialNB()
    classifier.fit(X_train, y)
    return classifier, vectorizer
# Entry point: train on the two data files, then score one sample review.
if __name__ == '__main__':
    pos_file = 'pos.txt'  # positive-review data file
    neg_file = 'neg.txt'  # negative-review data file
    classifier, vectorizer = train_model(pos_file, neg_file)
    # Example: evaluate the sentiment of a new review.
    sentence = '这家酒店位置很好,环境清洁'
    X_test = vectorizer.transform([sentence]).toarray()
    y_pred = classifier.predict(X_test)
    print(f'情感: {"正面" if y_pred else "负面"}')
这段代码首先加载了正面和负面的评论数据,然后使用结巴分词库进行中文分词,并使用 TfidfVectorizer 计算 TF-IDF 词频特征,最后用多项式朴素贝叶斯分类器训练情感判别模型,并对一条新评论进行情感预测作为示例。