在自然语言处理(NLP)中,特征提取是将原始文本数据转换为可用于机器学习模型的数值特征的过程。以下是使用Python进行特征提取的几种常见方法:
- 使用n-gram:
from nltk import ngrams
from nltk.tokenize import word_tokenize
text = "自然语言处理是人工智能的重要部分"
# word_tokenize is built for whitespace-delimited languages: unsegmented
# Chinese text comes back as a single token, which would produce no bigrams.
# Use character-level tokens instead; plug in a Chinese word segmenter here
# if word-level n-grams are required.
tokens = list(text)
# ngrams() yields consecutive n-length tuples over the token sequence.
bigrams = ngrams(tokens, 2)
for bigram in bigrams:
    print(' '.join(bigram))
- 使用TF-IDF:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["我爱编程", "我爱音乐", "编程乐趣"]
# NOTE: the default analyzer splits on word boundaries, so each unsegmented
# Chinese string below is treated as one term. Pre-segment the text or pass
# analyzer="char" if finer-grained features are wanted.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()
# X is a SciPy sparse matrix; len(X) raises TypeError, so the number of
# documents is read from shape[0] instead.
for i in range(X.shape[0]):
    # .indices holds the column (term) ids that are non-zero in row i.
    row_terms = [terms[ind] for ind in X[i].indices]
    print("Index %d: %s" % (i, " ".join(row_terms)))
- 使用CNN(卷积神经网络)进行文本分类:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D, MaxPooling1D
from keras.models import Sequential

# Sample data: three short texts with one binary label each.
texts = ["我爱编程", "我爱音乐", "编程乐趣"]
labels = [1, 0, 1]  # placeholder labels, for demonstration only
num_words = 10000
# The conv/pool stack below uses 'valid' padding and needs at least 20 time
# steps to keep a non-empty output; 32 leaves some headroom.
max_len = 32

# Text processing: the texts contain no whitespace, so tokenize per character
# and map each text to a sequence of integer indices.
tokenizer = Tokenizer(num_words=num_words, char_level=True)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Pad every sequence to the same length so they form one 2-D integer array,
# which is the input an Embedding layer expects.
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Binary bag-of-words matrix (one row per text, one column per token index).
# This is a standalone feature representation; it is not fed to the CNN below.
one_hot_results = tokenizer.texts_to_matrix(texts, mode='binary')

# Build the CNN model.
model = Sequential()
model.add(Embedding(num_words, 10))
model.add(Conv1D(filters=64, kernel_size=5, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=16, kernel_size=3, padding='valid', activation='relu', strides=1))
# Collapse the time axis so the classifier head sees one vector per text.
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Train the model on the padded index sequences.
model.fit(padded_sequences, np.array(labels), epochs=10, batch_size=32)
- 使用预训练的BERT模型进行特征提取:
from transformers import BertTokenizer, BertModel
import torch
# Load the pretrained model and tokenizer. The sample text is Chinese, so use
# the Chinese checkpoint rather than the English-only 'bert-base-uncased'.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')
# Sample text
text = "自然语言处理是人工智能的重要部分"
# Text processing: encode the text into PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt', padding=True)
# Feature extraction: run the forward pass without tracking gradients.
with torch.no_grad():
    # The call returns a model output object; the per-token hidden-state
    # tensor is available as `last_hidden_states.last_hidden_state`.
    last_hidden_states = model(**encoded_input)
# 使用最后一个隐藏状态作为文本的特征表示