word2vector-情感分析demo


import numpy as np
import keras
import gensim
import json 
from sklearn.model_selection import train_test_split
from keras.utils import plot_model
import jieba
import tensorflow as tf

1 构建词向量矩阵

def get_embedding_matrix(emed_path, embed_size=100):
    embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
        emed_path, binary=False)

    words = embeddings_index.index2word
    embedding_matrix = np.zeros((len(words)+1, embed_size))

    for index, word in enumerate(words):
        embedding_matrix[index+1] = embeddings_index[word]
    word_dict = {word: index+1 for index, word in enumerate(words)}
    return embedding_matrix, word_dict

emed_path = "model_vec_2.txt"
embedding_matrix, word_dict = get_embedding_matrix(emed_path)

# embedding_matrix = np.random.random((135293, 100))

embedding_matrix.shape

2 读取训练数据

train_data_path = "./data/train.txt"
test_data_path =  "./data/test.txt"

max_len = 128  # 每个样本保留128个词的长度

## 数据 1
label2index = {"B":0, "G":1}

with open("./data/train.txt") as f:
    x_raw = f.readlines()
x_raw = [json.loads(i) for i in x_raw]

x_tensor, y_tensor = [], []

for line in x_raw:
    x_segs = [word_dict[i] if i in word_dict else 0 for i in line["words"]]
    padding = [0] * (max_len-len(x_segs))
    x_tensor.append(x_segs + padding)
    y_tensor.append([label2index[line["label"]]])

x_tensor = np.array(x_tensor)
y_tensor = np.array(y_tensor)

# x_tensor
# y_tensor

3 构建模型

inputs = keras.layers.Input(shape=(max_len,))
x = keras.layers.Embedding(embedding_matrix.shape[0],
                           output_dim=embedding_matrix.shape[1],
                           weights=[embedding_matrix],
                           input_length=max_len,
                           trainable=True)(inputs)
flatten = keras.layers.Lambda(lambda x: tf.reduce_sum(x, axis=1))(x)
dense = keras.layers.Dense(len(label2index), activation='softmax')(flatten)
model = keras.models.Model(inputs=[inputs], outputs=[dense])
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
              optimizer=keras.optimizers.Adam(), 
              metrics=[keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')])
model.summary()

plot_model(model,to_file='model.png',show_shapes=True)

4 训练模型

model.compile(loss ="sparse_categorical_crossentropy", optimizer = "adam", metrics=["acc"]) 
model.fit(x=x_tensor,y=y_tensor,epochs=20, validation_split=0.2, batch_size=128)

5 模型预测

test = ["衣服不好,质量很差","衣服不错,下次还来"]
x_pred = []
for i in test:
    x_segs = [word_dict[i] if i in word_dict else 0 for i in jieba.cut(i)]
    padding = [0] * (max_len-len(x_segs))
    x_pred.append(x_segs + padding)
y_pred = model.predict(x_pred)
for i, j in zip(test,y_pred):
    print(i, j)