# tf=2.2.2, keras=2.3.1
import keras.backend as K
import numpy as np
# Transition matrix. This is a trainable parameter: as long as the loss is computed from it,
# the Keras framework will update this matrix automatically during training.
_trans = np.array([
[ 0.6068113 , -0.07780273, 0.19370675, 0.58709157],
[-0.8390693 , -0.5952868 , -0.03093415, 0.7369754 ],
[-0.13051163, 0.34903476, 0.39821082, -0.49325067],
[-0.14534481, 0.86006504, -0.51447386, -0.20285301]],
dtype="float32")
# Everything below is taken from an actual model run, just pulled out for inspection. batch_size = 2
s = ["我喜欢自然语言处理工作","工具人加油"] # raw text to be segmented
s_cut = [['我', '喜欢', '自然', '语言', '处理', '工作'], ['工具人', '加油']] # gold (annotated) segmentation
id2tag = {0:'s', 1:'b', 2:'m', 3:'e', 4: 'k'} # b = word begin, m = word middle, e = word end, s = single-character word, k = padding label for sequences shorter than maxlen
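# Hedged helper (the name `to_bmes_ids` is made up, not from the original code): it converts a
# gold segmentation into the s/b/m/e tag ids defined in id2tag, which is exactly how the label
# rows of y_true below are obtained before padding.
tag2id = {v: k for k, v in id2tag.items()}
def to_bmes_ids(words):
    ids = []
    for w in words:
        if len(w) == 1:
            ids.append(tag2id['s'])  # single-character word
        else:
            ids += [tag2id['b']] + [tag2id['m']] * (len(w) - 2) + [tag2id['e']]
    return ids
# e.g. to_bmes_ids(s_cut[1]) -> [1, 2, 3, 1, 3], i.e. b m e b e for 工具人 / 加油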
# x_true is the model input; from x_true the model computes y_pred
x_true = np.array([
[ 405, 1237, 1050, 377, 161, 755, 728, 524, 140, 333, 221],
[ 333, 322, 334, 595, 1727, 0, 0, 0, 0, 0, 0]])
# ground-truth labels (one-hot, including the padding label k)
y_true = np.array([
[[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0.]],
[[0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 0., 1.],
[0., 0., 0., 0., 1.],
[0., 0., 0., 0., 1.],
[0., 0., 0., 0., 1.],
[0., 0., 0., 0., 1.],
[0., 0., 0., 0., 1.]]], dtype="float32")
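# Hedged check: y_true above can be reconstructed from the tag ids by padding each sequence
# with the extra label k (id 4) up to maxlen = 11 and one-hot encoding it. `to_categorical`
# is a standard Keras utility; the variable names here are made up for illustration.
from keras.utils import to_categorical
maxlen = 11
tag_ids = [to_bmes_ids(words) for words in s_cut]
padded = [ids + [4] * (maxlen - len(ids)) for ids in tag_ids]
assert np.array_equal(to_categorical(padded, num_classes=5), y_true)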
# emission scores assumed to have been predicted by the model for each token/tag
y_pred = np.array([
[[ 3.680594 , -0.03072696, -2.915623 , -1.1445749 , -0.14310378],
[ 0.8227052 , 4.026242 , -1.8849481 , -2.5478888 , 0.00879496],
[ 0.03746354, 0.6086352 , -0.845218 , 1.3254657 , -0.19277799],
[-1.7585003 , 2.6042378 , 1.214861 , -1.4752557 , 0.09622633],
[-1.788204 , -0.8006125 , 1.4337045 , 1.6343507 , 0.07719082],
[-2.393122 , 1.1640749 , 1.6590837 , -0.02914308, -0.06154215],
[-1.1900285 , -1.0482597 , 1.4129926 , 1.3167557 , 0.1641072 ],
[-1.7268145 , 1.5086218 , 0.9587725 , -0.60441977, -0.09230721],
[-0.69056 , -0.31435436, -1.2028848 , 2.6586199 , -0.30499774],
[-1.2538036 , 2.3022325 , -0.07277121, -0.6597443 , -0.03979611],
[-0.70454466, -1.4252175 , 1.1865989 , 1.2471256 , 0.24029002]],
[[-0.5159192 , 3.0819356 , -2.1474144 , -1.4848454 , -0.52434576],
[-1.6737992 , 0.53270024, 1.3270049 , 0.20873429, -0.01093 ],
[ 0.66907936, -0.7314879 , -1.1508043 , 1.68861 , -0.1668109 ],
[-0.96932274, 2.5352259 , -0.15578564, -1.1283011 , 0.00594538],
[ 0.8658654 , -2.1527104 , -1.4609618 , 3.3423257 , -0.08020568],
[ 4.25738 , -0.23073708, -3.0954285 , -0.6405242 , 0.31829464],
[ 3.9629989 , 0.9391143 , -3.5199292 , -0.6034824 , 0.10845804],
[ 3.9653206 , 0.6616078 , -3.4509583 , -0.35464182, 0.107804 ],
[ 3.8821256 , 0.84127825, -3.3765595 , -0.5175394 , 0.11499661],
[ 3.206688 , 1.053434 , -2.4869404 , -0.92393434, 0.0869934 ],
[ 1.2626029 , 1.2862996 , -0.88906866, -1.0243741 ,0.1475074 ]]], dtype="float32")
mask = 1 - y_true[:, :, -1:] # 1 for real tokens, 0 where the label is the padding tag k
y_true, y_pred = y_true[:, :, :4], y_pred[:, :, :4] # drop the padding label column
# score of the gold labels along the gold path
point_score = K.sum(K.sum(y_pred * y_true, 2), 1, keepdims=True) # per-token emission score, summed over the sequence
labels1 = K.expand_dims(y_true[:, :-1], 3)
labels2 = K.expand_dims(y_true[:, 1:], 2)
labels = labels1 * labels2 # outer product of two shifted label tensors; it picks the gold transitions out of the transition matrix
trans = K.expand_dims(K.expand_dims(_trans, 0), 0)
trans_score = K.sum(K.sum(trans * labels, [2, 3]), 1, keepdims=True) # gold transition scores, summed over the sequence
path_score = point_score + trans_score # total score of the gold path: emissions + transitions
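# Sanity check (a hedged sketch, not part of the original computation): recompute the gold-path
# score with explicit Python loops. Padded steps contribute nothing because their one-hot rows
# become all-zero after the slice above. The comparison assumes TF 2.x eager execution, so the
# backend tensor can be converted with np.asarray.
path_check = []
for b in range(2):
    ids = np.argmax(y_true[b], axis=-1)   # gold tag id at every step
    valid = mask[b, :, 0].astype(bool)    # True where the step is a real token
    score = sum(y_pred[b, t, ids[t]] for t in range(11) if valid[t])
    score += sum(_trans[ids[t], ids[t + 1]]
                 for t in range(10) if valid[t] and valid[t + 1])
    path_check.append([score])
print(np.allclose(path_check, np.asarray(path_score)))  # expected: True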
# log-sum of the scores over all tag sequences (the normalizer Z, in log space)
init_states = [y_pred[:, 0]] # initial state: emission scores of the first token
y_pred = np.concatenate([y_pred, mask], axis=-1) # append the mask column so the step function can see it
def log_norm_step(inputs, states):
    """Recursively compute the (log) normalization factor.
    Key points: 1. compute it recursively; 2. use logsumexp to avoid overflow.
    Trick: align tensors via expand_dims.
    """
    inputs, mask = inputs[:, :-1], inputs[:, -1:]
    states = K.expand_dims(states[0], 2) # (batch_size, output_dim, 1)
    trans = K.expand_dims(_trans, 0) # (1, output_dim, output_dim)
    outputs = K.logsumexp(states + trans, 1) # (batch_size, output_dim)
    outputs = outputs + inputs
    outputs = mask * outputs + (1 - mask) * states[:, :, 0] # for masked (padding) steps, skip the transition and keep the previous state's score
    return outputs, [outputs]
last_output, outputs, states = K.rnn(log_norm_step, y_pred[:, 1:], init_states) # recursion over steps 1..T-1 gives the log-Z vector
log_norm = K.logsumexp(last_output, 1, keepdims=True) # log Z: log-sum of the scores of all paths
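# Sanity check (a hedged sketch): reproduce log_norm with a plain NumPy forward algorithm
# instead of K.rnn. alpha[j] holds the log-sum of scores of all partial paths ending in tag j;
# padded steps carry alpha forward unchanged, mirroring the mask handling inside log_norm_step.
# As above, comparing against the backend tensor assumes TF 2.x eager execution.
def np_logsumexp(x, axis):
    m = np.max(x, axis=axis, keepdims=True)
    return np.squeeze(m, axis) + np.log(np.sum(np.exp(x - m), axis=axis))

log_norm_check = []
for b in range(2):
    emit, msk = y_pred[b, :, :4], y_pred[b, :, 4]  # emission scores and mask column
    alpha = emit[0]                                # initial state, same as init_states
    for t in range(1, 11):
        new_alpha = np_logsumexp(alpha[:, None] + _trans, axis=0) + emit[t]
        alpha = np.where(msk[t] > 0, new_alpha, alpha)
    log_norm_check.append([np_logsumexp(alpha, axis=0)])
print(np.allclose(log_norm_check, np.asarray(log_norm)))  # expected: True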
# $ loss = -\log \frac{e^{s(\text{gold path})}}{\sum_i e^{s(\text{path}_i)}} = \log Z - s(\text{gold path}) $
# i.e. the probability of the gold path among all possible paths. We want this probability maximized,
# but since neural networks are trained by minimizing a loss via gradient descent, we take the negative log.
loss = log_norm - path_score # = -log(numerator / denominator) = log Z - gold-path score
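# Under eager execution (an assumption about this environment) the per-sample loss can be
# inspected directly; in a real training setup the batch mean of this quantity is what would
# be fed to the optimizer.
print(np.asarray(loss))                   # shape (2, 1): one CRF negative log-likelihood per sample
print(float(np.mean(np.asarray(loss))))   # scalar batch loss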