Understanding BERT (bert4keras) - Tokenization


1 Tokenization logic

from bert4keras.tokenizers import Tokenizer, load_vocab, save_vocab
dict_path = 'vocab.txt'

sentence = "这件衣服,[订单详情],我要买这个衣服,和这个衣服,[订单详情],有啥区别"

1.1 Standard tokenization

tokenizer = Tokenizer(token_dict=dict_path)
tokens = tokenizer.tokenize(sentence)
# print(tokens)

['[CLS]', '这', '件', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '我', '要', '买', '这', '个', '衣', '服', ',', '和', '这', '个', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '有', '啥', '区', '别', '[SEP]']

encode = tokenizer.encode(sentence, maxlen=128)  # returns (token_ids, segment_ids)
# print(encode)

([101, 2769, 1139, 4495, 754, 8447, 9310, 2399, 117, 100, 8993, 100, 13098, 8139, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
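
To make the relation between tokenize and encode explicit, the id sequence can also be produced by hand. A minimal sketch, assuming the tokens_to_ids and decode helpers of bert4keras's Tokenizer:

tokens = tokenizer.tokenize(sentence)
token_ids = tokenizer.tokens_to_ids(tokens)  # the same ids that encode() returns
segment_ids = [0] * len(token_ids)           # single-sentence input, so all zeros
text_back = tokenizer.decode(token_ids)      # strips [CLS]/[SEP] and re-joins the pieces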

1.2 Loading a simplified vocabulary to build the tokenizer - a smaller vocabulary means a smaller embedding matrix

token_dict = load_vocab(dict_path=dict_path, simplified=False)  # full vocabulary
save_vocab("vocab_sample_word.txt", token_dict)                 # write a copy to disk
# simplified=True drops redundant tokens; keep_tokens records the original ids
# of the kept tokens, so the embedding matrix can be pruned to match
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
tokens = tokenizer.tokenize(sentence)
# print(tokens)

['[CLS]', '这', '件', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '我', '要', '买', '这', '个', '衣', '服', ',', '和', '这', '个', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '有', '啥', '区', '别', '[SEP]']

encode = tokenizer.encode(sentence, maxlen=128)
# print(encode)

([101, 2769, 1139, 4495, 754, 8447, 9310, 2399, 117, 100, 8993, 100, 13098, 8139, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
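
The point of the simplified vocabulary is the keep_tokens list: passed at model-building time, it lets only the corresponding rows of the pre-trained embedding matrix be loaded, which is what actually shrinks the embedding layer. A hedged sketch, assuming bert4keras's build_transformer_model and placeholder config/checkpoint paths:

from bert4keras.models import build_transformer_model

config_path = 'bert_config.json'     # placeholder path
checkpoint_path = 'bert_model.ckpt'  # placeholder path

model = build_transformer_model(
    config_path,
    checkpoint_path,
    keep_tokens=keep_tokens,  # keep only the embedding rows of the simplified vocabulary
)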

1.3 Adding your own tokens

sentence = "这件衣服,[订单详情],我要买这个衣服,和这个衣服,[订单详情],有啥区别"
def pre_tokenize(sentence):
    dic_set = {'[订单详情]', '[旺旺表情]', '[其他链接]'}  # newly added tokens
    # wrap each new token with \001 markers and split on them, so every new
    # token ends up as a standalone piece that can match the vocabulary directly
    for token in dic_set:
        sentence = sentence.replace(token, "\001" + token + "\001")
    tokens = sentence.split("\001")
    return tokens
dict_path = "vocab_add_new_word.txt"  # vocabulary file that includes the new tokens
tokenizer = Tokenizer(token_dict=dict_path, pre_tokenize=pre_tokenize)
tokens = tokenizer.tokenize(sentence)
# print(tokens)

['[CLS]', '这', '件', '衣', '服', ',', '[订单详情]', ',', '我', '要', '买', '这', '个', '衣', '服', ',', '和', '这', '个', '衣', '服', ',', '[订单详情]', ',', '有', '啥', '区', '别', '[SEP]']

encode = tokenizer.encode(sentence, maxlen=128)
print(encode)

([101, 2769, 1139, 4495, 754, 8447, 9310, 2399, 117, 100, 8993, 100, 13098, 8139, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
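
For the new tokens to be kept whole, they must also appear in the vocabulary file; vocab_add_new_word.txt is assumed to be such a file. One possible way to build it, sketched with load_vocab/save_vocab, is to append the new tokens to the original vocabulary:

token_dict = load_vocab(dict_path='vocab.txt')
for new_token in ['[订单详情]', '[旺旺表情]', '[其他链接]']:
    if new_token not in token_dict:
        token_dict[new_token] = len(token_dict)  # assign the next free id
save_vocab('vocab_add_new_word.txt', token_dict)

Note that appending enlarges the vocabulary, so the embedding matrix has to grow accordingly; reusing the [unused*] slots of the original vocab is a common alternative that keeps the size unchanged.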