1 Tokenization logic
from bert4keras.tokenizers import Tokenizer, load_vocab, save_vocab
dict_path = 'vocab.txt'
sentence = "这件衣服,[订单详情],我要买这个衣服,和这个衣服,[订单详情],有啥区别"
1.1 Normal tokenization
tokenizer = Tokenizer(token_dict=dict_path)
tokens = tokenizer.tokenize(sentence)
# print(tokens)
['[CLS]', '这', '件', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '我', '要', '买', '这', '个', '衣', '服', ',', '和', '这', '个', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '有', '啥', '区', '别', '[SEP]']
encode = tokenizer.encode(sentence, maxlen=128)
# print(encode)
([101, 2769, 1139, 4495, 754, 8447, 9310, 2399, 117, 100, 8993, 100, 13098, 8139, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
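encode returns a pair (token_ids, segment_ids), and it also accepts a second text for BERT-style sentence pairs, where the second segment is marked with 1 in segment_ids. A minimal sketch (the two short texts below are made up for illustration; the exact ids depend on the actual vocab.txt):
# Hedged sketch: encoding a sentence pair; the second text gets segment id 1
first_text = "我要买这个衣服"
second_text = "有啥区别"
token_ids, segment_ids = tokenizer.encode(first_text, second_text, maxlen=128)
# token_ids:   [CLS] + first-text tokens + [SEP] + second-text tokens + [SEP]
# segment_ids: 0 for the first segment (including [CLS] and its [SEP]), 1 for the second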
1.2 Load and simplify the vocabulary, then build the tokenizer - a smaller vocab means a smaller embedding matrix
token_dict = load_vocab(dict_path=dict_path, simplified=False)  # load the full vocabulary as {token: id}
save_vocab("vocab_smaple_word.txt", token_dict)  # save an unmodified copy
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer from the simplified vocabulary
tokens = tokenizer.tokenize(sentence)
# print(tokens)
['[CLS]', '这', '件', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '我', '要', '买', '这', '个', '衣', '服', ',', '和', '这', '个', '衣', '服', ',', '[', '订', '单', '详', '情', ']', ',', '有', '啥', '区', '别', '[SEP]']
encode = tokenizer.encode(sentence, maxlen=128)
# print(encode)
([101, 2769, 1139, 4495, 754, 8447, 9310, 2399, 117, 100, 8993, 100, 13098, 8139, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
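The point of simplified=True is the returned keep_tokens: it lists, for each token kept in the simplified vocabulary, its row index in the original embedding matrix, so the pretrained embeddings can be shrunk when the model is loaded. A minimal sketch of how it is typically passed on, assuming standard BERT config/checkpoint files (config_path and checkpoint_path are placeholder paths, not from the text above):
from bert4keras.models import build_transformer_model

# Hedged sketch: keep only the embedding rows listed in keep_tokens, so the
# token embedding shrinks from the full vocab size to len(keep_tokens).
model = build_transformer_model(
    config_path='bert_config.json',     # placeholder path
    checkpoint_path='bert_model.ckpt',  # placeholder path
    keep_tokens=keep_tokens,            # from load_vocab(..., simplified=True)
)
print(len(token_dict), len(keep_tokens))  # both equal the simplified vocab size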
1.3 Adding your own tokens
sentence = "这件衣服,[订单详情],我要买这个衣服,和这个衣服,[订单详情],有啥区别"
def pre_tokenize(sentence):
    dic_set = {'[订单详情]', '[旺旺表情]', '[其他链接]'}  # newly added custom tokens
    for i in dic_set:
        sentence = sentence.replace(i, "\001" + i + "\001")  # isolate each custom token with \001
    tokens = sentence.split("\001")
    return tokens
dict_path = "vocab_add_new_word.txt"
tokenizer = Tokenizer(token_dict=dict_path, pre_tokenize=pre_tokenize)
tokens = tokenizer.tokenize(sentence)
# print(tokens)
['[CLS]', '这', '件', '衣', '服', ',', '[订单详情]', ',', '我', '要', '买', '这', '个', '衣', '服', ',', '和', '这', '个', '衣', '服', ',', '[订单详情]', ',', '有', '啥', '区', '别', '[SEP]']
encode = tokenizer.encode(sentence, maxlen=128)
# print(encode)
([101, 2769, 1139, 4495, 754, 8447, 9310, 2399, 117, 100, 8993, 100, 13098, 8139, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
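For the custom tokens to receive ids of their own (rather than being split into '[', single characters and ']', or mapped to [UNK]), they must also appear in the vocabulary file that the tokenizer loads. A minimal sketch of how vocab_add_new_word.txt could be produced by appending the new tokens to the original vocabulary (this is an assumed construction, and the id each token receives will differ from the printed output above):
# Hedged sketch: append the custom tokens to the original vocab and save the
# result as vocab_add_new_word.txt, so the Tokenizer can assign them real ids.
token_dict = load_vocab(dict_path='vocab.txt')
for tok in ['[订单详情]', '[旺旺表情]', '[其他链接]']:
    if tok not in token_dict:
        token_dict[tok] = len(token_dict)  # give each new token the next free id
save_vocab('vocab_add_new_word.txt', token_dict)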