keras的bert两种模型实例化方式，两种分词方式，两种序列填充方式-阿里云开发者社区

1.keras的Embedding示例

# ---------------------------------------------------------------------------
# Demo (kept commented out): how padding interacts with Embedding / Masking /
# LSTM in Keras.
# ---------------------------------------------------------------------------
# import numpy as np
# from keras.models import Sequential
# from keras.layers import Embedding, Masking, LSTM
# from bert4keras.backend import K
#
# input_array = np.array([[1, 1, 1, 1, 1, 1, 1],
#                         [2, 2, 2, 2, 2, 2, 0],
#                         [3, 3, 3, 3, 3, 0, 0],
#                         [4, 4, 4, 4, 0, 0, 0],
#                         [5, 5, 5, 0, 0, 0, 0],
#                         [6, 6, 0, 0, 0, 0, 0],
#                         [7, 0, 0, 0, 0, 0, 0]])
#
# print(input_array)
#
# model = Sequential()
#
# # 1. Feed the LSTM directly with no mask: the per-timestep output for every
# #    element is non-zero, i.e. the padding value 0 is treated as a meaningful
# #    value and takes part in the computation.
# # model.add(LSTM(10, input_shape=(7, 1), return_sequences=True))
#
# # 2. Apply a Masking layer before the LSTM: the LSTM output at the padded
# #    positions is 0, i.e. padding does not take part in the computation.
# # model.add(Masking(mask_value=0, input_shape=(7, 1,)))
# # model.add(LSTM(10, input_shape=(7, 1), return_sequences=True))
#
# # 3. Use Embedding with mask_zero=True to filter padding automatically.
# # model.add(Embedding(7, 8, input_length=7, mask_zero=False))
# # If printed directly, the Embedding output at padded positions is actually
# # non-zero; but because a mask is computed, the following LSTM layer can be
# # seen to skip the padded positions.
# model.add(Embedding(7, 8, input_length=7, mask_zero=True))
# model.add(LSTM(10, input_shape=(7, 1), return_sequences=True))
#
# model.compile('rmsprop', 'mse')
# output_array = model.predict(input_array)
# print(output_array.shape)
# # print(output_array)
#
# K.print_tensor(output_array, message='数据 = ', summarize=-1)
#
# ---------------------------------------------------------------------------
# Check (kept commented out): verify BERT tokenization, encoding, and the id
# that PAD maps to.
# ---------------------------------------------------------------------------
# import os
# from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import DataGenerator as _DataGenerator, sequence_padding
#
# base_path = os.path.dirname(os.path.abspath(__file__))
# vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
# tokenizer = Tokenizer(vocab_path, do_lower_case=True)
#
# batch_token_ids = []
# sents = ["SimpGB ",
#          "1.443253269.0 URL",
#          "path_simpgb参数",
#          "(1)留言板."]
# for sent in sents:
#     print('---------------------', sent)
#     tokens = tokenizer.tokenize(sent, maxlen=100)
#     print('tokens:', tokens)
#     token_ids = tokenizer.tokens_to_ids(tokens)
#     print('token_ids:', token_ids)
#     mapping = tokenizer.rematch(sent, tokens)
#     print('mapping', mapping)
#     start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
#     end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
#
#     # token_ids.append(0)
#     # tokens2 = tokenizer.ids_to_tokens(token_ids)
#
#     batch_token_ids.append(token_ids)
#
# print('填充前：', batch_token_ids)
# batch_token_ids = sequence_padding(batch_token_ids)
# print('填充后：', batch_token_ids)
#
# tokens2 = tokenizer.ids_to_tokens(batch_token_ids[0])
# print(tokens2)
# tokens2 = tokenizer.ids_to_tokens(batch_token_ids[1])
# print(tokens2)
#
# print('token_ids:', tokenizer.id_to_token(0))
# print('token_ids:', tokenizer.id_to_token(101))
# print('token_ids:', tokenizer.id_to_token(102))

# ---------------------------------------------------------------------------
# Live script: imports (including the BERT class) and the script directory.
# Third-party imports keep their original relative order.
# ---------------------------------------------------------------------------
import os

from bert4keras.models import BERT
from keras.layers import Input
from bert4keras.snippets import DataGenerator as _DataGenerator, sequence_padding
import yaml
import numpy as np
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array
from keras.models import Model
from bert4keras.backend import K
import tensorflow as tf

# Directory containing this script; the relative data paths below hang off it.
base_path = os.path.dirname(os.path.abspath(__file__))
from keras_bert import load_trained_model_from_checkpoint

# Read the NER model configuration that sits next to the project data.
conf_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/ner_conf.yaml')
with open(conf_path, encoding='utf-8') as f:
    # NOTE(review): yaml.load with FullLoader is acceptable for a trusted local
    # file; prefer yaml.safe_load if this config could ever come from an
    # untrusted source.
    _config = yaml.load(f, Loader=yaml.FullLoader)
_model_params = _config["bert_bilstm_crf"]
_model_params['vocab_size'] = 23000
print(_model_params)

# Instantiation style A: build the bert4keras BERT class from parameters.
# This kind of object is used via bert.model.predict() or bert.call().
bert = BERT(**_model_params)
bert.build(**_model_params)

# Instantiation style B: the following two loaders return a model on which
# bert.predict() can be called directly.
config_path = './multi_cased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multi_cased_L-12_H-768_A-12/bert_model.ckpt'
# ---- Option 1 ----
# model = build_transformer_model(config_path, checkpoint_path, with_pool=False)
# ---- Option 2 ----
# Rebinds `bert`: from here on the name refers to the checkpoint-loaded model,
# not the BERT instance built above.
bert = load_trained_model_from_checkpoint(config_path, checkpoint_path)
# Build the bert4keras tokenizer from the project vocabulary file.
base_path = os.path.dirname(os.path.abspath(__file__))
vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab100.txt')
tokenizer = Tokenizer(vocab_path, do_lower_case=True)

batch_token_ids = []
batch_segment_ids = []
sents = [
    'ssh_sign_session_id 函数在 0.5.3 之前的。 libssh 中的 keys.c 中释放“错误路径上的无效指针”，这可能允许远程攻击者通过未指定的向量造成拒绝服务（崩溃）。',
    '(1) publickey_make_dss、(2) publickey_make_rsa、(3) signature_from_string、(4) ssh_do_sign 和 (5) ssh_sign_session_id 函数在 0.5.3 之前的 libssh 中的 keys.c 中释放“错误路径上的无效指针”，这可能允许远程攻击者通过未指定的向量造成拒绝服务（崩溃）。',
]

# Encode each sentence on its own and collect the per-sentence id lists.
for sent in sents:
    print('---------------------', sent)
    token_ids, segment_ids = tokenizer.encode(sent)
    batch_token_ids.append(token_ids)
    batch_segment_ids.append(segment_ids)

# Encode multiple sentences with the Tokenizer
# token_ids, segment_ids = tokenizer.encode('这是第一个句子。', '这是第一个句子。')
# print(token_ids, segment_ids)
# # Print the encoded result
# for i, tokens in enumerate(token_ids):
#     print('句子', i+1, 'token ids:', tokens)
#     print('句子', i+1, 'segment ids:', segment_ids[i])
#     print()

print('填充前：', batch_token_ids)
print('batch_segment_ids:', batch_segment_ids)

# Pad every sequence to a fixed length of 512.
batch_token_ids = sequence_padding(batch_token_ids, length=512)
batch_segment_ids = sequence_padding(batch_segment_ids, length=512)

# Force the last three positions of the first sequence to id 0.
batch_token_ids[0][-1] = 0
batch_token_ids[0][-2] = 0
batch_token_ids[0][-3] = 0
print('填充后：', batch_token_ids)

# Map the padded ids of the first sequence back to tokens for inspection.
tokens2 = tokenizer.ids_to_tokens(batch_token_ids[0])
print(tokens2)

batch_token_ids, batch_segment_ids = to_array(batch_token_ids, batch_segment_ids)
print('格式后：', batch_token_ids)

probas2 = bert.predict([batch_token_ids, batch_segment_ids])
print(probas2)

# bert2 = bert.model
# probas2 = bert2.predict([batch_token_ids, batch_segment_ids])
# print(probas2)

# Use the MLM model to predict the masked-out part
# probas = bert.call([batch_token_ids, batch_segment_ids])
# print(probas)

2.keras的bert模型两种加载方式：

import os

import numpy as np
from bert4keras.models import build_transformer_model
from keras.layers import Input, Conv1D, Lambda
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import load_vocabulary

# Two ways to load a pretrained BERT model.
config_path = './multi_cased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multi_cased_L-12_H-768_A-12/bert_model.ckpt'
# ---- Option 1: bert4keras ----
# model = build_transformer_model(config_path, checkpoint_path, with_pool=False)
# ---- Option 2: keras_bert ----
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

# Load the vocabulary (token -> id mapping) used by the tokenizer.
base_path = os.path.dirname(os.path.abspath(__file__))
vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
token_dict = load_vocabulary(vocab_path)
from keras_bert import Tokenizer
from bert4keras.snippets import to_array

# Two ways to tokenize.
# ---- Option 1: keras_bert Tokenizer ----
tokenizer = Tokenizer(token_dict)
# indices[1] = indices[2] = tokenizer._token_dict['[PAD]']
text = '语言模型'
tokens = tokenizer.tokenize(text)
# ['[CLS]', '语', '言', '模', '型', '[SEP]']
indices, segments = tokenizer.encode(first=text, max_len=512)

# Two ways to convert to arrays:
# indices, segments = np.array([indices]), np.array([segments])
# OR:
# Each list is wrapped in an outer list, so the arrays get a leading batch
# axis of size 1.
indices, segments = to_array([indices], [segments])

# ---- Option 2: bert4keras Tokenizer ----
# from bert4keras.tokenizers import Tokenizer
# import os
# from bert4keras.snippets import DataGenerator as _DataGenerator, sequence_padding
# base_path = os.path.dirname(os.path.abspath(__file__))
# vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
# tokenizer = Tokenizer(vocab_path, do_lower_case=True)
#
# text = '语言模型'
# indices, segments = tokenizer.encode(first_text=text, maxlen=512)
# indices = sequence_padding(inputs=[indices], length=512)
# segments = sequence_padding(inputs=[segments], length=512)
#
# Convert to arrays:
# from bert4keras.snippets import to_array
# indices, segments = to_array(indices, segments)
# OR:
# indices, segments = np.array(indices), np.array(segments)

# Index the single batch row first: slicing `indices[:10]` would slice the
# batch axis (length 1) rather than the first ten token positions.
print(indices[0][:10])
print(segments[0][:10])

# [0] selects the output for the only sequence in the batch.
predicts = model.predict([indices, segments])[0]
for i, token in enumerate(indices[0][:10]):
    # Token id alongside the first ten values of its output vector.
    print(token, predicts[i].tolist()[:10])

3.keras的bert两种模型实例化方式，两种分词方式，两种序列填充方式：

import os

import numpy as np
from bert4keras.models import build_transformer_model
from keras.layers import Input, Conv1D, Lambda
from keras.models import Model
from bert4keras.backend import K
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import load_vocabulary

# Two ways to load a pretrained BERT model.
config_path = './multi_cased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multi_cased_L-12_H-768_A-12/bert_model.ckpt'
# ---- Option 1: bert4keras ----
# model = build_transformer_model(config_path, checkpoint_path, with_pool=False)
# ---- Option 2: keras_bert ----
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

# Load the vocabulary (token -> id mapping) used by the tokenizer.
base_path = os.path.dirname(os.path.abspath(__file__))
vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
token_dict = load_vocabulary(vocab_path)
from keras_bert import Tokenizer
from bert4keras.snippets import to_array

# Two ways to tokenize.
# ---- Option 1: keras_bert Tokenizer ----
tokenizer = Tokenizer(token_dict)
# indices[1] = indices[2] = tokenizer._token_dict['[PAD]']

batch_token_ids = []
batch_segment_ids = []
sents = ['这字文本。', '科学一生产力']

# Encode each sentence with max_len=512 and collect the id lists per sentence.
for sent in sents:
    print('---------------------', sent)
    token_ids, segment_ids = tokenizer.encode(first=sent, max_len=512)
    batch_token_ids.append(token_ids)
    batch_segment_ids.append(segment_ids)

# Two ways to convert to arrays:
# indices, segments = np.array([indices]), np.array([segments])
# OR:
indices, segments = to_array(batch_token_ids, batch_segment_ids)
print('indices:', indices)
# First entry of the model's input list (a symbolic tensor, not data).
print('indices:', model.input[:][0])
import tensorflow as tf


# Use a Lambda layer to apply a greater-than-zero mask to the output tensor.
def print_tensor(x):
    """Zero the model output wherever the first model input is not > 0.

    Args:
        x: A two-element sequence ``[model_inputs, model_output]``.
            ``x[0][0]`` is the first model input and ``x[1]`` is the model
            output. (Presumably token ids and per-token vectors - confirm
            against the loaded model.)

    Returns:
        ``x[1]`` multiplied element-wise by a float mask that is 1 where
        ``x[0][0] > 0`` and 0 elsewhere, broadcast over a new trailing axis.
    """
    token_ids = x[0][0]
    output = x[1]

    # K.print_tensor returns a tensor identical to its argument; the Keras
    # backend docs say that returned tensor must be used downstream or the
    # print is not taken into account, so the results are threaded through.
    token_ids = K.print_tensor(token_ids, message='input = ')

    # 1.0 where the input id is > 0, 0.0 elsewhere.
    mask = K.cast(K.greater(token_ids, 0), K.floatx())
    mask = K.print_tensor(mask, message='tmp = ')
    # K.print_tensor(token_ids, message='数据 = ', summarize=-1)
    output = K.print_tensor(output, message='output = ')

    # Add a trailing axis so the mask broadcasts against the output.
    mask = tf.expand_dims(mask, axis=-1)
    mask = K.print_tensor(mask, message='tmp = ')

    # Shape objects are not part of the tensor graph; these calls are kept
    # exactly as in the original for debugging output.
    K.print_tensor(mask.shape, message='tmp = ')
    K.print_tensor(output.shape, message='tmp = ')

    output = tf.multiply(mask, output)
    # K.print_tensor(output, message='output = ', summarize=-1)
    return output


# mask_input = [x[0] for x in model.input]
# print(mask_input)
# Append the masking function to the loaded model's graph.
output = Lambda(print_tensor)([model.input, model.output])
# Re-wrap the original inputs and the masked output as a new model.
model = Model(model.input, output)
print(indices.shape)
print(indices[0].shape)

# Run inference and print the output embedding vectors ([0] = first sentence).
output = model.predict([indices, segments])[0]
print(output)

# indices[indices > 0] = 1
# print(indices)
# output = indices.T * output
# print(output)
# output[0] = np.zeros_like(output[0])
# print(output[2].tolist()[:10])
# print(output[10].tolist()[:10])

# Token id alongside the first ten values of its output vector, for the
# first ten positions of the first sentence.
for i, token in enumerate(batch_token_ids[0][:10]):
    print(token, output[i].tolist()[:10])

# for i, embedding in enumerate(output):
#     # For a padding token output an all-zero vector; for a mask token
#     # likewise output an all-zero vector.
#     if indices[i].any() == 0 or segments[i].any() == 1:
#         print(f'Token {i}: {np.zeros_like(embedding)}')
#     else:
#         print(f'Token {i}: {embedding}')

keras的bert两种模型实例化方式，两种分词方式，两种序列填充方式

热门文章

最新文章

相关电子书

相关实验场景

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

keras的bert两种模型实例化方式，两种分词方式，两种序列填充方式

热门文章

最新文章

相关电子书

相关实验场景