1.keras的Embedding示例
# ---------------------------------------------------------------------------
# Archived experiment A: how padding zeros behave with and without masking.
# Kept commented out for reference.
# ---------------------------------------------------------------------------
# import numpy as np
# from keras.models import Sequential
# from keras.layers import Embedding, Masking, LSTM
# from bert4keras.backend import K
#
# input_array = np.array([[1, 1, 1, 1, 1, 1, 1],
#                         [2, 2, 2, 2, 2, 2, 0],
#                         [3, 3, 3, 3, 3, 0, 0],
#                         [4, 4, 4, 4, 0, 0, 0],
#                         [5, 5, 5, 0, 0, 0, 0],
#                         [6, 6, 0, 0, 0, 0, 0],
#                         [7, 0, 0, 0, 0, 0, 0]])
#
# print(input_array)
#
# model = Sequential()
#
# # 1. No mask, feed straight into the LSTM: the per-timestep output of every
# #    element is non-zero, i.e. the padding value 0 is treated as meaningful.
# # model.add(LSTM(10, input_shape=(7, 1), return_sequences=True))
#
# # 2. Apply Masking before the LSTM: the LSTM output at padded positions is 0,
# #    i.e. padded positions do not take part in the computation.
# # model.add(Masking(mask_value=0, input_shape=(7, 1,)))
# # model.add(LSTM(10, input_shape=(7, 1), return_sequences=True))
#
# # 3. Embedding with mask_zero=True filters padded positions automatically.
# # model.add(Embedding(7, 8, input_length=7, mask_zero=False))
# # Printed directly, the Embedding output at padded positions is not actually
# # 0, but because a mask is computed the following LSTM skips those positions.
# model.add(Embedding(7, 8, input_length=7, mask_zero=True))
# model.add(LSTM(10, input_shape=(7, 1), return_sequences=True))
#
# model.compile('rmsprop', 'mse')
# output_array = model.predict(input_array)
# print(output_array.shape)
# # print(output_array)
#
# K.print_tensor(output_array, message='数据 = ', summarize=-1)

# ---------------------------------------------------------------------------
# Archived experiment B: check BERT tokenization, encoding, and which id the
# padding maps to. Kept commented out for reference.
# ---------------------------------------------------------------------------
# import os
# from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import DataGenerator as _DataGenerator, sequence_padding
#
# base_path = os.path.dirname(os.path.abspath(__file__))
# vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
# tokenizer = Tokenizer(vocab_path, do_lower_case=True)
#
# batch_token_ids = []
# sents = ["SimpGB ",
#          "1.443253269.0 URL",
#          "path_simpgb参数",
#          "(1)留言板."]
# for sent in sents:
#     print('---------------------', sent)
#     tokens = tokenizer.tokenize(sent, maxlen=100)
#     print('tokens:', tokens)
#     token_ids = tokenizer.tokens_to_ids(tokens)
#     print('token_ids:', token_ids)
#     mapping = tokenizer.rematch(sent, tokens)
#     print('mapping', mapping)
#     start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
#     end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
#
#     # token_ids.append(0)
#     # tokens2 = tokenizer.ids_to_tokens(token_ids)
#
#     batch_token_ids.append(token_ids)
#
# print('填充前:', batch_token_ids)
# batch_token_ids = sequence_padding(batch_token_ids)
# print('填充后:', batch_token_ids)
#
# tokens2 = tokenizer.ids_to_tokens(batch_token_ids[0])
# print(tokens2)
# tokens2 = tokenizer.ids_to_tokens(batch_token_ids[1])
# print(tokens2)
#
# print('token_ids:', tokenizer.id_to_token(0))
# print('token_ids:', tokenizer.id_to_token(101))
# print('token_ids:', tokenizer.id_to_token(102))

# ---------------------------------------------------------------------------
# Live demo: encode two sentences, pad them to length 512, zero the last three
# ids of the first row, and run the batch through a BERT model.
# ---------------------------------------------------------------------------
import os

import numpy as np
import tensorflow as tf
import yaml
from bert4keras.backend import K
from bert4keras.models import BERT
from bert4keras.snippets import DataGenerator as _DataGenerator, sequence_padding
from bert4keras.snippets import to_array
from bert4keras.tokenizers import Tokenizer
from keras.layers import Input
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint

base_path = os.path.dirname(os.path.abspath(__file__))
conf_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/ner_conf.yaml')
with open(conf_path, encoding='utf-8') as f:
    # NOTE(review): yaml.load with FullLoader can construct more than plain
    # data; if this config may come from an untrusted source, prefer
    # yaml.safe_load.
    _config = yaml.load(f, Loader=yaml.FullLoader)

_model_params = _config["bert_bilstm_crf"]
_model_params['vocab_size'] = 23000
print(_model_params)

# Variant A: a bert4keras BERT object. This kind of object is used through
# bert.model.predict() or bert.call().
# NOTE(review): `bert` is rebound to variant B below, so this instance is not
# the one that ends up calling predict().
bert = BERT(**_model_params)
bert.build(**_model_params)

# Variant B: the two loaders below return a model on which .predict() can be
# called directly.
config_path = './multi_cased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multi_cased_L-12_H-768_A-12/bert_model.ckpt'
# ---- option 1 ----
# model = build_transformer_model(config_path, checkpoint_path, with_pool=False)
# ---- option 2 ----
bert = load_trained_model_from_checkpoint(config_path, checkpoint_path)

vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab100.txt')
tokenizer = Tokenizer(vocab_path, do_lower_case=True)

batch_token_ids = []
batch_segment_ids = []
sents = [
    'ssh_sign_session_id 函数在 0.5.3 之前的。 libssh 中的 keys.c 中释放“错误路径上的无效指针”,这可能允许远程攻击者通过未指定的向量造成拒绝服务(崩溃)。',
    u'(1) publickey_make_dss、(2) publickey_make_rsa、(3) signature_from_string、(4) ssh_do_sign 和 (5) ssh_sign_session_id 函数在 0.5.3 之前的 libssh 中的 keys.c 中释放“错误路径上的无效指针”,这可能允许远程攻击者通过未指定的向量造成拒绝服务(崩溃)。',
]
for sent in sents:
    print('---------------------', sent)
    token_ids, segment_ids = tokenizer.encode(sent)
    batch_token_ids.append(token_ids)
    batch_segment_ids.append(segment_ids)

# Encoding a sentence pair with the Tokenizer (archived):
# token_ids, segment_ids = tokenizer.encode('这是第一个句子。', '这是第一个句子。')
# print(token_ids, segment_ids)
# # Print the encoded result
# for i, tokens in enumerate(token_ids):
#     print('句子', i+1, 'token ids:', tokens)
#     print('句子', i+1, 'segment ids:', segment_ids[i])
#     print()

print('填充前:', batch_token_ids)
print('batch_segment_ids:', batch_segment_ids)
batch_token_ids = sequence_padding(batch_token_ids, length=512)
batch_segment_ids = sequence_padding(batch_segment_ids, length=512)

# Force the last three ids of the first row to 0 so the effect of trailing
# zeros can be inspected in the printed tokens and in the prediction.
for tail_pos in (-1, -2, -3):
    batch_token_ids[0][tail_pos] = 0
print('填充后:', batch_token_ids)

tokens2 = tokenizer.ids_to_tokens(batch_token_ids[0])
print(tokens2)

batch_token_ids, batch_segment_ids = to_array(batch_token_ids, batch_segment_ids)
print('格式后:', batch_token_ids)

probas2 = bert.predict([batch_token_ids, batch_segment_ids])
print(probas2)

# Alternative for variant A (archived):
# bert2 = bert.model
# probas2 = bert2.predict([batch_token_ids, batch_segment_ids])
# print(probas2)

# Use the MLM model to predict the masked-out part (archived):
# probas = bert.call([batch_token_ids, batch_segment_ids])
# print(probas)
2.keras的bert模型两种加载方式:
# Demo: load a pre-trained BERT checkpoint, encode one sentence, and print the
# first ten token ids next to the first ten values of each token's output
# vector.
import os

import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.snippets import to_array
from keras.layers import Input, Conv1D, Lambda
from keras.models import Model
from keras_bert import Tokenizer
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import load_vocabulary

# Two ways to load the pre-trained BERT model.
config_path = './multi_cased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multi_cased_L-12_H-768_A-12/bert_model.ckpt'
# ---- option 1 (bert4keras) ----
# model = build_transformer_model(config_path, checkpoint_path, with_pool=False)
# ---- option 2 (keras-bert) ----
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

# Load the vocabulary.
base_path = os.path.dirname(os.path.abspath(__file__))
vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
token_dict = load_vocabulary(vocab_path)

# Two ways to tokenize.
# ---- option 1 (keras-bert Tokenizer) ----
tokenizer = Tokenizer(token_dict)
# indices[1] = indices[2] = tokenizer._token_dict['[PAD]']
text = '语言模型'
tokens = tokenizer.tokenize(text)  # ['[CLS]', '语', '言', '模', '型', '[SEP]']
indices, segments = tokenizer.encode(first=text, max_len=512)

# Two ways to convert to arrays; both wrap the single sample in a list, which
# adds a leading batch axis.
# indices, segments = np.array([indices]), np.array([segments])
# OR:
indices, segments = to_array([indices], [segments])

# ---- option 2 (bert4keras Tokenizer, archived) ----
# from bert4keras.tokenizers import Tokenizer
# import os
# from bert4keras.snippets import DataGenerator as _DataGenerator, sequence_padding
# base_path = os.path.dirname(os.path.abspath(__file__))
# vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
# tokenizer = Tokenizer(vocab_path, do_lower_case=True)
#
# text = '语言模型'
# indices, segments = tokenizer.encode(first_text=text, maxlen=512)
# indices = sequence_padding(inputs=[indices], length=512)
# segments = sequence_padding(inputs=[segments], length=512)
#
# # Convert to arrays:
# from bert4keras.snippets import to_array
# indices, segments = to_array(indices, segments)
# # OR:
# indices, segments = np.array(indices), np.array(segments)

# `indices` / `segments` carry a leading batch axis (one sample), so select
# sample 0 before taking the first ten positions. Slicing `indices[:10]`
# directly would slice the batch axis and yield the whole row instead.
first_indices = indices[0]
first_segments = segments[0]
print(first_indices[:10])
print(first_segments[:10])

# [0] selects the prediction for the single sample in the batch, so predicts[i]
# lines up with token position i of that sample.
predicts = model.predict([indices, segments])[0]
for i, token in enumerate(first_indices[:10]):
    print(token, predicts[i].tolist()[:10])
3.keras的bert两种模型实例化方式,两种分词方式,两种序列填充方式:
# Demo: wrap a pre-trained BERT model with a Lambda layer that multiplies the
# model output by a (token id > 0) mask, then print the first ten output
# vectors of the first sentence.
import os

import numpy as np
import tensorflow as tf
from bert4keras.backend import K
from bert4keras.models import build_transformer_model
from bert4keras.snippets import to_array
from keras.layers import Input, Conv1D, Lambda
from keras.models import Model
from keras_bert import Tokenizer
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import load_vocabulary

# Two ways to load the pre-trained BERT model.
config_path = './multi_cased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multi_cased_L-12_H-768_A-12/bert_model.ckpt'
# ---- option 1 (bert4keras) ----
# model = build_transformer_model(config_path, checkpoint_path, with_pool=False)
# ---- option 2 (keras-bert) ----
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

# Load the vocabulary.
base_path = os.path.dirname(os.path.abspath(__file__))
vocab_path = os.path.join(base_path, '../../../ai_web/data/models/vul_ner/common/vocab10.txt')
token_dict = load_vocabulary(vocab_path)

# Tokenize with the keras-bert Tokenizer.
tokenizer = Tokenizer(token_dict)
# indices[1] = indices[2] = tokenizer._token_dict['[PAD]']

batch_token_ids = []
batch_segment_ids = []
sents = ['这字文本。', u'科学一生产力']
for sent in sents:
    print('---------------------', sent)
    token_ids, segment_ids = tokenizer.encode(first=sent, max_len=512)
    batch_token_ids.append(token_ids)
    batch_segment_ids.append(segment_ids)

# Two ways to convert to arrays:
# indices, segments = np.array([indices]), np.array([segments])
# OR:
indices, segments = to_array(batch_token_ids, batch_segment_ids)
print('indices:', indices)
print('indices:', model.input[:][0])


def print_tensor(x):
    """Mask the model output with a (token id > 0) indicator, printing tensors on the way.

    Args:
        x: a two-element sequence ``[model_inputs, model_output]``;
            ``model_inputs[0]`` is used as the token-id tensor.

    Returns:
        ``model_output`` multiplied element-wise by the 0/1 mask, after the
        mask has been given a trailing axis of size 1.
    """
    ids = x[0][0]
    output = x[1]

    # K.print_tensor returns a tensor identical to its argument, and the print
    # only takes effect when that returned tensor is used downstream, so every
    # result is threaded forward instead of being discarded.
    ids = K.print_tensor(ids, message='input = ')
    mask = K.cast(K.greater(ids, 0), K.floatx())
    mask = K.print_tensor(mask, message='tmp = ')
    # K.print_tensor(ids, message='数据 = ', summarize=-1)
    output = K.print_tensor(output, message='output = ')

    # Add a trailing axis so the mask can be multiplied against `output`.
    mask = tf.expand_dims(mask, axis=-1)
    mask = K.print_tensor(mask, message='tmp = ')

    # Static shapes, not graph tensors: nothing to thread forward here.
    K.print_tensor(mask.shape, message='tmp = ')
    K.print_tensor(output.shape, message='tmp = ')

    output = tf.multiply(mask, output)
    # K.print_tensor(output, message='output = ', summarize=-1)
    return output


# mask_input = [x[0] for x in model.input]
# print(mask_input)
output = Lambda(print_tensor)([model.input, model.output])
model = Model(model.input, output)

print(indices.shape)
print(indices[0].shape)

# Run inference and print the output vectors of the first sentence.
output = model.predict([indices, segments])[0]
print(output)

# Archived NumPy-side masking attempts:
# indices[indices > 0] = 1
# print(indices)
# output = indices.T * output
# print(output)
# output[0] = np.zeros_like(output[0])
# print(output[2].tolist()[:10])
# print(output[10].tolist()[:10])

for i, token in enumerate(batch_token_ids[0][:10]):
    print(token, output[i].tolist()[:10])

# for i, embedding in enumerate(output):
#     # For a padding token print an all-zero vector; likewise for a mask token.
#     if indices[i].any() == 0 or segments[i].any() == 1:
#         print(f'Token {i}: {np.zeros_like(embedding)}')
#     else:
#         print(f'Token {i}: {embedding}')