预训练模型下载方式:
1.git下载
git lfs install git clone https://huggingface.co/hfl/chinese-roberta-wwm-ext # if you want to clone without large files – just their pointers # prepend your git clone with the following env var: GIT_LFS_SKIP_SMUDGE=1
2.import各类预训练模型
# Load the Chinese RoBERTa-wwm-ext checkpoint through the Auto* factory classes.
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
# from transformers import BertTokenizer,BertModelForMaskedLM # tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext") # model = BertModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
# Load a tiny Chinese ALBERT checkpoint the same way (Auto* picks the right classes).
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ckiplab/albert-tiny-chinese")
model = AutoModelForMaskedLM.from_pretrained("ckiplab/albert-tiny-chinese")
Downloading: 0%| | 0.00/174 [00:00<?, ?B/s] Downloading: 0%| | 0.00/729 [00:00<?, ?B/s] Downloading: 0%| | 0.00/107k [00:00<?, ?B/s] Downloading: 0%| | 0.00/112 [00:00<?, ?B/s] Downloading: 0%| | 0.00/15.4M [00:00<?, ?B/s]
3.导入包
!pip install transformers
from transformers import AutoConfig,AutoModel,AutoTokenizer,AdamW,get_linear_schedule_with_warmup,logging import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import TensorDataset,SequentialSampler,RandomSampler,DataLoader # from transformers import AutoConfig,AutoModel,AutoTokenizer
# Name of the pretrained checkpoint used by the rest of the notebook.
MODEL_NAME = "bert-base-chinese"
# MODEL_NAME = "roberta-large"
4.配置
# Fetch the model configuration (architecture hyper-parameters) for the checkpoint.
config = AutoConfig.from_pretrained(MODEL_NAME)
config
BertConfig { "_name_or_path": "bert-base-chinese", "architectures": [ "BertForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "classifier_dropout": null, "directionality": "bidi", "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "pooler_fc_size": 768, "pooler_num_attention_heads": 12, "pooler_num_fc_layers": 3, "pooler_size_per_head": 128, "pooler_type": "first_token_transform", "position_embedding_type": "absolute", "transformers_version": "4.17.0", "type_vocab_size": 2, "use_cache": true, "vocab_size": 21128 }
# Attach a task-specific attribute: the number of classification labels.
config.num_labels = 12
# config
type(config)
transformers.models.bert.configuration_bert.BertConfig
5.tokenizer
# Load the fast (Rust-backed) tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer
PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
一些特殊符号:['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
# Vocabulary ids of the special tokens ([UNK], [SEP], [PAD], [CLS], [MASK]).
tokenizer.all_special_ids
[100, 102, 0, 101, 103]
# Surface forms of the special tokens, in the same order as all_special_ids.
tokenizer.all_special_tokens
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
# tokenizer.vocab
# Size of the tokenizer vocabulary.
tokenizer.vocab_size
21128
6.将文本转为词汇表id
- 方法1
def encode( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = False, max_length: Optional[int] = None, stride: int = 0, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs ) -> List[int]: """ Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. Args: text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids`` method). text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids`` method). """
# Encode a sentence into vocabulary ids; [CLS]/[SEP] are added by default.
text = "我在北京工作"
token_ids = tokenizer.encode(text)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102]
# encode() returns a plain Python list, not a tensor, unless return_tensors is set.
type(token_ids)
list
# Map the ids back to their surface tokens.
tokenizer.convert_ids_to_tokens(token_ids)
['[CLS]', '我', '在', '北', '京', '工', '作', '[SEP]']
padding的模式
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths).
# FIX: the original line had a stray garbled character "码" appended after the
# expression, which is a syntax error in a runnable script.
# Note: padding=True means "pad to the longest sequence in the batch", so a
# single sequence is NOT padded — the result still has only 8 ids despite
# max_length=30 (compare with padding="max_length" below).
token_ids = tokenizer.encode(text, padding=True, max_length=30, add_special_tokens=True)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102]
# padding="max_length" pads with the pad id (0) up to max_length=30.
token_ids = tokenizer.encode(text, padding="max_length", max_length=30, add_special_tokens=True)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# return_tensors="pt" yields a PyTorch tensor of shape (1, 30) instead of a list.
token_ids = tokenizer.encode(
    text,
    padding="max_length",
    max_length=30,
    add_special_tokens=True,
    return_tensors="pt",
)
token_ids
tensor([[ 101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
- 方法2 encode_plus
def encode_plus( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = False, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, **kwargs ) -> BatchEncoding:
# encode_plus additionally returns token_type_ids and the attention mask,
# packaged as a BatchEncoding (dict-like).
token_ids = tokenizer.encode_plus(
    text,
    padding="max_length",
    max_length=30,
    add_special_tokens=True,
    return_tensors="pt",
    return_token_type_ids=True,
    return_attention_mask=True,
)
token_ids
{'input_ids': tensor([[ 101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
7.模型加载
# Load the bare BERT encoder (no MLM/NSP heads) — this is why the following
# "Some weights ... were not used" warning is expected.
model=AutoModel.from_pretrained(MODEL_NAME)
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
# Display the full module tree of the loaded encoder.
model
BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(21128, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, 
inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): 
BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) 
(output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) 
(key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) 
(dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) )
# Forward pass. NOTE: the 2nd positional parameter of BertModel.forward is
# attention_mask (token_type_ids is the 3rd), so the active call below is correct.
# outputs=model(token_ids['input_ids'],token_ids['token_type_ids'])
outputs = model(token_ids['input_ids'], token_ids['attention_mask'])
# outputs=model(token_ids['input_ids'],token_ids['attention_mask'],token_ids['token_type_ids'])
outputs
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2670, -0.0858, 0.2122, ..., -0.0070, 0.9425, -0.3466], [ 0.5193, -0.3700, 0.4482, ..., -1.0237, 0.7864, -0.1775], [-0.1792, -0.7018, 1.0653, ..., -0.3034, 1.0692, 0.0429], ..., [-0.0568, -0.1166, 0.2944, ..., -0.1114, 0.0260, -0.2406], [-0.2842, 0.0047, 0.4074, ..., -0.0445, -0.1530, -0.2477], [ 0.0038, -0.0741, 0.2955, ..., -0.2048, 0.0951, -0.2106]]], grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[ 0.9986, 0.9999, 0.9988, 0.9545, -0.6417, 0.5586, 0.3451, 0.6832, 0.9936, -0.9965, 1.0000, 0.9999, 0.0969, -0.9015, 0.9994, -0.9996, -0.0634, 1.0000, 0.9828, 0.5460, 0.9992, -1.0000, -0.9602, -0.9486, -0.8842, 0.9878, 0.9769, 0.0949, -0.9995, 0.9895, 0.9659, 0.9994, 0.9980, -0.9999, -0.9976, 0.5098, -0.7977, 0.9948, -0.7914, -0.9849, -0.9965, -0.5981, 0.3857, -0.9975, -0.9579, 0.4100, -1.0000, -1.0000, 0.8568, 0.9991, -0.1765, -1.0000, 0.9296, -0.9318, 0.8056, 0.9725, -0.9998, 0.8912, 1.0000, 0.3592, 0.9997, -0.7306, -0.6022, -0.9998, 1.0000, -0.9999, -0.9528, 0.1521, 0.9995, 1.0000, -0.9858, 0.4340, 1.0000, 0.9561, -0.7498, 0.9997, -0.9917, 0.6525, -1.0000, -0.5290, 1.0000, 0.9993, -0.9347, 0.8421, -0.9891, -0.9999, -0.9998, 0.9999, -0.5783, 0.8760, 0.9945, -0.9977, -1.0000, 0.9980, -0.9983, -0.9987, -0.8752, 0.9981, -0.3967, -0.8975, -0.5133, 0.9742, -0.9992, -0.9991, 0.9994, 0.9994, 0.7277, -0.9995, 0.9999, 0.7907, -1.0000, -0.9462, -1.0000, 0.2198, -0.9616, 0.9996, 0.4455, -0.3929, 0.9995, -0.9991, 0.7031, -0.9999, -0.7935, -0.9974, 0.9999, 0.9999, 0.9985, -0.9997, 0.9998, 1.0000, 0.9194, 0.9896, -0.9930, 0.9952, -0.1052, -0.9834, 0.7176, -0.9664, 1.0000, 0.9652, 0.9823, -0.9853, 0.9957, -0.9980, 0.9999, -1.0000, 0.9945, -1.0000, -0.9994, 0.9953, 0.9923, 1.0000, -0.7978, 0.9999, -0.9812, -0.9999, 0.9990, -0.0079, 0.9991, -0.9999, 0.9872, 0.8773, -0.8599, 0.7851, -1.0000, 0.9999, -0.8774, 1.0000, 0.9998, -0.8900, -0.9732, -0.9988, 0.9746, -0.9995, -0.9984, 0.9864, -0.3062, 0.9885, 
-0.9927, -0.9211, 0.7024, -0.8854, -0.9998, 0.9979, -0.1070, -0.2068, 0.6250, 0.8880, 0.9973, 0.9898, -0.7060, 0.9999, -0.0964, 0.9962, 0.9989, -0.0794, -0.7561, -0.9706, -1.0000, 0.3083, 0.9999, -0.7450, -0.9987, 0.9098, -1.0000, 0.9353, -0.2246, 0.5185, -0.9900, -0.9999, 0.9999, -0.9718, -0.9958, 0.6067, -0.9118, 0.3253, -1.0000, 0.9202, 0.9909, -0.8688, 0.5344, -0.7166, -0.9953, 0.9309, -0.8199, 0.9348, 0.9977, 1.0000, 0.9804, -0.7467, -0.9335, 1.0000, 0.5077, -1.0000, 0.5815, -0.7935, -0.7349, 0.9998, -0.9990, 0.9095, 1.0000, 0.9921, 1.0000, -0.2125, -0.9989, -0.9970, 1.0000, 0.9978, 0.9998, -0.9985, -0.9991, 0.6060, -0.1385, -1.0000, -0.9962, -0.8801, 0.9911, 1.0000, 0.2897, -0.9998, -0.2624, -0.9993, 1.0000, -0.8487, 1.0000, 0.9556, -0.8725, -0.9962, 0.8722, -0.5077, -0.9997, -0.2779, -0.9996, -0.9924, -0.9999, 0.9055, -0.9990, -1.0000, 0.8632, 0.9999, 0.9105, -0.9998, 0.9996, 0.9957, -0.9611, -0.9996, 0.9823, -1.0000, 1.0000, -0.9969, 0.6207, -0.0030, -0.9880, -0.8604, 0.9991, 0.9997, -0.9974, -0.9256, -0.8272, -0.9999, -0.7311, 0.8521, 0.0231, 0.7647, -0.9838, -0.9336, 0.8415, -0.9954, -0.9999, -0.9192, 1.0000, -0.4956, 1.0000, 0.4524, 1.0000, 0.9832, -0.9993, 0.9930, 0.8250, -0.5943, -0.7908, -0.9861, 0.8129, 0.2001, 0.5161, -0.9995, 0.9997, 0.9983, 0.9893, 0.9763, 0.3462, -0.4559, 0.9393, -0.9982, 0.9976, -0.9996, -0.7520, 0.9971, 0.9999, 0.9999, 0.7595, -0.8876, 0.9727, -0.9980, 0.9970, -0.9974, 0.9985, -0.9960, 0.9693, -0.7504, -0.9917, 1.0000, 0.9545, -0.6712, 1.0000, -0.9418, 0.9384, 0.9999, 0.9206, 0.9717, 0.6311, 0.9999, -0.9986, -0.9966, -0.9973, -0.9944, -0.9988, -1.0000, 0.4478, -0.9976, -0.9626, -0.9599, 0.5757, -0.0107, -0.7348, 0.0048, 0.0723, 0.8022, -0.9708, 0.2892, 0.9310, -0.9980, -0.9384, -1.0000, -0.9981, 0.9888, 0.9992, -0.9997, 0.9997, -1.0000, -0.9987, 0.9901, 0.2053, -0.5843, 0.9998, -0.9999, 0.9686, 1.0000, 1.0000, 0.9991, 0.9997, -0.9751, -0.9999, -0.9994, -0.9999, -1.0000, -0.9994, 0.7674, 0.7939, -1.0000, -0.9327, 0.9427, 1.0000, 
0.9453, -0.9987, 0.8275, -0.9995, -0.9830, 0.9995, -0.6096, -0.9989, 0.9999, -0.1734, 1.0000, -0.8638, 0.9956, 0.9765, 0.7885, 0.9677, -1.0000, 0.7434, 1.0000, 0.5149, -0.9999, -0.5679, -0.9572, -1.0000, -0.1615, 0.9307, 0.9999, -0.9999, -0.6308, -0.9919, 0.3437, 0.9118, 0.9999, 0.9988, 0.8609, 0.3412, 0.9425, 0.1690, 0.9997, 0.4484, -0.9968, 0.9974, -0.2034, 0.5577, -1.0000, 0.9962, 0.4399, 0.9999, 0.9959, 0.6560, -0.9489, -0.9596, 0.9954, 1.0000, -0.9612, 0.9706, -0.9990, -1.0000, -0.9989, -0.0476, -0.7789, -0.9785, -0.9992, 0.8798, 0.9559, 1.0000, 0.9999, 0.9957, -0.7819, -0.9561, 0.9869, 0.0119, 0.9998, -0.7133, -1.0000, -0.9949, -0.9999, 0.9996, -0.9068, -0.9097, -0.9300, -0.3992, 0.8845, -0.9999, -0.8416, -0.9979, 0.4116, 1.0000, -0.9875, 0.9986, -0.9986, -0.0395, 0.7331, 0.9024, 0.9995, -0.5490, -0.6971, -0.7122, 0.8567, 0.9874, 0.9989, -0.9868, 0.8329, 0.9981, -0.9835, 0.9991, 0.6488, 0.7209, 0.9834, 1.0000, 0.3964, 0.9979, 0.8983, 0.9999, 0.9999, -0.9403, 0.6022, 0.8283, -0.8373, -0.1218, 0.9771, 0.9999, 0.6683, -0.9757, -0.9997, 0.9984, 0.9961, 1.0000, 0.7415, 0.9946, -0.5225, 0.9588, 0.8054, 0.7780, 0.1452, 0.4877, 0.9282, 0.9990, -1.0000, -1.0000, -1.0000, 1.0000, 0.9999, -0.6069, -1.0000, 0.9994, -0.6409, 0.9728, 0.9938, 0.4333, -0.8666, 0.9610, -0.9995, -0.0485, 0.2587, 0.3155, 0.7848, 0.9992, -0.9998, -0.6526, 1.0000, 0.0809, 0.9999, 0.4925, -0.9816, 0.9979, -0.9703, -0.9998, -0.9115, 0.9998, 0.9994, -0.6118, -0.3596, 0.9993, -0.9996, 0.9999, -0.9999, 0.8994, -0.9990, 0.9999, -0.9854, -0.9989, -0.5286, 0.1115, 0.9979, -0.5575, 0.9999, -0.7099, -0.9667, -0.4315, -0.9133, -0.9996, -0.9925, 0.1584, -0.9999, 0.8137, -0.6677, -0.1643, -0.9849, -0.9998, 0.9999, -0.8938, -0.9912, 0.9999, -0.9979, -1.0000, 0.7306, -0.9942, -0.5475, 0.9840, 0.6176, 0.4018, -1.0000, 0.5113, 0.9995, -0.9994, -0.9433, -0.9860, -0.9887, 0.2204, 0.9866, 0.9670, -0.0998, 0.3975, -0.2984, 0.8272, 0.6054, 0.4439, -0.9957, -0.9461, -0.9810, -0.9991, -0.9991, -0.9999, 1.0000, 0.9998, 
0.9999, 0.7350, -0.8119, 0.7291, 0.9982, -0.9996, -0.5762, 0.7971, 0.9614, -0.5536, -0.9997, -0.6525, -1.0000, -0.6792, -0.2272, -0.9713, 0.5998, 1.0000, 0.9999, -0.9997, -0.9987, -0.9992, -0.9950, 0.9997, 0.9985, 0.9994, -0.8969, -0.7800, 0.9759, 0.1705, -0.1565, -0.9984, -0.9964, -0.9998, 0.7632, -0.9969, -0.9995, 0.9998, 0.9996, 0.6582, -0.9999, -0.8716, 0.9998, 0.9992, 1.0000, 0.9521, 0.9998, -0.9946, 0.9990, -0.9999, 1.0000, -1.0000, 1.0000, 1.0000, 0.9882, 0.9990, -0.9772, 0.9573, 0.1751, -0.3707, 0.9621, -0.6839, -0.9851, 0.8741, 0.9970, -0.8459, 1.0000, 0.9007, 0.6603, 0.5746, 0.9532, 0.8250, -0.2215, -0.9997, 0.9931, 0.9996, 0.9992, 1.0000, 0.9771, 0.9999, -0.9744, -0.9995, 0.9920, -0.8009, 0.5036, -0.9990, 0.9999, 1.0000, -0.9989, -0.7738, 0.5874, 0.4543, 0.9999, 0.9995, 0.9998, 0.9350, -0.1731, 0.9999, -0.9996, 0.4741, -0.9847, -0.9183, 1.0000, -0.8106, 0.9993, -0.9607, 1.0000, -0.9767, 0.9727, 0.9982, 0.9664, -0.9946, 1.0000, 0.0554, -0.9968, -0.9992, -0.9921, -0.9907, 0.8621]],
# Per-token embeddings: shape (batch, seq_len, hidden_size).
last_hidden_state = outputs[0]
last_hidden_state.shape
torch.Size([1, 30, 768])
outputs[1].shape # pooler_output # pooled representation of the whole sentence
torch.Size([1, 768])
# Embedding of the first token ([CLS]): shape (batch, hidden_size).
cls_embeddings = last_hidden_state[:, 0]
cls_embeddings.shape
torch.Size([1, 768])
8.对Bert输出进行变换
# Inspect the configuration before modifying it below.
config
BertConfig { "architectures": [ "BertForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "directionality": "bidi", "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "pooler_fc_size": 768, "pooler_num_attention_heads": 12, "pooler_num_fc_layers": 3, "pooler_size_per_head": 128, "pooler_type": "first_token_transform", "position_embedding_type": "absolute", "transformers_version": "4.6.0", "type_vocab_size": 2, "use_cache": true, "vocab_size": 21128 }
# Ask the model to also return every intermediate hidden state.
config.update({
    'output_hidden_states': True
})
# Reload the encoder with the updated configuration.
model = AutoModel.from_pretrained(MODEL_NAME, config=config)
# BUG FIX: the original call passed token_ids['token_type_ids'] as the SECOND
# positional argument, but BertModel.forward's second parameter is
# attention_mask — the segment ids were silently used as the attention mask.
# Pass both tensors by keyword so each lands on the right parameter.
outputs = model(
    token_ids['input_ids'],
    attention_mask=token_ids['attention_mask'],
    token_type_ids=token_ids['token_type_ids'],
)
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias'] - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
# With output_hidden_states=True the output gains a 'hidden_states' entry.
outputs.keys()
odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])
# Per-token embeddings of the final layer: (batch, seq_len, hidden_size).
outputs['last_hidden_state'].shape
torch.Size([1, 30, 768])
# Sentence-level pooled output: (batch, hidden_size).
outputs['pooler_output'].shape
torch.Size([1, 768])
# 13 states: the embedding-layer output plus one per encoder layer (12 here).
len(outputs['hidden_states'])
13
# The last entry of hidden_states equals last_hidden_state: (batch, seq_len, hidden_size).
outputs['hidden_states'][-1].shape
torch.Size([1, 30, 768])