Competition link: https://www.kaggle.com/c/tweet-sentiment-extraction/overview
Background
"My ridiculous dog is amazing." [sentiment: positive]
With tweets circulating every second, it is hard to tell whether the sentiment behind a specific tweet will help a company's or a person's brand go viral (positive), or devastate profit because it strikes a negative tone. Capturing sentiment in language matters when decisions and reactions are created and updated within seconds. But which words actually convey the sentiment? In this competition, you need to pick out the part of the tweet (the word or phrase) that reflects its sentiment.
This broad dataset of tweets helps you build your skills in this important area. Which words in a tweet support a positive, negative, or neutral sentiment, and how can machine learning tools help make that determination?
Task
In this competition, the organizers extracted support phrases from Figure Eight's Data for Everyone platform. The dataset is titled "Sentiment Analysis: Emotion in Text" and consists of tweets with existing sentiment labels. Simply put, our goal is to extract the span of text that supports the sentiment.
Predictions are submitted in the following format:
<id>,"<word or phrase that supports the sentiment>"
For example:
2,"very good" 5,"I am neutral about this" 6,"bad" 8,"if you say so!" etc.
Data Analysis
The training set contains the tweet text, the selected (supporting) text, and the sentiment label:
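For reference, a minimal sketch of loading the data, assuming the standard Kaggle input layout for this competition:

import pandas as pd

# assumed standard Kaggle paths for this competition
train = pd.read_csv("../input/tweet-sentiment-extraction/train.csv")
test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
print(train.head())  # columns: textID, text, selected_text, sentiment
print(train.shape, test.shape)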
Sentiment label distribution
import plotly.graph_objects as go

# temp: per-sentiment counts computed earlier in the notebook
# (one row per sentiment, with the number of tweets in the 'text' column)
fig = go.Figure(go.Funnelarea(
    text=temp.sentiment,
    values=temp.text,
    title={"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
))
fig.show()
Jaccard similarity between the selected text and the original text
def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

results_jaccard = []
for ind, row in train.iterrows():
    sentence1 = row.text
    sentence2 = row.selected_text
    jaccard_score = jaccard(sentence1, sentence2)
    results_jaccard.append([sentence1, sentence2, jaccard_score])

# use a separate name for the DataFrame so it does not shadow the jaccard() function
jaccard_df = pd.DataFrame(results_jaccard, columns=["text", "selected_text", "jaccard_score"])
train = train.merge(jaccard_df, how='outer')

train['Num_words_ST'] = train['selected_text'].apply(lambda x: len(str(x).split()))  # number of words in selected text
train['Num_word_text'] = train['text'].apply(lambda x: len(str(x).split()))          # number of words in the main text
train['difference_in_words'] = train['Num_word_text'] - train['Num_words_ST']        # difference between the two counts
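Using the jaccard function defined above, a quick sanity check (values worked out by hand):

# selected_text is a subset of text, so the score is |overlap| / |union| = 3 / 5
print(jaccard("I am neutral about this", "I am neutral"))  # 0.6
print(jaccard("very good", "very good"))                   # 1.0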
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
p1 = sns.kdeplot(train[train['sentiment'] == 'positive']['jaccard_score'],
                 shade=True, color="b").set_title('KDE of Jaccard Scores across different Sentiments')
p2 = sns.kdeplot(train[train['sentiment'] == 'negative']['jaccard_score'],
                 shade=True, color="r")
plt.legend(labels=['positive', 'negative'])
Most common words in the selected text:
import plotly.express as px

# temp: word-frequency table built earlier in the notebook
# (a 'Common_words' column and a 'count' column)
fig = px.treemap(temp, path=['Common_words'], values='count',
                 title='Tree of Most Common Words')
fig.show()
Most common words in positive tweets:
EDA notebook reference: https://www.kaggle.com/tanulsingh077/twitter-sentiment-extaction-analysis-eda-and-model
Most common words in neutral tweets:
Most common words in negative tweets:
Preprocessing reference: https://www.kaggle.com/parulpandey/eda-and-preprocessing-for-bert
First-Place Solution
Write-up 1: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/159477
Write-up 2: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/159264
Part 1: A family of pre-trained Transformer models
Models: RoBERTa-base-squad2, RoBERTa-large-squad2, DistilRoBERTa-base, XLNet-base-cased
The model structure is as follows:
import torch
import transformers

import config


class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(
            config.MODEL_CONFIG, config=conf)
        self.high_dropout = torch.nn.Dropout(config.HIGH_DROPOUT)
        self.classifier = torch.nn.Linear(config.HIDDEN_SIZE * 2, 2)
        torch.nn.init.normal_(self.classifier.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        # sequence_output of N_LAST_HIDDEN + Embedding states
        # (N_LAST_HIDDEN + 1, batch_size, num_tokens, 768)
        _, _, out = self.roberta(ids, attention_mask=mask,
                                 token_type_ids=token_type_ids)

        out = torch.stack(
            tuple(out[-i - 1] for i in range(config.N_LAST_HIDDEN)), dim=0)
        out_mean = torch.mean(out, dim=0)
        out_max, _ = torch.max(out, dim=0)
        out = torch.cat((out_mean, out_max), dim=-1)

        # Multisample Dropout: https://arxiv.org/abs/1905.09788
        logits = torch.mean(torch.stack([
            self.classifier(self.high_dropout(out))
            for _ in range(5)
        ], dim=0), dim=0)

        start_logits, end_logits = logits.split(1, dim=-1)

        # (batch_size, num_tokens)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
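The model only returns start and end logits. As a hedged sketch of the usual QA-style way to train and decode such a model (the generic approach; the authors' exact loss differs, see the custom loss note at the end):

import torch
import torch.nn as nn


def qa_loss(start_logits, end_logits, start_positions, end_positions):
    # standard span-extraction objective: cross-entropy over token positions
    ce = nn.CrossEntropyLoss()
    return ce(start_logits, start_positions) + ce(end_logits, end_positions)


def decode_span(start_logits, end_logits, offsets, text):
    # greedy decoding for one example: most likely start/end token, mapped back
    # to characters via the tokenizer's offset mapping (assumed to be available)
    start_idx = int(torch.argmax(start_logits))
    end_idx = int(torch.argmax(end_logits))
    if end_idx < start_idx:  # guard against inverted spans
        end_idx = start_idx
    return text[offsets[start_idx][0]:offsets[end_idx][1]]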
Part 2: Character-level NN models
Code: https://www.kaggle.com/theoviel/character-level-model-magic
- RNN model
import torch
import torch.nn as nn


class TweetCharModel(nn.Module):
    def __init__(self, len_voc, use_msd=True,
                 embed_dim=64, lstm_dim=64, char_embed_dim=32, sent_embed_dim=32,
                 ft_lstm_dim=32, n_models=1):
        super().__init__()
        self.use_msd = use_msd

        self.char_embeddings = nn.Embedding(len_voc, char_embed_dim)
        self.sentiment_embeddings = nn.Embedding(3, sent_embed_dim)

        self.proba_lstm = nn.LSTM(n_models * 2, ft_lstm_dim,
                                  batch_first=True, bidirectional=True)

        self.lstm = nn.LSTM(char_embed_dim + ft_lstm_dim * 2 + sent_embed_dim, lstm_dim,
                            batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(lstm_dim * 2, lstm_dim,
                             batch_first=True, bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 4, lstm_dim),
            nn.ReLU(),
            nn.Linear(lstm_dim, 2),
        )

        self.high_dropout = nn.Dropout(p=0.5)

    def forward(self, tokens, sentiment, start_probas, end_probas):
        bs, T = tokens.size()

        probas = torch.cat([start_probas, end_probas], -1)
        probas_fts, _ = self.proba_lstm(probas)

        char_fts = self.char_embeddings(tokens)

        sentiment_fts = self.sentiment_embeddings(sentiment).view(bs, 1, -1)
        sentiment_fts = sentiment_fts.repeat((1, T, 1))

        features = torch.cat([char_fts, sentiment_fts, probas_fts], -1)
        features, _ = self.lstm(features)
        features2, _ = self.lstm2(features)
        features = torch.cat([features, features2], -1)

        if self.use_msd and self.training:
            logits = torch.mean(
                torch.stack(
                    [self.logits(self.high_dropout(features)) for _ in range(5)],
                    dim=0,
                ),
                dim=0,
            )
        else:
            logits = self.logits(features)

        start_logits, end_logits = logits[:, :, 0], logits[:, :, 1]

        return start_logits, end_logits
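A quick, purely illustrative shape check of TweetCharModel with dummy inputs (the vocabulary size, batch size, and sequence length below are made up; one level-1 model feeds the probabilities):

model = TweetCharModel(len_voc=300, n_models=1)

tokens = torch.randint(0, 300, (2, 150))     # character ids for 2 tweets of 150 chars
sentiment = torch.randint(0, 3, (2,))        # sentiment ids (0/1/2)
start_probas = torch.rand(2, 150, 1)         # char-level probas from level-1 models
end_probas = torch.rand(2, 150, 1)

start_logits, end_logits = model(tokens, sentiment, start_probas, end_probas)
print(start_logits.shape, end_logits.shape)  # torch.Size([2, 150]) twice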
- CNN model
# ConvBlock (a 1-D conv block) comes from the linked notebook and is not repeated here
class ConvNet(nn.Module):
    def __init__(self, len_voc, use_msd=True,
                 cnn_dim=64, char_embed_dim=32, sent_embed_dim=32,
                 proba_cnn_dim=32, n_models=1, kernel_size=3, use_bn=False):
        super().__init__()
        self.use_msd = use_msd

        self.char_embeddings = nn.Embedding(len_voc, char_embed_dim)
        self.sentiment_embeddings = nn.Embedding(3, sent_embed_dim)

        self.probas_cnn = ConvBlock(n_models * 2, proba_cnn_dim,
                                    kernel_size=kernel_size, use_bn=use_bn)

        self.cnn = nn.Sequential(
            ConvBlock(char_embed_dim + sent_embed_dim + proba_cnn_dim, cnn_dim,
                      kernel_size=kernel_size, use_bn=use_bn),
            ConvBlock(cnn_dim, cnn_dim * 2, kernel_size=kernel_size, use_bn=use_bn),
            ConvBlock(cnn_dim * 2, cnn_dim * 4, kernel_size=kernel_size, use_bn=use_bn),
            ConvBlock(cnn_dim * 4, cnn_dim * 8, kernel_size=kernel_size, use_bn=use_bn),
        )

        self.logits = nn.Sequential(
            nn.Linear(cnn_dim * 8, cnn_dim),
            nn.ReLU(),
            nn.Linear(cnn_dim, 2),
        )

        self.high_dropout = nn.Dropout(p=0.5)

    def forward(self, tokens, sentiment, start_probas, end_probas):
        bs, T = tokens.size()

        probas = torch.cat([start_probas, end_probas], -1).permute(0, 2, 1)
        probas_fts = self.probas_cnn(probas).permute(0, 2, 1)

        char_fts = self.char_embeddings(tokens)

        sentiment_fts = self.sentiment_embeddings(sentiment).view(bs, 1, -1)
        sentiment_fts = sentiment_fts.repeat((1, T, 1))

        x = torch.cat([char_fts, sentiment_fts, probas_fts], -1).permute(0, 2, 1)
        features = self.cnn(x).permute(0, 2, 1)  # [Bs x T x nb_ft]

        if self.use_msd and self.training:
            logits = torch.mean(
                torch.stack(
                    [self.logits(self.high_dropout(features)) for _ in range(5)],
                    dim=0,
                ),
                dim=0,
            )
        else:
            logits = self.logits(features)

        start_logits, end_logits = logits[:, :, 0], logits[:, :, 1]

        return start_logits, end_logits
- WaveNet model
# Waveblock (a dilated conv block) comes from the linked notebook and is not repeated here
class WaveNet(nn.Module):
    def __init__(self, len_voc, use_msd=True, dilations=[1],
                 cnn_dim=64, char_embed_dim=32, sent_embed_dim=32,
                 proba_cnn_dim=32, n_models=1, kernel_size=3, use_bn=True):
        super().__init__()
        self.use_msd = use_msd

        self.char_embeddings = nn.Embedding(len_voc, char_embed_dim)
        self.sentiment_embeddings = nn.Embedding(3, sent_embed_dim)

        self.probas_cnn = ConvBlock(n_models * 2, proba_cnn_dim,
                                    kernel_size=kernel_size, use_bn=use_bn)

        self.cnn = nn.Sequential(
            Waveblock(char_embed_dim + sent_embed_dim + proba_cnn_dim, cnn_dim,
                      kernel_size=kernel_size, dilations=dilations),
            nn.BatchNorm1d(cnn_dim),
            Waveblock(cnn_dim, cnn_dim * 2, kernel_size=kernel_size, dilations=dilations),
            nn.BatchNorm1d(cnn_dim * 2),
            Waveblock(cnn_dim * 2, cnn_dim * 4, kernel_size=kernel_size, dilations=dilations),
            nn.BatchNorm1d(cnn_dim * 4),
        )

        self.logits = nn.Sequential(
            nn.Linear(cnn_dim * 4, cnn_dim),
            nn.ReLU(),
            nn.Linear(cnn_dim, 2),
        )

        self.high_dropout = nn.Dropout(p=0.5)

    def forward(self, tokens, sentiment, start_probas, end_probas):
        bs, T = tokens.size()

        probas = torch.cat([start_probas, end_probas], -1).permute(0, 2, 1)
        probas_fts = self.probas_cnn(probas).permute(0, 2, 1)

        char_fts = self.char_embeddings(tokens)

        sentiment_fts = self.sentiment_embeddings(sentiment).view(bs, 1, -1)
        sentiment_fts = sentiment_fts.repeat((1, T, 1))

        x = torch.cat([char_fts, sentiment_fts, probas_fts], -1).permute(0, 2, 1)
        features = self.cnn(x).permute(0, 2, 1)  # [Bs x T x nb_ft]

        if self.use_msd and self.training:
            logits = torch.mean(
                torch.stack(
                    [self.logits(self.high_dropout(features)) for _ in range(5)],
                    dim=0,
                ),
                dim=0,
            )
        else:
            logits = self.logits(features)

        start_logits, end_logits = logits[:, :, 0], logits[:, :, 1]

        return start_logits, end_logits
Part 3: Model ensembling via stacking
The Transformer models act as first-level (base) learners; their predictions are then fed into the three character-level models above, which learn on top of them as second-level learners (a rough sketch follows below).
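A rough sketch of that two-level setup, assuming the out-of-fold start/end probabilities from the transformers have already been mapped to the character level (the function below and its shapes are illustrative, not the authors' code):

import torch

def char_level_ensemble(char_models, tokens, sentiment, start_probas, end_probas):
    # tokens:            (bs, n_chars)            character ids of each tweet
    # sentiment:         (bs,)                     sentiment ids (0/1/2)
    # start/end_probas:  (bs, n_chars, n_models)   char-level probabilities from
    #                                              the level-1 transformer models
    starts, ends = [], []
    for model in char_models:  # e.g. the RNN, CNN and WaveNet models above
        model.eval()
        with torch.no_grad():
            s, e = model(tokens, sentiment, start_probas, end_probas)
        starts.append(torch.softmax(s, dim=-1))
        ends.append(torch.softmax(e, dim=-1))

    # simple averaging of the level-2 (character-level) models
    start_probs = torch.stack(starts).mean(0)
    end_probs = torch.stack(ends).mean(0)
    return start_probs.argmax(-1), end_probs.argmax(-1)  # char indices of the span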
Highlights
The authors also designed a custom loss function based on the Jaccard metric; interested readers can dig into the details here:
https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/159477
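The exact formulation is described in the thread above. Purely as an illustration of what a Jaccard-aware objective can look like, here is one generic "Jaccard-smoothed soft label" sketch (my own assumption, not the authors' loss), shown for the start position only:

import torch
import torch.nn.functional as F


def start_soft_targets(gold_start, gold_end, n_tokens, alpha=0.3):
    # Score each candidate start i by the token-level Jaccard between the span
    # [i, gold_end] and the gold span [gold_start, gold_end], then mix these
    # Jaccard-smoothed scores with the usual one-hot target.
    scores = torch.zeros(n_tokens)
    for i in range(n_tokens):
        if i > gold_end:
            continue  # inverted / empty span gets zero weight
        inter = gold_end - max(i, gold_start) + 1
        union = gold_end - min(i, gold_start) + 1
        scores[i] = inter / union
    hard = torch.zeros(n_tokens)
    hard[gold_start] = 1.0
    return (1 - alpha) * hard + alpha * scores / scores.sum()


def soft_cross_entropy(logits, soft_targets):
    # cross-entropy against soft (non one-hot) targets, averaged over the batch
    return -(soft_targets * F.log_softmax(logits, dim=-1)).sum(-1).mean()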