求助，关于注意力机制与mask的问题

在这段代码里，attention-LSTM 模型先用 torch.nn.LSTM 进行 RNN 处理，然后用 LayerNorm 对 Mish 激活后的 LSTM 输出进行归一化处理。但是 LayerNorm 不同于 BatchNorm，它是将每个样本的所有特征进行归一化处理，那么在接下去的 attention 层中，mask 应该怎么进行？
又或者说，在归一化处理后，pad 的地方已经不为零了，那怎么使用 torch.sign(torch.abs(features).sum(dim=-1)) 来获得 mask 的 index？

部分代码在此处：
class AttLSTMModel(nn.Module):
    """Text classifier: Embedding -> LSTM -> Mish -> LayerNorm -> attention -> Linear.

    The attention output is flattened over all timesteps before the final
    linear layer, whose input width is ``hidden_dim * in_times``; inputs must
    therefore always contain exactly ``in_times`` tokens per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        in_times: Fixed (padded) sequence length fed to the model.
        dropout: Dropout probability applied before the final linear layer.
        pad_idx: Token id used for padding; forwarded to ``nn.Embedding`` as
            ``padding_idx``.
        hidden_dim: LSTM hidden size.
        n_layer: Number of stacked LSTM layers.
        n_class: Number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, in_times, dropout, pad_idx,
                 hidden_dim, n_layer, n_class):
        super(AttLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx=pad_idx)
        # Recurrent layer.
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, n_layer,
                                  batch_first=True)
        # Fully connected output layer over the flattened sequence.
        self.Linear = torch.nn.Linear(hidden_dim * in_times, n_class)
        self.attention = AttentionSeq(hidden_dim, hard=0.03)
        self.last_norm = torch.nn.LayerNorm(hidden_dim)
        self.drop_p = dropout
        # NOTE(review): Mish is not defined in this snippet; it is expected to
        # be provided elsewhere in the file.
        self.mish = Mish()

    def forward(self, text):
        """Compute class scores for a batch of token-id sequences.

        Args:
            text: Integer tensor of shape ``[sent len, batch size]``.

        Returns:
            The output of the final linear layer, one row per batch element.
        """
        text = text.permute(1, 0)          # -> [batch size, sent len]
        embedded = self.embedding(text)    # -> [batch size, sent len, emb dim]
        t, _ = self.lstm(embedded)
        t = self.last_norm(self.mish(t))

        # The attention layer recognises padding by looking for timesteps
        # whose feature vector is entirely zero. After the LSTM, Mish and
        # LayerNorm the padded timesteps are no longer zero, so that test
        # would never fire and padding would receive attention weight.
        # Re-zero the padded timesteps here so the mask can be recovered.
        pad_idx = self.embedding.padding_idx
        if pad_idx is not None:
            t = t.masked_fill((text == pad_idx).unsqueeze(-1), 0.0)

        t = self.attention(t)
        # Keep every timestep (not only the last one) and flatten them.
        t = t.reshape(t.shape[0], -1)
        t = nn.functional.dropout(t, p=self.drop_p, training=self.training)
        out = self.Linear(t)
        return out
class AttentionSeq(torch.nn.Module):
    """Per-feature attention over the time axis of a ``[batch, seq, dim]`` tensor.

    A linear layer followed by tanh produces one score per timestep and
    feature; the scores are normalised with a softmax along the time axis and
    multiplied element-wise with the input.

    Args:
        hidden_dim: Size of the last dimension of the input features.
        hard: When non-zero, attention weights that are not strictly greater
            than this threshold are set to zero ("hard" attention).
    """

    # Score assigned to masked timesteps before the softmax. It is very
    # negative rather than zero so that the softmax weight becomes 0.
    _MASKED_SCORE = -2 ** 32 + 1

    def __init__(self, hidden_dim, hard=0):
        super(AttentionSeq, self).__init__()
        self.hidden_dim = hidden_dim
        self.dense = torch.nn.Linear(hidden_dim, hidden_dim)
        self.hard = hard

    def forward(self, features, mean=False, mask=None):
        """Re-weight ``features`` with attention computed along the time axis.

        Args:
            features: Tensor of shape ``[batch, seq, dim]``.
            mean: If true, average the attention weights over the feature
                dimension so every feature of a timestep shares one weight.
            mask: Optional ``[batch, seq]`` tensor; non-zero / ``True`` marks
                timesteps that may be attended to. When omitted, a timestep is
                treated as padding if its feature vector is entirely zero.
                That fallback only works if the caller has kept padded
                positions at zero, which normalisation layers or recurrent
                layers applied beforehand generally do not.

        Returns:
            Tensor with the same shape as ``features``.

        Raises:
            ValueError: If ``mask`` is given with a shape other than
                ``[batch, seq]``.
        """
        batch_size, time_step, hidden_dim = features.size()
        scores = torch.tanh(self.dense(features))          # [batch, seq, dim]

        if mask is None:
            valid = features.abs().sum(dim=-1) > 0
        else:
            if tuple(mask.shape) != (batch_size, time_step):
                raise ValueError(
                    "mask must have shape [batch, seq] = "
                    f"[{batch_size}, {time_step}], got {list(mask.shape)}"
                )
            valid = mask.to(device=features.device, dtype=torch.bool)

        scores = scores.masked_fill(~valid.unsqueeze(-1), self._MASKED_SCORE)

        # Normalise over the time axis independently for every feature.
        weight = torch.softmax(scores, dim=1)

        if self.hard != 0:  # hard mode
            weight = torch.where(weight > self.hard, weight,
                                 torch.zeros_like(weight))

        if mean:
            # One shared weight per timestep, broadcast over the features.
            weight = weight.mean(dim=2, keepdim=True)

        features_attention = weight * features
        return features_attention

扫二维码下载贴吧客户端

下载贴吧APP
看高清直播、视频！

1回复贴，共1页

<<返回自然语言处理吧

分享到:

日	一	二	三	四	五	六