The formula is fairly simple: softmax(q * k^T / sqrt(d_k)) * v.
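In standard notation, the same scaled dot-product attention formula reads:

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
$$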
Here is a coding template worth keeping around; remember to add the mask and the dropout.
For multi-head attention, d_model is split into several heads, the sequence_length and n_head dimensions are swapped so self-attention is computed per head, the resulting tensor is then restored to its original layout, and finally it goes through a linear layer as the output.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

def SelfAttention(q, k, v, mask=None, dropout=None):
    # scaled dot-product attention: softmax(q @ k^T / sqrt(d_k)) @ v
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        # fill masked positions with a large negative value so softmax drives them to ~0
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)
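A quick shape sanity check for the function above; the tensor sizes here are arbitrary, purely for illustration:

q = k = v = torch.randn(2, 5, 8)   # (batch, sequence_length, d_k), arbitrary sizes
out = SelfAttention(q, k, v)
print(out.shape)                   # torch.Size([2, 5, 8])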
class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, d_model, dropout=0.1):
        super().__init__()
        assert (d_model % n_head) == 0
        self.n_head = n_head
        self.d_k = d_model // n_head
        # four linear projections, each keeping the model dimension d_model unchanged
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, q, k, v, mask=None):
        n_batch = q.size(0)
        # Project q, k, v, split d_model into n_head heads, then transpose dims 1 and 2
        # so that self-attention is computed per head on (sequence_length, d_k) matrices;
        # the shape is restored after the attention step.
        q = self.w_q(q).view(n_batch, -1, self.n_head, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(n_batch, -1, self.n_head, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(n_batch, -1, self.n_head, self.d_k).transpose(1, 2)
        # q, k, v are now 4-D, so give the mask an extra head dimension to match
        if mask is not None:
            mask = mask.unsqueeze(1)
        atten_scores = SelfAttention(q, k, v, mask, self.dropout)
        # The attention output is 4-D with n_head and sequence_length swapped,
        # so swap them back and merge the heads before the final linear layer.
        atten_scores = atten_scores.transpose(1, 2).contiguous().view(n_batch, -1, self.n_head * self.d_k)
        return self.w_o(atten_scores)
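A minimal usage sketch, assuming the constructor takes (n_head, d_model, dropout) as written above; the sizes and the padding-style mask of shape (batch, 1, sequence_length) are arbitrary choices for illustration:

mha = MultiHeadAttention(n_head=8, d_model=512, dropout=0.1)
x = torch.randn(2, 10, 512)                        # (batch, sequence_length, d_model), arbitrary sizes
pad_mask = torch.ones(2, 1, 10, dtype=torch.bool)  # 1 = attend, 0 = masked out
out = mha(x, x, x, mask=pad_mask)
print(out.shape)                                   # torch.Size([2, 10, 512])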