# transformer_relative.py: Transformer with relative position encoding
# (from the LEVSONGSW/DeepLearnLog repository on Gitee, commit "Relative Position Encode")
# %%
import torch
import torch.nn as nn
import math
from positionalEncoding import RelativePosition
from torch.nn.functional import log_softmax
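# %%
# NOTE: `RelativePosition` comes from the local `positionalEncoding` module, which is not
# part of this file. From how it is used below (called as module(len_q, len_k) and consumed
# as a (len_q, len_k, d_k) tensor), it is presumably a Shaw-et-al.-style learned embedding
# over clipped relative distances. A minimal sketch under that assumption; the actual
# module may differ:
class RelativePositionSketch(nn.Module):
    def __init__(self, d_k, max_relative_position):
        super().__init__()
        self.max_relative_position = max_relative_position
        # one learned d_k-dimensional vector per clipped relative distance in [-max, +max]
        self.embeddings = nn.Parameter(torch.empty(2 * max_relative_position + 1, d_k))
        nn.init.xavier_uniform_(self.embeddings)
    def forward(self, length_q, length_k):
        # matrix of relative distances, clipped and shifted into [0, 2*max]
        distance = torch.arange(length_k)[None, :] - torch.arange(length_q)[:, None]
        distance = distance.clamp(-self.max_relative_position, self.max_relative_position)
        index = distance + self.max_relative_position
        # result shape: (length_q, length_k, d_k)
        return self.embeddings[index]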
# %%
class Embeddings(nn.Module):
def __init__(self, vocab_size, d_model) -> None:
super(Embeddings, self).__init__()
self.embed = nn.Embedding(vocab_size, d_model)
self.d_model = d_model
def forward(self, x):
return self.embed(x) * math.sqrt(self.d_model)
# %%
def attentionBlock(query, key, k_basic, value, v_basic, mask=None, dropout=None):
    # query/key/value: (nbatches, nhead, ntoken, d_k)
    # k_basic/v_basic: relative-position embeddings of shape (ntoken, ntoken_k, d_k)
    nbatches, nhead, ntoken, d_k = query.size()
    _, _, ntoken_k, _ = key.size()
    # content-based attention scores: (nbatches, nhead, ntoken, ntoken_k)
    scores_origin = query @ key.transpose(-2, -1)
    # relative-position scores: fold batch and heads together, then batch-matmul each
    # query position against its row of relative key embeddings
    query_1 = query.contiguous().view(nbatches*nhead, ntoken, d_k).transpose(0, 1).contiguous()
    score_basic = torch.matmul(query_1, k_basic.transpose(1, 2)).transpose(0, 1).contiguous().view(nbatches, nhead, ntoken, ntoken_k)
    scores = (scores_origin + score_basic) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast the mask over the head dimension
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = torch.softmax(scores, dim=-1)
    if dropout is not None:
        attn = dropout(attn)
    # content-based weighted values plus relative-position weighted values
    weight_origin = attn @ value
    attn_reshape = attn.view(nbatches*nhead, ntoken, ntoken_k).transpose(0, 1).contiguous()
    weight_score = attn_reshape @ v_basic
    weight_score = weight_score.transpose(0, 1).contiguous().view(nbatches, nhead, ntoken, d_k)
    return weight_origin + weight_score, attn
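# %%
# Quick shape check for attentionBlock: a standalone sketch with random tensors, where
# k_basic / v_basic stand in for RelativePosition outputs of shape (len_q, len_k, d_k).
_nb, _nh, _nt, _dk = 2, 4, 5, 16
_q = torch.randn(_nb, _nh, _nt, _dk)
_k = torch.randn(_nb, _nh, _nt, _dk)
_v = torch.randn(_nb, _nh, _nt, _dk)
_k_basic = torch.randn(_nt, _nt, _dk)
_v_basic = torch.randn(_nt, _nt, _dk)
_out, _attn = attentionBlock(_q, _k, _k_basic, _v, _v_basic)
assert _out.shape == (_nb, _nh, _nt, _dk) and _attn.shape == (_nb, _nh, _nt, _nt)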
# %%
class attention(nn.Module):
'''
@params:
        h: number of attention heads
        d_model: embedding output dimension; also the input dimension of the Q/K/V projections
'''
def __init__(self, h, d_model, dropout=0.1) -> None:
super(attention, self).__init__()
assert d_model % h == 0
self.d_k = d_model // h
self.h = h
self.query = nn.Linear(d_model, d_model)
self.key = nn.Linear(d_model, d_model)
self.value = nn.Linear(d_model, d_model)
self.output = nn.Linear(d_model, d_model)
        self.max_relative_position = 2  # relative distances are clipped to [-2, 2]
self.relative_position_k = RelativePosition(self.d_k, self.max_relative_position)
self.relative_position_v = RelativePosition(self.d_k, self.max_relative_position)
        self.attn = None  # placeholder for attention weights (not populated in forward)
self.dropout = nn.Dropout(dropout)
def forward(self, q, k, v, mask=None):
nbatches = q.size(0)
len_q_token = q.size(1)
len_v_token = v.size(1)
len_k_token = k.size(1)
q = self.query(q).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
k = self.key(k).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
k_basic = self.relative_position_k(len_q_token, len_k_token)
v = self.value(v).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
v_basic = self.relative_position_v(len_q_token, len_v_token)
x, _ = attentionBlock(q, k, k_basic, v, v_basic, mask, self.dropout)
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
return self.output(x)
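# %%
# Usage sketch for the attention module (assumes `positionalEncoding` is importable). The
# mask is broadcast over the head dimension inside attentionBlock via unsqueeze(1), so a
# (batch, 1, len_k) padding mask or a (batch, len_q, len_k) causal mask both work.
_mha = attention(h=8, d_model=512)
_x = torch.randn(2, 10, 512)
_pad_mask = torch.ones(2, 1, 10, dtype=torch.long)
print(_mha(_x, _x, _x, _pad_mask).shape)  # torch.Size([2, 10, 512])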
# %%
class AddNorm(nn.Module):
def __init__(self,size, dropout=0.1) -> None:
super(AddNorm, self).__init__()
self.norm = nn.LayerNorm(size)
self.dropout = nn.Dropout(dropout)
    def forward(self, x, sublayer):
        # residual connection; note that LayerNorm is applied to the sublayer output here,
        # i.e. x + dropout(norm(sublayer(x))), a variant of the usual pre-/post-norm placement
        return x + self.dropout(self.norm(sublayer(x)))
# %%
class feedforward(nn.Module):
def __init__(self, d_model, d_ff, dropout=0.1) -> None:
super(feedforward, self).__init__()
# self.w_1 = nn.Linear(d_model, d_ff)
# self.w_2 = nn.Linear(d_ff, d_model)
# self.dropout = nn.Dropout(dropout)
self.net = nn.Sequential(
nn.Linear(d_model, d_ff),
nn.ReLU(),
nn.Linear(d_ff, d_model),
nn.Dropout(dropout)
)
def forward(self, x):
# return self.w_2(self.dropout(self.w_1(x).relu()))
return self.net(x)
# %%
class encoderLayer(nn.Module):
def __init__(self, d_model, self_attn, feed_forward, dropout=0.1) -> None:
super(encoderLayer, self).__init__()
self.self_attn = self_attn
self.feed_forward = feed_forward
self.sublayers = nn.ModuleList(
[
AddNorm(d_model, dropout),
AddNorm(d_model, dropout)
]
)
def forward(self, x, mask):
        out1 = self.sublayers[0](x, lambda x: self.self_attn(x, x, x, mask))
        out2 = self.sublayers[1](out1, self.feed_forward)
return out2
# %%
class decoderLayer(nn.Module):
def __init__(self, d_model, self_attn, cross_attn, feed_forward, dropout=0.1) -> None:
super(decoderLayer, self).__init__()
self.self_attn = self_attn
self.cross_attn = cross_attn
self.feed_forward = feed_forward
self.sublayers = nn.ModuleList(
[
AddNorm(d_model, dropout),
AddNorm(d_model, dropout),
AddNorm(d_model, dropout)
]
)
def forward(self, x, memory, src_mask, tgt_mask):
out1 = self.sublayers[0](x, lambda x : self.self_attn(x, x, x, tgt_mask))
        out2 = self.sublayers[1](out1, lambda _: self.cross_attn(out1, memory, memory, src_mask))
        out3 = self.sublayers[2](out2, self.feed_forward)
return out3
# %%
class Generator(nn.Module):
def __init__(self, d_model, vocab):
super(Generator, self).__init__()
self.proj = nn.Linear(d_model, vocab)
def forward(self, x):
return log_softmax(self.proj(x), dim=-1)
# %%
class transformer(nn.Module):
def __init__(self, src_vocab, tag_vocab, d_model=512, N=6, h=8, d_ff=2048, dropout=0.1) -> None:
super(transformer, self).__init__()
self.src_embed = nn.Sequential(
Embeddings(src_vocab, d_model),
# positionalEncodingCosSin(d_model, dropout)
)
self.tag_embed = nn.Sequential(
Embeddings(tag_vocab, d_model),
# positionalEncodingCosSin(d_model, dropout)
)
attn = lambda: attention(h, d_model, dropout)
ff = lambda: feedforward(d_model, d_ff, dropout)
self.encoder = nn.ModuleList([
encoderLayer(d_model, attn(), ff() , dropout) for _ in range(N)
])
self.decoder = nn.ModuleList([
decoderLayer(d_model, attn(), attn(), ff(), dropout) for _ in range(N)
])
self.out = Generator(d_model, tag_vocab)
def encode(self, src, src_mask):
x = self.src_embed(src)
for layer in self.encoder:
x = layer(x, src_mask)
return x
def decode(self, tag, memory, src_mask, tag_mask):
x = self.tag_embed(tag)
for layer in self.decoder:
x = layer(x, memory, src_mask, tag_mask)
return x
def forward(self, src, tag, src_mask=None, tag_mask=None):
memory = self.encode(src, src_mask)
out = self.decode(tag, memory, src_mask, tag_mask)
return self.out(out)
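# %%
# End-to-end usage sketch for the transformer class with toy vocabularies and random token
# ids. `_subsequent_mask` is a helper defined here only for illustration; it is not part of
# the original module.
def _subsequent_mask(size):
    # lower-triangular causal mask of shape (1, size, size): 1 = may attend, 0 = blocked
    return torch.tril(torch.ones(1, size, size, dtype=torch.long))
_model = transformer(src_vocab=1000, tag_vocab=1000, d_model=64, N=2, h=4, d_ff=128)
_src = torch.randint(1, 1000, (2, 12))
_tag = torch.randint(1, 1000, (2, 9))
_src_mask = torch.ones(2, 1, 12, dtype=torch.long)
_tag_mask = _subsequent_mask(9)
print(_model(_src, _tag, _src_mask, _tag_mask).shape)  # (2, 9, 1000) log-probabilities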
# %%
class LabelSmoothing(nn.Module):
'''
    size: the target vocabulary size (the last dimension of the model output)
'''
def __init__(self, size, padding_idx, smoothing=0.0) -> None:
super(LabelSmoothing, self).__init__()
self.criterion = nn.KLDivLoss(reduction='mean')
self.padding_idx = padding_idx
self.confidence = 1 - smoothing
self.smoothing = smoothing
        self.true_dist = None
self.size = size
def forward(self, x, target):
        assert x.size(2) == self.size
true_dist = x.data.clone()
true_dist.fill_(self.smoothing / (self.size - 2))
true_dist.scatter_(2, target.data.unsqueeze(2), self.confidence)
        true_dist[:, :, self.padding_idx] = 0  # no probability mass on the padding token
        # positions whose target token is padding contribute an all-zero row (no loss)
        true_dist = true_dist.masked_fill(target.data.unsqueeze(2) == self.padding_idx, 0.0)
self.true_dist = true_dist
return self.criterion(x, true_dist.clone().detach())
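# %%
# Worked example for LabelSmoothing with a vocabulary of 5, padding_idx=0 and smoothing=0.4:
# each row of true_dist keeps confidence 0.6 on the gold token, spreads 0.4/(5-2) over the
# remaining non-padding tokens, and rows whose target is the padding token become all zeros.
_crit = LabelSmoothing(size=5, padding_idx=0, smoothing=0.4)
_logp = log_softmax(torch.randn(1, 3, 5), dim=-1)  # model output as log-probabilities
_target = torch.tensor([[2, 4, 0]])                # last position is padding
_loss = _crit(_logp, _target)
print(_crit.true_dist)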
# %%