# %%
import math

import torch
import torch.nn as nn
# %%
class positionalEncodingCosSin(nn.Module):
    '''
    Classic sinusoidal (sin/cos) positional encoding.

    @param:
    d_model: output dimension of the token embeddings
    dropout: dropout rate; randomly zeroing parts of the positional encoding
             improves generalization by pushing the model to also learn
             contextual semantics on its own
    max_len: maximum number of tokens supported (commonly set to a power of two)
    '''
    def __init__(self, d_model, dropout, max_len=5000) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # * Precompute the full positional-encoding table once up front
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len).unsqueeze(1)
        # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model));
        # div_term is equivalent to 1 / 10000^(2i / d_model), computed in log space for stability
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000)) / d_model)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
        # register_buffer: a persistent tensor that takes part in the forward pass and is
        # saved/moved with the module as part of its state, but is never updated by the optimizer
self.register_buffer('pe', pe)
def forward(self, x):
        # x = x + self.pe[:, : x.size(1)].requires_grad_(False)  # * needed only if register_buffer() were not used above
x = x + self.pe[:, : x.size(1)]
return self.dropout(x)
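# %%
# A quick hedged smoke test (not from the original file): the names and shapes
# below are illustrative assumptions, just to confirm the module runs and
# preserves the input shape.
pe_layer = positionalEncodingCosSin(d_model=32, dropout=0.1)
dummy = torch.zeros(2, 16, 32)  # (batch, seq_len, d_model) token embeddings
print(pe_layer(dummy).shape)  # expected: torch.Size([2, 16, 32])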
# %%
class RelativePosition(nn.Module):
    '''
    Learnable relative-position embeddings in the style of Shaw et al. (2018):
    pairwise distances are clipped to [-max_relative_position, max_relative_position]
    and looked up in a table of 2 * max_relative_position + 1 vectors of size num_units.
    '''
    def __init__(self, num_units, max_relative_position):
        super().__init__()
        self.num_units = num_units
        self.max_relative_position = max_relative_position
        self.embeddings_table = nn.Parameter(torch.empty(max_relative_position * 2 + 1, num_units))
        nn.init.xavier_uniform_(self.embeddings_table)
    def forward(self, length_q, length_k):
        # Pairwise relative distance between every query and key position
        range_vec_q = torch.arange(length_q, device=self.embeddings_table.device)
        range_vec_k = torch.arange(length_k, device=self.embeddings_table.device)
        distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
        # Clip distances, then shift them to non-negative table indices in [0, 2 * max_relative_position]
        distance_mat_clipped = torch.clamp(distance_mat, -self.max_relative_position, self.max_relative_position)
        final_mat = (distance_mat_clipped + self.max_relative_position).long()
        embeddings = self.embeddings_table[final_mat]  # (length_q, length_k, num_units)
        return embeddings
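# %%
# Hedged usage sketch (identifiers here are illustrative, not from this file):
# in Shaw-style attention these embeddings are typically folded into the logits,
# e.g. as an extra q @ rel_k^T term. Here we only check the lookup shape.
rel_pos = RelativePosition(num_units=64, max_relative_position=4)
rel_k = rel_pos(length_q=10, length_k=10)
print(rel_k.shape)  # expected: torch.Size([10, 10, 64]), one vector per (query, key) pair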
# %%
class RoPEPosition(nn.Module):
def __init__(self, d_model, max_len=50000, theta=10000):
super().__init__()
self.d_model = d_model
        # freqs[i] = 1 / theta^(2i / d_model), one frequency per complex dimension
        freqs = 1 / (theta ** (torch.arange(0, d_model, 2)[:d_model//2].float() / d_model))
        # * Generate the rotation angle for every token position
        t = torch.arange(max_len)
        freqs_for_each_token = torch.outer(t, freqs)  # outer product: (max_len, d_model // 2)
        # * Convert to complex numbers on the unit circle:
        # torch.polar(abs, angle) returns abs * (cos(angle) + j * sin(angle)) in Cartesian
        # form a + bj; with magnitude 1, each entry is 1 * cos(θ) + 1 * sin(θ)j
        freqs_cis = torch.polar(
            torch.ones_like(freqs_for_each_token),
            freqs_for_each_token
        )
        self.register_buffer("freqs_cis", freqs_cis)
    def forward(self, input, seq_len):
        # View adjacent real pairs as complex numbers:
        # (batch, head, seq_len, d_model) -> (batch, head, seq_len, d_model // 2)
        input_complex = torch.view_as_complex(
            input.float().reshape(*input.shape[:-1], -1, 2)
        )
        freqs_cis = self.freqs_cis[:seq_len]  # (seq_len, d_model // 2)
        # Broadcast over the batch and head dims; the complex multiply rotates each pair
        input_rotated = input_complex * freqs_cis.unsqueeze(0).unsqueeze(0)
        out = torch.view_as_real(input_rotated).flatten(-2)
        return out.type_as(input)
# %%
if __name__ == "__main__":
    rope = RoPEPosition(64)
    q = torch.randn(1, 8, 1024, 64)  # (batch, heads, seq_len, head_dim)
    q_rope = rope(q, q.shape[-2])
    print(q_rope.shape)  # expected: torch.Size([1, 8, 1024, 64])
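# %%
# Hedged property check (not in the original file): RoPE is designed so that the
# dot product q . k depends only on the relative offset between positions. Using
# the same vector at every position, scores on a shared diagonal should coincide.
# All names and shapes below are illustrative assumptions.
rope_chk = RoPEPosition(64)
vec_q = torch.randn(64).expand(1, 1, 16, 64).contiguous()  # identical query at all 16 positions
vec_k = torch.randn(64).expand(1, 1, 16, 64).contiguous()  # identical key at all 16 positions
scores = rope_chk(vec_q, 16) @ rope_chk(vec_k, 16).transpose(-1, -2)  # (1, 1, 16, 16)
# (0, 1) and (4, 5) share the relative offset 1, so their scores should match
print(torch.allclose(scores[0, 0, 0, 1], scores[0, 0, 4, 5], atol=1e-4))  # expected: True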