Development environment:
torch==1.11.0
Notes:
This document covers only the workflow for implementing custom operators.
References:
torch-api
onnx-tutorials
torch.onnx
When an operator we need cannot be obtained by composing PyTorch's built-in primitive operators, we have to define a custom operator ourselves.
There are three ways to define a custom operator in PyTorch. The running example throughout this document is NCReLU, which concatenates the positive and negative parts of the input along the channel dimension:
NCReLU(x) = concat(ReLU(x), -ReLU(-x))
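As a point of reference, NCReLU can also be expressed with built-in ops; the helper name ncrelu_reference below is our own illustration, not part of the repository's code:

import torch

def ncrelu_reference(x, dim=1):
    # clamp_min(0) == ReLU(x) keeps the positive part;
    # clamp_max(0) == -ReLU(-x) keeps the negative part
    return torch.cat([x.clamp_min(0), x.clamp_max(0)], dim=dim)

If the operator can be composed like this, a custom kernel is only worth writing for performance; here the composition serves as a correctness reference for the custom implementations below.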
from setuptools import setup
from torch.utils import cpp_extension

setup(
    name="ncrelu_cpp",
    ext_modules=[
        cpp_extension.CppExtension(
            'ncrelu_cpp', ['src/ncrelu.cpp']  # name of the compiled module; source files (relative paths)
        )
    ],
    cmdclass={  # build command
        'build_ext': cpp_extension.BuildExtension
    }
)
The setuptools module performs the ahead-of-time build: the C++ file containing the operator is compiled into a shared library (a .so file on Linux), whose functions can then be called from Python.
PyTorch provides the cpp_extension wrapper, which fills in the compiler options, include paths for the required headers, and other settings needed during the build.
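As an aside, PyTorch also offers a just-in-time alternative to the setup.py flow: torch.utils.cpp_extension.load compiles and imports the extension in one call (a sketch using the same source file):

from torch.utils.cpp_extension import load

# builds into a temporary directory and imports the resulting module in one step
ncrelu_cpp = load(name="ncrelu_cpp", sources=["src/ncrelu.cpp"])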
#include <torch/extension.h>

torch::Tensor ncrelu_forward(torch::Tensor input) {
    auto pos = input.clamp_min(0);  // clamp_min sets a lower bound: elements below it are set to it, keeping the positive part
    auto neg = input.clamp_max(0);  // likewise, clamp_max keeps the negative part
    return torch::cat({pos, neg}, 1);  // concatenate along the channel dimension
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &ncrelu_forward, "ncrelu forward");
}
The torch/extension.h header must be included when writing a PyTorch C++ extension. It pulls in essentially all required dependencies: the ATen library, pybind11, and the interaction between the two. ATen is PyTorch's underlying tensor library and implements the actual tensor operations; pybind11 provides the binding from C++ to Python, so the C++ functions can be called from Python.
Both the parameter type and the return type are torch::Tensor.
pybind11 binds the C++ function into a Python module: TORCH_EXTENSION_NAME expands to the extension name set in setup.py (ncrelu_cpp), and forward is the exposed function name.
Run the build:
python setup.py build_ext --inplace
This produces a shared library such as ncrelu_cpp.cpython-39-x86_64-linux-gnu.so (the exact tag depends on your Python version and platform).
import torch
from torch_op.ncrelu import ncrelu_cpp

a = torch.randn(4, 3)
print(a)
b = ncrelu_cpp.forward(a)  # CPU call
print(b)
a = a.cuda()
c = ncrelu_cpp.forward(a)  # the same function also works on a GPU tensor
print(c)
tensor([[-0.5878, -1.2310, 0.2664],
[ 0.6259, 0.7189, -0.4295],
[ 0.4099, -1.3427, 1.5124],
[-1.3714, 1.1723, -0.5091]])
tensor([[ 0.0000, 0.0000, 0.2664, -0.5878, -1.2310, 0.0000],
[ 0.6259, 0.7189, 0.0000, 0.0000, 0.0000, -0.4295],
[ 0.4099, 0.0000, 1.5124, 0.0000, -1.3427, 0.0000],
[ 0.0000, 1.1723, 0.0000, -1.3714, 0.0000, -0.5091]])
tensor([[ 0.0000, 0.0000, 0.2664, -0.5878, -1.2310, 0.0000],
[ 0.6259, 0.7189, 0.0000, 0.0000, 0.0000, -0.4295],
[ 0.4099, 0.0000, 1.5124, 0.0000, -1.3427, 0.0000],
[ 0.0000, 1.1723, 0.0000, -1.3714, 0.0000, -0.5091]],
device='cuda:0')
Because the implementation is built from high-level operations of PyTorch's ATen tensor library, which are device-agnostic abstractions, the function above runs on the GPU as well: it is enough to move the input tensor to the GPU.
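A quick way to validate the extension is to compare it against the built-in composition from the beginning of the document (our addition, assuming the module built above):

import torch
from torch_op.ncrelu import ncrelu_cpp

x = torch.randn(4, 3)
expected = torch.cat([x.clamp_min(0), x.clamp_max(0)], 1)  # NCReLU via built-in ops
assert torch.allclose(ncrelu_cpp.forward(x), expected)

The project layout after the build: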
└── torch_op
├── ncrelu
│ ├── build
│ │ ├── lib.linux-x86_64-3.9
│ │ │ └── ncrelu_cpp.cpython-39-x86_64-linux-gnu.so
│ │ └── temp.linux-x86_64-3.9
│ │ └── src
│ │ └── ncrelu.o
│ ├── ncrelu_cpp.cpython-39-x86_64-linux-gnu.so
│ ├── setup.py
│ └── src
│ └── ncrelu.cpp
└── op_test.py
Next, the same operator as a CUDA extension. The setup.py now uses CUDAExtension and lists both the C++ binding file and the CUDA kernel file:

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="ncrelu_cuda",
    ext_modules=[
        CUDAExtension(
            'ncrelu_cuda', [
                'src/ncrelu_cuda.cpp',
                'src/ncrelu_cuda_kernel.cu'
            ]
        )
    ],
    cmdclass={
        'build_ext': BuildExtension
    }
)
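The CUDA extension is built with the same command as the CPU version:

python setup.py build_ext --inplace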
Of the three approaches mentioned above, this section demonstrates methods 2 and 3 at once: the extension (pybind11) approach and operator registration. The binding file below implements both.
#include <torch/extension.h>

at::Tensor NCReLUForwardLauncher(const at::Tensor &src,
                                 const int batch,
                                 const int channels,
                                 const int height,
                                 const int width);

// input-checking macros
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

at::Tensor ncrelu_forward_cuda(const at::Tensor &input) {
    CHECK_INPUT(input);
    at::DeviceGuard guard(input.device());
    int batch = input.size(0);
    int channels = input.size(1);
    int height = input.size(2);
    int width = input.size(3);
    return NCReLUForwardLauncher(input, batch, channels, height, width);
}

// method 2: extension, bound via pybind11
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("ncrelu_forward_cuda", &ncrelu_forward_cuda,
          "ncrelu forward (CUDA)");
}

// method 3: operator registration
static auto registry = torch::RegisterOperators("xwx::ncrelu_forward_cuda", &ncrelu_forward_cuda);
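Note that CHECK_INPUT rejects CPU tensors and non-contiguous tensors. From the Python side (once the module is built and imported, as shown later), this means a transposed view must be made contiguous first; an illustrative snippet of our own:

import torch
from torch_op.ncrelu_cuda import ncrelu_cuda

a = torch.randn(1, 3, 10, 10).cuda()
# ncrelu_cuda.ncrelu_forward_cuda(a.transpose(2, 3))  # would raise: the transposed view is not contiguous
b = ncrelu_cuda.ncrelu_forward_cuda(a.transpose(2, 3).contiguous())  # OK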
The kernel file below is plain CUDA programming; CUDA itself is not covered here, so consult CUDA programming material if needed.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

// the kernel: one thread per input element, each writing two output elements
template <typename scalar_t>
__global__ void NCReLUForward(const int input_size,
                              const int channels,
                              const int height,
                              const int width,
                              const scalar_t *src_data,
                              scalar_t *dst_data) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= input_size) return;  // guard against out-of-range threads
    auto value = src_data[index];  // read the source value
    const int chw = channels * height * width;
    dst_data[index + index / chw * chw] = value >= 0 ? value : scalar_t(0);        // first half of channels: positive part
    dst_data[index + index / chw * chw + chw] = value >= 0 ? scalar_t(0) : value;  // second half of channels: negative part
}
#define THREADS_PER_BLOCK 1024

inline int GET_BLOCKS(const int N) {
    int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;  // ceil(N / THREADS_PER_BLOCK)
    int max_block_num = 65000;  // stay below the grid-dimension limit
    return min(optimal_block_num, max_block_num);
}
at::Tensor NCReLUForwardLauncher(const at::Tensor &src,
                                 const int batch,
                                 const int channels,
                                 const int height,
                                 const int width) {
    // allocate the output: twice the channels of the input
    at::Tensor dst = at::empty({batch, 2 * channels, height, width}, src.options());
    // one thread per input element; each thread writes both output halves
    const int input_size = batch * channels * height * width;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(src.scalar_type(), "NCReLUForwardLauncher", ([&] {
        const scalar_t *src_ = src.data_ptr<scalar_t>();
        scalar_t *dst_ = dst.data_ptr<scalar_t>();
        NCReLUForward<scalar_t>
            <<<GET_BLOCKS(input_size), THREADS_PER_BLOCK,
               0, at::cuda::getCurrentCUDAStream()>>>(
                input_size, channels, height, width, src_, dst_
            );
    }));
    AT_CUDA_CHECK(cudaGetLastError());
    return dst;
}
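The destination indexing in the kernel is the only subtle part. For a flat input index i over an (N, C, H, W) tensor with chw = C*H*W, the batch index is i // chw, so the positive value lands at i + (i // chw) * chw in the doubled-channel output, and the negative value lands chw elements further on. The same arithmetic in a small NumPy sketch (our own illustration, not part of the extension):

import numpy as np

def ncrelu_flat(src, channels, height, width):
    # src: flattened (N, C, H, W) array; returns flattened (N, 2C, H, W)
    chw = channels * height * width
    dst = np.empty(2 * src.size, dtype=src.dtype)
    for i, value in enumerate(src):
        dst[i + i // chw * chw] = max(value, 0)        # positive half
        dst[i + i // chw * chw + chw] = min(value, 0)  # negative half
    return dst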
Because two extension methods were used, the corresponding calling conventions also differ.
extension: imported as a regular Python module
from torch_op.ncrelu_cuda import ncrelu_cuda
b = ncrelu_cuda.ncrelu_forward_cuda(a)
op registration: loaded with torch.ops.load_library, after which the operator appears under torch.ops.<namespace>
torch.ops.load_library(
    "/home/xwx/PycharmProjects/custom_operator/torch_op/ncrelu_cuda/ncrelu_cuda.cpython-37m-x86_64-linux-gnu.so")
c = torch.ops.xwx.ncrelu_forward_cuda(a)
Calling via both methods:
import torch
from torch_op.ncrelu_cuda import ncrelu_cuda
torch.ops.load_library(
"/home/xwx/PycharmProjects/custom_operator/torch_op/ncrelu_cuda/ncrelu_cuda.cpython-37m-x86_64-linux-gnu.so")
a = torch.randn(1, 3, 10, 10).cuda()
b = ncrelu_cuda.ncrelu_forward_cuda(a)
c = torch.ops.xwx.ncrelu_forward_cuda(a)
print(b)
print(c)
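Both call paths go through the same C++ function, so the results should match exactly; a quick check (our addition):

assert torch.equal(b, c)  # both paths invoke the same C++ function
expected = torch.cat([a.clamp_min(0), a.clamp_max(0)], 1)  # NCReLU via built-in ops
assert torch.allclose(b, expected)

The CUDA project layout: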
├── ncrelu_cuda
│ ├── build
│ │ ├── lib.linux-x86_64-3.7
│ │ │ └── ncrelu_cuda.cpython-37m-x86_64-linux-gnu.so
│ │ └── temp.linux-x86_64-3.7
│ │ └── src
│ │ ├── ncrelu_cuda_kernel.o
│ │ └── ncrelu_cuda.o
│ ├── ncrelu_cuda.cpython-37m-x86_64-linux-gnu.so
│ ├── setup.py
│ └── src
│ ├── ncrelu_cuda.cpp
│ └── ncrelu_cuda_kernel.cu
├── ncrelu_cuda_test.py
To export the custom op to ONNX, register a symbolic function that maps the Torch operator to an ONNX node:

from torch.onnx import register_custom_op_symbolic

def ncrelu(g, input):
    return g.op("domain::ncrelu", input)

register_custom_op_symbolic('xwx::ncrelu_forward_cuda', ncrelu, 9)
From the ONNX side, ncrelu is the name of the exported op and domain is the custom operator namespace.
From the Torch side, 'xwx::ncrelu_forward_cuda' must exactly match the operator registered in the C++ code; xwx is the namespace used in torch::RegisterOperators. The registration must run before torch.onnx.export is called.
import torch
import torch.nn as nn

class CustomModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.Conv = nn.Conv2d(3, 3, kernel_size=3)
        self.NCReLU = torch.ops.xwx.ncrelu_forward_cuda

    def forward(self, input):
        x = self.Conv(input)
        x = self.NCReLU(x)
        return x
torch.ops.xwx.ncrelu_forward_cuda is obtained exactly as in the op-registration calling approach above (the .so must first be loaded with torch.ops.load_library).
model = CustomModel()
model.eval()
model.cuda()
input_temp = torch.ones([1, 3, 10, 10]).cuda()
torch.onnx.export(model, input_temp, "custom.onnx", export_params=True, opset_version=11, verbose=True,
                  input_names=['input'],
                  output_names=['output'], custom_opsets={"domain": 1})
custom_opsets={"domain": 1} specifies the opset version of your custom domain in the exported model.
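To confirm the export worked, the resulting graph can be inspected; it should contain a node with op_type "ncrelu" in the "domain" namespace (a sketch using the onnx package):

import onnx

m = onnx.load("custom.onnx")
print([(node.domain, node.op_type) for node in m.graph.node])  # expect ('domain', 'ncrelu') among the entries

Note that an ONNX runtime will not know how to execute this node unless a matching custom implementation is registered on that side as well.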
├── export_onnx
│ ├── create_model.py
│ ├── custom.onnx
│ └── custom.pth