diff --git a/impl/activation/gelu/gelu_impl.h b/impl/activation/gelu/gelu_impl.h index 137b5f1259caa1de98b3c5a674b7c916c5c45394..45130efbc2f8ccd25818e4155831c4f87f88b50b 100644 --- a/impl/activation/gelu/gelu_impl.h +++ b/impl/activation/gelu/gelu_impl.h @@ -149,6 +149,86 @@ __aicore__ inline void FastGeluCalcSimplified(const LocalTensor& dstLocal, co } } +template +__aicore__ inline void FastGeluV2ClipParams(const LocalTensor& tempTensorA, const LocalTensor& srcLocal, + const GeluParams& params) +{ + const T coefficientsA = -0.1444; + const T coefficientsB = -1.769; + const T coefficientsBInv = 1.769; + const T coefficientsC = 0.7071; + const T coefficientsD = 0.5; + + const UnaryRepeatParams unaryParams; + const BinaryRepeatParams binaryParams; + + // x1 = (-0.1444) * (clip(|0.7071 * x|, max=1.769) - 1.769) ^ 2 + 0.5 + Muls(tempTensorA, srcLocal, coefficientsC, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Abs(tempTensorA, tempTensorA, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Mins(tempTensorA, tempTensorA, coefficientsBInv, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Adds(tempTensorA, tempTensorA, coefficientsB, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Mul(tempTensorA, tempTensorA, tempTensorA, MASK_PLACEHOLDER, params.repeatTimes, binaryParams); + PipeBarrier(); + + Muls(tempTensorA, tempTensorA, coefficientsA, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Adds(tempTensorA, tempTensorA, coefficientsD, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); +} + +template +__aicore__ inline void FastGeluV2CalcSimplified(const LocalTensor& dstLocal, const LocalTensor& srcLocal, + const GeluParams& params) +{ + const T coefficients = 0.000000000001; + const T coefficientsHalf = 0.5; + const UnaryRepeatParams unaryParams; + const BinaryRepeatParams binaryParams; + const LocalTensor& tempTensorA = 
params.tempTensorA; + const LocalTensor& tempTensorB = params.tempTensorB; + const LocalTensor& tempTensorC = params.tempTensorC; + + // x1 = (-0.1444) * (clip(|0.7071 * x|, max=1.769) - 1.769) ^ 2 + 0.5 + FastGeluV2ClipParams(tempTensorA, srcLocal, params); + + // x2 = (x + 0.000000000001) / |(x + 0.000000000001)|, intended as the sign of x with x == 0 mapped to 1. NOTE(review): 1e-12 is below the smallest IEEE fp16 subnormal (about 6e-8), so when T is half the offset presumably rounds to 0 and x == 0 would give 0/0; confirm + Adds(tempTensorB, srcLocal, coefficients, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Abs(tempTensorC, tempTensorB, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + if constexpr (highPerformance) { + Reciprocal(tempTensorC, tempTensorC, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Mul(tempTensorB, tempTensorB, tempTensorC, MASK_PLACEHOLDER, params.repeatTimes, binaryParams); + PipeBarrier(); + } else { + Div(tempTensorB, tempTensorB, tempTensorC, MASK_PLACEHOLDER, params.repeatTimes, binaryParams); + PipeBarrier(); + } + + // fast_gelu_v2(x) = x * (x2 * x1 + 0.5) + Mul(tempTensorA, tempTensorA, tempTensorB, MASK_PLACEHOLDER, params.repeatTimes, binaryParams); + PipeBarrier(); + + Adds(tempTensorA, tempTensorA, coefficientsHalf, MASK_PLACEHOLDER, params.repeatTimes, unaryParams); + PipeBarrier(); + + Mul(dstLocal, srcLocal, tempTensorA, MASK_PLACEHOLDER, params.repeatTimes, binaryParams); + PipeBarrier(); +} + template __aicore__ inline void GeluFormulasTmpCalc(GeluParams& params) { @@ -162,9 +242,16 @@ __aicore__ inline void GeluFormulasTmpCalc(GeluParams& params) ASCENDC_ASSERT((params.stackSize > 0), { KERNEL_LOG(KERNEL_ERROR, "params.stackSize must > 0!"); }); uint32_t nextTmpPos = params.stackSize; - if constexpr (bufferNumber >= TWO_OF_STACK_BUFFER) { + if constexpr (bufferNumber == TWO_OF_STACK_BUFFER) { + params.tempTensorB = params.sharedTmpBuffer[nextTmpPos]; + nextTmpPos += params.stackSize; + } + + if constexpr (bufferNumber >= THREE_OF_STACK_BUFFER) { params.tempTensorB = params.sharedTmpBuffer[nextTmpPos]; nextTmpPos += params.stackSize; + params.tempTensorC = 
params.sharedTmpBuffer[nextTmpPos]; + nextTmpPos += params.stackSize; } if constexpr (highPrecision) { @@ -326,6 +413,30 @@ __aicore__ inline void FasterGeluImpl(const LocalTensor& dstLocal, const Loca FasterGeluImpl(dstLocal, srcLocal, sharedTmpBuffer, dataSize); } + +template +__aicore__ inline void FasterGeluV2Impl(const LocalTensor& dstLocal, const LocalTensor& srcLocal, + const LocalTensor& sharedTmpBuffer, const uint32_t dataSize) +{ + if constexpr (highPrecision && (IsSameType::value)) { + GeluClass(dstLocal, srcLocal, sharedTmpBuffer, dataSize, + FastGeluV2CalcSimplified); + } else { + GeluClass(dstLocal, srcLocal, sharedTmpBuffer, dataSize, + FastGeluV2CalcSimplified); + } +} + +template +__aicore__ inline void FasterGeluV2Impl(const LocalTensor& dstLocal, const LocalTensor& srcLocal, + const uint32_t dataSize) +{ + LocalTensor sharedTmpBuffer; + bool ans = PopStackBuffer(sharedTmpBuffer); + ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + + FasterGeluV2Impl(dstLocal, srcLocal, sharedTmpBuffer, dataSize); +} #pragma end_pipe } // namespace AscendC #endif // IMPL_ACTIVATION_GELU_GELU_IMPL_H diff --git a/lib/activation/gelu.h b/lib/activation/gelu.h index c2c9d162bfdebe28c47dfe6b6bd7691a33e70a71..5a2ca428c19dc8f79c4a8dbc3a6dfd45ba6d4578 100644 --- a/lib/activation/gelu.h +++ b/lib/activation/gelu.h @@ -91,6 +91,49 @@ __aicore__ inline void FasterGelu(const LocalTensor& dstLocal, const LocalTen } FasterGeluImpl(dstLocal, srcLocal, dataSize); } + +/* ! 
+ * \brief sgn(x) = (x + 0.000000000001) / |(x + 0.000000000001)| + * \brief fast_gelu_v2(x) = x * (sgn(x) * [(-0.1444) * (clip(|0.7071 * x|, max=1.769) - 1.769) ^ 2 + 0.5] + 0.5) + * \param [out] dstLocal, output LocalTensor + * \param [in] srcLocal, input LocalTensor + * \param [in] sharedTmpBuffer, input local temporary Tensor + * \param [in] dataSize, number of input data elements + * \param [in] highPrecision, whether to enable the high-precision interface to improve the calculation accuracy + * \param [in] highPerformance, whether to enable the high-performance interface to improve the computing efficiency + */ +template +__aicore__ inline void FasterGeluV2(const LocalTensor& dstLocal, const LocalTensor& srcLocal, + const LocalTensor& sharedTmpBuffer, const uint32_t dataSize) +{ + if ASCEND_IS_AIC { + return; + } + ASCENDC_ASSERT((IsSameType::value || IsSameType::value), + { KERNEL_LOG(KERNEL_ERROR, "FasterGeluV2 only support data type: float/half"); }); + FasterGeluV2Impl(dstLocal, srcLocal, sharedTmpBuffer, dataSize); +} + +/* ! 
+ * \ingroup FasterGeluV2 + * \param [out] dstLocal, output LocalTensor + * \param [in] srcLocal, input LocalTensor + * \note this overload takes no sharedTmpBuffer; FasterGeluV2Impl obtains the temporary buffer itself via PopStackBuffer + * \param [in] dataSize, number of input data elements + * \param [in] highPrecision, whether to enable the high-precision interface to improve the calculation accuracy + * \param [in] highPerformance, whether to enable the high-performance interface to improve the computing efficiency + */ +template +__aicore__ inline void FasterGeluV2(const LocalTensor& dstLocal, const LocalTensor& srcLocal, + const uint32_t dataSize) +{ + if ASCEND_IS_AIC { + return; + } + ASCENDC_ASSERT((IsSameType::value || IsSameType::value), + { KERNEL_LOG(KERNEL_ERROR, "FasterGeluV2 only support data type: float/half"); }); + FasterGeluV2Impl(dstLocal, srcLocal, dataSize); +} #pragma end_pipe } // namespace AscendC #endif // LIB_GELU_GELU_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c07def0ce3448aab315f679d7f5adfb2189592ba..6387a70cadc3875ce67c4ef60d96486864b6145e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -55,6 +55,7 @@ file(GLOB ASCENDC_TEST_ascend310p_CASE_SRC_FILES file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/activation/geglu/test_operator_geglu.cpp ${ASCENDC_TESTS_DIR}/activation/gelu/test_operator_fast_gelu.cpp + ${ASCENDC_TESTS_DIR}/activation/gelu/test_operator_fast_gelu_v2.cpp ${ASCENDC_TESTS_DIR}/activation/reglu/test_operator_reglu.cpp ${ASCENDC_TESTS_DIR}/activation/sigmoid/test_operator_vec_sigmoid.cpp ${ASCENDC_TESTS_DIR}/activation/silu/test_operator_silu.cpp diff --git a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..18ad3098f307fcd1e0acbf936f2f0e695f0e9951 --- /dev/null +++ b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp @@ -0,0 +1,140 @@ 
+/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#define private public +#define protected public +#include "kernel_operator.h" + +using namespace std; +using namespace AscendC; + +template class KernelFastGeluV2 { +public: + __aicore__ inline KernelFastGeluV2() {} + __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t inputSize) + { + dataSize = inputSize; + + srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(srcGm), dataSize); + dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dstGm), dataSize); + + pipe.InitBuffer(inQueueX, 1, dataSize * sizeof(srcType)); + pipe.InitBuffer(outQueue, 1, dataSize * sizeof(srcType)); + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + LocalTensor srcLocal = inQueueX.AllocTensor(); + + DataCopy(srcLocal, srcGlobal, dataSize); + + inQueueX.EnQue(srcLocal); + } + __aicore__ inline void Compute() + { + LocalTensor dstLocal = outQueue.AllocTensor(); + + LocalTensor srcLocal = inQueueX.DeQue(); + + FasterGeluV2(dstLocal, srcLocal, dataSize); + + outQueue.EnQue(dstLocal); + + inQueueX.FreeTensor(srcLocal); + } + __aicore__ inline void CopyOut() + { + LocalTensor dstLocal = outQueue.DeQue(); + DataCopy(dstGlobal, dstLocal, dataSize); + outQueue.FreeTensor(dstLocal); + } + +private: + GlobalTensor srcGlobal; + GlobalTensor dstGlobal; + + TPipe pipe; + TQue 
inQueueX; + TQue outQueue; + + uint32_t dataSize = 0; +}; + +template +__aicore__ void main_FastGeluV2_test(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t dataSize) +{ + KernelFastGeluV2 op; + op.Init(srcGm, dstGm, dataSize); + op.Process(); +} + +struct FastGeluV2TestParams { + uint32_t dataSize; + uint32_t dataTypeSize; + void (*calFunc)(uint8_t*, uint8_t*, uint32_t); +}; + +class FastGeluV2TestSuite : public testing::Test, public testing::WithParamInterface { +protected: + static void SetUpTestCase() + { + std::cout << "FastGeluV2TestSuite SetUpTestCase" << std::endl; + } + static void TearDownTestCase() + { + std::cout << "FastGeluV2TestSuite TearDownTestCase" << std::endl; + } + virtual void SetUp() {} + virtual void TearDown() {} +}; + +INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_FastGeluV2, FastGeluV2TestSuite, + ::testing::Values( + FastGeluV2TestParams { 32, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 64, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 256, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 512, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 1024, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 2048, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 4096, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 8192, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 16384, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 32768, sizeof(half), main_FastGeluV2_test }, + FastGeluV2TestParams { 32, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 64, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 256, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 512, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 1024, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 2048, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 4096, sizeof(float), 
main_FastGeluV2_test }, + FastGeluV2TestParams { 8192, sizeof(float), main_FastGeluV2_test }, + FastGeluV2TestParams { 16384, sizeof(float), main_FastGeluV2_test } +)); + +TEST_P(FastGeluV2TestSuite, FastGeluV2TestCase) +{ + auto param = GetParam(); + + uint8_t inputGm[param.dataSize * param.dataTypeSize] { 0x00 }; + uint8_t outputGm[param.dataSize * param.dataTypeSize] { 0x00 }; + + param.calFunc(inputGm, outputGm, param.dataSize); + + for (uint32_t i = 0; i < param.dataSize * param.dataTypeSize; i++) { // outputGm holds dataSize * dataTypeSize bytes, so every byte is checked + EXPECT_EQ(outputGm[i], 0x00); + } +}