diff --git a/impl/activation/gelu/gelu_impl.h b/impl/activation/gelu/gelu_impl.h
index 137b5f1259caa1de98b3c5a674b7c916c5c45394..45130efbc2f8ccd25818e4155831c4f87f88b50b 100644
--- a/impl/activation/gelu/gelu_impl.h
+++ b/impl/activation/gelu/gelu_impl.h
@@ -149,6 +149,86 @@ __aicore__ inline void FastGeluCalcSimplified(const LocalTensor<T>& dstLocal, co
     }
 }
 
+template <typename T>
+__aicore__ inline void FastGeluV2ClipParams(const LocalTensor<T>& tempTensorA, const LocalTensor<T>& srcLocal,
+    const GeluParams<T>& params)
+{
+    // Writes x1 = (-0.1444) * (clip(|0.7071 * x|, max=1.769) - 1.769) ^ 2 + 0.5 into tempTensorA, x = srcLocal.
+    const T inputScale = 0.7071;
+    const T clipMax = 1.769;
+    const T clipShift = -1.769;
+    const T squareScale = -0.1444;
+    const T resultOffset = 0.5;
+
+    const BinaryRepeatParams binaryCfg;
+    const UnaryRepeatParams unaryCfg;
+    // tempTensorA = 0.7071 * x
+    Muls<T, false>(tempTensorA, srcLocal, inputScale, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // tempTensorA = |tempTensorA|
+    Abs<T, false>(tempTensorA, tempTensorA, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // tempTensorA = clip(tempTensorA, max=1.769)
+    Mins<T, false>(tempTensorA, tempTensorA, clipMax, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // tempTensorA = tempTensorA - 1.769
+    Adds<T, false>(tempTensorA, tempTensorA, clipShift, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // tempTensorA = tempTensorA ^ 2
+    Mul<T, false>(tempTensorA, tempTensorA, tempTensorA, MASK_PLACEHOLDER, params.repeatTimes, binaryCfg);
+    PipeBarrier<PIPE_V>();
+    // tempTensorA = (-0.1444) * tempTensorA
+    Muls<T, false>(tempTensorA, tempTensorA, squareScale, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // tempTensorA = tempTensorA + 0.5
+    Adds<T, false>(tempTensorA, tempTensorA, resultOffset, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+}
+
+template <typename T, bool highPerformance = false>
+__aicore__ inline void FastGeluV2CalcSimplified(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal,
+    const GeluParams<T>& params)
+{
+    const LocalTensor<T>& clipTerm = params.tempTensorA;
+    const LocalTensor<T>& signTerm = params.tempTensorB;
+    const LocalTensor<T>& absTerm = params.tempTensorC;
+    const BinaryRepeatParams binaryCfg;
+    const UnaryRepeatParams unaryCfg;
+    const T halfOffset = 0.5;
+    const T signEpsilon = 0.000000000001;
+    // NOTE(review): for T = half, 1e-12 presumably rounds to 0, so x == 0 may evaluate 0 / 0 below - confirm.
+    // clipTerm = x1 = (-0.1444) * (clip(|0.7071 * x|, max=1.769) - 1.769) ^ 2 + 0.5
+    FastGeluV2ClipParams(clipTerm, srcLocal, params);
+
+    // signTerm = x + 0.000000000001
+    Adds<T, false>(signTerm, srcLocal, signEpsilon, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // absTerm = |x + 0.000000000001|
+    Abs<T, false>(absTerm, signTerm, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+    // signTerm = x2 = signTerm / absTerm: Div by default, Reciprocal followed by Mul when highPerformance is set
+    if constexpr (!highPerformance) {
+        Div<T, false>(signTerm, signTerm, absTerm, MASK_PLACEHOLDER, params.repeatTimes, binaryCfg);
+        PipeBarrier<PIPE_V>();
+    } else {
+        Reciprocal<T, false>(absTerm, absTerm, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+        PipeBarrier<PIPE_V>();
+
+        Mul<T, false>(signTerm, signTerm, absTerm, MASK_PLACEHOLDER, params.repeatTimes, binaryCfg);
+        PipeBarrier<PIPE_V>();
+    }
+
+    // fast_gelu_v2(x) = x * (x2 * x1 + 0.5), accumulated in clipTerm and written to dstLocal
+    Mul<T, false>(clipTerm, clipTerm, signTerm, MASK_PLACEHOLDER, params.repeatTimes, binaryCfg);
+    PipeBarrier<PIPE_V>();
+
+    Adds<T, false>(clipTerm, clipTerm, halfOffset, MASK_PLACEHOLDER, params.repeatTimes, unaryCfg);
+    PipeBarrier<PIPE_V>();
+
+    Mul<T, false>(dstLocal, srcLocal, clipTerm, MASK_PLACEHOLDER, params.repeatTimes, binaryCfg);
+    PipeBarrier<PIPE_V>();
+}
+
 template <typename T, bool highPrecision = false, uint32_t bufferNumber = 1>
 __aicore__ inline void GeluFormulasTmpCalc(GeluParams<T>& params)
 {
@@ -162,9 +242,16 @@ __aicore__ inline void GeluFormulasTmpCalc(GeluParams<T>& params)
     ASCENDC_ASSERT((params.stackSize > 0), { KERNEL_LOG(KERNEL_ERROR, "params.stackSize must > 0!"); });
 
     uint32_t nextTmpPos = params.stackSize;
-    if constexpr (bufferNumber >= TWO_OF_STACK_BUFFER) {
+    if constexpr (bufferNumber == TWO_OF_STACK_BUFFER) {
+        params.tempTensorB = params.sharedTmpBuffer[nextTmpPos];
+        nextTmpPos += params.stackSize;
+    }
+
+    if constexpr (bufferNumber >= THREE_OF_STACK_BUFFER) {
         params.tempTensorB = params.sharedTmpBuffer[nextTmpPos];
         nextTmpPos += params.stackSize;
+        params.tempTensorC = params.sharedTmpBuffer[nextTmpPos];
+        nextTmpPos += params.stackSize;
     }
 
     if constexpr (highPrecision) {
@@ -326,6 +413,30 @@ __aicore__ inline void FasterGeluImpl(const LocalTensor<T>& dstLocal, const Loca
 
     FasterGeluImpl<T, highPrecision, highPerformance>(dstLocal, srcLocal, sharedTmpBuffer, dataSize);
 }
+
+template <typename T, bool highPrecision = false, bool highPerformance = false>
+__aicore__ inline void FasterGeluV2Impl(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal,
+    const LocalTensor<uint8_t>& sharedTmpBuffer, const uint32_t dataSize)
+{
+    if constexpr (!highPrecision || !(IsSameType<T, half>::value)) { // default path: formula instantiated for T
+        GeluClass<T, THREE_OF_STACK_BUFFER>(dstLocal, srcLocal, sharedTmpBuffer, dataSize,
+            FastGeluV2CalcSimplified<T, highPerformance>);
+    } else { // highPrecision with T = half: formula instantiated for float
+        GeluClass<THREE_OF_STACK_BUFFER>(dstLocal, srcLocal, sharedTmpBuffer, dataSize,
+            FastGeluV2CalcSimplified<float, highPerformance>);
+    }
+}
+
+template <typename T, bool highPrecision = false, bool highPerformance = false>
+__aicore__ inline void FasterGeluV2Impl(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal,
+    const uint32_t dataSize)
+{
+    // No caller-provided workspace: obtain one with PopStackBuffer, then delegate to the sharedTmpBuffer overload.
+    LocalTensor<uint8_t> stackTmpBuffer;
+    const bool popSucceeded = PopStackBuffer<uint8_t, TPosition::LCM>(stackTmpBuffer);
+    ASCENDC_ASSERT((popSucceeded), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); });
+    FasterGeluV2Impl<T, highPrecision, highPerformance>(dstLocal, srcLocal, stackTmpBuffer, dataSize);
+}
 #pragma end_pipe
 } // namespace AscendC
-#endif // IMPL_ACTIVATION_GELU_GELU_IMPL_H
+#endif // IMPL_ACTIVATION_GELU_GELU_IMPL_H
\ No newline at end of file
diff --git a/lib/activation/gelu.h b/lib/activation/gelu.h
index c2c9d162bfdebe28c47dfe6b6bd7691a33e70a71..5a2ca428c19dc8f79c4a8dbc3a6dfd45ba6d4578 100644
--- a/lib/activation/gelu.h
+++ b/lib/activation/gelu.h
@@ -91,6 +91,49 @@ __aicore__ inline void FasterGelu(const LocalTensor<T>& dstLocal, const LocalTen
     }
     FasterGeluImpl<T, highPrecision, highPerformance>(dstLocal, srcLocal, dataSize);
 }
+
+/* !
+ * \brief fast_gelu_v2(x) = x * (sgn(x) * [(-0.1444) * (clip(|0.7071 * x|, max=1.769) - 1.769) ^ 2 + 0.5] + 0.5),
+ *        where sgn(x) is evaluated as (x + 0.000000000001) / |(x + 0.000000000001)|
+ * \param [out] dstLocal, output LocalTensor
+ * \param [in] srcLocal, input LocalTensor
+ * \param [in] sharedTmpBuffer, caller-provided uint8_t LocalTensor used as temporary space by the implementation
+ * \param [in] dataSize, number of input data elements
+ * \tparam highPrecision, when true and T is half, the formula is instantiated for float instead of half
+ * \tparam highPerformance, when true, the division in sgn(x) is done with Reciprocal followed by Mul instead of Div
+ */
+template <typename T, bool highPrecision = false, bool highPerformance = false>
+__aicore__ inline void FasterGeluV2(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal,
+    const LocalTensor<uint8_t>& sharedTmpBuffer, const uint32_t dataSize)
+{
+    if ASCEND_IS_AIC { // early-out when ASCEND_IS_AIC holds (presumably the cube core - confirm)
+        return;
+    }
+    ASCENDC_ASSERT((IsSameType<T, half>::value || IsSameType<T, float>::value), // only half and float are accepted
+        { KERNEL_LOG(KERNEL_ERROR, "FasterGeluV2 only support data type: float/half"); });
+    FasterGeluV2Impl<T, highPrecision, highPerformance>(dstLocal, srcLocal, sharedTmpBuffer, dataSize);
+}
+
+/* !
+ * \ingroup FasterGeluV2
+ * \brief FasterGeluV2 without a caller-provided temporary buffer; forwards to the three-argument FasterGeluV2Impl
+ * \param [out] dstLocal, output LocalTensor
+ * \param [in] srcLocal, input LocalTensor
+ * \param [in] dataSize, number of input data elements
+ * \tparam highPrecision, when true and T is half, the formula is instantiated for float instead of half
+ * \tparam highPerformance, when true, the division in sgn(x) is done with Reciprocal followed by Mul instead of Div
+ */
+template <typename T, bool highPrecision = false, bool highPerformance = false>
+__aicore__ inline void FasterGeluV2(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal,
+    const uint32_t dataSize)
+{
+    if ASCEND_IS_AIC { // early-out when ASCEND_IS_AIC holds (presumably the cube core - confirm)
+        return;
+    }
+    ASCENDC_ASSERT((IsSameType<T, half>::value || IsSameType<T, float>::value), // only half and float are accepted
+        { KERNEL_LOG(KERNEL_ERROR, "FasterGeluV2 only support data type: float/half"); });
+    FasterGeluV2Impl<T, highPrecision, highPerformance>(dstLocal, srcLocal, dataSize);
+}
 #pragma end_pipe
 } // namespace AscendC
-#endif // LIB_GELU_GELU_H
+#endif // LIB_GELU_GELU_H
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c07def0ce3448aab315f679d7f5adfb2189592ba..6387a70cadc3875ce67c4ef60d96486864b6145e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -55,6 +55,7 @@ file(GLOB ASCENDC_TEST_ascend310p_CASE_SRC_FILES
 file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES
     ${ASCENDC_TESTS_DIR}/activation/geglu/test_operator_geglu.cpp
     ${ASCENDC_TESTS_DIR}/activation/gelu/test_operator_fast_gelu.cpp
+    ${ASCENDC_TESTS_DIR}/activation/gelu/test_operator_fast_gelu_v2.cpp
     ${ASCENDC_TESTS_DIR}/activation/reglu/test_operator_reglu.cpp
     ${ASCENDC_TESTS_DIR}/activation/sigmoid/test_operator_vec_sigmoid.cpp
     ${ASCENDC_TESTS_DIR}/activation/silu/test_operator_silu.cpp
diff --git a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..18ad3098f307fcd1e0acbf936f2f0e695f0e9951
--- /dev/null
+++ b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
@@ -0,0 +1,140 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <gtest/gtest.h>
+#define private public
+#define protect public
+#include "kernel_operator.h"
+
+using namespace std;
+using namespace AscendC;
+
+template <typename srcType> class KernelFastGeluV2 { // test kernel: CopyIn -> FasterGeluV2 -> CopyOut, run once
+public:
+    __aicore__ inline KernelFastGeluV2() {}
+    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t inputSize) // inputSize: element count
+    {
+        dataSize = inputSize;
+
+        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(srcGm), dataSize);
+        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dstGm), dataSize);
+
+        pipe.InitBuffer(inQueueX, 1, dataSize * sizeof(srcType)); // one buffer of dataSize elements per queue
+        pipe.InitBuffer(outQueue, 1, dataSize * sizeof(srcType));
+    }
+    __aicore__ inline void Process() // runs the three stages once, in order
+    {
+        CopyIn();
+        Compute();
+        CopyOut();
+    }
+
+private:
+    __aicore__ inline void CopyIn() // srcGlobal -> tensor allocated from inQueueX, then enqueue it
+    {
+        LocalTensor<srcType> srcLocal = inQueueX.AllocTensor<srcType>();
+
+        DataCopy(srcLocal, srcGlobal, dataSize);
+
+        inQueueX.EnQue(srcLocal);
+    }
+    __aicore__ inline void Compute() // dequeue the input, apply FasterGeluV2, enqueue the result on outQueue
+    {
+        LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>();
+
+        LocalTensor<srcType> srcLocal = inQueueX.DeQue<srcType>();
+
+        FasterGeluV2(dstLocal, srcLocal, dataSize); // overload without sharedTmpBuffer, default template flags
+
+        outQueue.EnQue<srcType>(dstLocal);
+
+        inQueueX.FreeTensor(srcLocal); // input tensor is released only after the result has been enqueued
+    }
+    __aicore__ inline void CopyOut() // dequeued result -> dstGlobal, then release the tensor
+    {
+        LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>();
+        DataCopy(dstGlobal, dstLocal, dataSize);
+        outQueue.FreeTensor(dstLocal);
+    }
+
+private:
+    GlobalTensor<srcType> srcGlobal; // bound to srcGm in Init
+    GlobalTensor<srcType> dstGlobal; // bound to dstGm in Init
+
+    TPipe pipe;
+    TQue<QuePosition::VECIN, 1> inQueueX; // carries the input tensor from CopyIn to Compute
+    TQue<QuePosition::VECOUT, 1> outQueue; // carries the result tensor from Compute to CopyOut
+
+    uint32_t dataSize = 0; // number of elements processed; set in Init
+};
+
+template <typename dataType>
+__aicore__ void main_FastGeluV2_test(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t dataSize)
+{
+    KernelFastGeluV2<dataType> kernel; // kernel entry point used by the parameterized test table
+    kernel.Init(srcGm, dstGm, dataSize);
+    kernel.Process();
+}
+
+struct FastGeluV2TestParams { // one row of the parameterized test table
+    uint32_t dataSize; // number of elements passed to the kernel
+    uint32_t dataTypeSize; // size of one element in bytes (sizeof(half) or sizeof(float) in the table)
+    void (*calFunc)(uint8_t*, uint8_t*, uint32_t); // kernel entry point: (input buffer, output buffer, dataSize)
+};
+
+class FastGeluV2TestSuite : public testing::Test, public testing::WithParamInterface<FastGeluV2TestParams> {
+protected:
+    void SetUp() override {} // per-test hooks do nothing; only the suite-level hooks below log
+    void TearDown() override {}
+    static void SetUpTestCase()
+    {
+        std::cout << "FastGeluV2TestSuite SetUpTestCase" << std::endl;
+    }
+    static void TearDownTestCase()
+    {
+        std::cout << "FastGeluV2TestSuite TearDownTestCase" << std::endl;
+    }
+};
+
+INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_FastGeluV2, FastGeluV2TestSuite, // one test instance per row below
+    ::testing::Values( // row layout: { dataSize (elements), dataTypeSize (bytes), kernel entry point }
+    FastGeluV2TestParams { 32,       sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 64,       sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 256,      sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 512,      sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 1024,     sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 2048,     sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 4096,     sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 8192,     sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 16384,    sizeof(half),  main_FastGeluV2_test<half> },
+    FastGeluV2TestParams { 32768,    sizeof(half),  main_FastGeluV2_test<half> }, // largest half case
+    FastGeluV2TestParams { 32,       sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 64,       sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 256,      sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 512,      sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 1024,     sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 2048,     sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 4096,     sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 8192,     sizeof(float), main_FastGeluV2_test<float> },
+    FastGeluV2TestParams { 16384,    sizeof(float), main_FastGeluV2_test<float> } // largest float case
+));
+
+TEST_P(FastGeluV2TestSuite, FastGeluV2TestCase)
+{
+    const auto param = GetParam();
+    const uint32_t byteCount = param.dataSize * param.dataTypeSize; // buffers are sized in bytes, not elements
+    uint8_t inputGm[byteCount] { 0x00 }; // all-zero input
+    uint8_t outputGm[byteCount] { 0x00 };
+
+    param.calFunc(inputGm, outputGm, param.dataSize);
+
+    // Check every output byte: iterating only up to dataSize would skip most of a half/float buffer.
+    for (uint32_t i = 0; i < byteCount; i++) {
+        EXPECT_EQ(outputGm[i], 0x00);
+    }
+}
\ No newline at end of file