diff --git a/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h b/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h index ab84dacd080be69741c07879579d5e65a45a4567..32602ded5789811878fbbb417c53803f91783e8f 100644 --- a/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h +++ b/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h @@ -21,12 +21,12 @@ struct VecTiling { template class KernelFasterGelu { public: __aicore__ inline KernelFasterGelu() {} - __aicore__ inline void Init(GM_ADDR src_gm, GM_ADDR dst_gm, uint32_t inputSize) + __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t inputSize) { dataSize = inputSize; - srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(src_gm), dataSize); - dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dst_gm), dataSize); + srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(srcGm), dataSize); + dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dstGm), dataSize); pipe.InitBuffer(inQueueX, 1, dataSize * sizeof(srcType)); pipe.InitBuffer(outQueue, 1, dataSize * sizeof(srcType)); @@ -50,8 +50,8 @@ private: AscendC::LocalTensor dstLocal = outQueue.AllocTensor(); AscendC::LocalTensor srcLocal = inQueueX.DeQue(); AscendC::FasterGelu(dstLocal, srcLocal, dataSize); - // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //开启高精度模式 - // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //开启高性能模式 + // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //Enable high precision mode + // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //Enable high performance mode outQueue.EnQue(dstLocal); inQueueX.FreeTensor(srcLocal); } diff --git a/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h b/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h index 13ac35cf69c482e751120e8fc2a94aa76a30f00f..6a91d8caeefcfc9b11f4f18f1df1973262df4ad9 100644 --- 
a/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h +++ b/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h @@ -38,7 +38,7 @@ void ComputeTiling(uint32_t rnLength, uint32_t abLength, uint32_t head, uint32_t ge::Shape srcShape(shapeVec); uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesize = 4; // float类型 + uint32_t dtypesize = 4; // float type tiling.set_rnLength(rnLength); tiling.set_abLength(abLength); diff --git a/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h index ef4335a3f3c62e6f66b142877586f84db95a22fc..b23571d0292db785e0fb021cc61fc7f3d223e188 100644 --- a/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h +++ b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h @@ -36,8 +36,8 @@ void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t ab ge::Shape srcShape(shapeVec); uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesizeT = 2; // half类型 - uint32_t dtypesizeU = 4; // float类型 + uint32_t dtypesizeT = 2; // half type + uint32_t dtypesizeU = 4; // float type tiling.set_inplace(inplace); tiling.set_nLength(nLength); diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h index ef4335a3f3c62e6f66b142877586f84db95a22fc..b23571d0292db785e0fb021cc61fc7f3d223e188 100644 --- a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h @@ -36,8 +36,8 @@ void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t ab ge::Shape 
srcShape(shapeVec); uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesizeT = 2; // half类型 - uint32_t dtypesizeU = 4; // float类型 + uint32_t dtypesizeT = 2; // half type + uint32_t dtypesizeU = 4; // float type tiling.set_inplace(inplace); tiling.set_nLength(nLength); diff --git a/examples/reduce/sum/main.cpp b/examples/reduce/sum/main.cpp index c3ec5710ebf2f08f98f79c432de8b1fde70ac2b6..4d7da545eea0765453af7ec99b06999303d4f4de 100644 --- a/examples/reduce/sum/main.cpp +++ b/examples/reduce/sum/main.cpp @@ -26,7 +26,7 @@ constexpr uint32_t M = 7; // outter constexpr uint32_t N = 2023; // inner_actual } -extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N); +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n); static bool CompareResult(const void *outputData, uint32_t outSize) { void *goldenData; diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp index ebfc06e6f61fb18690aeb1a76d85324a239a4c99..5e3cbb4c4057a28c650e8060b7c2e25a6a594157 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.cpp +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -16,25 +16,25 @@ namespace { constexpr uint32_t PADDING_BYTE = 32U; } -void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N) { +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n) { uint32_t minValue = 0; uint32_t maxValue = 0; - AscendC::GetSumMaxMinTmpSize(N, sizeof(uint32_t), false, maxValue, minValue); + AscendC::GetSumMaxMinTmpSize(n, sizeof(uint32_t), false, maxValue, minValue); SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); - auto paddingFunc = [](const uint32_t n, const uint32_t typeSize) -> uint32_t { + auto paddingFunc = [](const uint32_t n1, const uint32_t typeSize) -> uint32_t { if (typeSize == 0) { return 0; } - return (n * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; + 
return (n1 * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; }; - tiling->outter = M; - tiling->inner = paddingFunc(N, sizeof(uint32_t)); - tiling->n = N; + tiling->outter = m; + tiling->inner = paddingFunc(n, sizeof(uint32_t)); + tiling->n = n; tiling->tmpBufSize = minValue; - tiling->out_inner = paddingFunc(M, sizeof(uint32_t)); + tiling->out_inner = paddingFunc(m, sizeof(uint32_t)); } \ No newline at end of file diff --git a/examples/sort/topk/kernel_impl/topk_custom.h b/examples/sort/topk/kernel_impl/topk_custom.h index ad24c567cdd9c19c51afbe6b939d0e593c18469c..7b0a1883d617ca906319324c0f98fe935d06dfc1 100644 --- a/examples/sort/topk/kernel_impl/topk_custom.h +++ b/examples/sort/topk/kernel_impl/topk_custom.h @@ -47,17 +47,17 @@ public: tmplocalBytes = tilingData.minsize; topKTilingData = tilingData.topKTilingData; k = tilingData.k; - // 计算k_pad + // calculate kPad if (sizeof(T) == sizeof(float)) { - k_pad = (k + K_FLOAT - 1) / K_FLOAT * K_FLOAT; + kPad = (k + K_FLOAT - 1) / K_FLOAT * K_FLOAT; } else { - k_pad = (k + K_HALF - 1) / K_HALF * K_HALF; + kPad = (k + K_HALF - 1) / K_HALF * K_HALF; } - kpad_index = (k + K_FLOAT) / K_FLOAT * K_FLOAT; + kPadIndex = (k + K_FLOAT) / K_FLOAT * K_FLOAT; isLargest = tilingData.isLargest; inDataSize = inner * outter; - outValueDataSize = k_pad * outter; - outIndexDataSize = kpad_index * outter; + outValueDataSize = kPad * outter; + outIndexDataSize = kPadIndex * outter; inputdexDataSize = inner; if (topkMode == true) { @@ -189,8 +189,8 @@ private: uint32_t outValueDataSize = 0; uint32_t outIndexDataSize = 0; uint32_t k; - uint32_t k_pad; - uint32_t kpad_index; + uint32_t kPad; + uint32_t kPadIndex; bool isLargest = true; TopkTiling topKTilingData; uint32_t outter; diff --git a/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp b/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp index 
cd5af26c34ebb90e04719bc1ac31300a26cfb644..c2239d979ce8c0d007c4198b6dc760c427626976 100644 --- a/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp +++ b/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp @@ -29,7 +29,7 @@ uint8_t* GenerateTiling(uint32_t k, uint32_t outter, uint32_t inner, uint32_t n, uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesize = 4; // float类型 + uint32_t dtypesize = 4; // float type platform_ascendc::PlatformAscendC* ascendcPlatform; if (socVersion != nullptr) { diff --git a/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h b/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h index 21ca183c5db901e43410e57e0f15450fa03250ea..bb57df5ff5bb1e40517eeb63d9ff45e8fee65132 100644 --- a/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h +++ b/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h @@ -30,7 +30,7 @@ public: // init zGm value AscendC::InitGlobalMemory(zGm, INIT_SIZE, (float)(AscendC::GetBlockIdx())); - //需要插MTE2等MTE3的同步 + // sync where MTE2 waits for MTE3 is required AscendC::TEventID eventIdMTE3ToMTE2 = GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2); AscendC::SetFlag(eventIdMTE3ToMTE2); AscendC::WaitFlag(eventIdMTE3ToMTE2); diff --git a/lib/quantization/ascend_antiquant.h b/lib/quantization/ascend_antiquant.h index 0275ae1ea922231928ab9c8ba8c3d679d72c2968..c1c338f9f666c285e4bed5e03f64a30080caefb3 100644 --- a/lib/quantization/ascend_antiquant.h +++ b/lib/quantization/ascend_antiquant.h @@ -33,12 +33,12 @@ namespace AscendC { template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, const LocalTensor &offset, const LocalTensor &scale, - const LocalTensor &sharedTmpBuffer, const uint32_t K, const AntiQuantShapeInfo& shapeInfo = {}) + const LocalTensor &sharedTmpBuffer, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; 
} - AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, K, + AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, k, shapeInfo); } @@ -54,13 +54,13 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const LocalTensor &scale, const LocalTensor &sharedTmpBuffer, const uint32_t K, + const LocalTensor &scale, const LocalTensor &sharedTmpBuffer, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, K, shapeInfo); + AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, k, shapeInfo); } /* ! @@ -75,13 +75,13 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const LocalTensor &offset, const LocalTensor &scale, const uint32_t K, + const LocalTensor &offset, const LocalTensor &scale, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, K, shapeInfo); + AscendAntiQuantImpl(dst, src, offset, scale, k, shapeInfo); } /* ! 
@@ -98,12 +98,12 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, const OutputDataType offset, const OutputDataType scale, const LocalTensor &sharedTmpBuffer, - const uint32_t K, const AntiQuantShapeInfo& shapeInfo = {}) + const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, K, + AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, k, shapeInfo); } @@ -119,13 +119,13 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const OutputDataType scale, const LocalTensor &sharedTmpBuffer, const uint32_t K, + const OutputDataType scale, const LocalTensor &sharedTmpBuffer, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, K, shapeInfo); + AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, k, shapeInfo); } /* ! 
@@ -140,12 +140,12 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const OutputDataType offset, const OutputDataType scale, const uint32_t K, const AntiQuantShapeInfo& shapeInfo = {}) + const OutputDataType offset, const OutputDataType scale, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, K, shapeInfo); + AscendAntiQuantImpl(dst, src, offset, scale, k, shapeInfo); } #pragma end_pipe } // namespace AscendC diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp index 7dd522b972f24d7d4f178682dd2f454e4547dbcb..4fa3732261307dad969bd45d01a14ed69ceabfbb 100644 --- a/tests/normalization/groupnorm/test_operator_groupnorm.cpp +++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp @@ -70,11 +70,12 @@ __aicore__ inline void GetGroupNormNDTillingInfo(const ShapeInfo& inputShapeInfo tiling.tmpBufSize = stackBufferSize / ONE_BLK_SIZE * ONE_BLK_SIZE / B32_BYTE_SIZE; tiling.oneTmpSize = (tiling.tmpBufSize - meanVarTotalSize) / tiling.numberOfTmpBuf; - // 为了使 MeanVarTensor 可以直接使用 Add 而不需使用 GetValue, 需保证每个迭代至少有8的整数倍组 group + // to enable MeanVarTensor to directly use Add without need to use GetValue + // it is necessary to ensure that each iteration has at least 8 integer multiples of groups tiling.bsCurLength = tiling.oneTmpSize / (GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION * tiling.d * tiling.hwAlignSize) * GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION; - // 判断是否满足 smallShape 计算 + // determine whether the condition for smallShape is met uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE; while ((tiling.dhwAlignSize / (ONE_BLK_SIZE / B32_BYTE_SIZE)) % k != 0) { k--; @@ -82,7 +83,9 @@ __aicore__ inline void GetGroupNormNDTillingInfo(const ShapeInfo& inputShapeInfo tiling.smallShape = 
(tiling.hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) && (tiling.hwAlignSize * tiling.d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k); - // ReduceSum0级接口带来的约束, 根据DHW计算2次 ReduceSum 的 mask/repeat, 以及 DHW/bsCurLength 取值范围 + // the constraints introduced by the ReduceSum0 interface + // based on DHW, calculate the mask/repeat for 2 ReduceSum operations, + // as well as the value range of DHW/bsCurLength if (tiling.smallShape) { uint32_t mask1{GROUPNORM_MAX_MASK_VAL}; if (tiling.dhwAlignSize > GROUPNORM_MAX_MASK_VAL) { diff --git a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp index c5b08ec14784671fd8c33d8ef78da54de7f2366e..c937fbfa5f8d1c0f7d673e5475ab54e202a067ea 100644 --- a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp +++ b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp @@ -210,9 +210,9 @@ protected: {} }; -// 1、有尾块; -// 2、有counts; -// 3、 有buffer约束; +// 1. with tail block; +// 2. with counts; +// 3. with buffer constraint INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_WelfordFinalize, WelfordFinalizeTestSuite, ::testing::Values( WelfordFinalizeTestParams { 4, 32, 4, 32, 4, 0, kernel_WelfordFinalize_test }, // !1 + !2 + !3 diff --git a/tests/reduce/reduce_all/test_operator_reduce_all.cpp b/tests/reduce/reduce_all/test_operator_reduce_all.cpp index 4cf976717de46fba8522ea3d742ad146c69a49ff..bd570304afe638cb24a5130ac5f5d5ae25b867d9 100644 --- a/tests/reduce/reduce_all/test_operator_reduce_all.cpp +++ b/tests/reduce/reduce_all/test_operator_reduce_all.cpp @@ -180,7 +180,7 @@ TEST_P(ReduceAllTestsuite, ReduceAllOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? 
first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_any/test_operator_reduce_any.cpp b/tests/reduce/reduce_any/test_operator_reduce_any.cpp index bf4335fc49f223d4f3aa82f149284a8b2107ad25..9a09028e21e47fb816a508781e92820d5b8b4525 100644 --- a/tests/reduce/reduce_any/test_operator_reduce_any.cpp +++ b/tests/reduce/reduce_any/test_operator_reduce_any.cpp @@ -179,7 +179,7 @@ TEST_P(ReduceAnyTestsuite, ReduceAnyOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_max/test_operator_reduce_max.cpp b/tests/reduce/reduce_max/test_operator_reduce_max.cpp index 2589174154d069d038592eeb7b435bd8afd904ee..8d8ae90cd2528e939ea80808d64c14f1e4b85391 100644 --- a/tests/reduce/reduce_max/test_operator_reduce_max.cpp +++ b/tests/reduce/reduce_max/test_operator_reduce_max.cpp @@ -172,7 +172,7 @@ TEST_P(MaxTestsuite, MaxOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? 
first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp b/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp index 5d7ad967f5951b1bc8ca6519053d6fd79af25b88..b0dca21cd0dfc6edbaef054681adf90fa13ff937 100644 --- a/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp +++ b/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp @@ -159,7 +159,7 @@ TEST_P(ReduceMeanTestsuite, ReduceMeanOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_min/test_operator_reduce_min.cpp b/tests/reduce/reduce_min/test_operator_reduce_min.cpp index 6d0787df2d2f23917b0a7775f6e32b36472deb7d..7137cd17dea4849e761b53f3f37a5dcaa7108e89 100644 --- a/tests/reduce/reduce_min/test_operator_reduce_min.cpp +++ b/tests/reduce/reduce_min/test_operator_reduce_min.cpp @@ -172,7 +172,7 @@ TEST_P(MinTestsuite, MinOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? 
first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp b/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp index d27778c963c3ca9c1fa074710b4603ea06d32443..80079e0a397fb0af97d581c24ff021b7439c88f4 100644 --- a/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp +++ b/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp @@ -136,7 +136,7 @@ TEST_P(ProdTestsuite, ProdOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp b/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp index 2b0927cdbb6c0801226a7b7583b60b577e094212..e0e418eee4af9cedb6d4e65934927f2f4d73e8de 100644 --- a/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp +++ b/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp @@ -163,7 +163,7 @@ TEST_P(ReduceSumTestsuite, ReduceSumOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? 
first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/sum/test_operator_sum.cpp b/tests/reduce/sum/test_operator_sum.cpp index 015bcf708c1472e8d0cf839e5c08c45d6eeec7f3..5a9dd78ce47b4fe6d8f2f1a3c0bef1e41e4979a1 100644 --- a/tests/reduce/sum/test_operator_sum.cpp +++ b/tests/reduce/sum/test_operator_sum.cpp @@ -31,10 +31,10 @@ public: src1Global.SetGlobalBuffer((__gm__ T*)src0Gm); dstGlobal.SetGlobalBuffer((__gm__ T*)dstGm); pipe.InitBuffer(inQueueSrc1, 1, 8 * 160 * sizeof(T)); - pipe.InitBuffer(outQueueDst, 1, ONE_BLK_SIZE); // 8个数整体对齐 - int32_t repeatTimes = (160 + elementNumPerRep - 1) / elementNumPerRep; // workSize = repeatTimes向上取整 + pipe.InitBuffer(outQueueDst, 1, ONE_BLK_SIZE); // align the 8 number as a whole + int32_t repeatTimes = (160 + elementNumPerRep - 1) / elementNumPerRep; // workSize = ceil(repeatTimes) int32_t finalWorkSize = (repeatTimes + elementNumPerBlk - 1) / elementNumPerBlk * elementNumPerBlk * sizeof(T); - pipe.InitBuffer(workQueue, 1, finalWorkSize); // 向上取整 + pipe.InitBuffer(workQueue, 1, finalWorkSize); // round up } __aicore__ inline void Process() @@ -59,7 +59,7 @@ private: LocalTensor workLocal = workQueue.AllocTensor(); LocalTensor dstLocal = outQueueDst.AllocTensor(); - SumParams params {8, 160, 152}; // n是自己填的 + SumParams params {8, 160, 152}; Sum(dstLocal, srcLocal1, workLocal, params); outQueueDst.EnQue(dstLocal); @@ -75,11 +75,13 @@ private: private: TPipe pipe; - TQue inQueueSrc1; // 用于申请临时tensor + // used for applying a temporary tensor + TQue inQueueSrc1; TQue workQueue; TQue outQueueDst; - GlobalTensor src1Global, dstGlobal; // 用于关联Gm + // used for associating GM + GlobalTensor src1Global, dstGlobal; }; } // namespace AscendC @@ -117,7 +119,7 @@ INSTANTIATE_TEST_CASE_P(TEST_OPEARATION_SUM, SumTestsuite, TEST_P(SumTestsuite, SumOpTestCase) { auto param = GetParam(); - uint8_t src0Gm[8 * 160 * param.typeSize]; // 外部保证inner是32B对齐 + 
uint8_t src0Gm[8 * 160 * param.typeSize]; // external guarantee inner is 32B aligned uint32_t dstLen = (8 * param.typeSize + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE; uint8_t dstGm[dstLen]; param.cal_func(dstGm, src0Gm); diff --git a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp index c0624cc1485f1048d5b197d97b4ee116fcdbd38e..fe6b6b58732980472efa9cbd6a66ac50790dc094 100644 --- a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp +++ b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp @@ -19,9 +19,9 @@ using namespace std; using namespace AscendC; -// 场景1 +// scene 1 namespace AscendC { -// 场景1、2: srcShape[B, A1, A2, A3] +// scene 1, 2: srcShape[B, A1, A2, A3] __aicore__ inline void GetConfusionTranspose0213TilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -32,7 +32,7 @@ __aicore__ inline void GetConfusionTranspose0213TilingInfo(const ShapeInfo srcSh uint32_t widthTiling = (srcShape.originalShape[3] + BLOCK_CUBE - 1) / BLOCK_CUBE; uint32_t alignA3 = widthTiling * BLOCK_CUBE; - // stackBuffer向 [16,16]对齐 + // stackBuffer is aligned to [16,16] uint32_t newPopSize = (stackBufferSize / CUBE_MAX_SIZE) * CUBE_MAX_SIZE; // element uint32_t newPopH = newPopSize / BLOCK_CUBE; uint32_t needSize = alignA2 * BLOCK_CUBE; @@ -62,7 +62,7 @@ __aicore__ inline void GetConfusionTranspose0213TilingInfo(const ShapeInfo srcSh tiling.param15 = mainOffset; } -// 场景3:srcShape[B, N, S, H/N] +// scene 3:srcShape[B, N, S, H/N] __aicore__ inline void GetConfusionTranspose2NZ012NTilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -103,7 +103,7 @@ __aicore__ inline void GetConfusionTranspose2NZ012NTilingInfo(const ShapeInfo sr tiling.param16 = srcBatchOffset; } -// 场景4:srcShape[B, 
N, S, H/N] +// scene 4:srcShape[B, N, S, H/N] __aicore__ inline void GetConfusionTranspose2ND012NTilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -146,7 +146,7 @@ __aicore__ inline void GetConfusionTranspose2ND012NTilingInfo(const ShapeInfo sr tiling.param17 = blockNum; } -// 场景5、6:srcShape[B, N, S, H/N] +// scene 5, 6:srcShape[B, N, S, H/N] __aicore__ inline void GetConfusionTranspose012TilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -183,7 +183,7 @@ __aicore__ inline void GetConfusionTranspose012TilingInfo(const ShapeInfo srcSha tiling.param14 = blockNum; } -// 场景7:srcShape[height, width] +// scene 7:srcShape[height, width] __aicore__ inline void GetConfusionTransposeOnlyTilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -335,7 +335,7 @@ TEST_P(ConfusionTransposeFirstTestsuite, ConfusionTransposeFirstTestCase) } } -// 场景2 +// scene 2 namespace AscendC { template class KernelConfusionTransposeSecond { @@ -458,7 +458,7 @@ TEST_P(ConfusionTransposeSecondTestsuite, ConfusionTransposeSecondTestCase) } } -// 场景3 +// scene 3 namespace AscendC { template class KernelConfusionTransposeThird { @@ -604,7 +604,7 @@ TEST_P(ConfusionTransposeThirdTestsuite, ConfusionTransposeThirdTestCase) } } -// 场景4 +// scene 4 namespace AscendC { template class KernelConfusionTransposeFourth { @@ -751,7 +751,7 @@ TEST_P(ConfusionTransposeFourthTestsuite, ConfusionTransposeFourthTestCase) } -// 场景5 +// scene 5 namespace AscendC { template class KernelConfusionTransposeFifth { @@ -897,7 +897,7 @@ TEST_P(ConfusionTransposeFifthTestsuite, ConfusionTransposeFifthTestCase) } -// 场景6 +// scene 6 namespace AscendC { template class KernelConfusionTransposeSixth { @@ -1042,7 +1042,7 @@ TEST_P(ConfusionTransposeSixthTestsuite, 
ConfusionTransposeSixthTestCase) } } -// 场景7 +// scene 7 namespace AscendC { template class KernelConfusionTransposeSeventh {