diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp index df072c8e677da1757641fdc125991ab5154ddb6e..0d3d3c29726b51b06f930c2276c21704fa996cf5 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp @@ -33,66 +33,72 @@ public: this->tileLength = tiling.tileLength / BUFFER_NUM; this->lastTileLength = tiling.lastTileLength; - xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + uint64_t offset = this->blockLength * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + offset, this->blockLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + offset, this->blockLength); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + offset, this->blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { this->tileNum = tiling.formerTileNum; this->tileLength = tiling.formerTileLength / BUFFER_NUM; this->lastTileLength = tiling.formerLastTileLength; - xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + uint64_t offset = tiling.formerLength * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + offset, tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + offset, tiling.formerLength); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + offset, tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; this->tileLength = tiling.tailTileLength / BUFFER_NUM; this->lastTileLength = tiling.tailLastTileLength; - xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + uint64_t offset = tiling.formerLength * tiling.formerNum + tiling.tailLength * + (AscendC::GetBlockIdx() - tiling.formerNum); + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + offset, tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + offset, tiling.tailLength); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + offset, tiling.tailLength); } } - pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); - pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); + this->initBufferLength = AscendC::Std::max(this->tileLength, this->lastTileLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->initBufferLength * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->initBufferLength * sizeof(bfloat16_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->initBufferLength * sizeof(bfloat16_t)); + + pipe.InitBuffer(tmpBuf0, this->initBufferLength * sizeof(float)); + pipe.InitBuffer(tmpBuf1, this->initBufferLength * sizeof(float)); } __aicore__ inline void Process() { - int32_t loopCount = this->tileNum * BUFFER_NUM; - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); + // 整块进行double buffer计算 + uint32_t loopCount = this->tileNum * BUFFER_NUM; + for (uint32_t i = 0; i < loopCount; i++) { + CopyIn(i, this->tileLength); + Compute(i, this->tileLength); + CopyOut(i, this->tileLength); + } + + // 进行尾块计算, 不做double buffer操作 + if (this->lastTileLength > 0) { + CopyIn(loopCount, this->lastTileLength); + Compute(loopCount, this->lastTileLength); + CopyOut(loopCount, this->lastTileLength); } } private: - __aicore__ inline void CopyIn(int32_t progress) + __aicore__ inline void CopyIn(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - if (progress == (this->tileNum * BUFFER_NUM - 1)) { - AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - this->tileLength); - AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - this->tileLength); - } else { - AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); - AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); - } + + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], tileLength); + inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } - __aicore__ inline void Compute(int32_t progress) + __aicore__ inline void Compute(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor xLocal = inQueueX.DeQue(); AscendC::LocalTensor yLocal = inQueueY.DeQue(); @@ -101,25 +107,20 @@ private: AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); - AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, tileLength); - AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); - AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, tileLength); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); } - __aicore__ inline void CopyOut(int32_t progress) + __aicore__ inline void CopyOut(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - if (progress == (this->tileNum * BUFFER_NUM - 1)) { - AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, - this->tileLength); - } else { - AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); - } + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, tileLength); outQueueZ.FreeTensor(zLocal); } @@ -135,6 +136,7 @@ private: AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t initBufferLength; // initBuffer所用长度 uint32_t blockLength; uint32_t tileNum; uint32_t tileLength; @@ -152,66 +154,72 @@ public: this->tileLength = tiling.tileLength / BUFFER_NUM; this->lastTileLength = tiling.lastTileLength; - xGm.SetGlobalBuffer((__gm__ int8_t *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - zGm.SetGlobalBuffer((__gm__ int8_t *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + uint64_t offset = this->blockLength * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ int8_t *)x + offset, this->blockLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)y + offset, this->blockLength); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + offset, this->blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { this->tileNum = tiling.formerTileNum; this->tileLength = tiling.formerTileLength / BUFFER_NUM; this->lastTileLength = tiling.formerLastTileLength; - xGm.SetGlobalBuffer((__gm__ int8_t *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + uint64_t offset = tiling.formerLength * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ int8_t *)x + offset, tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)y + offset, tiling.formerLength); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + offset, tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; this->tileLength = tiling.tailTileLength / BUFFER_NUM; this->lastTileLength = tiling.tailLastTileLength; - xGm.SetGlobalBuffer((__gm__ int8_t *)x + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)y + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + uint64_t offset = tiling.formerLength * tiling.formerNum + tiling.tailLength * + (AscendC::GetBlockIdx() - tiling.formerNum); + xGm.SetGlobalBuffer((__gm__ int8_t *)x + offset, tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)y + offset, tiling.tailLength); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + offset, tiling.tailLength); } } - pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); - pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); + this->initBufferLength = AscendC::Std::max(this->tileLength, this->lastTileLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->initBufferLength * sizeof(int8_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->initBufferLength * sizeof(int8_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->initBufferLength * sizeof(int8_t)); + + pipe.InitBuffer(tmpBuf0, this->initBufferLength * sizeof(half)); + pipe.InitBuffer(tmpBuf1, this->initBufferLength * sizeof(half)); } __aicore__ inline void Process() { - int32_t loopCount = this->tileNum * BUFFER_NUM; - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); + // 整块进行double buffer计算 + uint32_t loopCount = this->tileNum * BUFFER_NUM; + for (uint32_t i = 0; i < loopCount; i++) { + CopyIn(i, this->tileLength); + Compute(i, this->tileLength); + CopyOut(i, this->tileLength); + } + + // 进行尾块计算, 不做double buffer操作 + if (this->lastTileLength > 0) { + CopyIn(loopCount, this->lastTileLength); + Compute(loopCount, this->lastTileLength); + CopyOut(loopCount, this->lastTileLength); } } private: - __aicore__ inline void CopyIn(int32_t progress) + __aicore__ inline void CopyIn(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - if (progress == (this->tileNum * BUFFER_NUM - 1)) { - AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - this->tileLength); - AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - this->tileLength); - } else { - AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); - AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); - } + + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], tileLength); + inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } - __aicore__ inline void Compute(int32_t progress) + __aicore__ inline void Compute(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor xLocal = inQueueX.DeQue(); AscendC::LocalTensor yLocal = inQueueY.DeQue(); @@ -220,25 +228,20 @@ private: AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); - AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, tileLength); - AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); - AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, tileLength); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); } - __aicore__ inline void CopyOut(int32_t progress) + __aicore__ inline void CopyOut(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - if (progress == (this->tileNum * BUFFER_NUM - 1)) { - AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, - this->tileLength); - } else { - AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); - } + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, tileLength); outQueueZ.FreeTensor(zLocal); } @@ -254,6 +257,7 @@ private: AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t initBufferLength; // initBuffer所用长度 uint32_t blockLength; uint32_t tileNum; uint32_t tileLength; @@ -271,83 +275,83 @@ public: this->tileLength = tiling.tileLength / BUFFER_NUM; this->lastTileLength = tiling.lastTileLength; - xGm.SetGlobalBuffer((__gm__ dataType *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - yGm.SetGlobalBuffer((__gm__ dataType *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - zGm.SetGlobalBuffer((__gm__ dataType *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + uint64_t offset = this->blockLength * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ dataType *)x + offset, this->blockLength); + yGm.SetGlobalBuffer((__gm__ dataType *)y + offset, this->blockLength); + zGm.SetGlobalBuffer((__gm__ dataType *)z + offset, this->blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { this->tileNum = tiling.formerTileNum; this->tileLength = tiling.formerTileLength / BUFFER_NUM; this->lastTileLength = tiling.formerLastTileLength; - xGm.SetGlobalBuffer((__gm__ dataType *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ dataType *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + uint64_t offset = tiling.formerLength * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ dataType *)x + offset, tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ dataType *)y + offset, tiling.formerLength); + zGm.SetGlobalBuffer((__gm__ dataType *)z + offset, tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; this->tileLength = tiling.tailTileLength / BUFFER_NUM; this->lastTileLength = tiling.tailLastTileLength; - xGm.SetGlobalBuffer((__gm__ dataType *)x + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ dataType *)y + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + - tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + uint64_t offset = tiling.formerLength * tiling.formerNum + tiling.tailLength * + (AscendC::GetBlockIdx() - tiling.formerNum); + xGm.SetGlobalBuffer((__gm__ dataType *)x + offset, tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ dataType *)y + offset, tiling.tailLength); + zGm.SetGlobalBuffer((__gm__ dataType *)z + offset, tiling.tailLength); } } - pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(dataType)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); + this->initBufferLength = AscendC::Std::max(this->tileLength, this->lastTileLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->initBufferLength * sizeof(dataType)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->initBufferLength * sizeof(dataType)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->initBufferLength * sizeof(dataType)); } __aicore__ inline void Process() { - int32_t loopCount = this->tileNum * BUFFER_NUM; - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); + // 整块进行double buffer计算 + uint32_t loopCount = this->tileNum * BUFFER_NUM; + for (uint32_t i = 0; i < loopCount; i++) { + CopyIn(i, this->tileLength); + Compute(i, this->tileLength); + CopyOut(i, this->tileLength); + } + + // 进行尾块计算, 不做double buffer操作 + if (this->lastTileLength > 0) { + CopyIn(loopCount, this->lastTileLength); + Compute(loopCount, this->lastTileLength); + CopyOut(loopCount, this->lastTileLength); } } private: - __aicore__ inline void CopyIn(int32_t progress) + __aicore__ inline void CopyIn(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - if (progress == (this->tileNum * BUFFER_NUM - 1)) { - AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - this->tileLength); - AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - this->tileLength); - } else { - AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); - AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); - } + + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], tileLength); + inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } - __aicore__ inline void Compute(int32_t progress) + __aicore__ inline void Compute(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor xLocal = inQueueX.DeQue(); AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + AscendC::Add(zLocal, xLocal, yLocal, tileLength); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); } - __aicore__ inline void CopyOut(int32_t progress) + __aicore__ inline void CopyOut(uint32_t progress, uint32_t tileLength) { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - if (progress == (this->tileNum * BUFFER_NUM - 1)) { - AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, - this->tileLength); - } else { - AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); - } + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); outQueueZ.FreeTensor(zLocal); } @@ -361,12 +365,13 @@ private: AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t initBufferLength; // initBuffer所用长度 uint32_t blockLength; uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; }; - + extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) { if (tiling.dataType == ADD_BFLOAT16) { diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp index 8940940b055a4d85e9e0da7398263337bfbe7ab5..1c951d7e45d5e683eb968995b314b2592888284f 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp @@ -15,36 +15,35 @@ constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; constexpr uint32_t BLOCK_SIZE = 32; constexpr uint32_t BUFFER_NUM = 2; -constexpr uint32_t UB_BLOCK_NUM = 100; // UB最大可以使用的block数量 -constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; +constexpr uint32_t UB_BLOCK_NUM = 100; // UB最大可以使用的block数量 +constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; // 每个buffer可用block数量 // tiling参数计算函数 -void TilingParamsCalc(uint32_t length, uint32_t alignNum, - uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength) +// length: 每个核的数据长度 +void TilingParamsCalc(uint32_t length, uint32_t alignNum, uint32_t& tileNum, uint32_t& tileLength, + uint32_t& lastTileLength) { assert(alignNum != 0U); tileNum = length / (alignNum * MAX_AVAILABLE_UB_BLOCK_NUM); - if ((static_cast(length / alignNum) % MAX_AVAILABLE_UB_BLOCK_NUM == 0U) || tileNum == 0U) { - if (tileNum == 0U) { - tileNum = 1U; - } - if (length < MAX_AVAILABLE_UB_BLOCK_NUM * alignNum) { - tileLength = ((static_cast(length) + alignNum - 1) / alignNum) * alignNum; - lastTileLength = tileLength; - } else { - tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum; - lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength); - } + + // 单核需要计算的长度 < 单核UB最大一次可计算长度 -> 仅有尾块 + if (tileNum == 0U) { + tileLength = 0; + lastTileLength = ((static_cast(length) + alignNum - 1) / alignNum) * alignNum; + } else if (static_cast(length / alignNum) % MAX_AVAILABLE_UB_BLOCK_NUM == 0U) { + // 单核需要计算的长度 = 单核UB最大一次可计算长度 的整数倍 -> 仅有整块 + tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum; + lastTileLength = 0; } else { - tileNum++; + // 有整块 + 尾块 tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum; - lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength); + lastTileLength = (uint32_t)(length - tileNum* tileLength); } } void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) { - uint32_t totalLength; + uint32_t totalLength; // 总共要计算的元素个数 uint32_t dataTypeSize; uint32_t blockLength; uint32_t totalLengthAligned; @@ -53,7 +52,7 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) totalLength = tiling->totalLength; dataTypeSize = DATA_TYPE_SIZE[tiling->dataType]; - uint32_t alignNum = BLOCK_SIZE / dataTypeSize; + uint32_t alignNum = BLOCK_SIZE / dataTypeSize; // 一个block中的元素个数 assert((alignNum != 0U) && (blockDim != 0U)); /** 计算使用的核数 **/ /* 如果传入数据的长度非32B对齐, 计算对齐后的长度*/ @@ -90,10 +89,8 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) uint32_t tailTileLength; uint32_t tailLastTileLength; - TilingParamsCalc(formerLength, alignNum, - formerTileNum, formerTileLength, formerLastTileLength); - TilingParamsCalc(tailLength, alignNum, - tailTileNum, tailTileLength, tailLastTileLength); + TilingParamsCalc(formerLength, alignNum, formerTileNum, formerTileLength, formerLastTileLength); + TilingParamsCalc(tailLength, alignNum, tailTileNum, tailTileLength, tailLastTileLength); tiling->formerNum = formerNum; tiling->formerLength = formerLength; @@ -106,6 +103,7 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) tiling->tailTileNum = tailTileNum; tiling->tailTileLength = tailTileLength; tiling->tailLastTileLength = tailLastTileLength; + tiling->isEvenCore = 0U; } } \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h index 7b82c3377b1a8a51bb93cd5d0e5ee3f61a0594f2..441a56757b400685672a0abb0f1cd588d23f87cb 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h @@ -14,23 +14,24 @@ struct AddCustomTilingData { uint32_t totalLength; uint32_t dataType; + // 核间可均分数据块时,主块的参数 + uint32_t blockLength; // 每个核要计算的数据总长度 + uint32_t tileNum; // 数据整块数 + uint32_t tileLength; // 数据整块的元素个数 + uint32_t lastTileLength; // 数据尾块的元素个数 - uint32_t blockLength; - uint32_t tileNum; - uint32_t tileLength; - uint32_t lastTileLength; - - uint32_t formerNum; + // 核间不可均分数据块时,主块的参数 + uint32_t formerNum; // 主块核数 uint32_t formerLength; uint32_t formerTileNum; uint32_t formerTileLength; uint32_t formerLastTileLength; - - uint32_t tailNum; + // 核间不可均分数据块时,尾块的参数 + uint32_t tailNum; // 尾块核数 uint32_t tailLength; uint32_t tailTileNum; uint32_t tailTileLength; - uint32_t tailLastTileLength; + uint32_t tailLastTileLength; uint32_t isEvenCore; };