From bac9641f4f785af97d5ce194aabf2cb447b74461 Mon Sep 17 00:00:00 2001 From: chenmingkai Date: Wed, 25 Sep 2024 18:03:24 +0800 Subject: [PATCH] =?UTF-8?q?add=20auto=5Fsync=20pragma=20for=20msda=20&=20m?= =?UTF-8?q?sdagrad=20=EF=BC=88cherry=20picked=20commit=20from=20?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernels/op_kernel/ms_deform_attn_grad_high_perf.h | 11 +++++++++++ .../ops/kernels/op_kernel/ms_deform_attn_high_perf.h | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h b/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h index fc13db7b..125c079f 100644 --- a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h +++ b/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h @@ -353,6 +353,7 @@ __aicore__ inline void KernelMultiScaleDeformableAttnGradOpt(0, (1UL << embedDims_) - 1); Mul(value[pingOffset], value[pingOffset], gradOut[outOffset], MASK_PLACEHOLDER, num_points * 4, {1, 1, 1, static_cast(embedBlk_), static_cast(embedBlk_), 0}); + PipeBarrier(); for (uint32_t i = 0; i < 4; ++i) { WholeReduceSum(reducedValue[i * alignedNumPoints_], value[pingOffset + i * num_points * alignedEmbedDims_], MASK_PLACEHOLDER, num_points, 1, 1, embedBlk_); // dstRepStride Unit: 4 bytes } + PipeBarrier(); Duplicate(value[pingOffset], 0.f, MASK_PLACEHOLDER, num_points * 4, 1, embedBlk_); SetFlag(ping); ping = 1 - ping; SetVectorMask(0, 0xff); + PipeBarrier(); Mul(cornerWeight, reducedValue, cornerWeight, MASK_PLACEHOLDER, 4, {1, 1, 1, 1, 1, 1}); // [4*numPoints,] * [4*numPoints,] + PipeBarrier(); Add(cornerWeight, cornerWeight, cornerWeight[2 * alignedNumPoints_], MASK_PLACEHOLDER, 2, {1, 1, 1, 1, 1, 1}); + PipeBarrier(); Add(gradWeight[level * alignedNumPoints_], cornerWeight, cornerWeight[alignedNumPoints_], MASK_PLACEHOLDER, 1, {1, 1, 1, 1, 1, 1}); SetFlag(calEvt_); @@ -495,16 +501,21 @@ __aicore__ inline void KernelMultiScaleDeformableAttnGradOpt(valueDiff, reducedValue[3 * alignedNumPoints_], reducedValue[alignedNumPoints_], MASK_PLACEHOLDER, 2, {1, 1, 1, 1, 0, 1}); + PipeBarrier(); Sub(valueDiff[2 * alignedNumPoints_], reducedValue[2 * alignedNumPoints_], reducedValue, MASK_PLACEHOLDER, 1, {1, 1, 1, 1, 1, 0}); + PipeBarrier(); Sub(valueDiff[3 * alignedNumPoints_], reducedValue[alignedNumPoints_], reducedValue, MASK_PLACEHOLDER, 1, {1, 1, 1, 1, 1, 0}); SetVectorMask(0, 0xffffffff); Copy(reducedValue, locFloat[sx], MASK_PLACEHOLDER, 1, {1, queryBlk_, 8, 8}); + PipeBarrier(); Mul(reducedValue, reducedValue, valueDiff, MASK_PLACEHOLDER, 1, {1, 1, 1, 1, 1, 1}); + PipeBarrier(); Add(reducedValue, reducedValue, reducedValue[2 * alignedNumPoints_], MASK_PLACEHOLDER, 1, {1, 1, 1, 1, 1, 1}); + PipeBarrier(); Gather(gradLoc[level * 32], reducedValue, gatherOffset, 0, 16); SetFlag(calEvt_); WaitFlag(calEvt_); diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h b/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h index 6014308f..ab3f1c9c 100644 --- a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h +++ b/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h @@ -288,6 +288,7 @@ __aicore__ inline void KernelMultiScaleDeformableAttnOpt { uint8_t ping = 0; +#pragma bisheng auto_sync parallel for (uint32_t head = 0; head < numHeads_; ++head) { uint32_t valueOffset = (baseSrcOffset_ + head) * embedDims_; uint32_t outOffset = head * alignedEmbedDims_; @@ -338,10 +339,12 @@ __aicore__ inline void KernelMultiScaleDeformableAttnOpt // broadcast to [4*8, embedDims_] Copy(cornerWeight, weight[sx], MASK_PLACEHOLDER, 1, {1, queryBlk_, 8, 8}); + PipeBarrier(); for (uint32_t i = 0; i < 4; ++i) { Brcb(cornerWeightBrc[i * num_points * alignedEmbedDims_], cornerWeight[i * alignedNumPoints_], 1, {embedBlk_, dstRptStride_}); } + PipeBarrier(); for (uint32_t i = 1; i < embedBlk_; ++i) { Copy(cornerWeightBrc[i * B32_DATA_NUM_PER_BLOCK], cornerWeightBrc, MASK_PLACEHOLDER, 4, {embedBlk_, embedBlk_, dstRptStride_, dstRptStride_}); @@ -349,37 +352,45 @@ __aicore__ inline void KernelMultiScaleDeformableAttnOpt WaitFlag(copyEvt_); + PipeBarrier(); Mul(cornerWeightBrc, value[pingOffset], cornerWeightBrc, MASK_PLACEHOLDER, valRptTimes4_, {1, 1, 1, 8, 8, 8}); + + PipeBarrier(); Duplicate(value[pingOffset], 0.f, MASK_PLACEHOLDER, valRptTimes4_, 1, 8); SetFlag(ping); ping = 1 - ping; Add(cornerWeightBrc, cornerWeightBrc, cornerWeightBrc[2 * num_points * alignedEmbedDims_], MASK_PLACEHOLDER, valRptTimes2_, {1, 1, 1, 8, 8, 8}); + PipeBarrier(); Add(cornerWeightBrc, cornerWeightBrc, cornerWeightBrc[num_points * alignedEmbedDims_], MASK_PLACEHOLDER, valRptTimes1_, {1, 1, 1, 8, 8, 8}); SetVectorMask(0, (1UL << embedDims_) - 1); if (num_points == 8) { + PipeBarrier(); Add(cornerWeightBrc, cornerWeightBrc, cornerWeightBrc[4 * alignedEmbedDims_], MASK_PLACEHOLDER, 4, {1, 1, 1, static_cast(embedBlk_), static_cast(embedBlk_), static_cast(embedBlk_)}); } if (num_points >= 4) { + PipeBarrier(); Add(cornerWeightBrc, cornerWeightBrc, cornerWeightBrc[2 * alignedEmbedDims_], MASK_PLACEHOLDER, 2, {1, 1, 1, static_cast(embedBlk_), static_cast(embedBlk_), static_cast(embedBlk_)}); } if (num_points >= 2) { + PipeBarrier(); Add(cornerWeightBrc, cornerWeightBrc, cornerWeightBrc[alignedEmbedDims_], MASK_PLACEHOLDER, 1, {1, 1, 1, static_cast(embedBlk_), static_cast(embedBlk_), static_cast(embedBlk_)}); } if (num_points >= 1) { + PipeBarrier(); Add(output[outOffset], output[outOffset], cornerWeightBrc, MASK_PLACEHOLDER, 1, {1, 1, 1, static_cast(embedBlk_), static_cast(embedBlk_), static_cast(embedBlk_)}); -- Gitee