From edb5c704439e672d185be6839051a41adfe5fcc7 Mon Sep 17 00:00:00 2001 From: weili10 Date: Fri, 16 Jul 2021 18:19:48 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5b98f7ec21ee0d6c14be8b0bd09301?= =?UTF-8?q?d9f6e686816?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- patch/npu.patch | 275 +++++++++--------- .../src/ATen/native/native_functions.yaml | 14 +- .../src/ATen/native/npu/ArgminKernelNpu.cpp | 0 src/aten/src/ATen/native/npu/CatKernelNpu.cpp | 14 + .../native/npu/ConstantPadNdKernelNpu.cpp | 14 +- .../native/npu/EmbeddingRenormKernelNpu.cpp | 176 +++++------ .../ATen/native/npu/GiouBackwardKernelNpu.cpp | 73 +++++ .../src/ATen/native/npu/GiouKernelNpu.cpp | 87 ++++++ .../src/ATen/native/npu/IndexPutKernelNpu.cpp | 3 + src/aten/src/ATen/native/npu/MinKernelNpu.cpp | 27 +- src/aten/src/ATen/native/npu/MmKernelNpu.cpp | 11 +- .../src/ATen/native/npu/NormKernelNpu.cpp | 31 +- .../native/npu/ReflectionPad2dKernelNpu.cpp | 127 -------- .../native/npu/ReplicationPad2dKernelNpu.cpp | 0 .../native/npu/common/FormatCastHelper.cpp | 4 +- .../npu/convolution/ConvolutionKernelNpu.cpp | 24 +- .../ATen/native/npu/frame/FormatHelper.cpp | 30 -- .../src/ATen/native/npu/frame/FormatHelper.h | 2 - .../native/npu/interface/EnvVariables.cpp | 2 - .../src/ATen/native/npu/utils/CalcuOpUtil.cpp | 13 +- .../native/npu/utils/KernelNpuOutputSize.cpp | 6 - .../native/npu/utils/KernelNpuOutputSize.h | 4 - src/aten/src/ATen/utils/DumpUtils.h | 4 + src/aten/src/ATen/utils/LoadUtils.cpp | 53 +++- src/tools/autograd/derivatives.yaml | 5 +- test/test_npu/test_constant_pad_nd.py | 70 ----- test/test_npu/test_network_ops/test_abs.py | 0 test/test_npu/test_network_ops/test_add.py | 0 test/test_npu/test_network_ops/test_addmm.py | 0 test/test_npu/test_network_ops/test_all.py | 0 test/test_npu/test_network_ops/test_any.py | 0 test/test_npu/test_network_ops/test_arange.py | 0 test/test_npu/test_network_ops/test_argmax.py | 0 
.../test_avg_pool2d_backward.py | 0 .../{ => test_network_ops}/test_bilinear.py | 65 +++-- ...nary_cross_entropy_with_logits_backward.py | 0 test/test_npu/test_network_ops/test_bmm.py | 0 .../test_network_ops/test_broadcastToD.py | 0 test/test_npu/test_network_ops/test_cat.py | 0 test/test_npu/test_network_ops/test_clamp.py | 0 .../test_network_ops/test_constant_pad_nd.py | 1 - test/test_npu/test_network_ops/test_conv2d.py | 0 .../test_conv_depthwise2d_backward.py | 0 test/test_npu/test_network_ops/test_div.py | 0 .../test_npu/test_network_ops/test_dropout.py | 0 .../test_embedding_backward.py | 0 .../test_embedding_renorm.py | 6 +- test/test_npu/test_network_ops/test_exp.py | 0 test/test_npu/test_network_ops/test_fill_.py | 0 test/test_npu/test_network_ops/test_floor.py | 0 test/test_npu/test_network_ops/test_fmod.py | 0 test/test_npu/test_network_ops/test_full.py | 0 test/test_npu/test_network_ops/test_ge.py | 0 .../test_gelu_backward.py | 25 +- test/test_npu/test_network_ops/test_gt.py | 0 .../test_network_ops/test_hardtanh.py | 0 .../test_network_ops/test_index_put.py | 10 + test/test_npu/test_network_ops/test_le.py | 0 .../test_leaky_relu_backward.py | 0 test/test_npu/test_network_ops/test_log.py | 0 test/test_npu/test_network_ops/test_log2.py | 0 .../test_network_ops/test_log_softmax.py | 0 .../test_log_softmax_backward.py | 0 test/test_npu/test_network_ops/test_lt.py | 0 test/test_npu/test_network_ops/test_matmul.py | 0 test/test_npu/test_network_ops/test_max.py | 0 test/test_npu/test_network_ops/test_min.py | 0 test/test_npu/test_network_ops/test_mm.py | 0 test/test_npu/test_network_ops/test_muls.py | 0 test/test_npu/test_network_ops/test_neg.py | 0 .../test_npu/test_network_ops/test_nllloss.py | 0 .../test_network_ops/test_not_equal.py | 0 .../test_network_ops/test_npu_giou.py | 133 +++++++++ .../test_npu_giou_backward.py | 86 ++++++ test/test_npu/test_network_ops/test_pow.py | 0 test/test_npu/test_network_ops/test_prod.py | 0 
.../test_network_ops/test_reciprocal.py | 0 test/test_npu/test_network_ops/test_relu.py | 0 .../test_network_ops/test_remainder.py | 0 test/test_npu/test_network_ops/test_rsqrt.py | 0 test/test_npu/test_network_ops/test_rsub.py | 0 test/test_npu/test_network_ops/test_sign.py | 0 .../test_npu/test_network_ops/test_softmax.py | 0 test/test_npu/test_network_ops/test_split.py | 0 test/test_npu/test_network_ops/test_sqrt.py | 0 test/test_npu/test_network_ops/test_stack.py | 0 test/test_npu/test_network_ops/test_sub.py | 0 test/test_npu/test_network_ops/test_sum.py | 0 .../test_upsample_bilinear_backward.py | 0 test/test_npu/test_network_ops/test_where.py | 0 test/test_npu/test_network_ops/test_zero.py | 0 test/test_npu/test_network_ops/test_zeros.py | 0 .../test_network_ops/test_zeroslike.py | 0 test/test_npu/test_network_ops/util_test.py | 0 .../torch.onnx/eval/onnx/cp_onnx_eval.py | 0 .../torch.onnx/eval/onnxrt/onnxrt_eval.py | 0 .../test_onnx/torch.onnx/export/cp_parser.py | 0 .../torch.onnx/export/export_onnx.py | 0 .../torch.onnx/export/model_export-cpu.py | 0 .../torch.onnx/export/model_export-gpu.py | 0 .../torch.onnx/export/model_export-npu.py | 0 .../torch.onnx/export/model_export.py | 0 .../torch.onnx/export/onnx_parser.py | 0 test/test_npu/test_onnx/torch.onnx/main.py | 0 test/test_npu/test_reflection_pad2d.py | 238 --------------- 105 files changed, 807 insertions(+), 826 deletions(-) mode change 100644 => 100755 src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp create mode 100644 src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp create mode 100644 src/aten/src/ATen/native/npu/GiouKernelNpu.cpp delete mode 100644 src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp mode change 100644 => 100755 src/aten/src/ATen/native/npu/ReplicationPad2dKernelNpu.cpp delete mode 100644 test/test_npu/test_constant_pad_nd.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_abs.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_add.py 
mode change 100644 => 100755 test/test_npu/test_network_ops/test_addmm.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_all.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_any.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_arange.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_argmax.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_avg_pool2d_backward.py rename test/test_npu/{ => test_network_ops}/test_bilinear.py (74%) mode change 100644 => 100755 test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits_backward.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_bmm.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_broadcastToD.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_cat.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_clamp.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_conv2d.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_div.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_dropout.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_embedding_backward.py rename test/test_npu/{ => test_network_ops}/test_embedding_renorm.py (97%) mode change 100644 => 100755 test/test_npu/test_network_ops/test_exp.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_fill_.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_floor.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_fmod.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_full.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_ge.py rename test/test_npu/{ => test_network_ops}/test_gelu_backward.py (77%) mode change 100644 => 100755 test/test_npu/test_network_ops/test_gt.py mode 
change 100644 => 100755 test/test_npu/test_network_ops/test_hardtanh.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_index_put.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_le.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_leaky_relu_backward.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_log.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_log2.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_log_softmax.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_log_softmax_backward.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_lt.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_matmul.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_max.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_min.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_mm.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_muls.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_neg.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_nllloss.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_not_equal.py create mode 100644 test/test_npu/test_network_ops/test_npu_giou.py create mode 100644 test/test_npu/test_network_ops/test_npu_giou_backward.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_pow.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_prod.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_reciprocal.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_relu.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_remainder.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_rsqrt.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_rsub.py mode change 100644 => 100755 
test/test_npu/test_network_ops/test_sign.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_softmax.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_split.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_sqrt.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_stack.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_sub.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_sum.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_upsample_bilinear_backward.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_where.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_zero.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_zeros.py mode change 100644 => 100755 test/test_npu/test_network_ops/test_zeroslike.py mode change 100644 => 100755 test/test_npu/test_network_ops/util_test.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/eval/onnx/cp_onnx_eval.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/eval/onnxrt/onnxrt_eval.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/cp_parser.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/export_onnx.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/model_export.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/export/onnx_parser.py mode change 100644 => 100755 test/test_npu/test_onnx/torch.onnx/main.py delete mode 100644 test/test_npu/test_reflection_pad2d.py diff --git a/patch/npu.patch b/patch/npu.patch index 14c9a2b015..b16fb85396 100644 --- a/patch/npu.patch +++ 
b/patch/npu.patch @@ -1,6 +1,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt --- pytorch-v1.5.0/aten/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/CMakeLists.txt 2021-07-13 15:30:57.594267657 +0800 ++++ pytorch-develop/aten/CMakeLists.txt 2021-07-16 18:19:46.298791052 +0800 @@ -22,8 +22,10 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -51,7 +51,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-13 15:30:57.594267657 +0800 ++++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-16 18:19:46.298791052 +0800 @@ -67,6 +67,9 @@ FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h") FILE(GLOB native_cpu_h "native/cpu/*.h") @@ -129,7 +129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-13 15:30:57.602267943 +0800 ++++ 
pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-16 18:19:46.306791339 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -170,7 +170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-13 15:30:57.610268230 +0800 ++++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-16 18:19:46.314791625 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -354,7 +354,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for option in declaration['options']: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py --- pytorch-v1.5.0/aten/src/ATen/gen.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/gen.py 2021-07-13 15:30:57.610268230 +0800 ++++ pytorch-develop/aten/src/ATen/gen.py 2021-07-16 18:19:46.314791625 +0800 @@ -1,3 +1,18 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -512,7 +512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= generate_outputs() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-13 15:30:57.622268661 +0800 ++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-16 18:19:46.326792056 +0800 @@ -339,20 +339,20 @@ void hardsigmoid_backward_kernel(TensorIterator& iter) { @@ -540,7 +540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-13 15:30:57.614268374 +0800 ++++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-16 18:19:46.318791769 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -595,7 +595,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= detail::computeStorageSize(self.sizes(), self.strides()), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-13 15:30:57.634269091 +0800 ++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-16 18:19:46.342792630 +0800 @@ -1,6 +1,5 @@ # See README.md in this directory for more guidance @@ -5916,24 +5916,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6118,12 +7584,16 @@ - dispatch: - CPU: reflection_pad2d_out_cpu - CUDA: reflection_pad2d_out_cuda -+ npu_dispatch: -+ NPU: reflection_pad2d_out_npu - - - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor - python_module: nn - dispatch: - CPU: reflection_pad2d_cpu - CUDA: reflection_pad2d_cuda -+ npu_dispatch: -+ NPU: reflection_pad2d_npu - - - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn -@@ -6166,12 +7636,16 @@ +@@ -6166,12 +7632,16 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda @@ -5950,7 +5933,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn -@@ -6214,12 +7688,16 @@ +@@ -6214,12 +7684,16 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda @@ -5967,7 +5950,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6232,12 +7710,16 @@ +@@ -6232,12 +7706,16 @@ dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda @@ -5984,7 +5967,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6245,96 +7727,128 @@ +@@ -6245,96 +7723,128 @@ CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda QuantizedCPU: quantized_upsample_bilinear2d_cpu @@ -6113,7 +6096,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6342,24 +7856,32 @@ +@@ -6342,24 +7852,32 @@ CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda QuantizedCPU: quantized_upsample_nearest2d_cpu @@ -6146,7 +6129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6367,38 +7889,52 @@ +@@ -6367,38 +7885,52 @@ CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda QuantizedCPU: quantized_upsample_nearest3d_cpu @@ -6199,7 +6182,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # What's a thnn_conv_ versus a slow_conv_? 
# -@@ -6423,24 +7959,32 @@ +@@ -6423,24 +7955,32 @@ dispatch: CPU: slow_conv_transpose2d_out_cpu CUDA: slow_conv_transpose2d_out_cuda @@ -6232,7 +6215,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6468,21 +8012,29 @@ +@@ -6468,21 +8008,29 @@ - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -6262,7 +6245,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn -@@ -6495,32 +8047,46 @@ +@@ -6495,32 +8043,46 @@ dispatch: CPU: slow_conv2d_backward_cpu CUDA: legacy::cuda::_thnn_conv2d_backward @@ -6309,7 +6292,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6553,12 +8119,16 @@ +@@ -6553,12 +8115,16 @@ dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda @@ -6326,7 +6309,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? 
bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor python_module: nn -@@ -6577,57 +8147,405 @@ +@@ -6577,57 +8143,413 @@ dispatch: CPU: col2im_out_cpu CUDA: col2im_out_cuda @@ -6732,10 +6715,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- func: npu_bert_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + npu_dispatch_only: + NPU: bert_apply_adam_npu ++ ++- func: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor ++ npu_dispatch_only: ++ NPU: giou_npu ++ ++- func: npu_giou_backward(Tensor grad, Tensor bboxes, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> (Tensor, Tensor) ++ npu_dispatch_only: ++ NPU: giou_backward_npu \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-13 15:30:57.674270525 +0800 ++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-16 18:19:46.378793920 +0800 @@ -659,14 +659,14 @@ SUB x1, x1, 4 @@ -6761,7 +6752,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CMP x1, 2 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur 
pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-13 15:30:57.618268517 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-16 18:19:46.322791912 +0800 @@ -64,7 +64,7 @@ Tensor isinf(const Tensor &self) { @@ -6773,7 +6764,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-13 15:30:57.618268517 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-16 18:19:46.326792056 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6818,7 +6809,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-13 15:30:57.618268517 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-16 18:19:46.326792056 +0800 @@ -87,6 +87,7 @@ if (self.is_contiguous(memory_format)) { return self; @@ -6829,7 +6820,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= "preserve memory format is unsupported by the contiguous operator"); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-13 15:30:57.622268661 +0800 ++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-16 18:19:46.326792056 +0800 @@ -26,7 +26,7 @@ const scalar_t* in = &idata[output_y * input_width + output_x]; scalar_t* out = &odata[output_y * output_width + output_x]; @@ -6841,7 +6832,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= out += output_width * output_height; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py --- 
pytorch-v1.5.0/aten/src/ATen/native_parse.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-13 15:30:57.686270955 +0800 ++++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-16 18:19:46.394794494 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -6879,7 +6870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= msg = '''Exception raised in processing function: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-13 15:30:57.690271099 +0800 ++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-16 18:19:46.394794494 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -6911,7 +6902,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-13 15:30:57.690271099 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-16 18:19:46.394794494 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6944,7 +6935,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-13 15:30:57.690271099 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-16 18:19:46.394794494 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6978,7 +6969,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-13 15:30:57.694271242 +0800 ++++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-16 18:19:46.394794494 +0800 @@ -48,6 +48,11 @@ ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) @@ -6993,7 +6984,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-13 15:30:57.694271242 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-16 18:19:46.398794637 
+0800 @@ -1,9 +1,32 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7102,7 +7093,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-13 15:30:57.694271242 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-16 18:19:46.398794637 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7141,7 +7132,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt --- pytorch-v1.5.0/c10/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/CMakeLists.txt 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/CMakeLists.txt 2021-07-16 18:19:46.410795068 +0800 @@ -63,6 +63,14 @@ message(STATUS "don't use NUMA") endif() @@ -7170,7 +7161,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # not checked in diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h --- pytorch-v1.5.0/c10/core/Backend.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Backend.h 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/Backend.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ 
+// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7265,7 +7256,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp --- pytorch-v1.5.0/c10/core/Device.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.cpp 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/Device.cpp 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7305,7 +7296,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= types.begin(), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h --- pytorch-v1.5.0/c10/core/Device.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.h 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/Device.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7340,7 +7331,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return type_ == DeviceType::CPU; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp --- pytorch-v1.5.0/c10/core/DeviceType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7380,7 +7371,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return false; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h --- pytorch-v1.5.0/c10/core/DeviceType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.h 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/DeviceType.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7423,7 +7414,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= constexpr DeviceType kXLA = DeviceType::XLA; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp --- pytorch-v1.5.0/c10/core/DispatchKey.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7455,7 +7446,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= case DispatchKey::TESTING_ONLY_GenericModeTensorId: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h --- pytorch-v1.5.0/c10/core/DispatchKey.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.h 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/DispatchKey.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7487,7 +7478,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h --- pytorch-v1.5.0/c10/core/Storage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Storage.h 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/Storage.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7521,7 +7512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h --- pytorch-v1.5.0/c10/core/StorageImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/StorageImpl.h 2021-07-13 15:30:57.706271672 +0800 ++++ pytorch-develop/c10/core/StorageImpl.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,12 +1,39 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7578,7 +7569,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h --- pytorch-v1.5.0/c10/core/TensorImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorImpl.h 2021-07-13 15:30:57.710271816 +0800 ++++ pytorch-develop/c10/core/TensorImpl.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7648,7 +7639,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h --- pytorch-v1.5.0/c10/core/TensorOptions.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorOptions.h 2021-07-13 15:30:57.710271816 +0800 ++++ pytorch-develop/c10/core/TensorOptions.h 2021-07-16 18:19:46.410795068 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7689,7 +7680,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h --- pytorch-v1.5.0/c10/macros/Export.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/macros/Export.h 2021-07-13 15:30:57.710271816 +0800 ++++ pytorch-develop/c10/macros/Export.h 2021-07-16 18:19:46.414795211 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7816,7 +7807,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt --- pytorch-v1.5.0/caffe2/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-13 15:30:57.718272102 +0800 ++++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-16 18:19:46.422795498 +0800 @@ -32,6 +32,7 @@ # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) @@ -7963,7 +7954,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs. diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format --- pytorch-v1.5.0/.clang-format 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.clang-format 2021-07-13 15:30:57.586267370 +0800 ++++ pytorch-develop/.clang-format 2021-07-16 18:19:46.294790909 +0800 @@ -84,5 +84,4 @@ SpacesInSquareBrackets: false Standard: Cpp11 @@ -7974,7 +7965,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake --- pytorch-v1.5.0/cmake/BuildVariables.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-13 15:30:57.830276118 +0800 ++++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-16 18:19:46.530799370 +0800 @@ -11,6 +11,7 @@ # CMakeLists.txt files under each folder respectively. 
set(Caffe2_CPU_SRCS) @@ -8001,7 +7992,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # symbols. However, if the lib is whole linked in caffe2 lib, we don't want diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake --- pytorch-v1.5.0/cmake/Codegen.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Codegen.cmake 2021-07-13 15:30:57.830276118 +0800 ++++ pytorch-develop/cmake/Codegen.cmake 2021-07-16 18:19:46.530799370 +0800 @@ -191,13 +191,14 @@ file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp) file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp) @@ -8032,7 +8023,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake --- pytorch-v1.5.0/cmake/Dependencies.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Dependencies.cmake 2021-07-13 15:30:57.830276118 +0800 ++++ pytorch-develop/cmake/Dependencies.cmake 2021-07-16 18:19:46.534799514 +0800 @@ -1509,6 +1509,13 @@ ENDIF(NOT C_HAS_THREAD) endif() @@ -8049,7 +8040,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake --- pytorch-v1.5.0/cmake/Summary.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Summary.cmake 2021-07-13 15:30:57.830276118 +0800 ++++ pytorch-develop/cmake/Summary.cmake 2021-07-16 18:19:46.534799514 +0800 @@ 
-134,6 +134,7 @@ if(NOT "${SELECTED_OP_LIST}" STREQUAL "") message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") @@ -8060,7 +8051,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endfunction() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-13 15:30:57.830276118 +0800 ++++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-16 18:19:46.534799514 +0800 @@ -112,6 +112,11 @@ list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() @@ -8075,7 +8066,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt --- pytorch-v1.5.0/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/CMakeLists.txt 2021-07-13 15:30:57.590267513 +0800 ++++ pytorch-develop/CMakeLists.txt 2021-07-16 18:19:46.294790909 +0800 @@ -205,6 +205,10 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." 
ON) @@ -8142,7 +8133,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore --- pytorch-v1.5.0/.dockerignore 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.dockerignore 2021-07-13 15:30:57.586267370 +0800 ++++ pytorch-develop/.dockerignore 2021-07-16 18:19:46.294790909 +0800 @@ -1,257 +1 @@ -# READ THIS BEFORE YOU REFACTOR ME -# @@ -8405,7 +8396,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat --- pytorch-v1.5.0/docs/make.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/docs/make.bat 2021-07-13 15:30:57.834276262 +0800 ++++ pytorch-develop/docs/make.bat 2021-07-16 18:19:46.538799657 +0800 @@ -1,36 +1,36 @@ -@ECHO OFF - @@ -8494,7 +8485,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt --- pytorch-v1.5.0/requirements.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/requirements.txt 2021-07-13 15:30:57.850276836 +0800 ++++ pytorch-develop/requirements.txt 2021-07-16 18:19:46.554800231 +0800 @@ -4,4 +4,12 @@ requests setuptools @@ -8513,7 +8504,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat --- pytorch-v1.5.0/scripts/appveyor/install.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install.bat 2021-07-13 15:30:57.850276836 +0800 ++++ pytorch-develop/scripts/appveyor/install.bat 2021-07-16 18:19:46.554800231 +0800 @@ -1,10 +1,10 @@ -:: Installation scripts for appveyor. - @@ -8537,7 +8528,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +conda install -y numpy diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-13 15:30:57.850276836 +0800 ++++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-16 18:19:46.554800231 +0800 @@ -1,22 +1,22 @@ -@echo on - @@ -8585,7 +8576,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +nvcc -V || exit /b diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat --- pytorch-v1.5.0/scripts/build_windows.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/build_windows.bat 2021-07-13 15:30:57.850276836 +0800 ++++ pytorch-develop/scripts/build_windows.bat 2021-07-16 18:19:46.554800231 +0800 @@ -1,84 +1,84 @@ -:: ############################################################################# -:: Example command to build on Windows. 
@@ -8757,7 +8748,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +exit /b 1 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1 --- pytorch-v1.5.0/scripts/proto.ps1 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/proto.ps1 2021-07-13 15:30:57.850276836 +0800 ++++ pytorch-develop/scripts/proto.ps1 2021-07-16 18:19:46.554800231 +0800 @@ -1,17 +1,17 @@ -param( - [string]$protoc, @@ -8795,7 +8786,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +Invoke-Expression $cmd diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py --- pytorch-v1.5.0/setup.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/setup.py 2021-07-13 15:30:57.850276836 +0800 ++++ pytorch-develop/setup.py 2021-07-16 18:19:46.554800231 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -8894,7 +8885,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'python/serialized_test/data/operator_test/*.zip', diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml --- pytorch-v1.5.0/tools/autograd/derivatives.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-16 18:19:47.710841680 +0800 @@ -107,6 +107,10 @@ # # NB: The parameter names here MUST be consistent with the parameter names @@ -8951,7 +8942,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The above backward definitions are equivalent to the definitions below. Why do we bundle # everything up? It's because it's more convenient to define double backwards # when there is a single function that manages everything. -@@ -1630,3 +1643,52 @@ +@@ -1630,3 +1643,55 @@ - name: nonzero(Tensor self) -> Tensor output_differentiability: [False] @@ -9004,11 +8995,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- name: npu_linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor + input, weight: npu_linear_backward(grad, input, weight) + bias: maybe_multiply(grad, 1) ++ ++- name: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor ++ self, gtboxes: npu_giou_backward(grad, self, gtboxes, trans, is_cross, mode) \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py --- pytorch-v1.5.0/tools/autograd/dump_utils.py 1970-01-01 08:00:00.000000000 +0800 -+++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-13 15:30:58.990317711 +0800 -@@ -0,0 +1,114 @@ ++++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-16 18:19:47.710841680 +0800 +@@ -0,0 +1,115 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# All rights reserved. +# @@ -9121,11 +9115,12 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + "pin_memory", + "to_device", + "numpy_T", -+ "slice_Tensor" ++ "slice_Tensor", ++ "select_int" +] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-16 18:19:47.710841680 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9311,7 +9306,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-16 18:19:47.710841680 +0800 @@ -1,3 +1,20 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9353,7 +9348,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'value': argname, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-16 18:19:47.714841823 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9526,7 +9521,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-16 18:19:47.714841823 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9606,7 +9601,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto sparse = sparse_.coalesce(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-16 18:19:47.714841823 +0800 @@ -22,7 +22,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/structseq.h" @@ -9690,7 +9685,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp --- 
pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-16 18:19:47.714841823 +0800 @@ -15,7 +15,13 @@ #include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/Event.h" @@ -9777,7 +9772,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp pytorch-develop/tools/autograd/templates/VariableType.cpp --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-16 18:19:47.714841823 +0800 @@ -1,7 +1,27 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9808,7 +9803,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-13 15:30:58.990317711 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-16 18:19:47.714841823 +0800 @@ -1,3 +1,20 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9840,7 +9835,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= const at::Tensor & unpack(const Tensor & t, const char * name, int pos); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl --- pytorch-v1.5.0/tools/build_variables.bzl 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/build_variables.bzl 2021-07-13 15:30:58.994317854 +0800 ++++ pytorch-develop/tools/build_variables.bzl 2021-07-16 18:19:47.714841823 +0800 @@ -46,6 +46,7 @@ "torch/csrc/autograd/functions/utils.cpp", "torch/csrc/autograd/input_buffer.cpp", @@ -9926,7 +9921,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py --- pytorch-v1.5.0/torch/autograd/profiler.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/autograd/profiler.py 2021-07-13 15:30:58.998317998 +0800 ++++ pytorch-develop/torch/autograd/profiler.py 2021-07-16 18:19:47.718841966 +0800 @@ -1,8 +1,25 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -10399,7 +10394,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return ''.join(result) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt --- pytorch-v1.5.0/torch/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/CMakeLists.txt 2021-07-13 15:30:58.994317854 +0800 ++++ pytorch-develop/torch/CMakeLists.txt 2021-07-16 18:19:47.714841823 +0800 @@ -97,6 +97,7 @@ ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp ${TORCH_SRC_DIR}/csrc/utils.cpp @@ -10431,7 +10426,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10554,7 +10549,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto event = c10::Event{c10::DeviceType::CUDA}; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10586,7 +10581,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= /*non_blocking=*/false, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10629,7 +10624,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= m.def("_enable_profiler", enableProfiler); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10681,7 +10676,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto& old_var = buffer[pos]; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10877,7 +10872,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CUDAStubs::~CUDAStubs() = default; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11002,7 +10997,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11056,7 +11051,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11097,7 +11092,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-13 15:30:59.010318428 +0800 ++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-16 18:19:47.730842396 +0800 @@ -168,6 +168,45 @@ return r.release(); } @@ -11146,7 +11141,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!r) throw python_error(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp --- 
pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-13 15:30:59.006318284 +0800 ++++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-16 18:19:47.730842396 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11180,7 +11175,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!t.defined()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-13 15:30:59.014318571 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-16 18:19:47.734842540 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11286,7 +11281,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= while (!in_flight.empty()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-13 15:30:59.014318571 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-16 18:19:47.734842540 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11343,7 +11338,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-13 15:30:59.014318571 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-16 18:19:47.734842540 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11468,7 +11463,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-13 15:30:58.998317998 +0800 ++++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-16 18:19:47.718841966 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11517,7 +11512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return it->second; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp --- pytorch-v1.5.0/torch/csrc/Generator.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-13 15:30:58.998317998 +0800 ++++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-16 18:19:47.722842110 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11585,7 +11580,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #endif diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-13 15:30:59.018318714 +0800 ++++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-16 18:19:47.738842684 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11685,7 +11680,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-13 15:30:59.018318714 +0800 ++++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-16 18:19:47.738842684 +0800 @@ -1,7 +1,25 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11764,7 +11759,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for (Py_ssize_t i = 0; i < length; i++) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-13 15:30:59.018318714 +0800 ++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-16 18:19:47.738842684 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11812,7 +11807,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp --- pytorch-v1.5.0/torch/csrc/Module.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Module.cpp 2021-07-13 15:30:58.998317998 +0800 ++++ pytorch-develop/torch/csrc/Module.cpp 2021-07-16 18:19:47.722842110 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11956,7 +11951,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-16 18:19:47.758843401 +0800 @@ -1,18 +1,35 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12333,7 +12328,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +} // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp --- pytorch-v1.5.0/torch/csrc/utils/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-16 18:19:47.758843401 +0800 @@ -1,6 +1,10 @@ #include #include @@ -12421,7 +12416,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h --- pytorch-v1.5.0/torch/csrc/utils/init.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.h 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/utils/init.h 2021-07-16 18:19:47.758843401 +0800 @@ -8,4 +8,7 @@ void initThroughputBenchmarkBindings(PyObject* module); @@ -12432,7 +12427,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-16 18:19:47.758843401 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., 
Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12467,7 +12462,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return at::Device(device_str); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-16 18:19:47.762843544 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12498,7 +12493,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-16 18:19:47.762843544 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12634,7 +12629,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else if(expected_layout == c10::kSparse) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-13 15:30:59.038319432 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-16 18:19:47.762843544 +0800 @@ -1,58 +1,91 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12847,7 +12842,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def get_rng_state(): ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-13 15:30:59.042319575 +0800 ++++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-16 18:19:47.762843544 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12928,7 +12923,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py --- pytorch-v1.5.0/torch/distributions/von_mises.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributions/von_mises.py 2021-07-13 15:30:59.042319575 +0800 ++++ pytorch-develop/torch/distributions/von_mises.py 2021-07-16 18:19:47.766843687 +0800 @@ -1,140 +1,140 @@ -from __future__ import absolute_import, division, print_function - @@ -13212,7 +13207,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + _log_modified_bessel_fn(self.concentration, order=0)).exp() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py --- pytorch-v1.5.0/torch/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/__init__.py 2021-07-13 15:30:58.994317854 +0800 ++++ pytorch-develop/torch/__init__.py 2021-07-16 18:19:47.714841823 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13255,7 +13250,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-13 15:30:59.046319718 +0800 ++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-16 18:19:47.766843687 +0800 @@ -28,6 +28,10 @@ option(USE_C10D_NCCL "USE C10D NCCL" ON) endif() @@ -13308,7 +13303,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= copy_header(ProcessGroupMPI.hpp) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-13 15:30:59.046319718 +0800 ++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-16 18:19:47.770843831 +0800 @@ -37,8 +37,11 @@ SET_TARGET_PROPERTIES(shm PROPERTIES PREFIX "lib" @@ -13365,7 +13360,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -_maybe_indices_t = _scalar_or_tuple_2_t[Tensor] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py --- pytorch-v1.5.0/torch/nn/functional.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/functional.py 2021-07-13 15:30:59.050319862 +0800 ++++ pytorch-develop/torch/nn/functional.py 2021-07-16 18:19:47.770843831 +0800 
@@ -1611,7 +1611,7 @@ else: output = input.matmul(weight.t()) @@ -13388,7 +13383,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -from . import parallel as parallel diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-13 15:30:59.050319862 +0800 ++++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-16 18:19:47.770843831 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13420,7 +13415,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= self.register_parameter('running_var', None) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py --- pytorch-v1.5.0/torch/nn/modules/module.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/module.py 2021-07-13 15:30:59.050319862 +0800 ++++ pytorch-develop/torch/nn/modules/module.py 2021-07-16 18:19:47.774843974 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13563,7 +13558,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py --- pytorch-v1.5.0/torch/nn/modules/normalization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-13 15:30:59.050319862 +0800 ++++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-16 18:19:47.774843974 +0800 @@ -128,13 +128,14 @@ """ __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] @@ -13596,7 +13591,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return '{normalized_shape}, eps={eps}, ' \ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-13 15:30:59.054320005 +0800 ++++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-16 18:19:47.774843974 +0800 @@ -1,60 +1,60 @@ -from ..init import xavier_uniform_ -from .activation import MultiheadAttention @@ -13756,7 +13751,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - module_kwargs: Optional[Any] = ...) -> Tensor: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py --- pytorch-v1.5.0/torch/nn/parallel/distributed.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-13 15:30:59.054320005 +0800 ++++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-16 18:19:47.774843974 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14107,7 +14102,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-13 15:30:59.054320005 +0800 ++++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-16 18:19:47.778844118 +0800 @@ -1621,14 +1621,23 @@ slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals] return g.op('Concat', *slices, axis_i=0) @@ -14185,7 +14180,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=..., eps: float=...) -> None: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py --- pytorch-v1.5.0/torch/optim/adamax.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/optim/adamax.py 2021-07-13 15:30:59.058320149 +0800 ++++ pytorch-develop/torch/optim/adamax.py 2021-07-16 18:19:47.778844118 +0800 @@ -80,8 +80,8 @@ exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0) @@ -14362,7 +14357,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py --- pytorch-v1.5.0/torch/serialization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/serialization.py 2021-07-13 15:30:59.058320149 +0800 ++++ pytorch-develop/torch/serialization.py 2021-07-16 18:19:47.778844118 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14446,7 +14441,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def location_tag(storage): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py --- pytorch-v1.5.0/torch/storage.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/storage.py 2021-07-13 15:30:59.058320149 +0800 ++++ pytorch-develop/torch/storage.py 2021-07-16 18:19:47.778844118 +0800 @@ -7,6 +7,7 @@ class _StorageBase(object): @@ -14466,7 +14461,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py --- pytorch-v1.5.0/torch/tensor.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/tensor.py 2021-07-13 15:30:59.058320149 +0800 ++++ pytorch-develop/torch/tensor.py 2021-07-16 18:19:47.778844118 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14528,7 +14523,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def __reversed__(self): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py --- pytorch-v1.5.0/torch/_tensor_str.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_tensor_str.py 2021-07-13 15:30:58.994317854 +0800 ++++ pytorch-develop/torch/_tensor_str.py 2021-07-16 18:19:47.718841966 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14582,7 +14577,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py --- pytorch-v1.5.0/torch/utils/data/dataloader.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-13 15:30:59.062320292 +0800 ++++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-16 18:19:47.782844261 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14791,7 +14786,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-13 15:30:59.062320292 +0800 ++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-16 18:19:47.782844261 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14852,7 +14847,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py --- pytorch-v1.5.0/torch/utils/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/__init__.py 2021-07-13 15:30:59.062320292 +0800 ++++ pytorch-develop/torch/utils/__init__.py 2021-07-16 18:19:47.782844261 +0800 @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals @@ -14863,7 +14858,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def set_module(obj, mod): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py --- pytorch-v1.5.0/torch/_utils.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_utils.py 2021-07-13 15:30:58.998317998 +0800 ++++ pytorch-develop/torch/_utils.py 2021-07-16 18:19:47.718841966 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml index 30c7a8aeb1..74c22e5b3a 100644 --- a/src/aten/src/ATen/native/native_functions.yaml +++ b/src/aten/src/ATen/native/native_functions.yaml @@ -7584,16 +7584,12 @@ dispatch: CPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda - npu_dispatch: - NPU: reflection_pad2d_out_npu - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: reflection_pad2d_cpu CUDA: reflection_pad2d_cuda - npu_dispatch: - NPU: reflection_pad2d_npu - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -8548,4 +8544,12 @@ - func: npu_bert_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor(a!), Tensor(b!), Tensor(c!)) npu_dispatch_only: - NPU: bert_apply_adam_npu \ No newline at end of file + NPU: bert_apply_adam_npu + +- func: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor + npu_dispatch_only: + NPU: giou_npu + +- func: npu_giou_backward(Tensor grad, Tensor bboxes, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> (Tensor, Tensor) + npu_dispatch_only: + NPU: giou_backward_npu \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp old mode 100644 new mode 100755 diff --git a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp index 8c3ac87647..4bc949120d 100644 --- a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp @@ -154,6 +154,20 @@ Tensor& _cat_out_npu(Tensor& result, TensorList tensors, int64_t dim) { } Tensor& cat_out_npu(Tensor& result, 
TensorList tensors, int64_t dim) { + SmallVector inputTensors = cat_dest_tensor_list(tensors); + + int64_t dim_post_expr = 0; + if (inputTensors.size() > 0) { + dim_post_expr = inputTensors[0].dim(); + } + dim = CalcuOpUtil::make_wrap_dim(dim, dim_post_expr); + auto outputSize = cat_npu_output_size(inputTensors, dim); + OpPreparation::CheckOut( + {tensors[0]}, + result, + ACL_FORMAT_ND, + tensors[0].scalar_type(), + outputSize); return at::_cat_out(result, tensors, dim); } diff --git a/src/aten/src/ATen/native/npu/ConstantPadNdKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConstantPadNdKernelNpu.cpp index 442034bec2..e29b20600b 100644 --- a/src/aten/src/ATen/native/npu/ConstantPadNdKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ConstantPadNdKernelNpu.cpp @@ -88,13 +88,21 @@ Tensor constant_pad_nd_npu(const Tensor& self, IntArrayRef pad, Scalar value){ } if (is_backward(pad)) { - TORCH_CHECK(self.dim() == 4, "only support 4D now, but self.dim is",self.dim()); - TORCH_CHECK(pad.size() == 4, "Length of pad must is 4 now, but pad.size() is", pad.size()); + TORCH_CHECK(self.dim() == 4 || self.dim() == 5, + "Only support 4D and 5D now, but self.dim is",self.dim()); + TORCH_CHECK(pad.size() == 4 || pad.size() == 6, + "Length of pad must is 4 or 6 now, but pad.size() is", pad.size()); SmallVector begin_list = {0, 0, -pad[2], -pad[0]}; SmallVector end_list = {self.size(0), self.size(1), self.size(-2) + pad[3], self.size(-1) + pad[1]}; SmallVector strides = {1, 1, 1, 1}; + if (self.dim() == 5) { + begin_list = {0, 0, -pad[4], -pad[2], -pad[0]}; + end_list = {self.size(0), self.size(1), self.size(-3) + pad[5], self.size(-2) + pad[3], self.size(-1) + pad[1]}; + strides = {1, 1, 1, 1, 1}; + } + return at::npu_indexing(self, begin_list, end_list, strides); } @@ -109,4 +117,4 @@ Tensor constant_pad_nd_npu(const Tensor& self, IntArrayRef pad, Scalar value){ } } // namespace native -} // namespace at \ No newline at end of file +} // namespace at diff --git 
a/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp b/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp index 3a22c9157d..517c96b7f2 100644 --- a/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp @@ -12,71 +12,39 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { using namespace at::native::npu; -SmallVector embedding_renorm_npu_input( - const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector embedding_renorm_npu_output( - const SmallVector& outputTensor) { - return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); -} - -SmallVector embedding_renorm_npu_attr( - double max_norm, - double norm_type){ - int64_t dim = 0; - float max_norm_float = (float) max_norm; - float norm_type_float = (float) norm_type; - NPUAttrDesc npuAttrScalarP = NPUAttrDesc("p", norm_type_float); - NPUAttrDesc npuAttrScalarMaxnorm = NPUAttrDesc("maxnorm", max_norm_float); - NPUAttrDesc npuAttrDim = NPUAttrDesc("dim", dim); - SmallVector attrs = {npuAttrScalarP, npuAttrDim, npuAttrScalarMaxnorm}; - return attrs; -} -SmallVector embedding_gather2d_npu_attr() { - NPUAttrDesc npuAttrAxis = NPUAttrDesc("axis", (int64_t)0); - SmallVector attrs = {npuAttrAxis}; - return attrs; -} - -SmallVector embedding_renorm_scatter_update_npu_attr(){ - NPUAttrDesc npuAttrAxis = NPUAttrDesc("use_locking", false); - SmallVector attrs = {npuAttrAxis}; - return attrs; -} - Tensor& embedding_renorm_gather2d_out_npu( Tensor& result, const Tensor& self, - const Tensor& indices - ){ -// execute the NPU operate GatherV2D - auto inputs = embedding_renorm_npu_input({self, indices}); - auto 
outputs = embedding_renorm_npu_output({result}); - auto attrs = embedding_gather2d_npu_attr(); - CalcuOpUtil::execute_npu_operate("GatherV2D", inputs, outputs, attrs); + const Tensor& indices) { + OpCommand cmd; + cmd.Name("GatherV2D") + .Input(self) + .Input(indices) + .Output(result) + .Attr("axis", (int64_t)0) + .Run(); return result; } Tensor& embedding_renorm_execute_out_npu( Tensor& result, const Tensor& self, - double max_norm, - double norm_type){ -//execute the NPU operate Renorm - auto inputs = embedding_renorm_npu_input({self}); - auto outputs = embedding_renorm_npu_output({result}); - auto attrs = embedding_renorm_npu_attr(max_norm, norm_type); - CalcuOpUtil::execute_npu_operate("Renorm", inputs, outputs, attrs); + double max_norm, + double norm_type) { + OpCommand cmd; + cmd.Name("Renorm") + .Input(self) + .Output(result) + .Attr("p", (float)norm_type) + .Attr("dim", (int64_t)0) + .Attr("maxnorm", (float)max_norm) + .Run(); return result; } @@ -85,82 +53,84 @@ Tensor& embedding_renorm_scatter_update_out_npu( Tensor& result, const Tensor& self, const Tensor& indices, - const Tensor& update){ - auto inputs = embedding_renorm_npu_input({self, indices, update}); - auto outputs = embedding_renorm_npu_output({result}); - auto attrs = embedding_renorm_scatter_update_npu_attr(); - CalcuOpUtil::execute_npu_operate("ScatterUpdate", inputs, outputs, attrs); + const Tensor& update) { + OpCommand cmd; + cmd.Name("ScatterUpdate") + .Input(self) + .Input(indices) + .Input(update) + .Output(result) + .Attr("use_locking", false) + .Run(); return result; } - Tensor& embedding_renorm_out_npu( Tensor& result, const Tensor& self, const Tensor& indices, - Tensor& mid_input, - Tensor& mid_output, - double max_norm, + double max_norm, double norm_type){ -// execute the NPU operate GatherV2D,generate new tensor by indices - embedding_renorm_gather2d_out_npu( - mid_input, - self, - indices); -//execute the NPU operate Renorm - embedding_renorm_execute_out_npu( - mid_output, - 
mid_input, - max_norm, - norm_type); -// executing the NPU operator ScatterUpdate - embedding_renorm_scatter_update_out_npu( - result, - self, - indices, - mid_output); + + //get the outSize of GatherV2 , the middle tensor + SmallVector midSize = {indices.size(0), self.size(1)}; + Tensor mid_input = OpPreparation::ApplyTensor(self, midSize); + Tensor mid_output = OpPreparation::ApplyTensor(self, midSize); + + // execute the NPU operate GatherV2D, generate new tensor by indices + embedding_renorm_gather2d_out_npu(mid_input,self,indices); + + //execute the NPU operate Renorm + embedding_renorm_execute_out_npu(mid_output, mid_input, max_norm, norm_type); + + //execute the NPU operate ZerosLike or RangeD, generate new tensor by indices.numel() + Tensor mid_output_copy = mid_output.clone(); + auto num_indices = indices.numel(); + Tensor input_indices; + + // RangeD not support range(0,0) + if (num_indices - 1 == 0) { + input_indices = at::zeros({1}, self.options()).to(at::kLong); + } else { + input_indices = at::range(0, num_indices-1, self.options()).to(at::kLong); + } + + //execute the NPU operate MUL, generate change result + auto num_mid_output = mid_output.numel(); + resize_npu_(mid_output_copy, num_mid_output); + Tensor scalar_out = OpPreparation::ApplyTensor(self, {num_indices, 1}); + embedding_renorm_gather2d_out_npu(scalar_out, mid_output_copy, input_indices); + Tensor out_res = mid_input * scalar_out; + + // executing the NPU operator ScatterUpdate + embedding_renorm_scatter_update_out_npu(result, self, indices, out_res); + return result; } Tensor& embedding_renorm_npu_( Tensor& self, const Tensor& indices, - double max_norm, + double max_norm, double norm_type) { -//check dim and type + //check dim and type auto self_arg = TensorArg(self, "self", 1); auto indices_arg = TensorArg(indices, "indices", 2); checkDim("embedding_renorm_", self_arg, 2); checkScalarType("embedding_renorm_", indices_arg, kLong); -// indices must be int64 in pytorch, but npu can only 
support int32 - auto indices_int32 = indices.to("cpu"); - indices_int32 = indices_int32.to(at::kInt); - indices_int32 = indices_int32.to("npu"); - -//resize indices to 1D + //resize indices to 1D Tensor indices_copy = indices.clone(); auto num_indices = indices.numel(); resize_npu_(indices_copy, num_indices); - - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - -//get the outSize of GatherV2 , the middle tensor - auto midSize = embedding_renorm_mid_npu_output_size(self, indices_copy); - Tensor mid = at::empty_with_format(midSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - Tensor mid1 = at::empty_with_format(midSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - -//inplace operate - if (!NpuUtils::check_match(&self)) { - Tensor contiguousSelf = NpuUtils::format_contiguous(self); - Tensor result = embedding_renorm_out_npu(contiguousSelf, contiguousSelf, indices_copy, mid, mid1, max_norm, norm_type); - NpuUtils::format_fresh_view(self, result); - } else { - embedding_renorm_out_npu(self, self, indices_copy, mid, mid1, max_norm, norm_type); - } + + OpPipeWithDefinedOut pipe; + pipe.CheckMemory({self, indices_copy}, {self}) + .Func([&self, &indices_copy, max_norm, norm_type](Tensor& result){ + embedding_renorm_out_npu(self, self, indices_copy, max_norm, norm_type);}) + .Call(self); + return self; } diff --git a/src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp new file mode 100644 index 0000000000..5672a390df --- /dev/null +++ b/src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp @@ -0,0 +1,73 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +std::tuple +giou_backward_inner_out_npu( + Tensor& dbboxes, + Tensor& dgtboxes, + const Tensor& grad, + const Tensor& bboxes, + const Tensor& gtboxes, + bool trans, + bool is_cross, + int64_t mode){ + string mode_str = mode == 1 ? "iof" : "iou"; + + OpCommand cmd; + cmd.Name("GIoUGrad") + .Input(grad) + .Input(bboxes) + .Input(gtboxes) + .Output(dbboxes) + .Output(dgtboxes) + .Attr("trans", trans) + .Attr("is_cross", is_cross) + .Attr("mode", mode_str) + .Run(); + return std::tie(dbboxes, dgtboxes); +} + +std::tuple +giou_backward_npu( + const Tensor& grad, + const Tensor& bboxes, + const Tensor& gtboxes, + bool trans, + bool is_cross, + int64_t mode){ + TORCH_CHECK(!trans && !is_cross && mode == 0, + "giou backward only support trans==False, ", + "is_cross==False, ", + "mode==0('iou') current version ", + "if you need to back propagation, ", + "please ensure your parameter is correct!"); + // Op need form of [n] grad + Tensor gradCp = at::squeeze(grad, 0); + Tensor dbboxes = OpPreparation::ApplyTensor(bboxes); + Tensor dgtboxes = OpPreparation::ApplyTensor(gtboxes); + + giou_backward_inner_out_npu(dbboxes, dgtboxes, gradCp, bboxes, gtboxes, trans, is_cross, mode); + return std::tie(dbboxes, dgtboxes); +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/GiouKernelNpu.cpp b/src/aten/src/ATen/native/npu/GiouKernelNpu.cpp new file mode 100644 index 0000000000..5360ee39c8 --- /dev/null +++ 
b/src/aten/src/ATen/native/npu/GiouKernelNpu.cpp @@ -0,0 +1,87 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +SmallVector giou_output_size( + const Tensor& self, + const Tensor& gtboxes, + bool is_cross){ + SmallVector output_size; + if(is_cross){ + output_size = {gtboxes.size(0), self.size(0)}; + } else { + output_size = {1, self.size(0)}; + } + return output_size; +} + +Tensor& giou_inner_out_npu( + Tensor& result, + const Tensor& self, + const Tensor& gtboxes, + bool trans, + bool is_cross, + int64_t mode){ + auto output_size = giou_output_size(self, gtboxes, is_cross); + OpPreparation::CheckOut( + {self}, + result, + self, + output_size); + string mode_str = mode == 1 ? 
"iof" : "iou"; + + OpCommand cmd; + cmd.Name("GIoU") + .Input(self) + .Input(gtboxes) + .Output(result) + .Attr("trans", trans) + .Attr("is_cross", is_cross) + .Attr("mode", mode_str) + .Run(); + return result; +} + +Tensor giou_npu( + const Tensor& self, + const Tensor& gtboxes, + bool trans, + bool is_cross, + int64_t mode){ + TORCH_CHECK(!trans && !is_cross && mode == 0, + "giou backward only support trans==False, ", + "is_cross==False, ", + "mode==0('iou') current version ", + "if you need to back propagation, ", + "please ensure your parameter is correct!"); + // Op need form of [n, 4], but pass should be [4, n]; + Tensor selfCp = self.permute({1, 0}); + Tensor gtboxesCp = gtboxes.permute({1, 0}); + auto output_size = giou_output_size(selfCp, gtboxesCp, is_cross); + Tensor result = OpPreparation::ApplyTensor(selfCp, output_size); + + giou_inner_out_npu(result, selfCp, gtboxesCp, trans, is_cross, mode); + result = result.permute({1, 0}); + return result; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp index 9cbbf8f841..6814d60261 100644 --- a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp @@ -26,6 +26,9 @@ Tensor& index_put_nocheck( const TensorList& indices, const Tensor& value, bool accumulate) { + if (value.numel() == 0) { + return result; + } // masks corresponds to indices. 0 indicates undefined tensor. 
SmallVector masks; std::vector allDefinedIndices; diff --git a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp index f45ae27e9b..680ec91179 100644 --- a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp @@ -67,21 +67,32 @@ tuple min_out_npu( } tuple min_npu(const Tensor& self, int64_t dim, bool keepdim) { + Tensor selfCast = self; + if(self.dtype() == ScalarType::Bool){ + selfCast = self.to(ScalarType::Float); + } + SmallVector dims = {dim}; - auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); + auto outputSize = reduce_ops_npu_output_size(selfCast, dims, keepdim); SmallVector indicesSize = outputSize; - auto func = [&self, dim, keepdim](Tensor outputs, Tensor indices) { - min_out_npu_nocheck(outputs, indices, self, dim, keepdim); + auto func = [&selfCast, dim, keepdim](Tensor outputs, Tensor indices) { + min_out_npu_nocheck(outputs, indices, selfCast, dim, keepdim); }; Tensor outputs, indices; OpPipeWithDefinedMultiOut pipe(outputs, indices); - return pipe.ApplyOutputWithSpecailParams<0>(outputSize, self.options(), ACL_FORMAT_ND) - .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_NCHW) - .Call(func) - .ReflushOutputDtype<1>(ScalarType::Long) - .Return(); + std::tie(outputs, indices) = pipe.ApplyOutputWithSpecailParams<0>(outputSize, selfCast.options(), ACL_FORMAT_ND) + .ApplyOutputWithSpecailParams<1>(indicesSize, selfCast.options().dtype(ScalarType::Int), ACL_FORMAT_NCHW) + .Call(func) + .ReflushOutputDtype<1>(ScalarType::Long) + .Return(); + + if(self.dtype() == ScalarType::Bool){ + outputs = outputs.to(ScalarType::Bool); + } + + return std::tie(outputs, indices); } tuple min_out_npu( diff --git a/src/aten/src/ATen/native/npu/MmKernelNpu.cpp b/src/aten/src/ATen/native/npu/MmKernelNpu.cpp index 91af42d2af..28ab0aa981 100644 --- a/src/aten/src/ATen/native/npu/MmKernelNpu.cpp +++ 
b/src/aten/src/ATen/native/npu/MmKernelNpu.cpp @@ -18,8 +18,6 @@ #include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/utils/OpAdapter.h" -#include "ATen/native/npu/common/InnerNpuNativeFunction.h" -#include "ATen/native/npu/frame/StorageDescHelper.h" namespace at { namespace native { @@ -28,7 +26,7 @@ using namespace at::native::npu; // Flexible transpose judgement for view+transpose+Matmul, // i.e., tensors with dim=2 and base_size_.size=3 can also be Matmul directly! bool is_transpose_last_two_dims_flex(const Tensor& tensor) { - if (tensor.dim() != 2) { + if (tensor.dim() < 2 || tensor.dim() > 3) { return false; } int64_t numel = 1; @@ -115,17 +113,10 @@ Tensor mm_npu(const Tensor& self, const Tensor& mat2) { // Matmul cannot directly deal with view+transposed tensor with NZ format, so Transdata is necessary if (self.sizes().size() != self_desc.base_sizes_.size()) { selfFormatCast = OpPreparation::CastBackToOriFormat(self); - // refresh storage desc info [origin shape and storage shape] of reshaped Tensor - if (is_transpose_last_two_dims_flex(selfFormatCast)) { - StorageDescHelper::ReflushDescBySelf(selfFormatCast.transpose(-2, -1)); - } } if (mat2.sizes().size() != mat2_desc.base_sizes_.size()) { mat2FormatCast = OpPreparation::CastBackToOriFormat(mat2); - if (is_transpose_last_two_dims_flex(mat2FormatCast)) { - StorageDescHelper::ReflushDescBySelf(mat2FormatCast.transpose(-2, -1)); - } } // construct the output tensor of the NPU diff --git a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp index 8308e4763a..2f25260240 100644 --- a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp @@ -38,7 +38,7 @@ int64_t calculate_p(optional p) { // norm.dtype_out -Tensor& norm_out_npu( +Tensor& norm_out_npu_nocheck( Tensor& out, const Tensor& self, optional p, @@ -80,11 +80,36 @@ Tensor& norm_out_npu( optional 
p, IntArrayRef dim, bool keepdim) { - norm_out_npu(out, self, p, dim, keepdim, self.scalar_type()); + auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim); + OpPreparation::CheckOut( + {self}, + out, + ACL_FORMAT_ND, + self.scalar_type(), + outputSize); + norm_out_npu_nocheck(out, self, p, dim, keepdim, self.scalar_type()); return out; } +Tensor& norm_out_npu( + Tensor& out, + const Tensor& self, + optional p, + IntArrayRef dim, + bool keepdim, + ScalarType dtype) { + auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim); + OpPreparation::CheckOut( + {self}, + out, + ACL_FORMAT_ND, + self.scalar_type(), + outputSize); + norm_out_npu_nocheck(out, self, p, dim, keepdim, dtype); + + return out; +} // norm.ScalarOpt_dim_dtype Tensor norm_npu( const Tensor& self, @@ -99,7 +124,7 @@ Tensor norm_npu( Tensor out = OpPreparation::ApplyTensorWithSizes(outputSize, self.options().dtype(dtype)); // calculate the output result of the NPU - norm_out_npu(out, self, p, dim, keepdim, dtype); + norm_out_npu_nocheck(out, self, p, dim, keepdim, dtype); return out; } diff --git a/src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp deleted file mode 100644 index c3daebe725..0000000000 --- a/src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2020, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" - -namespace at { -namespace native { -using namespace at::native::npu; -SmallVector reflection_pad2d_npu_input(SmallVector inputs) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputs); -} - -SmallVector reflection_pad2d_npu_output(const SmallVector &outputTensor) { - return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); -} - -SmallVector reflection_pad2d_npu_attr(const Tensor& input, IntArrayRef paddingSize) { - int64_t pad_l = 0; - int64_t pad_r = 0; - int64_t pad_t = 0; - int64_t pad_b = 0; - int64_t pad_zeros = 0; - - TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); - - pad_l = paddingSize[0]; - pad_r = paddingSize[1]; - pad_t = paddingSize[2]; - pad_b = paddingSize[3]; - - SmallVector vectorInt = {}; - SmallVector, SIZE> vectorVectorInt = {}; - SmallVector vectorListInt = {}; - SmallVector paddingsVector = array_to_small_vector(paddingSize); - paddingsVector.resize(input.dim(), 0); - - for (int i = 0; i < paddingsVector.size(); i ++) { - if (i<2) { - vectorInt.emplace_back(pad_zeros); - vectorInt.emplace_back(pad_zeros); - } - else if (i == 2) { - vectorInt.emplace_back(pad_t); - vectorInt.emplace_back(pad_b); - } - else { - vectorInt.emplace_back(pad_l); - vectorInt.emplace_back(pad_r); - } - vectorVectorInt.emplace_back(vectorInt); - vectorInt.clear(); - vectorListInt.emplace_back(IntArrayRef(vectorVectorInt.back())); - } - int64_t constant_values = 0; - // string mode = "constant"; - string mode = "reflect"; - bool padding_contiguous = true; - NPUAttrDesc npuAttrConstantValues = NPUAttrDesc("constant_values", constant_values); - NPUAttrDesc npuAttrMode = NPUAttrDesc("mode", mode); - NPUAttrDesc npuAttrPaddingContiguous = NPUAttrDesc("padding_contiguous", padding_contiguous); - NPUAttrDesc npuAttrPadding = 
NPUAttrDesc("paddings", vectorListInt); - SmallVector attrs = { - npuAttrPadding, - npuAttrConstantValues, - npuAttrMode, - npuAttrPaddingContiguous - }; - return attrs; -} - -Tensor& reflection_pad2d_out_npu_nocheck(Tensor& out, const Tensor& self, IntArrayRef padding) { - //constructs the input and output NPUTensorDesc - auto inputs = reflection_pad2d_npu_input({self}); - auto outputs = reflection_pad2d_npu_output({out}); - - //constructs the attr of the NPUAttrDesc - auto attrs = reflection_pad2d_npu_attr(self, padding); - - //executing the NPU operator - CalcuOpUtil::execute_npu_operate("PadV3D", inputs, outputs, attrs); - - return out; -} - -Tensor& reflection_pad2d_out_npu(Tensor& result, const Tensor& self, IntArrayRef padding){ - //calculate the output size - auto outputSize = reflection_pad2d_npu_output_size(self, padding); - //construct the output tensor of the NPU - result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - OpPreparation::CheckOut( - {self}, - result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - outputSize); - reflection_pad2d_out_npu_nocheck(result, self, padding); - - return result; -} - -Tensor reflection_pad2d_npu(const Tensor& self, IntArrayRef padding) { - //calculate the output size - auto outputSize = reflection_pad2d_npu_output_size(self, padding); - //construct the output tensor of the NPU - Tensor out = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - //calculate the output result of the NPU - reflection_pad2d_out_npu_nocheck(out, self, padding); - - return out; -} -} -} // namespace at::native diff --git a/src/aten/src/ATen/native/npu/ReplicationPad2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/ReplicationPad2dKernelNpu.cpp old mode 100644 new mode 100755 diff --git a/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp b/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp index 2890926250..bf72d425e5 
100644 --- a/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp +++ b/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp @@ -36,7 +36,9 @@ void FormatCastHelper::format_cast_as_base_format(const Tensor& src, aclFormat f AT_ASSERT(FormatHelper::IsBaseFormatType(src), "src format must be base format"); auto& src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - src_desc.storage_sizes_ = FormatHelper::GetSizeOfBaseFormat(src, format); + // due to CANN principle : if the ori format of a tensor is the + // same as the npu format, then its base shape must be same as storage shape + // so we should not change the storage shape when format cast between base format src_desc.origin_format_ = format; src_desc.npu_format_ = format; return; diff --git a/src/aten/src/ATen/native/npu/convolution/ConvolutionKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/ConvolutionKernelNpu.cpp index 46054203a6..f84a0656c0 100644 --- a/src/aten/src/ATen/native/npu/convolution/ConvolutionKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/convolution/ConvolutionKernelNpu.cpp @@ -28,18 +28,6 @@ constexpr int output_channels_dim = 1; constexpr int weight_output_channels_dim = 0; constexpr int weight_input_channels_dim = 1; -bool is_depthwise( - const at::Tensor& input, - const at::Tensor& weight, - int64_t groups, - bool transposed) { - return input.is_npu() && !transposed && input.ndimension() == 4 && - input.size(1) == groups && - groups > 1 && // no point if there is only a single group - weight.size(0) % input.size(1) == - 0; // output channels must be a multiple of input channels -} - inline SmallVector expand_dim_if_needed( IntArrayRef list_param, const char* param_name, @@ -261,17 +249,7 @@ Tensor _convolution_npu( } Tensor output; - if (is_depthwise(input, weight, groups, transposed)) { - auto kernel_size = weight.sizes().slice(2); - output = at::thnn_conv_depthwise2d( - input.contiguous(), - weight, - kernel_size, - bias, - stride, - padding, - dilation); - } else 
if (!transposed) { + if (!transposed) { output = at::npu_convolution( input, weight, bias, stride, padding, dilation, groups); } else { diff --git a/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp b/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp index 018f6e2707..d13bccb4c0 100644 --- a/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp +++ b/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp @@ -59,22 +59,6 @@ std::unordered_map FormatHelper::info = { {ACL_FRACTAL_Z_3D, (FormatInfo){ACL_FRACTAL_Z_3D, ACL_FORMAT_NCDHW, InferShapeOfFZ3D, "FRACTAL_Z_3D", true}}, }; -std::unordered_map> FormatHelper::base_format_convert_info = { - {ACL_FORMAT_ND, { - {ACL_FORMAT_NCHW, InferShapeNDToNCHW}, - {ACL_FORMAT_NCDHW, InferShapeNDToNCDHW}, - } - }, - {ACL_FORMAT_NCHW, { - {ACL_FORMAT_ND, InferShapeNCHWToND}, - } - }, - {ACL_FORMAT_NCDHW, { - {ACL_FORMAT_ND, InferShapeNCDHWToND}, - } - }, -}; - bool FormatHelper::IsPadded(const Tensor* tensor) { auto format = tensor->storage().unsafeGetStorageImpl()->npu_desc_.npu_format_; return IsPadded(format); @@ -136,20 +120,6 @@ FormatShape FormatHelper::GetStorageSizes(NPUStorageDesc desc) { return GetStorageSizes(format, ori_size); } -FormatShape FormatHelper::GetSizeOfBaseFormat(const Tensor& src, aclFormat dst_format) { - auto src_format = GetBaseFormat(src); - auto itr = base_format_convert_info.find(src_format); - if (itr != base_format_convert_info.end()) { - auto next_itr = itr->second.find(dst_format); - if (next_itr != itr->second.end()) { - auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - return next_itr->second(src_desc.storage_sizes_, src_desc.base_sizes_); - } - } - AT_ERROR("unsupport InferShape from ", GetFormatName(src_format), " to ", GetFormatName(dst_format)); - return {}; -} - // namespace { FormatShape InferShapeLessTo4(IntArrayRef dims) { diff --git a/src/aten/src/ATen/native/npu/frame/FormatHelper.h b/src/aten/src/ATen/native/npu/frame/FormatHelper.h index 862ff1b7d3..9f0d1f0242 100644 
--- a/src/aten/src/ATen/native/npu/frame/FormatHelper.h +++ b/src/aten/src/ATen/native/npu/frame/FormatHelper.h @@ -48,7 +48,6 @@ public: static FormatShape GetStorageSizes(aclFormat format, sizeType ori_size); // GetStorageSizes used to calculate the storage sizes of op at npu device at different format. static FormatShape GetStorageSizes(NPUStorageDesc desc); - static FormatShape GetSizeOfBaseFormat(const Tensor& src, aclFormat dst_format); private: static bool IsPadded(aclFormat format); @@ -64,7 +63,6 @@ private: bool isPadded = false; } FormatInfo; static std::unordered_map info; - static std::unordered_map> base_format_convert_info; }; // class FormatHelper // template impl diff --git a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp index 1985cbffbb..46abd15c00 100644 --- a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp +++ b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp @@ -42,9 +42,7 @@ REGISTER_OPTION_HOOK(ACL_OP_COMPILER_CACHE_DIR, [](const std::string& val) { aclSetCompileopt(aclCompileOpt::ACL_OP_COMPILER_CACHE_DIR, val.c_str()); }) REGISTER_OPTION_HOOK(NPU_FUZZY_COMPILE_BLACKLIST, [](const std::string& val) { - if (CheckFuzzyEnable()) { FuzzyCompileBlacklist::GetInstance().RegisterBlacklist(val); - } }) REGISTER_OPTION_INIT_BY_ENV(PROFILING_MODE) diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp index a49aa9b994..412d1fc32b 100644 --- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp +++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp @@ -347,7 +347,18 @@ NPUStatus CalcuOpUtil::CreateAclTensorDescInfo( input[i].tensorDescType == NPUTensorDesc::TensorDescType::TENSOR) { Tensor* aclInput = &input[i].tensor; SmallVector dims; - dims = aclInput->storage().get_npu_desc().base_sizes_; + if (opName == "MatMul") { + auto dims_pre = aclInput->sizes(); + if (attrs[i].boolAttrValue == 1) { + 
dims.push_back(dims_pre[1]); + dims.push_back(dims_pre[0]); + } else if (attrs[i].boolAttrValue == 0) { + dims.push_back(dims_pre[0]); + dims.push_back(dims_pre[1]); + } + } else { + dims = aclInput->storage().get_npu_desc().base_sizes_; + } auto storageDims = aclInput->storage().get_npu_desc().storage_sizes_; int64_t numel = 1; for (int j = 0; j < storageDims.size(); j++) { diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp index 773f25ab30..10672bf113 100644 --- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp +++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp @@ -333,12 +333,6 @@ SmallVector embedding_dense_backward_npu_output_size( return {num_weights, grad_output.size(-1)}; } -SmallVector embedding_renorm_mid_npu_output_size( - const Tensor& self, - const Tensor& indices){ - return {indices.size(0), self.size(1)}; -} - SmallVector equal_npu_output_size(void) { int64_t outputshape = 1; SmallVector outputSize = {outputshape}; diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h index 9290da7ddd..b676141652 100644 --- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h +++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h @@ -182,10 +182,6 @@ SmallVector embedding_dense_backward_npu_output_size( int64_t padding_idx, bool scale_grad_by_freq); -SmallVector embedding_renorm_mid_npu_output_size( - const Tensor& self, - const Tensor& indices); - SmallVector index_npu_output_size( const Tensor& self, TensorList indices); diff --git a/src/aten/src/ATen/utils/DumpUtils.h b/src/aten/src/ATen/utils/DumpUtils.h index 630e9a94f2..f728ed3027 100644 --- a/src/aten/src/ATen/utils/DumpUtils.h +++ b/src/aten/src/ATen/utils/DumpUtils.h @@ -71,6 +71,10 @@ public: void SetValue(const T &value) { value_ = value; } + + void SetName(const string& newName) { + name_ = newName; + } private: 
string name_; T value_; diff --git a/src/aten/src/ATen/utils/LoadUtils.cpp b/src/aten/src/ATen/utils/LoadUtils.cpp index 10ed418f73..f526fb5a22 100644 --- a/src/aten/src/ATen/utils/LoadUtils.cpp +++ b/src/aten/src/ATen/utils/LoadUtils.cpp @@ -89,9 +89,11 @@ namespace at { using stringmap = std::unordered_map; stringmap IrNameMapper = { {"NpuConvolutionBackward", "CudnnConvolutionBackward"}, + {"NativeBatchNormBackward", "CudnnBatchNormBackward"}, }; std::unordered_map IrParamNameMapper = { {"NpuConvolutionBackward", {{"input", "self"},}}, + {"NativeBatchNormBackward", {{"eps", "epsilon"},}}, }; void MaybeMapTensorName(const string& irName, std::vector& tensorDescVec) { @@ -103,6 +105,26 @@ namespace at { } } + template + void MaybeMapValueName(const string& irName, T& value) { + for (auto it = value.begin(); it != value.end(); it++) { + auto valueName = (*it).Name(); + if (IrParamNameMapper[irName].find(valueName) != IrParamNameMapper[irName].end()) { + (*it).SetName(IrParamNameMapper[irName][valueName]); + } + } + } + + template + void MaybeMapScalarName(const string& irName, T& value) { + for (auto it = value.begin(); it != value.end(); it++) { + auto valueName = (*it)->Name(); + if (IrParamNameMapper[irName].find(valueName) != IrParamNameMapper[irName].end()) { + (*it)->SetName(IrParamNameMapper[irName][valueName]); + } + } + } + void MaybeMapName(CommDesc& commDesc, const H5File* file) { std::string h5IRPath = "/" + commDesc.nameIr; if (file->nameExists(h5IRPath)) { @@ -112,6 +134,17 @@ namespace at { auto oriNameIr = commDesc.nameIr; commDesc.nameIr = IrNameMapper[commDesc.nameIr]; MaybeMapTensorName(oriNameIr, commDesc.tensorDescVec); + MaybeMapValueName(oriNameIr, commDesc.int64VecDescVec); + MaybeMapValueName(oriNameIr, commDesc.int64DescVec); + MaybeMapValueName(oriNameIr, commDesc.boolDescVec); + MaybeMapValueName(oriNameIr, commDesc.doubleDescVec); + MaybeMapValueName(oriNameIr, commDesc.optionalDoubleDescVec); + MaybeMapScalarName(oriNameIr, 
commDesc.scalarDescVec); + MaybeMapValueName(oriNameIr, commDesc.optionalInt64DescVec); + MaybeMapScalarName(oriNameIr, commDesc.optionalScalarDescVec); + MaybeMapValueName(oriNameIr, commDesc.scalarTypeDescVec); + MaybeMapValueName(oriNameIr, commDesc.sizePairDescVec); + MaybeMapValueName(oriNameIr, commDesc.longIntArrayDescVec); } } @@ -689,17 +722,23 @@ namespace at { } + void ZeroStrideClear(Tensor& dst, Tensor& src) { + auto strides = dst.strides().vec(); + auto position = std::find(strides.begin(), strides.end(), 0); + if (position != strides.end()) { + dst = dst.select(position - strides.begin(), 0); + src = src.select(position - strides.begin(), 0); + } else { + return; + } + ZeroStrideClear(dst, src); + } + // when the stride of some dim is zero, the tensor may has been "expand", copy should only // process on any axis of that dim // To do: is this kind of copy matches other zero stride cases? void CopyMaybeWithZeroStride(Tensor dst, Tensor src) { - auto strides = dst.strides().vec(); - for (int i = 0; i < strides.size(); i++) { - if (strides[i] == 0) { - dst = dst.select(i, 0); - src = src.select(i, 0); - } - } + ZeroStrideClear(dst, src); dst.copy_(src); } diff --git a/src/tools/autograd/derivatives.yaml b/src/tools/autograd/derivatives.yaml index 1db83b1c5a..ee68e09e8d 100644 --- a/src/tools/autograd/derivatives.yaml +++ b/src/tools/autograd/derivatives.yaml @@ -1691,4 +1691,7 @@ - name: npu_linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor input, weight: npu_linear_backward(grad, input, weight) - bias: maybe_multiply(grad, 1) \ No newline at end of file + bias: maybe_multiply(grad, 1) + +- name: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor + self, gtboxes: npu_giou_backward(grad, self, gtboxes, trans, is_cross, mode) \ No newline at end of file diff --git a/test/test_npu/test_constant_pad_nd.py b/test/test_npu/test_constant_pad_nd.py deleted file mode 100644 index 59d0bbae99..0000000000 --- a/test/test_npu/test_constant_pad_nd.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - - -class TestConstantPadNd(TestCase): - - def op_exec_cpu(self, input1, pad_shape): - output = torch.constant_pad_nd(input1, pad_shape) - output = output.numpy() - - return output - - def op_exec_npu(self, input1, pad_shape): - input1 = input1.to("npu") - output = torch.constant_pad_nd(input1, pad_shape) - output = output.to("cpu") - output = output.numpy() - return output - - def test_constant_pad_nd_shape_format(self, device): - shape_format = [ - [[np.float32, 3, (25, 32, 1, 1)], (1,1)], - [[np.float32, 0, [25, 32, 11, 11]], (2,2,2,2)], - [[np.float32, 0, [25, 3, 22, 22]],(2,2,2,2,20,20)], - [[np.float16, 3, [25, 12, 7, 7]], (20,20,20,20)], - [[np.float16, 0, [25, 3, 22, 22]], (20,20,20,20,5,5,5,5)], - [[np.float16, 4, (2, 3, 3, 3)], (1,1,1,20,5,5,5,5)], - [[np.float16, 4, [100, 20, 7, 7]], (0,0,0,0,0,0,0,0)], - [[np.float16, 0, [2,3,4,5]], (1,0,1,0,1,0,1,0)], - [[np.float16, 4, [2]],(0,1)], - [[np.float16, 0, [20,20]],(0,1,0,2)], - [[np.float16, 0, [20,20,20]],(1,1,1,1) ], - [[np.float16, 3, [1,1,1,1]], (1,1)], - [[np.float16, 3, [1]], (1,1)], - [[np.float16, 0, [50, 24, 56, 56]], (100, 100, 100, 100, 100, 100, 100, 100)], - ] - - for item in shape_format: - input_cpu, input_npu = create_common_tensor(item[0], 1, 1) - pad_shape = item[1] - cpu_output = self.op_exec_cpu(input_cpu, pad_shape) - npu_output = self.op_exec_npu(input_npu, pad_shape) - - - self.assertRtolEqual(cpu_output, npu_output) - - - -instantiate_device_type_tests(TestConstantPadNd, globals(), except_for='cpu') -if __name__ == "__main__": - run_tests() diff --git a/test/test_npu/test_network_ops/test_abs.py b/test/test_npu/test_network_ops/test_abs.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_add.py 
b/test/test_npu/test_network_ops/test_add.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_addmm.py b/test/test_npu/test_network_ops/test_addmm.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_all.py b/test/test_npu/test_network_ops/test_all.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_any.py b/test/test_npu/test_network_ops/test_any.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_arange.py b/test/test_npu/test_network_ops/test_arange.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_argmax.py b/test/test_npu/test_network_ops/test_argmax.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_avg_pool2d_backward.py b/test/test_npu/test_network_ops/test_avg_pool2d_backward.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_bilinear.py b/test/test_npu/test_network_ops/test_bilinear.py similarity index 74% rename from test/test_npu/test_bilinear.py rename to test/test_npu/test_network_ops/test_bilinear.py index dbb919e5a7..4bfdb837f9 100644 --- a/test/test_npu/test_bilinear.py +++ b/test/test_npu/test_network_ops/test_bilinear.py @@ -33,7 +33,7 @@ class test_bilinear(TestCase): outputs = outputs.cpu().detach().numpy() return outputs - def test_add_common_shape_format1(self, device): + def test_bilinear_common_shape_format1(self, device): shape_format = [ [[np.float32, -1, (10,30)], [np.float32, -1, (10, 40)], [np.float32, -1, (5, 30, 40)], [np.float32, -1, (5,)]], @@ -43,12 +43,12 @@ class test_bilinear(TestCase): [[np.float32, -1, (10, 30, 40, 30)], [np.float32, -1, (10, 30, 40, 30)], [np.float32, -1, (30, 30, 30)], [np.float32, -1, (30,)]], - [[np.float32, -1, (100,3)], [np.float32, -1, (1000, 4)], [np.float32, -1, (5, 3, 4)], + [[np.float32, -1, (100,3)], [np.float32, -1, (100, 4)], [np.float32, -1, (5, 3, 4)], [np.float32, -1, (5,)]], 
[[np.float16, -1, (2, 1, 1, 1)], [np.float16, -1, (2, 1, 1, 1)], [np.float16, -1, (5, 1, 1)], [np.float16, -1, (5,)]], [[np.float16, -1, (2, 50)], [np.float16, -1, (2, 50)], [np.float16, -1, (5, 50, 50)], - [np.float16, -1, (2, 4)]], + [np.float16, -1, (5)]], [[np.float16, -1, (2, 3)], [np.float16, -1, (2, 4)], [np.float16, -1, (2, 3, 4)],], [[np.float16, -1, (2, 3)], [np.float16, -1, (2, 4)], [np.float16, -1, (4, 3, 4)], [np.float16, -1, (4,)]], @@ -61,11 +61,19 @@ class test_bilinear(TestCase): if len(item)>3: cpu_input4, npu_input4 = create_common_tensor(item[3], 0, 1) bias = [cpu_input4, npu_input4] - cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0]) + if cpu_input1.dtype == torch.float16: + if bias[0] != None: + cpu_outputs = self.cpu_op_exec( + cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0].float()).astype(np.float16) + else: + cpu_outputs = self.cpu_op_exec( + cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0]).astype(np.float16) + else: + cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0]) npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1]) self.assertRtolEqual(cpu_outputs, npu_outputs) - def test_add_common_shape_format2(self, device): + def test_bilinear_common_shape_format2(self, device): shape_format = [ [[np.int32, -1, (10,30)], [np.int32, -1, (10, 40)], [np.int32, -1, (5, 30, 40)], [np.int32, -1, (5,)]], @@ -87,7 +95,7 @@ class test_bilinear(TestCase): npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1]) self.assertRtolEqual(cpu_outputs, npu_outputs) - def test_add_common_shape_format3(self, device): + def test_bilinear_common_shape_format3(self, device): shape_format = [ [[np.float32, 0, (10,30)], [np.float32, 0, (10, 40)], [np.float32, 0, (5, 30, 40)], [np.float32, 0, (5,)]], @@ -97,12 +105,12 @@ class test_bilinear(TestCase): [[np.float32, 0, (10, 30, 40, 30)], [np.float32, 0, (10, 30, 40, 30)], [np.float32, 0, (30, 
30, 30)], [np.float32, 0, (30,)]], - [[np.float32, 0, (100,3)], [np.float32, 0, (1000, 4)], [np.float32, 0, (5, 3, 4)], + [[np.float32, 0, (100,3)], [np.float32, 0, (100, 4)], [np.float32, 0, (5, 3, 4)], [np.float32, 0, (5,)]], [[np.float16, 0, (2, 1, 1, 1)], [np.float16, 0, (2, 1, 1, 1)], [np.float16, 0, (5, 1, 1)], [np.float16, 0, (5,)]], [[np.float16, 0, (2, 50)], [np.float16, 0, (2, 50)], [np.float16, 0, (5, 50, 50)], - [np.float16, 0, (2, 4)]], + [np.float16, 0, (5)]], [[np.float16, 0, (2, 3)], [np.float16, 0, (2, 4)], [np.float16, 0, (2, 3, 4)],], [[np.float16, 0, (2, 3)], [np.float16, 0, (2, 4)], [np.float16, 0, (4, 3, 4)], [np.float16, 0, (4,)]], @@ -115,11 +123,19 @@ class test_bilinear(TestCase): if len(item)>3: cpu_input4, npu_input4 = create_common_tensor(item[3], 0, 1) bias = [cpu_input4, npu_input4] - cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0]) + if cpu_input1.dtype == torch.float16: + if bias[0] != None: + cpu_outputs = self.cpu_op_exec( + cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0].float()).astype(np.float16) + else: + cpu_outputs = self.cpu_op_exec( + cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0]).astype(np.float16) + else: + cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0]) npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1]) self.assertRtolEqual(cpu_outputs, npu_outputs) - def test_add_common_shape_format4(self, device): + def test_bilinear_common_shape_format4(self, device): shape_format = [ [[np.float32, 3, (10,30)], [np.float32, 3, (10, 40)], [np.float32, 3, (5, 30, 40)], [np.float32, 3, (5,)]], @@ -129,15 +145,15 @@ class test_bilinear(TestCase): [[np.float32, 3, (10, 30, 40, 30)], [np.float32, 3, (10, 30, 40, 30)], [np.float32, 3, (30, 30, 30)], [np.float32, 3, (30,)]], - [[np.float32, 29, (100,3)], [np.float32, 29, (1000, 4)], [np.float32, 29, (5, 3, 4)], - [np.float32, 29, (5,)]], - [[np.float16, 29, (2, 1, 1, 1)], 
[np.float16, 29, (2, 1, 1, 1)], [np.float16, 29, (5, 1, 1)], - [np.float16, 29, (5,)]], - [[np.float16, 29, (2, 50)], [np.float16, 29, (2, 50)], [np.float16, 29, (5, 50, 50)], - [np.float16, 29, (2, 4)]], - [[np.float16, 29, (2, 3)], [np.float16, 29, (2, 4)], [np.float16, 29, (2, 3, 4)],], - [[np.float16, 29, (2, 3)], [np.float16, 29, (2, 4)], [np.float16, 29, (4, 3, 4)], - [np.float16, 29, (4,)]], + [[np.float32, 2, (100,3)], [np.float32, 2, (100, 4)], [np.float32, 2, (5, 3, 4)], + [np.float32, 2, (5,)]], + [[np.float16, 2, (2, 1, 1, 1)], [np.float16, 2, (2, 1, 1, 1)], [np.float16, 2, (5, 1, 1)], + [np.float16, 2, (5,)]], + [[np.float16, 2, (2, 50)], [np.float16, 2, (2, 50)], [np.float16, 2, (5, 50, 50)], + [np.float16, 2, (5)]], + [[np.float16, 2, (2, 3)], [np.float16, 2, (2, 4)], [np.float16, 2, (2, 3, 4)],], + [[np.float16, 2, (2, 3)], [np.float16, 2, (2, 4)], [np.float16, 2, (4, 3, 4)], + [np.float16, 2, (4,)]], ] for item in shape_format: bias = [None, None] @@ -147,11 +163,18 @@ class test_bilinear(TestCase): if len(item)>3: cpu_input4, npu_input4 = create_common_tensor(item[3], 0, 1) bias = [cpu_input4, npu_input4] - cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0]) + if cpu_input1.dtype == torch.float16: + if bias[0] != None: + cpu_outputs = self.cpu_op_exec( + cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0].float()).astype(np.float16) + else: + cpu_outputs = self.cpu_op_exec( + cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0]).astype(np.float16) + else: + cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0]) npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1]) self.assertRtolEqual(cpu_outputs, npu_outputs) instantiate_device_type_tests(test_bilinear, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits_backward.py 
b/test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits_backward.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_bmm.py b/test/test_npu/test_network_ops/test_bmm.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_broadcastToD.py b/test/test_npu/test_network_ops/test_broadcastToD.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_cat.py b/test/test_npu/test_network_ops/test_cat.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_clamp.py b/test/test_npu/test_network_ops/test_clamp.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_constant_pad_nd.py b/test/test_npu/test_network_ops/test_constant_pad_nd.py index 5572e8af6e..06efe9dcf6 100644 --- a/test/test_npu/test_network_ops/test_constant_pad_nd.py +++ b/test/test_npu/test_network_ops/test_constant_pad_nd.py @@ -35,7 +35,6 @@ class TestConstantPadNd(TestCase): def constant_pad_nd_shape_format(self, shape_format): for item in shape_format: - print(item) input_cpu, input_npu = create_common_tensor(item[0], 1, 1) pad_shape = item[1] if input_cpu.dtype == torch.float16: diff --git a/test/test_npu/test_network_ops/test_conv2d.py b/test/test_npu/test_network_ops/test_conv2d.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py b/test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_div.py b/test/test_npu/test_network_ops/test_div.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_dropout.py b/test/test_npu/test_network_ops/test_dropout.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_embedding_backward.py b/test/test_npu/test_network_ops/test_embedding_backward.py old mode 100644 new mode 100755 diff --git 
a/test/test_npu/test_embedding_renorm.py b/test/test_npu/test_network_ops/test_embedding_renorm.py similarity index 97% rename from test/test_npu/test_embedding_renorm.py rename to test/test_npu/test_network_ops/test_embedding_renorm.py index 51f06efe73..2da53426a0 100644 --- a/test/test_npu/test_embedding_renorm.py +++ b/test/test_npu/test_network_ops/test_embedding_renorm.py @@ -26,7 +26,7 @@ class TestEmbeddingRenorm(TestCase): input1 = np.random.uniform(min_d, max_d, shape).astype(dtype) npu_input1 = torch.from_numpy(input1) npu_input2 = torch.LongTensor(np.random.uniform(0,shape[0], int(shape[0]/2,)).astype(np.int32)) - #npu_input2=torch.LongTensor([[0,1,1,0,1],[0,1,1,0,1],[1,0,1,1,2]]) + return npu_input1, npu_input2 def cpu_op_exec(self, input1, input2, max_norm, norm_type): @@ -36,7 +36,6 @@ class TestEmbeddingRenorm(TestCase): output = torch.embedding_renorm_(input1, input2, max_norm=max_norm, norm_type=norm_type) if stype == torch.float16: output = output.half() - output = output.numpy() return output def npu_op_exec(self, input1, input2, max_norm,norm_type): @@ -44,7 +43,6 @@ class TestEmbeddingRenorm(TestCase): input2 = input2.to("npu") output = torch.embedding_renorm_(input1, input2, max_norm=max_norm, norm_type=norm_type) output = output.to("cpu") - output = output.numpy() return output def test_embedding_renorm_float16_2(self, device): @@ -60,7 +58,7 @@ class TestEmbeddingRenorm(TestCase): cpu_input1 = copy.deepcopy(npu_input1) cpu_input2 = copy.deepcopy(npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.2, 0) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.2, 0) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.2, 0) self.assertRtolEqual(cpu_output, npu_output) def test_embedding_renorm_float16_1(self, device): diff --git a/test/test_npu/test_network_ops/test_exp.py b/test/test_npu/test_network_ops/test_exp.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_fill_.py 
b/test/test_npu/test_network_ops/test_fill_.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_floor.py b/test/test_npu/test_network_ops/test_floor.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_fmod.py b/test/test_npu/test_network_ops/test_fmod.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_full.py b/test/test_npu/test_network_ops/test_full.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_ge.py b/test/test_npu/test_network_ops/test_ge.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_gelu_backward.py b/test/test_npu/test_network_ops/test_gelu_backward.py similarity index 77% rename from test/test_npu/test_gelu_backward.py rename to test/test_npu/test_network_ops/test_gelu_backward.py index a21092c621..439e57e28c 100644 --- a/test/test_npu/test_gelu_backward.py +++ b/test/test_npu/test_network_ops/test_gelu_backward.py @@ -33,7 +33,7 @@ class TestGeluBackward(TestCase): z = output.sum() z.backward() res = input1.grad - return res.detach() + return res.detach().numpy() def npu_op_exec(self, input1): input1 = input1.to("npu") @@ -42,44 +42,37 @@ class TestGeluBackward(TestCase): z = output.sum() z.backward() res = input1.grad.to("cpu") - return res.detach() + return res.detach().numpy() def test_gelu_backward_float32_1(self, device): - input1= self.generate_single_data(0, 100, (4,3,1,1), np.float32) + input1= self.generate_single_data(0, 100, (4, 3, 1, 1), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) def test_gelu_backward_float32_2(self, device): - input1= self.generate_single_data(0, 100, (4,3,10), np.float32) + input1= self.generate_single_data(0, 100, (15, 3, 1), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = 
self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) def test_gelu_backward_float32_3(self, device): - input1= self.generate_single_data(0, 100, (400,30,10), np.float32) - cpu_input1 = copy.deepcopy(input1) - cpu_output = self.cpu_op_exec(cpu_input1) - npu_output = self.npu_op_exec(input1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_gelu_backward_float32_4(self, device): - input1= self.generate_single_data(-30, 0, (4,4), np.float32) + input1= self.generate_single_data(0, 100, (4, 4), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) def test_gelu_backward_float16(self, device): - input1 = self.generate_single_data(0, 100, (5, 10, 100) , np.float16) - input1 = input1.to(torch.float32) - cpu_input1 = copy.deepcopy(input1) + input1 = self.generate_single_data(0, 100, (5, 10, 100), np.float16) + cpu_input1 = input1.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1) + cpu_output = cpu_output.astype(np.float16) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestGeluBackward, globals(), except_for="cpu") if __name__ == "__main__": - run_tests() \ No newline at end of file + run_tests() diff --git a/test/test_npu/test_network_ops/test_gt.py b/test/test_npu/test_network_ops/test_gt.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_hardtanh.py b/test/test_npu/test_network_ops/test_hardtanh.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_index_put.py b/test/test_npu/test_network_ops/test_index_put.py old mode 100644 new mode 100755 index 2ab3b47b94..3a01077180 --- a/test/test_npu/test_network_ops/test_index_put.py +++ b/test/test_npu/test_network_ops/test_index_put.py @@ -127,6 +127,16 @@ class TestIndexPut(TestCase): self.case_exec_fp16(shape_format) 
self.case_inp_exec_fp16(shape_format) + def test_index_put_null(self, device): + cpu_input1 = torch.rand(2, 2) + cpu_input2 = torch.rand(2, 2) + cpu_mask_index = torch.tensor([[False, False], [False, False]]) + npu_mask_index = cpu_mask_index.to("npu") + npu_input1 = cpu_input1.to("npu") + npu_input2 = cpu_input2.to("npu") + cpu_input1[cpu_mask_index] = cpu_input2.detach()[cpu_mask_index] + npu_input1[npu_mask_index] = npu_input2.detach()[npu_mask_index] + self.assertEqual(cpu_input1, npu_input1.to("cpu")) instantiate_device_type_tests(TestIndexPut, globals(), except_for="cpu") diff --git a/test/test_npu/test_network_ops/test_le.py b/test/test_npu/test_network_ops/test_le.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_leaky_relu_backward.py b/test/test_npu/test_network_ops/test_leaky_relu_backward.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_log.py b/test/test_npu/test_network_ops/test_log.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_log2.py b/test/test_npu/test_network_ops/test_log2.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_log_softmax.py b/test/test_npu/test_network_ops/test_log_softmax.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_log_softmax_backward.py b/test/test_npu/test_network_ops/test_log_softmax_backward.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_lt.py b/test/test_npu/test_network_ops/test_lt.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_matmul.py b/test/test_npu/test_network_ops/test_matmul.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_max.py b/test/test_npu/test_network_ops/test_max.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_min.py b/test/test_npu/test_network_ops/test_min.py old mode 100644 new mode 
100755 diff --git a/test/test_npu/test_network_ops/test_mm.py b/test/test_npu/test_network_ops/test_mm.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_muls.py b/test/test_npu/test_network_ops/test_muls.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_neg.py b/test/test_npu/test_network_ops/test_neg.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_nllloss.py b/test/test_npu/test_network_ops/test_nllloss.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_not_equal.py b/test/test_npu/test_network_ops/test_not_equal.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_npu_giou.py b/test/test_npu/test_network_ops/test_npu_giou.py new file mode 100644 index 0000000000..c6f55768d0 --- /dev/null +++ b/test/test_npu/test_network_ops/test_npu_giou.py @@ -0,0 +1,133 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import numpy as np +import math +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestNpuGiou(TestCase): + def generate_giou_data(self, n, m, dtype): + data_bboxes = np.array([]).astype(dtype) + for i in range(4): + data_bboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, n).astype(dtype) + data_bboxes = np.append(data_bboxes, data_bboxes_array) + data_bboxes = data_bboxes.reshape([4, n]) + data_gtboxes = np.array([]).astype(dtype) + for i in range(4): + data_gtboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, m).astype(dtype) + data_gtboxes = np.append(data_gtboxes, data_gtboxes_array) + data_gtboxes = data_gtboxes.reshape([4, m]) + cpu_input1 = torch.from_numpy(data_bboxes) + cpu_input2 = torch.from_numpy(data_gtboxes) + npu_input1 = cpu_input1.npu() + npu_input2 = cpu_input2.npu() + return cpu_input1, cpu_input2, npu_input1, npu_input2 + + def cpu_op_exec(self, box1, box2, trans=False, is_cross=False, mode="iou"): + box1 = box1.numpy() + box2 = box2.numpy() + dtype = box1.dtype + _, n = box1.shape + _, m = box2.shape + if trans: + b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2 + b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2 + b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2 + b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2 + else: + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + area1 = w1 * h1 + area2 = w2 * h2 + giou_res =np.array([], dtype=dtype) + + for i in range(n): + for j in range(m): + inter_x1 = max(b1_x1[i], b2_x1[j]) + inter_x2 = min(b1_x2[i], b2_x2[j]) + inter_y1 = max(b1_y1[i], b2_y1[j]) + inter_y2 = min(b1_y2[i], b2_y2[j]) + outer_x1 = min(b1_x1[i], b2_x1[j]) + 
outer_x2 = max(b1_x2[i], b2_x2[j]) + outer_y1 = min(b1_y1[i], b2_y1[j]) + outer_y2 = max(b1_y2[i], b2_y2[j]) + inter_area = max(0, (inter_x2 - inter_x1)) * max(0, (inter_y2 - inter_y1)) + outer_area = abs(outer_x2 - outer_x1) * abs(outer_y2 - outer_y1) + union_area = area1[i] + area2[j] - inter_area + 1e-16 + other_area = outer_area - union_area + giou_ij = inter_area / union_area - other_area / outer_area + if not is_cross: + if i == j: + giou_res = np.append(giou_res, giou_ij) + else: + giou_res = np.append(giou_res, giou_ij) + + if not is_cross: + res = giou_res.reshape(1, n) + else: + res = giou_res.reshape(n, m) + res = np.transpose(res) + res = np.transpose(res) + return res + + def npu_op_exec(self, box1, box2, trans=False, is_cross=False, mode=0): + output = torch.npu_giou(box1, box2, trans, is_cross, mode) + output = output.detach().cpu().numpy() + return output + + def test_npu_giou_shape_format_fp32(self, device): + self._test_npu_giou_shape_format(np.float32) + + def test_npu_giou_shape_format_fp16(self, device): + self._test_npu_giou_shape_format(np.float16) + + def _test_npu_giou_shape_format(self, dtype): + shape_list = [ + [10, 10], + [12, 10], + [100, 100] + ] + is_trans_list = [False] + mode_list = ["iou"] + # TODO(Ascend): 反向只支持 mode=="iof", is_cross==False, + # is_trans==False场景,这里同步验证相同场景 + shape_format = [[j, k, m] + for j in shape_list + for k in is_trans_list + for m in mode_list] + + for item in shape_format: + mode_digit = 0 if item[-1] == "iou" else 1 + is_cross = False if item[0][0] == item[0][1] else True + cpu_input1, cpu_input2, npu_input1, npu_input2 = self.generate_giou_data(*item[0], dtype) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[1], is_cross, item[-1]) + npu_output = self.npu_op_exec(npu_input1, npu_input2, item[1], is_cross, mode_digit) + cpu_output = cpu_output.astype(npu_output.dtype) + if dtype == np.float16: + # TODO(Ascend): fp16 insufficient precision + self.assertRtolEqual(cpu_output, npu_output, 
prec16=1e-2) + else: + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestNpuGiou, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_network_ops/test_npu_giou_backward.py b/test/test_npu/test_network_ops/test_npu_giou_backward.py new file mode 100644 index 0000000000..1cf564d74b --- /dev/null +++ b/test/test_npu/test_network_ops/test_npu_giou_backward.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import numpy as np +import math +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestNpuGiouBackward(TestCase): + def generate_giou_data(self, n, m, dtype): + data_bboxes = np.array([]).astype(dtype) + for i in range(4): + data_bboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, n).astype(dtype) + data_bboxes = np.append(data_bboxes, data_bboxes_array) + data_bboxes = data_bboxes.reshape([4, n]) + data_gtboxes = np.array([]).astype(dtype) + for i in range(4): + data_gtboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, m).astype(dtype) + data_gtboxes = np.append(data_gtboxes, data_gtboxes_array) + data_gtboxes = data_gtboxes.reshape([4, m]) + cpu_input1 = torch.from_numpy(data_bboxes) + cpu_input2 = torch.from_numpy(data_gtboxes) + npu_input1 = cpu_input1.npu() + npu_input2 = cpu_input2.npu() + return cpu_input1, cpu_input2, npu_input1, npu_input2 + + def npu_op_exec(self, box1, box2, trans=False, is_cross=False, mode=0): + box1.requires_grad = True + box2.requires_grad = True + output = torch.npu_giou(box1, box2, trans, is_cross, mode) + output.backward(torch.ones_like(output)) + box1_grad = box1.grad + box2_grad = box2.grad + box1_grad = box1_grad.detach().cpu().numpy() + box2_grad = box2_grad.detach().cpu().numpy() + output = output.detach().cpu().numpy() + return output, box1_grad, box2_grad + + def test_npu_giou_backward_shape_format(self, dtype): + shape_list = [ + [1, 1] + ] + is_trans_list = [False] + mode_list = ["iou"] + # TODO(Ascend): only support mode=="iof", is_cross==False, + # is_trans==False currently + shape_format = [[j, k, m] + for j in shape_list + for k in is_trans_list + for m in mode_list] + + for item in shape_format: + mode_digit = 0 if item[-1] == "iou" else 1 + is_cross = False if item[0][0] == item[0][1] else True + expected_cpu_grad1 = np.array([[0.51091206], + 
[-0.70909655], + [0.3726323], + [0.349545]], dtype=np.float32) + expected_cpu_grad2 = np.array([[-0.51091206], + [0.70909655], + [0.3599837], + [0.47306436]], dtype=np.float32) + _, _, npu_input1, npu_input2 = self.generate_giou_data(*item[0], np.float32) + _, npu_grad1, npu_grad2 = self.npu_op_exec(npu_input1, npu_input2, item[1], is_cross, mode_digit) + self.assertRtolEqual(expected_cpu_grad1, npu_grad1) + self.assertRtolEqual(expected_cpu_grad2, npu_grad2) + + +instantiate_device_type_tests(TestNpuGiouBackward, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_network_ops/test_pow.py b/test/test_npu/test_network_ops/test_pow.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_prod.py b/test/test_npu/test_network_ops/test_prod.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_reciprocal.py b/test/test_npu/test_network_ops/test_reciprocal.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_relu.py b/test/test_npu/test_network_ops/test_relu.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_remainder.py b/test/test_npu/test_network_ops/test_remainder.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_rsqrt.py b/test/test_npu/test_network_ops/test_rsqrt.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_rsub.py b/test/test_npu/test_network_ops/test_rsub.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_sign.py b/test/test_npu/test_network_ops/test_sign.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_softmax.py b/test/test_npu/test_network_ops/test_softmax.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_split.py b/test/test_npu/test_network_ops/test_split.py old mode 100644 new mode 100755 diff --git 
a/test/test_npu/test_network_ops/test_sqrt.py b/test/test_npu/test_network_ops/test_sqrt.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_stack.py b/test/test_npu/test_network_ops/test_stack.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_sub.py b/test/test_npu/test_network_ops/test_sub.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_sum.py b/test/test_npu/test_network_ops/test_sum.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_upsample_bilinear_backward.py b/test/test_npu/test_network_ops/test_upsample_bilinear_backward.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_where.py b/test/test_npu/test_network_ops/test_where.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_zero.py b/test/test_npu/test_network_ops/test_zero.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_zeros.py b/test/test_npu/test_network_ops/test_zeros.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/test_zeroslike.py b/test/test_npu/test_network_ops/test_zeroslike.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_network_ops/util_test.py b/test/test_npu/test_network_ops/util_test.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/eval/onnx/cp_onnx_eval.py b/test/test_npu/test_onnx/torch.onnx/eval/onnx/cp_onnx_eval.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/eval/onnxrt/onnxrt_eval.py b/test/test_npu/test_onnx/torch.onnx/eval/onnxrt/onnxrt_eval.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/cp_parser.py b/test/test_npu/test_onnx/torch.onnx/export/cp_parser.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/export_onnx.py 
b/test/test_npu/test_onnx/torch.onnx/export/export_onnx.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py b/test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py b/test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py b/test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export.py b/test/test_npu/test_onnx/torch.onnx/export/model_export.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/export/onnx_parser.py b/test/test_npu/test_onnx/torch.onnx/export/onnx_parser.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_onnx/torch.onnx/main.py b/test/test_npu/test_onnx/torch.onnx/main.py old mode 100644 new mode 100755 diff --git a/test/test_npu/test_reflection_pad2d.py b/test/test_npu/test_reflection_pad2d.py deleted file mode 100644 index d150c4c955..0000000000 --- a/test/test_npu/test_reflection_pad2d.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import numpy as np -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - - -class TestReflectionPad2d(TestCase): - def cpu_op_out_exec(self, input1, pad, output): - m = torch._C._nn.reflection_pad2d(input1, pad, out=output) - m = m.numpy() - return m - - def npu_op_out_exec(self, input1, pad, output): - m_n = torch._C._nn.reflection_pad2d(input1, pad, out=output) - m_n = m_n.to("cpu") - m_n = m_n.numpy() - return m_n - - def cpu_op_exec(self, input1, pad): - m = torch.nn.ReflectionPad2d(pad) - output = m(input1) - output = output.numpy() - return output - - def npu_op_exec(self, input1, pad): - m = torch.nn.ReflectionPad2d(pad).to("npu") - output = m(input1) - output = output.to("cpu") - output = output.numpy() - return output - - def test_reflectionPad2d_out_shape_format(self, device): - shape_format = [ - [[np.float32, 0, (1, 1, 3, 3)], [2, 2, 2, 2]], - [[np.float32, 3, (1, 1, 4, 3)], 2] - ] - for item in shape_format: - cpuout = torch.randn(1, 1, 3, 3) - npuout = cpuout.npu() - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = self.cpu_op_out_exec(cpu_input1, item[1], cpuout) - npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_out_shape_format_fp16(self, device): - shape_format = [ - [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.float16, 3, (1, 1, 4, 3)], 2] - ] - - def cpu_op_out_exec_fp16(input1, pad, output): - input1 = input1.to(torch.float32) - m = torch._C._nn.reflection_pad2d(input1, pad, out=output) - m = m.numpy() - m = m.astype(np.float16) - return m - - for item in shape_format: - cpuout = torch.randn(1, 1, 3, 3) - npuout = cpuout.npu() - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_out_exec_fp16(cpu_input1, item[1], cpuout) - npu_output = 
self.npu_op_out_exec(npu_input1, item[1], npuout) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_out_shape_format_int8(self, device): - shape_format = [ - [[np.int8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.int8, 0, (1, 1, 5, 3)], 2] - ] - - def cpu_op_out_exec_int8(input1, pad, output): - input1 = input1.to(torch.float32) - m = torch._C._nn.reflection_pad2d(input1, pad, out=output) - m = m.numpy() - m = m.astype(np.int8) - return m - - for item in shape_format: - cpuout = torch.randn(1, 1, 3, 3) - npuout = cpuout.npu() - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_out_exec_int8(cpu_input1, item[1], cpuout) - npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_out_shape_format_uint8(self, device): - shape_format = [ - [[np.uint8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.uint8, 0, (1, 1, 4, 9)], 3] - ] - - def cpu_op_out_exec_uint8(input1, pad, output): - input1 = input1.to(torch.float32) - m = torch._C._nn.reflection_pad2d(input1, pad, out=output) - m = m.numpy() - m = m.astype(np.uint8) - return m - - for item in shape_format: - cpuout = torch.randn(1, 1, 3, 3) - npuout = cpuout.npu() - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_out_exec_uint8(cpu_input1, item[1], cpuout) - npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_out_shape_format_int32(self, device): - shape_format = [ - [[np.int32, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.int32, 0, (1, 1, 4, 9)], 2] - ] - - def cpu_op_out_exec_int32(input1, pad, output): - input1 = input1.to(torch.float32) - m = torch._C._nn.reflection_pad2d(input1, pad, out=output) - m = m.numpy() - m = m.astype(np.int32) - return m - - for item in shape_format: - cpuout = torch.randn(1, 1, 3, 3) - npuout = cpuout.npu() - cpu_input1, npu_input1 = 
create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_out_exec_int32(cpu_input1, item[1], cpuout) - npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_shape_format(self, device): - shape_format = [ - [[np.float32, 0, (1, 1, 3, 3)], [2, 2, 2, 2]], - [[np.float32, 3, (1, 1, 4, 3)], 2] - ] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = self.cpu_op_exec(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_shape_format_fp16(self, device): - shape_format = [ - [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.float16, 3, (1, 1, 4, 3)], 2] - ] - - def cpu_op_exec_fp16(input1, pad): - input1 = input1.to(torch.float32) - m = torch.nn.ReflectionPad2d(pad) - output = m(input1) - output = output.numpy() - output = output.astype(np.float16) - return output - - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_exec_fp16(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_shape_format_int8(self, device): - shape_format = [ - [[np.int8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.int8, 0, (1, 1, 5, 3)], 2] - ] - - def cpu_op_exec_int8(input1, pad): - input1 = input1.to(torch.float32) - m = torch.nn.ReflectionPad2d(pad) - output = m(input1) - output = output.numpy() - output = output.astype(np.int8) - return output - - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_exec_int8(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_shape_format_uint8(self, device): - shape_format = [ - [[np.uint8, 0, (1, 1, 4, 3)], [2, 2, 
2, 2]], - [[np.uint8, 0, (1, 1, 4, 9)], 3] - ] - - def cpu_op_exec_uint8(input1, pad): - input1 = input1.to(torch.float32) - m = torch.nn.ReflectionPad2d(pad) - output = m(input1) - output = output.numpy() - output = output.astype(np.uint8) - return output - - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_exec_uint8(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_reflectionPad2d_shape_format_int32(self, device): - shape_format = [ - [[np.int32, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], - [[np.int32, 0, (1, 1, 4, 9)], 2] - ] - - def cpu_op_exec_int32(input1, pad): - input1 = input1.to(torch.float32) - m = torch.nn.ReflectionPad2d(pad) - output = m(input1) - output = output.numpy() - output = output.astype(np.int32) - return output - - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = cpu_op_exec_int32(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestReflectionPad2d, globals(), except_for="cpu") -if __name__ == "__main__": - torch.npu.set_device("npu:2") - run_tests() -- Gitee