diff --git a/patch/npu.patch b/patch/npu.patch index 4e01faf27cccb57db1074605f20a5c9883360123..14c9a2b015f9a880197f7275d29f92be65b92dc7 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -1,6 +1,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt --- pytorch-v1.5.0/aten/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/CMakeLists.txt 2021-07-09 17:16:47.786789915 +0800 ++++ pytorch-develop/aten/CMakeLists.txt 2021-07-13 15:30:57.594267657 +0800 @@ -22,8 +22,10 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -51,7 +51,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-09 17:16:47.786789915 +0800 ++++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-13 15:30:57.594267657 +0800 @@ -67,6 +67,9 @@ FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h") FILE(GLOB native_cpu_h "native/cpu/*.h") @@ -129,7 +129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-09 17:16:47.794790202 +0800 ++++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-13 15:30:57.602267943 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -170,7 +170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-09 17:16:47.802790488 +0800 ++++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-13 15:30:57.610268230 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -248,7 +248,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # options should be List[FunctionOption] 'options': Any, 'schema_string': str, -@@ -1037,12 +1081,33 @@ +@@ -1037,12 +1081,32 @@ return_types.append(rtype) return return_types @@ -267,9 +267,8 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + elif 'TensorOptions' in argu_types: + check.append(argu_names[argu_types.index('TensorOptions')] + ".device()") + else: -+ print("Can not find right dispatch key of argument Type of Tensor, TensorList, TensorOptions") + print("argument:", option['schema_string']) -+ raise ++ raise ValueError("Can not find right dispatch key of argument Type of Tensor, TensorList, TensorOptions.") + return check def process_native(option): @@ -282,7 +281,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= formals = native_get_formals(option) option['formals_list'] = formals option['formals'] = [format_formal(f) for f in formals] -@@ -1203,17 +1268,22 @@ +@@ -1203,17 +1267,22 @@ # we just implement it in the base Type. This is exposed # in Declarations.yaml via a field named 'abstract'. abstract = False @@ -307,7 +306,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if option['use_c10_dispatcher'] == 'full': op_registrations.append(OpRegistration( operator_name=OPERATOR_NAME.substitute(option), -@@ -1236,6 +1306,17 @@ +@@ -1236,6 +1305,17 @@ option['native_type_method_dispatch'] = value top_env['native_function_declarations'].append(NATIVE_DECLARATION.substitute(option)) generated_native_functions.append(value) @@ -325,7 +324,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: top_env['native_function_declarations'].append(NATIVE_DECLARATION.substitute(option)) -@@ -1552,7 +1633,7 @@ +@@ -1552,7 +1632,7 @@ # type: (FunctionOption) -> None dispatch = option['type_method_definition_dispatch'] env = nested_dict(option, backend_type_env) @@ -334,7 +333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if isinstance(dispatch, dict): # If we're here, then our native_functions.yaml entry has dispatch configuration. # Having manual kernel registration doesn't make sense. -@@ -1576,6 +1657,18 @@ +@@ -1576,6 +1656,18 @@ op_registrations.append(OpRegistration( operator_name=OPERATOR_NAME.substitute(option), registration_code=BACKEND_UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(env))) @@ -355,7 +354,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for option in declaration['options']: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py --- pytorch-v1.5.0/aten/src/ATen/gen.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/gen.py 2021-07-09 17:16:47.802790488 +0800 ++++ pytorch-develop/aten/src/ATen/gen.py 2021-07-13 15:30:57.610268230 +0800 @@ -1,3 +1,18 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -513,7 +512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= generate_outputs() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-09 17:16:47.814790918 +0800 ++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-13 15:30:57.622268661 +0800 @@ -339,20 +339,20 @@ void hardsigmoid_backward_kernel(TensorIterator& iter) { @@ -541,7 +540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-09 17:16:47.806790632 +0800 ++++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-13 15:30:57.614268374 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -596,7 +595,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= detail::computeStorageSize(self.sizes(), self.strides()), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-09 17:16:47.830791493 +0800 ++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-13 15:30:57.634269091 +0800 @@ -1,6 +1,5 @@ # See README.md in this directory for more guidance @@ -2274,13 +2273,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor use_c10_dispatcher: full -@@ -1761,26 +2246,40 @@ - - func: matrix_power(Tensor self, int n) -> Tensor - use_c10_dispatcher: full - variants: function, method -+ npu_dispatch: -+ NPU: matrix_power_npu - +@@ -1765,22 +2250,34 @@ - func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method supports_named_tensor: True @@ -2315,7 +2308,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -1791,6 +2290,8 @@ +@@ -1791,6 +2288,8 @@ - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor supports_named_tensor: True @@ -2324,7 +2317,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> 
Tensor requires_tensor: True -@@ -1814,6 +2315,8 @@ +@@ -1814,6 +2313,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2333,7 +2326,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method -@@ -1822,6 +2325,8 @@ +@@ -1822,6 +2323,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2342,7 +2335,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -1829,47 +2334,73 @@ +@@ -1829,47 +2332,73 @@ CPU: mean_out_cpu_gpu CUDA: mean_out_cpu_gpu QuantizedCPU: quantized_mean_out_cpu @@ -2416,7 +2409,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor -@@ -1958,6 +2489,8 @@ +@@ -1958,6 +2487,8 @@ CUDA: legacy::cuda::_th_mm SparseCPU: _sparse_mm SparseCUDA: _sparse_mm @@ -2425,7 +2418,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) -@@ -1966,6 +2499,8 @@ +@@ -1966,6 +2497,8 @@ CUDA: legacy::cuda::_th_mm_out SparseCPU: _sparse_mm_out SparseCUDA: _sparse_mm_out @@ -2434,7 +2427,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor -@@ -1994,6 +2529,8 @@ +@@ -1994,6 +2527,8 @@ SparseCPU: mul_sparse SparseCUDA: mul_sparse MkldnnCPU: mkldnn_mul @@ -2443,7 +2436,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2004,6 +2541,8 @@ +@@ -2004,6 +2539,8 @@ SparseCPU: mul_sparse_ SparseCUDA: mul_sparse_ MkldnnCPU: mkldnn_mul_ @@ -2452,7 +2445,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
-@@ -2013,15 +2552,21 @@ +@@ -2013,15 +2550,21 @@ SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda MkldnnCPU: mkldnn_mul_out @@ -2474,7 +2467,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mv(Tensor self, Tensor vec) -> Tensor use_c10_dispatcher: full -@@ -2030,12 +2575,16 @@ +@@ -2030,12 +2573,16 @@ CPU: mv_cpu CUDA: legacy::cuda::_th_mv supports_named_tensor: True @@ -2491,7 +2484,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mvlgamma(Tensor self, int p) -> Tensor use_c10_dispatcher: full -@@ -2052,6 +2601,8 @@ +@@ -2052,6 +2599,8 @@ CUDA: narrow_copy_dense SparseCPU: narrow_copy_sparse SparseCUDA: narrow_copy_sparse @@ -2500,7 +2493,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) variants: function, method -@@ -2068,6 +2619,8 @@ +@@ -2068,6 +2617,8 @@ CPU: batch_norm_cpu CUDA: batch_norm_cuda MkldnnCPU: mkldnn_batch_norm @@ -2509,7 +2502,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: -@@ -2098,6 +2651,8 @@ +@@ -2098,6 +2649,8 @@ dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda @@ -2518,7 +2511,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) dispatch: -@@ -2117,6 +2672,8 @@ +@@ -2117,6 +2670,8 @@ - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor variants: function @@ -2527,7 +2520,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) variants: function -@@ -2129,42 +2686,60 @@ +@@ -2129,42 +2684,60 @@ - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -2590,7 +2583,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. -@@ -2253,54 +2828,82 @@ +@@ -2253,54 +2826,82 @@ supports_named_tensor: True - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2674,7 +2667,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor use_c10_dispatcher: full -@@ -2316,6 +2919,8 @@ +@@ -2316,6 +2917,8 @@ - func: repeat_interleave.self_int(Tensor self, int repeats, int? 
dim=None) -> Tensor use_c10_dispatcher: full variants: function, method @@ -2683,7 +2676,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reshape(Tensor self, int[] shape) -> Tensor variants: function, method -@@ -2337,16 +2942,22 @@ +@@ -2337,16 +2940,22 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -2706,7 +2699,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor -@@ -2360,6 +2971,8 @@ +@@ -2360,6 +2969,8 @@ CUDA: relu MkldnnCPU: mkldnn_relu QuantizedCPU: quantized_relu @@ -2715,7 +2708,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: relu_(Tensor(a!) self) -> Tensor(a!) -@@ -2370,6 +2983,8 @@ +@@ -2370,6 +2981,8 @@ CUDA: relu_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: quantized_relu_ @@ -2724,7 +2717,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: prelu(Tensor self, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -2377,12 +2992,16 @@ +@@ -2377,12 +2990,16 @@ dispatch: CPU: prelu_cpu CUDA: prelu_cuda @@ -2741,7 +2734,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2390,6 +3009,8 @@ +@@ -2390,6 +3007,8 @@ dispatch: CPU: gelu_cpu CUDA: gelu_cuda @@ -2750,7 +2743,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu_backward(Tensor grad, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2397,29 +3018,41 @@ +@@ -2397,29 +3016,41 @@ dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda @@ -2792,7 +2785,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method -@@ -2433,14 +3066,21 @@ +@@ -2433,14 +3064,21 @@ - func: selu(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2815,7 +2808,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2451,6 +3091,8 @@ +@@ -2451,6 +3089,8 @@ CUDA: sigmoid QuantizedCPU: quantized_sigmoid MkldnnCPU: mkldnn_sigmoid @@ -2824,7 +2817,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2459,36 +3101,52 @@ +@@ -2459,36 +3099,52 @@ CPU: sigmoid_ CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ @@ -2877,7 +2870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. -@@ -2533,6 +3191,8 @@ +@@ -2533,6 +3189,8 @@ - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) variants: function, method @@ -2886,7 +2879,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: smm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full -@@ -2542,10 +3202,14 @@ +@@ -2542,10 +3200,14 @@ - func: softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2901,7 +2894,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor use_c10_dispatcher: full -@@ -2553,12 +3217,16 @@ +@@ -2553,12 +3215,16 @@ CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax @@ -2918,7 +2911,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] variants: function, method -@@ -2609,8 +3277,12 @@ +@@ -2609,8 +3275,12 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor @@ -2931,7 +2924,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at -@@ -2633,20 +3305,30 @@ +@@ -2633,20 +3303,30 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2962,7 +2955,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sum_to_size(Tensor self, int[] size) -> Tensor variants: method -@@ -2656,13 +3338,19 @@ +@@ -2656,13 +3336,19 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -2982,7 +2975,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: square(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2677,51 +3365,81 @@ +@@ -2677,51 +3363,81 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3065,7 +3058,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: t(Tensor(a) self) -> Tensor(a) device_guard: False -@@ -2736,6 +3454,8 @@ +@@ -2736,6 +3452,8 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3074,7 +3067,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tan_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2743,12 +3463,16 @@ +@@ -2743,12 +3461,16 @@ dispatch: CPU: _tan__cpu CUDA: _tan__cuda @@ -3091,7 +3084,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2758,6 +3482,8 @@ +@@ -2758,6 +3480,8 @@ CPU: tanh CUDA: tanh QuantizedCPU: quantized_tanh @@ -3100,7 +3093,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2765,12 +3491,16 @@ +@@ -2765,12 +3489,16 @@ dispatch: CPU: _tanh__cpu CUDA: _tanh__cuda @@ -3117,7 +3110,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function -@@ -2783,6 +3513,8 @@ +@@ -2783,6 +3511,8 @@ dispatch: CPU: threshold CUDA: threshold_cuda @@ -3126,7 +3119,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
variants: function -@@ -2790,12 +3522,16 @@ +@@ -2790,12 +3520,16 @@ dispatch: CPU: threshold_ CUDA: threshold__cuda @@ -3143,7 +3136,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor use_c10_dispatcher: full -@@ -2803,6 +3539,8 @@ +@@ -2803,6 +3537,8 @@ dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda @@ -3152,7 +3145,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) variants: function, method -@@ -2835,18 +3573,24 @@ +@@ -2835,18 +3571,24 @@ use_c10_dispatcher: full python_module: nn variants: function @@ -3177,7 +3170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args -@@ -2872,6 +3616,8 @@ +@@ -2872,6 +3614,8 @@ CUDA: true_divide SparseCPU: true_divide_sparse SparseCUDA: true_divide_sparse @@ -3186,7 +3179,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2881,6 +3627,8 @@ +@@ -2881,6 +3625,8 @@ CUDA: true_divide_ SparseCPU: true_divide_sparse_ SparseCUDA: true_divide_sparse_ @@ -3195,7 +3188,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -@@ -2889,31 +3637,43 @@ +@@ -2889,31 +3635,43 @@ CUDA: true_divide_out SparseCPU: true_divide_out_sparse_zerodim SparseCUDA: true_divide_out_sparse_zerodim @@ -3239,7 +3232,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: type_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -2956,6 +3716,8 @@ +@@ -2956,6 +3714,8 @@ dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda @@ -3248,7 +3241,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _unsafe_view(Tensor self, int[] size) -> Tensor -@@ -2971,32 +3733,48 @@ +@@ -2971,32 +3731,48 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3297,7 +3290,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: view_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -3009,13 +3787,19 @@ +@@ -3009,13 +3785,19 @@ - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method @@ -3317,7 +3310,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function -@@ -3041,13 +3825,21 @@ +@@ -3041,13 +3823,21 @@ - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor device_guard: False @@ -3339,7 +3332,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full -@@ -3100,25 +3892,37 @@ +@@ -3100,25 +3890,37 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor dispatch: @@ -3379,7 +3372,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method -@@ -3162,12 +3966,16 @@ +@@ -3162,12 +3964,16 @@ SparseCUDA: clone_sparse MkldnnCPU: mkldnn_clone QuantizedCPU: quantized_clone @@ -3396,7 +3389,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -3176,6 +3984,8 @@ +@@ -3176,6 +3982,8 @@ CUDA: pow_out SparseCPU: pow_out_sparse_scalar SparseCUDA: pow_out_sparse_scalar @@ -3405,7 +3398,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor use_c10_dispatcher: full -@@ -3186,6 +3996,8 @@ +@@ -3186,6 +3994,8 @@ CUDA: pow SparseCPU: pow_sparse_scalar SparseCUDA: pow_sparse_scalar @@ -3414,7 +3407,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: zero_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -3196,6 +4008,14 @@ +@@ -3196,6 +4006,14 @@ SparseCPU: zero_sparse_ SparseCUDA: zero_sparse_ MkldnnCPU: mkldnn_zero_ @@ -3429,7 +3422,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: -@@ -3204,6 +4024,8 @@ +@@ -3204,6 +4022,8 @@ SparseCPU: sub_out_sparse SparseCUDA: sub_out_sparse supports_named_tensor: True @@ -3438,7 +3431,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -3213,6 +4035,8 @@ +@@ -3213,6 +4033,8 @@ CUDA: sub SparseCPU: sub_sparse SparseCUDA: sub_sparse @@ -3447,7 +3440,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) -@@ -3222,6 +4046,8 @@ +@@ -3222,6 +4044,8 @@ CUDA: sub_ SparseCPU: sub_sparse_ SparseCUDA: sub_sparse_ @@ -3456,7 +3449,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True # For C++ only, until we have conversion from C++ numbers to Tensor -@@ -3229,21 +4055,29 @@ +@@ -3229,21 +4053,29 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3486,7 +3479,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. 
-@@ -3257,6 +4091,8 @@ +@@ -3257,6 +4089,8 @@ CUDA: legacy::cuda::_th_addmm_out SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda @@ -3495,7 +3488,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor -@@ -3267,6 +4103,8 @@ +@@ -3267,6 +4101,8 @@ CUDA: legacy::cuda::_th_addmm SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda @@ -3504,7 +3497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) -@@ -3278,9 +4116,10 @@ +@@ -3278,9 +4114,10 @@ # broadcasting SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ @@ -3516,7 +3509,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # NOTE [ Sparse: autograd and API ] # # -@@ -3396,7 +4235,6 @@ +@@ -3396,7 +4233,6 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. @@ -3524,7 +3517,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. - func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor -@@ -3433,7 +4271,6 @@ +@@ -3433,7 +4269,6 @@ SparseCUDA: sparse_resize_and_clear_ requires_tensor: True @@ -3532,7 +3525,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sparse_mask(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full variants: method -@@ -3442,7 +4279,6 @@ +@@ -3442,7 +4277,6 @@ SparseCUDA: sparse_mask_cuda requires_tensor: True @@ -3540,7 +3533,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: to_dense(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3474,7 +4310,6 @@ +@@ -3474,7 +4308,6 @@ requires_tensor: True device_guard: False @@ -3548,7 +3541,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dense_dim(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3494,7 +4329,6 @@ +@@ -3494,7 +4327,6 @@ requires_tensor: True device_guard: False @@ -3556,7 +3549,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnz(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3504,7 +4338,6 @@ +@@ -3504,7 +4336,6 @@ requires_tensor: True device_guard: False @@ -3564,7 +3557,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: coalesce(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3513,7 +4346,6 @@ +@@ -3513,7 +4344,6 @@ SparseCUDA: coalesce_sparse_cuda requires_tensor: True @@ -3572,7 +3565,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: is_coalesced(Tensor self) -> bool use_c10_dispatcher: full variants: method -@@ -3524,7 +4356,6 @@ +@@ -3524,7 +4354,6 @@ device_guard: False supports_named_tensor: True @@ -3580,7 +3573,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: -@@ -3568,7 +4399,6 @@ +@@ 
-3568,7 +4397,6 @@ requires_tensor: True device_guard: False @@ -3588,7 +3581,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) dispatch: SparseCPU: hspmm_out_sparse_cpu -@@ -3630,11 +4460,15 @@ +@@ -3630,11 +4458,15 @@ variants: function dispatch: CPU: quantize_per_tensor_cpu @@ -3604,7 +3597,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dequantize(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -3713,20 +4547,28 @@ +@@ -3713,20 +4545,28 @@ variants: method device_guard: False supports_named_tensor: True @@ -3633,7 +3626,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: meshgrid(Tensor[] tensors) -> Tensor[] -@@ -3765,6 +4607,8 @@ +@@ -3765,6 +4605,8 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -3642,7 +3635,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= variants: function supports_named_tensor: True -@@ -3791,10 +4635,16 @@ +@@ -3791,10 +4633,16 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) @@ -3659,7 +3652,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -@@ -3839,10 +4689,14 @@ +@@ -3839,10 +4687,14 @@ # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) @@ -3674,7 +3667,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # wrappers for legacy TH methods -@@ -3852,6 +4706,8 @@ +@@ -3852,6 +4704,8 @@ dispatch: CPU: set_ CUDA: set_ @@ -3683,7 +3676,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method -@@ -3860,6 +4716,8 @@ +@@ -3860,6 +4714,8 @@ CPU: legacy::cpu::_th_set_ CUDA: legacy::cuda::_th_set_ QuantizedCPU: set_storage @@ -3692,7 +3685,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method -@@ -3867,12 +4725,16 @@ +@@ -3867,12 +4723,16 @@ dispatch: CPU: set_tensor_ CUDA: set_tensor_ @@ -3709,7 +3702,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!) 
variants: method -@@ -3892,6 +4754,8 @@ +@@ -3892,6 +4752,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3718,7 +3711,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor -@@ -3904,6 +4768,8 @@ +@@ -3904,6 +4766,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3727,7 +3720,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor -@@ -3916,6 +4782,8 @@ +@@ -3916,6 +4780,8 @@ dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda @@ -3736,7 +3729,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor use_c10_dispatcher: full -@@ -3929,25 +4797,35 @@ +@@ -3929,25 +4795,35 @@ CUDA: view MkldnnCPU: mkldnn_view QuantizedCPU: view @@ -3772,7 +3765,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3955,11 +4833,15 @@ +@@ -3955,11 +4831,15 @@ dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ @@ -3788,7 +3781,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) variants: method -@@ -3967,11 +4849,15 @@ +@@ -3967,11 +4847,15 @@ CPU: index_fill_ CUDA: index_fill_ supports_named_tensor: True @@ -3804,7 +3797,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3994,6 +4880,8 @@ +@@ -3994,6 +4878,8 @@ dispatch: CPU: scatter_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3813,7 +3806,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full -@@ -4004,6 +4892,8 @@ +@@ -4004,6 +4890,8 @@ dispatch: CPU: scatter_fill_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3822,7 +3815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor use_c10_dispatcher: full -@@ -4020,81 +4910,127 @@ +@@ -4020,81 +4908,127 @@ dispatch: CPU: scatter_add_cpu_ CUDA: legacy::cuda::_th_scatter_add_ @@ -3950,7 +3943,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4107,70 +5043,106 @@ +@@ -4107,70 +5041,106 @@ dispatch: CPU: bitwise_or_out CUDA: bitwise_or_out @@ -4057,7 +4050,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4240,18 +5212,24 @@ +@@ -4240,18 +5210,24 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) supports_named_tensor: True variants: method @@ -4082,7 +4075,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: digamma_(Tensor(a!) self) -> Tensor(a!) 
supports_named_tensor: True -@@ -4266,6 +5244,8 @@ +@@ -4266,6 +5242,8 @@ dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ @@ -4091,7 +4084,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4273,6 +5253,8 @@ +@@ -4273,6 +5251,8 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4100,7 +4093,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4280,53 +5262,71 @@ +@@ -4280,53 +5260,71 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4172,7 +4165,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -4334,28 +5334,40 @@ +@@ -4334,28 +5332,40 @@ dispatch: CPU: legacy::cpu::_th_addbmm CUDA: legacy::cuda::_th_addbmm @@ -4213,7 +4206,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) -@@ -4380,6 +5392,8 @@ +@@ -4380,6 +5390,8 @@ dispatch: CPU: legacy::cpu::_th_diag_out CUDA: legacy::cuda::_th_diag_out @@ -4222,7 +4215,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: diag(Tensor self, int diagonal=0) -> Tensor use_c10_dispatcher: full -@@ -4387,30 +5401,44 @@ +@@ -4387,30 +5399,44 @@ dispatch: CPU: legacy::cpu::_th_diag CUDA: legacy::cuda::_th_diag @@ -4267,7 +4260,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: -@@ -4435,6 +5463,8 @@ +@@ -4435,6 +5461,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4276,7 +4269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4444,6 +5474,8 @@ +@@ -4444,6 +5472,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4285,7 +4278,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4451,6 +5483,8 @@ +@@ -4451,6 +5481,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4294,7 +4287,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4460,6 +5494,8 @@ +@@ -4460,6 +5492,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4303,7 +4296,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4467,6 +5503,8 @@ +@@ -4467,6 +5501,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4312,7 +4305,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4476,6 +5514,8 @@ +@@ -4476,6 +5512,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4321,7 +4314,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4483,6 +5523,8 @@ +@@ -4483,6 +5521,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4330,7 +4323,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4492,6 +5534,8 @@ +@@ -4492,6 +5532,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4339,7 +4332,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4499,6 +5543,8 @@ +@@ -4499,6 +5541,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4348,7 +4341,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4508,6 +5554,8 @@ +@@ -4508,6 +5552,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4357,7 +4350,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4515,6 +5563,8 @@ +@@ -4515,6 +5561,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4366,7 +4359,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4524,6 +5574,8 @@ +@@ -4524,6 +5572,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4375,7 +4368,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4531,6 +5583,8 @@ +@@ -4531,6 +5581,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4384,7 +4377,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4540,6 +5594,8 @@ +@@ -4540,6 +5592,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4393,7 +4386,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4547,6 +5603,8 @@ +@@ -4547,6 +5601,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4402,7 +4395,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4556,6 +5614,8 @@ +@@ -4556,6 +5612,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4411,7 +4404,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4563,6 +5623,8 @@ +@@ -4563,6 +5621,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4420,7 +4413,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4572,6 +5634,8 @@ +@@ -4572,6 +5632,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4429,7 +4422,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4579,6 +5643,8 @@ +@@ -4579,6 +5641,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4438,7 +4431,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4588,6 +5654,8 @@ +@@ -4588,6 +5652,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4447,7 +4440,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4595,6 +5663,8 @@ +@@ -4595,6 +5661,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4456,7 +4449,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4604,6 +5674,8 @@ +@@ -4604,6 +5672,8 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4465,7 +4458,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4611,6 +5683,8 @@ +@@ -4611,6 +5681,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4474,7 +4467,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4620,11 +5694,16 @@ +@@ -4620,11 +5692,16 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4491,7 +4484,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: take(Tensor self, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4632,11 +5711,16 @@ +@@ -4632,11 +5709,16 @@ dispatch: CPU: legacy::cpu::_th_take CUDA: legacy::cuda::_th_take @@ -4508,7 +4501,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_select(Tensor self, int dim, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4646,17 +5730,25 @@ +@@ -4646,17 +5728,25 @@ CUDA: legacy::cuda::_th_index_select SparseCPU: index_select_sparse SparseCUDA: index_select_sparse @@ -4534,7 +4527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_select(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full -@@ -4665,11 +5757,15 @@ +@@ -4665,11 +5755,15 @@ CPU: masked_select_cpu CUDA: masked_select_cuda supports_named_tensor: True @@ -4550,7 +4543,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -4677,6 +5773,8 @@ +@@ -4677,6 +5771,8 @@ dispatch: CPU: legacy::cpu::_th_nonzero CUDA: legacy::cuda::_th_nonzero @@ -4559,7 +4552,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero_numpy(Tensor self) -> Tensor[] variants: method, function -@@ -4685,6 +5783,8 @@ +@@ -4685,6 +5781,8 @@ dispatch: CPU: gather_out_cpu CUDA: gather_out_cuda @@ -4568,7 +4561,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor use_c10_dispatcher: full -@@ -4692,34 +5792,50 @@ +@@ -4692,34 +5790,50 @@ dispatch: CPU: gather_cpu CUDA: gather_cuda @@ -4619,7 +4612,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) dispatch: -@@ -4826,9 +5942,13 @@ +@@ -4826,9 +5940,13 @@ CUDA: legacy::cuda::_th_potri - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) @@ -4633,7 +4626,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor) variants: function -@@ -4891,12 +6011,16 @@ +@@ -4891,12 +6009,16 @@ dispatch: CPU: multinomial_out CUDA: multinomial_out @@ -4650,7 +4643,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) variants: function -@@ -4947,6 +6071,8 @@ +@@ -4947,6 +6069,8 @@ dispatch: CPU: erfinv CUDA: erfinv @@ -4659,7 +4652,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: erfinv_(Tensor(a!) self) -> Tensor(a!) 
supports_named_tensor: True -@@ -4954,26 +6080,36 @@ +@@ -4954,26 +6078,36 @@ dispatch: CPU: _erfinv__cpu CUDA: _erfinv__cuda @@ -4696,7 +4689,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor use_c10_dispatcher: full -@@ -4981,21 +6117,29 @@ +@@ -4981,21 +6115,29 @@ - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True @@ -4726,7 +4719,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor use_c10_dispatcher: full -@@ -5003,6 +6147,8 @@ +@@ -5003,6 +6145,8 @@ dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar @@ -4735,7 +4728,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -5010,6 +6156,8 @@ +@@ -5010,6 +6154,8 @@ dispatch: CPU: lerp_cpu_tensor CUDA: lerp_cuda_tensor @@ -4744,7 +4737,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: -@@ -5027,6 +6175,8 @@ +@@ -5027,6 +6173,8 @@ dispatch: CPU: fmod_out CUDA: legacy::cuda::_th_fmod_out @@ -4753,7 +4746,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5034,11 +6184,15 @@ +@@ -5034,11 +6182,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4769,7 +4762,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5046,11 +6200,15 @@ +@@ -5046,11 +6198,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4785,7 +4778,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5058,11 +6216,15 @@ +@@ -5058,11 +6214,15 @@ dispatch: CPU: remainder CUDA: remainder @@ -4801,7 +4794,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5070,12 +6232,18 @@ +@@ -5070,12 +6230,18 @@ dispatch: CPU: remainder CUDA: remainder @@ -4820,7 +4813,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: min(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5084,13 +6252,19 @@ +@@ -5084,13 +6250,19 @@ CPU: min CUDA: legacy::cuda::_th_min QuantizedCPU: min_quant @@ -4840,7 +4833,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5099,6 +6273,8 @@ +@@ -5099,6 +6271,8 @@ CPU: max CUDA: legacy::cuda::_th_max QuantizedCPU: max_quant @@ -4849,7 +4842,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: median(Tensor self) -> Tensor -@@ -5107,12 +6283,16 @@ +@@ -5107,12 +6281,16 @@ dispatch: CPU: median_cpu CUDA: median_cuda @@ -4866,7 +4859,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) variants: method, function 
-@@ -5120,23 +6300,45 @@ +@@ -5120,23 +6298,45 @@ CPU: legacy::cpu::_th_sort CUDA: legacy::cuda::_th_sort QuantizedCPU: sort_quant @@ -4912,7 +4905,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function -@@ -5144,11 +6346,15 @@ +@@ -5144,11 +6344,15 @@ CPU: topk CUDA: topk QuantizedCPU: quantized_topk_cpu @@ -4928,7 +4921,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: any(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5159,11 +6365,15 @@ +@@ -5159,11 +6363,15 @@ CUDA: any SparseCPU: any_sparse SparseCUDA: any_sparse @@ -4944,7 +4937,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor use_c10_dispatcher: full -@@ -5171,6 +6381,8 @@ +@@ -5171,6 +6379,8 @@ dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm @@ -4953,7 +4946,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) variants: method -@@ -5178,6 +6390,8 @@ +@@ -5178,6 +6388,8 @@ dispatch: CPU: unfold CUDA: unfold @@ -4962,7 +4955,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: equal(Tensor self, Tensor other) -> bool use_c10_dispatcher: full -@@ -5186,6 +6400,8 @@ +@@ -5186,6 +6398,8 @@ CPU: legacy::cpu::_th_equal CUDA: legacy::cuda::_th_equal QuantizedCPU: quantized_equal @@ -4971,7 +4964,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) -@@ -5193,6 +6409,8 @@ +@@ -5193,6 +6407,8 @@ dispatch: CPU: pow_out CUDA: pow_out @@ -4980,7 +4973,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5201,12 +6419,16 @@ +@@ -5201,12 +6417,16 @@ dispatch: CPU: pow CUDA: pow @@ -4997,7 +4990,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5214,6 +6436,8 @@ +@@ -5214,6 +6434,8 @@ dispatch: CPU: pow CUDA: pow @@ -5006,7 +4999,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) 
variants: method -@@ -5221,40 +6445,58 @@ +@@ -5221,40 +6443,58 @@ CPU: normal_cpu_ CUDA: normal_cuda_ supports_named_tensor: True @@ -5065,7 +5058,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: alias(Tensor(a) self) -> Tensor(a) variants: method, function -@@ -5265,43 +6507,59 @@ +@@ -5265,43 +6505,59 @@ dispatch: CPU: legacy::cpu::_th_addr CUDA: legacy::cuda::_th_addr @@ -5126,7 +5119,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5309,6 +6567,8 @@ +@@ -5309,6 +6565,8 @@ CPU: legacy::cpu::_th_var CUDA: legacy::cuda::_th_var supports_named_tensor: True @@ -5135,7 +5128,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5321,6 +6581,8 @@ +@@ -5321,6 +6579,8 @@ variants: function dispatch: CUDA: _amp_non_finite_check_and_unscale_cuda_ @@ -5144,7 +5137,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor variants: function -@@ -5332,12 +6594,16 @@ +@@ -5332,12 +6592,16 @@ CPU: _cat_cpu CUDA: cat_cuda QuantizedCPU: quantized_cat @@ -5161,7 +5154,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) dispatch: -@@ -5353,36 +6619,50 @@ +@@ -5353,36 +6617,50 @@ dispatch: CPU: legacy::cpu::_th_max CUDA: legacy::cuda::_th_max @@ -5212,7 +5205,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full -@@ -5390,23 +6670,33 @@ +@@ -5390,23 +6668,33 @@ dispatch: CPU: mse_loss_backward CUDA: mse_loss_backward @@ -5246,7 +5239,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5434,22 +6724,30 @@ +@@ -5434,22 +6722,30 @@ - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5277,7 +5270,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -5466,97 +6764,137 @@ +@@ -5466,97 +6762,137 @@ - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5415,7 +5408,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn -@@ -5564,6 +6902,8 @@ +@@ -5564,6 +6900,8 @@ CPU: elu_out CUDA: elu_out QuantizedCPU: quantized_elu_out @@ -5424,7 +5417,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full -@@ -5572,16 +6912,22 @@ +@@ -5572,16 +6910,22 @@ CPU: elu CUDA: elu QuantizedCPU: quantized_elu @@ -5447,7 +5440,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) python_module: nn -@@ -5589,12 +6935,16 @@ +@@ -5589,12 +6933,16 @@ CPU: elu_ CUDA: elu_ QuantizedCPU: quantized_elu_ @@ -5464,7 +5457,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu(Tensor self, int dim=-1) -> Tensor use_c10_dispatcher: full -@@ -5602,12 +6952,16 @@ +@@ -5602,12 +6950,16 @@ dispatch: CPU: glu CUDA: legacy::cuda::_thnn_glu_forward @@ -5481,7 +5474,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor use_c10_dispatcher: full -@@ -5615,20 +6969,30 @@ +@@ -5615,20 +6967,30 @@ dispatch: CPU: glu_backward CUDA: legacy::cuda::_thnn_glu_backward @@ -5512,7 +5505,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5636,6 +7000,8 @@ +@@ -5636,6 +6998,8 @@ CPU: hardtanh_out CUDA: hardtanh_out QuantizedCPU: quantized_hardtanh_out @@ -5521,7 +5514,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor use_c10_dispatcher: full -@@ -5644,16 +7010,22 @@ +@@ -5644,16 +7008,22 @@ CPU: hardtanh CUDA: hardtanh QuantizedCPU: quantized_hardtanh @@ -5544,7 +5537,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) python_module: nn -@@ -5661,6 +7033,8 @@ +@@ -5661,6 +7031,8 @@ CPU: hardtanh_ CUDA: hardtanh_ QuantizedCPU: quantized_hardtanh_ @@ -5553,7 +5546,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5668,6 +7042,8 @@ +@@ -5668,6 +7040,8 @@ CPU: leaky_relu_out CUDA: leaky_relu_out QuantizedCPU: quantized_leaky_relu_out @@ -5562,7 +5555,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor use_c10_dispatcher: full -@@ -5676,10 +7052,14 @@ +@@ -5676,10 +7050,14 @@ CPU: leaky_relu CUDA: leaky_relu QuantizedCPU: quantized_leaky_relu @@ -5577,7 +5570,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) 
python_module: nn -@@ -5687,31 +7067,44 @@ +@@ -5687,31 +7065,44 @@ CPU: leaky_relu_ CUDA: leaky_relu_ QuantizedCPU: quantized_leaky_relu_ @@ -5622,7 +5615,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor use_c10_dispatcher: full -@@ -5719,6 +7112,8 @@ +@@ -5719,6 +7110,8 @@ dispatch: CPU: log_sigmoid_backward_cpu CUDA: legacy::cuda::_thnn_log_sigmoid_backward @@ -5631,7 +5624,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5744,37 +7139,53 @@ +@@ -5744,37 +7137,53 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5685,7 +5678,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5782,9 +7193,13 @@ +@@ -5782,9 +7191,13 @@ CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda MkldnnCPU: mkldnn_adaptive_avg_pool2d_out @@ -5699,7 +5692,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor dispatch: -@@ -5796,6 +7211,8 @@ +@@ -5796,6 +7209,8 @@ CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: quantized_adaptive_avg_pool2d @@ -5708,7 +5701,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5803,24 +7220,32 @@ +@@ -5803,24 +7218,32 @@ dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda @@ -5741,7 +5734,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5828,6 +7253,8 @@ +@@ -5828,6 +7251,8 @@ dispatch: CPU: adaptive_avg_pool3d_backward_cpu CUDA: adaptive_avg_pool3d_backward_cuda @@ -5750,7 +5743,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) -@@ -5835,6 +7262,8 @@ +@@ -5835,6 +7260,8 @@ dispatch: CPU: adaptive_max_pool2d_out_cpu CUDA: adaptive_max_pool2d_out_cuda @@ -5759,7 +5752,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) -@@ -5842,12 +7271,16 @@ +@@ -5842,12 +7269,16 @@ dispatch: CPU: adaptive_max_pool2d_cpu CUDA: adaptive_max_pool2d_cuda @@ -5776,7 +5769,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor use_c10_dispatcher: full -@@ -5855,6 +7288,8 @@ +@@ -5855,6 +7286,8 @@ dispatch: CPU: adaptive_max_pool2d_backward_cpu CUDA: adaptive_max_pool2d_backward_cuda @@ -5785,7 +5778,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5889,6 +7324,8 @@ +@@ -5889,6 +7322,8 @@ CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda MkldnnCPU: mkldnn_avg_pool2d_out @@ -5794,7 +5787,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5897,24 +7334,32 @@ +@@ -5897,24 +7332,32 @@ CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d QuantizedCPU: quantized_avg_pool2d @@ -5827,7 +5820,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5922,18 +7367,24 @@ +@@ -5922,18 +7365,24 @@ CPU: avg_pool3d_cpu CUDA: avg_pool3d_cuda QuantizedCPU: quantized_avg_pool3d @@ -5852,7 +5845,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5993,6 +7444,8 @@ +@@ -5993,6 +7442,8 @@ dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda @@ -5861,7 +5854,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6000,6 +7453,8 @@ +@@ -6000,6 +7451,8 @@ dispatch: CPU: max_pool2d_with_indices_cpu CUDA: max_pool2d_with_indices_cuda @@ -5870,7 +5863,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
-@@ -6007,12 +7462,16 @@ +@@ -6007,12 +7460,16 @@ dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda @@ -5887,7 +5880,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -6020,6 +7479,8 @@ +@@ -6020,6 +7477,8 @@ dispatch: CPU: max_pool3d_with_indices_out_cpu CUDA: max_pool3d_with_indices_out_cuda @@ -5896,7 +5889,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6027,6 +7488,8 @@ +@@ -6027,6 +7486,8 @@ dispatch: CPU: max_pool3d_with_indices_cpu CUDA: max_pool3d_with_indices_cuda @@ -5905,7 +5898,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) -@@ -6034,12 +7497,17 @@ +@@ -6034,12 +7495,17 @@ dispatch: CPU: max_pool3d_with_indices_backward_out_cpu CUDA: max_pool3d_with_indices_backward_out_cuda @@ -5923,7 +5916,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6118,12 +7586,16 @@ +@@ -6118,12 +7584,16 @@ dispatch: CPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda @@ -5940,7 +5933,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6166,12 +7638,16 @@ +@@ -6166,12 +7636,16 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda @@ -5957,7 +5950,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6214,12 +7690,16 @@ +@@ -6214,12 +7688,16 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda @@ -5974,7 +5967,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6232,12 +7712,16 @@ +@@ -6232,12 +7710,16 @@ dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda @@ -5991,7 +5984,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor python_module: nn -@@ -6245,96 +7729,128 @@ +@@ -6245,96 +7727,128 @@ CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda QuantizedCPU: quantized_upsample_bilinear2d_cpu @@ -6120,7 +6113,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6342,24 +7858,32 @@ +@@ -6342,24 +7856,32 @@ CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda QuantizedCPU: quantized_upsample_nearest2d_cpu @@ -6153,7 +6146,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6367,38 +7891,52 @@ +@@ -6367,38 +7889,52 @@ CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda QuantizedCPU: quantized_upsample_nearest3d_cpu @@ -6206,7 +6199,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # What's a thnn_conv_ versus a slow_conv_? # -@@ -6423,24 +7961,32 @@ +@@ -6423,24 +7959,32 @@ dispatch: CPU: slow_conv_transpose2d_out_cpu CUDA: slow_conv_transpose2d_out_cuda @@ -6239,7 +6232,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6468,21 +8014,29 @@ +@@ -6468,21 +8012,29 @@ - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -6269,7 +6262,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn -@@ -6495,32 +8049,46 @@ +@@ -6495,32 +8047,46 @@ dispatch: CPU: slow_conv2d_backward_cpu CUDA: legacy::cuda::_thnn_conv2d_backward @@ -6316,7 +6309,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6553,12 +8121,16 @@ +@@ -6553,12 +8119,16 @@ dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda @@ -6333,7 +6326,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? 
bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor python_module: nn -@@ -6577,57 +8149,401 @@ +@@ -6577,57 +8147,405 @@ dispatch: CPU: col2im_out_cpu CUDA: col2im_out_cuda @@ -6354,7 +6347,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CPU: col2im_backward_out_cpu CUDA: col2im_backward_out_cuda + npu_dispatch: -+ NPU: col2im_backward_out_npu ++ NPU: im2col_out_npu - func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor python_module: nn @@ -6362,7 +6355,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CPU: col2im_backward_cpu CUDA: col2im_backward_cuda + npu_dispatch: -+ NPU: col2im_backward_npu ++ NPU: im2col_npu - func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -6735,10 +6728,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor) + npu_dispatch_only: + NPU: linear_backward_npu ++ ++- func: npu_bert_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor(a!), Tensor(b!), Tensor(c!)) ++ npu_dispatch_only: ++ NPU: bert_apply_adam_npu \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-09 17:16:47.866792783 +0800 ++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-13 15:30:57.674270525 +0800 @@ -659,14 +659,14 @@ SUB x1, x1, 4 @@ -6764,7 +6761,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CMP x1, 2 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-09 17:16:47.810790775 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-13 15:30:57.618268517 +0800 @@ -64,7 +64,7 @@ Tensor isinf(const Tensor &self) { @@ -6776,7 +6773,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ 
pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-09 17:16:47.810790775 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-13 15:30:57.618268517 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6821,7 +6818,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-09 17:16:47.810790775 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-13 15:30:57.618268517 +0800 @@ -87,6 +87,7 @@ if (self.is_contiguous(memory_format)) { return self; @@ -6832,7 +6829,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= "preserve memory format is unsupported by the contiguous operator"); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-09 17:16:47.814790918 +0800 ++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-13 15:30:57.622268661 +0800 @@ -26,7 +26,7 @@ const scalar_t* in = &idata[output_y * input_width + output_x]; scalar_t* out = &odata[output_y * output_width + output_x]; @@ -6844,7 +6841,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= out += output_width * output_height; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py --- pytorch-v1.5.0/aten/src/ATen/native_parse.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-09 17:16:47.878793213 +0800 ++++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-13 15:30:57.686270955 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -6882,7 +6879,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= msg = '''Exception raised in processing function: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-09 17:16:47.882793357 +0800 ++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-13 15:30:57.690271099 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6914,7 +6911,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-09 17:16:47.882793357 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-13 15:30:57.690271099 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6947,7 +6944,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-09 17:16:47.882793357 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-13 15:30:57.690271099 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6981,7 +6978,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-09 17:16:47.886793500 +0800 ++++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-13 15:30:57.694271242 +0800 @@ -48,6 +48,11 @@ ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) @@ -6996,7 +6993,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-09 17:16:47.886793500 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-13 15:30:57.694271242 +0800 @@ -1,9 +1,32 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7105,7 +7102,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-09 17:16:47.886793500 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-13 15:30:57.694271242 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7144,7 +7141,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt --- pytorch-v1.5.0/c10/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/CMakeLists.txt 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/CMakeLists.txt 2021-07-13 15:30:57.706271672 +0800 @@ -63,6 +63,14 @@ message(STATUS "don't use NUMA") endif() @@ -7173,7 +7170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # not checked in diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h --- pytorch-v1.5.0/c10/core/Backend.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Backend.h 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/Backend.h 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7268,7 +7265,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp --- pytorch-v1.5.0/c10/core/Device.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.cpp 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/Device.cpp 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7308,7 +7305,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= types.begin(), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h --- pytorch-v1.5.0/c10/core/Device.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.h 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/Device.h 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7343,7 +7340,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return type_ == DeviceType::CPU; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp --- pytorch-v1.5.0/c10/core/DeviceType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7383,7 +7380,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return false; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h --- pytorch-v1.5.0/c10/core/DeviceType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.h 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/DeviceType.h 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7426,7 +7423,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= constexpr DeviceType kXLA = DeviceType::XLA; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp --- pytorch-v1.5.0/c10/core/DispatchKey.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7458,7 +7455,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= case DispatchKey::TESTING_ONLY_GenericModeTensorId: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h --- pytorch-v1.5.0/c10/core/DispatchKey.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.h 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/DispatchKey.h 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7490,7 +7487,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h --- pytorch-v1.5.0/c10/core/Storage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Storage.h 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/Storage.h 2021-07-13 15:30:57.706271672 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7524,7 +7521,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h --- pytorch-v1.5.0/c10/core/StorageImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/StorageImpl.h 2021-07-09 17:16:47.902794074 +0800 ++++ pytorch-develop/c10/core/StorageImpl.h 2021-07-13 15:30:57.706271672 +0800 @@ -1,12 +1,39 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7581,7 +7578,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h --- pytorch-v1.5.0/c10/core/TensorImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorImpl.h 2021-07-09 17:16:47.906794218 +0800 ++++ pytorch-develop/c10/core/TensorImpl.h 2021-07-13 15:30:57.710271816 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7651,7 +7648,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h --- pytorch-v1.5.0/c10/core/TensorOptions.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorOptions.h 2021-07-09 17:16:47.906794218 +0800 ++++ pytorch-develop/c10/core/TensorOptions.h 2021-07-13 15:30:57.710271816 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7692,7 +7689,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h --- pytorch-v1.5.0/c10/macros/Export.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/macros/Export.h 2021-07-09 17:16:47.906794218 +0800 ++++ pytorch-develop/c10/macros/Export.h 2021-07-13 15:30:57.710271816 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7819,7 +7816,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt --- pytorch-v1.5.0/caffe2/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-09 17:16:47.918794647 +0800 ++++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-13 15:30:57.718272102 +0800 @@ -32,6 +32,7 @@ # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) @@ -7966,7 +7963,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs. 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format --- pytorch-v1.5.0/.clang-format 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.clang-format 2021-07-09 17:16:47.778789628 +0800 ++++ pytorch-develop/.clang-format 2021-07-13 15:30:57.586267370 +0800 @@ -84,5 +84,4 @@ SpacesInSquareBrackets: false Standard: Cpp11 @@ -7977,7 +7974,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake --- pytorch-v1.5.0/cmake/BuildVariables.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-09 17:16:48.030798663 +0800 ++++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-13 15:30:57.830276118 +0800 @@ -11,6 +11,7 @@ # CMakeLists.txt files under each folder respectively. set(Caffe2_CPU_SRCS) @@ -8004,7 +8001,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # symbols. However, if the lib is whole linked in caffe2 lib, we don't want diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake --- pytorch-v1.5.0/cmake/Codegen.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Codegen.cmake 2021-07-09 17:16:48.030798663 +0800 ++++ pytorch-develop/cmake/Codegen.cmake 2021-07-13 15:30:57.830276118 +0800 @@ -191,13 +191,14 @@ file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp) file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp) @@ -8035,7 +8032,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake --- pytorch-v1.5.0/cmake/Dependencies.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Dependencies.cmake 2021-07-09 17:16:48.030798663 +0800 ++++ pytorch-develop/cmake/Dependencies.cmake 2021-07-13 15:30:57.830276118 +0800 @@ -1509,6 +1509,13 @@ ENDIF(NOT C_HAS_THREAD) endif() @@ -8052,7 +8049,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake --- pytorch-v1.5.0/cmake/Summary.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Summary.cmake 2021-07-09 17:16:48.034798807 +0800 ++++ pytorch-develop/cmake/Summary.cmake 2021-07-13 15:30:57.830276118 +0800 @@ -134,6 +134,7 @@ if(NOT "${SELECTED_OP_LIST}" STREQUAL "") message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") @@ -8063,7 +8060,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endfunction() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur 
pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-09 17:16:48.034798807 +0800 ++++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-13 15:30:57.830276118 +0800 @@ -112,6 +112,11 @@ list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() @@ -8078,7 +8075,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt --- pytorch-v1.5.0/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/CMakeLists.txt 2021-07-09 17:16:47.782789771 +0800 ++++ pytorch-develop/CMakeLists.txt 2021-07-13 15:30:57.590267513 +0800 @@ -205,6 +205,10 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) @@ -8145,7 +8142,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore --- pytorch-v1.5.0/.dockerignore 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.dockerignore 2021-07-09 17:16:47.778789628 +0800 ++++ pytorch-develop/.dockerignore 2021-07-13 15:30:57.586267370 +0800 @@ -1,257 +1 @@ -# READ THIS BEFORE YOU REFACTOR ME -# @@ -8408,7 +8405,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat --- pytorch-v1.5.0/docs/make.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/docs/make.bat 2021-07-09 17:16:48.038798950 +0800 ++++ pytorch-develop/docs/make.bat 2021-07-13 15:30:57.834276262 +0800 @@ -1,36 +1,36 @@ -@ECHO OFF - @@ -8497,7 +8494,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt --- pytorch-v1.5.0/requirements.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/requirements.txt 2021-07-09 17:16:48.054799524 +0800 ++++ pytorch-develop/requirements.txt 2021-07-13 15:30:57.850276836 +0800 @@ -4,4 +4,12 @@ requests setuptools @@ -8516,7 +8513,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat --- pytorch-v1.5.0/scripts/appveyor/install.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install.bat 2021-07-09 17:16:48.054799524 +0800 ++++ pytorch-develop/scripts/appveyor/install.bat 2021-07-13 15:30:57.850276836 +0800 @@ -1,10 +1,10 @@ -:: Installation scripts for 
appveyor. - @@ -8540,7 +8537,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +conda install -y numpy diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-09 17:16:48.054799524 +0800 ++++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-13 15:30:57.850276836 +0800 @@ -1,22 +1,22 @@ -@echo on - @@ -8588,7 +8585,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +nvcc -V || exit /b diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat --- pytorch-v1.5.0/scripts/build_windows.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/build_windows.bat 2021-07-09 17:16:48.054799524 +0800 ++++ pytorch-develop/scripts/build_windows.bat 2021-07-13 15:30:57.850276836 +0800 @@ -1,84 +1,84 @@ -:: ############################################################################# -:: Example command to build on Windows. @@ -8760,7 +8757,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +exit /b 1 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1 --- pytorch-v1.5.0/scripts/proto.ps1 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/proto.ps1 2021-07-09 17:16:48.054799524 +0800 ++++ pytorch-develop/scripts/proto.ps1 2021-07-13 15:30:57.850276836 +0800 @@ -1,17 +1,17 @@ -param( - [string]$protoc, @@ -8798,7 +8795,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +Invoke-Expression $cmd diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py --- pytorch-v1.5.0/setup.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/setup.py 2021-07-09 17:16:48.054799524 +0800 ++++ pytorch-develop/setup.py 2021-07-13 15:30:57.850276836 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -8897,7 +8894,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'python/serialized_test/data/operator_test/*.zip', diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml --- pytorch-v1.5.0/tools/autograd/derivatives.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-13 15:30:58.990317711 +0800 @@ -107,6 +107,10 @@ # # NB: The parameter names here MUST be consistent with the parameter names @@ -9010,8 +9007,8 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py --- pytorch-v1.5.0/tools/autograd/dump_utils.py 1970-01-01 08:00:00.000000000 +0800 -+++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-09 17:16:49.194840399 +0800 -@@ -0,0 +1,112 @@ ++++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-13 15:30:58.990317711 +0800 +@@ -0,0 +1,114 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# All rights reserved. +# @@ -9122,11 +9119,13 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + "item", + "set__source_Storage_storage_offset", + "pin_memory", -+ "to_device" ++ "to_device", ++ "numpy_T", ++ "slice_Tensor" +] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-13 15:30:58.990317711 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9312,7 +9311,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-13 15:30:58.990317711 +0800 @@ -1,3 +1,20 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9354,7 +9353,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'value': argname, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-13 15:30:58.990317711 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9527,7 +9526,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-13 15:30:58.990317711 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9607,7 +9606,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto sparse = sparse_.coalesce(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-13 15:30:58.990317711 +0800 @@ -22,7 +22,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/structseq.h" @@ -9691,7 +9690,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-13 15:30:58.990317711 +0800 @@ -15,7 +15,13 @@ #include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/Event.h" @@ -9778,7 +9777,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 
pytorch-develop/tools/autograd/templates/VariableType.cpp --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-13 15:30:58.990317711 +0800 @@ -1,7 +1,27 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9809,7 +9808,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-09 17:16:49.194840399 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-13 15:30:58.990317711 +0800 @@ -1,3 +1,20 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9841,7 +9840,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= const at::Tensor & unpack(const Tensor & t, const char * name, int pos); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl --- pytorch-v1.5.0/tools/build_variables.bzl 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/build_variables.bzl 2021-07-09 17:16:49.198840543 +0800 ++++ pytorch-develop/tools/build_variables.bzl 2021-07-13 15:30:58.994317854 +0800 @@ -46,6 +46,7 @@ "torch/csrc/autograd/functions/utils.cpp", "torch/csrc/autograd/input_buffer.cpp", @@ -9927,7 +9926,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py --- pytorch-v1.5.0/torch/autograd/profiler.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/autograd/profiler.py 2021-07-09 17:16:49.202840686 +0800 ++++ pytorch-develop/torch/autograd/profiler.py 2021-07-13 15:30:58.998317998 +0800 @@ -1,8 +1,25 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -10400,7 +10399,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return ''.join(result) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt --- pytorch-v1.5.0/torch/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/CMakeLists.txt 2021-07-09 17:16:49.198840543 +0800 ++++ pytorch-develop/torch/CMakeLists.txt 2021-07-13 15:30:58.994317854 +0800 @@ -97,6 +97,7 @@ ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp ${TORCH_SRC_DIR}/csrc/utils.cpp @@ -10432,7 +10431,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10555,7 +10554,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto event = c10::Event{c10::DeviceType::CUDA}; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10587,7 +10586,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= /*non_blocking=*/false, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10630,7 +10629,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= m.def("_enable_profiler", enableProfiler); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10682,7 +10681,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto& old_var = buffer[pos]; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10878,7 +10877,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CUDAStubs::~CUDAStubs() = default; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11003,7 +11002,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11057,7 +11056,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-13 15:30:59.010318428 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11098,7 +11097,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-09 17:16:49.214841116 +0800 ++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-13 15:30:59.010318428 +0800 @@ -168,6 +168,45 @@ return r.release(); } @@ -11147,7 +11146,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!r) throw python_error(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp --- pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-09 17:16:49.210840973 +0800 ++++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-13 15:30:59.006318284 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11181,7 +11180,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!t.defined()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-09 17:16:49.218841259 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-13 15:30:59.014318571 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11287,7 +11286,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= while (!in_flight.empty()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-09 17:16:49.218841259 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-13 15:30:59.014318571 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11344,7 +11343,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-09 17:16:49.218841259 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-13 15:30:59.014318571 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11469,7 +11468,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-09 17:16:49.202840686 +0800 ++++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-13 15:30:58.998317998 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11518,7 +11517,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return it->second; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp --- pytorch-v1.5.0/torch/csrc/Generator.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-09 17:16:49.202840686 +0800 ++++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-13 15:30:58.998317998 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11586,7 +11585,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #endif diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-09 17:16:49.222841403 +0800 ++++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-13 15:30:59.018318714 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11686,7 +11685,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-09 17:16:49.218841259 +0800 ++++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-13 15:30:59.018318714 +0800 @@ -1,7 +1,25 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11765,7 +11764,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for (Py_ssize_t i = 0; i < length; i++) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-09 17:16:49.222841403 +0800 ++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-13 15:30:59.018318714 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11813,7 +11812,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp --- pytorch-v1.5.0/torch/csrc/Module.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Module.cpp 2021-07-09 17:16:49.202840686 +0800 ++++ pytorch-develop/torch/csrc/Module.cpp 2021-07-13 15:30:58.998317998 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11957,7 +11956,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-13 15:30:59.038319432 +0800 @@ -1,18 +1,35 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12334,7 +12333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +} // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp --- pytorch-v1.5.0/torch/csrc/utils/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-13 15:30:59.038319432 +0800 @@ -1,6 +1,10 @@ #include #include @@ -12422,7 +12421,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h --- pytorch-v1.5.0/torch/csrc/utils/init.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.h 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/utils/init.h 2021-07-13 15:30:59.038319432 +0800 @@ -8,4 +8,7 @@ void initThroughputBenchmarkBindings(PyObject* module); @@ -12433,7 +12432,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-13 15:30:59.038319432 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12468,7 +12467,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return at::Device(device_str); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-13 15:30:59.038319432 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12499,7 +12498,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-13 15:30:59.038319432 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12635,7 +12634,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else if(expected_layout == c10::kSparse) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-09 17:16:49.242842120 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-13 15:30:59.038319432 +0800 @@ -1,58 +1,91 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12848,7 +12847,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def get_rng_state(): ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-09 17:16:49.246842264 +0800 ++++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-13 15:30:59.042319575 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12929,7 +12928,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py --- pytorch-v1.5.0/torch/distributions/von_mises.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributions/von_mises.py 2021-07-09 17:16:49.246842264 +0800 ++++ pytorch-develop/torch/distributions/von_mises.py 2021-07-13 15:30:59.042319575 +0800 @@ -1,140 +1,140 @@ -from __future__ import absolute_import, division, print_function - @@ -13213,7 +13212,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + _log_modified_bessel_fn(self.concentration, order=0)).exp() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py --- pytorch-v1.5.0/torch/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/__init__.py 2021-07-09 17:16:49.198840543 +0800 ++++ pytorch-develop/torch/__init__.py 2021-07-13 15:30:58.994317854 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13256,7 +13255,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-09 17:16:49.250842407 +0800 ++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-13 15:30:59.046319718 +0800 @@ -28,6 +28,10 @@ option(USE_C10D_NCCL "USE C10D NCCL" ON) endif() @@ -13309,7 +13308,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= copy_header(ProcessGroupMPI.hpp) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-09 17:16:49.250842407 +0800 ++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-13 15:30:59.046319718 +0800 @@ -37,8 +37,11 @@ SET_TARGET_PROPERTIES(shm PROPERTIES PREFIX "lib" @@ -13366,7 +13365,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -_maybe_indices_t = _scalar_or_tuple_2_t[Tensor] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py --- pytorch-v1.5.0/torch/nn/functional.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/functional.py 2021-07-09 17:16:49.254842550 +0800 ++++ pytorch-develop/torch/nn/functional.py 2021-07-13 15:30:59.050319862 +0800 @@ -1611,7 +1611,7 @@ else: output = input.matmul(weight.t()) @@ -13389,7 +13388,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= -from . import parallel as parallel diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-09 17:16:49.254842550 +0800 ++++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-13 15:30:59.050319862 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13421,7 +13420,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= self.register_parameter('running_var', None) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py --- pytorch-v1.5.0/torch/nn/modules/module.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/module.py 2021-07-09 17:16:49.254842550 +0800 ++++ pytorch-develop/torch/nn/modules/module.py 2021-07-13 15:30:59.050319862 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13564,7 +13563,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py --- pytorch-v1.5.0/torch/nn/modules/normalization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-09 17:16:49.254842550 +0800 ++++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-13 15:30:59.050319862 +0800 @@ -128,13 +128,14 @@ """ __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] @@ -13597,7 +13596,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return '{normalized_shape}, eps={eps}, ' \ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-09 17:16:49.254842550 +0800 ++++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-13 15:30:59.054320005 +0800 @@ -1,60 +1,60 @@ -from ..init import xavier_uniform_ -from .activation import MultiheadAttention @@ -13757,7 +13756,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - module_kwargs: Optional[Any] = ...) -> Tensor: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py --- pytorch-v1.5.0/torch/nn/parallel/distributed.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-09 17:16:49.258842694 +0800 ++++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-13 15:30:59.054320005 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14108,7 +14107,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-09 17:16:49.258842694 +0800 ++++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-13 15:30:59.054320005 +0800 @@ -1621,14 +1621,23 @@ slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals] return g.op('Concat', *slices, axis_i=0) @@ -14186,7 +14185,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py --- pytorch-v1.5.0/torch/optim/adamax.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/optim/adamax.py 2021-07-09 17:16:49.262842837 +0800 ++++ pytorch-develop/torch/optim/adamax.py 2021-07-13 15:30:59.058320149 +0800 @@ -80,8 +80,8 @@ exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0) @@ -14363,7 +14362,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py --- pytorch-v1.5.0/torch/serialization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/serialization.py 2021-07-09 17:16:49.262842837 +0800 ++++ pytorch-develop/torch/serialization.py 2021-07-13 15:30:59.058320149 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14447,7 +14446,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def location_tag(storage): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py --- pytorch-v1.5.0/torch/storage.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/storage.py 2021-07-09 17:16:49.262842837 +0800 ++++ pytorch-develop/torch/storage.py 2021-07-13 15:30:59.058320149 +0800 @@ -7,6 +7,7 @@ class _StorageBase(object): @@ -14467,7 +14466,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py --- pytorch-v1.5.0/torch/tensor.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/tensor.py 2021-07-09 17:16:49.262842837 +0800 ++++ pytorch-develop/torch/tensor.py 2021-07-13 15:30:59.058320149 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14529,7 +14528,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def __reversed__(self): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py --- pytorch-v1.5.0/torch/_tensor_str.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_tensor_str.py 2021-07-09 17:16:49.198840543 +0800 ++++ pytorch-develop/torch/_tensor_str.py 2021-07-13 15:30:58.994317854 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14583,7 +14582,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py --- pytorch-v1.5.0/torch/utils/data/dataloader.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-09 17:16:49.266842980 +0800 ++++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-13 15:30:59.062320292 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14792,7 +14791,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-09 17:16:49.266842980 +0800 ++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-13 15:30:59.062320292 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14853,7 +14852,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py --- pytorch-v1.5.0/torch/utils/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/__init__.py 2021-07-09 17:16:49.266842980 +0800 ++++ pytorch-develop/torch/utils/__init__.py 2021-07-13 15:30:59.062320292 +0800 @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals @@ -14864,7 +14863,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def set_module(obj, mod): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py --- pytorch-v1.5.0/torch/_utils.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_utils.py 2021-07-09 17:16:49.202840686 +0800 ++++ pytorch-develop/torch/_utils.py 2021-07-13 15:30:58.998317998 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml index 4b3a1b7ded4f60281cf4d8dffa66d024ed3f5ef8..30c7a8aeb19a82f8bffa37cd4947172f082db5be 100644 --- a/src/aten/src/ATen/native/native_functions.yaml +++ b/src/aten/src/ATen/native/native_functions.yaml @@ -2246,8 +2246,6 @@ - func: matrix_power(Tensor self, int n) -> Tensor use_c10_dispatcher: full variants: function, method - npu_dispatch: - NPU: matrix_power_npu - func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -8166,7 +8164,7 @@ CPU: col2im_backward_out_cpu CUDA: col2im_backward_out_cuda npu_dispatch: - NPU: col2im_backward_out_npu + NPU: im2col_out_npu - func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor python_module: nn @@ -8174,7 +8172,7 @@ CPU: col2im_backward_cpu CUDA: col2im_backward_cuda npu_dispatch: - NPU: col2im_backward_npu + NPU: im2col_npu - func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -8546,4 +8544,8 @@ - func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor) npu_dispatch_only: - NPU: linear_backward_npu \ No newline at end of file + NPU: linear_backward_npu + +- func: npu_bert_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) 
v, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + npu_dispatch_only: + NPU: bert_apply_adam_npu \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/BaddbmmKernelNpu.cpp b/src/aten/src/ATen/native/npu/BaddbmmKernelNpu.cpp index 4e68df64788ca9d606cd23324424823e13ac4933..600c7e09a8db8a42e48b83ff27b097e4bac6fb62 100644 --- a/src/aten/src/ATen/native/npu/BaddbmmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BaddbmmKernelNpu.cpp @@ -11,9 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" +#include "ATen/native/npu/utils/OpAdapter.h" + namespace at { namespace native { using namespace at::native::npu; @@ -47,7 +48,8 @@ Tensor& baddbmm_out_npu( const Tensor& tensor2, Scalar beta, Scalar alpha) { - Tensor BatchMatMulTensor = result; + auto outputSize = baddbmm_npu_output_size(tensor1, tensor2); + Tensor BatchMatMulTensor = OpPreparation::ApplyTensor(self, outputSize); auto inputs = baddbmm_npu_input(tensor1, tensor2); auto outputs = baddbmm_npu_output({BatchMatMulTensor}); @@ -100,4 +102,4 @@ Tensor& baddbmm_npu_( return self; } } -} \ No newline at end of file +} diff --git a/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp b/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..380f65f1aac08321d6a9b72e091454e8be042f0d --- /dev/null +++ b/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp @@ -0,0 +1,108 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +tuple bert_apply_adam_out_npu_nocheck( + Tensor& var_out, + Tensor& m_out, + Tensor& v_out, + const Tensor& var, + const Tensor& m, + const Tensor& v, + Scalar lr, + Scalar beta1, + Scalar beta2, + Scalar epsilon, + const Tensor& grad, + Scalar max_grad_norm, + Scalar global_grad_norm, + Scalar weight_decay) { + OpCommand cmd; + cmd.Name("ApplyAdamV2") + .Input(var) + .Input(m) + .Input(v) + .Input(lr, var.scalar_type()) + .Input(beta1, var.scalar_type()) + .Input(beta2, var.scalar_type()) + .Input(epsilon, var.scalar_type()) + .Input(grad) + .Input(max_grad_norm, var.scalar_type()) + .Input(global_grad_norm, var.scalar_type()) + .Input(weight_decay, var.scalar_type()) + .Output(var_out) + .Output(m_out) + .Output(v_out) + .Run(); + return std::tie(var_out, m_out, v_out); +} + +tuple bert_apply_adam_out_npu( + Tensor& var_out, + Tensor& m_out, + Tensor& v_out, + const Tensor& var, + const Tensor& m, + const Tensor& v, + Scalar lr, + Scalar beta1, + Scalar beta2, + Scalar epsilon, + const Tensor& grad, + Scalar max_grad_norm, + Scalar global_grad_norm, + Scalar weight_decay) { + OpPipeWithDefinedOut check; + check.CheckMemory({var, m, v, grad}, {var_out, m_out, v_out}); + + auto func = [&var, &m, &v, &lr, &beta1, &beta2, &epsilon, &grad, &max_grad_norm, &global_grad_norm, &weight_decay] ( + Tensor& var_out, + Tensor& m_out, + Tensor& v_out) { + bert_apply_adam_out_npu_nocheck(var_out, m_out, v_out, var, m, v, + lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay); + }; + + OpPipeWithMultiOut pipe(var_out, m_out, v_out); + return pipe.Call(func) + .ReturnRef(); +} + +tuple bert_apply_adam_npu( + Tensor& var, + Tensor& m, + Tensor& v, + Scalar lr, + Scalar beta1, + Scalar beta2, + Scalar epsilon, + const Tensor& grad, + Scalar max_grad_norm, + Scalar global_grad_norm, + Scalar weight_decay) { + bert_apply_adam_out_npu( + var, m, v, var, m, v, + lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay); + return std::tie(var, m, v); +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp index 722dc7e2e8c874ec311083ad2b945909c3787f37..f7b608dcaef0e37afa371e7738fa458dd8fbaec5 100644 --- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp @@ -68,6 +68,12 @@ Tensor binary_cross_entropy_npu( // construct the output tensor of the NPU Tensor result = OpPreparation::ApplyTensor(self, outputSize); + if (self.numel() == 0) { + // In this scenario, needs to return nan. And the nan of the NPU can only be fp32. + result = result.to(at::kFloat).fill_(0); + result = result / 0; + return result; + } // calculate the output result of the NPU binary_cross_entropy_out_npu(result, self, target, weight, reduction); diff --git a/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp b/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp deleted file mode 100644 index 4db76efd8096ea033de19d68b123604ff85a83d1..0000000000000000000000000000000000000000 --- a/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2020 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. 
-// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/OpTemplate.h" - -namespace at { -namespace native { -using namespace at::native::npu; - -Tensor& cholesky_out_npu( - Tensor & y, - const Tensor & x, - bool upper) { - TORCH_CHECK( - upper == false, - "cholesky: The upper parameter currently only supports upper == false"); - - OpCommand cmd; - cmd.Name("Cholesky") - .Input(x) - .Output(y) - .Run(); - return y; -} - -Tensor cholesky_npu(const Tensor& x, bool upper) { - Tensor formatCastOfX = x.npu_format_cast(ACL_FORMAT_NCHW); - // calculate the output size - auto outputSize = input_same_output_size(formatCastOfX); - - // construct the output tensor of the NPU - Tensor y = at::empty_with_format( - outputSize, formatCastOfX.options(), ACL_FORMAT_NCHW); - - // calculate the output result of the NPU - cholesky_out_npu(y, formatCastOfX, upper); - - return y; -} - -} // namespace native -} // namespace at \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/EqKernelNpu.cpp b/src/aten/src/ATen/native/npu/EqKernelNpu.cpp index becfb6f46da8c924b65a3c8c334ec950d1b37c58..c4374fadf75bf0ce213c85783c68d9d996c09db1 100644 --- a/src/aten/src/ATen/native/npu/EqKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/EqKernelNpu.cpp @@ -60,8 +60,8 @@ Tensor& eq_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { OpPreparation::CheckOut( {self, other}, result, - CalcuOpUtil::get_tensor_npu_format(self), - ScalarType::Bool, + ACL_FORMAT_ND, + result.scalar_type(), IntArrayRef(outputSize)); eq_out_npu_nocheck(result, self, other); return result; @@ -71,8 +71,8 @@ Tensor& eq_out_npu(Tensor& result, const Tensor& self, Scalar other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - ScalarType::Bool, + ACL_FORMAT_ND, + result.scalar_type(), self.sizes()); eq_out_npu_nocheck(result, self, other); return result; diff --git a/src/aten/src/ATen/native/npu/GeKernelNpu.cpp b/src/aten/src/ATen/native/npu/GeKernelNpu.cpp index 6169b9de059eb470ed71172330301d88ea77d319..64c08e5617fa8213ccc96571934b3f0ba3e25064 100644 --- a/src/aten/src/ATen/native/npu/GeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GeKernelNpu.cpp @@ -49,7 +49,7 @@ Tensor& ge_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); @@ -78,7 +78,7 @@ Tensor& ge_out_npu(Tensor& result, const Tensor& self, Scalar other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); diff --git a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp index 63970a9d50ce6ce8a50b284dca5d047d11f910a0..48dc7f3235f037215c06a65a44f00931acfb2ad9 100644 --- 
a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp @@ -50,7 +50,7 @@ Tensor& gt_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); @@ -80,7 +80,7 @@ Tensor& gt_out_npu(Tensor& result, const Tensor& self, Scalar other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); diff --git a/src/aten/src/ATen/native/npu/LeKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeKernelNpu.cpp index 88ad478bd8b658c5f503e481e51b6c5823a430d3..af19185518fcd91208573d6200118ca5f3c134f4 100644 --- a/src/aten/src/ATen/native/npu/LeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LeKernelNpu.cpp @@ -38,7 +38,7 @@ Tensor& le_out_npu(Tensor& result, const Tensor& self, Scalar other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); @@ -67,7 +67,7 @@ Tensor& le_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); diff --git a/src/aten/src/ATen/native/npu/LogKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogKernelNpu.cpp index 89409d80f542257f85c859cf35e24fea1806a7e9..d536021b4e6885402c430df4d1b11542c1213628 100644 --- a/src/aten/src/ATen/native/npu/LogKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogKernelNpu.cpp @@ -34,10 +34,14 @@ Tensor& log_out_npu_nocheck(Tensor& result, const Tensor& self) { } Tensor& log_out_npu(Tensor& result, const Tensor& self) { - OpPreparation::CheckOut( - {self}, - result, - self); + if (!result.is_same(self)) { + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + self.scalar_type(), + self.sizes()); + } OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) diff --git a/src/aten/src/ATen/native/npu/LtKernelNpu.cpp b/src/aten/src/ATen/native/npu/LtKernelNpu.cpp index 8e2ab7095eea9d592b3982d2c34c2d9f945af094..59c5ba684053863585846acf4dd3c6f62ec48308 100644 --- a/src/aten/src/ATen/native/npu/LtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LtKernelNpu.cpp @@ -48,7 +48,7 @@ Tensor& lt_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); @@ -77,7 +77,7 @@ Tensor& lt_out_npu(Tensor& result, const Tensor& self, Scalar other) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + ACL_FORMAT_ND, result.scalar_type(), outputSize); diff --git a/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp index 1f3c8cd185110e7a3dce8ce8bb9f9c33315ef584..ddd370a06e977b02c52e84b2cc8467d5014d8106 100644 --- a/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp @@ -21,21 +21,6 @@ namespace at { namespace native { using namespace at::native::npu; -static inline tuple, int64_t> max_output_calc( - const Tensor& self, - IntArrayRef dims, - bool keepdim) { - SmallVector outputSize = - reduce_ops_npu_output_size(self, dims, keepdim); - - int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(self); - if (outputSize.empty()) { - 
npu_format = ACL_FORMAT_ND; // use default format - } - - return std::tie(outputSize, npu_format); -} - tuple max_out_npu_nocheck( Tensor& output, Tensor& indices, @@ -45,7 +30,7 @@ tuple max_out_npu_nocheck( OpCommand cmd; cmd.Name("ArgMaxWithValue") .Input(self) - .Output(indices) + .Output(indices) .Output(output) .Attr("dimension", dim) .Attr("keep_dims", keepdim) @@ -59,10 +44,9 @@ tuple max_out_npu( const Tensor& self, int64_t dim, bool keepdim) { - auto params = max_output_calc(self, {dim}, keepdim); - auto outputSize = std::get<0>(params); - auto indicesSize = std::get<0>(params); - auto npu_format = std::get<1>(params); + SmallVector dims = {dim}; + auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); + SmallVector indicesSize = outputSize; auto func = [&self, dim, keepdim](Tensor& output, Tensor& indices) { max_out_npu_nocheck(output, indices, self, dim, keepdim); @@ -73,19 +57,19 @@ tuple max_out_npu( Tensor indices_tmp; OpPipeWithMultiOut pipe(output, indices_tmp); - return pipe.FixOutputSizeAndFormat<0>({self}, self, npu_format, outputSize) + return pipe.FixOutputSizeAndFormat<0>({self}, self, ACL_FORMAT_ND, outputSize) .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_ND) // use default format .Call(func) .ReflushOutputDtype<1>(ScalarType::Long) + .FixOutputExceptDtype<1>({self}, ACL_FORMAT_ND, ScalarType::Long, indicesSize) .FixOutputWithReplace<1>(indices) .ReturnRef(); } tuple max_npu(const Tensor& self, int64_t dim, bool keepdim) { - auto params = max_output_calc(self, {dim}, keepdim); - auto outputSize = std::get<0>(params); - auto indicesSize = std::get<0>(params); - auto npu_format = std::get<1>(params); + SmallVector dims = {dim}; + auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); + SmallVector indicesSize = outputSize; auto func = [&self, dim, keepdim](Tensor outputs, Tensor indices) { max_out_npu_nocheck(outputs, indices, self, dim, keepdim); @@ -93,7 +77,7 @@ tuple max_npu(const Tensor& self, int64_t dim, bool keepdim) { Tensor outputs, indices; OpPipeWithDefinedMultiOut pipe(outputs, indices); - return pipe.ApplyOutputWithSpecailParams<0>(outputSize, self.options(), npu_format) + return pipe.ApplyOutputWithSpecailParams<0>(outputSize, self.options(), ACL_FORMAT_ND) .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_ND) // use default format .Call(func) .ReflushOutputDtype<1>(ScalarType::Long) @@ -144,7 +128,12 @@ Tensor& max_out_npu( Tensor& result, const Tensor& self, const Tensor& other) { - OpPreparation::CheckOut({self}, result, self); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + self.scalar_type(), + self.sizes()); max_out_npu_nocheck(result, self, other); return result; diff --git a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp index 821424393afeddddd346d448c3656a6d6b1c671c..f45ae27e9b5f9eedb40a54b483f692d9cd6f6129 100644 --- a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp @@ -22,21 +22,6 @@ namespace at { namespace native { using namespace at::native::npu; -static inline tuple, int64_t> min_output_calc( - const Tensor& self, - IntArrayRef dims, - bool keepdim) { - SmallVector outputSize = - reduce_ops_npu_output_size(self, dims, keepdim); - - int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(self); - if (outputSize.empty()) { - npu_format = ACL_FORMAT_ND; // scalar tensor use default format - } - - return 
std::tie(outputSize, npu_format); -} - tuple min_out_npu_nocheck( Tensor& output, Tensor& indices, @@ -62,10 +47,9 @@ tuple min_out_npu( const Tensor& self, int64_t dim, bool keepdim) { - auto params = min_output_calc(self, {dim}, keepdim); - auto outputSize = std::get<0>(params); - auto indicesSize = std::get<0>(params); - auto npu_format = std::get<1>(params); + SmallVector dims = {dim}; + auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); + SmallVector indicesSize = outputSize; auto func = [&self, dim, keepdim](Tensor& output, Tensor& indices) { min_out_npu_nocheck(output, indices, self, dim, keepdim); @@ -73,19 +57,19 @@ tuple min_out_npu( Tensor indices_tmp; OpPipeWithMultiOut pipe(output, indices_tmp); - return pipe.FixOutputSizeAndFormat<0>({self}, self, npu_format, outputSize) - .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_NCHW) // indices must be nchw format + return pipe.FixOutputSizeAndFormat<0>({self}, self, ACL_FORMAT_ND, outputSize) + .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_ND) .Call(func) .ReflushOutputDtype<1>(ScalarType::Long) + .FixOutputExceptDtype<1>({self}, ACL_FORMAT_ND, ScalarType::Long, indicesSize) .FixOutputWithReplace<1>(indices) .ReturnRef(); } tuple min_npu(const Tensor& self, int64_t dim, bool keepdim) { - auto params = min_output_calc(self, {dim}, keepdim); - auto outputSize = std::get<0>(params); - auto indicesSize = std::get<0>(params); - auto npu_format = std::get<1>(params); + SmallVector dims = {dim}; + auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); + SmallVector indicesSize = outputSize; auto func = [&self, dim, keepdim](Tensor outputs, Tensor indices) { min_out_npu_nocheck(outputs, indices, self, dim, keepdim); @@ -93,7 +77,7 @@ tuple min_npu(const Tensor& self, int64_t dim, bool keepdim) { Tensor outputs, indices; OpPipeWithDefinedMultiOut pipe(outputs, indices); - return pipe.ApplyOutputWithSpecailParams<0>(outputSize, self.options(), npu_format) + return pipe.ApplyOutputWithSpecailParams<0>(outputSize, self.options(), ACL_FORMAT_ND) .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_NCHW) .Call(func) .ReflushOutputDtype<1>(ScalarType::Long) @@ -144,7 +128,12 @@ Tensor& min_out_npu( Tensor& result, const Tensor& self, const Tensor& other) { - OpPreparation::CheckOut({self}, result, self); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + self.scalar_type(), + self.sizes()); min_out_npu_nocheck(result, self, other); return result; diff --git a/src/aten/src/ATen/native/npu/MmKernelNpu.cpp b/src/aten/src/ATen/native/npu/MmKernelNpu.cpp index 28ab0aa98118511eec34cdd63b981685a87124e2..91af42d2af1248cc22cdd88ce4f58dc578fe4e6a 100644 --- a/src/aten/src/ATen/native/npu/MmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MmKernelNpu.cpp @@ -18,6 +18,8 @@ #include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/common/InnerNpuNativeFunction.h" +#include "ATen/native/npu/frame/StorageDescHelper.h" namespace at { namespace native { @@ -26,7 +28,7 @@ using namespace at::native::npu; // Flexible transpose judgement for view+transpose+Matmul, // i.e., tensors with dim=2 and base_size_.size=3 can also be Matmul directly! 
 bool is_transpose_last_two_dims_flex(const Tensor& tensor) {
-  if (tensor.dim() < 2 || tensor.dim() > 3) {
+  if (tensor.dim() != 2) {
     return false;
   }
   int64_t numel = 1;
@@ -113,10 +115,17 @@ Tensor mm_npu(const Tensor& self, const Tensor& mat2) {
   // Matmul cannot directly deal with view+transposed tensor with NZ format, so Transdata is necessary
   if (self.sizes().size() != self_desc.base_sizes_.size()) {
     selfFormatCast = OpPreparation::CastBackToOriFormat(self);
+    // refresh storage desc info [origin shape and storage shape] of reshaped Tensor
+    if (is_transpose_last_two_dims_flex(selfFormatCast)) {
+      StorageDescHelper::ReflushDescBySelf(selfFormatCast.transpose(-2, -1));
+    }
   }

   if (mat2.sizes().size() != mat2_desc.base_sizes_.size()) {
     mat2FormatCast = OpPreparation::CastBackToOriFormat(mat2);
+    if (is_transpose_last_two_dims_flex(mat2FormatCast)) {
+      StorageDescHelper::ReflushDescBySelf(mat2FormatCast.transpose(-2, -1));
+    }
   }

   // construct the output tensor of the NPU
diff --git a/src/aten/src/ATen/native/npu/NeKernelNpu.cpp b/src/aten/src/ATen/native/npu/NeKernelNpu.cpp
index 4e4c377b7a1ca69228f7e8fa50b401477e7d87a2..a1de7e8793b6b39ccc29841bffcaab1e4b8e3694 100644
--- a/src/aten/src/ATen/native/npu/NeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NeKernelNpu.cpp
@@ -15,7 +15,6 @@
 // limitations under the License.

 #include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"

 namespace at {
 namespace native {
@@ -70,8 +69,8 @@ Tensor& ne_out_npu(Tensor& result, const Tensor& self, const Tensor& other) {
   OpPreparation::CheckOut(
       {self, other},
       result,
-      CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf),
-      ScalarType::Bool,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
       IntArrayRef(outputSize));
   ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther);
   return result;
@@ -83,8 +82,8 @@ Tensor& ne_out_npu(Tensor& result, const Tensor& self, Scalar other) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf),
-      ScalarType::Bool,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
       outputSize);
   ne_out_npu_nocheck(result, formatCastOfSelf, other);
   return result;
diff --git a/src/aten/src/ATen/native/npu/NegKernelNpu.cpp b/src/aten/src/ATen/native/npu/NegKernelNpu.cpp
index aceb969bebf5a3d8f96e0ab6fcca47f0ce578d8b..463845886638c384f6072c3b5c7d5e1724308394 100644
--- a/src/aten/src/ATen/native/npu/NegKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NegKernelNpu.cpp
@@ -33,7 +33,12 @@ Tensor& neg_out_npu_nocheck(Tensor& result, const Tensor& self) {
 }

 Tensor& neg_out_npu(Tensor& result, const Tensor& self) {
-  OpPreparation::CheckOut({self}, result, self);
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      ACL_FORMAT_ND,
+      self.scalar_type(),
+      self.sizes());
   neg_out_npu_nocheck(result, self);
   return result;
diff --git a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
index d299fde881a3b0778ed46ca6777ea039a7de41fd..91b36d5d73f1b92e9d497f0ae1731b2426d3bfe1 100644
--- a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
@@ -79,7 +79,8 @@ Tensor& std_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      self,
+      ACL_FORMAT_ND,
+      self.scalar_type(),
       outputSize);

   // executing the NPU operator
@@ -100,12 +101,14 @@ tuple std_mean_out_npu(
   OpPreparation::CheckOut(
       {self},
       result1,
-      self,
+      ACL_FORMAT_ND,
+      self.scalar_type(),
       outputSize);
   OpPreparation::CheckOut(
       {self},
       result2,
-      self,
+      ACL_FORMAT_ND,
+      self.scalar_type(),
       outputSize);

   // executing the NPU operator
diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
index 412d1fc32b7bca4bb8f5d7bcac31eee8458a5bc8..a49aa9b9945afa46ac43295733adce86ded1ece0 100644
--- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
+++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
@@ -347,18 +347,7 @@ NPUStatus CalcuOpUtil::CreateAclTensorDescInfo(
         input[i].tensorDescType == NPUTensorDesc::TensorDescType::TENSOR) {
       Tensor* aclInput = &input[i].tensor;
       SmallVector dims;
-      if (opName == "MatMul") {
-        auto dims_pre = aclInput->sizes();
-        if (attrs[i].boolAttrValue == 1) {
-          dims.push_back(dims_pre[1]);
-          dims.push_back(dims_pre[0]);
-        } else if (attrs[i].boolAttrValue == 0) {
-          dims.push_back(dims_pre[0]);
-          dims.push_back(dims_pre[1]);
-        }
-      } else {
-        dims = aclInput->storage().get_npu_desc().base_sizes_;
-      }
+      dims = aclInput->storage().get_npu_desc().base_sizes_;
       auto storageDims = aclInput->storage().get_npu_desc().storage_sizes_;
       int64_t numel = 1;
       for (int j = 0; j < storageDims.size(); j++) {
diff --git a/test/test_npu/test_alpha_dropout.py b/test/test_npu/test_alpha_dropout.py
deleted file mode 100644
index 9bd3266f91bb06a8e06cc2ad3369781ee38ec758..0000000000000000000000000000000000000000
--- a/test/test_npu/test_alpha_dropout.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import torch -import numpy as np -import sys -import random -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestAlphaDropout(TestCase): - def cpu_op_exec(self,input, p): - m = torch.nn.AlphaDropout(p) - output = m(input) - output = output.numpy() - return output - - def npu_op_exec(self,input, p): - m = torch.nn.AlphaDropout(p) - output = m(input) - output = output.to("cpu") - output = output.numpy() - return output - - def test_alpha_dropout_common_shape_format(self, device): - shape_format = [ - [np.float32, -1, (14, 3, 2)], - [np.float32, -1, (4, 13, 1)], - [np.float32, -1, (3, 1)], - [np.float32, -1, (4, 1, 5)], - ] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) - p = random.uniform(0,1) - cpu_output = self.cpu_op_exec(cpu_input1, p) - npu_output = self.npu_op_exec(npu_input1, p) - self.assertRtolEqual(cpu_output, npu_output) - - def test_alpha_dropout_float16_shape_format(self, device): - def cpu_op_exec_fp16(input, p): - m = torch.nn.AlphaDropout(p) - input = input.to(torch.float32) - output = m(input) - output = output.numpy() - return output - shape_format = [ - [np.float16, -1, (4, 3)], - [np.float16, -1, (4, 3)], - [np.float16, -1, (4, 3, 1)], - [np.float16, -1, (4, 1, 5)], - ] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) - p = random.uniform(0,1) - cpu_output = cpu_op_exec_fp16(cpu_input1, p) - npu_output = self.npu_op_exec(npu_input1, p) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestAlphaDropout, globals(), except_for="cpu") - -if __name__ == "__main__": - run_tests() diff --git a/test/test_npu/test_cholesky.py b/test/test_npu/test_cholesky.py deleted file mode 100644 index 052ccc04f209afd7a725088cc4d0eba1d43fc235..0000000000000000000000000000000000000000 --- a/test/test_npu/test_cholesky.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor -import random -import math - -class TestCholesky(TestCase): -# pylint: disable=unused-variable,unused-argument -# pylint: disable=W,C - def create_2d_tensor(self, item, minValue, maxValue): - dtype = item[0] - format = item[1] - shape = item[2] - input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype) - a = torch.from_numpy(input1) - cpu_input = torch.matmul(a, a.t()) - npu_input = torch.matmul(a, a.t()).to("npu") - if format != -1: - npu_input = npu_input.npu_format_cast(format) - return cpu_input, npu_input - - def create_nd_tensor(self, item, minValue, maxValue): - dtype = item[0] - format = item[1] - shape = item[2] - input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype) - a = torch.from_numpy(input1) - a = a.to(torch.float32) - cpu_input = torch.matmul(a, a.transpose(-1, -2)) + 1e-05 # make symmetric positive-definite - npu_input = torch.matmul(a, a.transpose(-1, -2)) + 1e-05 - npu_input = npu_input.to("npu") - if format != -1: - npu_input = npu_input.npu_format_cast(format) - return cpu_input, npu_input - - def cpu_op_exec(self, input1): - output = torch.cholesky(input1) - output = output.numpy() - return output - - def cpu_op_exec_fp16(self, input1): - output = torch.cholesky(input1) - output = output.numpy() - output = output.astype(np.float16) - return output - - 
def npu_op_exec(self, input1): - output = torch.cholesky(input1) - output = output.to("cpu") - output = output.numpy() - return output - - def npu_op_exec_fp16(self, input1): - output = torch.cholesky(input1) - output = output.to("cpu") - output = output.numpy() - output = output.astype(np.float16) - return output - - def test_cholesky_common_shape_format(self, device): - shape_format = [ - [[np.float32, -1, (1, 1)]], - [[np.float32, -1, (2, 2)]], - [[np.float32, -1, (4, 4)]], - [[np.float32, -1, (8, 8)]] - ] - for item in shape_format: - cpu_input1, npu_input1 = self.create_2d_tensor(item[0], 1, 10) - cpu_output = self.cpu_op_exec(cpu_input1) - npu_output = self.npu_op_exec(npu_input1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_cholesky_float16_shape_format(self, device): - shape_format = [ - [[np.float16, -1, (4, 2, 4, 4)]], - [[np.float16, -1, (2, 3, 4, 4)]] - ] - for item in shape_format: - cpu_input1, npu_input1 = self.create_nd_tensor(item[0], 1, 2) - cpu_output = self.cpu_op_exec_fp16(cpu_input1) - npu_output = self.npu_op_exec_fp16(npu_input1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_cholesky_float16_2_shape_format(self, device): - shape_format = [ - [[np.float16, -1, (2, 4, 4)]], - [[np.float16, -1, (3, 8, 8)]] - ] - for item in shape_format: - cpu_input1, npu_input1 = self.create_nd_tensor(item[0], 1, 2) - cpu_output = self.cpu_op_exec_fp16(cpu_input1) - npu_output = self.npu_op_exec_fp16(npu_input1) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestCholesky, globals(), except_for='cpu') -if __name__ == '__main__': - run_tests() \ No newline at end of file diff --git a/test/test_npu/test_index_put.py b/test/test_npu/test_index_put.py deleted file mode 100644 index ea036e503c848505510ebc546bb6f0602a5ef0a3..0000000000000000000000000000000000000000 --- a/test/test_npu/test_index_put.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import numpy as np -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestIndexPut(TestCase): - - def generate_data(self, min_d, max_d, shape, dtype): - input1 = np.random.uniform(min_d, max_d, shape).astype(dtype) - npu_input1 = torch.from_numpy(input1) - return npu_input1 - - # cpu - def cpu_op_exec(self, input_x, index, value, accumulate): - output = torch.index_put(input_x, index, value, accumulate) - output = output.numpy() - return output - - def cpu_op_exec_fp16(self, input_x, index, value, accumulate): - input_x = input_x.to(torch.float32) - output = torch.index_put(input_x, index, value, accumulate) - output = output.numpy().astype(np.float16) - return output - - def cpu_op_exec_interface(self, input_x, index, value): - input_x[index] = value - output = input_x - output = output.numpy() - return output - - # npu - def npu_op_exec_interface1(self, input_x, index, value, accumulate): - input_x = input_x.to("npu") - index_npu1 = index[0].to("npu") - index_npu2 = index[1].to("npu") - index_npu = (index_npu1, index_npu2) - if type(value) == torch.Tensor: - value = value.to("npu") - output = input_x.index_put(index_npu, value, accumulate) - output = output.to("cpu") - output = output.numpy() - return output - - def npu_op_exec_interface2(self, input_x, index, value, accumulate): - input_x = input_x.to("npu") - index_npu1 = index[0].to("npu") - index_npu2 = index[1].to("npu") - index_npu = (index_npu1, index_npu2) - if type(value) == torch.Tensor: - value = value.to("npu") - output = input_x.index_put_(index_npu, value, accumulate) - output = output.to("cpu") - output = output.numpy() - return output - - def npu_op_exec_interface3(self, input_x, index, value): - input_x = input_x.to("npu") - index_npu1 = index[0].to("npu") - index_npu2 = index[1].to("npu") - index_npu = (index_npu1, index_npu2) - if type(value) == torch.Tensor: - value = value.to("npu") - input_x[index_npu] = value - output = input_x - output = output.to("cpu") - output = output.numpy() - return output - - # assertRtolEqual - def index_put(self, testcases, value, dtype = "fp32"): - for i, item in enumerate(testcases): - index = (torch.LongTensor(item[4][0]), torch.LongTensor(item[4][1])) - #test for index_put - npuinput_x1 = self.generate_data(item[0], item[1], item[2], item[5]) - if dtype == "fp16": - cpu_output1 = self.cpu_op_exec_fp16(npuinput_x1, index, value, item[3]) - npu_output1 = self.npu_op_exec_interface1(npuinput_x1, index, value, item[3]) - self.assertRtolEqual(cpu_output1, npu_output1) - else: - cpu_output1 = self.cpu_op_exec(npuinput_x1, index, value, item[3]) - npu_output1 = self.npu_op_exec_interface1(npuinput_x1, index, value, item[3]) - self.assertRtolEqual(cpu_output1, npu_output1) - - #test for index_put_ - npuinput_x2 = self.generate_data(item[0], item[1], item[2], item[5]) - if dtype == "fp16": - cpu_output2 = self.cpu_op_exec_fp16(npuinput_x2, index, value, item[3]) - npu_output2 = self.npu_op_exec_interface2(npuinput_x2, index, value, item[3]) - self.assertRtolEqual(cpu_output2, npu_output2) - else: - cpu_output2 = self.cpu_op_exec(npuinput_x2, index, value, item[3]) - npu_output2 = self.npu_op_exec_interface2(npuinput_x2, index, value, item[3]) - self.assertRtolEqual(cpu_output2, npu_output2) - - #test for input[index] = value - npuinput_x3 = self.generate_data(item[0], item[1], item[2], item[5]) - if dtype == "fp16": - cpu_output3 = 
self.cpu_op_exec_interface(npuinput_x3.to(torch.float32), index, value) - npu_output3 = self.npu_op_exec_interface3(npuinput_x3, index, value) - self.assertRtolEqual(cpu_output3.astype(npu_output3.dtype), npu_output3) - else: - cpu_output3 = self.cpu_op_exec_interface(npuinput_x3, index, value) - npu_output3 = self.npu_op_exec_interface3(npuinput_x3, index, value) - self.assertRtolEqual(cpu_output3, npu_output3) - - def test_index_put_d(self, device): - testcases_fp32 = [ - #minV, maxV, shape, accumulate, index, dtype - # fp32 - #IndexPut_fp32_accumulate1_001 - [-10, 10, (2, 2, 3, 3), True, [[1,1],[0,1]], np.float32], - - # IndexPut_fp32_accumulate0_002 - [-10, 10, (2, 2, 3, 3), False, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate1_003 - [-100, 100, (2, 4, 6, 8, 10, 12), True, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate0_004 - [-100, 100, (2, 4, 6, 8, 10, 12), False, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate1_R0.5e16_005 - [-0.000030517578125, 0.000030517578125, (2,32,149,31), True, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate0_R0.5e16_006 - [-0.000030517578125, 0.000030517578125, (2,32,149,31), False,[[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate1_R2e32_007 - [-3402823500.0, 3402823500.0, (2,32,149,31), True, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate0_R2e32_008 - [-3402823500.0, 3402823500.0, (2,32,149,31), False, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate1_S2e16_009 - [-100, 100, (65535, 2, 2, 2, 2, 2), True, [[1,1],[0,1]], np.float32], - - #IndexPut_fp32_accumulate0_S2e16_010 - [-100, 100, (65535, 2, 2, 2, 2, 2), False, [[1,1],[0,1]], np.float32], - - ] - testcases_fp16 = [ - #IndexPut_fp16_accumulate1_011 - [-10, 10, (2, 2, 3, 3), True, [[1,1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate0_012 - [-10, 10, (2, 2, 3, 3), False, [[1, 1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate1_013 - [-100, 100, (2, 4, 6, 8, 10, 12), True, [[1,1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate0_014 - [-100, 100, (2, 4, 6, 8, 10, 12), False, [[1,1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate1_R2e16_015 - [-60000,60000, (2,32,149,31), True, [[1,1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate0_R2e16_016 - [-60000,60000, (2,32,149,31), True, [[1,1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate1_S2e16_017 - [-100, 100, (65535, 2, 2, 2, 2, 2), True, [[1,1],[0,1]], np.float16], - - #IndexPut_fp16_accumulate0_S2e16_018 - [-100, 100, (65535, 2, 2, 2, 2, 2), False, [[1,1],[0,1]], np.float16], - ] - value = np.random.uniform(-10000, 10000) - value_tensor = torch.tensor(value) - self.index_put(testcases=testcases_fp32, value=value_tensor) - self.index_put(testcases=testcases_fp16, value=value_tensor, dtype="fp16") - -instantiate_device_type_tests(TestIndexPut, globals(), except_for='cpu') -if __name__ == "__main__": - torch.npu.set_device("npu:1") - run_tests() - \ No newline at end of file diff --git a/test/test_npu/test_kl_div.py b/test/test_npu/test_kl_div.py deleted file mode 100644 index 67b6c87304a24d48448b8872c1ebfaccd4e925cd..0000000000000000000000000000000000000000 --- a/test/test_npu/test_kl_div.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - - -class TestKlDiv(TestCase): - - def cpu_op_exec(self, input1, input2, reduction): - output = torch.kl_div(input1, input2, reduction=reduction) - output = output.numpy() - return output - - def npu_op_exec(self, input1, input2, reduction): - input1 = input1.to("npu") - input2 = input2.to("npu") - output = torch.kl_div(input1, input2, reduction=reduction) - output = output.to("cpu") - output = output.numpy() - return output - - def test_kl_div_common_shape_format(self, device): - shape_format = [ - [[np.float32, 0, (4, 1, 2, 3)], [np.float32, 0, (4, 1, 2, 3)], 0], - [[np.float32, 0, (4, 1, 5)], [np.float32, 0, (4, 1, 5)], 1], - [[np.float32, 0, (14, 21, 52, 10, 22)], [ - np.float32, 0, (14, 21, 52, 10, 22)], 2], - # 130device unsupports float64 - # [[np.float64, 0, (24, 9, 15)], [np.float64, 0, (24, 9, 15)], 2], - # [[np.float64, -1, (24, 11)], [np.float64, -1, (24, 11)], 1], - # [[np.float64, 0, (14, 21, 52, 10, 22)], [np.float64, 0, (14, 21, 52, 10, 22)], 0] - ] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1) - cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 1) - reduction = item[2] - cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, reduction) - npu_output = self.npu_op_exec(npu_input1, npu_input2, reduction) - self.assertRtolEqual(cpu_output, npu_output) - - def test_kl_div_float16_shape_format(self, device): - def cpu_op_exec_fp16(input1, input2, reduction): - input1 = input1.to(torch.float32) - input2 = input2.to(torch.float32) - output = torch.kl_div(input1, input2, reduction=reduction) - output = output.numpy() - output = output.astype(np.float16) - return output - - shape_format = [ - [[np.float16, 0, (14, 21, 22, 33)], [ - np.float16, 0, (14, 21, 22, 33)], 0], - [[np.float16, 0, (4, 10, 5)], [np.float16, 0, (4, 10, 5)], 1], - [[np.float16, 0, (4, 1, 50)], [np.float16, 0, (4, 1, 50)], 2], - ] - - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1) - cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 1) - reduction = item[2] - cpu_output = cpu_op_exec_fp16(cpu_input1, cpu_input2, reduction) - npu_output = self.npu_op_exec(npu_input1, npu_input2, reduction) - self.assertRtolEqual(cpu_output, npu_output) - - -instantiate_device_type_tests(TestKlDiv, globals(), except_for='cpu') -if __name__ == "__main__": - torch.npu.set_device("npu:5") - run_tests() diff --git a/test/test_npu/test_leaky_relu_backward.py b/test/test_npu/test_leaky_relu_backward.py deleted file mode 100644 index 8c69a368f7673f7785fe68688ec395525ba5ba32..0000000000000000000000000000000000000000 --- a/test/test_npu/test_leaky_relu_backward.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -def input_grad_hook(grad): - global input_grad - input_grad = grad - - -def npu_input_grad_hook(grad): - global npu_input_grad - npu_input_grad = grad.to("cpu") - - -class TestLeakyReluBackward(TestCase): - - def cpu_op_exec(self, input, negative_slope=0): - input.requires_grad = True - input.register_hook(input_grad_hook) - - output = torch.nn.functional.leaky_relu(input, negative_slope=negative_slope) - z = output.sum() - z.backward() - - def npu_op_exec(self, input, negative_slope=0): - input.requires_grad = True - input.register_hook(npu_input_grad_hook) - - output = torch.nn.functional.leaky_relu(input, negative_slope=negative_slope) - z = output.sum() - z.backward() - input = input.cpu() - - def test_leaky_relu_backward_shape_format_fp32(self, device): - shape_format = [ - [[np.float32, 0, (3, 3)], 2], - [[np.float32, 0, (64, 64)], 5], - [[np.float32, 0, (4, 5, 6)], -3], - [[np.float32, 0, (3, 3, 3, 4)], 0.8], - [[np.float32, 0, (1, 2, 3, 4, 5)], -0.9] - ] - for item in shape_format: - input, npu_input = create_common_tensor(item[0], 1, 100) - - self.cpu_op_exec(input, item[1]) - self.npu_op_exec(npu_input, item[1]) - self.assertRtolEqual(input_grad.numpy(), npu_input_grad.numpy()) - - def test_leaky_relu_backward_shape_format_fp16(self, device): - shape_format = [ - [[np.float16, 0, (3, 3)], 2], - [[np.float16, 0, (64, 64)], 5], - [[np.float16, 0, (4, 5, 6)], -3], - [[np.float16, 0, (3, 3, 3, 4)], 0.8], - [[np.float16, 0, (1, 2, 3, 4, 5)], -0.9] - ] - for item in shape_format: - input, npu_input = create_common_tensor(item[0], 1, 100) - - input = input.to(torch.float32) - self.cpu_op_exec(input, item[1]) - self.npu_op_exec(npu_input, item[1]) - self.assertRtolEqual(input_grad.numpy().astype(np.float16), npu_input_grad.numpy().astype(np.float16)) - - -instantiate_device_type_tests(TestLeakyReluBackward, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() diff --git a/test/test_npu/test_maxUnpool2d.py b/test/test_npu/test_maxUnpool2d.py deleted file mode 100644 index c387f3a1527fea705a852711d85cf5efd6806f91..0000000000000000000000000000000000000000 --- a/test/test_npu/test_maxUnpool2d.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# coding: utf-8 - -import torch -import numpy as np -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class testMaxUnpool2d(TestCase): - - def cpu_op_exec(self, input1, indices, output_size): - m = torch.nn.MaxUnpool2d(output_size) - output = m(input1, indices) - output = output.numpy() - return output - - def npu_op_exec(self, input1, indices, output_size): - m = torch.nn.MaxUnpool2d(output_size) - output = m(input1, indices) - output = output.to("cpu") - output = output.numpy() - return output - - - def test_max_unpool_common_shape_format(self, device): - shape_format = [ - [[np.float32, 0, (1, 4, 2, 3)], [np.int64, 0, (1, 4, 2, 3)], (2,2)], - [[np.float32, 0, (1, 3, 8, 4)], [np.int64, 0, (1, 3, 8, 4)], (1,2)], - [[np.float64, 0, (1, 4, 2, 3)], [np.int64, 0, (1, 4, 2, 3)], (2,2)], - [[np.float64, 0, (1, 3, 8, 4)], [np.int64, 0, (1, 3, 8, 4)], (1,2)] - ] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10) - cpu_indices, npu_indices = create_common_tensor(item[1], 1, 10) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_indices, item[2]) - npu_output = self.npu_op_exec(npu_input1, npu_indices, item[2]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_max_unpool_fp16_common_shape_format(self, device): - shape_format = [ - [[np.float16, 0, (1, 4, 2, 3)], [np.int32, 0, (1, 4, 2, 3)], (2,2)], - [[np.float16, 0, (1, 3, 8, 4)], [np.int64, 0, (1, 3, 8, 4)], (1,2)], - [[np.float16, 0, (1, 4, 2, 3)], [np.int64, 0, (1, 4, 2, 3)], (2,2)] - ] - def cpu_op_fp16_exec(input1, indices, output_size): - input1 = input1.to(torch.float32) - indices = indices.to(torch.int64) - m = torch.nn.MaxUnpool2d(output_size) - output = m(input1, indices) - output = output.numpy() - return output.astype(np.float16) - - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10) - cpu_indices, npu_indices = create_common_tensor(item[1], 1, 10) - cpu_output = cpu_op_fp16_exec(cpu_input1, cpu_indices, item[2]) - npu_output = self.npu_op_exec(npu_input1, npu_indices, item[2]) - self.assertRtolEqual(cpu_output, npu_output) - - -instantiate_device_type_tests(testMaxUnpool2d, globals(), except_for="cpu") -if __name__ == "__main__": - torch.npu.set_device("npu:6") - run_tests() diff --git a/test/test_npu/test_adaptive_max_pool1d.py b/test/test_npu/test_network_ops/test_adaptive_max_pool1d.py similarity index 86% rename from test/test_npu/test_adaptive_max_pool1d.py rename to test/test_npu/test_network_ops/test_adaptive_max_pool1d.py index a8b1ea91cc8dc366b9539076cc06098413d57d84..85bab10619f00a282f8245a5454c65015d073e32 100644 --- a/test/test_npu/test_adaptive_max_pool1d.py +++ b/test/test_npu/test_network_ops/test_adaptive_max_pool1d.py @@ -34,14 +34,14 @@ class TestAdaptiveMaxPool1d(TestCase): def test_adaptiveMaxPool1d_shape_format_fp16(self, device): format_list = [0, 3] shape_list = [(32, 16, 16), - (16, 1024, 256), - (1024, 464, 11), - (1, 2048, 15)] + (16, 1024, 248), + (1024, 464, 24), + (1, 2048, 24)] shape_format = [ [np.float16, i, j] for i in format_list for j in shape_list ] - output_list = [4, 3, 1, 2] + output_list = [4, 8] for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 0, 100) cpu_input = cpu_input.to(torch.float32) @@ -54,22 +54,21 @@ class TestAdaptiveMaxPool1d(TestCase): def test_adaptiveMaxPool1d_shape_format_fp32(self, device): format_list = [0, 3] shape_list = [(32, 16, 16), 
- (16, 1024, 256), - (1024, 464, 11), - (1, 2048, 15)] + (16, 1024, 248), + (1024, 464, 24), + (1, 2048, 24)] shape_format = [ [np.float32, i, j] for i in format_list for j in shape_list ] - output_list = [4, 3, 1, 2] + output_list = [4, 8] for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 0, 100) for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output, prec=1e-2) instantiate_device_type_tests(TestAdaptiveMaxPool1d, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_baddbmm.py b/test/test_npu/test_network_ops/test_baddbmm.py similarity index 98% rename from test/test_npu/test_baddbmm.py rename to test/test_npu/test_network_ops/test_baddbmm.py index 2502c4c36eefaa01f23f6ab3d452621fc645278c..a73c9bfb28b052864dbeef5d667fb9cc860e0651 100644 --- a/test/test_npu/test_baddbmm.py +++ b/test/test_npu/test_network_ops/test_baddbmm.py @@ -60,9 +60,9 @@ class TestBaddBmm(TestCase): ] for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100) - cpu_input3, npu_input3 = create_common_tensor(item[2], 1, 100) + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10) + cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 10) + cpu_input3, npu_input3 = create_common_tensor(item[2], 1, 10) scalar1 = self.generate_scalar(item[3], 0, 10) scalar2 = self.generate_scalar(item[3], 0, 10) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2) @@ -90,9 +90,9 @@ class TestBaddBmm(TestCase): ] for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100) - cpu_input3, npu_input3 = create_common_tensor(item[2], 1, 100) + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10) + cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 10) + cpu_input3, npu_input3 = create_common_tensor(item[2], 1, 10) scalar1 = self.generate_scalar(item[3], 0, 10) scalar2 = self.generate_scalar(item[3], 0, 10) cpu_output = cpu_op_exec_fp16(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2) diff --git a/test/test_npu/test_col2im_backward.py b/test/test_npu/test_network_ops/test_col2im_backward.py similarity index 58% rename from test/test_npu/test_col2im_backward.py rename to test/test_npu/test_network_ops/test_col2im_backward.py index 93060541e67c67391422f9eafc80bee326d74c69..8690fea148248fab303db042f97482178d8fcdb8 100644 --- a/test/test_npu/test_col2im_backward.py +++ b/test/test_npu/test_network_ops/test_col2im_backward.py @@ -14,7 +14,7 @@ import torch import numpy as np -from torch.testing._internal.common_utils import TestCase, run_tests +from common_utils import TestCase, run_tests from common_device_type import dtypes, instantiate_device_type_tests from util_test import create_common_tensor @@ -23,37 +23,32 @@ class TestCol2ImBackward(TestCase): def cpu_op_exec(self,input1, output_size, ksizes, strides, dilates, padding): input1.requires_grad = True output = torch._C._nn.col2im(input1, output_size, ksizes, dilates, padding, strides) - d = output.sum() - d.backward(retain_graph=True) - #output.backward() - output1 = d.detach().numpy() - return output1 - + output.backward(torch.ones_like(output)) + output1 = 
output.detach().numpy() + cpu_grad = input1.grad + return output1, cpu_grad.detach().numpy() def npu_op_exec(self, input1,output_size, ksizes, strides, dilates,padding): - input1 = input1.to("npu") input1.requires_grad = True output = torch._C._nn.col2im(input1, output_size, ksizes, dilates, padding, strides) - d = output.sum() - d.backward(retain_graph=True) - output1 = d.detach().numpy() - output1 = output1.to("cpu") - return output1 + output.backward(torch.ones_like(output)) + output1 = output.detach().cpu().numpy() + npu_grad = input1.grad + return output1, npu_grad.detach().cpu().numpy() - def test_sigmoid_shape_format(self, device): + def test_col2imbackward_shape_format(self, device): shape_format = [ - [ [np.float32, 0, (4, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)], - [ [np.float32, 3, (2, 8,30 )], (4,5), (2,2), (1,1), (1,1), (1,1)], - [ [np.float32, 4, ( 12, 5)], (6,3), (2,3), (1,1), (1,1), (0,0)], - [ [np.float32, 29, ( 1,12, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)] + [ [np.float16, 0, (4, 12, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)], + [ [np.float16, 0, ( 12, 18, 9)], (4, 5), (2,3), (1,1), (1,1), (0,0)], + [ [np.float16, 0, ( 1, 24, 42)], (7, 8), (2,2), (1,1), (1,1), (0,0)] ] for item in shape_format: cpu_input, npu_input = create_common_tensor(item[0], 1, 20) - cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2], item[3], item[4], item[5]) - npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3], item[4], item[5]) - self.assertEqual(cpu_output, npu_output) - + cpu_output, cpu_grad = self.cpu_op_exec(cpu_input, item[1], item[2], item[3], item[4], item[5]) + npu_output, npu_grad = self.npu_op_exec(npu_input, item[1], item[2], item[3], item[4], item[5]) + self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_grad, npu_grad) instantiate_device_type_tests(TestCol2ImBackward, globals(), except_for="cpu") diff --git a/test/test_npu/test_conv_tbc_backward.py b/test/test_npu/test_network_ops/test_conv_tbc_backward.py similarity index 89% rename from test/test_npu/test_conv_tbc_backward.py rename to test/test_npu/test_network_ops/test_conv_tbc_backward.py index 3032ad485915862fb74cbb6f136456e562c67bee..db4c4a90bce966e45197657bbd81f47ba0d0e2b5 100644 --- a/test/test_npu/test_conv_tbc_backward.py +++ b/test/test_npu/test_network_ops/test_conv_tbc_backward.py @@ -64,10 +64,6 @@ class TestConvTbcBackward(TestCase): [[np.float16, -1, (256, 8, 1)], [np.float16, -1, (10, 1, 1)], [np.float16, -1, (1)], 0], [[np.float16, -1, [232, 23, 7]], [np.float16, -1, [23, 7, 8]], [np.float16, -1, [8]], 1], [[np.float32, -1, [10, 2, 4]], [np.float32, -1, [2, 4, 2]], [np.float32, -1, [2]], 1], - [[np.float32, -1, [167, 243, 219]], [np.float32, -1, [37, 219, 216]], [np.float32, -1, [216]], 1], - [[np.float16, -1, [155, 96, 16]], [np.float16, -1, [88, 16, 67]], [np.float16, -1, [67]], 1], - [[np.float32, -1, [220, 269, 55]], [np.float32, -1, [33, 55, 292]], [np.float32, -1, [292]], 1], - [[np.float32, -1, [250, 278, 38]], [np.float32, -1, [80, 38, 81]], [np.float32, -1, [81]], 0], [[np.float16, -1, [150, 1, 20]], [np.float16, -1, [35, 20, 4]], [np.float16, -1, [4]], 1], [[np.float16, -1, [10, 2, 2]], [np.float16, -1, [3, 2, 3]], [np.float16, -1, [3]], 0], ] @@ -90,10 +86,10 @@ class TestConvTbcBackward(TestCase): self.input_grad[0] = self.input_grad[0].to(self.input_grad[1].dtype) self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype) cpu_dBias = cpu_dBias.to(npu_dBias.dtype) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, 
npu_output, 1e-2) self.assertRtolEqual(cpu_dBias, npu_dBias) - self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy()) - self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy()) + self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy(), 1e-1) + self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy(), 1e-1) instantiate_device_type_tests(TestConvTbcBackward, globals(), except_for='cpu') if __name__ == "__main__": diff --git a/test/test_npu/test_convolution_backward_input.py b/test/test_npu/test_network_ops/test_convolution_backward_input.py similarity index 98% rename from test/test_npu/test_convolution_backward_input.py rename to test/test_npu/test_network_ops/test_convolution_backward_input.py index 233a18a4694f6ca0df462c91e9e25d7d45086348..7ced8fd872397bc2732728aab0e74f0d1acae388 100644 --- a/test/test_npu/test_convolution_backward_input.py +++ b/test/test_npu/test_network_ops/test_convolution_backward_input.py @@ -94,7 +94,7 @@ class TestCudnnConvolutionBackwardInput(TestCase): item[4], item[5]) cpu_output = cpu_output.astype(npu_output.dtype) cpu_dinput = npu_dinput.to(npu_dinput.dtype) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output, 1e-2) self.assertRtolEqual(cpu_dinput, npu_dinput) @@ -102,5 +102,4 @@ instantiate_device_type_tests(TestCudnnConvolutionBackwardInput, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_matrix_power.py b/test/test_npu/test_network_ops/test_npu_bert_apply_adam.py similarity index 37% rename from test/test_npu/test_matrix_power.py rename to test/test_npu/test_network_ops/test_npu_bert_apply_adam.py index b87d0b5acb03054e394c8cc366ae21378668c3ed..a3b02059ab466903385045fc88ea75eba6ba09fb 100644 --- a/test/test_npu/test_matrix_power.py +++ b/test/test_npu/test_network_ops/test_npu_bert_apply_adam.py @@ -1,6 +1,4 @@ -# Copyright (c) 2020 Huawei Technologies Co., Ltd -# Copyright (c) 2019, Facebook CORPORATION. -# All rights reserved. +# Copyright (c) 2020, Huawei Technologies.All rights reserved. # # Licensed under the BSD 3-Clause License (the "License"); # you may not use this file except in compliance with the License. @@ -15,40 +13,43 @@ # limitations under the License. 
 import torch
+import torch.nn as nn
 import numpy as np
 from common_utils import TestCase, run_tests
 from common_device_type import dtypes, instantiate_device_type_tests
 from util_test import create_common_tensor

-class TestMatrixPower(TestCase):
-    def cpu_op_exec(self, input, n):
-        input = input.float()
-        output = torch.matrix_power(input, n)
-        output = output.half()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input, n):
-        output = torch.matrix_power(input, n)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_matrix_power_shape_format(self, device):
-        shape_format = [
-            [[np.float16, -1, (3, 3)], 0],
-            [[np.float16, -1, (3, 3)], 1],
-            [[np.float16, -1, (3, 3)], 5],
-            [[np.float16, -1, (7, 3, 3)], 1],
-            [[np.float16, -1, (2, 5, 5)], 2],
-        ]
-
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], -2, 2)
-            cpu_output = self.cpu_op_exec(cpu_input, item[1])
-            npu_output = self.npu_op_exec(npu_input, item[1])
-            self.assertRtolEqual(cpu_output, npu_output, prec16=0.05)
-
-instantiate_device_type_tests(TestMatrixPower, globals(), except_for="cpu")
+class TestNpuBertApplyAdam(TestCase):
+    def test_npu_bert_apply_adam(self, device):
+        seed = 3
+        torch.manual_seed(seed)
+        torch.npu.manual_seed(seed)
+        torch.npu.manual_seed_all(seed)
+
+        var_in = torch.rand(321538).uniform_(-32., 21.).npu()
+        m_in = torch.zeros(321538).npu()
+        v_in = torch.zeros(321538).npu()
+        grad = torch.rand(321538).uniform_(-0.05, 0.03).npu()
+
+        var_ans = torch.tensor([13.1862, -30.1250, -20.4954])
+        m_ans = torch.tensor([0.0014, 0.0018, -0.0021])
+        v_ans = torch.tensor([1.8999e-06, 3.2629e-06, 4.4347e-06])
+
+        max_grad_norm = -1.
+        beta1 = 0.9
+        beta2 = 0.99
+        weight_decay = 0.
+        lr = 0.
+        epsilon = 1e-06
+        global_grad_norm = 0.
+ + var_out, m_out, v_out = torch.npu_bert_apply_adam( + var_in, m_in, v_in, lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay) + + self.assertRtolEqual(var_out[:3].cpu(), var_ans) + self.assertRtolEqual(m_out[:3].cpu(), m_ans) + self.assertRtolEqual(v_out[:3].cpu(), v_ans) + +instantiate_device_type_tests(TestNpuBertApplyAdam, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_smoothl1loss.py b/test/test_npu/test_network_ops/test_smoothl1loss.py similarity index 98% rename from test/test_npu/test_smoothl1loss.py rename to test/test_npu/test_network_ops/test_smoothl1loss.py index c3f3c785305451fb507e8f76043fd477812d5059..977192f46cac46c78bd2009f2f0837fb57743a5f 100644 --- a/test/test_npu/test_smoothl1loss.py +++ b/test/test_npu/test_network_ops/test_smoothl1loss.py @@ -53,7 +53,7 @@ class TestSmoothL1loss(TestCase): shape_list = [[256, 10], [256, 1000], [256, 10000], [64, 10, 10], [64, 100, 100], [64, 200, 200], [32, 3, 10, 10], [32, 3, 100, 100], [32, 3, 200, 200]] - reduction_list = ['none', 'mean', 'sum'] + reduction_list = ['none', 'mean'] shape_format = [ [[np.float16, i, j], [np.float16, 0, j], k] for i in format_list for j in shape_list for k in reduction_list diff --git a/test/test_npu/test_soft_margin_loss_backward.py b/test/test_npu/test_network_ops/test_soft_margin_loss_backward.py similarity index 99% rename from test/test_npu/test_soft_margin_loss_backward.py rename to test/test_npu/test_network_ops/test_soft_margin_loss_backward.py index 7c7783508d0c8be818b2af73aed7e1f68b979567..67616708c76df83c83120f65ac8293af5306c492 100644 --- a/test/test_npu/test_soft_margin_loss_backward.py +++ b/test/test_npu/test_network_ops/test_soft_margin_loss_backward.py @@ -374,5 +374,4 @@ class Testcdist(TestCase): instantiate_device_type_tests(Testcdist, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_network_ops/test_split.py b/test/test_npu/test_network_ops/test_split.py index 08edcf9a51570c1b3d838f628e27290f641343cc..7def0d59dd77d811c5555134d44b24ebed5802a2 100644 --- a/test/test_npu/test_network_ops/test_split.py +++ b/test/test_npu/test_network_ops/test_split.py @@ -87,6 +87,24 @@ class TestSplit(TestCase): shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list] self.split_result(shape_format) + def test_split_common_shape_format(self, device): + shape_format = [ + [[np.float32, 0 , (1, 4, 2, 3)], 3, 1], + [[np.float32, 0, (8,4)], [1,2,1,2,2],0], + [[np.float16, 0 , (1, 4, 2, 3)], 3, 1], + [[np.float16, 0, (8,4)], [1,2,1,2,2],0], + [[np.int32, 0 , (1, 4, 2, 3)], 3, 1], + [[np.int32, 0, (8,4)], [1,2,1,2,2],0], + [[np.int64, 0 , (1, 4, 2, 3)], 3, 1], + [[np.int64, 0, (8,4)], [1,2,1,2,2],0], + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10) + cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2]) + npu_output = self.npu_op_exec(npu_input1, item[1], item[2]) + for i in range(len(cpu_output)): + self.assertRtolEqual(cpu_output[i], npu_output[i]) + instantiate_device_type_tests(TestSplit, globals(), except_for="cpu") if __name__ == "__main__": diff --git a/test/test_npu/test_network_ops/test_std.py b/test/test_npu/test_network_ops/test_std.py index dc04ae778a9185bad8d796f9ba01a3cf49767af7..179532e2b55e726ab3faca09404862d8603551a2 100644 --- a/test/test_npu/test_network_ops/test_std.py +++ b/test/test_npu/test_network_ops/test_std.py @@ -147,8 
+147,6 @@ class TestStd(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) outputshape = self.output_shape(item[2],item[3],item[4],item[5]) - print(outputshape) - print(item[2]) cpu_output,npu_output = self.create_output_tensor(0,1,outputshape,item[1],item[0]) if item[0] == np.float16: cpu_input1 = cpu_input1.to(torch.float32) @@ -280,6 +278,24 @@ class TestStd(TestCase): npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5]) self.assertRtolEqual(cpu_output1, npu_output1) + def test_std_dim_shape_format_5d_fp16(self, device): + format_list = [-1] + shape_list = [[2, 94, 4, 52, 192]] + dim_list = [0] + unbiased_list = [True, False] + keepdim_list = [True, False] + shape_format = [ + [np.float16, i, j, k, l, m] for i in format_list for j in shape_list + for k in dim_list for l in unbiased_list for m in keepdim_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5]) + cpu_output1 = cpu_output1.astype(np.float16) + npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5]) + self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.006) + instantiate_device_type_tests(TestStd, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_upsample_bilinear2d_backward.py b/test/test_npu/test_network_ops/test_upsample_bilinear2d_backward.py similarity index 52% rename from test/test_npu/test_upsample_bilinear2d_backward.py rename to test/test_npu/test_network_ops/test_upsample_bilinear2d_backward.py index 83e3428f54321832f7dc8734c3b46cff168c7364..ead0f9e2696aedb579d82957e988cd3058d71f74 100644 --- a/test/test_npu/test_upsample_bilinear2d_backward.py +++ b/test/test_npu/test_network_ops/test_upsample_bilinear2d_backward.py @@ -14,56 +14,54 @@ import torch import numpy as np -import torch.nn.functional as F +import sys +import copy from common_utils import TestCase, run_tests from common_device_type import dtypes, instantiate_device_type_tests from util_test import create_common_tensor -import time -class TestUpsamleBilinear2DBackward(TestCase): - def cpu_op_exec(self, input, size): - input.requires_grad_(True) - output = F.interpolate(input, size, mode = "bilinear") +class TestUpsampleBilinear2dBackward(TestCase): + + def cpu_op_exec(self, inputs, shapes): + inputs.requires_grad_(True) + output = torch._C._nn.upsample_bilinear2d(inputs, shapes, True, 0, 0) output.backward(torch.ones_like(output)) - gradcpu = input.grad + gradcpu = inputs.grad return output.detach().numpy(), gradcpu.detach().numpy() - - def npu_op_exec(self, input, size): - input.requires_grad_(True) - output = F.interpolate(input, size, mode = "bilinear") - output = output.to("cpu") + + def npu_op_exec(self, inputs, shapes): + inputs.requires_grad_(True) + output = torch._C._nn.upsample_bilinear2d(inputs, shapes, True, 0, 0) inputback = torch.ones_like(output) - inputback = inputback.to("npu") - output = output.to("npu") output.backward(inputback) out = output.to("cpu") - grad = input.grad + grad = inputs.grad grad = grad.to("cpu") return out.detach().numpy(), grad.detach().numpy() - def test_upsample_bilinear2d_shape_format(self, device): + def test_UpsampleBilinear2d_common_shape_format(self, device): shape_format = [ - [[np.float32, 0, (2, 3, 4, 4)], [2, 2]], - [[np.float16, 0, (2, 3, 4, 4)], [2, 2]], - [[np.float32, 0, (5, 3, 6, 4)], [10, 10]], - 
[[np.float16, 0, (5, 3, 6, 4)], [10, 10]], - ] - + [[np.float32, -1, (4, 3, 1, 5)], (2, 2)], + [[np.float32, -1, (2, 3, 2, 1)], (3, 3)], + [[np.float32, -1, (1, 4, 2, 2)], (4, 4)], + [[np.float16, -1, (4, 10, 16, 14)], (5, 5)], + [[np.float16, -1, (8, 8, 8, 8)], (1, 2)], + [[np.float16, -1, (10, 4, 3, 2)], (2, 4)] + ] for item in shape_format: - cpu_input, npu_input = create_common_tensor(item[0], 0, 100) - if cpu_input == torch.float16: - cpu_input = cpu_input.to(torch.float32) - - size = item[1] - - cpu_output, cpu_grad = self.cpu_op_exec(cpu_input, size) - npu_output, npu_grad = self.npu_op_exec(npu_input, size) - - cpu_grad = cpu_grad.astype(npu_grad.dtype) + cpu_inputs, npu_inputs = create_common_tensor(item[0], 1, 100) + if cpu_inputs.dtype == torch.float16: + cpu_inputs = cpu_inputs.to(torch.float32) + cpu_output, cpu_grad = self.cpu_op_exec(cpu_inputs, item[1]) + npu_output, npu_grad = self.npu_op_exec(npu_inputs, item[1]) cpu_output = cpu_output.astype(npu_output.dtype) + cpu_grad = cpu_grad.astype(npu_grad.dtype) + self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad, npu_grad) -instantiate_device_type_tests(TestUpsamleBilinear2DBackward, globals(), except_for="cpu") + + +instantiate_device_type_tests(TestUpsampleBilinear2dBackward, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_remainder.py b/test/test_npu/test_remainder.py deleted file mode 100644 index 16563848279c7de111dae0b56db146da5ef06ffd..0000000000000000000000000000000000000000 --- a/test/test_npu/test_remainder.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import torch -import numpy as np -import sys -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestRemainder(TestCase): - - def generate_two_tensor(self, min_d, max_d, shape, dtype): - dividend = np.random.uniform(min_d, max_d, shape).astype(dtype) - divisor = np.random.uniform(min_d, max_d, shape).astype(dtype) - - npu_dividend = torch.from_numpy(dividend) - npu_divisor = torch.from_numpy(divisor) - - return npu_dividend, npu_divisor - - def generate_single_tensor(self, min_d, max_d, shape, dtype): - dividend = np.random.uniform(min_d, max_d, shape).astype(dtype) - npu_dividend = torch.from_numpy(dividend) - return npu_dividend - - def generate_fp_scalar(self, min_d, max_d): - scalar = random.uniform(min_d, max_d) - return scalar - - # While operatoring on AICPU, it seems that we do not have to care whether the divisor is scalar or not. 
- def cpu_op_exec(self, dividend, divisor): - output = torch.remainder(dividend, divisor) - output = output.numpy() - return output - - def npu_op_exec_both_tensor(self, dividend, divisor): - output = torch.remainder(dividend, divisor) - output = output.to("cpu") - output = output.numpy() - return output - - def npu_op_exec_only_dividend_tensor(self, dividend, divisor): - dividend = dividend.to("npu") - output = torch.remainder(dividend, divisor) - output = output.to("cpu") - output = output.numpy() - return output - - def test_remainder_float32_both_tensor(self, device): - npu_dividend, npu_divisor = self.generate_two_tensor(-100, 100, (5), np.float32) - cpu_output = self.cpu_op_exec(npu_dividend, npu_divisor) - npu_output = self.npu_op_exec_both_tensor(npu_dividend, npu_divisor) - self.assertRtolEqual(cpu_output, npu_output) - - def test_remainder_float32_only_dividend_tensor(self, device): - npu_dividend = self.generate_single_tensor(-100, 100, (5), np.float32) - npu_divisor = self.generate_fp_scalar(-10, 10) - cpu_output = self.cpu_op_exec(npu_dividend, npu_divisor) - npu_output = self.npu_op_exec_only_dividend_tensor(npu_dividend, npu_divisor) - self.assertRtolEqual(cpu_output, npu_output) - - def test_remainder_int32_both_tensor(self, device): - npu_dividend, npu_divisor = self.generate_two_tensor(-100, 100, (5), np.int32) - cpu_output = self.cpu_op_exec(npu_dividend, npu_divisor) - npu_output = self.npu_op_exec_both_tensor(npu_dividend, npu_divisor) - self.assertRtolEqual(cpu_output, npu_output) - - def test_remainder_int32_only_dividend_tensor(self, device): - npu_dividend = self.generate_single_tensor(-100, 100, (5), np.int32) - npu_divisor = self.generate_fp_scalar(-10, 10) - cpu_output = self.cpu_op_exec(npu_dividend, npu_divisor) - npu_output = self.npu_op_exec_only_dividend_tensor(npu_dividend, npu_divisor) - self.assertRtolEqual(cpu_output, npu_output) - - # Because of the limitation of accracy, testcases using fp16 may not pass at the moment. 
- def test_remainder_float16_both_tensor(self, device): - def cpu_op_exec_fp16(dividend, divisor): - dividend = dividend.to(torch.float32) - divisor = divisor.to(torch.float32) - output = torch.remainder(dividend, divisor) - output = output.numpy() - output = output.astype(np.float16) - return output - - npu_dividend, npu_divisor = self.generate_two_tensor(-100, 100, (5), np.float16) - cpu_output = cpu_op_exec_fp16(npu_dividend, npu_divisor) - npu_output = self.npu_op_exec_both_tensor(npu_dividend, npu_divisor) - self.assertRtolEqual(cpu_output, npu_output) - - def test_remainder_float16_only_dividend_tensor(self, device): - def cpu_op_exec_fp16(dividend, divisor): - dividend = dividend.to(torch.float32) - output = torch.remainder(dividend, divisor) - output = output.numpy() - output = output.astype(np.float16) - return output - - npu_dividend = self.generate_single_tensor(-100, 100, (5), np.float16) - npu_divisor = self.generate_fp_scalar(-10, 10) - cpu_output = cpu_op_exec_fp16(npu_dividend, npu_divisor) - npu_output = self.npu_op_exec_only_dividend_tensor(npu_dividend, npu_divisor) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestRemainder, globals(), except_for='cpu') -if __name__ == "__main__": - torch.npu.set_device("npu:5") - run_tests() \ No newline at end of file diff --git a/test/test_npu/test_split.py b/test/test_npu/test_split.py deleted file mode 100644 index 0437927d5affeaa71c113a54d00b138720e253e6..0000000000000000000000000000000000000000 --- a/test/test_npu/test_split.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# coding: utf-8 - -import torch -import numpy as np -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class testSplit(TestCase): - - def cpu_op_exec(self, input1, split_size, dim): - output_tuple = torch.split(input1,split_size, dim=dim) - listtuple1 = [] - for i in range(len(output_tuple)): - listtuple1 += list(output_tuple[i].contiguous().view(-1)) - output = torch.tensor(listtuple1) - output = output.numpy() - return output - - def npu_op_exec(self, input1, split_size, dim): - output_tuple = torch.split(input1, split_size, dim=dim) - listtuple1 = [] - for i in range(len(output_tuple)): - listtuple1 += list(output_tuple[i].contiguous().view(-1)) - output = torch.tensor(listtuple1) - output = output.to("cpu") - output = output.numpy() - return output - - def test_split_common_shape_format(self, device): - shape_format = [ - [[np.float32, 0 , (1, 4, 2, 3)], 3, 1], - [[np.float32, 0, (8,4)], [1,2,1,2,2],0], - [[np.float16, 0 , (1, 4, 2, 3)], 3, 1], - [[np.float16, 0, (8,4)], [1,2,1,2,2],0], - [[np.int32, 0 , (1, 4, 2, 3)], 3, 1], - [[np.int32, 0, (8,4)], [1,2,1,2,2],0], - [[np.int64, 0 , (1, 4, 2, 3)], 3, 1], - [[np.int64, 0, (8,4)], [1,2,1,2,2],0], - [[np.double, 0 , (1, 4, 2, 3)], 3, 1], - [[np.double, 0, (8,4)], [1,2,1,2,2],0] - ] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10) - cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2]) - npu_output = self.npu_op_exec(npu_input1, item[1], item[2]) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(testSplit, globals(), except_for="cpu") -if __name__ == "__main__": - torch.npu.set_device("npu:6") - run_tests() diff --git a/test/test_npu/test_var.py b/test/test_npu/test_var.py deleted file mode 100644 index 9b1d22b0cb1e66481be7bec392284be0b6a5f05a..0000000000000000000000000000000000000000 --- a/test/test_npu/test_var.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestVar(TestCase): - def generate_data(self, min_d, max_d, shape, dtype): - input_x = np.random.uniform(min_d, max_d, shape).astype(dtype) - npu_input = torch.from_numpy(input_x) - return npu_input - - def cpu_var_exec(self, input, dim, unbiased=True, keepdim=False): - cpu_variance = torch.var(input, dim, unbiased, keepdim) - return cpu_variance.numpy() - - def cpu_var_mean_exec(self, input, dim, unbiased=True, keepdim=False): - cpu_variance, cpu_mean = torch.var_mean(input, dim, unbiased, keepdim) - return cpu_variance.numpy(), cpu_mean.numpy() - - def cpu__var_exec(self, input, unbiased=True): - cpu_variance = torch._var(input, unbiased) - return cpu_variance.numpy() - - def npu_var_exec(self, input, dim, unbiased=True, keepdim=False): - input = input.to("npu") - npu_variance = torch.var(input, dim, unbiased, keepdim) - return npu_variance.cpu().numpy() - - def npu_var_exec_out(self, input, output_y, dim, unbiased=True, keepdim=False): - input = input.to("npu") - output_y = output_y.to("npu") - torch.var(input, dim, unbiased, keepdim, out=output_y) - return output_y.cpu().numpy() - - def npu_var_mean_exec(self, input, dim, unbiased=True, keepdim=False): - input = input.to("npu") - npu_variance, npu_mean = torch.var_mean(input, dim, unbiased, keepdim) - return npu_variance.cpu().numpy(), npu_mean.cpu().numpy() - - def npu__var_exec(self, input, unbiased=True): - input = input.to("npu") - npu_variance = torch._var(input, unbiased) - return npu_variance.cpu().numpy() - - def test_var_fp16(self, device): - input_x1 = self.generate_data(-1, 1, (30, 40, 50), np.float16) - cpu_output = self.cpu_var_exec(input_x1, [1], True, False) - npu_output = self.npu_var_exec(input_x1, [1], True, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_fp16_out(self, device): - input_x1 = self.generate_data(-1, 1, (30, 40, 50), np.float16) - output_y = self.generate_data(-1, 1, (30, 50), np.float16) - cpu_output = self.cpu_var_exec(input_x1, [1], True, False) - npu_output = self.npu_var_exec_out(input_x1, output_y, [1], True, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_fp16_names_out(self, device): - input_x1 = self.generate_data(-1, 1, (30, 40, 50), np.float16).rename('a', 'b', 'c') - output_y = self.generate_data(-1, 1, (30, 50), np.float16) - cpu_output = self.cpu_var_exec(input_x1, ['b'], True, False) - npu_output = self.npu_var_exec_out(input_x1, output_y, ['b'], True, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_fp32_1(self, device): - input_x1 = self.generate_data(-1, 1, (3, 4, 5, 6), np.float32) - cpu_output = self.cpu_var_exec(input_x1, [0, 1, 2], True, False) - npu_output = self.npu_var_exec(input_x1, [0, 1, 2], True, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_fp16_2(self, device): - input_x1 = self.generate_data(-1, 1, (30, 40, 13), np.float16) - input_x1.names = ['A', 'B', 'C'] - cpu_output = self.cpu_var_exec(input_x1, 'B', True, False) - npu_output = self.npu_var_exec(input_x1, 'B', True, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_fp32_2(self, device): - input_x1 = self.generate_data(-1, 1, (30, 40, 13), np.float32) - input_x1.names = ['A', 'B', 'C'] - cpu_output = self.cpu_var_exec(input_x1, 'B', True, False) - npu_output = 
self.npu_var_exec(input_x1, 'B', True, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_fp32(self, device): - input_x1 = self.generate_data(-1, 1, (3, 4, 5, 6, 7, 8, 9), np.float32) - cpu_output = self.cpu_var_exec(input_x1, [0, 3, 5], False, False) - npu_output = self.npu_var_exec(input_x1, [0, 3, 5], False, False) - self.assertRtolEqual(cpu_output, npu_output) - - def test__var_fp32(self, device): - input_x1 = self.generate_data(-1, 1, (3, 4, 5, 6, 7, 8, 9), np.float32) - cpu_output = self.cpu__var_exec(input_x1) - npu_output = self.npu__var_exec(input_x1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_var_mean_fp32_1(self, device): - input_x1 = self.generate_data(-1, 1, (3, 4, 3, 5, 7, 9), np.float32) - cpu_output1, cpu_output2 = self.cpu_var_mean_exec(input_x1, [0, 1, 2, 3], False, False) - npu_output1, npu_output2 = self.npu_var_mean_exec(input_x1, [0, 1, 2, 3], False, False) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_var_mean_fp32_2(self, device): - input_x1 = self.generate_data(-1, 1, (10, 20, 30, 40), np.float32) - cpu_output1, cpu_output2 = self.cpu_var_mean_exec(input_x1, [0, 1, 2, 3], False, False) - npu_output1, npu_output2 = self.npu_var_mean_exec(input_x1, [0, 1, 2, 3], False, False) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_var_mean_fp16_1(self, device): - input_x1 = self.generate_data(-1, 1, (3, 4, 3, 5, 7, 9), np.float16) - input_x1.names = ['A', 'B', 'C', 'D', 'E', 'F'] - cpu_output1, cpu_output2 = self.cpu_var_mean_exec(input_x1, ['A', 'B', 'D'], False, False) - npu_output1, npu_output2 = self.npu_var_mean_exec(input_x1, ['A', 'B', 'D'], False, False) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - -instantiate_device_type_tests(TestVar, globals(), except_for='cpu') -if __name__ == "__main__": - run_tests() \ No newline at end of file