diff --git a/docs/en/PyTorch API Support.md b/docs/en/PyTorch API Support.md index 6ce7bda7510850c424cf791649b70ec19cb65c50..b52fff607306105cbee924337091c34bdcf942cc 100644 --- a/docs/en/PyTorch API Support.md +++ b/docs/en/PyTorch API Support.md @@ -1669,13 +1669,14 @@ Change the format of a npu tensor. 29 ``` -> torch_npu.npu_transpose(self, perm) -> Tensor +> torch_npu.npu_transpose(self, perm, require_contiguous=True) -> Tensor Returns a view of the original tensor with its dimensions permuted, and make the result contiguous. - Parameters: - **self** (Tensor) - the input tensor - **perm** (ListInt) - The desired ordering of dimensions + - **require_contiguous** (bool) - Specifies whether self needs to be made contiguous before the transpose. Default: True. - Constraints: diff --git "a/docs/zh/PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225.md" "b/docs/zh/PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225.md" index 4007a86b1db8ca1c72588d64ee1b2be7c7cdc3b8..61e5284ee4baedf70ea35ff129df21bcb7acc6fe 100644 --- "a/docs/zh/PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225.md" +++ "b/docs/zh/PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225.md" @@ -1676,13 +1676,14 @@ Change the format of a npu tensor. 29 ``` -> torch_npu.npu_transpose(self, perm) -> Tensor +> torch_npu.npu_transpose(self, perm, require_contiguous=True) -> Tensor Returns a view of the original tensor with its dimensions permuted, and make the result contiguous. - Parameters: - **self** (Tensor) - the input tensor - **perm** (ListInt) - The desired ordering of dimensions + - **require_contiguous** (bool) - Specifies whether self needs to be made contiguous before the transpose. Default: True. 
- constraints: diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 51f8911bd3d092cd5736dbad8fbd9fa66ab1bdff..32593817543569d083db21eb4fd77d6b5ff2da68 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -1862,11 +1862,9 @@ autograd: - elu_ custom: - - func: npu_transpose_to_contiguous(Tensor self) -> Tensor + - func: npu_transpose(Tensor self, int[] perm, bool require_contiguous=True) -> Tensor variants: function, method - - func: npu_transpose(Tensor self, int[] perm) -> Tensor - variants: function, method - - func: npu_transpose.out(Tensor self, int[] perm, *, Tensor(a!) out) -> Tensor(a!) + - func: npu_transpose.out(Tensor self, int[] perm, bool require_contiguous=True, *, Tensor(a!) out) -> Tensor(a!) - func: npu_broadcast(Tensor self, int[] size) -> Tensor variants: function, method - func: npu_broadcast.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) 
diff --git a/torch_npu/csrc/aten/ops/ArgsortKernelNpu.cpp b/torch_npu/csrc/aten/ops/ArgsortKernelNpu.cpp index 0a8fc964d26a8d11262074b759a1a3bb10f8d6c7..2732b1d58e20ea9d6a0e1455961f2b0b0149e3cb 100644 --- a/torch_npu/csrc/aten/ops/ArgsortKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ArgsortKernelNpu.cpp @@ -53,7 +53,7 @@ at::Tensor& argsort_out_npu_nocheck( } std::swap(perm[dim], perm[lastDim]); - at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(values, perm); at::Tensor transposeValues = OpPreparation::ApplyTensor( values, @@ -65,7 +65,7 @@ at::Tensor& argsort_out_npu_nocheck( argsort_out_npu_no_transpose( transposeValues, transposeIndices, transposeSelf, lastDim, descending); - NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, indices); + NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, true, indices); // indices dtype transform to Int64 indices = NPUNativeFunctions::npu_dtype_cast(indices, at::kLong); diff --git a/torch_npu/csrc/aten/ops/CumminKernelNpu.cpp b/torch_npu/csrc/aten/ops/CumminKernelNpu.cpp index b60d0d2fbab197c4e3a4b8a49f08c79d3069c1f6..79c368e7381985cbf0b548d8b1d086ba1817eee8 100644 --- a/torch_npu/csrc/aten/ops/CumminKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/CumminKernelNpu.cpp @@ -52,7 +52,7 @@ void NPUNativeFunctions::_cummin_helper(const at::Tensor& self, at::Tensor& valu } std::swap(perm[dim], perm[firstDim]); - at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(values, perm); at::Tensor transposeValue = OpPreparation::ApplyTensor(self, outputSize); at::Tensor transposeIndices = OpPreparation::ApplyTensor(outputSize, self.options().dtype(at::kInt), self); @@ -60,8 +60,8 @@ void NPUNativeFunctions::_cummin_helper(const 
at::Tensor& self, at::Tensor& valu cummin_out_npu_nocheck(transposeValue, transposeIndices, transposeSelf, firstDim); // Indices must to be long transposeIndices = transposeIndices.to(at::kLong); - NPUNativeFunctions::npu_transpose_out(transposeValue, perm, values); - NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, indices); + NPUNativeFunctions::npu_transpose_out(transposeValue, perm, true, values); + NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, true, indices); } else { at::Tensor valuesTemp = OpPreparation::ApplyTensor(self); at::Tensor indicesTemp = OpPreparation::ApplyTensor(self, self.options().dtype(at::kInt)); diff --git a/torch_npu/csrc/aten/ops/PadPackedSequenceKernelNpu.cpp b/torch_npu/csrc/aten/ops/PadPackedSequenceKernelNpu.cpp index bca6a96b9735a085a8039cdf235d7366889ad9be..6a25b00221d1591a3d26c81969ee51b4ed89a700 100644 --- a/torch_npu/csrc/aten/ops/PadPackedSequenceKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/PadPackedSequenceKernelNpu.cpp @@ -55,7 +55,7 @@ std::tuple NPUNativeFunctions::_pad_packed_sequence( prevBatchSize = batchSize; } if (batchFirst) { - output = NPUNativeFunctions::npu_transpose(data, {0, 1}); + output = NPUNativeFunctions::npu_transpose(data, {0, 1}, true); } return std::tie(output, lengthsT); } diff --git a/torch_npu/csrc/aten/ops/RollKernelNpu.cpp b/torch_npu/csrc/aten/ops/RollKernelNpu.cpp index d49e772de4bcd695fe5c50662ab1d75e8a67e675..147041e321666190d652ff3f374fe50f254edea4 100644 --- a/torch_npu/csrc/aten/ops/RollKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/RollKernelNpu.cpp @@ -50,7 +50,7 @@ at::Tensor& roll_transpose( perm.emplace_back(i); } std::swap(perm[axis], perm[firstDim]); - at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(result, perm); at::Tensor transposeResult = OpPreparation::ApplyTensorWithFormat( outputSize, @@ -61,7 +61,7 @@ 
at::Tensor& roll_transpose( at::IntArrayRef dim_now = at::IntArrayRef(dim); at::IntArrayRef shift_now = at::IntArrayRef(shift_bak); roll_out_npu_no_transpose(transposeResult, transposeSelf, shift_now, dim_now); - NPUNativeFunctions::npu_transpose_out(transposeResult, perm, result); + NPUNativeFunctions::npu_transpose_out(transposeResult, perm, true, result); return result; } diff --git a/torch_npu/csrc/aten/ops/SortKernelNpu.cpp b/torch_npu/csrc/aten/ops/SortKernelNpu.cpp index 2c27cf1144e2777d591c265b0f655a2f5ae283eb..496316d389a723dc1e07b04a1ea24a5439b6f7bc 100644 --- a/torch_npu/csrc/aten/ops/SortKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SortKernelNpu.cpp @@ -54,7 +54,7 @@ tuple NPUNativeFunctions::sort_out( } std::swap(perm[dim], perm[lastDim]); - at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(values, perm); at::Tensor transposeValues = OpPreparation::ApplyTensor(values, outputSize); at::Tensor transposeIndices =OpPreparation::ApplyTensor(indices, outputSize); @@ -62,8 +62,8 @@ tuple NPUNativeFunctions::sort_out( sort_out_npu_no_transpose( transposeSelf, lastDim, descending, transposeValues, transposeIndices); - NPUNativeFunctions::npu_transpose_out(transposeValues, perm, values); - NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, indices); + NPUNativeFunctions::npu_transpose_out(transposeValues, perm, true, values); + NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, true, indices); } else { sort_out_npu_no_transpose( self, lastDim, descending, values, indices); diff --git a/torch_npu/csrc/aten/ops/SortWithoutIndicesKernelNpu.cpp b/torch_npu/csrc/aten/ops/SortWithoutIndicesKernelNpu.cpp index 46d52e6ad72eedb8d3b88927fbd27436027a1006..a376c1faa79c8ca0cf052b6c46c0e34ea35410b6 100644 --- a/torch_npu/csrc/aten/ops/SortWithoutIndicesKernelNpu.cpp +++ 
b/torch_npu/csrc/aten/ops/SortWithoutIndicesKernelNpu.cpp @@ -58,13 +58,13 @@ at::Tensor& NPUNativeFunctions::npu_sort_v2_out( perm.emplace_back(i); } std::swap(perm[dim], perm[lastDim]); - at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(result, perm); at::Tensor transposeResult = OpPreparation::ApplyTensor(result, outputSize); sort_without_indices_no_transpose(transposeResult, transposeSelf, lastDim, descending); - NPUNativeFunctions::npu_transpose_out(transposeResult, perm, result); + NPUNativeFunctions::npu_transpose_out(transposeResult, perm, true, result); } else { if (!NpuUtils::check_match(&result)) { at::Tensor contiguousResult = NpuUtils::format_contiguous(result); @@ -94,13 +94,13 @@ at::Tensor NPUNativeFunctions::npu_sort_v2( perm.emplace_back(i); } std::swap(perm[dim], perm[lastDim]); - at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(result, perm); at::Tensor transposeResult = OpPreparation::ApplyTensor(result, outputSize); sort_without_indices_no_transpose(transposeResult, transposeSelf, lastDim, descending); - NPUNativeFunctions::npu_transpose_out(transposeResult, perm, result); + NPUNativeFunctions::npu_transpose_out(transposeResult, perm, true, result); } else { sort_without_indices_no_transpose(result, self, dim, descending); } diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp index ce3cebcb11b26eb076dcff09f74cb55ee199d8fb..7b8df572920fba9750878bd47052e2faf90be40a 100644 --- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp @@ -63,7 +63,7 @@ tuple topk_out_npu_nocheck( std::swap(perm[dim], perm[lastDim]); // construct the output tensor of the NPU - at::Tensor 
transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); + at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm, true); auto outputSize = transpose_npu_output_size(values, perm); at::Tensor transposeValue = OpPreparation::ApplyTensor(values, outputSize); at::Tensor transposeIndices = OpPreparation::ApplyTensor(indices, outputSize); @@ -75,8 +75,8 @@ tuple topk_out_npu_nocheck( lastDim, largest, sorted); - NPUNativeFunctions::npu_transpose_out(transposeValue, perm, values); - NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, indices); + NPUNativeFunctions::npu_transpose_out(transposeValue, perm, true, values); + NPUNativeFunctions::npu_transpose_out(transposeIndices, perm, true, indices); } else { topk_out_npu_no_transpose( values, indices, self, k, lastDim, largest, sorted); diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp index bc5d78fa27904b7b51cc5de2a16b4589494aa692..09f1b56a176f70bfb0dfcc1ca11b79df43150222 100644 --- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp @@ -27,82 +27,38 @@ namespace at_npu at::Tensor &NPUNativeFunctions::npu_transpose_out( const at::Tensor &self, at::IntArrayRef perm, - at::Tensor &result) + bool require_contiguous, + at::Tensor &result + ) { - c10::SmallVector permVec = array_to_small_vector(perm); OpCommand cmd; + if (require_contiguous) { + // Any tensor-view(discontiguous) Input Tensor from users should be transformed to be contiguous here. + cmd.Name("Transpose") + .Input(self) + .Input(perm) + .Output(result) + .Run(); + } else { + // For permute-opt in trans-contiguous, it accepts transposed(discontiguous) Input Tensor. 
cmd.Name("Transpose") .InputWithoutContiguous(self) .Input(perm) .Output(result) .Run(); + } return result; } - at::Tensor NPUNativeFunctions::npu_transpose(const at::Tensor &self, at::IntArrayRef perm) + at::Tensor NPUNativeFunctions::npu_transpose(const at::Tensor &self, at::IntArrayRef perm, bool require_contiguous) { auto outputSize = transpose_npu_output_size(self, perm); at::Tensor result = OpPreparation::ApplyTensor(self, outputSize); - NPUNativeFunctions::npu_transpose_out(self, perm, result); + NPUNativeFunctions::npu_transpose_out(self, perm, require_contiguous, result); return result; } - c10::SmallVector transpose_to_contiguous_npu_input( - const c10::SmallVector &src) - { - - c10::SmallVector inputs; - for (int i = 0; i < src.size(); i++) - { - inputs.emplace_back( - NPUTensorDesc(src[i])); - - if (src[i].dim() == 0) - { - inputs[i].tensorDescType = NPUTensorDesc::TensorDescType::TENSOR_SCALAR; - } - } - return inputs; - } - - c10::SmallVector transpose_to_contiguous_npu_output( - const c10::SmallVector &result) - { - return CalcuOpUtil::create_npu_output_tensor_desc(result); - } - - at::Tensor NPUNativeFunctions::npu_transpose_to_contiguous(const at::Tensor &self) - { - RECORD_FUNCTION("transpose_to_contiguous", vector({self})); - int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self); - at::Tensor result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), self_format); - - // obtain the transpose axises - at::IntArrayRef dim; - if ((self.dim() == 2) && (self.stride(self.dim() - 2) == 1)) - { - dim = at::IntArrayRef({1, 0}); - } - else if ((self.dim() == 3) && (self.stride(self.dim() - 2) == 1)) - { - dim = at::IntArrayRef({0, 2, 1}); - } - else if ((self.dim() == 3) && (self.stride(0) <= self.stride(1))) - { - dim = at::IntArrayRef({1, 0, 2}); - } - // constructs the input and output NPUTensorDesc - auto inputs = transpose_to_contiguous_npu_input({self}); - auto outputs = transpose_to_contiguous_npu_output({result}); - - // 
constructs the attr of the NPUAttrDesc - NPUAttrDesc npuAttrTranspose = NPUAttrDesc("perm", dim); - c10::SmallVector attrs = {npuAttrTranspose}; - - CalcuOpUtil::execute_npu_operate("TransposeD", inputs, outputs, attrs); - return result; - } } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/permute_opt.cpp b/torch_npu/csrc/framework/contiguous/permute_opt.cpp index 06f69cc495eea51b86fe37d87c473419f0899217..4cce6545409cda687b6b50df95dccec06d4752cf 100644 --- a/torch_npu/csrc/framework/contiguous/permute_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/permute_opt.cpp @@ -39,7 +39,7 @@ public: static_cast(sizes)); src_desc.storage_sizes_ = sizes; - NPUNativeFunctions::npu_transpose_out(src, perm, self); + NPUNativeFunctions::npu_transpose_out(src, perm, false, self); src_desc = src_desc_stored; return true; }