# PyTorch API Support List

- [Tensors](#tensors.md)
- [Generators](#generators.md)
- [Random sampling](#random-sampling.md)
- [Serialization](#serialization.md)
- [Math operations](#math-operations.md)
- [Utilities](#utilities.md)
- [Other](#other.md)
- [torch.Tensor](#torch-tensor.md)
- [Layers \(torch.nn\)](#layers-(torch-nn).md)
- [Functions \(torch.nn.functional\)](#functions(torch-nn-functional).md)
- [torch.distributed](#torch-distributed.md)
- [NPU and CUDA Function Alignment](#npu-and-cuda-function-alignment.md)

## Tensors

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.is_tensor | Supported |
| 2 | torch.is_storage | Supported |
| 3 | torch.is_complex | Unsupported |
| 4 | torch.is_floating_point | Supported |
| 5 | torch.set_default_dtype | Supported |
| 6 | torch.get_default_dtype | Supported |
| 7 | torch.set_default_tensor_type | Supported |
| 8 | torch.numel | Supported |
| 9 | torch.set_printoptions | Supported |
| 10 | torch.set_flush_denormal | Supported |
| 11 | torch.tensor | Supported |
| 12 | torch.sparse_coo_tensor | Unsupported |
| 13 | torch.as_tensor | Supported |
| 14 | torch.as_strided | Supported |
| 15 | torch.from_numpy | Supported |
| 16 | torch.zeros | Supported |
| 17 | torch.zeros_like | Supported |
| 18 | torch.ones | Supported |
| 19 | torch.ones_like | Supported |
| 20 | torch.arange | Supported |
| 21 | torch.range | Supported |
| 22 | torch.linspace | Supported |
| 23 | torch.logspace | Supported |
| 24 | torch.eye | Supported |
| 25 | torch.empty | Supported |
| 26 | torch.empty_like | Supported |
| 27 | torch.empty_strided | Supported |
| 28 | torch.full | Supported |
| 29 | torch.full_like | Supported |
| 30 | torch.quantize_per_tensor | Supported |
| 31 | torch.quantize_per_channel | Unsupported |
| 32 | torch.cat | Supported |
| 33 | torch.chunk | Supported |
| 34 | torch.gather | Supported |
| 35 | torch.index_select | Supported |
| 36 | torch.masked_select | Supported |
| 37 | torch.narrow | Supported |
| 38 | torch.nonzero | Supported |
| 39 | torch.reshape | Supported |
| 40 | torch.split | Supported |
| 41 | torch.squeeze | Supported |
| 42 | torch.stack | Supported |
| 43 | torch.t | Supported |
| 44 | torch.take | Supported |
| 45 | torch.transpose | Supported |
| 46 | torch.unbind | Supported |
| 47 | torch.unsqueeze | Supported |
| 48 | torch.where | Supported |
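
Most of the creation and reshaping APIs above are supported, so a typical tensor-construction pipeline ports directly. The sketch below uses only entries marked Supported; the `npu` device string and the `torch.npu` availability check are assumptions about the Ascend PyTorch plugin and are not part of this list — replace the device with `"cpu"` on other builds.

```python
import torch

# Assumption: an Ascend build exposes torch.npu; fall back to CPU otherwise.
device = "npu:0" if getattr(torch, "npu", None) and torch.npu.is_available() else "cpu"

x = torch.zeros(2, 3, device=device)        # torch.zeros: supported
y = torch.ones_like(x)                      # torch.ones_like: supported
z = torch.cat([x, y], dim=0)                # torch.cat: supported
a, b = torch.chunk(z, 2, dim=0)             # torch.chunk: supported
w = torch.where(b > a, b, a)                # torch.where: supported
print(torch.reshape(w, (3, 2)).shape)       # torch.reshape: supported
```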

## Generators

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch._C.Generator | Unsupported |
| 2 | torch._C.Generator.device | Unsupported |
| 3 | torch._C.Generator.get_state | Unsupported |
| 4 | torch._C.Generator.initial_seed | Unsupported |
| 5 | torch._C.Generator.manual_seed | Unsupported |
| 6 | torch._C.Generator.seed | Unsupported |
| 7 | torch._C.Generator.set_state | Unsupported |

## Random sampling

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.seed | Unsupported |
| 2 | torch.manual_seed | Unsupported |
| 3 | torch.initial_seed | Unsupported |
| 4 | torch.get_rng_state | Unsupported |
| 5 | torch.set_rng_state | Unsupported |
| 6 | torch.torch.default_generator | Unsupported |
| 7 | torch.bernoulli | Supported |
| 8 | torch.multinomial | Supported |
| 9 | torch.normal | Supported |
| 10 | torch.poisson | Unsupported |
| 11 | torch.rand | Supported |
| 12 | torch.rand_like | Supported |
| 13 | torch.randint | Supported |
| 14 | torch.randint_like | Supported |
| 15 | torch.randn | Supported |
| 16 | torch.randn_like | Supported |
| 17 | torch.randperm | Supported |
| 18 | torch.Tensor.bernoulli_() | Supported |
| 19 | torch.Tensor.bernoulli_() | Supported |
| 20 | torch.Tensor.exponential_() | Unsupported |
| 21 | torch.Tensor.geometric_() | Unsupported |
| 22 | torch.Tensor.log_normal_() | Unsupported |
| 23 | torch.Tensor.normal_() | Supported |
| 24 | torch.Tensor.random_() | Supported |
| 25 | torch.Tensor.uniform_() | Supported |
| 26 | torch.quasirandom.SobolEngine | Unsupported |
| 27 | torch.quasirandom.SobolEngine.draw | Unsupported |
| 28 | torch.quasirandom.SobolEngine.fast_forward | Unsupported |
| 29 | torch.quasirandom.SobolEngine.reset | Unsupported |
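
Since the generator and global seeding entries above are listed as unsupported while the sampling functions themselves are supported, the sketch below draws random tensors without relying on `torch.manual_seed`. This is a minimal illustration of the supported subset, not an official recipe.

```python
import torch

probs = torch.full((4,), 0.5)
flips = torch.bernoulli(probs)      # torch.bernoulli: supported
idx = torch.multinomial(probs, 2)   # torch.multinomial: supported
noise = torch.randn(4)              # torch.randn: supported
u = torch.empty(4).uniform_(0, 1)   # torch.Tensor.uniform_(): supported
perm = torch.randperm(4)            # torch.randperm: supported
print(flips, idx, noise, u, perm)
```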

## Serialization

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.save | Supported |
| 2 | torch.load | Supported |
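
Both serialization entries are supported. A minimal round trip is sketched below; loading with `map_location="cpu"` and moving back with `Tensor.to` (supported, see the torch.Tensor table) is a common pattern for device-specific checkpoints, offered here as a suggestion rather than a requirement of this list.

```python
import torch

state = {"step": 3, "weights": torch.arange(6.0).reshape(2, 3)}
torch.save(state, "checkpoint.pt")                          # torch.save: supported
restored = torch.load("checkpoint.pt", map_location="cpu")  # torch.load: supported
print(restored["weights"].sum())
```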

## Math operations

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.abs | Supported |
| 2 | torch.acos | Supported |
| 3 | torch.add | Supported |
| 4 | torch.addcdiv | Supported |
| 5 | torch.addcmul | Supported |
| 6 | torch.angle | Unsupported |
| 7 | torch.asin | Supported |
| 8 | torch.atan | Supported |
| 9 | torch.atan2 | Supported |
| 10 | torch.bitwise_not | Supported |
| 11 | torch.bitwise_and | Supported |
| 12 | torch.bitwise_or | Supported |
| 13 | torch.bitwise_xor | Supported |
| 14 | torch.ceil | Supported |
| 15 | torch.clamp | Supported |
| 16 | torch.conj | Unsupported |
| 17 | torch.cos | Supported |
| 18 | torch.cosh | Supported |
| 19 | torch.div | Supported |
| 20 | torch.digamma | Unsupported |
| 21 | torch.erf | Supported |
| 22 | torch.erfc | Unsupported |
| 23 | torch.erfinv | Supported |
| 24 | torch.exp | Supported |
| 25 | torch.expm1 | Supported |
| 26 | torch.floor | Supported |
| 27 | torch.floor_divide | Supported |
| 28 | torch.fmod | Supported |
| 29 | torch.frac | Supported |
| 30 | torch.imag | Unsupported |
| 31 | torch.lerp | Supported |
| 32 | torch.lgamma | Unsupported |
| 33 | torch.log | Supported |
| 34 | torch.log10 | Supported |
| 35 | torch.log1p | Supported |
| 36 | torch.log2 | Supported |
| 37 | torch.logical_and | Supported |
| 38 | torch.logical_not | Supported |
| 39 | torch.logical_or | Supported |
| 40 | torch.logical_xor | Supported |
| 41 | torch.mul | Supported |
| 42 | torch.mvlgamma | Unsupported |
| 43 | torch.neg | Supported |
| 44 | torch.polygamma | Unsupported |
| 45 | torch.pow | Supported |
| 46 | torch.real | Supported |
| 47 | torch.reciprocal | Supported |
| 48 | torch.remainder | Supported |
| 49 | torch.round | Supported |
| 50 | torch.rsqrt | Supported |
| 51 | torch.sigmoid | Supported |
| 52 | torch.sign | Supported |
| 53 | torch.sin | Supported |
| 54 | torch.sinh | Supported |
| 55 | torch.sqrt | Supported |
| 56 | torch.square | Supported |
| 57 | torch.tan | Supported |
| 58 | torch.tanh | Supported |
| 59 | torch.true_divide | Supported |
| 60 | torch.trunc | Supported |
| 61 | torch.argmax | Supported |
| 62 | torch.argmin | Supported |
| 63 | torch.dist | Supported |
| 64 | torch.logsumexp | Supported |
| 65 | torch.mean | Supported |
| 66 | torch.median | Supported |
| 67 | torch.mode | Unsupported |
| 68 | torch.norm | Supported |
| 69 | torch.prod | Supported |
| 70 | torch.std | Supported |
| 71 | torch.std_mean | Supported |
| 72 | torch.sum | Supported |
| 73 | torch.unique | Supported |
| 74 | torch.unique_consecutive | Unsupported |
| 75 | torch.var | Unsupported |
| 76 | torch.var_mean | Unsupported |
| 77 | torch.allclose | Supported |
| 78 | torch.argsort | Supported |
| 79 | torch.eq | Supported |
| 80 | torch.equal | Supported |
| 81 | torch.ge | Supported |
| 82 | torch.gt | Supported |
| 83 | torch.isfinite | Supported |
| 84 | torch.isinf | Supported |
| 85 | torch.isnan | Supported |
| 86 | torch.kthvalue | Supported |
| 87 | torch.le | Supported |
| 88 | torch.lt | Supported |
| 89 | torch.max | Supported |
| 90 | torch.min | Supported |
| 91 | torch.ne | Supported |
| 92 | torch.sort | Supported |
| 93 | torch.topk | Supported |
| 94 | torch.fft | Unsupported |
| 95 | torch.ifft | Unsupported |
| 96 | torch.rfft | Unsupported |
| 97 | torch.irfft | Unsupported |
| 98 | torch.stft | Unsupported |
| 99 | torch.bartlett_window | Supported |
| 100 | torch.blackman_window | Supported |
| 101 | torch.hamming_window | Supported |
| 102 | torch.hann_window | Supported |
| 103 | torch.bincount | Unsupported |
| 104 | torch.broadcast_tensors | Supported |
| 105 | torch.cartesian_prod | Supported |
| 106 | torch.cdist | Supported |
| 107 | torch.combinations | Unsupported |
| 108 | torch.cross | Supported |
| 109 | torch.cummax | Unsupported |
| 110 | torch.cummin | Supported |
| 111 | torch.cumprod | Supported |
| 112 | torch.cumsum | Supported |
| 113 | torch.diag | Supported |
| 114 | torch.diag_embed | Supported |
| 115 | torch.diagflat | Supported |
| 116 | torch.diagonal | Supported |
| 117 | torch.einsum | Supported |
| 118 | torch.flatten | Supported |
| 119 | torch.flip | Supported |
| 120 | torch.rot90 | Supported |
| 121 | torch.histc | Unsupported |
| 122 | torch.meshgrid | Supported |
| 123 | torch.renorm | Supported |
| 124 | torch.repeat_interleave | Unsupported |
| 125 | torch.roll | Unsupported |
| 126 | torch.tensordot | Supported |
| 127 | torch.trace | Unsupported |
| 128 | torch.tril | Supported |
| 129 | torch.tril_indices | Unsupported |
| 130 | torch.triu | Supported |
| 131 | torch.triu_indices | Unsupported |
| 132 | torch.addbmm | Supported |
| 133 | torch.addmm | Supported |
| 134 | torch.addmv | Supported |
| 135 | torch.addr | Supported |
| 136 | torch.baddbmm | Supported |
| 137 | torch.bmm | Supported |
| 138 | torch.chain_matmul | Supported |
| 139 | torch.cholesky | Unsupported |
| 140 | torch.cholesky_inverse | Unsupported |
| 141 | torch.cholesky_solve | Unsupported |
| 142 | torch.dot | Unsupported |
| 143 | torch.eig | Unsupported |
| 144 | torch.geqrf | Unsupported |
| 145 | torch.ger | Supported |
| 146 | torch.inverse | Supported |
| 147 | torch.det | Unsupported |
| 148 | torch.logdet | Unsupported |
| 149 | torch.slogdet | Supported |
| 150 | torch.lstsq | Unsupported |
| 151 | torch.lu | Unsupported |
| 152 | torch.lu_solve | Unsupported |
| 153 | torch.lu_unpack | Unsupported |
| 154 | torch.matmul | Supported |
| 155 | torch.matrix_power | Supported |
| 156 | torch.matrix_rank | Unsupported |
| 157 | torch.mm | Supported |
| 158 | torch.mv | Supported |
| 159 | torch.orgqr | Unsupported |
| 160 | torch.ormqr | Unsupported |
| 161 | torch.pinverse | Unsupported |
| 162 | torch.qr | Supported |
| 163 | torch.solve | Unsupported |
| 164 | torch.svd | Unsupported |
| 165 | torch.svd_lowrank | Unsupported |
| 166 | torch.pca_lowrank | Unsupported |
| 167 | torch.symeig | Unsupported |
| 168 | torch.lobpcg | Unsupported |
| 169 | torch.trapz | Supported |
| 170 | torch.triangular_solve | Unsupported |
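
One practical consequence of the table: torch.var and torch.var_mean are unsupported while torch.std, torch.std_mean, torch.mean, and torch.pow are supported, so a variance can be recovered from supported primitives. The workaround below is the editor's sketch, not something prescribed by the list.

```python
import torch

x = torch.randn(1000)
variance = torch.std(x, unbiased=True) ** 2  # torch.std + torch.pow: supported
std, mean = torch.std_mean(x)                # torch.std_mean: supported
print(variance, std ** 2, mean)              # both variance estimates agree
```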

## Utilities

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.compiled_with_cxx11_abi | Supported |
| 2 | torch.result_type | Supported |
| 3 | torch.can_cast | Supported |
| 4 | torch.promote_types | Supported |
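
All four utility entries are supported; they answer dtype-promotion questions without touching device memory, which makes them handy when porting mixed-precision code. A quick illustration:

```python
import torch

print(torch.result_type(torch.tensor([1]), 1.0))        # torch.float32
print(torch.can_cast(torch.float64, torch.int32))       # False
print(torch.promote_types(torch.int32, torch.float32))  # torch.float32
```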

## Other

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.no_grad | Supported |
| 2 | torch.enable_grad | Supported |
| 3 | torch.set_grad_enabled | Supported |
| 4 | torch.get_num_threads | Supported |
| 5 | torch.set_num_threads | Supported |
| 6 | torch.get_num_interop_threads | Supported |
| 7 | torch.set_num_interop_threads | Supported |
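
The gradient-mode switches above are all supported, so the usual inference idioms work unchanged. A short sketch:

```python
import torch

x = torch.ones(2, requires_grad=True)

with torch.no_grad():                 # torch.no_grad: supported
    y = x * 2
print(y.requires_grad)                # False

with torch.set_grad_enabled(False):   # torch.set_grad_enabled: supported
    z = x * 3
print(z.requires_grad)                # False
```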

## torch.Tensor

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.Tensor | Supported |
| 2 | torch.Tensor.new_tensor | Supported |
| 3 | torch.Tensor.new_full | Supported |
| 4 | torch.Tensor.new_empty | Supported |
| 5 | torch.Tensor.new_ones | Supported |
| 6 | torch.Tensor.new_zeros | Supported |
| 7 | torch.Tensor.is_cuda | Supported |
| 8 | torch.Tensor.is_quantized | Supported |
| 9 | torch.Tensor.device | Supported |
| 10 | torch.Tensor.ndim | Supported |
| 11 | torch.Tensor.T | Supported |
| 12 | torch.Tensor.abs | Supported |
| 13 | torch.Tensor.abs_ | Supported |
| 14 | torch.Tensor.acos | Supported |
| 15 | torch.Tensor.acos_ | Supported |
| 16 | torch.Tensor.add | Supported |
| 17 | torch.Tensor.add_ | Supported |
| 18 | torch.Tensor.addbmm | Supported |
| 19 | torch.Tensor.addbmm_ | Supported |
| 20 | torch.Tensor.addcdiv | Supported |
| 21 | torch.Tensor.addcdiv_ | Supported |
| 22 | torch.Tensor.addcmul | Supported |
| 23 | torch.Tensor.addcmul_ | Supported |
| 24 | torch.Tensor.addmm | Supported |
| 25 | torch.Tensor.addmm_ | Supported |
| 26 | torch.Tensor.addmv | Supported |
| 27 | torch.Tensor.addmv_ | Supported |
| 28 | torch.Tensor.addr | Supported |
| 29 | torch.Tensor.addr_ | Supported |
| 30 | torch.Tensor.allclose | Supported |
| 31 | torch.Tensor.angle | Unsupported |
| 32 | torch.Tensor.apply_ | Unsupported |
| 33 | torch.Tensor.argmax | Supported |
| 34 | torch.Tensor.argmin | Supported |
| 35 | torch.Tensor.argsort | Supported |
| 36 | torch.Tensor.asin | Supported |
| 37 | torch.Tensor.asin_ | Supported |
| 38 | torch.Tensor.as_strided | Supported |
| 39 | torch.Tensor.atan | Supported |
| 40 | torch.Tensor.atan2 | Supported |
| 41 | torch.Tensor.atan2_ | Supported |
| 42 | torch.Tensor.atan_ | Supported |
| 43 | torch.Tensor.baddbmm | Supported |
| 44 | torch.Tensor.baddbmm_ | Supported |
| 45 | torch.Tensor.bernoulli | Supported |
| 46 | torch.Tensor.bernoulli_ | Supported |
| 47 | torch.Tensor.bfloat16 | Unsupported |
| 48 | torch.Tensor.bincount | Unsupported |
| 49 | torch.Tensor.bitwise_not | Supported |
| 50 | torch.Tensor.bitwise_not_ | Supported |
| 51 | torch.Tensor.bitwise_and | Supported |
| 52 | torch.Tensor.bitwise_and_ | Supported |
| 53 | torch.Tensor.bitwise_or | Supported |
| 54 | torch.Tensor.bitwise_or_ | Supported |
| 55 | torch.Tensor.bitwise_xor | Supported |
| 56 | torch.Tensor.bitwise_xor_ | Supported |
| 57 | torch.Tensor.bmm | Supported |
| 58 | torch.Tensor.bool | Supported |
| 59 | torch.Tensor.byte | Supported |
| 60 | torch.Tensor.cauchy_ | Unsupported |
| 61 | torch.Tensor.ceil | Supported |
| 62 | torch.Tensor.ceil_ | Supported |
| 63 | torch.Tensor.char | Supported |
| 64 | torch.Tensor.cholesky | Unsupported |
| 65 | torch.Tensor.cholesky_inverse | Unsupported |
| 66 | torch.Tensor.cholesky_solve | Unsupported |
| 67 | torch.Tensor.chunk | Supported |
| 68 | torch.Tensor.clamp | Supported |
| 69 | torch.Tensor.clamp_ | Supported |
| 70 | torch.Tensor.clone | Supported |
| 71 | torch.Tensor.contiguous | Supported |
| 72 | torch.Tensor.copy_ | Supported |
| 73 | torch.Tensor.conj | Unsupported |
| 74 | torch.Tensor.cos | Supported |
| 75 | torch.Tensor.cos_ | Supported |
| 76 | torch.Tensor.cosh | Supported |
| 77 | torch.Tensor.cosh_ | Supported |
| 78 | torch.Tensor.cpu | Supported |
| 79 | torch.Tensor.cross | Supported |
| 80 | torch.Tensor.cuda | Unsupported |
| 81 | torch.Tensor.cummax | Unsupported |
| 82 | torch.Tensor.cummin | Supported |
| 83 | torch.Tensor.cumprod | Supported |
| 84 | torch.Tensor.cumsum | Supported |
| 85 | torch.Tensor.data_ptr | Supported |
| 86 | torch.Tensor.dequantize | Unsupported |
| 87 | torch.Tensor.det | Unsupported |
| 88 | torch.Tensor.dense_dim | Unsupported |
| 89 | torch.Tensor.diag | Supported |
| 90 | torch.Tensor.diag_embed | Supported |
| 91 | torch.Tensor.diagflat | Supported |
| 92 | torch.Tensor.diagonal | Supported |
| 93 | torch.Tensor.fill_diagonal_ | Supported |
| 94 | torch.Tensor.digamma | Unsupported |
| 95 | torch.Tensor.digamma_ | Unsupported |
| 96 | torch.Tensor.dim | Supported |
| 97 | torch.Tensor.dist | Supported |
| 98 | torch.Tensor.div | Supported |
| 99 | torch.Tensor.div_ | Supported |
| 100 | torch.Tensor.dot | Unsupported |
| 101 | torch.Tensor.double | Unsupported |
| 102 | torch.Tensor.eig | Unsupported |
| 103 | torch.Tensor.element_size | Supported |
| 104 | torch.Tensor.eq | Supported |
| 105 | torch.Tensor.eq_ | Supported |
| 106 | torch.Tensor.equal | Supported |
| 107 | torch.Tensor.erf | Supported |
| 108 | torch.Tensor.erf_ | Supported |
| 109 | torch.Tensor.erfc | Supported |
| 110 | torch.Tensor.erfc_ | Supported |
| 111 | torch.Tensor.erfinv | Supported |
| 112 | torch.Tensor.erfinv_ | Supported |
| 113 | torch.Tensor.exp | Supported |
| 114 | torch.Tensor.exp_ | Supported |
| 115 | torch.Tensor.expm1 | Supported |
| 116 | torch.Tensor.expm1_ | Supported |
| 117 | torch.Tensor.expand | Supported |
| 118 | torch.Tensor.expand_as | Supported |
| 119 | torch.Tensor.exponential_ | Unsupported |
| 120 | torch.Tensor.fft | Unsupported |
| 121 | torch.Tensor.fill_ | Supported |
| 122 | torch.Tensor.flatten | Supported |
| 123 | torch.Tensor.flip | Supported |
| 124 | torch.Tensor.float | Supported |
| 125 | torch.Tensor.floor | Supported |
| 126 | torch.Tensor.floor_ | Supported |
| 127 | torch.Tensor.floor_divide | Supported |
| 128 | torch.Tensor.floor_divide_ | Supported |
| 129 | torch.Tensor.fmod | Supported |
| 130 | torch.Tensor.fmod_ | Supported |
| 131 | torch.Tensor.frac | Supported |
| 132 | torch.Tensor.frac_ | Supported |
| 133 | torch.Tensor.gather | Supported |
| 134 | torch.Tensor.ge | Supported |
| 135 | torch.Tensor.ge_ | Supported |
| 136 | torch.Tensor.geometric_ | Unsupported |
| 137 | torch.Tensor.geqrf | Unsupported |
| 138 | torch.Tensor.ger | Supported |
| 139 | torch.Tensor.get_device | Supported |
| 140 | torch.Tensor.gt | Supported |
| 141 | torch.Tensor.gt_ | Supported |
| 142 | torch.Tensor.half | Supported |
| 143 | torch.Tensor.hardshrink | Supported |
| 144 | torch.Tensor.histc | Unsupported |
| 145 | torch.Tensor.ifft | Unsupported |
| 146 | torch.Tensor.index_add_ | Supported |
| 147 | torch.Tensor.index_add | Supported |
| 148 | torch.Tensor.index_copy_ | Supported |
| 149 | torch.Tensor.index_copy | Supported |
| 150 | torch.Tensor.index_fill_ | Supported |
| 151 | torch.Tensor.index_fill | Supported |
| 152 | torch.Tensor.index_put_ | Supported |
| 153 | torch.Tensor.index_put | Supported |
| 154 | torch.Tensor.index_select | Supported |
| 155 | torch.Tensor.indices | Unsupported |
| 156 | torch.Tensor.int | Supported |
| 157 | torch.Tensor.int_repr | Unsupported |
| 158 | torch.Tensor.inverse | Supported |
| 159 | torch.Tensor.irfft | Unsupported |
| 160 | torch.Tensor.is_contiguous | Supported |
| 161 | torch.Tensor.is_complex | Supported |
| 162 | torch.Tensor.is_floating_point | Supported |
| 163 | torch.Tensor.is_pinned | Supported |
| 164 | torch.Tensor.is_set_to | Unsupported |
| 165 | torch.Tensor.is_shared | Supported |
| 166 | torch.Tensor.is_signed | Supported |
| 167 | torch.Tensor.is_sparse | Supported |
| 168 | torch.Tensor.item | Supported |
| 169 | torch.Tensor.kthvalue | Supported |
| 170 | torch.Tensor.le | Supported |
| 171 | torch.Tensor.le_ | Supported |
| 172 | torch.Tensor.lerp | Supported |
| 173 | torch.Tensor.lerp_ | Supported |
| 174 | torch.Tensor.lgamma | Unsupported |
| 175 | torch.Tensor.lgamma_ | Unsupported |
| 176 | torch.Tensor.log | Supported |
| 177 | torch.Tensor.log_ | Supported |
| 178 | torch.Tensor.logdet | Unsupported |
| 179 | torch.Tensor.log10 | Supported |
| 180 | torch.Tensor.log10_ | Supported |
| 181 | torch.Tensor.log1p | Supported |
| 182 | torch.Tensor.log1p_ | Supported |
| 183 | torch.Tensor.log2 | Supported |
| 184 | torch.Tensor.log2_ | Supported |
| 185 | torch.Tensor.log_normal_ | Supported |
| 186 | torch.Tensor.logsumexp | Supported |
| 187 | torch.Tensor.logical_and | Supported |
| 188 | torch.Tensor.logical_and_ | Supported |
| 189 | torch.Tensor.logical_not | Supported |
| 190 | torch.Tensor.logical_not_ | Supported |
| 191 | torch.Tensor.logical_or | Supported |
| 192 | torch.Tensor.logical_or_ | Supported |
| 193 | torch.Tensor.logical_xor | Unsupported |
| 194 | torch.Tensor.logical_xor_ | Unsupported |
| 195 | torch.Tensor.long | Supported |
| 196 | torch.Tensor.lstsq | Unsupported |
| 197 | torch.Tensor.lt | Supported |
| 198 | torch.Tensor.lt_ | Supported |
| 199 | torch.Tensor.lu | Supported |
| 200 | torch.Tensor.lu_solve | Supported |
| 201 | torch.Tensor.map_ | Unsupported |
| 202 | torch.Tensor.masked_scatter_ | Supported |
| 203 | torch.Tensor.masked_scatter | Supported |
| 204 | torch.Tensor.masked_fill_ | Supported |
| 205 | torch.Tensor.masked_fill | Supported |
| 206 | torch.Tensor.masked_select | Supported |
| 207 | torch.Tensor.matmul | Supported |
| 208 | torch.Tensor.matrix_power | Supported |
| 209 | torch.Tensor.max | Supported |
| 210 | torch.Tensor.mean | Supported |
| 211 | torch.Tensor.median | Supported |
| 212 | torch.Tensor.min | Supported |
| 213 | torch.Tensor.mm | Supported |
| 214 | torch.Tensor.mode | Unsupported |
| 215 | torch.Tensor.mul | Supported |
| 216 | torch.Tensor.mul_ | Supported |
| 217 | torch.Tensor.multinomial | Supported |
| 218 | torch.Tensor.mv | Supported |
| 219 | torch.Tensor.mvlgamma | Unsupported |
| 220 | torch.Tensor.mvlgamma_ | Unsupported |
| 221 | torch.Tensor.narrow | Supported |
| 222 | torch.Tensor.narrow_copy | Supported |
| 223 | torch.Tensor.ndimension | Supported |
| 224 | torch.Tensor.ne | Supported |
| 225 | torch.Tensor.ne_ | Supported |
| 226 | torch.Tensor.neg | Supported |
| 227 | torch.Tensor.neg_ | Supported |
| 228 | torch.Tensor.nelement | Supported |
| 229 | torch.Tensor.nonzero | Supported |
| 230 | torch.Tensor.norm | Supported |
| 231 | torch.Tensor.normal_ | Supported |
| 232 | torch.Tensor.numel | Supported |
| 233 | torch.Tensor.numpy | Unsupported |
| 234 | torch.Tensor.orgqr | Unsupported |
| 235 | torch.Tensor.ormqr | Unsupported |
| 236 | torch.Tensor.permute | Supported |
| 237 | torch.Tensor.pin_memory | Unsupported |
| 238 | torch.Tensor.pinverse | Unsupported |
| 239 | torch.Tensor.polygamma | Unsupported |
| 240 | torch.Tensor.polygamma_ | Unsupported |
| 241 | torch.Tensor.pow | Supported |
| 242 | torch.Tensor.pow_ | Supported |
| 243 | torch.Tensor.prod | Supported |
| 244 | torch.Tensor.put_ | Supported |
| 245 | torch.Tensor.qr | Supported |
| 246 | torch.Tensor.qscheme | Unsupported |
| 247 | torch.Tensor.q_scale | Unsupported |
| 248 | torch.Tensor.q_zero_point | Unsupported |
| 249 | torch.Tensor.q_per_channel_scales | Unsupported |
| 250 | torch.Tensor.q_per_channel_zero_points | Unsupported |
| 251 | torch.Tensor.q_per_channel_axis | Unsupported |
| 252 | torch.Tensor.random_ | Supported |
| 253 | torch.Tensor.reciprocal | Supported |
| 254 | torch.Tensor.reciprocal_ | Supported |
| 255 | torch.Tensor.record_stream | Unsupported |
| 256 | torch.Tensor.remainder | Supported |
| 257 | torch.Tensor.remainder_ | Supported |
| 258 | torch.Tensor.renorm | Supported |
| 259 | torch.Tensor.renorm_ | Supported |
| 260 | torch.Tensor.repeat | Supported |
| 261 | torch.Tensor.repeat_interleave | Supported |
| 262 | torch.Tensor.requires_grad_ | Supported |
| 263 | torch.Tensor.reshape | Supported |
| 264 | torch.Tensor.reshape_as | Supported |
| 265 | torch.Tensor.resize_ | Supported |
| 266 | torch.Tensor.resize_as_ | Supported |
| 267 | torch.Tensor.rfft | Unsupported |
| 268 | torch.Tensor.roll | Unsupported |
| 269 | torch.Tensor.rot90 | Supported |
| 270 | torch.Tensor.round | Supported |
| 271 | torch.Tensor.round_ | Supported |
| 272 | torch.Tensor.rsqrt | Supported |
| 273 | torch.Tensor.rsqrt_ | Supported |
| 274 | torch.Tensor.scatter | Supported |
| 275 | torch.Tensor.scatter_ | Supported |
| 276 | torch.Tensor.scatter_add_ | Supported |
| 277 | torch.Tensor.scatter_add | Supported |
| 278 | torch.Tensor.select | Supported |
| 279 | torch.Tensor.set_ | Supported |
| 280 | torch.Tensor.share_memory_ | Unsupported |
| 281 | torch.Tensor.short | Supported |
| 282 | torch.Tensor.sigmoid | Supported |
| 283 | torch.Tensor.sigmoid_ | Supported |
| 284 | torch.Tensor.sign | Supported |
| 285 | torch.Tensor.sign_ | Supported |
| 286 | torch.Tensor.sin | Supported |
| 287 | torch.Tensor.sin_ | Supported |
| 288 | torch.Tensor.sinh | Supported |
| 289 | torch.Tensor.sinh_ | Supported |
| 290 | torch.Tensor.size | Supported |
| 291 | torch.Tensor.slogdet | Unsupported |
| 292 | torch.Tensor.solve | Unsupported |
| 293 | torch.Tensor.sort | Supported |
| 294 | torch.Tensor.split | Supported |
| 295 | torch.Tensor.sparse_mask | Unsupported |
| 296 | torch.Tensor.sparse_dim | Unsupported |
| 297 | torch.Tensor.sqrt | Supported |
| 298 | torch.Tensor.sqrt_ | Supported |
| 299 | torch.Tensor.square | Supported |
| 300 | torch.Tensor.square_ | Supported |
| 301 | torch.Tensor.squeeze | Supported |
| 302 | torch.Tensor.squeeze_ | Supported |
| 303 | torch.Tensor.std | Supported |
| 304 | torch.Tensor.stft | Unsupported |
| 305 | torch.Tensor.storage | Supported |
| 306 | torch.Tensor.storage_offset | Supported |
| 307 | torch.Tensor.storage_type | Supported |
| 308 | torch.Tensor.stride | Supported |
| 309 | torch.Tensor.sub | Supported |
| 310 | torch.Tensor.sub_ | Supported |
| 311 | torch.Tensor.sum | Supported |
| 312 | torch.Tensor.sum_to_size | Supported |
| 313 | torch.Tensor.svd | Unsupported |
| 314 | torch.Tensor.symeig | Unsupported |
| 315 | torch.Tensor.t | Supported |
| 316 | torch.Tensor.t_ | Supported |
| 317 | torch.Tensor.to | Supported |
| 318 | torch.Tensor.to_mkldnn | Unsupported |
| 319 | torch.Tensor.take | Supported |
| 320 | torch.Tensor.tan | Supported |
| 321 | torch.Tensor.tan_ | Supported |
| 322 | torch.Tensor.tanh | Supported |
| 323 | torch.Tensor.tanh_ | Supported |
| 324 | torch.Tensor.tolist | Supported |
| 325 | torch.Tensor.topk | Supported |
| 326 | torch.Tensor.to_sparse | Unsupported |
| 327 | torch.Tensor.trace | Unsupported |
| 328 | torch.Tensor.transpose | Supported |
| 329 | torch.Tensor.transpose_ | Supported |
| 330 | torch.Tensor.triangular_solve | Unsupported |
| 331 | torch.Tensor.tril | Supported |
| 332 | torch.Tensor.tril_ | Supported |
| 333 | torch.Tensor.triu | Supported |
| 334 | torch.Tensor.triu_ | Supported |
| 335 | torch.Tensor.true_divide | Supported |
| 336 | torch.Tensor.true_divide_ | Supported |
| 337 | torch.Tensor.trunc | Supported |
| 338 | torch.Tensor.trunc_ | Supported |
| 339 | torch.Tensor.type | Supported |
| 340 | torch.Tensor.type_as | Supported |
| 341 | torch.Tensor.unbind | Supported |
| 342 | torch.Tensor.unfold | Supported |
| 343 | torch.Tensor.uniform_ | Supported |
| 344 | torch.Tensor.unique | Supported |
| 345 | torch.Tensor.unique_consecutive | Unsupported |
| 346 | torch.Tensor.unsqueeze | Supported |
| 347 | torch.Tensor.unsqueeze_ | Supported |
| 348 | torch.Tensor.values | Unsupported |
| 349 | torch.Tensor.var | Unsupported |
| 350 | torch.Tensor.view | Supported |
| 351 | torch.Tensor.view_as | Supported |
| 352 | torch.Tensor.where | Supported |
| 353 | torch.Tensor.zero_ | Supported |
| 354 | torch.BoolTensor | Supported |
| 355 | torch.BoolTensor.all | Supported |
| 356 | torch.BoolTensor.any | Supported |
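
Two entries worth noting: Tensor.cuda and Tensor.numpy are listed as unsupported, while Tensor.to, Tensor.cpu, and Tensor.item are supported. The sketch below therefore moves data with the supported methods; the `"npu:0"` device string is an assumption about the Ascend plugin and is left commented out so the snippet also runs on CPU-only builds.

```python
import torch

t = torch.randn(2, 2)
# t = t.to("npu:0")        # Tensor.to: supported (use instead of Tensor.cuda)
host = t.cpu()             # Tensor.cpu: supported
value = host[0, 0].item()  # Tensor.item: supported
mask = host.gt(0)          # Tensor.gt: supported
print(value, mask.any())   # torch.BoolTensor.any: supported
```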

## Layers \(torch.nn\)

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.nn.Parameter | Supported |
| 2 | torch.nn.Module | Supported |
| 3 | torch.nn.Module.add_module | Supported |
| 4 | torch.nn.Module.apply | Supported |
| 5 | torch.nn.Module.bfloat16 | Unsupported |
| 6 | torch.nn.Module.buffers | Supported |
| 7 | torch.nn.Module.children | Supported |
| 8 | torch.nn.Module.cpu | Supported |
| 9 | torch.nn.Module.cuda | Unsupported |
| 10 | torch.nn.Module.double | Unsupported |
| 11 | torch.nn.Module.dump_patches | Supported |
| 12 | torch.nn.Module.eval | Supported |
| 13 | torch.nn.Module.extra_repr | Supported |
| 14 | torch.nn.Module.float | Supported |
| 15 | torch.nn.Module.forward | Supported |
| 16 | torch.nn.Module.half | Supported |
| 17 | torch.nn.Module.load_state_dict | Supported |
| 18 | torch.nn.Module.modules | Supported |
| 19 | torch.nn.Module.named_buffers | Supported |
| 20 | torch.nn.Module.named_children | Supported |
| 21 | torch.nn.Module.named_modules | Supported |
| 22 | torch.nn.Module.named_parameters | Supported |
| 23 | torch.nn.Module.parameters | Supported |
| 24 | torch.nn.Module.register_backward_hook | Supported |
| 25 | torch.nn.Module.register_buffer | Supported |
| 26 | torch.nn.Module.register_forward_hook | Supported |
| 27 | torch.nn.Module.register_forward_pre_hook | Supported |
| 28 | torch.nn.Module.register_parameter | Supported |
| 29 | torch.nn.Module.requires_grad_ | Supported |
| 30 | torch.nn.Module.state_dict | Supported |
| 31 | torch.nn.Module.to | Supported |
| 32 | torch.nn.Module.train | Supported |
| 33 | torch.nn.Module.type | Supported |
| 34 | torch.nn.Module.zero_grad | Supported |
| 35 | torch.nn.Sequential | Supported |
| 36 | torch.nn.ModuleList | Supported |
| 37 | torch.nn.ModuleList.append | Supported |
| 38 | torch.nn.ModuleList.extend | Supported |
| 39 | torch.nn.ModuleList.insert | Supported |
| 40 | torch.nn.ModuleDict | Supported |
| 41 | torch.nn.ModuleDict.clear | Supported |
| 42 | torch.nn.ModuleDict.items | Supported |
| 43 | torch.nn.ModuleDict.keys | Supported |
| 44 | torch.nn.ModuleDict.pop | Supported |
| 45 | torch.nn.ModuleDict.update | Supported |
| 46 | torch.nn.ModuleDict.values | Supported |
| 47 | torch.nn.ParameterList | Supported |
| 48 | torch.nn.ParameterList.append | Supported |
| 49 | torch.nn.ParameterList.extend | Supported |
| 50 | torch.nn.ParameterDict | Supported |
| 51 | torch.nn.ParameterDict.clear | Supported |
| 52 | torch.nn.ParameterDict.items | Supported |
| 53 | torch.nn.ParameterDict.keys | Supported |
| 54 | torch.nn.ParameterDict.pop | Supported |
| 55 | torch.nn.ParameterDict.update | Supported |
| 56 | torch.nn.ParameterDict.values | Supported |
| 57 | torch.nn.Conv1d | Supported |
| 58 | torch.nn.Conv2d | Supported |
| 59 | torch.nn.Conv3d | Supported |
| 60 | torch.nn.ConvTranspose1d | Unsupported |
| 61 | torch.nn.ConvTranspose2d | Supported |
| 62 | torch.nn.ConvTranspose3d | Unsupported |
| 63 | torch.nn.Unfold | Unsupported |
| 64 | torch.nn.Fold | Supported |
| 65 | torch.nn.MaxPool1d | Supported |
| 66 | torch.nn.MaxPool2d | Supported |
| 67 | torch.nn.MaxPool3d | Supported |
| 68 | torch.nn.MaxUnpool1d | Unsupported |
| 69 | torch.nn.MaxUnpool2d | Unsupported |
| 70 | torch.nn.MaxUnpool3d | Unsupported |
| 71 | torch.nn.AvgPool1d | Supported |
| 72 | torch.nn.AvgPool2d | Supported |
| 73 | torch.nn.AvgPool3d | Supported |
| 74 | torch.nn.FractionalMaxPool2d | Unsupported |
| 75 | torch.nn.LPPool1d | Supported |
| 76 | torch.nn.LPPool2d | Supported |
| 77 | torch.nn.AdaptiveMaxPool1d | Unsupported |
| 78 | torch.nn.AdaptiveMaxPool2d | Unsupported |
| 79 | torch.nn.AdaptiveMaxPool3d | Unsupported |
| 80 | torch.nn.AdaptiveAvgPool1d | Supported |
| 81 | torch.nn.AdaptiveAvgPool2d | Supported |
| 82 | torch.nn.AdaptiveAvgPool3d | Unsupported |
| 83 | torch.nn.ReflectionPad1d | Unsupported |
| 84 | torch.nn.ReflectionPad2d | Unsupported |
| 85 | torch.nn.ReplicationPad1d | Unsupported |
| 86 | torch.nn.ReplicationPad2d | Unsupported |
| 87 | torch.nn.ReplicationPad3d | Unsupported |
| 88 | torch.nn.ZeroPad2d | Supported |
| 89 | torch.nn.ConstantPad1d | Supported |
| 90 | torch.nn.ConstantPad2d | Supported |
| 91 | torch.nn.ConstantPad3d | Supported |
| 92 | torch.nn.ELU | Supported |
| 93 | torch.nn.Hardshrink | Supported |
| 94 | torch.nn.Hardtanh | Supported |
| 95 | torch.nn.LeakyReLU | Supported |
| 96 | torch.nn.LogSigmoid | Supported |
| 97 | torch.nn.MultiheadAttention | Unsupported |
| 98 | torch.nn.PReLU | Unsupported |
| 99 | torch.nn.ReLU | Supported |
| 100 | torch.nn.ReLU6 | Supported |
| 101 | torch.nn.RReLU | Unsupported |
| 102 | torch.nn.SELU | Supported |
| 103 | torch.nn.CELU | Supported |
| 104 | torch.nn.GELU | Supported |
| 105 | torch.nn.Sigmoid | Supported |
| 106 | torch.nn.Softplus | Unsupported |
| 107 | torch.nn.Softshrink | Supported. However, it is not supported in SoftShrink scenarios currently. |
| 108 | torch.nn.Softsign | Supported |
| 109 | torch.nn.Tanh | Supported |
| 110 | torch.nn.Tanhshrink | Supported |
| 111 | torch.nn.Threshold | Supported |
| 112 | torch.nn.Softmin | Supported |
| 113 | torch.nn.Softmax | Supported |
| 114 | torch.nn.Softmax2d | Supported |
| 115 | torch.nn.LogSoftmax | Unsupported |
| 116 | torch.nn.AdaptiveLogSoftmaxWithLoss | Unsupported |
| 117 | torch.nn.AdaptiveLogSoftmaxWithLoss.log_prob | Unsupported |
| 118 | torch.nn.AdaptiveLogSoftmaxWithLoss.predict | Unsupported |
| 119 | torch.nn.BatchNorm1d | Supported |
| 120 | torch.nn.BatchNorm2d | Supported |
| 121 | torch.nn.BatchNorm3d | Unsupported |
| 122 | torch.nn.GroupNorm | Supported |
| 123 | torch.nn.SyncBatchNorm | Unsupported |
| 124 | torch.nn.SyncBatchNorm.convert_sync_batchnorm | Unsupported |
| 125 | torch.nn.InstanceNorm1d | Supported |
| 126 | torch.nn.InstanceNorm2d | Supported |
| 127 | torch.nn.InstanceNorm3d | Supported |
| 128 | torch.nn.LayerNorm | Supported |
| 129 | torch.nn.LocalResponseNorm | Supported |
| 130 | torch.nn.RNNBase | Supported |
| 131 | torch.nn.RNNBase.flatten_parameters | Supported |
| 132 | torch.nn.RNN | Supported |
| 133 | torch.nn.LSTM | Supported. However, it is not supported in DynamicRNN scenarios currently. |
| 134 | torch.nn.GRU | Supported. However, it is not supported in DynamicGRUV2 scenarios currently. |
| 135 | torch.nn.RNNCell | Supported |
| 136 | torch.nn.LSTMCell | Supported |
| 137 | torch.nn.GRUCell | Supported |
| 138 | torch.nn.Transformer | Unsupported |
| 139 | torch.nn.Transformer.forward | Unsupported |
| 140 | torch.nn.Transformer.generate_square_subsequent_mask | Unsupported |
| 141 | torch.nn.TransformerEncoder | Supported |
| 142 | torch.nn.TransformerEncoder.forward | Supported |
| 143 | torch.nn.TransformerDecoder | Unsupported |
| 144 | torch.nn.TransformerDecoder.forward | Unsupported |
| 145 | torch.nn.TransformerEncoderLayer | Supported |
| 146 | torch.nn.TransformerEncoderLayer.forward | Supported |
| 147 | torch.nn.TransformerDecoderLayer | Unsupported |
| 148 | torch.nn.TransformerDecoderLayer.forward | Unsupported |
| 149 | torch.nn.Identity | Supported |
| 150 | torch.nn.Linear | Supported |
| 151 | torch.nn.Bilinear | Supported |
| 152 | torch.nn.Dropout | Supported |
| 153 | torch.nn.Dropout2d | Supported |
| 154 | torch.nn.Dropout3d | Supported |
| 155 | torch.nn.AlphaDropout | Supported |
| 156 | torch.nn.Embedding | Supported |
| 157 | torch.nn.Embedding.from_pretrained | Supported |
| 158 | torch.nn.EmbeddingBag | Unsupported |
| 159 | torch.nn.EmbeddingBag.from_pretrained | Unsupported |
| 160 | torch.nn.CosineSimilarity | Supported |
| 161 | torch.nn.PairwiseDistance | Supported |
| 162 | torch.nn.L1Loss | Supported |
| 163 | torch.nn.MSELoss | Supported |
| 164 | torch.nn.CrossEntropyLoss | Supported |
| 165 | torch.nn.CTCLoss | Supported |
| 166 | torch.nn.NLLLoss | Supported |
| 167 | torch.nn.PoissonNLLLoss | Supported |
| 168 | torch.nn.KLDivLoss | Supported |
| 169 | torch.nn.BCELoss | Supported |
| 170 | torch.nn.BCEWithLogitsLoss | Supported |
| 171 | torch.nn.MarginRankingLoss | Supported |
| 172 | torch.nn.HingeEmbeddingLoss | Supported |
| 173 | torch.nn.MultiLabelMarginLoss | Unsupported |
| 174 | torch.nn.SmoothL1Loss | Supported |
| 175 | torch.nn.SoftMarginLoss | Unsupported |
| 176 | torch.nn.MultiLabelSoftMarginLoss | Supported |
| 177 | torch.nn.CosineEmbeddingLoss | Supported |
| 178 | torch.nn.MultiMarginLoss | Unsupported |
| 179 | torch.nn.TripletMarginLoss | Supported |
| 180 | torch.nn.PixelShuffle | Supported |
| 181 | torch.nn.Upsample | Supported |
| 182 | torch.nn.UpsamplingNearest2d | Supported |
| 183 | torch.nn.UpsamplingBilinear2d | Supported |
| 184 | torch.nn.DataParallel | Unsupported |
| 185 | torch.nn.parallel.DistributedDataParallel | Supported |
| 186 | torch.nn.parallel.DistributedDataParallel.no_sync | Supported |
| 187 | torch.nn.utils.clip_grad_norm_ | Unsupported |
| 188 | torch.nn.utils.clip_grad_value_ | Unsupported |
| 189 | torch.nn.utils.parameters_to_vector | Supported |
| 190 | torch.nn.utils.vector_to_parameters | Supported |
| 197 | torch.nn.utils.prune.PruningContainer | Supported |
| 198 | torch.nn.utils.prune.PruningContainer.add_pruning_method | Unsupported |
| 199 | torch.nn.utils.prune.PruningContainer.apply | Supported |
| 200 | torch.nn.utils.prune.PruningContainer.apply_mask | Unsupported |
| 201 | torch.nn.utils.prune.PruningContainer.compute_mask | Supported |
| 202 | torch.nn.utils.prune.PruningContainer.prune | Supported |
| 203 | torch.nn.utils.prune.PruningContainer.remove | Unsupported |
| 204 | torch.nn.utils.prune.Identity | Supported |
| 205 | torch.nn.utils.prune.Identity.apply | Supported |
| 206 | torch.nn.utils.prune.Identity.apply_mask | Unsupported |
| 207 | torch.nn.utils.prune.Identity.prune | Supported |
| 208 | torch.nn.utils.prune.Identity.remove | Unsupported |
| 209 | torch.nn.utils.prune.RandomUnstructured | Supported |
| 210 | torch.nn.utils.prune.RandomUnstructured.apply | Unsupported |
| 211 | torch.nn.utils.prune.RandomUnstructured.apply_mask | Unsupported |
| 212 | torch.nn.utils.prune.RandomUnstructured.prune | Unsupported |
| 213 | torch.nn.utils.prune.RandomUnstructured.remove | Unsupported |
| 214 | torch.nn.utils.prune.L1Unstructured | Supported |
| 215 | torch.nn.utils.prune.L1Unstructured.apply | Unsupported |
| 216 | torch.nn.utils.prune.L1Unstructured.apply_mask | Unsupported |
| 217 | torch.nn.utils.prune.L1Unstructured.prune | Unsupported |
| 218 | torch.nn.utils.prune.L1Unstructured.remove | Unsupported |
| 219 | torch.nn.utils.prune.RandomStructured | Supported |
| 220 | torch.nn.utils.prune.RandomStructured.apply | Supported |
| 221 | torch.nn.utils.prune.RandomStructured.apply_mask | Unsupported |
| 222 | torch.nn.utils.prune.RandomStructured.compute_mask | Supported |
| 223 | torch.nn.utils.prune.RandomStructured.prune | Supported |
| 224 | torch.nn.utils.prune.RandomStructured.remove | Unsupported |
| 225 | torch.nn.utils.prune.LnStructured | Supported |
| 226 | torch.nn.utils.prune.LnStructured.apply | Unsupported |
| 227 | torch.nn.utils.prune.LnStructured.apply_mask | Unsupported |
| 228 | torch.nn.utils.prune.LnStructured.compute_mask | Unsupported |
| 229 | torch.nn.utils.prune.LnStructured.prune | Unsupported |
| 230 | torch.nn.utils.prune.LnStructured.remove | Unsupported |
| 231 | torch.nn.utils.prune.CustomFromMask | Supported |
| 232 | torch.nn.utils.prune.CustomFromMask.apply | Supported |
| 233 | torch.nn.utils.prune.CustomFromMask.apply_mask | Unsupported |
| 234 | torch.nn.utils.prune.CustomFromMask.prune | Supported |
| 235 | torch.nn.utils.prune.CustomFromMask.remove | Unsupported |
| 236 | torch.nn.utils.prune.identity | Supported |
| 237 | torch.nn.utils.prune.random_unstructured | Unsupported |
| 238 | torch.nn.utils.prune.l1_unstructured | Unsupported |
| 239 | torch.nn.utils.prune.random_structured | Supported |
| 240 | torch.nn.utils.prune.ln_structured | Unsupported |
| 241 | torch.nn.utils.prune.global_unstructured | Unsupported |
| 242 | torch.nn.utils.prune.custom_from_mask | Supported |
| 243 | torch.nn.utils.prune.remove | Supported |
| 244 | torch.nn.utils.prune.is_pruned | Supported |
| 245 | torch.nn.utils.weight_norm | Supported |
| 246 | torch.nn.utils.remove_weight_norm | Supported |
| 247 | torch.nn.utils.spectral_norm | Supported |
| 248 | torch.nn.utils.remove_spectral_norm | Unsupported |
| 249 | torch.nn.utils.rnn.PackedSequence | Supported |
| 250 | torch.nn.utils.rnn.pack_padded_sequence | Supported |
| 251 | torch.nn.utils.rnn.pad_packed_sequence | Unsupported |
| 252 | torch.nn.utils.rnn.pad_sequence | Supported |
| 253 | torch.nn.utils.rnn.pack_sequence | Unsupported |
| 254 | torch.nn.Flatten | Supported |
| 255 | torch.quantization.quantize | Unsupported |
| 256 | torch.quantization.quantize_dynamic | Unsupported |
| 257 | torch.quantization.quantize_qat | Unsupported |
| 258 | torch.quantization.prepare | Unsupported |
| 259 | torch.quantization.prepare_qat | Unsupported |
| 260 | torch.quantization.convert | Unsupported |
| 261 | torch.quantization.QConfig | Unsupported |
| 262 | torch.quantization.QConfigDynamic | Unsupported |
| 263 | torch.quantization.fuse_modules | Unsupported |
| 264 | torch.quantization.QuantStub | Unsupported |
| 265 | torch.quantization.DeQuantStub | Unsupported |
| 266 | torch.quantization.QuantWrapper | Unsupported |
| 267 | torch.quantization.add_quant_dequant | Unsupported |
| 268 | torch.quantization.add_observer_ | Unsupported |
| 269 | torch.quantization.swap_module | Unsupported |
| 270 | torch.quantization.propagate_qconfig_ | Unsupported |
| 271 | torch.quantization.default_eval_fn | Unsupported |
| 272 | torch.quantization.MinMaxObserver | Unsupported |
| 273 | torch.quantization.MovingAverageMinMaxObserver | Unsupported |
| 274 | torch.quantization.PerChannelMinMaxObserver | Unsupported |
| 275 | torch.quantization.MovingAveragePerChannelMinMaxObserver | Unsupported |
| 276 | torch.quantization.HistogramObserver | Unsupported |
| 277 | torch.quantization.FakeQuantize | Unsupported |
| 278 | torch.quantization.NoopObserver | Unsupported |
| 279 | torch.quantization.get_observer_dict | Unsupported |
| 280 | torch.quantization.RecordingObserver | Unsupported |
| 281 | torch.nn.intrinsic.ConvBn2d | Supported |
| 282 | torch.nn.intrinsic.ConvBnReLU2d | Supported |
| 283 | torch.nn.intrinsic.ConvReLU2d | Supported |
| 284 | torch.nn.intrinsic.ConvReLU3d | Unsupported |
| 285 | torch.nn.intrinsic.LinearReLU | Supported |
| 286 | torch.nn.intrinsic.qat.ConvBn2d | Unsupported |
| 287 | torch.nn.intrinsic.qat.ConvBnReLU2d | Unsupported |
| 288 | torch.nn.intrinsic.qat.ConvReLU2d | Unsupported |
| 289 | torch.nn.intrinsic.qat.LinearReLU | Unsupported |
| 290 | torch.nn.intrinsic.quantized.ConvReLU2d | Unsupported |
| 291 | torch.nn.intrinsic.quantized.ConvReLU3d | Unsupported |
| 292 | torch.nn.intrinsic.quantized.LinearReLU | Unsupported |
| 293 | torch.nn.qat.Conv2d | Unsupported |
| 294 | torch.nn.qat.Conv2d.from_float | Unsupported |
| 295 | torch.nn.qat.Linear | Unsupported |
| 296 | torch.nn.qat.Linear.from_float | Unsupported |
| 297 | torch.nn.quantized.functional.relu | Unsupported |
| 298 | torch.nn.quantized.functional.linear | Unsupported |
| 299 | torch.nn.quantized.functional.conv2d | Unsupported |
| 300 | torch.nn.quantized.functional.conv3d | Unsupported |
| 301 | torch.nn.quantized.functional.max_pool2d | Unsupported |
| 302 | torch.nn.quantized.functional.adaptive_avg_pool2d | Unsupported |
| 303 | torch.nn.quantized.functional.avg_pool2d | Unsupported |
| 304 | torch.nn.quantized.functional.interpolate | Unsupported |
| 305 | torch.nn.quantized.functional.upsample | Unsupported |
| 306 | torch.nn.quantized.functional.upsample_bilinear | Unsupported |
| 307 | torch.nn.quantized.functional.upsample_nearest | Unsupported |
| 308 | torch.nn.quantized.ReLU | Unsupported |
| 309 | torch.nn.quantized.ReLU6 | Unsupported |
| 310 | torch.nn.quantized.Conv2d | Unsupported |
| 311 | torch.nn.quantized.Conv2d.from_float | Unsupported |
| 312 | torch.nn.quantized.Conv3d | Unsupported |
| 313 | torch.nn.quantized.Conv3d.from_float | Unsupported |
| 314 | torch.nn.quantized.FloatFunctional | Unsupported |
| 315 | torch.nn.quantized.QFunctional | Unsupported |
| 316 | torch.nn.quantized.Quantize | Unsupported |
| 317 | torch.nn.quantized.DeQuantize | Unsupported |
| 318 | torch.nn.quantized.Linear | Unsupported |
| 319 | torch.nn.quantized.Linear.from_float | Unsupported |
| 320 | torch.nn.quantized.dynamic.Linear | Unsupported |
| 321 | torch.nn.quantized.dynamic.Linear.from_float | Unsupported |
| 322 | torch.nn.quantized.dynamic.LSTM | Unsupported |
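
Most common building blocks are supported, so a small network assembles directly from the table. The sketch below uses only layers marked Supported (Conv2d, BatchNorm2d, ReLU, AdaptiveAvgPool2d, Flatten, Linear, Sequential); ReLU stands in for PReLU, which the table lists as unsupported.

```python
import torch
import torch.nn as nn

class SmallNet(nn.Module):
    def __init__(self, num_classes: int = 10):
        super().__init__()
        # Every layer here is marked "Supported" in the table above.
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),                 # PReLU is unsupported; ReLU is the stand-in
            nn.AdaptiveAvgPool2d(1),
        )
        self.head = nn.Sequential(nn.Flatten(), nn.Linear(16, num_classes))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(self.features(x))

net = SmallNet()
print(net(torch.randn(1, 3, 32, 32)).shape)  # torch.Size([1, 10])
```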

## Functions (torch.nn.functional)

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.nn.functional.conv1d | Supported |
| 2 | torch.nn.functional.conv2d | Supported |
| 3 | torch.nn.functional.conv3d | Supported |
| 4 | torch.nn.functional.conv_transpose1d | Supported |
| 5 | torch.nn.functional.conv_transpose2d | Supported |
| 6 | torch.nn.functional.conv_transpose3d | Unsupported |
| 7 | torch.nn.functional.unfold | Unsupported |
| 8 | torch.nn.functional.fold | Supported |
| 9 | torch.nn.functional.avg_pool1d | Supported |
| 10 | torch.nn.functional.avg_pool2d | Supported |
| 11 | torch.nn.functional.avg_pool3d | Supported |
| 12 | torch.nn.functional.max_pool1d | Supported |
| 13 | torch.nn.functional.max_pool2d | Supported |
| 14 | torch.nn.functional.max_pool3d | Supported |
| 15 | torch.nn.functional.max_unpool1d | Unsupported |
| 16 | torch.nn.functional.max_unpool2d | Unsupported |
| 17 | torch.nn.functional.max_unpool3d | Unsupported |
| 18 | torch.nn.functional.lp_pool1d | Supported |
| 19 | torch.nn.functional.lp_pool2d | Supported |
| 20 | torch.nn.functional.adaptive_max_pool1d | Supported |
| 21 | torch.nn.functional.adaptive_max_pool2d | Supported |
| 22 | torch.nn.functional.adaptive_max_pool3d | Unsupported |
| 23 | torch.nn.functional.adaptive_avg_pool1d | Supported |
| 24 | torch.nn.functional.adaptive_avg_pool2d | Supported |
| 25 | torch.nn.functional.adaptive_avg_pool3d | Supported |
| 26 | torch.nn.functional.threshold | Supported |
| 27 | torch.nn.functional.threshold_ | Supported |
| 28 | torch.nn.functional.relu | Supported |
| 29 | torch.nn.functional.relu_ | Supported |
| 30 | torch.nn.functional.hardtanh | Supported |
| 31 | torch.nn.functional.hardtanh_ | Supported |
| 32 | torch.nn.functional.relu6 | Supported |
| 33 | torch.nn.functional.elu | Supported |
| 34 | torch.nn.functional.elu_ | Supported |
| 35 | torch.nn.functional.selu | Supported |
| 36 | torch.nn.functional.celu | Supported |
| 37 | torch.nn.functional.leaky_relu | Supported |
| 38 | torch.nn.functional.leaky_relu_ | Supported |
| 39 | torch.nn.functional.prelu | Supported |
| 40 | torch.nn.functional.rrelu | Unsupported |
| 41 | torch.nn.functional.rrelu_ | Unsupported |
| 42 | torch.nn.functional.glu | Supported |
| 43 | torch.nn.functional.gelu | Supported |
| 44 | torch.nn.functional.logsigmoid | Supported |
| 45 | torch.nn.functional.hardshrink | Supported |
| 46 | torch.nn.functional.tanhshrink | Supported |
| 47 | torch.nn.functional.softsign | Supported |
| 48 | torch.nn.functional.softplus | Supported |
| 49 | torch.nn.functional.softmin | Supported |
| 50 | torch.nn.functional.softmax | Supported |
| 51 | torch.nn.functional.softshrink | Unsupported |
| 52 | torch.nn.functional.gumbel_softmax | Unsupported |
| 53 | torch.nn.functional.log_softmax | Supported |
| 54 | torch.nn.functional.tanh | Supported |
| 55 | torch.nn.functional.sigmoid | Supported |
| 56 | torch.nn.functional.batch_norm | Supported |
| 57 | torch.nn.functional.instance_norm | Supported |
| 58 | torch.nn.functional.layer_norm | Supported |
| 59 | torch.nn.functional.local_response_norm | Supported |
| 60 | torch.nn.functional.normalize | Supported |
| 61 | torch.nn.functional.linear | Supported |
| 62 | torch.nn.functional.bilinear | Supported |
| 63 | torch.nn.functional.dropout | Supported |
| 64 | torch.nn.functional.alpha_dropout | Supported |
| 65 | torch.nn.functional.dropout2d | Unsupported |
| 66 | torch.nn.functional.dropout3d | Unsupported |
| 67 | torch.nn.functional.embedding | Supported |
| 68 | torch.nn.functional.embedding_bag | Unsupported |
| 69 | torch.nn.functional.one_hot | Supported |
| 70 | torch.nn.functional.pairwise_distance | Supported |
| 71 | torch.nn.functional.cosine_similarity | Supported |
| 72 | torch.nn.functional.pdist | Supported |
| 73 | torch.nn.functional.binary_cross_entropy | Supported |
| 74 | torch.nn.functional.binary_cross_entropy_with_logits | Supported |
| 75 | torch.nn.functional.poisson_nll_loss | Supported |
| 76 | torch.nn.functional.cosine_embedding_loss | Supported |
| 77 | torch.nn.functional.cross_entropy | Supported |
| 78 | torch.nn.functional.ctc_loss | Supported |
| 79 | torch.nn.functional.hinge_embedding_loss | Supported |
| 80 | torch.nn.functional.kl_div | Supported |
| 81 | torch.nn.functional.l1_loss | Supported |
| 82 | torch.nn.functional.mse_loss | Supported |
| 83 | torch.nn.functional.margin_ranking_loss | Supported |
| 84 | torch.nn.functional.multilabel_margin_loss | Supported |
| 85 | torch.nn.functional.multilabel_soft_margin_loss | Supported |
| 86 | torch.nn.functional.multi_margin_loss | Unsupported |
| 87 | torch.nn.functional.nll_loss | Supported |
| 88 | torch.nn.functional.smooth_l1_loss | Supported |
| 89 | torch.nn.functional.soft_margin_loss | Supported |
| 90 | torch.nn.functional.triplet_margin_loss | Supported |
| 91 | torch.nn.functional.pixel_shuffle | Supported |
| 92 | torch.nn.functional.pad | Supported |
| 93 | torch.nn.functional.interpolate | Unsupported |
| 94 | torch.nn.functional.upsample | Unsupported |
| 95 | torch.nn.functional.upsample_nearest | Unsupported |
| 96 | torch.nn.functional.upsample_bilinear | Supported |
| 97 | torch.nn.functional.grid_sample | Supported |
| 98 | torch.nn.functional.affine_grid | Unsupported |
| 99 | torch.nn.parallel.data_parallel | Unsupported |
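Once a tensor resides on the NPU, calls to the functions marked Supported above work as they do on other device types. A minimal sketch, assuming the Ascend-adapted torch build and an initialized device:

```
# Minimal sketch: calling a Supported functional op on an NPU tensor
# (assumes the Ascend-adapted torch build and an initialized device).
import torch
import torch.nn.functional as F

x = torch.randn(2, 3).npu()      # move the input tensor to the NPU
probs = F.softmax(x, dim=1)      # softmax is listed as Supported above
print(probs.cpu())               # move back to the CPU for printing
```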

## torch.distributed

| No. | API Name | Supported/Unsupported |
| --- | --- | --- |
| 1 | torch.distributed.init_process_group | Supported |
| 2 | torch.distributed.Backend | Supported |
| 3 | torch.distributed.get_backend | Supported |
| 4 | torch.distributed.get_rank | Supported |
| 5 | torch.distributed.get_world_size | Supported |
| 6 | torch.distributed.is_initialized | Supported |
| 7 | torch.distributed.is_mpi_available | Supported |
| 8 | torch.distributed.is_nccl_available | Supported |
| 9 | torch.distributed.new_group | Supported |
| 10 | torch.distributed.send | Unsupported |
| 11 | torch.distributed.recv | Unsupported |
| 12 | torch.distributed.isend | Unsupported |
| 13 | torch.distributed.irecv | Unsupported |
| 14 | is_completed | Supported |
| 15 | wait | Supported |
| 16 | torch.distributed.broadcast | Supported |
| 17 | torch.distributed.all_reduce | Supported |
| 18 | torch.distributed.reduce | Unsupported |
| 19 | torch.distributed.all_gather | Supported |
| 20 | torch.distributed.gather | Unsupported |
| 21 | torch.distributed.scatter | Unsupported |
| 22 | torch.distributed.barrier | Supported |
| 23 | torch.distributed.ReduceOp | Supported |
| 24 | torch.distributed.reduce_op | Supported |
| 25 | torch.distributed.broadcast_multigpu | Unsupported |
| 26 | torch.distributed.all_reduce_multigpu | Unsupported |
| 27 | torch.distributed.reduce_multigpu | Unsupported |
| 28 | torch.distributed.all_gather_multigpu | Unsupported |
| 29 | torch.distributed.launch | Supported |
| 30 | torch.multiprocessing.spawn | Supported |
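The supported subset above covers standard data-parallel training. A minimal sketch, assuming the HCCL backend and that the rank and world size are supplied by the launcher (for example, **torch.distributed.launch**):

```
# Minimal sketch using only collectives marked Supported above.
# Assumptions: HCCL backend; rank/world_size provided by the launcher.
import torch
import torch.distributed as dist

def all_reduce_demo(rank, world_size):
    dist.init_process_group(backend="hccl", rank=rank, world_size=world_size)
    t = torch.ones(1).npu()            # tensor on the current NPU
    dist.all_reduce(t)                 # default op: sum across all ranks
    dist.barrier()                     # synchronize before reading the result
    print(f"rank {rank}: {t.item()}")  # expected value: world_size
```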

## NPU and CUDA Function Alignment

| No. | CUDA API Name | NPU API Name | Supported/Unsupported |
| --- | --- | --- | --- |
| 1 | torch.cuda.current_blas_handle | torch.npu.current_blas_handle | Unsupported |
| 2 | torch.cuda.current_device | torch.npu.current_device | Supported |
| 3 | torch.cuda.current_stream | torch.npu.current_stream | Unsupported |
| 4 | torch.cuda.default_stream | torch.npu.default_stream | Supported |
| 5 | torch.cuda.device | torch.npu.device | Unsupported |
| 6 | torch.cuda.device_count | torch.npu.device_count | Supported |
| 7 | torch.cuda.device_of | torch.npu.device_of | Unsupported |
| 8 | torch.cuda.get_device_capability | torch.npu.get_device_capability | Unsupported |
| 9 | torch.cuda.get_device_name | torch.npu.get_device_name | Unsupported |
| 10 | torch.cuda.init | torch.npu.init | Supported |
| 11 | torch.cuda.ipc_collect | torch.npu.ipc_collect | Unsupported |
| 12 | torch.cuda.is_available | torch.npu.is_available | Supported |
| 13 | torch.cuda.is_initialized | torch.npu.is_initialized | Supported |
| 14 | torch.cuda.set_device | torch.npu.set_device | Partially supported |
| 15 | torch.cuda.stream | torch.npu.stream | Supported |
| 16 | torch.cuda.synchronize | torch.npu.synchronize | Supported |
| 17 | torch.cuda.get_rng_state | torch.npu.get_rng_state | Unsupported |
| 18 | torch.cuda.get_rng_state_all | torch.npu.get_rng_state_all | Unsupported |
| 19 | torch.cuda.set_rng_state | torch.npu.set_rng_state | Unsupported |
| 20 | torch.cuda.set_rng_state_all | torch.npu.set_rng_state_all | Unsupported |
| 21 | torch.cuda.manual_seed | torch.npu.manual_seed | Unsupported |
| 22 | torch.cuda.manual_seed_all | torch.npu.manual_seed_all | Unsupported |
| 23 | torch.cuda.seed | torch.npu.seed | Unsupported |
| 24 | torch.cuda.seed_all | torch.npu.seed_all | Unsupported |
| 25 | torch.cuda.initial_seed | torch.npu.initial_seed | Unsupported |
| 26 | torch.cuda.comm.broadcast | torch.npu.comm.broadcast | Unsupported |
| 27 | torch.cuda.comm.broadcast_coalesced | torch.npu.comm.broadcast_coalesced | Unsupported |
| 28 | torch.cuda.comm.reduce_add | torch.npu.comm.reduce_add | Unsupported |
| 29 | torch.cuda.comm.scatter | torch.npu.comm.scatter | Unsupported |
| 30 | torch.cuda.comm.gather | torch.npu.comm.gather | Unsupported |
| 31 | torch.cuda.Stream | torch.npu.Stream | Supported |
| 32 | torch.cuda.Stream.query | torch.npu.Stream.query | Unsupported |
| 33 | torch.cuda.Stream.record_event | torch.npu.Stream.record_event | Supported |
| 34 | torch.cuda.Stream.synchronize | torch.npu.Stream.synchronize | Supported |
| 35 | torch.cuda.Stream.wait_event | torch.npu.Stream.wait_event | Supported |
| 36 | torch.cuda.Stream.wait_stream | torch.npu.Stream.wait_stream | Supported |
| 37 | torch.cuda.Event | torch.npu.Event | Supported |
| 38 | torch.cuda.Event.elapsed_time | torch.npu.Event.elapsed_time | Supported |
| 39 | torch.cuda.Event.from_ipc_handle | torch.npu.Event.from_ipc_handle | Unsupported |
| 40 | torch.cuda.Event.ipc_handle | torch.npu.Event.ipc_handle | Unsupported |
| 41 | torch.cuda.Event.query | torch.npu.Event.query | Supported |
| 42 | torch.cuda.Event.record | torch.npu.Event.record | Supported |
| 43 | torch.cuda.Event.synchronize | torch.npu.Event.synchronize | Supported |
| 44 | torch.cuda.Event.wait | torch.npu.Event.wait | Supported |
| 45 | torch.cuda.empty_cache | torch.npu.empty_cache | Supported |
| 46 | torch.cuda.memory_stats | torch.npu.memory_stats | Supported |
| 47 | torch.cuda.memory_summary | torch.npu.memory_summary | Supported |
| 48 | torch.cuda.memory_snapshot | torch.npu.memory_snapshot | Supported |
| 49 | torch.cuda.memory_allocated | torch.npu.memory_allocated | Supported |
| 50 | torch.cuda.max_memory_allocated | torch.npu.max_memory_allocated | Supported |
| 51 | torch.cuda.reset_max_memory_allocated | torch.npu.reset_max_memory_allocated | Supported |
| 52 | torch.cuda.memory_reserved | torch.npu.memory_reserved | Supported |
| 53 | torch.cuda.max_memory_reserved | torch.npu.max_memory_reserved | Supported |
| 54 | torch.cuda.memory_cached | torch.npu.memory_cached | Supported |
| 55 | torch.cuda.max_memory_cached | torch.npu.max_memory_cached | Supported |
| 56 | torch.cuda.reset_max_memory_cached | torch.npu.reset_max_memory_cached | Supported |
| 57 | torch.cuda.nvtx.mark | torch.npu.nvtx.mark | Unsupported |
| 58 | torch.cuda.nvtx.range_push | torch.npu.nvtx.range_push | Unsupported |
| 59 | torch.cuda.nvtx.range_pop | torch.npu.nvtx.range_pop | Unsupported |
| 60 | torch.cuda._sleep | torch.npu._sleep | Unsupported |
| 61 | torch.cuda.Stream.priority_range | torch.npu.Stream.priority_range | Unsupported |
| 62 | torch.cuda.get_device_properties | torch.npu.get_device_properties | Unsupported |
| 63 | torch.cuda.amp.GradScaler | torch.npu.amp.GradScaler | Unsupported |
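Most of the mapping is mechanical: a **torch.cuda.\*** call becomes the **torch.npu.\*** call in the same row. A minimal sketch of the device, stream, and memory APIs marked Supported above, assuming the Ascend-adapted torch build:

```
# Minimal sketch of Supported torch.npu APIs from the table above
# (assumes the Ascend-adapted torch build).
import torch

torch.npu.set_device(0)                 # per the note below: set once, at startup
print(torch.npu.is_available())         # True when the NPU runtime is reachable
print(torch.npu.device_count())         # number of visible NPUs

stream = torch.npu.Stream()             # streams and events mirror torch.cuda
with torch.npu.stream(stream):
    x = torch.ones(2, 2).npu()          # allocate a tensor on the device
stream.synchronize()
print(torch.npu.memory_allocated())     # bytes currently allocated on the device
```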
>![](public_sys-resources/icon-note.gif) **NOTE:**
>The **torch.npu.set_device()** API can specify the device only once, at the beginning of the program. The device cannot be set repeatedly or switched later by calling **torch.npu.device(id)**.

diff --git a/docs/zh/RELEASENOTE/public_sys-resources/icon-caution.gif b/docs/en/PyTorch API Support List/public_sys-resources/icon-caution.gif
similarity index 100%
rename from docs/zh/RELEASENOTE/public_sys-resources/icon-caution.gif
rename to docs/en/PyTorch API Support List/public_sys-resources/icon-caution.gif
diff --git a/docs/zh/RELEASENOTE/public_sys-resources/icon-danger.gif b/docs/en/PyTorch API Support List/public_sys-resources/icon-danger.gif
similarity index 100%
rename from docs/zh/RELEASENOTE/public_sys-resources/icon-danger.gif
rename to docs/en/PyTorch API Support List/public_sys-resources/icon-danger.gif
diff --git a/docs/zh/RELEASENOTE/public_sys-resources/icon-note.gif b/docs/en/PyTorch API Support List/public_sys-resources/icon-note.gif
similarity index 100%
rename from docs/zh/RELEASENOTE/public_sys-resources/icon-note.gif
rename to docs/en/PyTorch API Support List/public_sys-resources/icon-note.gif
diff --git a/docs/zh/RELEASENOTE/public_sys-resources/icon-notice.gif b/docs/en/PyTorch API Support List/public_sys-resources/icon-notice.gif
similarity index 100%
rename from docs/zh/RELEASENOTE/public_sys-resources/icon-notice.gif
rename to docs/en/PyTorch API Support List/public_sys-resources/icon-notice.gif
diff --git a/docs/zh/RELEASENOTE/public_sys-resources/icon-tip.gif b/docs/en/PyTorch API Support List/public_sys-resources/icon-tip.gif
similarity index 100%
rename from docs/zh/RELEASENOTE/public_sys-resources/icon-tip.gif
rename to docs/en/PyTorch API Support List/public_sys-resources/icon-tip.gif
diff --git a/docs/zh/RELEASENOTE/public_sys-resources/icon-warning.gif b/docs/en/PyTorch API Support List/public_sys-resources/icon-warning.gif
similarity index 100%
rename from docs/zh/RELEASENOTE/public_sys-resources/icon-warning.gif
rename to docs/en/PyTorch API Support List/public_sys-resources/icon-warning.gif
diff --git a/docs/en/PyTorch Installation Guide/PyTorch Installation Guide.md b/docs/en/PyTorch Installation Guide/PyTorch Installation Guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..f48cc191c6b0e314e6cc07641f341e8f5d4d16a5
--- /dev/null
+++ b/docs/en/PyTorch Installation Guide/PyTorch Installation Guide.md
@@ -0,0 +1,568 @@
# FrameworkPTAdapter 2.0.2 PyTorch Installation Guide
- [Overview](#overview.md)
- [Manual Build and Installation](#manual-build-and-installation.md)
  - [Prerequisites](#prerequisites.md)
  - [Installing the PyTorch Framework](#installing-the-pytorch-framework.md)
  - [Configuring Environment Variables](#configuring-environment-variables.md)
  - [Installing the Mixed Precision Module](#installing-the-mixed-precision-module.md)
- [Using the Ascend Hub Image](#using-the-ascend-hub-image.md)
  - [Obtaining the PyTorch Image from the Ascend Hub](#obtaining-the-pytorch-image-from-the-ascend-hub.md)
  - [Configuring Environment Variables](#configuring-environment-variables-0.md)
- [References](#references.md)
  - [Installing CMake](#installing-cmake.md)
  - [How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md)
  - [What Do I Do If "torch 1.5.0xxxx" and "torchvision" Do Not Match When torch-\*.whl Is Installed?](#what-do-i-do-if-torch-1-5-0xxxx-and-torchvision-do-not-match-when-torch--whl-is-installed.md)

## Overview

+ +When setting up the environment for PyTorch model porting and training, you can manually build and install the modules adapted to the PyTorch framework on a training server, or use the base image provided by the Ascend Hub image center \(the PyTorch module and mixed precision module have been installed in the image\). + +**Figure 1** Environment setup process +![](figures/environment-setup-process.png "environment-setup-process") + +

## Manual Build and Installation

+ +- **[Prerequisites](#prerequisites.md)** + +- **[Installing the PyTorch Framework](#installing-the-pytorch-framework.md)** + +- **[Configuring Environment Variables](#configuring-environment-variables.md)** + +- **[Installing the Mixed Precision Module](#installing-the-mixed-precision-module.md)** + + +

## Prerequisites

+ +## Prerequisites + +- The development or operating environment of CANN has been installed. For details, see the _CANN Software Installation Guide_. +- CMake 3.12.0 or later has been installed. For details about how to install CMake, see [Installing CMake](#installing-cmake.md). +- GCC 7.3.0 or later has been installed. For details about how to install and use GCC 7.3.0, see [How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md). +- The Patch and Git tools have been installed in the environment. To install the tools for Ubuntu and CentOS, run the following commands: + - Ubuntu + + ``` + apt-get install patch + apt-get install git + ``` + + - CentOS + + ``` + yum install patch + yum install git + ``` + + + +

## Installing the PyTorch Framework

## Installation Process

1. Log in to the server as the **root** user or a non-root user.
2. Run the following commands in sequence to install the PyTorch dependencies.

   If you install Python and its dependencies as a non-root user, add **--user** at the end of each command in this step. Example command: **pip3.7 install pyyaml --user**.

   ```
   pip3 install pyyaml
   pip3 install wheel
   ```

3. Obtain the PyTorch source code.

   1. Run the following command to obtain the PyTorch source code adapted to Ascend AI Processors:

      ```
      git clone https://gitee.com/ascend/pytorch.git
      ```

      The directory structure of the downloaded source code is as follows:

      ```
      pytorch
      │ ├─patch             # Directory of the patch adapted to Ascend AI Processors
      │ ├─npu.patch
      │ ├─scripts           # Directory of the build scripts
      │ ├─gen.sh
      │ ├─src               # Source code directory
      │ ├─test              # Directory for storing test cases
      │ ├─README.md
      ```

   2. Run the following commands to go to the **pytorch** directory and obtain the native PyTorch source code:

      ```
      cd pytorch
      git clone -b v1.5.0 --depth=1 https://github.com/pytorch/pytorch.git
      ```

      After the native PyTorch source code is downloaded, the main directory structure of the code is as follows:

      ```
      pytorch
      │ ├─patch             # Directory of the patch adapted to Ascend AI Processors
      │ ├─npu.patch
      │ ├─pytorch           # Directory for storing the native PyTorch code
      │ ├─scripts           # Directory of the build scripts
      │ ├─gen.sh
      │ ├─src               # Source code directory
      │ ├─test              # Directory for storing test cases
      │ ├─README.md
      ```

   3. Run the following commands to go to the native PyTorch code directory **pytorch** and obtain the third-party code that PyTorch depends on:

      ```
      cd pytorch
      git submodule sync
      git submodule update --init --recursive
      ```

      >![](public_sys-resources/icon-note.gif) **NOTE:**
      >Due to network fluctuation, it may take a long time to obtain the source code. If no error is reported after the download completes, the PyTorch source code and the third-party code it depends on have been obtained.

4. Generate the PyTorch installation package adapted to Ascend AI Processors.
   1. Go to the **pytorch/scripts** directory and run the conversion script to generate the full code adapted to Ascend AI Processors.

      ```
      cd ../scripts
      bash gen.sh
      ```

      The full code adapted to Ascend AI Processors is generated in the **pytorch/pytorch** directory.

   2. Go to the full code directory **pytorch/pytorch**, and compile and generate the binary installation package of PyTorch.

      ```
      cd ../pytorch
      bash build.sh
      ```

      The generated binary package is stored in the **pytorch/pytorch/dist** directory.

5. Install PyTorch.

   Go to the **pytorch/pytorch/dist** directory and run the following command to install PyTorch:

   ```
   pip3 install --upgrade torch-1.5.0+ascend-cp37-cp37m-linux_{arch}.whl
   ```

   **{arch}** indicates the architecture information. The value can be **aarch64** or **x86_64**.

   >![](public_sys-resources/icon-note.gif) **NOTE:**
   >To upgrade PyTorch in the environment, uninstall the PyTorch software package already installed and then perform [Step 5 Install PyTorch](#en-us_topic_0000001152776301_li49671667141) again. Run the following command to check whether PyTorch has been installed:
   >**pip3 list | grep torch**
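A quick import check confirms that the installed wheel is usable; run it after configuring the environment variables described in the next section. A minimal sketch (the exact version string shown is illustrative):

```
# Post-install sanity check (run after configuring environment variables;
# the exact version string is illustrative).
import torch
print(torch.__version__)          # e.g., 1.5.0+ascend
print(torch.npu.is_available())   # True once the driver and CANN are set up
```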

## Configuring Environment Variables

+ +After the software packages are installed, configure environment variables to use Ascend PyTorch. You are advised to build a startup script, for example, the **set\_env.sh** script, and run **source set\_env.sh** to configure the environment variables. The content of the **set\_env.sh** script is as follows \(the **root** user is used as the installation user and the default installation path is used\): + +``` +cpu_type=$(echo $HOSTTYPE) + +if [ x"${cpu_type}" == x"x86_64" ];then + cpu_type=x86_64-linux +else + cpu_type=arm64-linux +fi +if [ -d /usr/local/Ascend/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=/usr/local/Ascend/nnae/latest/ +else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest/${cpu_type} +fi +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export TASK_QUEUE_ENABLE=1 + +# (Optional) If the system is openEuler, run this command to cancel CPU core binding. 
# unset GOMP_CPU_AFFINITY

# Select an HCCL initialization method and configure the corresponding environment variables in the following scenarios. The details are as follows:
# Scenario 1: Single-node scenario
export HCCL_WHITELIST_DISABLE=1  # Disable the HCCL trustlist.
# Scenario 2: Multi-node scenario
export HCCL_WHITELIST_DISABLE=1  # Disable the HCCL trustlist.
export HCCL_IF_IP="1.1.1.1"      # 1.1.1.1 is the NIC IP address of the host. Change it based on the site requirements. Ensure that the NIC IP addresses used can communicate with each other in the cluster.
```

[Table 1](#en-us_topic_0000001152616261_table42017516135) describes the related environment variables.

**Table 1** Description of environment variables

| Environment Variable | Description |
| --- | --- |
| LD_LIBRARY_PATH | Dynamic library search path. Set this variable based on the preceding example. If you need to upgrade GCC in OSs such as CentOS, Debian, and BC-Linux, add **${install_path}/lib64** to this variable, where **{install_path}** is the GCC installation path. For details, see Step 5 in [How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md). |
| PYTHONPATH | Python search path. Set this variable based on the preceding example. |
| PATH | Executable program search path. Set this variable based on the preceding example. |
| ASCEND_OPP_PATH | Operator package (OPP) root directory. Set this variable based on the preceding example. |
| OPTION_EXEC_EXTERN_PLUGIN_PATH | Path of the operator information library. |
| ASCEND_AICPU_PATH | Path of the AI CPU operator package. |
| TASK_QUEUE_ENABLE | Whether to deliver tasks and call the ACL APIs asynchronously. You are advised to set this variable to **1** to enable the function. |
| HCCL_WHITELIST_DISABLE | Whether to enable the communication trustlist when HCCL is used. **0**: enable the trustlist (communication peers must pass trustlist verification); **1**: disable the trustlist (no verification is performed). The default value is **0**, indicating that the trustlist is enabled by default. |
| HCCL_IF_IP | IP address of the NIC used to initialize communication in HCCL. The address is in dotted decimal notation; currently, only the host NIC is supported. By default, host communication NICs are selected in the following order: NICs other than Docker/local NICs (in ascending alphabetical order of NIC names) > Docker NICs > local NICs. |
| unset GOMP_CPU_AFFINITY | (Optional) If the OS is openEuler, run this command to cancel CPU core binding. |
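In practice, the script is sourced once in each shell before training is launched. A typical session (the training entry point **train.py** is an illustrative name):

```
# Example session; train.py is an illustrative script name.
source set_env.sh                        # export the variables into this shell
env | grep -E 'HCCL|ASCEND|TASK_QUEUE'   # spot-check a few of them
python3.7 train.py                       # launch training with the variables set
```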

## Installing the Mixed Precision Module

## Prerequisites

1. The PyTorch framework adapted to Ascend AI Processors has been installed in the operating environment and runs properly.
2. Before building and installing Apex, ensure that the environment variables on which the build depends have been configured. See [Configuring Environment Variables](#configuring-environment-variables.md).

## Installation Process

1. Log in to the server as the **root** user or a non-root user.
2. Obtain the Apex source code.

   1. Run the following command to obtain the Apex source code adapted to Ascend AI Processors:

      ```
      git clone https://gitee.com/ascend/apex.git
      ```

      The directory structure of the downloaded source code is as follows:

      ```
      apex
      │ ├─patch             # Directory of the patch adapted to Ascend AI Processors
      │ ├─npu.patch
      │ ├─scripts           # Directory of the build scripts
      │ ├─gen.sh
      │ ├─src               # Source code directory
      │ ├─tests             # Directory for storing test cases
      │ ├─README.md
      ```

   2. Run the following commands to go to the **apex** directory and obtain the native Apex source code:

      ```
      cd apex
      git clone https://github.com/NVIDIA/apex.git
      ```

      After the native Apex source code is downloaded, the main directory structure of the code is as follows:

      ```
      apex
      │ ├─apex              # Directory for storing the native Apex code
      │ ├─patch             # Directory of the patch adapted to Ascend AI Processors
      │ ├─npu.patch
      │ ├─scripts           # Directory of the build scripts
      │ ├─gen.sh
      │ ├─src               # Source code directory
      │ ├─tests             # Directory for storing test cases
      │ ├─README.md
      ```

   3. Go to the native Apex code directory **apex/apex** and switch to the code branch whose commit ID is 4ef930c1c884fdca5f472ab2ce7cb9b505d26c1a:

      ```
      cd apex
      git checkout 4ef930c1c884fdca5f472ab2ce7cb9b505d26c1a
      cd ..
      ```

      >![](public_sys-resources/icon-note.gif) **NOTE:**
      >Due to network fluctuation, it may take a long time to obtain the source code.

3. Generate the Apex installation package adapted to Ascend AI Processors.
   1. Go to the **apex/scripts** directory and run the conversion script to generate the full code adapted to Ascend AI Processors.

      ```
      cd ../scripts
      bash gen.sh
      ```

      The full code adapted to Ascend AI Processors is generated in the **apex/apex** directory.

   2. Go to the full code directory **apex/apex**, and compile and generate the binary installation package of Apex.

      ```
      cd ../apex
      python3 setup.py --cpp_ext --npu_float_status bdist_wheel
      ```

      The generated binary package is stored in the **apex/apex/dist** directory.

4. Install Apex.

   Go to the **apex/apex/dist** directory and run the following command to install Apex:

   ```
   pip3.7 install --upgrade apex-0.1+ascend-cp37-cp37m-linux_{arch}.whl
   ```

   **{arch}** indicates the architecture information. The value can be **aarch64** or **x86_64**.

   >![](public_sys-resources/icon-note.gif) **NOTE:**
   >To upgrade Apex in the environment, uninstall the Apex software package already installed and then perform [Step 4 Install Apex](#en-us_topic_0000001106176190_li425495374416) again. Run the following command to check whether Apex has been installed:
   >**pip3 list | grep apex**
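After installation, the module is used through the standard Apex **amp** entry points. A minimal training-step sketch, assuming the O2 mixed-precision level and a static loss scale (the tiny model and random data are placeholders):

```
# Minimal mixed-precision sketch with the Ascend-adapted Apex.
# Assumptions: O2 opt level, static loss scale; model and data are placeholders.
import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(4, 2).npu()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128.0)

inputs = torch.randn(8, 4).npu()
targets = torch.randn(8, 2).npu()
loss = nn.functional.mse_loss(model(inputs), targets)
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()          # backward pass runs on the scaled loss
optimizer.step()
```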

## Using the Ascend Hub Image

+ +- **[Obtaining the PyTorch Image from the Ascend Hub](#obtaining-the-pytorch-image-from-the-ascend-hub.md)** + +- **[Configuring Environment Variables](#configuring-environment-variables-0.md)** + + +

## Obtaining the PyTorch Image from the Ascend Hub

## Prerequisites

- The development or operating environment of CANN has been installed. For details, see the _CANN Software Installation Guide_.
- Docker has been installed on the host.

## Obtaining and Using an Image

Log in to the [Ascend Hub](https://ascendhub.huawei.com/#/home) to obtain the required image. \(Activate the account when applying for an image for the first time.\)

[Table 1](#en-us_topic_0000001118701830_en-us_topic_0000001074498056_table1519011227314) lists the supported images. Select the image that you want.

**Table 1** Image list

| Image Name | Image Version | CANN Version |
| --- | --- | --- |
|  | 21.0.2 | 5.0.2 |

## Configuring Environment Variables

+ +After starting and entering the image container, configure the environment variables on which model training depends by referring to [Configuring Environment Variables](#configuring-environment-variables.md). + +

## References

+ +- **[Installing CMake](#installing-cmake.md)** + +- **[How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md)** + +- **[What Do I Do If "torch 1.5.0xxxx" and "torchvision" Do Not Match When torch-\*.whl Is Installed?](#what-do-i-do-if-torch-1-5-0xxxx-and-torchvision-do-not-match-when-torch--whl-is-installed.md)** + + +

## Installing CMake

+ +Procedure for upgrading CMake to 3.12.1 + +1. Obtain the CMake software package. + + ``` + wget https://cmake.org/files/v3.12/cmake-3.12.1.tar.gz --no-check-certificate + ``` + +2. Decompress the package and go to the software package directory. + + ``` + tar -xf cmake-3.12.1.tar.gz + cd cmake-3.12.1/ + ``` + +3. Run the configuration, build, and installation commands. + + ``` + ./configure --prefix=/usr/local/cmake + make && make install + ``` + +4. Set the soft link. + + ``` + ln -s /usr/local/cmake/bin/cmake /usr/bin/cmake + ``` + +5. Run the following command to check whether CMake has been installed: + + ``` + cmake --version + ``` + + If the message "cmake version 3.12.1" is displayed, the installation is successful. + + +

## How Do I Install GCC 7.3.0?

Perform the following steps as the **root** user.

1. Download **gcc-7.3.0.tar.gz** from [https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz](https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz).
2. GCC installation requires adequate temporary space. Run the following command to clear the **/tmp** directory in advance:

   ```
   sudo rm -rf /tmp/*
   ```

3. Install the dependency package. \(CentOS and Ubuntu are used as examples.\)
   - For CentOS, run the following command:

     ```
     yum install bzip2
     ```

   - For Ubuntu, run the following command:

     ```
     apt-get install bzip2
     ```

4. Build and install GCC.
   1. Go to the directory where the source package **gcc-7.3.0.tar.gz** is located and run the following command to decompress it:

      ```
      tar -zxvf gcc-7.3.0.tar.gz
      ```

   2. Go to the extracted directory and run the following command to download the GCC dependency packages:

      ```
      cd gcc-7.3.0
      ./contrib/download_prerequisites
      ```

      If an error is reported during the command execution, run the following commands in the **gcc-7.3.0/** directory to download the dependency packages:

      ```
      wget http://gcc.gnu.org/pub/gcc/infrastructure/gmp-6.1.0.tar.bz2
      wget http://gcc.gnu.org/pub/gcc/infrastructure/mpfr-3.1.4.tar.bz2
      wget http://gcc.gnu.org/pub/gcc/infrastructure/mpc-1.0.3.tar.gz
      wget http://gcc.gnu.org/pub/gcc/infrastructure/isl-0.16.1.tar.bz2
      ```

      After the preceding dependencies are downloaded, run the following command again:

      ```
      ./contrib/download_prerequisites
      ```

      If the validation fails, check whether any dependency package was downloaded more than once; each package should be downloaded only once.

   3. Run the configuration, build, and installation commands.

      ```
      ./configure --enable-languages=c,c++ --disable-multilib --with-system-zlib --prefix=/usr/local/linux_gcc7.3.0
      make -j15    # Check the number of CPUs by running grep -w processor /proc/cpuinfo|wc -l. In this example, the number is 15.
      make install
      ```

      >![](public_sys-resources/icon-notice.gif) **NOTICE:**
      >The **--prefix** option specifies the linux_gcc7.3.0 installation path and is configurable. Do not set it to **/usr/local** or **/usr**, the default installation paths of the GCC installed from the software source; otherwise, a conflict occurs and the original GCC compilation environment of the system is damaged. In this example, the installation path is set to **/usr/local/linux_gcc7.3.0**.

5. Set the environment variable.

   Training must be performed in the compilation environment with GCC upgraded. If you want to run training, configure the following environment variable in your training script:

   ```
   export LD_LIBRARY_PATH=${install_path}/lib64:${LD_LIBRARY_PATH}
   ```

   **${install_path}** indicates the GCC 7.3.0 installation path configured in [3](#en-us_topic_0000001135347812_en-us_topic_0000001173199577_en-us_topic_0000001172534867_en-us_topic_0276688294_li1649343041310). In this example, it is **/usr/local/linux_gcc7.3.0/**.

   >![](public_sys-resources/icon-note.gif) **NOTE:**
   >Skip this step if you do not need to use the compilation environment with GCC upgraded.

## What Do I Do If "torch 1.5.0xxxx" and "torchvision" Do Not Match When torch-\*.whl Is Installed?

## Symptom

During the installation of **torch-\*.whl**, the message "ERROR: torchvision 0.6.0 has requirement torch==1.5.0, but you'll have torch 1.5.0a0+1977093 which is incompatible" is displayed.

![](figures/en-us_image_0000001180656411.png)

## Possible Causes

When PyTorch is installed, a version check is automatically triggered. The version of torchvision installed in the environment is 0.6.0, and the check finds that the version of **torch-\*.whl** does not exactly match the required version 1.5.0. As a result, the error message is displayed, but the installation succeeds.

## Solution

This problem has no impact on the actual result, and no action is required.

diff --git "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/zh-cn_image_0000001152776305.png" b/docs/en/PyTorch Installation Guide/figures/en-us_image_0000001180656411.png
similarity index 100%
rename from "docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/zh-cn_image_0000001152776305.png"
rename to docs/en/PyTorch Installation Guide/figures/en-us_image_0000001180656411.png
diff --git a/docs/en/PyTorch Installation Guide/figures/environment-setup-process.png b/docs/en/PyTorch Installation Guide/figures/environment-setup-process.png
new file mode 100644
index 0000000000000000000000000000000000000000..1001d9314eaf5912713e55fc12411f07672a9ec6
Binary files /dev/null and b/docs/en/PyTorch Installation Guide/figures/environment-setup-process.png differ
diff --git a/docs/en/PyTorch Installation Guide/public_sys-resources/icon-caution.gif b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-caution.gif
new file mode 100644
index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571
Binary files /dev/null and b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-caution.gif differ
diff --git a/docs/en/PyTorch Installation Guide/public_sys-resources/icon-danger.gif b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-danger.gif
new file mode 100644
index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571
Binary files /dev/null and b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-danger.gif differ
diff --git a/docs/en/PyTorch Installation Guide/public_sys-resources/icon-note.gif b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-note.gif
new file mode 100644
index 0000000000000000000000000000000000000000..6314297e45c1de184204098efd4814d6dc8b1cda
Binary files /dev/null and b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-note.gif differ
diff --git a/docs/en/PyTorch Installation Guide/public_sys-resources/icon-notice.gif b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-notice.gif
new file mode 100644
index 0000000000000000000000000000000000000000..86024f61b691400bea99e5b1f506d9d9aef36e27
Binary files /dev/null and b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-notice.gif differ
diff --git a/docs/en/PyTorch Installation Guide/public_sys-resources/icon-tip.gif b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-tip.gif
new file mode 100644
index 0000000000000000000000000000000000000000..93aa72053b510e456b149f36a0972703ea9999b7
Binary files /dev/null and b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-tip.gif differ
diff --git a/docs/en/PyTorch Installation Guide/public_sys-resources/icon-warning.gif
b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-warning.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Installation Guide/public_sys-resources/icon-warning.gif differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/PyTorch Network Model Porting and Training Guide.md b/docs/en/PyTorch Network Model Porting and Training Guide/PyTorch Network Model Porting and Training Guide.md new file mode 100644 index 0000000000000000000000000000000000000000..3987aa95e2033affacda4761b5f6979a87e5960d --- /dev/null +++ b/docs/en/PyTorch Network Model Porting and Training Guide/PyTorch Network Model Porting and Training Guide.md @@ -0,0 +1,4081 @@ +# PyTorch Network Model Porting and Training Guide +- [Overview](#overview.md) +- [Restrictions and Limitations](#restrictions-and-limitations.md) +- [Porting Process](#porting-process.md) +- [Model Porting Evaluation](#model-porting-evaluation.md) +- [Environment Setup](#environment-setup.md) + - [Setting Up the Operating Environment](#setting-up-the-operating-environment.md) + - [Configuring Environment Variables](#configuring-environment-variables.md) +- [Model Porting](#model-porting.md) + - [Tool-Facilitated](#tool-facilitated.md) + - [Introduction](#introduction.md) + - [Instructions](#instructions.md) + - [Result Analysis](#result-analysis.md) + - [Manual](#manual.md) + - [Single-Device Training Model Porting](#single-device-training-model-porting.md) + - [Multi-Device Training Model Porting](#multi-device-training-model-porting.md) + - [Replacing PyTorch-related APIs](#replacing-pytorch-related-apis.md) + - [Mixed Precision](#mixed-precision.md) + - [Performance Optimization](#performance-optimization.md) + - [Overview](#overview-0.md) + - [Changing the CPU Performance Mode \(x86 Server\)](#changing-the-cpu-performance-mode-(x86-server).md) + - [Changing the CPU Performance Mode \(ARM Server\)](#changing-the-cpu-performance-mode-(arm-server).md) + - [Installing the High-Performance Pillow Library \(x86 Server\)](#installing-the-high-performance-pillow-library-(x86-server).md) + - [\(Optional\) Installing the OpenCV Library of the Specified Version](#(optional)-installing-the-opencv-library-of-the-specified-version.md) +- [Model Training](#model-training.md) +- [Performance Analysis and Optimization](#performance-analysis-and-optimization.md) + - [Prerequisites](#prerequisites.md) + - [Commissioning Process](#commissioning-process.md) + - [Overall Guideline](#overall-guideline.md) + - [Collecting Data Related to the Training Process](#collecting-data-related-to-the-training-process.md) + - [Performance Optimization](#performance-optimization-1.md) + - [Affinity Library](#affinity-library.md) + - [Source](#source.md) + - [Functions](#functions.md) +- [Precision Commissioning](#precision-commissioning.md) + - [Prerequisites](#prerequisites-2.md) + - [Commissioning Process](#commissioning-process-3.md) + - [Overall Guideline](#overall-guideline-4.md) + - [Precision Optimization Methods](#precision-optimization-methods.md) +- [Model Saving and Conversion](#model-saving-and-conversion.md) + - [Introduction](#introduction-5.md) + - [Saving a Model](#saving-a-model.md) + - [Exporting an ONNX Model](#exporting-an-onnx-model.md) +- [Samples](#samples.md) + - [ResNet-50 Model Porting](#resnet-50-model-porting.md) + - [Obtaining Samples](#obtaining-samples.md) + - [Porting the Training 
Script](#porting-the-training-script.md) + - [Single-Device Training Modification](#single-device-training-modification.md) + - [Distributed Training Modification](#distributed-training-modification.md) + - [Executing the Script](#executing-the-script.md) + - [ShuffleNet Model Optimization](#shufflenet-model-optimization.md) + - [Obtaining Samples](#obtaining-samples-6.md) + - [Evaluating the Model](#evaluating-the-model.md) + - [Porting the Network](#porting-the-network.md) + - [Commissioning the Network](#commissioning-the-network.md) +- [References](#references.md) + - [Single-Operator Sample Building](#single-operator-sample-building.md) + - [Single-Operator Dump Method](#single-operator-dump-method.md) + - [Common Environment Variables](#common-environment-variables.md) + - [dump op Method](#dump-op-method.md) + - [How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md) +- [FAQs](#faqs.md) + - [FAQs About Software Installation](#faqs-about-software-installation.md) + - [pip3.7 install Pillow==5.3.0 Installation Failed](#pip3-7-install-pillow-5-3-0-installation-failed.md) + - [FAQs About Model and Operator Running](#faqs-about-model-and-operator-running.md) + - [What Do I Do If the Error Message "RuntimeError: ExchangeDevice:" Is Displayed During Model or Operator Running?](#what-do-i-do-if-the-error-message-runtimeerror-exchangedevice-is-displayed-during-model-or-operator.md) + - [What Do I Do If the Error Message "Error in atexit.\_run\_exitfuncs:" Is Displayed During Model or Operator Running?](#what-do-i-do-if-the-error-message-error-in-atexit-_run_exitfuncs-is-displayed-during-model-or-operat.md) + - [What Do I Do If the Error Message "terminate called after throwing an instance of 'c10::Error' what\(\): HelpACLExecute:" Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-terminate-called-after-throwing-an-instance-of-c10-error-what()-he.md) + - [What Do I Do If the Error Message "ImportError: libhccl.so." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-importerror-libhccl-so-is-displayed-during-model-running.md) + - [What Do I Do If the Error Message "RuntimeError: Initialize." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-runtimeerror-initialize-is-displayed-during-model-running.md) + - [What Do I Do If the Error Message "TVM/te/cce error." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-tvm-te-cce-error-is-displayed-during-model-running.md) + - [What Do I Do If the Error Message "MemCopySync:drvMemcpy failed." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-memcopysync-drvmemcpy-failed-is-displayed-during-model-running.md) + - [What Do I Do If the Error Message "MemCopySync:drvMemcpy failed." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-memcopysync-drvmemcpy-failed-is-displayed-during-model-running-7.md) + - [What Do I Do If the Error Message "HelpACLExecute." 
Is Displayed After Multi-Task Delivery Is Disabled \(export TASK\_QUEUE\_ENABLE=0\) During Model Running?](#what-do-i-do-if-the-error-message-helpaclexecute-is-displayed-after-multi-task-delivery-is-disabled.md) + - [What Do I Do If the Error Message "55056 GetInputConstDataOut: ErrorNo: -1\(failed\)" Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-55056-getinputconstdataout-errorno--1(failed)-is-displayed-during.md) + - [FAQs About Model Commissioning](#faqs-about-model-commissioning.md) + - [What Do I Do If the Error Message "RuntimeError: malloc:/..../pytorch/c10/npu/NPUCachingAllocator.cpp:293 NPU error, error code is 500000." Is Displayed During Model Commissioning?](#what-do-i-do-if-the-error-message-runtimeerror-malloc-pytorch-c10-npu-npucachingallocator-cpp-293-np.md) + - [What Do I Do If the Error Message "RuntimeError: Could not run 'aten::trunc.out' with arguments from the 'NPUTensorId' backend." Is Displayed During Model Commissioning](#what-do-i-do-if-the-error-message-runtimeerror-could-not-run-aten-trunc-out-with-arguments-from-the.md) + - [What Do I Do If the MaxPoolGradWithArgmaxV1 and max Operators Report Errors During Model Commissioning?](#what-do-i-do-if-the-maxpoolgradwithargmaxv1-and-max-operators-report-errors-during-model-commissioni.md) + - [What Do I Do If the Error Message "ModuleNotFoundError: No module named 'torch.\_C'" Is Displayed When torch Is Called?](#what-do-i-do-if-the-error-message-modulenotfounderror-no-module-named-torch-_c-is-displayed-when-tor.md) + - [FAQs About Other Operations](#faqs-about-other-operations.md) + - [What Do I Do If an Error Is Reported During CUDA Stream Synchronization?](#what-do-i-do-if-an-error-is-reported-during-cuda-stream-synchronization.md) + - [What Do I Do If aicpu\_kernels/libpt\_kernels.so Does Not Exist?](#what-do-i-do-if-aicpu_kernels-libpt_kernels-so-does-not-exist.md) + - [What Do I Do If the Python Process Is Residual When the npu-smi info Command Is Used to View Video Memory?](#what-do-i-do-if-the-python-process-is-residual-when-the-npu-smi-info-command-is-used-to-view-video-m.md) + - [What Do I Do If the Error Message "match op inputs failed"Is Displayed When the Dynamic Shape Is Used?](#what-do-i-do-if-the-error-message-match-op-inputs-failed-is-displayed-when-the-dynamic-shape-is-used.md) + - [What Do I Do If the Error Message "Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported" Is Displayed?](#what-do-i-do-if-the-error-message-op-type-sigmoidcrossentropywithlogitsv2-of-ops-kernel-aicoreengine.md) + - [What Do I Do If a Hook Failure Occurs?](#what-do-i-do-if-a-hook-failure-occurs.md) + - [What Do I Do If the Error Message "load state\_dict error." Is Displayed When the Weight Is Loaded?](#what-do-i-do-if-the-error-message-load-state_dict-error-is-displayed-when-the-weight-is-loaded.md) + - [FAQs About Distributed Model Training](#faqs-about-distributed-model-training.md) + - [What Do I Do If the Error Message "host not found." Is Displayed During Distributed Model Training?](#what-do-i-do-if-the-error-message-host-not-found-is-displayed-during-distributed-model-training.md) + - [What Do I Do If the Error Message "RuntimeError: connect\(\) timed out." Is Displayed During Distributed Model Training?](#what-do-i-do-if-the-error-message-runtimeerror-connect()-timed-out-is-displayed-during-distributed-m.md) +

## Overview

Currently, the Ascend AI Processor is adapted by using an online solution.

## Solution Features and Advantages

Acceleration on the Ascend AI Processor is implemented by calling operators \(OP-based\). That is, AscendCL is used to call one or more affinity operators to replace the original GPU-based implementation. [Figure 1](#fig2267112413239) shows the logical model of the implementation.

**Figure 1** Logical model

![](figures/pytorch适配逻辑结构图-优化.png)

The main reasons for selecting the online adaptation solution are as follows:

1. It inherits the dynamic graph feature of the PyTorch framework to the maximum extent.
2. It inherits the GPU usage mode of PyTorch to the maximum extent, minimizing changes to the development mode and maximizing code reuse when a model is ported to the Ascend AI Processor for training.
3. It inherits the original PyTorch architecture to the maximum extent and retains its strengths, such as automatic differentiation, dynamic dispatch, debugging, profiling, the storage sharing mechanism, and dynamic memory management on the device side.
4. It has good scalability. For new network types or structures, only the related compute operators need to be developed; framework operators, reverse graph building, and implementation mechanisms can be reused.
5. The usage and style are the same as those of the GPU-based implementation. During online adaptation, you only need to specify the Ascend AI Processor as the device in Python to develop, train, and debug the network in PyTorch, without paying attention to the underlying details of the Ascend AI Processor. This minimizes modifications and keeps porting costs low.
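Concretely, specifying the device usually reduces to replacing the CUDA device string. A hedged before-and-after sketch, assuming the Ascend-adapted torch build:

```
# Hedged porting sketch: only the device string changes.
# GPU original: device = torch.device("cuda:0")
import torch
import torch.nn as nn

device = torch.device("npu:0")          # Ascend AI Processor as the device
model = nn.Linear(4, 2).to(device)      # placeholder model moved to the NPU
x = torch.randn(1, 4).to(device)
print(model(x).cpu())
```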

## Restrictions and Limitations

- In the **infershape** phase, operators do not support unknown shape inference.
- Only the float16 data type is supported for cube computing.
- inf/nan data of the float16 type cannot be used as input or output.
- Dimension reduction is not supported for formats with more than four dimensions.
- In the current version, Apex is implemented in Python; the customized CUDA optimization kernels in Apex are not supported.
- The current version of Apex supports only the mixed precision calculation and the fused optimizer functions adapted to Ascend AI Processors.
- The restrictions on collective communication are as follows:
  - In data parallel mode, the graphs executed on different devices must be the same.
  - Only groups of 1, 2, 4, or 8 devices are supported.
  - Only the int8, int32, float16, and float32 data types are supported.

## Porting Process

Model porting refers to moving models that have been implemented in the open-source community to an Ascend AI Processor. [Figure 1](#fig759451810422) shows the model porting process.

**Figure 1** Porting process
![](figures/porting-process.png "porting-process")

**Table 1** Porting process

| Scenario | Description |
| --- | --- |
| Model selection | For details, see Model Selection. |
| Model porting evaluation | For details, see Model Porting Evaluation. |
| Operator development | For details, see the _PyTorch Operator Development Guide_. |
| Environment setup | For details, see Environment Setup. |
| Model porting | For details, see Model Porting. |
| Model training | For details, see Model Training. |
| Error analysis | For details, see "AI Core Error Analyzer Instructions" in the _CANN Log Reference (Training)_ and the _CANN Development Auxiliary Tool Guide (Training)_. |
| Performance analysis and optimization | For details, see Performance Analysis and Optimization. |
| Precision commissioning | For details, see Precision Commissioning. |
| Model saving and conversion | For details, see Model Saving and Conversion and "ATC Tool Instructions" in the _CANN Auxiliary Development Tool User Guide_. |
| Application software development | For details, see the _CANN Application Software Development Guide (C and C++, Inference)_. |
| FAQs | Describes how to prepare the environment, port models, commission models, and resolve other common problems. For details, see FAQs. |

## Model Porting Evaluation

+ +1. When selecting models, select authoritative PyTorch models as benchmarks, including but not limited to PyTorch \([example](https://github.com/pytorch/examples/tree/master/imagenet)/[vision](https://github.com/pytorch/vision)\), facebookresearch \([Detectron](https://github.com/facebookresearch/Detectron)/[detectron2](https://github.com/facebookresearch/detectron2)\), and open-mmlab \([mmdetection](https://github.com/open-mmlab/mmdetection)/[mmpose](https://github.com/open-mmlab/mmpose)\). +2. Check the operator adaptation. Before porting the original model and training script to an Ascend AI Processor, train the original model and training script on the CPU, obtain the operator information by using the dump op method, and compare the operator information with that in the _PyTorch Adapted Operator List_ to check whether the operator is supported. For details about the dump op method, see [dump op Method](#dump-op-method.md). If an operator is not supported, develop the operator. For details, see the _PyTorch Operator Development Guide_. + + >![](public_sys-resources/icon-note.gif) **NOTE:** + >You can also port the model and training script to the Ascend AI Processor for training to view the error information. For details about how to port the model and training script, see the following sections. Generally, a message is displayed, indicating that an operator \(the first operator that is not supported\) cannot run in the backend of the Ascend AI Processor. + + +

## Environment Setup

+ +- **[Setting Up the Operating Environment](#setting-up-the-operating-environment.md)** + +- **[Configuring Environment Variables](#configuring-environment-variables.md)** + + +

## Setting Up the Operating Environment

For details about how to set up the PyTorch operating environment, see the _FrameworkPTAdapter 2.0.2 PyTorch Installation Guide_.

## Configuring Environment Variables

After the software packages are installed, configure environment variables to use Ascend PyTorch. You are advised to build a startup script, for example, the **set\_env.sh** script, and run **source set\_env.sh** to configure the environment variables. The content of the **set\_env.sh** script is as follows \(the **root** user is used as the installation user and the default installation path is used\):

```
cpu_type=$(echo $HOSTTYPE)

if [ x"${cpu_type}" == x"x86_64" ];then
    cpu_type=x86_64-linux
else
    cpu_type=arm64-linux
fi
if [ -d /usr/local/Ascend/nnae/latest ];then
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
    export ASCEND_AICPU_PATH=/usr/local/Ascend/nnae/latest/
else
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
    export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest/${cpu_type}
fi
path_lib=$(python3.7 -c """
import sys
import re
result=''
for index in range(len(sys.path)):
    match_sit = re.search('-packages', sys.path[index])
    if match_sit is not None:
        match_lib = re.search('lib', sys.path[index])
        if match_lib is not None:
            end=match_lib.span()[1]
            result += sys.path[index][0:end] + ':'
        result+=sys.path[index] + '/torch/lib:'
print(result)"""
)
export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
export TASK_QUEUE_ENABLE=1

# (Optional) If the system is openEuler, run this command to cancel CPU core binding.
# unset GOMP_CPU_AFFINITY

# Select an HCCL initialization method and configure the corresponding environment variables in the following scenarios:
# Scenario 1: Single-node scenario
export HCCL_WHITELIST_DISABLE=1  # Disable the HCCL trustlist.
# Scenario 2: Multi-node scenario
export HCCL_WHITELIST_DISABLE=1  # Disable the HCCL trustlist.
export HCCL_IF_IP="1.1.1.1"      # 1.1.1.1 is the NIC IP address of the host. Change it based on the site requirements. Ensure that the NIC IP addresses used can communicate with each other in the cluster.
```

[Table 1](#en-us_topic_0000001134654416_en-us_topic_0000001152616261_table42017516135) describes related environment variables.

**Table 1** Description of environment variables

| Environment Variable | Description |
| --- | --- |
| LD_LIBRARY_PATH | Dynamic library search path. Set this variable based on the preceding example.<br>If you need to upgrade GCC in OSs such as CentOS, Debian, and BCLinux, add **${install_path}/lib64** to the **LD_LIBRARY_PATH** variable of the dynamic library search path. Replace *{install_path}* with the GCC installation path. For details, see 5. |
| PYTHONPATH | Python search path. Set this variable based on the preceding example. |
| PATH | Executable program search path. Set this variable based on the preceding example. |
| ASCEND_OPP_PATH | Operator package (OPP) root directory. Set this variable based on the preceding example. |
| OPTION_EXEC_EXTERN_PLUGIN_PATH | Path of the operator information library. |
| ASCEND_AICPU_PATH | Path of the AI CPU operator package. |
| TASK_QUEUE_ENABLE | Whether to asynchronously deliver tasks and call the ACL APIs. You are advised to set this parameter to **1** to enable this function. |
| HCCL_WHITELIST_DISABLE | Whether to enable the communication trustlist when HCCL is used.<br>• 0: enables the trustlist.<br>• 1: disables the trustlist.<br>The default value is **0**, indicating that the trustlist is enabled by default. |
| HCCL_IF_IP | IP address of the NIC for initializing communication in HCCL.<br>• The IP address is in dotted decimal notation.<br>• Currently, only the host NIC is supported.<br>By default, the host communication NICs are selected in the following sequence: NICs other than Docker/local NICs (in ascending alphabetical order of NIC names) > Docker NICs > local NICs. |
| unset GOMP_CPU_AFFINITY | (Optional) If the system is openEuler, run this command to cancel CPU core binding. |
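After the environment variables are configured, a quick way to confirm that the environment is usable is to query the device from Python. The following is a minimal sketch for illustration only, assuming Ascend PyTorch \(**torch.npu**\) is already installed; both calls are listed in the device-related API table later in this guide.

```
import torch
import torch.npu  # Ascend adaptation module

print(torch.npu.is_available())  # True if an NPU is visible to the process
print(torch.npu.device_count())  # Number of NPU devices in the environment
```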

Model Porting

+ +- **[Tool-Facilitated](#tool-facilitated.md)** + +- **[Manual](#manual.md)** + +- **[Mixed Precision](#mixed-precision.md)** + +- **[Performance Optimization](#performance-optimization.md)** + + +

Tool-Facilitated

+ +The Ascend platform provides a script conversion tool to enable you to port training scripts to Ascend AI Processors using commands. The following will provide the details. In addition to using commands, you can also use the PyTorch GPU2Ascend function integrated in MindStudio to port scripts. For details, see the _MindStudio User Guide_. + +- **[Introduction](#introduction.md)** + +- **[Instructions](#instructions.md)** + +- **[Result Analysis](#result-analysis.md)** + + +

Introduction

## Overview

The Ascend NPU is a newcomer in the AI computing field, but most training and online inference scripts are written for GPUs. Due to the architecture differences between NPUs and GPUs, GPU-based training and online inference scripts cannot be used on NPUs directly. The script conversion tool provides an automated method for converting GPU-based scripts into NPU-based scripts, reducing the learning cost and workload of manual script migration and thereby improving migration efficiency.

>![](public_sys-resources/icon-note.gif) **NOTE:**
>- msFmkTransplt provides suggestions and converts scripts based on the adaptation rules, significantly accelerating script migration and reducing the development workload. The scripts in [Table 1](#en-us_topic_0000001133095885_table4705239194613) can be directly executed after being converted. The conversion results of other scripts are for reference only; you need to perform adaptation based on the site requirements.
>- The original scripts in [Table 1](#en-us_topic_0000001133095885_table4705239194613) must be executable in the GPU environment and based on Python 3.
>- The script execution logic after conversion is the same as that before conversion.
>- This script conversion tool supports only the conversion of PyTorch training scripts.

**Table 1** Supported models

| No. | Model |
| --- | --- |
| 1 | 3D AttentionNet |
| 2 | 3D Nested_UNet |
| 3 | Advanced East |
| 4 | AlexNet |
| 5 | DeeplabV3+(Xception-JFT) |
| 6 | DeepMar |
| 7 | Densenet121 |
| 8 | DenseNet161 |
| 9 | DenseNet169 |
| 10 | DenseNet201 |
| 11 | EAST |
| 12 | FCN |
| 13 | FD-GAN |
| 14 | FOTS |
| 15 | GENet |
| 16 | GoogleNet |
| 17 | GRU |
| 18 | Inception V4 |
| 19 | InceptionV2 |
| 20 | LPRNet |
| 21 | LSTM |
| 22 | MNASNet0_5 |
| 23 | MNASNet0_75 |
| 24 | MNASNet1_0 |
| 25 | MNASNet1_3 |
| 26 | MobileNetV1 |
| 27 | MobileNetV2 |
| 28 | PNet |
| 29 | PSENet |
| 30 | RAFT |
| 31 | RecVAE |
| 32 | ResNet101 |
| 33 | ResNet152 |
| 34 | ResNet18 |
| 35 | ResNet34 |
| 36 | ResNet50 |
| 37 | Resnext101_32x8d |
| 38 | Resnext50 |
| 39 | RNet |
| 40 | Shufflenetv2 |
| 41 | SqueezeNet1_0 |
| 42 | SqueezeNet1_1 |
| 43 | U-Net |
| 44 | VAE+GAN |
| 45 | VGG11 |
| 46 | VGG11_BN |
| 47 | VGG13 |
| 48 | VGG13_BN |
| 49 | VGG16 |
| 50 | VGG16_BN |
| 51 | VGG19 |
| 52 | VGG19_BN |
| 53 | VIT-base |
| 54 | Wide_ResNet101_2 |
| 55 | Wide_ResNet50_2 |
+ +## System Requirement + +msFmkTransplt runs on Ubuntu 18.04, CentOS 7.6, and EulerOS 2.8 only. + +## Environment Setup + +Set up the development environment by referring to the _CANN Software Installation Guide_. + +

Instructions

+ +## Command-line Options + +**Table 1** Command-line options + + + + + + + + + + + + + + + + + + + + + + + + +

| Option | Description | Example Value |
| --- | --- | --- |
| -i<br>--input | • Path of the folder or file where the original script file to be converted is located.<br>• Required | • /home/username/fmktransplt<br>• /home/username/fmktransplt.py |
| -o<br>--output | • Output path of the script conversion result. A folder with the .msft suffix will be generated in the path.<br>• Required | /home/username/fmktransplt_output |
| -r<br>--rule | • Path of the JSON file for custom general conversion rules, which cover function parameter, function name, and module name modifications.<br>• Optional | /home/username/fmktransplt_rule.json |
| -h<br>--help | Help information. | - |
+ +## Customizing a Rule File + +An example of a custom conversion rule is as follows: + +``` +{ + "rules": { + "ArgsModifyRule": [ + { + "func_name": "name1", + "arg_idx": 0, + "arg_new": "agrs0" + }, + { + "func_name": "name2", + "arg_idx": 0, + "arg_new": "agrs0" + } + ], + "FuncNameModifyRule": [ + { + "old_name": "func", + "new_name": "new_func" + } + ], + "ModuleNameModifyRule": [ + { + "old_name": "module", + "new_name": "new_module", + "parent_module":"parent_module" + } + ] + } +} +``` + +**Table 2** Options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

| Option | Description |
| --- | --- |
| ArgsModifyRule | Function parameter modification |
| func_name | Function name |
| arg_idx | Parameter position |
| arg_new | New parameter |
| FuncNameModifyRule | Function name modification |
| ModuleNameModifyRule | Module name modification |
| old_name | Old name |
| new_name | New name |
| parent_module | Parent module name |
+ +## Performing Conversion + +1. Go to the directory of the script conversion tool msFmkTransplt. + + ``` + cd {Ascend-CANN-Toolkit install path}/ascend-toolkit/{version}/{arch}-linux/toolkit/tools/ms_fmk_transplt + ``` + +2. Execute msFmkTransplt. + + ``` + python3 ms_fmk_transplt.py -i original script path -o output path of the script conversion result [-r path of the JSON file for custom general conversion rules] + ``` + +3. Find the converted script in the specified output path. + +

Result Analysis

You can view the result files in the output path after the script is converted.

```
├── xxx_msft                     // Directory for storing script conversion results. The default directory is the directory of the original script. xxx indicates the name of the folder where the original script is stored.
│   ├── generated script file    // The directory structure is the same as that of the script file before conversion.
│   ├── msFmkTranspltlog.txt     // Log file generated during script conversion
│   ├── unsupported_op.xlsx      // File of the unsupported operator list
```
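The unsupported-operator list can also be triaged programmatically. The following is a minimal sketch, assuming pandas and an Excel engine such as openpyxl are installed; the output directory name and the column layout of the .xlsx file depend on your script and tool version.

```
import pandas as pd

# Read the unsupported-operator list produced by msFmkTransplt.
ops = pd.read_excel("xxx_msft/unsupported_op.xlsx")
print(ops.head())  # Inspect the first entries to plan operator adaptation work.
```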

Manual

+ +- **[Single-Device Training Model Porting](#single-device-training-model-porting.md)** + +- **[Multi-Device Training Model Porting](#multi-device-training-model-porting.md)** + +- **[Replacing PyTorch-related APIs](#replacing-pytorch-related-apis.md)** + + +

Single-Device Training Model Porting

The advantage of online adaptation is that training on the Ascend AI Processor is consistent with GPU usage. During online adaptation, **you only need to specify the device as the Ascend AI Processor in Python and in device operations** to develop, train, and debug the network in PyTorch using the Ascend AI Processor. For single-device model training, the main changes for porting are as follows:

GPU code before porting:

```
    CALCULATE_DEVICE = "cuda:0"
    torch.cuda.set_device(CALCULATE_DEVICE)
    # Two methods for porting the model to the device
    model = model.cuda() # Method 1
    model = model.to(CALCULATE_DEVICE) # Method 2
    # Port the input from host to device.
    images = images.to(CALCULATE_DEVICE)
    target = target.to(CALCULATE_DEVICE)
```

The code ported to the Ascend AI Processor is as follows:

```
    CALCULATE_DEVICE = "npu:0"
    torch.npu.set_device(CALCULATE_DEVICE)
    # Two methods for porting the model to the device
    model = model.npu() # Method 1
    model = model.to(CALCULATE_DEVICE) # Method 2
    # Port the input from host to device.
    images = images.to(CALCULATE_DEVICE)
    target = target.to(CALCULATE_DEVICE)
```

For details, see [Single-Device Training Modification](#single-device-training-modification.md).

Multi-Device Training Model Porting

To port a multi-device training model, **you need to specify the device as the Ascend AI Processor in Python and in device operations**. In addition, you can perform distributed training using PyTorch **DistributedDataParallel**, that is, run **init\_process\_group** during model initialization, and then initialize the model into a **DistributedDataParallel** model. Note that the **backend** must be set to **hccl** and the initialization mode must be shielded when **init\_process\_group** is executed.

PyTorch distributed training code example \(some code is omitted\):

```
import torch
import torch.distributed as dist
import torch.nn.parallel
def main():
    args = parser.parse_args()
    # The initialization mode needs to be shielded.
    dist.init_process_group(backend='hccl',# init_method=args.dist_url,
                            world_size=args.world_size, rank=args.rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    for epoch in range(args.start_epoch, args.epochs):
        acc1 = train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node,
                     lr_scheduler)
```

For details, see [Distributed Training Modification](#distributed-training-modification.md).

Replacing PyTorch-related APIs

1.  To enable the Ascend AI Processor to use the capabilities of the PyTorch framework, the native PyTorch framework needs to be adapted at the device layer. The APIs related to the CPU and CUDA need to be replaced for external presentation. During network porting, some device-related APIs need to be replaced with the APIs related to the Ascend AI Processor. [Table 1](#table1922064517344) lists the supported device-related APIs.

    **Table 1** Device-related APIs

| Original PyTorch API | API Adapted to the Ascend AI Processor | Description |
| --- | --- | --- |
| torch.cuda.is_available() | torch.npu.is_available() | Checks whether the device is available in the current environment (not the final result). |
| torch.cuda.current_device() | torch.npu.current_device() | Obtains the device in use. |
| torch.cuda.device_count() | torch.npu.device_count() | Obtains the number of devices in the current environment. |
| torch.cuda.set_device() | torch.npu.set_device() | Sets the device in use. |
| torch.tensor([1,2,3]).is_cuda | torch.tensor([1,2,3]).is_npu | Checks whether a tensor is in the format on the CUDA or NPU device. |
| torch.tensor([1,2,3]).cuda() | torch.tensor([1,2,3]).npu() | Converts a tensor to the format on the CUDA or NPU device. |
| torch.tensor([1,2,3]).to("cuda") | torch.tensor([1,2,3]).to('npu') | Converts a tensor to the format on the CUDA or NPU device. |
| torch.cuda.synchronize() | torch.npu.synchronize() | Waits until the event is complete. |
| torch.cuda.device | torch.npu.device | Generates a device class, which can be used to perform device-related operations. |
| torch.cuda.Stream(device) | torch.npu.Stream(device) | Generates a stream object. |
| torch.cuda.stream(Stream) | torch.npu.stream(Stream) | Mainly used for scope restriction. |
| torch.cuda.current_stream() | torch.npu.current_stream() | Obtains the current stream. |
| torch.cuda.default_stream() | torch.npu.default_stream() | Obtains the default stream. |
| device = torch.device("cuda:0") | device = torch.device("npu:0") | Specifies a device. |
| torch.autograd.profiler.profile(use_cuda=True) | torch.autograd.profiler.profile(use_npu=True) | Specifies that CUDA/NPU is used during profiler execution. |
| torch.cuda.Event() | torch.npu.Event() | Returns events on a device. |
+ +2. When building or porting a network, you need to create tensors of specified data types. The following table lists the tensors created on the Ascend AI Processor. + + **Table 2** Tensor-related APIs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

| GPU Tensor | API Adapted to the Ascend AI Processor |
| --- | --- |
| torch.tensor([1,2,3],dtype=torch.long,device='cuda') | torch.tensor([1,2,3],dtype=torch.long,device='npu') |
| torch.tensor([1,2,3],dtype=torch.int,device='cuda') | torch.tensor([1,2,3],dtype=torch.int,device='npu') |
| torch.tensor([1,2,3],dtype=torch.half,device='cuda') | torch.tensor([1,2,3],dtype=torch.half,device='npu') |
| torch.tensor([1,2,3],dtype=torch.float,device='cuda') | torch.tensor([1,2,3],dtype=torch.float,device='npu') |
| torch.tensor([1,2,3],dtype=torch.bool,device='cuda') | torch.tensor([1,2,3],dtype=torch.bool,device='npu') |
| torch.cuda.BoolTensor([1,2,3]) | torch.npu.BoolTensor([1,2,3]) |
| torch.cuda.FloatTensor([1,2,3]) | torch.npu.FloatTensor([1,2,3]) |
| torch.cuda.IntTensor([1,2,3]) | torch.npu.IntTensor([1,2,3]) |
| torch.cuda.LongTensor([1,2,3]) | torch.npu.LongTensor([1,2,3]) |
| torch.cuda.HalfTensor([1,2,3]) | torch.npu.HalfTensor([1,2,3]) |
+ + +For more APIs, see the _PyTorch API Support_. + +
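Because the NPU APIs mirror their CUDA counterparts, one common pattern is to pick the device once and keep the rest of the script device-agnostic. The following is a minimal illustrative sketch \(the model and input are toy placeholders, not part of the original samples\):

```
import torch
import torch.npu

# Pick the best available device once; the rest of the script only uses `device`.
if torch.npu.is_available():
    device = "npu:0"
    torch.npu.set_device(device)
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

model = torch.nn.Linear(4, 2).to(device)              # The model follows the device.
inputs = torch.tensor([[1., 2., 3., 4.]]).to(device)  # So does the input tensor.
print(model(inputs))
```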

Mixed Precision

## Overview

Mixed precision training, that is, using the float16 and float32 data types together in one training session, is well suited to the architecture features of the NPU chip. Replacing float32 with float16 has the following advantages:

- The memory usage of intermediate variables is reduced.
- The data transfer time decreases because the memory usage is reduced.
- The computing units of float16 provide better computing performance.

However, mixed precision training is limited by the precision range that float16 can express. If float32 is simply converted into float16, training convergence is affected. To use float16 for acceleration in some computations while ensuring training convergence, the mixed precision module Apex is used. Apex is a comprehensive optimization library that features high optimization performance and precision.

In addition to the preceding advantages, the mixed precision module Apex adapted to Ascend AI Processors can improve computing performance. Details are described as follows:

- During mixed precision calculation, Apex calculates the grad of the model. You can enable **combine\_grad** to accelerate these operations by setting the **combine\_grad** parameter of the **amp.initialize\(\)** interface to **True**.
- After the adaptation, Apex optimizes optimizers such as adadelta, adam, sgd, and lamb to adapt them to Ascend AI Processors. The resulting NPU-based fusion optimizers are consistent with the native algorithms, but the calculation speed is faster. You only need to replace the original optimizer with **apex.optimizers.\*** \(**\*** indicates the optimizer name, for example, **NpuFusedSGD**\).

## Supported Features

[Table 1](#table10717173813332) describes the functions and optimization of the mixed precision module.

**Table 1** Functions of the mixed precision module

| Function | Description |
| --- | --- |
| O1 configuration | Conv and Matmul use float16 for computing, and Softmax and BN use float32. |
| O2 configuration | BN uses float32, and others use float16. |
| Static loss scale | Parameters are statically set to ensure the convergence of mixed precision training. |
| Dynamic loss scale | The loss scale value is dynamically calculated to determine whether overflow occurs. |
+ +>![](public_sys-resources/icon-note.gif) **NOTE:** +>- In the current version, Apex is implemented using Python and does not support AscendCL or CUDA optimization. +>- Ascend AI devices do not support the original FusedLayerNorm interface module of Apex. If the original model script file uses the FusedLayerNorm interface module, you need to replace the script header file **from apex.normalization import FusedLayerNorm** with **from torch.nn import LayerNorm**. + +## Integrating Mixed Precision Module Into the PyTorch Model + +1. To use the mixed precision module Apex, you need to import the amp from the Apex library as follows: + + ``` + from apex import amp + ``` + +2. After the amp module is imported, you need to initialize the amp module so that it can modify the model, optimizer, and PyTorch internal functions. The initialization code is as follows: + + ``` + model, optimizer = amp.initialize(model, optimizer, combine_grad=True) + ``` + +3. Mark the location where the back propagation **.backward\(\)** occurs so that the amp can perform loss scaling and clear the status of each iteration. The code is as follows: + + Original code: + + ``` + loss = criterion(...) + loss.backward() + optimizer.step() + ``` + + Code after the modification to support loss scaling: + + ``` + loss = criterion(...) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + optimizer.step() + ``` + + +
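For reference, the three steps above can be combined into a single minimal training-iteration sketch. This is only an illustrative assumption of a complete flow \(the model, optimizer, and data are toy placeholders, not the original sample\):

```
import torch
import torch.nn as nn
import torch.npu
from apex import amp

device = "npu:0"
torch.npu.set_device(device)

model = nn.Linear(16, 4).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss().to(device)

# Step 2: initialize amp; combine_grad=True enables the fused gradient path described above.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", combine_grad=True)

inputs = torch.randn(8, 16).to(device)
target = torch.randint(0, 4, (8,)).to(torch.int32).to(device)  # int32 labels are recommended on NPUs.

# Step 3: scale the loss around backward().
loss = criterion(model(inputs), target)
optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```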

Performance Optimization

+ +- **[Overview](#overview-0.md)** + +- **[Changing the CPU Performance Mode \(x86 Server\)](#changing-the-cpu-performance-mode-(x86-server).md)** + +- **[Changing the CPU Performance Mode \(ARM Server\)](#changing-the-cpu-performance-mode-(arm-server).md)** + +- **[Installing the High-Performance Pillow Library \(x86 Server\)](#installing-the-high-performance-pillow-library-(x86-server).md)** + +- **[\(Optional\) Installing the OpenCV Library of the Specified Version](#(optional)-installing-the-opencv-library-of-the-specified-version.md)** + + +

Overview

+ +During PyTorch model porting and training, the number of images recognized within one second \(FPS\) for some network models is low and the performance does not meet the requirements. In this case, you need to perform the following optimization operations on the server: + +- Change the CPU performance mode. +- Install the high-performance Pillow library. + +

Changing the CPU Performance Mode \(x86 Server\)

## Setting the Power Policy to High Performance

To improve network performance, you need to set the power policy to high performance in the BIOS settings of the x86 server. The detailed operations are as follows:

1.  Log in to the iBMC WebUI, start the virtual console, and select **HTML5 Integrated Remote Console**, as shown in [Figure 1](#fig15869135420288).

    **Figure 1** Remote console
    ![](figures/remote-console.png "remote-console")

2.  On the virtual toolbar, click the startup item tool ![](figures/en-us_image_0000001106016350.png). The startup item drop-down list is displayed, as shown in [Figure 2](#fig744814574243).

    **Figure 2** Startup item tool
    ![](figures/startup-item-tool.png "startup-item-tool")

3.  In the drop-down list, select **BIOS Setup**, and click ![](figures/en-us_image_0000001152616281.png) on the toolbar to restart the server.
4.  After the system restarts, the BIOS configuration screen is displayed. Choose **Advanced** \> **Socket Configuration**. See [Figure 3](#fig4546303814).

    **Figure 3** Socket Configuration
    ![](figures/socket-configuration.png "socket-configuration")

5.  On the **Advanced Power Mgmt. Configuration** page displayed, set **Power Policy** to **Performance**. See [Figure 4](#fig15501111014442).

    **Figure 4** Setting the power policy
    ![](figures/setting-the-power-policy.png "setting-the-power-policy")

6.  Press **F10** to save the settings and reboot the server.

## Setting the CPU Mode to Performance

Perform the following steps as the **root** user:

1.  Run the following command to check the current CPU mode:

    ```
    cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
    ```

    After the preceding command is run, the current CPU mode is displayed. For details, see [Table 1](#table354392019384). If the current CPU mode is not performance, perform the following operations to set the CPU mode to performance; otherwise, skip this step.

    **Table 1** CPU mode

| Governor | Description |
| --- | --- |
| performance | The CPU runs at the maximum frequency. |
| powersave | The CPU runs at the minimum frequency. |
| userspace | The CPU runs at a frequency specified by the user. |
| ondemand | The CPU frequency is dynamically adjusted as required. Once a task needs CPU computing power, the CPU runs at the maximum frequency. If the idle time increases, the CPU frequency decreases. |
| conservative | The CPU frequency is dynamically adjusted as required. The adjustment is more conservative than that of the ondemand mode. |
| schedutil | The CPU frequency is adjusted based on the scheduler. |
2.  Run the following command to install the tool:
    -   The **ubuntu/debian** system is used as an example:

        ```
        apt-get install linux-tools-$(uname -r)
        ```

    -   The **centos/bclinux/euler** system is used as an example:

        ```
        yum install kernel-tools -y
        systemctl daemon-reload
        systemctl enable cpupower
        systemctl start cpupower
        ```

3.  Set the CPU mode to performance.

    ```
    cpupower frequency-set -g performance
    ```

4.  Perform [Step 1](#li158435131344) again to check whether the current CPU mode is set to performance.

Changing the CPU Performance Mode \(ARM Server\)

+ +## Setting the Power Policy to High Performance + +Some models that have demanding requirements on the CPUs on the host, for example, the object detection model, require complex image pre-processing. Enabling the high-performance mode of the power supply can improve performance and stability. To improve network performance, you need to set the power policy to high performance in the BIOS settings of the ARM server. The detailed operations are as follows: + +1. Log in to the iBMC WebUI, start the virtual console, and select **HTML5 Integrated Remote Console**, as shown in [Figure 1](#fig15869135420288). + + **Figure 1** Remote console + ![](figures/remote-console-0.png "remote-console-0") + +2. On the virtual toolbar, click the startup item tool ![](figures/en-us_image_0000001152616289.png). The startup item drop-down list is displayed, as shown in [Figure 2](#fig744814574243). + + **Figure 2** Startup item tool + ![](figures/startup-item-tool-1.png "startup-item-tool-1") + +3. In the drop-down list, select **BIOS Setup**, and click ![](figures/en-us_image_0000001115716581.png) on the toolbar to restart the server. +4. After the system restarts, the BIOS configuration screen is displayed. Choose **Advanced** \> **Performance Config**. See [Figure 3](#fig4546303814). + + **Figure 3** Performance Config + ![](figures/performance-config.png "performance-config") + +5. On the **Performance Config** page, set **Power Policy** to **Performance**. See [Figure 4](#fig15501111014442). + + **Figure 4** Setting the power policy + ![](figures/setting-the-power-policy-2.png "setting-the-power-policy-2") + +6. Press **F10** to save the settings and reboot the server. + +

Installing the High-Performance Pillow Library \(x86 Server\)

+ +1. Run the following command to install the dependencies for the high-performance pillow library: + + Ubuntu/Debian: + + ``` + apt-get install libtiff5-dev libjpeg8-dev libopenjp2-7-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python3-tk libharfbuzz-dev libfribidi-dev libxcb1-dev + ``` + + CentOS/BC-Linux/EulerOS: + + ``` + yum install libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel + ``` + +2. Install the high-performance Pillow library. + 1. Run the following command to uninstall the native Pillow: + + ``` + pip3.7 uninstall -y pillow + ``` + + 2. Install the pillow-simd of the SSE4 version. + + Run the following command as the **root** user. If a non-root user is used, add **--user** to the end of the command. + + ``` + pip3.7 install pillow-simd + ``` + + >![](public_sys-resources/icon-note.gif) **NOTE:** + >If the CPU supports the AVX2 instruction set, run the following command to install pillow-simd of the AVX2 version: + >``` + >CC="cc -mavx2" pip3.7 install -U --force-reinstall pillow-simd + >``` + + +3. Modify the torchvision code to solve the problem that the pillow-simd does not contain the **PILLOW\_VERSION** field. For details about how to install torchvision, see [How to Obtain](#obtaining-samples.md). + + Modify the code in line 5 of **/usr/local/python3.7.5/lib/python3.7/site-packages/torchvision/transforms/functional.py** as follows: + + ``` + try: + from PIL import Image, ImageOps, ImageEnhance,PILLOW_VERSION + except: + from PIL import Image, ImageOps, ImageEnhance + PILLOW_VERSION="7.0.0" + ``` + + +

\(Optional\) Installing the OpenCV Library of the Specified Version

+ +If the model depends on OpenCV, you are advised to install OpenCV 3.4.10 to ensure training performance. + +1. Source code: [Link](https://opencv.org/releases/) +2. Installation guide: [Link](https://docs.opencv.org/3.4.10/d7/d9f/tutorial_linux_install.html) + +

Model Training

+ +After the training scripts are migrated, set environment variables by following the instructions in [Configuring Environment Variables](#configuring-environment-variables.md) and run the **python3.7** _xxx_ command to train a model. For details, see [Executing the Script](#executing-the-script.md). + +

Performance Analysis and Optimization

+ +- **[Prerequisites](#prerequisites.md)** + +- **[Commissioning Process](#commissioning-process.md)** + +- **[Affinity Library](#affinity-library.md)** + + +

Prerequisites

+ +1. Modify the open-source code to ensure that the model can run properly, including data preprocessing, forward propagation, loss calculation, mixed precision, back propagation, and parameter update. For details, see [Samples](#samples.md). +2. During model porting, check whether the model can run properly and whether the existing operators can meet the requirements. If no operator meets the requirements, develop an adapted operator. For details, see the _PyTorch Operator Development Guide_. +3. Prioritize the single-device function, and then enable the multi-device function. + +

Commissioning Process

+ +- **[Overall Guideline](#overall-guideline.md)** + +- **[Collecting Data Related to the Training Process](#collecting-data-related-to-the-training-process.md)** + +- **[Performance Optimization](#performance-optimization-1.md)** + + +

Overall Guideline

+ +1. Check whether the throughput meets the expected requirements based on the training execution result. +2. If the throughput does not meet requirements, you need to find out the causes of the performance bottleneck. Possible causes are as follows: + - Operator bottleneck: The execution of an operator is too slow. + - Copy bottleneck: The bottleneck is caused by the copy operation during converting non-contiguous tensors to contiguous tensors. + - Framework bottleneck: Additional operations are required due to operator format conversion. + - Compilation bottleneck: Repeated compilation is caused by the changes of shape or attributes. + +3. Analyze the preceding causes of performance bottlenecks and optimize the performance. + + + +## Profile Data Collection + +If the throughput does not meet requirements, you need to collect profile data during the training process to analyze which step and which operator cause the performance consumption. To obtain profile data, perform the following steps: + +1. Obtain the **chrome\_trace** file. Use the profile API to reconstruct the loss calculation and optimization process of the original code. + + ``` + # Use the profile API adapted to Ascend-PyTorch. You are advised to run only one step. + with torch.autograd.profiler.profile(use_npu=True) as prof: + out = model(input_tensor) + loss=loss_func(out) + loss.backward() + optimizer.zero_grad() + optimizer.step() + # Export the chrome_trace file to a specified path. + prof.export_chrome_trace(output_path) + ``` + +2. To view the **chrome\_trace** file, access **chrome://tracing** in the Chrome browser, drag the file in the blank space. You can press **W**, **A**, **S**, or **D** to zoom in, zoom out, or move the profiling result. + +## Obtaining Operator Information \(OP\_INFO\) + +The network model is executed as an operator \(OP\). The OPInfo log can be used to obtain the operator and its attributes during the actual execution. Obtain the information by running the **get\_ascend\_op\_info.py** script. + +1. Write the **get\_ascend\_op\_info.py** script to obtain the operator information. The script content is as follows: + + ``` + # -*- coding: utf-8 -*- + """ Used to export operator information. + """ + import os + import sys + import argparse + + def func(host_log_folder): + """ + :param host_log_folder: where host_log_folder addr is. + :return: + """ + host_log_files = os.listdir(host_log_folder) + result = {} + + for host_log in host_log_files: + if not host_log.endswith('.log') or host_log.endswith('.out'): + continue + with open(os.path.join(host_log_folder, host_log), 'r')as f: + host_log_lines = f.readlines() + for line in host_log_lines: + if line.startswith('[INFO] ASCENDCL') and "aclopCompile::aclOp" in line: + op_info = line.split('OpType: ')[1][:-2] + op_type = op_info.split(',')[0] + op_param = op_info[len(op_type) + 2:] + if op_type not in result.keys(): + result[op_type] = [op_param] + else: + result[op_type].append(op_param) + + with open('ascend_op_info_summary.txt', 'w')as f: + for k, v in result.items(): + v_set = set(v) + for info in v_set: + f.write(k + " " + info + "\n") + + if __name__ == "__main__": + parser = argparse.ArgumentParser(description='trans the log') + parser.add_argument('--host_log_folder', default="./", + help="input the dir name, trans the current dir with default") + ags = parser.parse_args() + func(ags.host_log_folder) + ``` + +2. Set environment variable to print host logs to the screen. + + ``` + export ASCEND_SLOG_PRINT_TO_STDOUT=1 + ``` + +3. 
Set the log level to **info**. For details, see the _CANN Log Reference_. +4. Run the training script to train the model. After the training is complete, obtain the host logs. By default, the logs are stored in the **$HOME/ascend/log/plog** directory. **$HOME** indicates the root directory of the user on the host. +5. After the host logs are parsed, obtain the operator information **ascend\_op\_info\_summary.txt** in the current directory. + + ``` + python3.7 get_ascend_op_info.py --host_log_folder $HOME/ascend/log/plog + ``` + +6. Analyze the extra tasks in TaskInfo, especially transdata. + +

Performance Optimization

+ +## Operator Bottleneck Optimization + +1. Obtain the profile data during training. For details, see [Profile Data Collection](#collecting-data-related-to-the-training-process.md). +2. Analyze the profile data to obtain the time-consuming operator. +3. See [Single-Operator Sample Building](#single-operator-sample-building.md) to build the single-operator sample of the time-consuming operator, and compare the execution time of a single-operator sample on the CPU and GPU. If the performance is insufficient, use either of the following methods to solve the problem: + - Workaround: Use other efficient operators with the same semantics. + - Solution: Improve the operator performance. + + +## Copy Bottleneck Optimization + +1. Obtain the profile data during training. For details, see [Profile Data Collection](#collecting-data-related-to-the-training-process.md). +2. Analyze the Profile data to obtain the execution time of **D2DCopywithStreamSynchronize**, **PTCopy**, or **format\_contiguous** in the entire network. +3. If the execution takes a long time, use either of the following methods to solve the problem: + - Method 1 \(workaround\): Replace view operators with compute operators. In PyTorch, view operators cause conversion from non-contiguous tensors to contiguous tensors. The optimization idea is to replace view operators with compute operators. Common view operators include view, permute, and transpose operators. For more view operators, go to [https://pytorch.org/docs/stable/tensor\_view.html](https://pytorch.org/docs/stable/tensor_view.html). + - Method 2 \(solution\): Accelerate the operation of converting non-contiguous tensors to contiguous tensors. + + +## Framework Bottleneck Optimization + +1. Obtain the operator information \(OP\_INFO\) during the training. For details, see [Obtaining Operator Information \(OP\_INFO\)](#collecting-data-related-to-the-training-process.md). +2. Analyze the specifications and calling relationship of operators in OP\_INFO to check whether redundant operators are inserted. Pay special attention to check whether transdata is proper. +3. Solution: Specify the initialization format of some operators to eliminate cast operators. +4. In **pytorch/torch/nn/modules/module.py**, specify the operator initialization format in **cast\_weight**, as shown in the following figure. + + ![](figures/指定算子初始化方式.png) + + The format setting principle is as follows: + + - For the Conv2D operator, weight can be set to FZ format, for example, line 424. + - For the linear operator, weight can be set to NZ format, for example, line 409. + + +## Compilation Bottleneck Optimization + +1. Obtain the operator information \(OP\_INFO\) during the training. For details, see [Obtaining Operator Information \(OP\_INFO\)](#collecting-data-related-to-the-training-process.md). +2. View the INFO log and check the keyword **aclopCompile::aclOp** after the first step. If **Match op iunputs/type failed** or **To compile op** is displayed, the operator is dynamically compiled and needs to be optimized. +3. Use either of the following methods to solve the problem: + - Workaround: Based on the understanding of model semantics and related APIs, replace dynamic shape with static shape. + - Solution: Reduce compilation or do not compile the operator. + + +
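For the single-operator timing mentioned under Operator Bottleneck Optimization above, a suspect operator can be timed in isolation before building a full sample. The sketch below is an assumed minimal example \(the operator, shapes, and iteration count are placeholders\); it relies on **torch.npu.synchronize\(\)** from the device-related API table.

```
import time
import torch
import torch.npu

torch.npu.set_device("npu:0")
x = torch.randn(1024, 1024).npu()

torch.npu.synchronize()  # Drain pending work before starting the clock.
start = time.time()
for _ in range(100):
    y = torch.matmul(x, x)
torch.npu.synchronize()  # Wait for the NPU to finish before stopping the clock.
print("avg matmul time: {:.6f}s".format((time.time() - start) / 100))
```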

Affinity Library

+ +- **[Source](#source.md)** + +- **[Functions](#functions.md)** + + +

Source

+ +The common network structures and functions in the public models are optimized to greatly improve computing performance. In addition, the network structures and functions are integrated into the PyTorch framework to facilitate model performance optimization. + +

Functions

| Function | Location | Description |
| --- | --- | --- |
| pairwise_iou | torch.contrib.npu.optimized_lib | Calculates the IOUs of the two bounding boxes. |
| fast_rcnn_inference_single_image | torch.contrib.npu.optimized_lib | Provides the inference API of the Mask R-CNN and Faster R-CNN models. |
| ChannelShuffle | torch.contrib.npu.optimized_lib | Provides NPU-affinity channelshuffle operations and applies to models such as shufflenetv2. |
| PreLoader | torch.contrib.npu.optimized_lib | Provides the data loading method for accelerating Ascend AI Processors. |
+ +>![](public_sys-resources/icon-note.gif) **NOTE:** +>The optimization content will be enhanced and updated with the version. Use the content in the corresponding path of the actual PyTorch version. + +

Precision Commissioning

+ +- **[Prerequisites](#prerequisites-2.md)** + +- **[Commissioning Process](#commissioning-process-3.md)** + + +

Prerequisites

Run a certain number of epochs \(about 20% of the total number of epochs is recommended\) with the same semantics and hyperparameters to align the precision and loss with the corresponding level of the GPU. After the alignment is complete, align the final precision.

Commissioning Process

+ +- **[Overall Guideline](#overall-guideline-4.md)** + +- **[Precision Optimization Methods](#precision-optimization-methods.md)** + + +

Overall Guideline

To locate the precision problem, you need to find out the step in which the problem occurs. The following aspects are involved:

1.  Model network calculation error
    -   Locating method: Add a hook to the network to determine which part is suspected. Then build a [single-operator sample](#single-operator-sample-building.md) to narrow down the error range. This can prove that the operator calculation is incorrect in the current network. You can compare the result with the CPU or GPU result to prove the problem.

    -   Workaround: Use other operators with the same semantics.

    -   Solution: Improve the operator precision or function.

2.  Loss calculation error
    -   Locating method: The loss is special and can be customized. After determining that the loss calculation is incorrect, you are advised to dump the loss input in the network instead of a random tensor with the identical shape, so that the problem can be better reproduced and proved.

    -   Workaround: Use other operators with the same semantics.

    -   Solution: Improve the operator precision or function. \(Loss is also formed by operators.\)

3.  Parameter update error
    -   Locating method: Before each **optim.step\(\)**, print the gradients of the parameters in the network one by one to determine which part is suspected. Then build a single-operator sample to narrow down the error range. This can prove that the gradient calculation by the operator is incorrect in the current network. You can compare the result with the CPU or GPU result to prove the problem. The priority of this item should be lower than that of items [1](#li17755175510322) and [2](#li25281726103316) because the errors of items 1 and 2 can also cause a gradient exception.

    -   Workaround: Use other operators with the same semantics.

    -   Solution: Improve the precision or function of the operator for gradient calculation.

4.  Multi-device calculation error
    -   Locating method: If the single-device precision is correct but an error occurs once the calculation is switched to multiple devices, the problem lies in the multi-device calculation.

    -   Solution: Contact Huawei support and provide single-device and multi-device scripts that stably reproduce the problem.

Precision Optimization Methods

1.  Determine whether the calculation on the Ascend AI Processor is correct by comparing the calculation result of the CPU and that of the Ascend AI Processor.

    Code example \(this example shows only the basic method and is not intended to be copied directly\):

    ```
    # The input parameters are fixed to ensure that the model and input data are the same on the CPU and Ascend AI Processor.
    input_tensor_cpu = torch.Tensor()
    model_cpu = build_model()
    # Port the input data to the Ascend AI Processor.
    input_tensor_npu = input_tensor_cpu.npu()
    # Port the model to the Ascend AI Processor.
    model_npu = model_cpu.npu()

    # Compare the calculation results.
    output_cpu = model_cpu(input_tensor_cpu)
    output_npu = model_npu(input_tensor_npu)
    compute_result = (output_cpu - output_npu).abs().mean()
    print(compute_result)
    ```

    The calculation results are slightly different because the hardware architecture of the Ascend AI Processor is different from that of the CPU. If the calculation results are close \(generally not higher than 1e-4\), they are normal.

2.  Use the hook mechanism of PyTorch to print the inputs and outputs of the module in the forward and backward propagation for analysis.

    Code example \(this example shows only the basic method and is not intended to be copied directly\):

    ```
    # Set the hook function.
    def hook_func(name, module):
        def hook_function(module, inputs, outputs):
            print(name+' inputs', inputs)
            print(name+' outputs', outputs)
        return hook_function

    # Register the forward and backward hooks.
    for name, module in model.named_modules():
        module.register_forward_hook(hook_func('[forward]: '+name, module))
        module.register_backward_hook(hook_func('[backward]: '+name, module))

    # Execute the model.
    model(input_tensor)
    ```

    Analyze the printed inputs and outputs in the forward and backward propagation.

3.  Obtain parameters such as **grad**, **running\_mean**, and **running\_var** of the module to analyze the updates.

    Code example \(this example shows only the basic method and is not intended to be copied directly\):

    ```
    # For example, obtain the gradient and average value of BN for check.
    for name, module in model.named_modules():
        if isinstance(module, nn.modules.batchnorm._BatchNorm):
            print("[BN_buffer]: "+name, module.running_mean, module.running_var)
            print("[grad]: "+name, module.grad)
    ```

Model Saving and Conversion

+ +- **[Introduction](#introduction-5.md)** + +- **[Saving a Model](#saving-a-model.md)** + +- **[Exporting an ONNX Model](#exporting-an-onnx-model.md)** + + +

Introduction

After the model training is complete, save the model file and export the ONNX model by using the APIs provided by PyTorch. Then use the ATC tool to convert the model into an .om file that adapts to the Ascend AI Processor for offline inference.

This section describes how to convert the trained .pth or .pth.tar file into the ONNX model. For details about how to convert the ONNX model into an .om file adapted to the Ascend AI Processor, see "ATC Tool Instructions" in the _CANN Auxiliary Development Tool User Guide_.

For details about how to use the Auto Tune function, see "Auto Tune Tool Instructions" in the _CANN Auxiliary Development Tool User Guide_.

For details about how to build an offline inference application, see the _CANN Application Software Development Guide \(C and C++, Inference\)_. The process is as follows:

![](figures/en-us_image_0000001106176222.png)

Saving a Model

During PyTorch training, **torch.save\(\)** is used to save checkpoint files. Based on the usage of model files, model files are saved in the following two formats:

-   .pth or .pt files: These files are used for online inference or exporting ONNX models. Only model parameters are saved, and the model structure is not saved, so that the compressed file can be opened using a visualization tool such as Netron. [Figure 1](#fig315704722610) shows an example.

    **Figure 1** .pth file
    ![](figures/pth-file.jpg "pth-file")

    Use **state\_dict** to save and load a model. The following is an example:

    1.  Save a model.

        ```
        # Create a storage path.
        PATH = "state_dict_model.pt"
        # Save a model.
        torch.save(net.state_dict(), PATH)
        ```

    2.  Load the model for online inference. The following is an example. For details, see the _PyTorch Online Inference Guide_.

        ```
        # Path for storing the model file
        PATH = "state_dict_model.pt"
        model = TheModelClass(*args, **kwargs)
        # Load a model.
        model.load_state_dict(torch.load(PATH))
        model.eval()
        ```

    >![](public_sys-resources/icon-notice.gif) **NOTICE:**
    >The model definition file must be provided when the .pth or .pt file is saved. Otherwise, the deployment cannot be performed.

-   .pth.tar files: These files can be used for online inference or for training after reloading. Multiple components are saved in dictionary format. Common components include the **state\_dict** of the model and optimizer, the epoch when the training stops, the training loss of the latest record, and the external torch.nn.Embedding layer. If only an inference model needs to be deployed, you are advised to save only the weight information, that is, the **state\_dict** of the model, in the .pth.tar file.

    The following is an example of saving and loading a model. Note that the dictionary keys used for loading must match the keys used when the checkpoint was saved.

    1.  Save a model.

        ```
        PATH = "checkpoint.pth.tar"
        torch.save({
            'epoch': epoch,
            'loss': loss,
            'state_dict': model.state_dict(),
            'optimizer' : optimizer.state_dict(),
            ...
        }, PATH)
        ```

    2.  Load a model for inference or resuming training.

        ```
        model = TheModelClass(*args, **kwargs)
        optimizer = TheOptimizerClass(*args, **kwargs)

        checkpoint = torch.load(PATH)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']

        model.eval()
        # - or -
        model.train()
        ```

>![](public_sys-resources/icon-notice.gif) **NOTICE:**
>Generally, an operator is processed in different ways in the training graph and inference graph \(for example, BatchNorm and dropout operators\), and the input formats are also different. Therefore, before inference or ONNX model exporting, **model.eval\(\)** must be called to set the dropout and batch normalization layers to the inference mode.

Exporting an ONNX Model

## Introduction

The deployment policy of the Ascend AI Processor for PyTorch models is implemented based on the ONNX module that is supported by PyTorch. ONNX is a mainstream model format in the industry and is widely used for model sharing and deployment. This section describes how to export a checkpoint file as an ONNX model by using the **torch.onnx.export\(\)** API.

## Using the .pth or .pt File to Export the ONNX Model

The saved .pth or .pt file can be restored by building a model using PyTorch and then loading the weights. Then you can export the ONNX model. The following is an example.

```
import torch
import torch.onnx
import torchvision.models as models
# Set the CPU to be used to export the model.
device = torch.device("cpu")

def convert():
    # The model definition comes from torchvision. The model file generated in the example is based on the ResNet-50 model.
    model = models.resnet50(pretrained = False)
    resnet50_model = torch.load('resnet50.pth', map_location='cpu')
    model.load_state_dict(resnet50_model)

    batch_size = 1  # Size of the batch processing
    input_shape = (3, 224, 224)  # Input data. Replace it with the actual shape.

    # Set the model to inference mode.
    model.eval()

    dummy_input = torch.randn(batch_size, *input_shape)  # Define the input shape.
    torch.onnx.export(model,
                      dummy_input,
                      "resnet50_official.onnx",
                      input_names = ["input"],    # Construct the input name.
                      output_names = ["output"],  # Construct the output name.
                      opset_version=11,           # Currently, the ATC tool supports only opset_version=11.
                      dynamic_axes={"input":{0:"batch_size"}, "output":{0:"batch_size"}})  # Dynamic axes of the input and output are supported.

if __name__ == "__main__":
    convert()
```

>![](public_sys-resources/icon-note.gif) **NOTE:**
>- Before exporting the ONNX model, **model.eval\(\)** must be called to set the dropout and batch normalization layers to inference mode.
>- The model in the sample script comes from the definition in the torchvision module. You need to specify a model when using your own model.
>- The constructed input and output must correspond to the input and output during training. Otherwise, the inference cannot be performed properly.

## Using the .pth.tar File to Export the ONNX Model

Before exporting the ONNX model using the .pth.tar file, you need to check the saved information. Sometimes, the saved node name may be different from the node name in the model definition. For example, a prefix and suffix may be added. During the conversion, you can modify the node name. The following is an example of the conversion.

```
import torch
import torch.onnx
from collections import OrderedDict
import mobilenet

# In this example, when the .pth.tar file is saved, the prefix module. is added to the node names. Delete it by traversing.
def proc_nodes_module(checkpoint, AttrName):
    new_state_dict = OrderedDict()
    for key, value in checkpoint[AttrName].items():
        if key == "module.features.0.0.weight":
            print(value)
        if key[0:7] == "module.":
            name = key[7:]
        else:
            name = key[0:]

        new_state_dict[name] = value
    return new_state_dict

def convert():
    checkpoint = torch.load("./mobilenet_cpu.pth.tar", map_location=torch.device('cpu'))
    checkpoint['state_dict'] = proc_nodes_module(checkpoint, 'state_dict')
    model = mobilenet.mobilenet_v2(pretrained = False)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    input_names = ["actual_input_1"]
    output_names = ["output1"]
    dummy_input = torch.randn(1, 3, 224, 224)
    torch.onnx.export(model, dummy_input, "mobilenetV2_npu.onnx", input_names = input_names, output_names = output_names, opset_version=11)

if __name__ == "__main__":
    convert()
```
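After exporting, it can be worth validating the generated file before feeding it to the ATC tool. The following is a minimal check, assuming the **onnx** Python package is installed; the file name is taken from the first example above.

```
import onnx

model = onnx.load("resnet50_official.onnx")
onnx.checker.check_model(model)  # Raises an exception if the graph is malformed.
print(onnx.helper.printable_graph(model.graph))  # Optional: inspect inputs, outputs, and nodes.
```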

Samples

+ +- **[ResNet-50 Model Porting](#resnet-50-model-porting.md)** + +- **[ShuffleNet Model Optimization](#shufflenet-model-optimization.md)** + + +

ResNet-50 Model Porting

+ +- **[Obtaining Samples](#obtaining-samples.md)** + +- **[Porting the Training Script](#porting-the-training-script.md)** + +- **[Executing the Script](#executing-the-script.md)** + + +

Obtaining Samples

## How to Obtain

1.  This sample is adapted from the ImageNet training sample provided on the PyTorch official website, and has been ported and reconstructed for the Ascend 910 AI Processor. The sample can be obtained from [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
2.  This sample depends on torchvision. Therefore, you need to install the torchvision dependency. If you install it as a non-root user, add **--user** to the end of the command.

    If the server runs in the x86 environment, run the following command:

    ```
    pip3.7 install torchvision==0.6.0 --no-deps
    ```

    If the server runs in the ARM environment, run the following command:

    ```
    pip3.7 install torchvision==0.2.2.post3 --no-deps
    ```

3.  For details about the ResNet-50 model, go to [https://pytorch.org/hub/pytorch\_vision\_resnet/](https://pytorch.org/hub/pytorch_vision_resnet/). The following two methods are available:
    1.  Directly call the corresponding API. For example:

        ```
        import torchvision.models as models
        model = models.resnet50()
        ```

        >![](public_sys-resources/icon-note.gif) **NOTE:**
        >ResNet-50 is a model built in PyTorch. For more built-in models, visit the [PyTorch official website](https://pytorch.org/).

    2.  During script execution, set **arch** to **resnet50**. This method is used in the sample. For details, see [Executing the Script](#executing-the-script.md).

        ```
        --arch resnet50
        ```

## Directory Structure

The structure of major directories and files is as follows:

```
├──main.py
```

Porting the Training Script

+ +- **[Single-Device Training Modification](#single-device-training-modification.md)** + +- **[Distributed Training Modification](#distributed-training-modification.md)** + + +

Single-Device Training Modification

+ +1. Add the header file to **main.py** to support model training on the Ascend 910 AI Processor based on the PyTorch framework. + + ``` + import torch.npu + ``` + +2. Add parameters to the end of the header file in the **main.py** file to specify that the Ascend 910 AI Processor is used for training. + + ``` + CALCULATE_DEVICE = "npu:1" + ``` + +3. Modify the parameter and option so that training is performed only on the Ascend 910 AI Processor. + + Code location: **main\_worker\(\)** in **main.py** \(The changes are in bold.\) + + ``` + def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + # The original code specifies the GPU for training. The original code is as follows: + # args.gpu = gpu + ############## npu modify begin ############# + args.gpu = None + ############## npu modify end ############# + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + # The original code determines whether to perform training on the GPU. The code is as follows: + # if not torch.cuda.is_available(): + # print('using CPU, this will be slow') + # elif args.distributed: + ############## npu modify begin ############# + # After the migration, the code directly determines whether to perform distributed training and does not determine whether to perform training on the GPU. + if args.distributed: + ############## npu modify end ############# + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + ...... + ``` + +4. Migrate the model and loss function to the Ascend 910 AI Processor for calculation. + + Code location: **main\_worker\(\)** in **main.py** \(The changes are in bold.\) + + ``` + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + # The original code uses the torch.nn.DataParallel() class to accelerate training using multiple GPUs. + # model = torch.nn.DataParallel(model).cuda() + ############## npu modify begin ############# + # Migrate the model to the NPU for training. + model = model.to(CALCULATE_DEVICE) + ############## npu modify end ############# + # In the original code, the loss function is calculated on the GPU. + # # define loss function (criterion) and optimizer + # criterion = nn.CrossEntropyLoss().cuda(args.gpu) + ############## npu modify begin ############# + # Migrate the loss function to the NPU for calculation. + criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE) + ############## npu modify end ############# + ``` + +5. 
Change the type of the **target** operator in the dataset to **int32** to resolve the operator error. Migrate the dataset to the Ascend 910 AI Processor for calculation. + - Code location: **train\(\)** in **main.py** \(The changes are in bold.\) + + ``` + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + # In the original code, the training dataset is loaded and calculated on the GPU. The original code is as follows: + # if torch.cuda.is_available(): + # target = target.cuda(args.gpu, non_blocking=True) + ############## npu modify begin ############# + # Port the dataset to the NPU for calculation and modify the target data type to improve performance. + if 'npu' in CALCULATE_DEVICE: + target = target.to(torch.int32) + images, target = images.to(CALCULATE_DEVICE, non_blocking=True), target.to(CALCULATE_DEVICE, non_blocking=True) + ############## npu modify end ############# + ``` + + - Code location: **validate\(\)** in **main.py** \(The changes are in bold.\) + + ``` + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + # In the original code, the training dataset is loaded and calculated on the GPU. The original code is as follows: + # if torch.cuda.is_available(): + # target = target.cuda(args.gpu, non_blocking=True) + ############## npu modify begin ############# + # Port the dataset to the NPU for calculation and modify the target data type. + if 'npu' in CALCULATE_DEVICE: + target = target.to(torch.int32) + images, target = images.to(CALCULATE_DEVICE, non_blocking=True), target.to(CALCULATE_DEVICE, non_blocking=True) + ############## npu modify end ############# + ``` + +6. Set the device in use. + + Code location: Main function entry in **main.py** \(The changes are in bold.\) + + ``` + if __name__ == '__main__': + ############## npu modify begin ############# + if 'npu' in CALCULATE_DEVICE: + torch.npu.set_device(CALCULATE_DEVICE) + ############## npu modify begin ############# + main() + ``` + + +

Distributed Training Modification

1.  Add the following imports to **main.py** to support mixed-precision model training on the Ascend 910 AI Processor based on the PyTorch framework:

    ```
    import torch.npu
    from apex import amp
    ```

2.  Add the following parameters, including the parameters for specifying the Ascend 910 AI Processors involved in training and the parameters required for mixed-precision training:

    ```
    parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
    parser.add_argument('--addr', default='10.136.181.115', type=str, help='master addr')
    parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
    parser.add_argument('--amp', default=False, action='store_true', help='use amp to train the model')
    parser.add_argument('--loss-scale', default=1024., type=float,
                        help='loss scale used in amp; -1 means dynamic loss scaling')
    parser.add_argument('--opt-level', default='O2', type=str,
                        help='opt level used in amp')
    ```

3.  Create a mapping function from **device\_id** to **process\_id** and specify the device for training. Add the following function to **main.py**:

    ```
    def device_id_to_process_device_map(device_list):
        devices = device_list.split(",")
        devices = [int(x) for x in devices]
        devices.sort()

        process_device_map = dict()
        for process_id, device_id in enumerate(devices):
            process_device_map[process_id] = device_id

        return process_device_map
    ```

4.  Specify the IP address and port number of the training server.

    Code location: main function **main\(\)** in **main.py** \(the changes are enclosed in the npu modify comments\):

    ```
    def main():
        args = parser.parse_args()
        ############## npu modify begin #############
        os.environ['MASTER_ADDR'] = args.addr
        os.environ['MASTER_PORT'] = '29688'
        ############## npu modify end #############
    ```

5.  Use the mapping from **device\_id** to **process\_id** to obtain the number of Ascend 910 AI Processors on a single node.

    Code location: main function **main\(\)** in **main.py**:

    ```
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    ############## npu modify begin #############
    args.process_device_map = device_id_to_process_device_map(args.device_list)
    if args.device == 'npu':
        ngpus_per_node = len(args.process_device_map)
    else:
        ngpus_per_node = torch.cuda.device_count()
    ############## npu modify end #############
    # The original code is as follows:
    # ngpus_per_node = torch.cuda.device_count()
    ```

6.  Obtain the ID of the Ascend 910 AI Processor corresponding to **process\_id** and specify it for training.

    Code location: **main\_worker\(\)** in **main.py**:

    ```
    def main_worker(gpu, ngpus_per_node, args):
        global best_acc1
        ############## npu modify begin #############
        args.gpu = args.process_device_map[gpu]
        ############## npu modify end #############
        # The original code is as follows:
        # args.gpu = gpu
    ```

7.  Initialize the process group and mask the initialization method \(**init\_method**\).

    Code location: **main\_worker\(\)** in **main.py**:

    ```
    ############## npu modify begin #############
    if args.device == 'npu':
        dist.init_process_group(backend=args.dist_backend, #init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    else:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    ############## npu modify end #############
    # The original code is as follows:
    # dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
    #                         world_size=args.world_size, rank=args.rank)
    ```

8.  For distributed training, the mixed-precision module must be introduced and the model must be ported to the Ascend AI Processor, so the original code that checks whether training is distributed and whether the model runs on the GPU must be masked.

    Code location: **main\_worker\(\)** in **main.py**:

    ```
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    ############## npu modify begin #############
    # Add the following to the code:
    # Specify the Ascend AI Processor as the training device.
    loc = 'npu:{}'.format(args.gpu)
    torch.npu.set_device(loc)
    # Calculate batch_size and workers used for training.
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
    ############## npu modify end #############
    # The original code is as follows. The code needs to be masked and is commented out.
    # if not torch.cuda.is_available():
    #     print('using CPU, this will be slow')
    # elif args.distributed:
    #     # For multiprocessing distributed, DistributedDataParallel constructor
    #     # should always set the single device scope, otherwise,
    #     # DistributedDataParallel will use all available devices.
    #     if args.gpu is not None:
    #         torch.cuda.set_device(args.gpu)
    #         model.cuda(args.gpu)
    #         # When using a single GPU per process and per
    #         # DistributedDataParallel, we need to divide the batch size
    #         # ourselves based on the total number of GPUs we have
    #         args.batch_size = int(args.batch_size / ngpus_per_node)
    #         args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
    #         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    #     else:
    #         model.cuda()
    #         # DistributedDataParallel will divide and allocate batch_size to all
    #         # available GPUs if device_ids are not set
    #         model = torch.nn.parallel.DistributedDataParallel(model)
    # elif args.gpu is not None:
    #     torch.cuda.set_device(args.gpu)
    #     model = model.cuda(args.gpu)
    # else:
    #     # DataParallel will divide and allocate batch_size to all available GPUs
    #     if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
    #         model.features = torch.nn.DataParallel(model.features)
    #         model.cuda()
    #     else:
    #         model = torch.nn.DataParallel(model).cuda()
    ```

9.  Mask the loss function, optimizer, and resume-from-checkpoint code; this part is reworked together with mixed-precision training later.

    Code location: **main\_worker\(\)** in **main.py**:

    ```
    # The original code is masked and commented out.
    # # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    #
    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    #
    # # optionally resume from a checkpoint
    # if args.resume:
    #     if os.path.isfile(args.resume):
    #         print("=> loading checkpoint '{}'".format(args.resume))
    #         if args.gpu is None:
    #             checkpoint = torch.load(args.resume)
    #         else:
    #             # Map model to be loaded to specified single gpu.
    #             loc = 'cuda:{}'.format(args.gpu)
    #             checkpoint = torch.load(args.resume, map_location=loc)
    #         args.start_epoch = checkpoint['epoch']
    #         best_acc1 = checkpoint['best_acc1']
    #         if args.gpu is not None:
    #             # best_acc1 may be from a checkpoint from a different GPU
    #             best_acc1 = best_acc1.to(args.gpu)
    #         model.load_state_dict(checkpoint['state_dict'])
    #         optimizer.load_state_dict(checkpoint['optimizer'])
    #         print("=> loaded checkpoint '{}' (epoch {})"
    #               .format(args.resume, checkpoint['epoch']))
    #     else:
    #         print("=> no checkpoint found at '{}'".format(args.resume))
    #
    # cudnn.benchmark = True
    ```

10. A data loader combines a dataset and a sampler and can load the dataset with multiple worker processes. When the Ascend AI Processor is used for training, **pin\_memory** must be set to **False**. Currently, only training with static shapes is supported, and the number of remaining samples in the data flow may be less than the batch size, so **drop\_last** must be set to **True**. In addition, set **shuffle** to **True** for some datasets to be validated.

    Code location: **main\_worker\(\)** in **main.py**:

    ```
    ############## npu modify begin #############
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)
    ############## npu modify end #############
    ```

11. Construct the loss function and optimizer, and port the model and loss function to the Ascend AI Processor. Combine the optimizer and model with the mixed-precision module to support mixed-precision training, and combine the resume-from-checkpoint code with the mixed-precision module as well.

    Code location: after the data loader creation in **main\_worker\(\)** in **main.py**:

    ```
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    ############## npu modify begin #############
    model = model.to(loc)
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(loc)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    ############## npu modify end #############
    ```

12. Combine checkpoint saving with mixed-precision training. The modification is as follows:

    Code location: **main\_worker\(\)** in **main.py**:

    ```
    # remember best acc@1 and save checkpoint
    is_best = acc1 > best_acc1
    best_acc1 = max(acc1, best_acc1)

    if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                and args.rank % ngpus_per_node == 0):
        ############## npu modify begin #############
        if args.amp:
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
                'amp': amp.state_dict(),
            }, is_best)
        else:
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
        ############## npu modify end #############
    ```

13. During training, migrate the dataset to the Ascend AI Processor. The modification is as follows:

    Code location: **train\(\)** in **main.py**:

    ```
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ############## npu modify begin #############
        loc = 'npu:{}'.format(args.gpu)
        target = target.to(torch.int32)
        images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False)
        ############## npu modify end #############
        # The original model code is as follows:
        # if args.gpu is not None:
        #     images = images.cuda(args.gpu, non_blocking=True)
        # if torch.cuda.is_available():
        #     target = target.cuda(args.gpu, non_blocking=True)
    ```

14. Mark the location where backpropagation \(**.backward\(\)**\) occurs so that the mixed-precision module can perform loss scaling and clear the per-iteration state. The code is as follows:

    Code location: **train\(\)** in **main.py**:

    ```
    optimizer.zero_grad()
    ############## npu modify begin #############
    if args.amp:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    # The original code is as follows:
    # loss.backward()
    ############## npu modify end #############
    optimizer.step()
    ```

15. Before validation, migrate the validation dataset to the Ascend AI Processor. The modification is as follows:

    Code location: **validate\(\)** in **main.py**:

    ```
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            ############## npu modify begin #############
            loc = 'npu:{}'.format(args.gpu)
            target = target.to(torch.int32)
            images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False)
            ############## npu modify end #############
            # The original model code is as follows:
            # if args.gpu is not None:
            #     images = images.cuda(args.gpu, non_blocking=True)
            # if torch.cuda.is_available():
            #     target = target.cuda(args.gpu, non_blocking=True)
    ```
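The mixed-precision pieces above follow a fixed order: create the model and optimizer, call amp.initialize, wrap the model with DistributedDataParallel, and run backward propagation through the scaled loss. The following condensed sketch shows only that ordering; it assumes Apex is installed, one NPU device is available, and a single-process group \(world\_size=1, rank=0\), with a toy linear model standing in for ResNet-50:

```
# Condensed ordering sketch for amp + DDP on the NPU (assumptions: apex
# installed, one NPU device, single-process group; nn.Linear is a stand-in).
import os
import torch
import torch.npu
import torch.nn as nn
import torch.distributed as dist
from apex import amp

os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29688'
dist.init_process_group(backend='hccl', world_size=1, rank=0)

loc = 'npu:0'
torch.npu.set_device(loc)

model = nn.Linear(8, 4).to(loc)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# amp.initialize comes after the model and optimizer are created ...
model, optimizer = amp.initialize(model, optimizer, opt_level='O2', loss_scale=1024.)
# ... and DDP wrapping comes after amp.initialize.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[0], broadcast_buffers=False)

# In the training step, backward propagation runs through the scaled loss.
loss = model(torch.randn(2, 8).to(loc)).sum()
optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```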

Executing the Script

## Preparing a Dataset

Prepare a dataset and upload it to a directory in the operating environment, for example, **/home/data/resnet50/imagenet**.

## Configuring Environment Variables

For details, see [Configuring Environment Variables](#configuring-environment-variables.md).

## Command

Examples \(the inline comments explain the options; remove them before running the commands\):

Single-device:

```
python3.7 main.py /home/data/resnet50/imagenet --batch-size 128 \   # Training batch size
    --lr 0.1 \                  # Learning rate
    --epochs 90 \               # Number of training epochs
    --arch resnet50 \           # Model architecture
    --world-size 1 \
    --rank 0 \
    --workers 40 \              # Number of processes for loading data
    --momentum 0.9 \            # Momentum
    --weight-decay 1e-4         # Weight decay
```

Distributed:

```
python3.7 main.py /home/data/resnet50/imagenet --addr='1.1.1.1' \   # Example IP address. Replace it with the actual IP address.
    --seed 49 \                 # Random seed
    --workers 160 \             # Number of processes for loading data
    --lr 0.8 \
    --print-freq 1 \
    --arch resnet50 \           # Model architecture
    --dist-url 'tcp://127.0.0.1:50000' \
    --dist-backend 'hccl' \
    --multiprocessing-distributed \   # Multi-device training
    --world-size 1 \
    --batch-size 2048 \         # Training batch size
    --epochs 90 \               # Number of training epochs
    --rank 0 \
    --device-list '0,1,2,3,4,5,6,7' \
    --amp                       # Use mixed precision for training.
```

>![](public_sys-resources/icon-note.gif) **NOTE:**
>**dist-backend** must be set to **hccl** to support distributed training on the Ascend AI device.

ShuffleNet Model Optimization

+ +- **[Obtaining Samples](#obtaining-samples-6.md)** + +- **[Evaluating the Model](#evaluating-the-model.md)** + +- **[Porting the Network](#porting-the-network.md)** + +- **[Commissioning the Network](#commissioning-the-network.md)** + + +

Obtaining Samples

## How to Obtain

1.  This sample is based on the ImageNet training model provided on the PyTorch official website and is used to demonstrate porting and adaptation to the Ascend 910 AI Processor. The sample can be obtained from [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
2.  For details about the ShuffleNet model, see [ShuffleNet V2](https://pytorch.org/hub/pytorch_vision_shufflenet_v2/) on the PyTorch official website. Set the **arch** parameter to **shufflenet\_v2\_x1\_0** during script execution.

    ```
    --arch shufflenet_v2_x1_0
    ```

    >![](public_sys-resources/icon-note.gif) **NOTE:**
    >ShuffleNet is a built-in model of PyTorch. For more built-in models, visit the [PyTorch official website](https://pytorch.org/).

## Directory Structure

The structure of major directories and files is as follows:

```
├──main.py
```

Evaluating the Model

Model evaluation focuses on operator adaptation. Use the dump op method to obtain the operator information of ShuffleNet and compare it with the supported operators in the _PyTorch Adapted Operator List_. If an operator is not supported, in simple scenarios you can replace it with a similar supported operator or run it on the CPU to avoid the problem; in complex scenarios, operator development is required. For details, see the _PyTorch Operator Development Guide_.
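For reference, the operator information can be collected on the CPU with the autograd profiler. The following is a minimal sketch \(it assumes torchvision is installed; the full procedure is described in [dump op Method](#dump-op-method.md)\):

```
# Run one training step on the CPU and print the operators involved, then
# compare the op names against the PyTorch Adapted Operator List.
import torch
import torchvision.models as models

model = models.shufflenet_v2_x1_0()
x = torch.randn(4, 3, 224, 224)

with torch.autograd.profiler.profile() as prof:
    out = model(x)
    out.sum().backward()

print(prof.key_averages().table(sort_by="self_cpu_time_total"))
```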

Porting the Network

For details about how to port the training scripts, see [Single-Device Training Modification](#single-device-training-modification.md) and [Distributed Training Modification](#distributed-training-modification.md). During script execution, pass the **--arch shufflenet\_v2\_x1\_0** parameter.

Commissioning the Network

For details about how to commission the network, see [Commissioning Process](#commissioning-process.md). The check found that operators consume too much time during ShuffleNet running. The following provides the time consumption data and the solutions.

## Forward Check

The forward check record table is as follows:

**Table 1**  Forward check

| No. | Time (ms) | batch_size | Detail |
| --- | --- | --- | --- |
| 1 | 1100 | 512 | Replace channel_shuffle with channel_shuffle_index_select. |
| 2 | 600 | 512 | Perform the channel_shuffle_index_select operation twice to reduce the non-contiguous tensors caused by chunk. |
| 3 | 300 | 512 | Specify the concat output format as NCHW at the framework layer to eliminate excessive transdata. |
| 4 | 285 | 512 | Rectify the weight format. |
| 5 | 275 | 512 | Rectify the problem that the output format 5HD was not specified for DWCONV. |
The details are as follows:

-   The native **torch.transpose\(x, 1, 2\).contiguous\(\)** uses the view-class operator transpose, which produces non-contiguous tensors. As described for the copy bottleneck in [copy bottleneck optimization](#performance-optimization-1.md), **channel\_shuffle\_index\_select** replaces the framework operator with a compute operator of the same semantics, reducing the time consumption.
-   ShuffleNet V2 contains a large number of chunk operations, which are framework operators in PyTorch. They split a tensor into several equal-length non-contiguous tensors, and converting non-contiguous tensors to contiguous ones takes a long time. Therefore, a compute operator is used to eliminate the non-contiguous tensors, as described for the copy bottleneck in [copy bottleneck optimization](#performance-optimization-1.md).
-   During operator adaptation, the output format is set to the input format by default. However, Concat does not support the 5HD format whose C dimension is not a multiple of 16, so it falls back to the 4D format for processing. In addition, Concat is followed by the GatherV2 operator, which supports only the 4D format. Therefore, the data format conversion process is 5HD \> 4D \> Concat \> 5HD \> 4D \> GatherV2 \> 5HD. The solution is to modify the Concat output format: when the C dimension is not a multiple of 16, the output format is set to 4D. After the optimization, the conversion process becomes 5HD \> 4D \> Concat \> GatherV2 \> 5HD. For the ShuffleNet-specific method, see line 121 in **pytorch/aten/src/ATen/native/npu/CatKernelNpu.cpp**.
-   Set the weight initialization format to avoid repeated transdata during calculation, as described for the framework bottleneck in [copy bottleneck optimization](#performance-optimization-1.md).
-   Rectify the output format of the DWCONV weight to avoid unnecessary conversion from 5HD to 4D.

## Entire Network Check

The record table of the entire network check is as follows:

**Table 2**  Entire network check

| No. | Time (ms) | batch_size | Detail |
| --- | --- | --- | --- |
| 1 | 5500 | 512 | Perform the index_add operation by copying the index to the CPU at the framework layer. |
| 2 | 4000 | 512 | Customize operators to pre-generate an index. |
| 3 | 1800 | 512 | Customize operators to combine index_add and chunk. |
| 4 | 885 | 512 | Add contiguous_with_gatherv2. |
| 5 | 3480 | 1024 | Modify batchsize. |
| 6 | 1650 | 1024 | Modify batchsize and contiguous_with_gatherv2. |
| 7 | 1424 | 1024 | Customize operators to combine cat, shuffle, and chunk to eliminate non-contiguous tensors. |
| 8 | 1360 | 1024 | Modify the format of the gradient passed to ReluGrad at the framework layer. |
| 9 | 1300 | 1024 | Modify the backpropagation input format of IndexSelectFullImplementation. |
| 10 | 920 | 1024 | Use amp O1. |
| 11 | 860 | 1024 | Use amp O2. |
| 12 | 830 | 1024 | Eliminate the excessive transdata introduced by AXPY during BN parameter update. |
| 13 | 800 | 1024 | Cancel the stream synchronization among forward propagation, backpropagation, and parm_update. |
| 14 | 461 | 1024 | Optimize the GatherV2 operator for non-32-byte-alignment scenarios. |
| 15 | 429 | 1024 | Optimize GatherV2 to GatherV3 in the ShuffleNet V2 scenario. |
The details are as follows:

1.  Replace framework operators with compute operators.

2.  Use a buffer to keep the index information on the NPU, and cancel the repeated creation of **index.to\(npu\)**.

3.  Use compute operators to eliminate non-contiguous tensors.

4.  Use the AI Core operator GatherV2 in **contiguous\_with\_gatherv2** to convert non-contiguous tensors to contiguous tensors.

5.  Modify **batchsize**.

6.  Modify **batchsize** and **contiguous\_with\_gatherv2**.

7.  The chunk operator is the backward calculation mode of the Concat operator and may produce non-contiguous tensors, so the backward calculation mode of the Concat operator needs to be customized. Combine cat, shuffle, and chunk, and then replace chunk with GatherV2 to eliminate non-contiguous tensors.

8.  The ReluGrad operator has two inputs: **grad\_output** \(backward input\) and **self** \(forward output\). In ShuffleNet, the 4D and 5HD formats sometimes coexist. Because the FE format is usually aligned with the format of the first tensor, the following process occurs: \(4D, 5HD\) \> \(4D, 4D\) \> ReluGrad \> 4D \> 5HD. The forward output format is basically the input format, and ReLU is usually used together with Conv and BN, so 5HD is the more suitable output format in this scenario. Insert **npu\_format\_cast** manually so that the process becomes \(4D, 5HD\) \> \(5HD, 5HD\) \> ReluGrad \> 5HD.

9.  In IndexSelectFullImplementation, the gatherv2 operation is performed twice on a 5HD tensor, which triggers the conversion from 5HD to 4D twice. Manually convert 5HD to 4D once so that no transdata is performed during the gatherv2 operations, which removes one transdata operation.

10. Add the mixed precision O1.

11. Add the mixed precision O2.
12. Because of the parameter verification of the Axpy operator, whenever the network parameters are updated and the C dimension is not exactly divisible by 16, the 4D Axpy operation is performed through transdata operators, introducing a large number of them. To solve this problem, add a function that ends the verification when the Axpy input shapes are the same. This avoids the format conversion and improves the running efficiency.

13. Delete all stream synchronization operations. This is not adopted because it easily causes non-convergence.

14. After using the GatherV2 operator optimized for non-alignment scenarios, the overall performance reaches the delivery level.

15. After using the GatherV3 operator optimized for the ShuffleNet V2 scenario, the overall performance is further improved.


## Python Optimization Details

The optimization on the Python side makes the network more NPU-affinitive by rewriting some code with equivalent semantics. The operations that convert non-contiguous tensors to contiguous tensors can be a performance bottleneck: the **channel\_shuffle** operation in ShuffleNet V2 involves such conversions after permute, degrading the performance of the entire network. The performance can be greatly improved by rewriting the **channel\_shuffle** operation with equivalent semantics and combining it with the concat operation. The torchvision version is used; for details, see the [open source link](https://github.com/pytorch/vision/blob/master/torchvision/models/shufflenetv2.py).
-   Original **channel\_shuffle** operation:

    ```
    def channel_shuffle(x, groups):
        # type: (torch.Tensor, int) -> torch.Tensor
        batchsize, num_channels, height, width = x.data.size()
        channels_per_group = num_channels // groups
        # reshape
        x = x.view(batchsize, groups,
                   channels_per_group, height, width)
        x = torch.transpose(x, 1, 2).contiguous()
        # flatten
        x = x.view(batchsize, -1, height, width)
        return x

    class InvertedResidual(nn.Module):
        def __init__(self, inp, oup, stride):
            super(InvertedResidual, self).__init__()
            if not (1 <= stride <= 3):
                raise ValueError('illegal stride value')
            self.stride = stride
            branch_features = oup // 2
            assert (self.stride != 1) or (inp == branch_features << 1)
            if self.stride > 1:
                self.branch1 = nn.Sequential(
                    self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1),
                    nn.BatchNorm2d(inp),
                    nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
                    nn.BatchNorm2d(branch_features),
                    nn.ReLU(inplace=True),
                )
            else:
                self.branch1 = nn.Sequential()

            self.branch2 = nn.Sequential(
                nn.Conv2d(inp if (self.stride > 1) else branch_features,
                          branch_features, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(branch_features),
                nn.ReLU(inplace=True),
                self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1),
                nn.BatchNorm2d(branch_features),
                nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(branch_features),
                nn.ReLU(inplace=True),
            )

        @staticmethod
        def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
            return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i)

        def forward(self, x):
            if self.stride == 1:
                x1, x2 = x.chunk(2, dim=1)
                out = torch.cat((x1, self.branch2(x2)), dim=1)
            else:
                out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)

            out = channel_shuffle(out, 2)

            return out
    ```

-   Equivalent semantics rewriting:

    ```
    import numpy as np
    import torch

    def channel_shuffle_index_select(x, groups=2):
        N, C, H, W = x.shape
        inp = C
        # The channel_shuffle operation rearranges the C dimension according to fixed rules and can be expressed as a simple index rearrangement.
        group_len = inp // groups
        index = torch.from_numpy(np.array(list(range(inp))).reshape(groups, group_len).transpose(1, 0).flatten()).long()

        x = x.index_select(1, index)
        return x

    # Compare the results of the two operations. The semantics are the same.
    x = torch.randn(2, 232, 14, 14)
    for group in [2, 4, 8]:
        out1 = channel_shuffle(x, group)
        out2 = channel_shuffle_index_select(x, group)
        print((out1 - out2).sum())
    ```

-   Affinity writing method for the Ascend AI Processor:

    ```
    # Corresponding to out = channel_shuffle(torch.cat((self.branch1(x), self.branch2(x)), dim=1))
    # Replace channel_shuffle with channel_shuffle_index_select.
    # Customize operators to combine channel_shuffle_index_select and cat, and use compute operators to reduce non-contiguous tensors.
    class IndexSelectFullImplementation(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x1, x2, fp_index, bp_index1, bp_index2):
            # Forcible stream synchronization, used only for training stabilization.
            stream = torch.npu.current_stream()
            stream.synchronize()

            # Register bp_index1 and bp_index2 with the context so that they can be used in backpropagation.
            ctx.bp_index1 = bp_index1
            ctx.bp_index2 = bp_index2

            x = torch.cat([x1, x2], dim=1)

            # Replace channel_shuffle with index_select. In this example, the chunk operator is not used.
            result = x.index_select(1, fp_index)

            return result

        @staticmethod
        def backward(ctx, grad_output):
            # Forcible stream synchronization, used only for training stabilization.
            stream = torch.npu.current_stream()
            stream.synchronize()

            # Convert the format to NCHW to reduce extra transdata, because index_select does not support the 5HD format.
            grad_output.data = grad_output.data.npu_format_cast(0)

            # Use index_select to reverse index_select and cat based on the backward expression obtained from forward derivation.
            out1 = grad_output.index_select(1, ctx.bp_index1)
            out2 = grad_output.index_select(1, ctx.bp_index2)
            # One gradient (or None) is returned for each forward input: x1, x2, fp_index, bp_index1, bp_index2.
            return out1, out2, None, None, None


    class IndexSelectHalfImplementation(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x1, x2, fp_index1, fp_index2, bp_index1, bp_index2):
            ctx.bp_index1 = bp_index1
            ctx.bp_index2 = bp_index2
            x = torch.cat([x1, x2], dim=1)

            # Replace channel_shuffle with index_select. In this example, the chunk operator is used.
            return x.index_select(1, fp_index1), x.index_select(1, fp_index2)

        @staticmethod
        def backward(ctx, grad_output1, grad_output2):
            grad_output = torch.cat([grad_output1, grad_output2], 1)

            out1 = grad_output.index_select(1, ctx.bp_index1)
            out2 = grad_output.index_select(1, ctx.bp_index2)
            return out1, out2, None, None, None, None


    class Channel_Shuffle(nn.Module):
        def __init__(self, inp, groups=2, split_shuffle=True):
            super(Channel_Shuffle, self).__init__()

            self.split_shuffle = split_shuffle
            self.group_len = inp // groups

            # Initialize fp_index to be used in channel_shuffle_index_select.
            self.out = np.array(list(range(inp))).reshape(groups, self.group_len).transpose(1, 0).flatten().tolist()

            # Register the initialized fp_index as a buffer of the module. When to.device is called, the buffer is moved to the device, reducing the time consumed by host-to-device copies.
            # This section describes only the common usage when the value of groups is 2. Expand it based on the actual scenario.
            if self.split_shuffle:
                self.register_buffer('fp_index1', torch.tensor(self.out[:self.group_len], dtype=torch.int32))
                self.register_buffer('fp_index2', torch.tensor(self.out[self.group_len:], dtype=torch.int32))
            else:
                self.register_buffer('fp_index', torch.tensor(self.out, dtype=torch.int32))

            # Register the corresponding bp_index as a buffer of the module. When to.device is called, the buffer is moved to the device, reducing the time consumed by host-to-device copies.
+ self.register_buffer('bp_index1', torch.tensor(list(range(0, inp, 2)), dtype=torch.int32)) + self.register_buffer('bp_index2', torch.tensor(list(range(1, inp, 2)), dtype=torch.int32)) + + def forward(self, x1, x2): + if self.split_shuffle: + return IndexSelectHalfImplementation.apply(x1, x2, self.fp_index1, self.fp_index2, self.bp_index1, + self.bp_index2) + else: + return IndexSelectFullImplementation.apply(x1, x2, self.fp_index, self.bp_index1, self.bp_index2) + + + class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, split_shuffle=True): + super(InvertedResidual, self).__init__() + + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + if self.stride > 1: + self.channel_shuffle = Channel_Shuffle(inp=branch_features + branch_features, groups=2, + split_shuffle=split_shuffle) + else: + self.channel_shuffle = Channel_Shuffle(inp=inp, groups=2, split_shuffle=split_shuffle) + + @staticmethod + def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x): + + # Delete the concat and chunk operations and combine them into self.channel_shuffle for processing. + if self.stride == 1: + x1, x2 = x + x2 = self.branch2(x2) + else: + x1 = self.branch1(x) + x2 = self.branch2(x) + + out = self.channel_shuffle(x1, x2) + + return out + ``` + + +

References

+ +- **[Single-Operator Sample Building](#single-operator-sample-building.md)** + +- **[Single-Operator Dump Method](#single-operator-dump-method.md)** + +- **[Common Environment Variables](#common-environment-variables.md)** + +- **[dump op Method](#dump-op-method.md)** + +- **[How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md)** + + +

Single-Operator Sample Building

When a problem occurs in a model, it is costly to reproduce it on the entire network. You can build a single-operator sample to reproduce the precision or performance problem, which makes it easier to locate and solve. A single-operator sample can be built in either of the following ways. For details about single-operator dump methods, see [Single-Operator Dump Method](#single-operator-dump-method.md).

1.  Build a single-operator sample test case. You can directly call the operator to reproduce the error scenario.

    The following is an example of building a single-operator sample of the max operator:

    ```
    import copy

    import torch
    from torch.testing._internal.common_utils import TestCase, run_tests

    class TestMax(TestCase):
        def cpu_op_exec(self, input1):
            # Call the operator.
            output = torch.max(input1)
            output = output.numpy()
            return output

        def npu_op_exec(self, input1):
            # Call the corresponding NPU operator and move the result back to the CPU.
            output = torch.max(input1)
            output = output.to("cpu")
            output = output.numpy()
            return output

        def test_max(self):
            input = torch.randn(10, 20)
            input = input.to(torch.int64)   # Convert the data type.
            input_cpu = copy.deepcopy(input)
            input_npu = copy.deepcopy(input).npu()

            output_cpu = self.cpu_op_exec(input_cpu)
            output_npu = self.npu_op_exec(input_npu)

            # Compare the calculation results of the CPU and NPU. prec is the allowed error.
            self.assertEqual(output_cpu, output_npu, prec=1e-4)

    if __name__ == '__main__':
        run_tests()
    ```

    >![](public_sys-resources/icon-note.gif) **NOTE:**
    >-   Run the preceding code. If the reported error information is the same as that of the max operator in the model, the single-operator test case is successfully built.
    >-   Assume that the data type conversion code is commented out. If no error is reported in the test case, an error of the max operator is reported on the NPU when the input parameter is **torch.int64**.

2.  Build a single-operator test case based on the context.

    Although this is a single-operator sample, sometimes it is not only a single operation but a scenario with context, or a module with parameters. The module mode is the more common method. The following is an example of building a module that contains two operators:

    ```
    import copy

    import torch
    import torch.nn as nn
    from torch.testing._internal.common_utils import TestCase, run_tests

    class Model(nn.Module):
        def __init__(self, in_channels=1, hooks=False):
            super(Model, self).__init__()
            self.conv = nn.Conv2d(in_channels, in_channels*2, kernel_size=64)
            if hooks:
                self.conv.weight.register_hook(lambda grad: print(grad))

        def forward(self, x):
            out = self.conv(x)
            return out

    class TestConv2d(TestCase):
        def test_conv2d(self):

            model = Model(in_channels=16)

            # Add hooks to obtain the backpropagation result:
            # model = Model(in_channels=16, hooks=True)
            # Create an input tensor.
            input_tensor = torch.randn(4, 16, 64, 64)

            input_tensor_cpu = copy.deepcopy(input_tensor)
            out = model(input_tensor_cpu)
            loss = out.sum()
            loss.backward()
            cpuout = out

            # Run the model and input tensor on the NPU.
            torch.npu.set_device("npu:0")   # Set the running device first.
            # Copy the CPU model so that both runs share the same weights.
            model_npu = copy.deepcopy(model).npu()
            input_tensor_npu = copy.deepcopy(input_tensor).npu()
            out = model_npu(input_tensor_npu)
            loss = out.sum()
            loss.backward()
            npuout = out
            # Determine whether the scenario is an error scenario based on the result.
            self.assertEqual(cpuout, npuout, prec=1e-4)

    if __name__ == '__main__':
        run_tests()
    ```

Single-Operator Dump Method

## Collecting Dump Data

Currently, the PyTorch adapted to Ascend AI Processors collects operator dump data through the init\_dump\(\), set\_dump\(\), and finalize\_dump\(\) interfaces in **torch.npu**: call init\_dump\(\) to initialize the dump configuration, call set\_dump\(\) to import the configuration file that sets the dump parameters, and call finalize\_dump\(\) to end the dump. The following uses the add\_ operator as an example to describe how to collect dump data.

```
import torch
import torch.npu
torch.npu.set_device("npu:0")
torch.npu.init_dump()
torch.npu.set_dump("/home/HwHiAiUser/dump.json") # "/home/HwHiAiUser/dump.json" is the path of the configuration file. Configure it as required.
a = torch.tensor([2, 2]).to("npu:0")
a.add_(1)
torch.npu.finalize_dump()
```

The **dump.json** file is configured as follows:

```
{
    "dump":
    {
        "dump_list":[],
        "dump_path":"/home/HwHiAiUser/dump/output",
        "dump_mode":"all",
        "dump_op_switch":"on"
    }
}
```

The fields of **dump.json** are described as follows.

| Field | Description |
| --- | --- |
| dump_list | Operators whose data is to be dumped. Leave this parameter empty. |
| dump_path | Directory where dump data files are stored in the operating environment. The value can be an absolute path or a relative path.<br>- An absolute path starts with a slash (/), for example, **/home/HwHiAiUser/output**.<br>- A relative path starts with a directory name, for example, **output**.<br>For example, if **dump_path** is set to **/home/HwHiAiUser/output**, the dump data files are generated under the **/home/HwHiAiUser/output** directory in the operating environment. |
| dump_mode | Dump data mode. The options are as follows:<br>- **output** (default): dumps operator outputs only.<br>- **input**: dumps operator inputs only.<br>- **all**: dumps both operator inputs and outputs. |
| dump_op_switch | Dump switch of the single-operator model. The options are as follows:<br>- **off** (default): disables dump for the single-operator model.<br>- **on**: enables dump for the single-operator model. |
## Viewing Overflowed Data

The collected dump data is generated in the _\{dump\_path\}_**/**_\{time\}_**/**_\{deviceid\}_**/**_\{model\_id\}_**/**_\{data\_index\}_ directory, for example, **/home/HwHiAiUser/output/20200808163566/0/0**.

The fields in the dump data path and file are described as follows:

-   _dump\_path_: user-defined path for storing overflowed data, for example, **/home/HwHiAiUser/output**.
-   _time_: timestamp \(for example, **20200808163566**\)
-   _deviceid_: device ID
-   _model\_id_: subgraph ID
-   A dump file is named as _\{op\_type\}_._\{op\_name\}_._\{taskid\}_._\{stream\_id\}_._\{timestamp\}_. Any period \(.\), slash \(/\), backslash \(\\\), or space in the _op\_type_ or _op\_name_ field is replaced by an underscore \(\_\).

## Parsing the Dump File of an Overflow Operator

1.  Upload the _\{op\_type\}.\{op\_name\}.\{taskid\}.\{stream\_id\}.\{timestamp\}_ file to the environment with the Toolkit installed.
2.  Go to the path where the parsing script is stored. Assume that the installation directory of the Toolkit software package is **/home/HwHiAiUser/Ascend/ascend-toolkit/latest**.

    **cd /home/HwHiAiUser/Ascend/ascend-toolkit/latest/toolkit/tools/operator\_cmp/compare**

3.  Run the **msaccucmp.pyc** script to convert the dump file into a NumPy file. The following is an example:

    **python3 msaccucmp.pyc convert -d /home/HwHiAiUser/dump -out /home/HwHiAiUser/dumptonumpy -v 2**

    >![](public_sys-resources/icon-note.gif) **NOTE:**
    >The **-d** option enables the conversion of a single dump file or all dump files in a path.

4.  Use Python to save the NumPy data into a text file. The following is an example:

    **$ python3.7.5**

    **\>\>\> import numpy as np**

    **\>\>\> a = np.load\("/home/HwHiAiUser/dumptonumpy/Pooling.pool1.1147.1589195081588018.output.0.npy"\)**

    **\>\>\> b = a.flatten\(\)**

    **\>\>\> np.savetxt\("/home/HwHiAiUser/dumptonumpy/Pooling.pool1.1147.1589195081588018.output.0.txt", b\)**

    The dimension and **dtype** information no longer exists in the .txt file. For details, visit the NumPy website.
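The interactive commands in step 4 can also be wrapped in a short script that converts every dump file in a directory at once \(a sketch; the directory path is an example\):

```
# Convert every dump .npy file in a directory to a flat text file.
import glob
import os

import numpy as np

dump_dir = "/home/HwHiAiUser/dumptonumpy"  # example path from the steps above
for npy_path in glob.glob(os.path.join(dump_dir, "*.npy")):
    data = np.load(npy_path).flatten()
    np.savetxt(npy_path + ".txt", data)
```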

Common Environment Variables

1.  Enable multi-thread task delivery. When this function is enabled, the training performance of the entire network is improved in most cases.

    **export TASK\_QUEUE\_ENABLE=1**

2.  Redirect host logs to stdout so that they are displayed on the screen.

    **export ASCEND\_SLOG\_PRINT\_TO\_STDOUT=1**

3.  Set the log level. Log levels in descending order are debug \> info \> warning \> error \> null. Generally, set the log level to **error**; use **info** for debugging. For details about how to set the log level, see the _CANN Log Reference_.
4.  Dump the graph, which is used to view the graph structure.

    **export DUMP\_GE\_GRAPH=2**

    **export DUMP\_GRAPH\_LEVEL=3**

dump op Method

1.  Use the profile API to wrap the loss calculation and optimization process of the original training script and print the operator information. The following is a code example:

    ```
    with torch.autograd.profiler.profile() as prof:
        out = model(input_tensor)
        loss = out.sum()
        loss.backward()
    # Alternatively, write the table to a file.
    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
    ```

2.  Run the reconstructed training script on the CPU. The related operator information is displayed.

How Do I Install GCC 7.3.0?

Perform the following steps as the **root** user.

1.  Download **gcc-7.3.0.tar.gz** from [https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz](https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz).
2.  GCC installation requires adequate temporary space. Run the following command to clear the **/tmp** directory in advance:

    ```
    sudo rm -rf /tmp/*
    ```

3.  Install dependencies.

    For CentOS/BCLinux, run the following command:

    ```
    yum install bzip2
    ```

    For Ubuntu/Debian, run the following command:

    ```
    apt-get install bzip2
    ```

4.  Build and install GCC.
    1.  Go to the directory where the source package **gcc-7.3.0.tar.gz** is located and run the following command to decompress it:

        ```
        tar -zxvf gcc-7.3.0.tar.gz
        ```

    2.  Go to the extracted directory and run the following command to download the GCC dependency packages:

        ```
        cd gcc-7.3.0
        ./contrib/download_prerequisites
        ```

        If an error is reported during the command execution, run the following commands in the **gcc-7.3.0/** directory to download the dependency packages:

        ```
        wget http://gcc.gnu.org/pub/gcc/infrastructure/gmp-6.1.0.tar.bz2
        wget http://gcc.gnu.org/pub/gcc/infrastructure/mpfr-3.1.4.tar.bz2
        wget http://gcc.gnu.org/pub/gcc/infrastructure/mpc-1.0.3.tar.gz
        wget http://gcc.gnu.org/pub/gcc/infrastructure/isl-0.16.1.tar.bz2
        ```

        After the preceding dependencies are downloaded, run the following command again:

        ```
        ./contrib/download_prerequisites
        ```

        If the validation fails, check whether any dependency package was downloaded more than once; each package should be downloaded only once.

    3.  Run the following commands for configuration, build, and installation:

        ```
        ./configure --enable-languages=c,c++ --disable-multilib --with-system-zlib --prefix=/usr/local/linux_gcc7.3.0
        make -j15    # Check the number of CPUs by running grep -w processor /proc/cpuinfo|wc -l. In this example, the number is 15.
        make install
        ```

        >![](public_sys-resources/icon-caution.gif) **CAUTION:**
        >The **--prefix** option specifies the linux\_gcc7.3.0 installation path and is configurable. Do not set it to **/usr/local** or **/usr**, which is the default installation path of the GCC installed from the software source. Otherwise, a conflict occurs and the original GCC compilation environment of the system is damaged. In this example, the installation path is set to **/usr/local/linux\_gcc7.3.0**.

5.  Set the environment variable.

    Training must be performed in the compilation environment with GCC upgraded. If you will run training, configure the following environment variable in your training script:

    ```
    export LD_LIBRARY_PATH=${install_path}/lib64:${LD_LIBRARY_PATH}
    ```

    **$\{install\_path\}** indicates the GCC 7.3.0 installation path configured in [3](#en-us_topic_0000001173199577_en-us_topic_0000001172534867_en-us_topic_0276688294_li1649343041310). In this example, the installation path is **/usr/local/linux\_gcc7.3.0/**.

    >![](public_sys-resources/icon-note.gif) **NOTE:**
    >Skip this step if you do not need to use the compilation environment with GCC upgraded.

FAQs

+ +- **[FAQs About Software Installation](#faqs-about-software-installation.md)** + +- **[FAQs About Model and Operator Running](#faqs-about-model-and-operator-running.md)** + +- **[FAQs About Model Commissioning](#faqs-about-model-commissioning.md)** + +- **[FAQs About Other Operations](#faqs-about-other-operations.md)** + +- **[FAQs About Distributed Model Training](#faqs-about-distributed-model-training.md)** + + +

FAQs About Software Installation

+ +- **[pip3.7 install Pillow==5.3.0 Installation Failed](#pip3-7-install-pillow-5-3-0-installation-failed.md)** + + +

pip3.7 install Pillow==5.3.0 Installation Failed

## Symptom

Installing **pillow==5.3.0** with **pip3.7** fails.

## Possible Causes

Necessary dependencies are missing, such as libjpeg, python-devel, zlib-devel, and libjpeg-turbo-devel.

## Solutions

Run the following commands to install the dependencies:

-   CentOS/EulerOS/Tlinux/BClinux/Suse

    **yum install libjpeg python-devel zlib-devel libjpeg-turbo-devel**

-   Ubuntu/Debian/UOS

    **apt-get install libjpeg-dev python3-dev zlib1g-dev**

FAQs About Model and Operator Running

+ +- **[What Do I Do If the Error Message "RuntimeError: ExchangeDevice:" Is Displayed During Model or Operator Running?](#what-do-i-do-if-the-error-message-runtimeerror-exchangedevice-is-displayed-during-model-or-operator.md)** + +- **[What Do I Do If the Error Message "Error in atexit.\_run\_exitfuncs:" Is Displayed During Model or Operator Running?](#what-do-i-do-if-the-error-message-error-in-atexit-_run_exitfuncs-is-displayed-during-model-or-operat.md)** + +- **[What Do I Do If the Error Message "terminate called after throwing an instance of 'c10::Error' what\(\): HelpACLExecute:" Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-terminate-called-after-throwing-an-instance-of-c10-error-what()-he.md)** + +- **[What Do I Do If the Error Message "ImportError: libhccl.so." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-importerror-libhccl-so-is-displayed-during-model-running.md)** + +- **[What Do I Do If the Error Message "RuntimeError: Initialize." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-runtimeerror-initialize-is-displayed-during-model-running.md)** + +- **[What Do I Do If the Error Message "TVM/te/cce error." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-tvm-te-cce-error-is-displayed-during-model-running.md)** + +- **[What Do I Do If the Error Message "MemCopySync:drvMemcpy failed." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-memcopysync-drvmemcpy-failed-is-displayed-during-model-running.md)** + +- **[What Do I Do If the Error Message "MemCopySync:drvMemcpy failed." Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-memcopysync-drvmemcpy-failed-is-displayed-during-model-running-7.md)** + +- **[What Do I Do If the Error Message "HelpACLExecute." Is Displayed After Multi-Task Delivery Is Disabled \(export TASK\_QUEUE\_ENABLE=0\) During Model Running?](#what-do-i-do-if-the-error-message-helpaclexecute-is-displayed-after-multi-task-delivery-is-disabled.md)** + +- **[What Do I Do If the Error Message "55056 GetInputConstDataOut: ErrorNo: -1\(failed\)" Is Displayed During Model Running?](#what-do-i-do-if-the-error-message-55056-getinputconstdataout-errorno--1(failed)-is-displayed-during.md)** + + +

What Do I Do If the Error Message "RuntimeError: ExchangeDevice:" Is Displayed During Model or Operator Running?

## Symptom

![](figures/faq1.png)

## Possible Causes

Currently, a thread can call only one NPU device. The preceding error occurs when the thread switches between different NPU devices.

## Solution

Ensure that the device name passed to **torch.npu.set\_device\(device\)**, **tensor.to\(device\)**, and **model.to\(device\)** is consistent within one thread. For multiple threads \(such as multi-device training\), each thread must use a fixed NPU device.
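A minimal sketch of the correct pattern, with one fixed device string used for both **set\_device\(\)** and every **to\(\)** call in the thread:

```
# One fixed NPU device per thread; the same device string everywhere.
import torch
import torch.npu

device = "npu:0"
torch.npu.set_device(device)

x = torch.randn(2, 2).to(device)
y = (x + 1).to(device)   # never mix in a different device such as "npu:1"
```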

What Do I Do If the Error Message "Error in atexit.\_run\_exitfuncs:" Is Displayed During Model or Operator Running?

+ +## Symptom + +![](figures/faq2.png) + +## Possible Causes + +If no NPU device is specified by **torch.npu.device\(id\)** during torch initialization, device 0 is used by default. If another NPU device is directly used, for example, a tensor is created on device 1, the preceding error occurs during running. + +## Solution + +Before calling an NPU device, specify the NPU device by using **torch.npu.set\_device\(device\)**. + +

What Do I Do If the Error Message "terminate called after throwing an instance of 'c10::Error' what\(\): HelpACLExecute:" Is Displayed During Model Running?

+ +## Symptom + +![](figures/faq3.png) + +## Possible Causes + +Currently, the HelpACLExecute error cannot be directly located. In this case, an error is reported when the task is delivered. This is because the multi-thread delivery of the task is enabled \(**export TASK\_QUEUE\_ENABLE=1**\), and the error information is encapsulated at the upper layer. As a result, more detailed error logs cannot be obtained. + +## Solution + +You can resolve this exception by using either of the following methods: + +- Check the host error log information. The default log path is **/var/log/npu/slog/host-0/**. Search for the log file whose name is prefixed with **host-0** based on the time identifier, open the log file, and search for error information using keyword **ERROR**. +- Disable multi-thread delivery \(**export TASK\_QUEUE\_ENABLE=0**\) and run the code again. Generally, you can locate the fault based on the error information reported by the terminal. + +

What Do I Do If the Error Message "ImportError: libhccl.so." Is Displayed During Model Running?

## Symptom

![](figures/faq7.png)

## Possible Causes

The released PyTorch installation package uses the NPU and HCCL functions by default, so the path of the HCCL module must be added to the environment variables before the PyTorch installation package is used. The error message "can not find libhccl.so" indicates that the HCCL library file cannot be found.

## Solution

Add the path of the HCCL module to the environment variables. Generally, the path of the HCCL library file is **.../fwkacllib/python/site-packages/hccl** in the installation package.

What Do I Do If the Error Message "RuntimeError: Initialize." Is Displayed During Model Running?

+ +## Symptom + +![](figures/faq9.png) + +## Possible Causes + +According to the error information, it is preliminarily determined that an error occurs during the initialization of the NPU device. The error information in the host log is as follows: + +![](figures/faq9-1.png) + +The log information indicates that an error is reported when the system starts the NPU device. + +## Solution + +To solve the problem, perform the following steps: + +1. Restart the server and all NPU devices. + + If the problem is resolved, no further action is required. + + If the problem persists, go to [2](#li77121667913). + +2. Check whether the driver version matches the firmware version. + + If no, go to [3](#li967615545918). + + If yes, go to [4](#li475615212912). + +3. Ensure that the driver version matches the firmware version. + + If the problem is resolved, no further action is required. + + If the problem persists, go to Step 4. + +4. Contact Huawei technical support personnel. + +

What Do I Do If the Error Message "TVM/te/cce error." Is Displayed During Model Running?

## Symptom

![](figures/faq10.png)

## Possible Causes

Calling an NPU operator in PyTorch strongly depends on the TE, CCE, and TVM components, and the PyTorch, Toolkit/nnae, and TE versions must match. After Toolkit/nnae is updated, components such as TE are not updated automatically. When their versions do not match, this error is reported.

## Solution

Update the versions of components such as TE. The **te-\*.whl** and **topi-\*.whl** installation packages need to be updated. They are located in the **fwkacllib** subdirectory of the Toolkit or nnae installation directory \(for the **root** user, the default directory is **/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64**\). The directory contains the **topi-0.4.0-py3-none-any.whl** and **te-0.4.0-py3-none-any.whl** installation packages. Run **pip3 install --upgrade topi-0.4.0-py3-none-any.whl** and **pip3 install --upgrade te-0.4.0-py3-none-any.whl**, respectively.

![](figures/faq10-1.png)

What Do I Do If the Error Message "MemCopySync:drvMemcpy failed." Is Displayed During Model Running?

## Symptom

Script:

```
    import torch

    def test_sum():
        xs_shape = [22400, 8]
        ys_shape = [22400, 8]
        gt_bboxes_shape = [22400, 8,4]
        xs = torch.rand(xs_shape).npu()
        ys = torch.rand(ys_shape).npu()
        gt_bboxes = torch.rand(gt_bboxes_shape).npu().half()
        left = xs - gt_bboxes[..., 0]
        right = gt_bboxes[..., 2] - xs
        top = ys - gt_bboxes[..., 1]
        bottom = gt_bboxes[..., 3] - ys
        # stream = torch.npu.current_stream()
        # stream.synchronize()
        # left, top: fp32, right, bottom: fp16,
        # print(left.dtype, top.dtype, right.dtype, bottom.dtype)
        bbox_targets = torch.stack((left, top, right, bottom), -1)  # Error reported here
        # stream.synchronize()

        bbox_targets = torch.sum(bbox_targets)
```

Shell error message:

```
    RuntimeError: Run:/usr1/workspace/PyTorch_Apex_Daily_c20tr5/CODE/aten/src/ATen/native/npu/utils/OpParamMaker.h:280 NPU error,NPU error code is:500002
    [ERROR] RUNTIME(160809)kernel task happen error, retCode=0x28, [aicpu timeout].
    [ERROR] RUNTIME(160809)aicpu kernel execute failed, device_id=0, stream_id=512, task_id=24, fault so_name=, fault kernel_name=, extend_info=.
    Error in atexit._run_exitfuncs:
    Traceback (most recent call last):
    File "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/__init__.py", line 429, in _npu_shutdown
        torch._C._npu_shutdown()
    RuntimeError: npuSynchronizeDevice:/usr1/workspace/PyTorch_Apex_Daily_c20tr5/CODE/c10/npu/NPUStream.cpp:806 NPU error, error code is 0
```

Log message:

```
    [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.679 [../../../../../../runtime/feature/src/npu_driver.cc:1408]12828 MemCopySync:drvMemcpy failed: dst=0x108040288000, destMax=1240, src=0x7fe7649556d0, size=1240, kind=1, drvRetCode=17!
    [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.698 [../../../../../../runtime/feature/src/logger.cc:113]12828 KernelLaunch:launch kernel failed, kernel=140631803535760/ArgMinWithValue_tvmbin, dim=32, stream=0x55b22b3def50
    [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.717 [../../../../../../runtime/feature/src/api_c.cc:224]12828 rtKernelLaunch:ErrCode=207001, desc=[module new memory error], InnerCode=0x70a0002
```

## Possible Causes

The shell error message does not match the log message. The shell error indicates that the error occurs on the AI CPU during synchronization, whereas the log indicates that the error occurs on the min operator \(an internal call of ArgMinWithValue\_tvmbin\). The two errors do not match. Generally, this mismatch occurs because the error information in the log is generated with a delay.

The possible cause is that the AI CPU operator is executed asynchronously, so its error information is reported late.

## Solution

Perform the following steps to locate the fault based on the actual error information:

1.  Disable multi-task operator delivery. The result remains unchanged, so it is inferred that the actual error occurs before the errors reported in the shell and in the log.
2.  Perform stream synchronization based on the error information to narrow down the error range and locate the error operator. Stream synchronization requires that all calculations before the synchronization point be complete, which pinpoints the error.
3.  The error operator is determined to be stack.
4.  Print the shape, dtype, and npu\_format of all stack parameters, and construct a single-operator case to reproduce the problem.
The cause is that the data types of the input parameters for subtraction are different. As a result, the data types of the a-b and b-a results are different, and an error is reported in the stack operator. +5. Convert the data types of the stack input parameters to the same one to temporarily avoid the problem. + +
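A minimal sketch of the workaround in step 5, continuing the script above (casting to half matches **gt_bboxes**; casting everything to float works equally, as long as all four inputs share one dtype):

```
# cast all four stack inputs to one dtype before calling torch.stack
left, top, right, bottom = [t.half() for t in (left, top, right, bottom)]
bbox_targets = torch.stack((left, top, right, bottom), -1)
```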

What Do I Do If the Error Message "MemCopySync:drvMemcpy failed." Is Displayed During Model Running?

+ +## Symptom + +Script: + +``` + import torch + + def test_sum(): + xs_shape = [22400, 8] + ys_shape = [22400, 8] + gt_bboxes_shape = [22400, 8,4] + xs = torch.rand(xs_shape).npu() + ys = torch.rand(ys_shape).npu() + gt_bboxes = torch.rand(gt_bboxes_shape).npu().half() + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + # stream = torch.npu.current_stream() + # stream.synchronize() + # left, top: fp32, right, bottom: fp16, + # print(left.dtype, top.dtype, right.dtype, bottom.dtype) + bbox_targets = torch.stack((left, top, right, bottom), -1) # Error reported here + # stream.synchronize() + + bbox_targets = torch.sum(bbox_targets) +``` + +Shell error message: + +``` + RuntimeError: Run:/usr1/workspace/PyTorch_Apex_Daily_c20tr5/CODE/aten/src/ATen/native/npu/utils/OpParamMaker.h:280 NPU error,NPU error code is:500002 + [ERROR] RUNTIME(160809)kernel task happen error, retCode=0x28, [aicpu timeout]. + [ERROR] RUNTIME(160809)aicpu kernel execute failed, device_id=0, stream_id=512, task_id=24, fault so_name=, fault kernel_name=, extend_info=. + Error in atexit._run_exitfuncs: + Traceback (most recent call last): + File "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/__init__.py", line 429, in _npu_shutdown + torch._C._npu_shutdown() + RuntimeError: npuSynchronizeDevice:/usr1/workspace/PyTorch_Apex_Daily_c20tr5/CODE/c10/npu/NPUStream.cpp:806 NPU error, error code is 0 +``` + +Log message: + +``` + [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.679 [../../../../../../runtime/feature/src/npu_driver.cc:1408]12828 MemCopySync:drvMemcpy failed: dst=0x108040288000, destMax=1240, src=0x7fe7649556d0, size=1240, kind=1, drvRetCode=17! + [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.698 [../../../../../../runtime/feature/src/logger.cc:113]12828 KernelLaunch:launch kernel failed, kernel=140631803535760/ArgMinWithValue_tvmbin, dim=32, stream=0x55b22b3def50 + [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.717 [../../../../../../runtime/feature/src/api_c.cc:224]12828 rtKernelLaunch:ErrCode=207001, desc=[module new memory error], InnerCode=0x70a0002 +``` + +## Possible Causes + +The shell error message does not match the log message. + +The shell error message indicates that the error occurs on the AI CPU during synchronization. However, the log message indicates that the error occurs on the min operator \(internal call of ArgMinWithValue\_tvmbin\). The two error messages do not match. Generally, this problem occurs because the error information generation in the log is delayed. + +The possible cause is that the AI CPU operator is executed asynchronously. As a result, the error information is delayed. + +## Solution + +Perform the following steps to locate the fault based on the actual error information: + +1. Disable multi-task operator delivery. It is found that the result remains unchanged. It is inferred that the error occurs before the error in the shell error message and the error in the log message occur. +2. Perform stream synchronization based on the error information to narrow down the error range and locate the error operator. Stream synchronization requires that all calculations before the position where the code runs must be complete to locate the error. +3. It is determined that the error operator is stack. +4. Print the shape, dtype, and npu\_format of all stack parameters. Construct a single-operator case to reproduce the problem. 
The cause is that the data types of the input parameters for subtraction are different. As a result, the data types of the a-b and b-a results are different, and an error is reported in the stack operator. +5. Convert the data types of the stack input parameters to the same one to temporarily avoid the problem. + +

What Do I Do If the Error Message "HelpACLExecute." Is Displayed After Multi-Task Delivery Is Disabled \(export TASK\_QUEUE\_ENABLE=0\) During Model Running?

## Symptom

![](figures/faq8.png)

## Possible Causes

PyTorch operators run on the NPU by calling the optimized operators at the bottom layer through the AscendCL API. The error message "HelpACLExecute." is reported at the upper layer because error reporting and logging for this path are still being improved; as a result, detailed error information cannot be obtained when some operators fail.

## Solution

View the host log to determine the operator and position where the error is reported. The default log path is **/var/log/npu/slog/host-0**. Search for the **ERROR** field in the log file of the corresponding time to find the error information. For the preceding error, the **ERROR** field in the log is as follows:

![](figures/faq8-1.png)

The error information in the log indicates that the faulty operator is topKD and the error cause is "The number of attrs in op desc and op store does not match." Therefore, the error is caused by a parameter mismatch in the topKD operator.

Locate the topKD operator in the model code and check whether it can be replaced by another operator. If it can, use the replacement and report the operator error information to Huawei engineers. If it cannot, contact Huawei technical support.
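A sketch of searching the host log for the **ERROR** records (the path is the default one mentioned above):

```
grep -rn "ERROR" /var/log/npu/slog/host-0/
```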

What Do I Do If the Error Message "55056 GetInputConstDataOut: ErrorNo: -1\(failed\)" Is Displayed During Model Running?

+ +## Symptom + +During model training, the following error information may be displayed in the host training log \(directory: **/root/ascend/log/plog/**\): + +![](figures/20210720-102720(welinkpc).png) + +## Possible Causes + +A public API is called. + +## Solution + +The error information does not affect the training function and performance and can be ignored. + +

FAQs About Model Commissioning

+ +- **[What Do I Do If the Error Message "RuntimeError: malloc:/..../pytorch/c10/npu/NPUCachingAllocator.cpp:293 NPU error, error code is 500000." Is Displayed During Model Commissioning?](#what-do-i-do-if-the-error-message-runtimeerror-malloc-pytorch-c10-npu-npucachingallocator-cpp-293-np.md)** + +- **[What Do I Do If the Error Message "RuntimeError: Could not run 'aten::trunc.out' with arguments from the 'NPUTensorId' backend." Is Displayed During Model Commissioning](#what-do-i-do-if-the-error-message-runtimeerror-could-not-run-aten-trunc-out-with-arguments-from-the.md)** + +- **[What Do I Do If the MaxPoolGradWithArgmaxV1 and max Operators Report Errors During Model Commissioning?](#what-do-i-do-if-the-maxpoolgradwithargmaxv1-and-max-operators-report-errors-during-model-commissioni.md)** + +- **[What Do I Do If the Error Message "ModuleNotFoundError: No module named 'torch.\_C'" Is Displayed When torch Is Called?](#what-do-i-do-if-the-error-message-modulenotfounderror-no-module-named-torch-_c-is-displayed-when-tor.md)** + + +

What Do I Do If the Error Message "RuntimeError: malloc:/..../pytorch/c10/npu/NPUCachingAllocator.cpp:293 NPU error, error code is 500000." Is Displayed During Model Commissioning?

+ +## Symptom + +![](figures/faq4.png) + +## Possible Causes + +For the malloc error in **NPUCachingAllocator**, the possible cause is that the required video memory is larger than the available video memory on the NPU. + +## Solution + +During model commissioning, you can decrease the value of the **batch size** parameter to reduce the size of the occupied video memory on the NPU. + +

What Do I Do If the Error Message "RuntimeError: Could not run 'aten::trunc.out' with arguments from the 'NPUTensorId' backend." Is Displayed During Model Commissioning

## Symptom

![](figures/faq5.png)

## Possible Causes

Currently, the NPU supports only some PyTorch operators. The preceding error is reported when an unsupported operator is used. The remaining operators are under development. For details about the supported operators, see [PyTorch Native Operators](https://support.huaweicloud.com/intl/en-us/opl-pytorch/atlasptol_09_0001.html).

## Solution

Modify the model to avoid the unsupported operator, or replace it with a functionally equivalent supported operator. As a temporary workaround, you can also run the unsupported operation on the CPU and move the result back to the NPU, as sketched below.
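A minimal sketch of the CPU-fallback workaround (the device ID and tensor shape are illustrative):

```
import torch
import torch.npu

torch.npu.set_device("npu:0")
x = torch.rand(4, 4).npu()
# aten::trunc is not supported by the NPU backend here, so run it on the CPU
# and copy the result back to the NPU
y = torch.trunc(x.cpu()).npu()
```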

What Do I Do If the MaxPoolGradWithArgmaxV1 and max Operators Report Errors During Model Commissioning?

## Symptom

![](figures/faq6.png)

![](figures/faq6-1.png)

## Possible Causes

During model building, operator input parameters vary widely. For some operators \(such as MaxPoolGradWithArgmaxV1 and max\) with specific parameter combinations, an error is reported during calculation or the parameters are not supported. You can locate the operators based on the error information.

## Solution

Locate the operators based on the error information and perform the following steps:

1. Check whether the call mode and parameters of the operators in the model are correct.
2. Build a single-operator case based on the faulty operators to reproduce the error scenario.
3. Generally, operator errors cannot be resolved at the Python layer; after reproducing the error scenario, post it in the forum and ask for help from Huawei engineers.

   >![](public_sys-resources/icon-note.gif) **NOTE:**
   >Pay special attention to the input parameters **shape** and **dtype**, which are the main causes of operator errors.

In the preceding figures, the error information indicates that the MaxPoolGradWithArgmaxV1 and max operators report the error. MaxPoolGradWithArgmaxV1 fails during backward propagation, so construct a backward scenario; the max operator fails during forward propagation, so construct a forward scenario \(see the sketch after this section\).

If an operator error is reported in the model, you are advised to build a single-operator test case to determine the error scenario and cause. If the error cannot be reproduced with the operator alone, construct a context-based single-operator scenario. For details about how to build a test case, see [Single-Operator Sample Building](#single-operator-sample-building.md).
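A hedged sketch of such single-operator cases (the shapes and dtypes are placeholders — substitute the values printed from your own model; mapping the backward pass of max_pool2d to MaxPoolGradWithArgmaxV1 is an assumption based on the figures above):

```
import torch
import torch.nn.functional as F
import torch.npu

torch.npu.set_device("npu:0")

# forward scenario for the max operator
x = torch.rand(2, 16, 32, 32).npu().half()
values, indices = torch.max(x, dim=1)

# backward scenario that exercises the max-pooling gradient
y = torch.rand(2, 16, 32, 32).npu().half()
y.requires_grad_()
F.max_pool2d(y, kernel_size=3).sum().backward()
```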

What Do I Do If the Error Message "ModuleNotFoundError: No module named 'torch.\_C'" Is Displayed When torch Is Called?

## Symptom

![](figures/faq11.png)

## Possible Causes

In the preceding figure, the error path is **.../code/pytorch/torch/\_\_init\_\_.py**, while the current working directory is **.../code/pytorch**. When the **import torch** command is executed, the **torch** folder in the current directory is found first by default, so the PyTorch source folder is imported instead of the torch package installed in the system directory, and an error is reported.

## Solution

Switch to a directory that does not contain a **torch** folder and run the script.

FAQs About Other Operations

- **[What Do I Do If an Error Is Reported During CUDA Stream Synchronization?](#what-do-i-do-if-an-error-is-reported-during-cuda-stream-synchronization.md)**

- **[What Do I Do If aicpu\_kernels/libpt\_kernels.so Does Not Exist?](#what-do-i-do-if-aicpu_kernels-libpt_kernels-so-does-not-exist.md)**

- **[What Do I Do If the Python Process Is Residual When the npu-smi info Command Is Used to View Video Memory?](#what-do-i-do-if-the-python-process-is-residual-when-the-npu-smi-info-command-is-used-to-view-video-m.md)**

- **[What Do I Do If the Error Message "match op inputs failed" Is Displayed When the Dynamic Shape Is Used?](#what-do-i-do-if-the-error-message-match-op-inputs-failed-is-displayed-when-the-dynamic-shape-is-used.md)**

- **[What Do I Do If the Error Message "Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported" Is Displayed?](#what-do-i-do-if-the-error-message-op-type-sigmoidcrossentropywithlogitsv2-of-ops-kernel-aicoreengine.md)**

- **[What Do I Do If a Hook Failure Occurs?](#what-do-i-do-if-a-hook-failure-occurs.md)**

- **[What Do I Do If the Error Message "load state\_dict error." Is Displayed When the Weight Is Loaded?](#what-do-i-do-if-the-error-message-load-state_dict-error-is-displayed-when-the-weight-is-loaded.md)**

What Do I Do If an Error Is Reported During CUDA Stream Synchronization?

## Symptom

![](figures/model_faq11_20210728.jpg)

## Possible Causes

The script calls CUDA stream synchronization, which is not applicable on the NPU; NPU stream synchronization must be used instead.

## Solution

Use NPU stream synchronization:

```
stream = torch.npu.current_stream()
stream.synchronize()
```
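More generally, the common CUDA stream calls have direct NPU counterparts; a sketch of the usual substitutions (assuming the Ascend-adapted PyTorch build used throughout this guide):

```
import torch
import torch.npu

stream = torch.npu.current_stream()   # instead of torch.cuda.current_stream()
stream.synchronize()
torch.npu.synchronize()               # instead of torch.cuda.synchronize()
```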

What Do I Do If aicpu\_kernels/libpt\_kernels.so Does Not Exist?

## Symptom

![](figures/faq13.png)

## Possible Causes

The AI CPU package path is not configured.

## Solution

Configure the AI CPU path. \(The following assumes that the Toolkit software package was installed by the **root** user in the default installation path.\)

```
export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
```

What Do I Do If the Python Process Is Residual When the npu-smi info Command Is Used to View Video Memory?

## Symptom

![](figures/faq14.png)

## Possible Causes

Residual Python processes are still holding NPU memory and need to be terminated.

## Solution

Kill the residual Python processes.

```
pkill -9 python
```

What Do I Do If the Error Message "match op inputs failed"Is Displayed When the Dynamic Shape Is Used?

## Symptom

![](figures/faq15.png)

## Possible Causes

The compiled **PTIndexPut** operator does not match the input shape, and log entries starting with **acl\_dynamic\_shape\_op** are displayed, which indicates that the error is reported for a dynamic shape.

## Solution

**PTIndexPut** corresponds to **tensor\[indices\] = value**. Locate this pattern in the code and change the dynamic shape to a fixed shape, as sketched below.
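A minimal sketch of replacing the dynamic-shape assignment with a fixed-shape equivalent (the tensors are illustrative):

```
import torch

out = torch.zeros(8, 8)
mask = torch.rand(8, 8) > 0.5
value = 1.0

# dynamic shape: the number of elements selected by the mask changes every step
# out[mask] = value

# fixed-shape equivalent: every tensor keeps the same shape at each step
out = torch.where(mask, torch.full_like(out, value), out)
```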

What Do I Do If the Error Message "Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported" Is Displayed?

+ +## Symptom + +``` +[ERROR] GE(24836,python3.7):2021-01-27-18:27:51.562.111 [../../../../../../graphengine/ge/engine_manager/dnnengine_manager.cc:266]25155 GetDNNEngineName: ErrorNo: 1343242282(assign engine failed) GetDNNEngineName:Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported, reason:Op SigmoidCrossEntropyWithLogitsV2 not supported reason: The type of this op is not found in op store, check whether the op store has this type of op. Op store name is tbe-custom. +The dtype, format or shape of input in op desc is not supported in op store, check the dtype, format or shape of input between the op store and the graph. Op store name is tbe-builtin. +``` + +## Possible Causes + +The input data type is not supported by the SigmoidCrossEntropyWithLogitsV2 operator. The possible cause is that the input data type is int64. + +## Solution + +Check the input data type in the Python code and modify the data type. + +
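If the int64 input is, for example, a label tensor fed to a BCE-with-logits style loss (an assumption — check where the int64 tensor originates in your own code), a minimal sketch of the fix is to cast it before the operator is called:

```
import torch

logits = torch.rand(4, 10)
target = torch.randint(0, 2, (4, 10))   # int64 (torch.long) by default
loss = torch.nn.functional.binary_cross_entropy_with_logits(
    logits, target.to(logits.dtype))    # cast to the float dtype of the logits
```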

What Do I Do If a Hook Failure Occurs?

## Symptom

```
Traceback (most recent call last):
  File "tools/train.py", line 227, in 
    main()
  File "tools/train.py", line 221, in main
    meta=meta)
  File "/root/YoloV3/mmdetection/mmdet/apis/train.py", line 192, in train_detector
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 166, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
    self.run_iter(data_batch, train_mode=True)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 30, in run_iter
    outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py", line 100, in train_step
    return self.module.train_step(*inputs[0], **kwargs[0])
  File "/root/YoloV3/mmdetection/mmdet/models/detectors/base.py", line 251, in train_step
    losses = self(**data)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/modules/module.py", line 660, in __call__
    var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
StopIteration
```

## Possible Causes

The loss output structure of mmdet triggers a bug in the native hook mechanism of PyTorch, leading to an infinite loop.

## Solution

In the **/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/modules/module.py** file, wrap the logic starting at line 658 in a **try** block so that the failure is skipped:

```
if len(self._backward_hooks) > 0:
    var = result
    try:
        while not isinstance(var, torch.Tensor):
            if isinstance(var, dict):
                var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
            else:
                var = var[0]
        grad_fn = var.grad_fn
        if grad_fn is not None:
            for hook in self._backward_hooks.values():
                wrapper = functools.partial(hook, self)
                functools.update_wrapper(wrapper, hook)
                grad_fn.register_hook(wrapper)
    except Exception as e:
        print('hook failed..')
        print(str(e))
return result
```

What Do I Do If the Error Message "load state\_dict error." Is Displayed When the Weight Is Loaded?

## Symptom

![](figures/faq18.png)

![](figures/faq18-1.png)

## Possible Causes

The keys of the **state\_dict** saved after model training differ from the keys expected when the model is loaded: when the model was saved, a **module** prefix was added to the beginning of each key.

## Solution

When loading the weights, traverse the **state\_dict** dictionary, strip the prefix from each key, and load the new dictionary. For details about the test case, see **demo.py**.

The script is as follows:

```
ckpt = torch.load("checkpoint.pth", map_location=loc)
# model.load_state_dict(ckpt['state_dict'])
state_dict_old = ckpt['state_dict']
state_dict = {}
for key, value in state_dict_old.items():
    key = key[7:]  # strip the 7-character "module." prefix
    state_dict[key] = value
model.load_state_dict(state_dict)
```
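Alternatively, the prefix can be avoided at save time. A sketch, assuming the model was wrapped in DataParallel or DistributedDataParallel (the wrapper is what adds the **module.** prefix):

```
# save the unwrapped module's weights so no "module." prefix is written
torch.save({'state_dict': model.module.state_dict()}, "checkpoint.pth")
```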

FAQs About Distributed Model Training

+ +- **[What Do I Do If the Error Message "host not found." Is Displayed During Distributed Model Training?](#what-do-i-do-if-the-error-message-host-not-found-is-displayed-during-distributed-model-training.md)** + +- **[What Do I Do If the Error Message "RuntimeError: connect\(\) timed out." Is Displayed During Distributed Model Training?](#what-do-i-do-if-the-error-message-runtimeerror-connect()-timed-out-is-displayed-during-distributed-m.md)** + + +

What Do I Do If the Error Message "host not found." Is Displayed During Distributed Model Training?

+ +## Symptom + +![](figures/faq19.png) + +## Possible Causes + +During distributed model training, the Huawei Collective Communication Library \(HCCL\) is invoked. You need to set the IP address and port number based on the site requirements. The error information indicates that the IP address is incorrect. + +## Solution + +Set the correct IP address in the running script. If a single server is deployed, set the IP address to the IP address of the server. If multiple servers are deployed, set the IP address in the script on each server to the IP address of the active node. + +
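A hedged sketch of the corresponding setting, assuming the training script initializes **torch.distributed** with the standard **env://** method (the address, port, world size, and rank below are placeholders):

```
import os
import torch.distributed as dist

os.environ['MASTER_ADDR'] = '192.168.1.100'   # IP address of the active (master) node
os.environ['MASTER_PORT'] = '29688'           # any free port on that node
dist.init_process_group(backend='hccl', world_size=8, rank=0)
```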

What Do I Do If the Error Message "RuntimeError: connect\(\) timed out." Is Displayed During Distributed Model Training?

+ +## Symptom + +![](figures/1234.png) + +## Possible Causes + +During distributed model training, the system firewall may block the communication of the HCCL port. Check whether the communication port is enabled based on the error information and perform related settings. + +## Solution + +Query the HCCL port that is blocked by the system firewall and enable the port. + diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/1234.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/1234.png new file mode 100644 index 0000000000000000000000000000000000000000..9c65147ed3a49d6e808aa8a514aeffa026c56c96 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/1234.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/20210720-102720(welinkpc).png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/20210720-102720(welinkpc).png new file mode 100644 index 0000000000000000000000000000000000000000..52a28c311af437ee441fd470e47891faa3a2bd12 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/20210720-102720(welinkpc).png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001106016350.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001106016350.png new file mode 100644 index 0000000000000000000000000000000000000000..e95a0361e813dc685d49f524991b80acc490f988 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001106016350.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001106176222.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001106176222.png new file mode 100644 index 0000000000000000000000000000000000000000..677eee796623b1f62b050eb9ff2aa48f86cb7972 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001106176222.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001115716581.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001115716581.png new file mode 100644 index 0000000000000000000000000000000000000000..eff3d25890d212eb91ed0cfb9f2157fa490d9983 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001115716581.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001152616281.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001152616281.png new file mode 100644 index 0000000000000000000000000000000000000000..eff3d25890d212eb91ed0cfb9f2157fa490d9983 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001152616281.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001152616289.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001152616289.png new file mode 100644 index 0000000000000000000000000000000000000000..e95a0361e813dc685d49f524991b80acc490f988 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/en-us_image_0000001152616289.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training 
Guide/figures/faq1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq1.png new file mode 100644 index 0000000000000000000000000000000000000000..53f81e17d826f0aa0002a11873e16bb1f988f179 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq10-1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq10-1.png new file mode 100644 index 0000000000000000000000000000000000000000..b95105e3af29020645ad0a4b77e2d78b84cc2fdd Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq10-1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq10.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq10.png new file mode 100644 index 0000000000000000000000000000000000000000..a0232751b9f354e55b5a7b157ab2fdd8fb79caba Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq10.png differ diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/FAQ12.png" b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq11.png similarity index 100% rename from "docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/FAQ12.png" rename to docs/en/PyTorch Network Model Porting and Training Guide/figures/faq11.png diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq13.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq13.png new file mode 100644 index 0000000000000000000000000000000000000000..773f83071183fb63c410d94b9f658ba901049a3f Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq13.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq14.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq14.png new file mode 100644 index 0000000000000000000000000000000000000000..c1201b3f572aa01e2c91ed7959d4466d768d5723 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq14.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq15.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq15.png new file mode 100644 index 0000000000000000000000000000000000000000..b2ea57e76acbe91b86c910ddb44a27890fe94ff0 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq15.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq18-1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq18-1.png new file mode 100644 index 0000000000000000000000000000000000000000..9634db02b1c76d601aba391180b3d84ab502c901 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq18-1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq18.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq18.png new file mode 100644 index 0000000000000000000000000000000000000000..253af857e9ff65a4f67ca154d323ca27769fcaef Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq18.png 
differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq19.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq19.png new file mode 100644 index 0000000000000000000000000000000000000000..ffc2a914b6ca1f99b27e52d7fd6d33de7475e566 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq19.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq2.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq2.png new file mode 100644 index 0000000000000000000000000000000000000000..ab0a9f7e0aae085338f2324aeb9464a3c25d5090 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq2.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq3.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq3.png new file mode 100644 index 0000000000000000000000000000000000000000..970b050c2c46f29e9d09ff401e243ea6fb06804b Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq3.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq4.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq4.png new file mode 100644 index 0000000000000000000000000000000000000000..1a813e3ddbcabd36646defdcc63b9a0fdbb7e1a9 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq4.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq5.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq5.png new file mode 100644 index 0000000000000000000000000000000000000000..5c2019b795b4a165e7b150395739922ccacb8253 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq5.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq6-1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq6-1.png new file mode 100644 index 0000000000000000000000000000000000000000..134ed666fb21e075885226fa4039d84ff4e6642c Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq6-1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq6.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq6.png new file mode 100644 index 0000000000000000000000000000000000000000..5f0ab093a0c1c35b1c948e4ac2555a890bf73a05 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq6.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq7.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq7.png new file mode 100644 index 0000000000000000000000000000000000000000..d7871f3d6ba9b3c2e37c79f886a7e6cb93147c5a Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq7.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq8-1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq8-1.png new file mode 100644 index 0000000000000000000000000000000000000000..0316905729ff0cd82806961565b947cb7655acb1 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq8-1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training 
Guide/figures/faq8.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq8.png new file mode 100644 index 0000000000000000000000000000000000000000..c1950311e49af3ea74b28c02afe3e77938788396 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq8.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq9-1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq9-1.png new file mode 100644 index 0000000000000000000000000000000000000000..ec178d5e25a2e60ef4a1a25b80bd24271e25bb02 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq9-1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq9.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq9.png new file mode 100644 index 0000000000000000000000000000000000000000..70e4bc5824c836a894f3e3e3c6c87c276efd15ec Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/faq9.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/model_faq11_20210728.jpg b/docs/en/PyTorch Network Model Porting and Training Guide/figures/model_faq11_20210728.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ac24282446804eb5ee80070a09978910919d103a Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/model_faq11_20210728.jpg differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/performance-config.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/performance-config.png new file mode 100644 index 0000000000000000000000000000000000000000..d75a5dbc4e684169cfd311d0b4c94d5283c1e762 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/performance-config.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/porting-process.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/porting-process.png new file mode 100644 index 0000000000000000000000000000000000000000..7e353b09cd92b5dd50394763dadbd414fb06ab1d Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/porting-process.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/pth-file.jpg b/docs/en/PyTorch Network Model Porting and Training Guide/figures/pth-file.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7a64f87cec889e72b09d352522ae96b06c7694b6 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/pth-file.jpg differ diff --git "a/docs/en/PyTorch Network Model Porting and Training Guide/figures/pytorch\351\200\202\351\205\215\351\200\273\350\276\221\347\273\223\346\236\204\345\233\276-\344\274\230\345\214\226.png" "b/docs/en/PyTorch Network Model Porting and Training Guide/figures/pytorch\351\200\202\351\205\215\351\200\273\350\276\221\347\273\223\346\236\204\345\233\276-\344\274\230\345\214\226.png" new file mode 100644 index 0000000000000000000000000000000000000000..207410d4779ad9a94bbb8c92c0a60bc384af83bf Binary files /dev/null and "b/docs/en/PyTorch Network Model Porting and Training Guide/figures/pytorch\351\200\202\351\205\215\351\200\273\350\276\221\347\273\223\346\236\204\345\233\276-\344\274\230\345\214\226.png" differ diff --git a/docs/en/PyTorch Network Model Porting and Training 
Guide/figures/remote-console-0.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/remote-console-0.png new file mode 100644 index 0000000000000000000000000000000000000000..11ad5582b602d91067c8a351a88f0c986e1774da Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/remote-console-0.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/remote-console.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/remote-console.png new file mode 100644 index 0000000000000000000000000000000000000000..11ad5582b602d91067c8a351a88f0c986e1774da Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/remote-console.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/setting-the-power-policy-2.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/setting-the-power-policy-2.png new file mode 100644 index 0000000000000000000000000000000000000000..93e4f5aa984a7f2e6c2e5298a8d59e4d3d0e9aab Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/setting-the-power-policy-2.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/setting-the-power-policy.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/setting-the-power-policy.png new file mode 100644 index 0000000000000000000000000000000000000000..3c31df7fe027517b9eec9ec717d06e3d75b2f3c7 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/setting-the-power-policy.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/socket-configuration.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/socket-configuration.png new file mode 100644 index 0000000000000000000000000000000000000000..025284eeae4bcc43c74979d1e142a4bb77d63096 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/socket-configuration.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/startup-item-tool-1.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/startup-item-tool-1.png new file mode 100644 index 0000000000000000000000000000000000000000..c9e38eb8879ca076436eb4a77c56586597ebb72d Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/startup-item-tool-1.png differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/figures/startup-item-tool.png b/docs/en/PyTorch Network Model Porting and Training Guide/figures/startup-item-tool.png new file mode 100644 index 0000000000000000000000000000000000000000..c9e38eb8879ca076436eb4a77c56586597ebb72d Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/figures/startup-item-tool.png differ diff --git "a/docs/en/PyTorch Network Model Porting and Training Guide/figures/\346\214\207\345\256\232\347\256\227\345\255\220\345\210\235\345\247\213\345\214\226\346\226\271\345\274\217.png" "b/docs/en/PyTorch Network Model Porting and Training Guide/figures/\346\214\207\345\256\232\347\256\227\345\255\220\345\210\235\345\247\213\345\214\226\346\226\271\345\274\217.png" new file mode 100644 index 0000000000000000000000000000000000000000..35584844f7a44aa3c0076d7e1bdf7259f3479bcc Binary files /dev/null and "b/docs/en/PyTorch Network Model Porting and Training 
Guide/figures/\346\214\207\345\256\232\347\256\227\345\255\220\345\210\235\345\247\213\345\214\226\346\226\271\345\274\217.png" differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-caution.gif b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-caution.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-caution.gif differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-danger.gif b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-danger.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-danger.gif differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-note.gif b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-note.gif new file mode 100644 index 0000000000000000000000000000000000000000..6314297e45c1de184204098efd4814d6dc8b1cda Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-note.gif differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-notice.gif b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-notice.gif new file mode 100644 index 0000000000000000000000000000000000000000..86024f61b691400bea99e5b1f506d9d9aef36e27 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-notice.gif differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-tip.gif b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-tip.gif new file mode 100644 index 0000000000000000000000000000000000000000..93aa72053b510e456b149f36a0972703ea9999b7 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-tip.gif differ diff --git a/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-warning.gif b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-warning.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Network Model Porting and Training Guide/public_sys-resources/icon-warning.gif differ diff --git a/docs/en/PyTorch Online Inference User Guide/PyTorch Online Inference User Guide.md b/docs/en/PyTorch Online Inference User Guide/PyTorch Online Inference User Guide.md new file mode 100644 index 0000000000000000000000000000000000000000..933c21999e1af0a5f1b61c00ab235580cf1531f5 --- /dev/null +++ b/docs/en/PyTorch Online Inference User Guide/PyTorch Online Inference User Guide.md @@ -0,0 +1,599 @@ +# PyTorch Online Inference Guide +- [Application Scenario](#application-scenario.md) +- [Basic Workflow](#basic-workflow.md) + - [Prerequisites](#prerequisites.md) + - [Online Inference Process](#online-inference-process.md) + - [Environment Variable Configuration](#environment-variable-configuration.md) + - [Sample Reference](#sample-reference.md) +- 
[Special Topics](#special-topics.md) + - [Mixed Precision](#mixed-precision.md) +- [How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md) +

Application Scenario

Unlike offline inference, online inference allows developers to run inference directly on PyTorch models by using the **model.eval\(\)** method.

In this way, PyTorch-based inference applications can be ported directly to the Ascend AI Processor, which is especially useful in data center inference scenarios.

## Supported Processors

Ascend 910 AI Processor

Ascend 710 AI Processor

Basic Workflow

+ +- **[Prerequisites](#prerequisites.md)** + +- **[Online Inference Process](#online-inference-process.md)** + +- **[Environment Variable Configuration](#environment-variable-configuration.md)** + +- **[Sample Reference](#sample-reference.md)** + + +

Prerequisites

The PyTorch framework and the mixed precision module have been installed. For details, see the _PyTorch Installation Guide_.

Online Inference Process

+ +[Figure 1](#fig13802941161818) shows the online inference process. + +**Figure 1** Online inference process +![](figures/online-inference-process.png "online-inference-process") + +

Environment Variable Configuration

The following environment variables are required for starting the inference process on PyTorch:

```
# Set the environment variable for the installation path of the infrastructure software on which online inference depends. The following assumes that the installation user is HwHiAiUser and the default installation path is used.
# Method 1: Install Ascend-CANN-Toolkit for inference on an Ascend AI device, which serves as the development environment.
. /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
# Method 2: Install Ascend-CANN-NNAE on an Ascend AI device.
. /home/HwHiAiUser/Ascend/nnae/set_env.sh

# If multiple Python 3 versions exist in the operating environment, configure the installation path of Python 3.7.5 in the environment variables.
export PATH=/usr/local/python3.7.5/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib:$LD_LIBRARY_PATH

# Set the logical ID of the processor to use.
export ASCEND_DEVICE_ID=0

# Log output settings. Adjust them as required.
export ASCEND_SLOG_PRINT_TO_STDOUT=1
export ASCEND_GLOBAL_LOG_LEVEL=0

# Multi-thread task delivery (0: disabled; 1: enabled)
export TASK_QUEUE_ENABLE=0
```

**Table 1** Description of environment variables

| Environment Variable | Description | Required/Optional |
| --- | --- | --- |
| LD_LIBRARY_PATH | Dynamic library search path. Set this variable based on the preceding example. NOTE: If GCC 7.3.0 is installed in an OS such as CentOS 7.6, Debian, or BCLinux, configure the related environment variable. For details, see step 5 in [How Do I Install GCC 7.3.0?](#how-do-i-install-gcc-7-3-0.md). | Required |
| PATH | Executable program search path. Set this variable based on the preceding example. | Required |
| ASCEND_DEVICE_ID | Logical ID of the processor. The value range is [0, N – 1] and the default value is 0, where N is the device count in the physical machine, VM, or container. | Optional |
| ASCEND_SLOG_PRINT_TO_STDOUT | Enables or disables log printing to stdout. 0 or not configured: disabled; 1: enabled. | Optional |
| ASCEND_GLOBAL_LOG_LEVEL | Global log level. 0: DEBUG; 1: INFO; 2: WARNING; 3: ERROR; 4: NULL (no log output); other values: invalid. | Optional |
| TASK_QUEUE_ENABLE | Enables or disables task delivery in multi-thread mode, which improves end-to-end training performance in most cases. 0 or not configured: disabled; 1: enabled. | Optional |

>![](public_sys-resources/icon-note.gif) **NOTE:**
>For more log information, see the _CANN Log Reference_.

Sample Reference

## Sample Code

During inference, initialize the model once and reuse it across the application lifetime rather than reinitializing it for each request. The inference mode is set using the **model.eval\(\)** method, and the inference process must run under the code branch **with torch.no\_grad\(\):**.

The following uses Python code of the ResNet-50 network as an example for description.

**resnet50\_infer\_for\_pytorch.py** sample code:

```
import argparse
import os
import time
import torch
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

import torch.npu
from apex import amp

model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))


def parse_args():
    """User-defined dataset path and model path"""
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('--data', metavar='DIR', default="/data/imagenet",
                        help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                        choices=model_names,
                        help='model architecture: ' +
                             ' | '.join(model_names) +
                             ' (default: resnet50)')

    parser.add_argument('--epochs', default=100, type=int, metavar='N',
                        help='number of total epochs to run')

    parser.add_argument('-b', '--batch_size', default=512, type=int,
                        metavar='N',
                        help='mini-batch size (default: 512), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')

    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')

    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model')

    parser.add_argument('--npu', default=None, type=int,
                        help='NPU id to use.')

    parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                        help='number of data loading workers (default: 32)')

    parser.add_argument('--lr', '--learning_rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--wd', '--weight_decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')

    args, unknown_args = parser.parse_known_args()
    if len(unknown_args) > 0:
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)
        raise ValueError("Invalid command line arg(s)")

    return args


# =========================================================================
# Main function entry
# =========================================================================
def main():
    args = parse_args()
    if args.npu is None:
        args.npu = 0
    global CALCULATE_DEVICE
    CALCULATE_DEVICE = "npu:{}".format(args.npu)
    torch.npu.set_device(CALCULATE_DEVICE)
    print("use ", CALCULATE_DEVICE)
    main_worker(args.npu, args)


def main_worker(npu, args):
    global best_acc1
    args.npu = npu

    # =========================================================================
    # Create a model.
    # =========================================================================
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch](zero_init_residual=True)

    # Copy the model data to the Ascend AI Processor.
    model = model.to(CALCULATE_DEVICE)

    optimizer = torch.optim.SGD([
        {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'], 'weight_decay': 0.0},
        {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'],
         'weight_decay': args.weight_decay}],
        args.lr)

    # =========================================================================
    # Initialize the mixed precision model. Mixed precision accelerates computation but may slightly reduce accuracy. Decide whether to use it based on the actual scenario.
    # =========================================================================
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1)

    # =========================================================================
    # Load the trained model parameters.
    # =========================================================================
    # Restore model parameters from the model file.
    if os.path.isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)

        best_acc1 = checkpoint['best_acc1']
        best_acc1 = best_acc1.to("npu:{}".format(args.npu))

        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

    else:
        print("=> no checkpoint found at '{}'".format(args.resume))

    # =========================================================================
    # Initialize the dataset.
    # =========================================================================
    # Load and preprocess image data.
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    # =========================================================================
    # Run validation.
    # =========================================================================
    validate(val_loader, model, args)


# =========================================================================
# Implement the sample API for online inference.
# =========================================================================
def validate(val_loader, model, args):
    batch_time = AverageMeter('Time', ':6.3f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, top1, top5],
        prefix='Test: ')

    # =========================================================================
    # Switch to the inference mode.
    # =========================================================================
    model.eval()

    # =========================================================================
    # Execute the model forward propagation under the torch.no_grad(): branch.
    # =========================================================================
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):

            # Place the image data on the NPU.
            images = images.to(CALCULATE_DEVICE, non_blocking=True)
            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)

            # Calculate the output.
            output = model(images)

            # Collect statistics on result precisions.
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # Measure the running time.
            batch_time.update(time.time() - end)
            end = time.time()

            # Print inference logs.
            progress.display(i)

    print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    return top1.avg


class AverageMeter(object):
    """Calculate and store the average value and current value."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()
        self.start_count_index = 10

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        if self.count == 0:
            self.batchsize = n

        self.val = val
        self.count += n
        if self.count > (self.start_count_index * self.batchsize):
            self.sum += val * n
            self.avg = self.sum / (self.count - self.start_count_index * self.batchsize)

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    """Record model computing information."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def accuracy(output, target, topk=(1,)):
    """Calculate the precision of the k top predictions for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()
```

## Sample Running

The following uses the ResNet-50 model as an example to describe how to perform online inference.

1. Download a pre-trained model.

   Visit [Ascend ModelZoo](https://www.hiascend.com/en/software/modelzoo) and click **Download Model** to download a pre-trained ResNet-50 model.

2. Edit the inference script.

   Create a model script file **resnet50\_infer\_for\_pytorch.py** and write code by referring to [Sample Code]().

3. Run inference.

   Set environment variables by referring to [Environment Variable Configuration](#environment-variable-configuration.md) and then run the following command:

   ```
   python3 resnet50_infer_for_pytorch.py --data /data/imagenet \
       --npu 7 \
       --epochs 90 \
       --resume ./checkpoint.pth.tar    # ./checkpoint.pth.tar is the path of the sample pre-trained model file.
   ```

   >![](public_sys-resources/icon-note.gif) **NOTE:**
   >The preceding command is an example only. Modify the arguments as needed.

Special Topics

+ +- **[Mixed Precision](#mixed-precision.md)** + + +

Mixed Precision

## Overview

Based on the architecture of the NPU, mixed precision — using the float16 and float32 data types together — is involved in model computing. Replacing float32 with float16 has the following advantages:

- The memory usage of intermediate variables is reduced.
- The data transfer time decreases because the memory usage is reduced.
- The compute units for float16 provide better computing performance.

However, mixed precision is limited by the numeric range that float16 can express: converting float32 to float16 can affect training convergence. To use float16 for acceleration in some computations while ensuring convergence, the mixed precision module Apex is used. Apex is a comprehensive optimization library that offers both high performance and good precision.

## Supported Features

[Table 1](#en-us_topic_0278765773_table10717173813332) describes the functions and optimizations of the mixed precision module.

**Table 1** Functions of the mixed precision module

| Function | Description |
| --- | --- |
| O1 configuration | Conv and Matmul use float16 for computing; Softmax and BN use float32. |
| O2 configuration | BN uses float32; all other operators use float16. |
| Static loss scale | A fixed loss scale value is set to ensure the convergence of mixed precision training. |
| Dynamic loss scale | The loss scale value is adjusted dynamically by detecting whether overflow occurs. |

>![](public_sys-resources/icon-note.gif) **NOTE:**
>In the current version, Apex is implemented using Python and does not support AscendCL or CUDA optimization.

## Initializing the Mixed Precision Model

1. To use the mixed precision module Apex, import the amp module from the Apex library as follows:

   ```
   from apex import amp
   ```

2. After the amp module is imported, initialize it so that it can modify the model, optimizer, and PyTorch internal functions. The initialization code is as follows:

   ```
   model, optimizer = amp.initialize(model, optimizer)
   ```

   For details, see the "Initialize the mixed precision model." step in [Sample Code](#sample-reference.md):

   ```
   model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1)
   ```

## Mixed Precision Inference

After the mixed precision model is initialized, perform model forward propagation.

Sample code: For details, see the implementation of **validate\(val\_loader, model, args\)** in [Sample Code](#sample-reference.md).

<h2 id="how-do-i-install-gcc-7-3-0.md">How Do I Install GCC 7.3.0?</h2>

+
+Perform the following steps as the **root** user.
+
+1.  Download **gcc-7.3.0.tar.gz** from [https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz](https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz).
+2.  To install GCC, you need to reserve adequate temporary space. Run the following command to clear the **/tmp** directory before the installation:
+
+    ```
+    sudo rm -rf /tmp/*
+    ```
+
+3.  Install the dependency package. \(CentOS and Ubuntu are used as examples.\)
+    -   For CentOS, run the following command:
+
+        ```
+        yum install bzip2
+        ```
+
+    -   For Ubuntu, run the following command:
+
+        ```
+        apt-get install bzip2
+        ```
+
+4.  Build and install GCC.
+    1.  Go to the directory where the source code package **gcc-7.3.0.tar.gz** is located and run the following command to extract it:
+
+        ```
+        tar -zxvf gcc-7.3.0.tar.gz
+        ```
+
+    2.  Go to the extracted directory and run the following command to download the GCC dependency packages:
+
+        ```
+        cd gcc-7.3.0
+        ./contrib/download_prerequisites
+        ```
+
+        If an error is reported during the command execution, run the following commands in the **gcc-7.3.0/** directory to download the dependency packages:
+
+        ```
+        wget http://gcc.gnu.org/pub/gcc/infrastructure/gmp-6.1.0.tar.bz2
+        wget http://gcc.gnu.org/pub/gcc/infrastructure/mpfr-3.1.4.tar.bz2
+        wget http://gcc.gnu.org/pub/gcc/infrastructure/mpc-1.0.3.tar.gz
+        wget http://gcc.gnu.org/pub/gcc/infrastructure/isl-0.16.1.tar.bz2
+        ```
+
+        After downloading the preceding dependency packages, run the following command again:
+
+        ```
+        ./contrib/download_prerequisites
+        ```
+
+        If the verification fails, check whether any dependency package was downloaded more than once. Each package must be downloaded exactly once.
+
+    3.  Run the following commands for configuration, build, and installation.
+
+        ```
+        ./configure --enable-languages=c,c++ --disable-multilib --with-system-zlib --prefix=/usr/local/linux_gcc7.3.0
+        make -j15    # The value 15 indicates the number of CPUs. It is configurable; query the CPU count with: grep -w processor /proc/cpuinfo|wc -l
+        make install
+        ```
+
+        >![](public_sys-resources/icon-notice.gif) **NOTICE:**
+        >The **--prefix** option specifies the linux\_gcc7.3.0 installation path and is configurable. Do not set it to **/usr/local** or **/usr**, the default installation paths of the GCC installed from the software source. Otherwise, a conflict occurs and the original GCC compilation environment of the system is damaged. In this example, the installation path is set to **/usr/local/linux\_gcc7.3.0**.
+
+
+5.  Set the environment variable.
+
+    Training requires the build environment of the upgraded GCC. Therefore, configure the following environment variable in the training script:
+
+    ```
+    export LD_LIBRARY_PATH=${install_path}/lib64:${LD_LIBRARY_PATH}
+    ```
+
+    **$\{install\_path\}** indicates the GCC 7.3.0 installation path configured in [3.](#en-us_topic_0000001146754749_en-us_topic_0000001072593337_l75d31a2874534a2092e80a5f865b46f0). In this example, the GCC 7.3.0 installation path is **/usr/local/linux\_gcc7.3.0/**.
+
+    >![](public_sys-resources/icon-note.gif) **NOTE:**
+    >The environment variable needs to be configured only when you need to use the build environment after the GCC upgrade.
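+After installation, a quick check such as the following (using the example installation path above) confirms that the new toolchain is in place:
+
+```
+/usr/local/linux_gcc7.3.0/bin/gcc --version    # expected output starts with: gcc (GCC) 7.3.0
+```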
+ + diff --git a/docs/en/PyTorch Online Inference User Guide/figures/online-inference-process.png b/docs/en/PyTorch Online Inference User Guide/figures/online-inference-process.png new file mode 100644 index 0000000000000000000000000000000000000000..b42f502fe54acdb6a38472879e3bf2866a391b44 Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/figures/online-inference-process.png differ diff --git a/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-caution.gif b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-caution.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-caution.gif differ diff --git a/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-danger.gif b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-danger.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-danger.gif differ diff --git a/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-note.gif b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-note.gif new file mode 100644 index 0000000000000000000000000000000000000000..6314297e45c1de184204098efd4814d6dc8b1cda Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-note.gif differ diff --git a/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-notice.gif b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-notice.gif new file mode 100644 index 0000000000000000000000000000000000000000..86024f61b691400bea99e5b1f506d9d9aef36e27 Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-notice.gif differ diff --git a/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-tip.gif b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-tip.gif new file mode 100644 index 0000000000000000000000000000000000000000..93aa72053b510e456b149f36a0972703ea9999b7 Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-tip.gif differ diff --git a/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-warning.gif b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-warning.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Online Inference User Guide/public_sys-resources/icon-warning.gif differ diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md new file mode 100644 index 0000000000000000000000000000000000000000..65fe9c21471de4d49984dcb61cd099b5343b6403 --- /dev/null +++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md @@ -0,0 +1,1267 @@ +# PyTorch Operator Development Guide +- [Introduction](#introduction.md) +- [Operator Development Process](#operator-development-process.md) +- [Operator Development Preparations](#operator-development-preparations.md) + - [Setting Up the Environment](#setting-up-the-environment.md) + - [Looking Up 
Operators](#looking-up-operators.md)
+- [Operator Adaptation](#operator-adaptation.md)
+  - [Prerequisites](#prerequisites.md)
+  - [Obtaining the PyTorch Source Code](#obtaining-the-pytorch-source-code.md)
+  - [Registering Operator Development](#registering-operator-development.md)
+  - [Developing an Operator Adaptation Plugin](#developing-an-operator-adaptation-plugin.md)
+  - [Compiling and Installing the PyTorch Framework](#compiling-and-installing-the-pytorch-framework.md)
+- [Operator Function Verification](#operator-function-verification.md)
+  - [Overview](#overview.md)
+  - [Implementation](#implementation.md)
+- [FAQs](#faqs.md)
+  - [Pillow==5.3.0 Installation Failed](#pillow-5-3-0-installation-failed.md)
+  - [pip3.7 install torchvision Installation Failed](#pip3-7-install-torchvision-installation-failed.md)
+  - ["torch 1.5.0xxxx" and "torchvision" Do Not Match When torch-\*.whl Is Installed](#torch-1-5-0xxxx-and-torchvision-do-not-match-when-torch--whl-is-installed.md)
+  - [How Do I View Test Run Logs?](#en-us_topic_0000001117914770.md)
+  - [What Is the Meaning of the NPU Error Code Output During the Test? Is There Any Corresponding Explanation?](#what-is-the-meaning-of-the-npu-error-code-output-during-the-test-is-there-any-corresponding-explanat.md)
+  - [Why Cannot the Custom TBE Operator Be Called?](#why-cannot-the-custom-tbe-operator-be-called.md)
+  - [How Do I Determine Whether the TBE Operator Is Correctly Called for PyTorch Adaptation?](#how-do-i-determine-whether-the-tbe-operator-is-correctly-called-for-pytorch-adaptation.md)
+  - [PyTorch Compilation Fails and the Message "error: ld returned 1 exit status" Is Displayed](#pytorch-compilation-fails-and-the-message-error-ld-returned-1-exit-status-is-displayed.md)
+  - [PyTorch Compilation Fails and the Message "error: call of overload...." Is Displayed](#pytorch-compilation-fails-and-the-message-error-call-of-overload-is-displayed.md)
+- [Appendixes](#appendixes.md)
+  - [Installing CMake](#installing-cmake.md)
+  - [Exporting a Custom Operator](#exporting-a-custom-operator.md)
+

<h2 id="introduction.md">Introduction</h2>

+ +## Overview + +To enable the PyTorch deep learning framework to run on Ascend AI Processors, you need to use Tensor Boost Engine \(TBE\) to customize the framework operators. + +

<h2 id="operator-development-process.md">Operator Development Process</h2>

+
+PyTorch operator development includes TBE operator development and operator adaptation to the PyTorch framework.
+
+1.  TBE operator development: If an operator on your network is incompatible with the Ascend AI Software Stack, you need to develop a TBE operator and then adapt it to the PyTorch framework.
+
+    For details about the TBE operator development process and methods, see the _CANN TBE Custom Operator Development Guide_.
+
+2.  Operator adaptation to the PyTorch framework: If a TBE operator has been developed and is compatible with the Ascend AI Software Stack, you can directly adapt it to the PyTorch framework.
+
+    The following figure shows the operator adaptation process in the PyTorch framework.
+
+    **Figure 1**  Operator adaptation process in the PyTorch framework
+    ![](figures/operator-adaptation-process-in-the-pytorch-framework.png "operator-adaptation-process-in-the-pytorch-framework")
+
+
+**Table 1**  Description of the operator development process
+
+| No. | Procedure | Description | Reference |
+| --- | --- | --- | --- |
+| 1 | Set up the environment. | Set up the development and operating environments required for operator development, execution, and verification. | Operator Development Preparations |
+| 2 | Look up operators. | View the list of supported TBE operators and the list of operators adapted to PyTorch. | List of operators supported by Ascend AI Processors, with detailed specifications and constraints; list of operators adapted to PyTorch |
+| 3 | Obtain the PyTorch source code. | Obtain the PyTorch source code from the Ascend Community. | Operator Adaptation |
+| 4 | Register an operator. | Dispatch the operator to the Ascend AI Processor. | Operator Adaptation |
+| 5 | Develop the operator adaptation layer. | Develop the operator adaptation layer to map the attributes of operators based on third-party frameworks to those of the operators adapted to Ascend AI Processors. | Operator Adaptation |
+| 6 | Compile and install the PyTorch framework. | Compile the adapted PyTorch source code, and install the compiled source package. | Operator Adaptation |
+| 7 | Verify the operator functions. | Verify the operator functions in the real-world hardware environment. | Operator Function Verification |
+

<h2 id="operator-development-preparations.md">Operator Development Preparations</h2>

+ +- **[Setting Up the Environment](#setting-up-the-environment.md)** + +- **[Looking Up Operators](#looking-up-operators.md)** + + +

<h2 id="setting-up-the-environment.md">Setting Up the Environment</h2>

+ +## Prerequisites + +- The development or operating environment of CANN has been installed. For details, see the _CANN Software Installation Guide_. +- CMake 3.12.0 or later has been installed. For details, see [Installing CMake](#installing-cmake.md). +- GCC 7.3.0 or later has been installed. For details about how to install and use GCC 7.3.0, see "Installing GCC 7.3.0" in the _CANN Software Installation Guide_. +- The Git tool has been installed. To install Git for Ubuntu and CentOS, run the following commands: + - Ubuntu + + ``` + apt-get install git + ``` + + - CentOS + + ``` + yum install git + ``` + + + +## Installing the PyTorch Environment Dependencies + +If you install Python and its dependencies as a non-root user, add **--user** at the end of each command to ensure that the installation is successful. Example command: **pip3.7 install pyyaml --user** + +``` +pip3.7 install pyyaml +pip3.7 install wheel +pip3.7 install Pillow==5.3.0 +``` + +>![](public_sys-resources/icon-note.gif) **NOTE:** +>If an error is reported in the preceding process, rectify the fault by referring to [FAQs](#faqs.md). + +
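+If you want to verify that the dependencies were installed into the Python 3.7 environment, a quick import check like the following can be used (this check is illustrative, not part of the original procedure):
+
+```
+python3.7 -c "import yaml, wheel, PIL; print('pyyaml/wheel/Pillow OK, Pillow', PIL.__version__)"
+```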

<h2 id="looking-up-operators.md">Looking Up Operators</h2>

+ +During operator development, you can query the list of operators supported by Ascend AI Processors and the list of operators adapted to PyTorch. Develop or adapt operators to PyTorch based on the query result. + +- If an operator is not supported by the Ascend AI Processor, develop a TBE operator and adapt the operator to the PyTorch framework. +- If an operator is supported by the Ascend AI Processor but has not been adapted to the PyTorch framework, you only need to adapt the operator to the PyTorch framework. +- If an operator has been adapted to the PyTorch framework, you can directly use the operator without development or adaptation. + +The following describes how to query the operators supported by Ascend AI Processors as well as operators adapted to PyTorch. + +- You can query the operators supported by Ascend AI Processors and the corresponding operator constraints in either of the following modes: + - For operator development on the command line, you can perform offline lookup. For details, see the _CANN Operator List \(Ascend 910\)_. + - For operator development using MindStudio, you can perform online lookup on MindStudio. For details, see "Supported Operators and Models" in the _MindStudio User Guide_. + +- For the list of operators adapted to PyTorch, see the _PyTorch Adapted Operator List_. + +

<h2 id="operator-adaptation.md">Operator Adaptation</h2>

+ +- **[Prerequisites](#prerequisites.md)** + +- **[Obtaining the PyTorch Source Code](#obtaining-the-pytorch-source-code.md)** + +- **[Registering Operator Development](#registering-operator-development.md)** + +- **[Developing an Operator Adaptation Plugin](#developing-an-operator-adaptation-plugin.md)** + +- **[Compiling and Installing the PyTorch Framework](#compiling-and-installing-the-pytorch-framework.md)** + + +

<h2 id="prerequisites.md">Prerequisites</h2>

+ +- The development and operating environments have been set up, and related dependencies have been installed. For details, see [Setting Up the Environment](#setting-up-the-environment.md). +- TBE operators have been developed and deployed. For details, see the _CANN TBE Custom Operator Development Guide_. + +

<h2 id="obtaining-the-pytorch-source-code.md">Obtaining the PyTorch Source Code</h2>

+
+Visit [https://gitee.com/ascend/pytorch-develop](https://gitee.com/ascend/pytorch-develop) to obtain the PyTorch source code adapted to the Ascend AI Processor. Run the following **git** command to download the source code:
+
+```
+git clone https://gitee.com/ascend/pytorch-develop.git --depth=1
+```
+
+After the download is successful, the PyTorch file directory is generated.
+
+>![](public_sys-resources/icon-note.gif) **NOTE:**
+>If you do not have the permission to obtain the code, contact Huawei technical support to join the **Ascend** organization.
+

<h2 id="registering-operator-development.md">Registering Operator Development</h2>

+ +## Overview + +Currently, the NPU adaptation dispatch principle is as follows: The NPU operator is directly dispatched as the NPU adaptation function without being processed by the common function of the framework. That is, the operator execution call stack contains only the function call of the NPU adaptation and does not contain the common function of the framework. During compilation, the PyTorch framework generates the calling description of the middle layer of the new operator based on the definition in **native\_functions.yaml** and the type and device dispatch principle defined in the framework. For NPUs, the description is generated in **build/aten/src/ATen/NPUType.cpp**. + +## Registering an Operator + +1. Open the **native\_functions.yaml** file. + + The **native\_functions.yaml** file defines all operator function prototypes, including function names and parameters. Each operator function supports dispatch information of different hardware platforms. The file is in the **pytorch/aten/src/ATen/native/native\_functions.yaml** directory. + +2. Determine the functions to be dispatched. + - Existing operator in the YAML file + + Dispatch all functions related to the operator to be adapted. + + - Custom operator that does not exist in the YAML file + + The YAML file does not contain the operator information. Therefore, you need to manually add related functions, including the function names, parameters, and return types. + + ``` + - func: operator name (input parameter information) -> return type + ``` + +3. Modify the **native\_functions.yaml** file and add the dispatch description of the functions related to the operator. + + Regulations on the YAML files: + + - The keyword **npu\_dispatch** is used for adapting the original operator functions in the YAML file. + + ``` + npu_dispatch: + NPU: NPU_Adapt_Fun_Name + ``` + + - The keyword **npu\_dispatch\_only** is used for adapting custom operator functions in the YAML file. + + ``` + npu_dispatch_only: + NPU: NPU_Adapt_Fun_Name + ``` + + >![](public_sys-resources/icon-note.gif) **NOTE:** + >The formats of _NPU\_Adapt\_Fun\_Name_ are as follows: + >- If the original _NPU\_Adapt\_Fun\_Name_ does not have the suffix **\_**, the format is _NPU\_Adapt\_Fun\_Name_ + **\_** + **npu**, for example, **add** --\> **add\_npu**. + >- If the original _NPU\_Adapt\_Fun\_Name_ has the suffix **\_**, the format is _NPU\_Adapt\_Fun\_Name_ + **npu\_**, for example, **add\_** --\> **add\_npu\_**. + >The formats are for reference only. The function name during operator adaptation must be the same as **NPU\_Adapt\_Fun\_Name**. + + +## Example + +The following uses the torch.add\(\) operator as an example to describe how to register an operator. + +1. Open the **native\_functions.yaml** file. +2. Determine related functions. + + Search for **add** in the YAML file and find the functions related to the add operator. + +3. Add the dispatch description. + 1. Dispatch description of **add.Tensor** + + ``` + - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU: add + CUDA: add + SparseCPU: add_sparse + SparseCUDA: add_sparse + MkldnnCPU: mkldnn_add + #Add the dispatch description. + npu_dispatch: + NPU: add_npu + supports_named_tensor: True + ``` + + 2. 
Dispatch description of **add.Scalar** + + ``` + - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function, method + supports_named_tensor: True + #Add the dispatch description. + npu_dispatch: + NPU: add_npu + ``` + + 3. Dispatch description of **add\_.Tensor** + + ``` + - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + variants: method + dispatch: + CPU: add_ + CUDA: add_ + SparseCPU: add_sparse_ + SparseCUDA: add_sparse_ + MkldnnCPU: mkldnn_add_ + #Add the dispatch description. + npu_dispatch: + NPU: add_npu_ + supports_named_tensor: True + ``` + + 4. Dispatch description of **add\_.Scalar** + + ``` + - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + variants: method + supports_named_tensor: True + #Add the dispatch description. + npu_dispatch: + NPU: add_npu_ + ``` + + 5. Dispatch description of **add.out** + + ``` + - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: add_out + CUDA: add_out + SparseCPU: add_out_sparse_cpu + SparseCUDA: add_out_sparse_cuda + MkldnnCPU: mkldnn_add_out + #Add the dispatch description. + npu_dispatch: + NPU: add_out_npu + supports_named_tensor: True + ``` + + + +
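+For a custom operator that does not yet exist in the YAML file, the entry combines the function prototype with the **npu\_dispatch\_only** keyword described above. The following sketch uses a purely illustrative operator name (**my\_custom\_op**); substitute your own prototype and adaptation function name:
+
+```
+- func: my_custom_op(Tensor self, Scalar alpha=1) -> Tensor
+  npu_dispatch_only:
+    NPU: my_custom_op_npu
+```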

<h2 id="developing-an-operator-adaptation-plugin.md">Developing an Operator Adaptation Plugin</h2>

+ +## Overview + +You can develop an operator adaptation plugin to convert the formats of the input parameters, output parameters, and attributes of the PyTorch native operators so that the obtained formats are the same as the formats of the input parameters, output parameters, and attributes of the TBE operators. The PyTorch source code that is adapted to Ascend AI Processors provides methods related to adaptation association, type conversion and discrimination, and dynamic shape processing for users. + +## Adaptation Plugin Implementation + +1. Create an adaptation plugin file. + + The NPU TBE operator adaptation file is stored in the **pytorch/aten/src/ATen/native/npu** directory and is named in the upper camel case. The file name is in the format of _operator name_ + **KernelNpu.cpp**, for example, **AddKernelNpu.cpp**. + +2. Introduce the dependency header files. + + The PyTorch source code that is adapted to Ascend AI Processors provides common tools in **ATen/native/npu/utils** for users. + + >![](public_sys-resources/icon-note.gif) **NOTE:** + >For details about the functions and usage of the tools, see the header files and source code. + +3. Define the main adaptation function of the operator. + + Determine the adaptation theme function for custom operators based on the dispatch function in the registered operator. + +4. Implement the main adaptation functions. + + Implement the operator adaptation theme function and construct the corresponding input, output, and attributes based on the TBE operator prototype. + + +## Example + +The following uses the torch.add\(\) operator as an example to describe how to adapt an operator. + +1. Create an adaptation plugin file. + + Create the **AddKernelNpu.cpp** adaptation file in the **pytorch/aten/src/ATen/native/npu** directory. + +2. Introduce the dependency header files. + + ``` + #include + #include "ATen/native/npu/utils/CalcuOpUtil.h" + #include "ATen/native/npu/utils/OpAdapter.h" + ``` + + >![](public_sys-resources/icon-note.gif) **NOTE:** + >**CalcuOpUtil.h** contains type conversion and discrimination functions. + >**OpAdapter.h** contains header files related to adaptation. + +3. Define the main adaptation function of the operator. + + ``` + Tensor add_npu(const Tensor& self, const Tensor& other, Scalar alpha) + Tensor add_npu(const Tensor& self, Scalar other, Scalar alpha) + Tensor& add_npu_(Tensor& self, const Tensor& other, Scalar alpha) + Tensor& add_npu_(Tensor& self, Scalar other, Scalar alpha) + Tensor& add_out_npu(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) + ``` + +4. Implement the main adaptation functions. + 1. 
**add\_npu** implementation + + ``` + // When two tensors are input + Tensor add_npu(const Tensor& self, const Tensor& other, Scalar alpha) { + alpha_check_npu(self.scalar_type(), alpha); + if ((!(self.is_contiguous() && other.is_contiguous())) && + (NpuUtils::check_5d_5d_match(self) || + NpuUtils::check_5d_5d_match(other)) && + check_size(self, other)) { + int64_t c0_len = 16; + Tensor self_use = stride_add_tensor_get(self); + Scalar self_c1_offset( + self.storage_offset() / (self.size(2) * self.size(3) * c0_len)); + Tensor other_use = stride_add_tensor_get(other); + Scalar other_c1_offset( + other.storage_offset() / (other.size(2) * other.size(3) * c0_len)); + Scalar stride_len(self.size(1) / c0_len); + Tensor result = at::npu_stride_add( + self_use, other_use, self_c1_offset, other_c1_offset, stride_len); + return result; + } + // calculate the output size + Tensor outputTensor = add_dest_output(self, other); + auto outputSize = broadcast_ops_npu_output_size(self, other); + + // construct the output tensor of the NPU + Tensor result = at::empty_with_format( + outputSize, + outputTensor.options(), + CalcuOpUtil::get_tensor_npu_format(outputTensor)); + + // calculate the output result of the NPU + add_out_npu_nocheck(result, self, other, alpha); + + return result; + } + + // When a tensor and a scalar are input + Tensor add_npu(const Tensor& self, Scalar other, Scalar alpha) { + alpha_check_npu(self.scalar_type(), alpha); + // calculate the output size + auto outputSize = input_same_output_size(self); + // construct the output tensor of the NPU + Tensor result = at::empty_with_format( + outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + + // calculate the output result of the NPU + adds_out_npu_nocheck(result, self, other, alpha); + + return result; + } + + ``` + + 2. **add\_npu\_** implementation \(in in-place operation scenarios, the return value is the class itself\) + + ``` + // When two tensors are input + Tensor& add_npu_(Tensor& self, const Tensor& other, Scalar alpha) { + SmallVector inputs = {self, other}; + SmallVector outputs = {self}; + CalcuOpUtil::check_memory_over_laps(inputs, outputs); + + if (!NpuUtils::check_match(&self)) { + Tensor contiguousSelf = NpuUtils::format_contiguous(self); + Tensor result = add_out_npu_nocheck(contiguousSelf, contiguousSelf, other, alpha); + NpuUtils::format_fresh_view(self, result); + } else { + add_out_npu_nocheck(self, self, other, alpha); + } + + return self; + } + + // When a tensor and a scalar are input + Tensor& add_npu_(Tensor& self, Scalar other, Scalar alpha) { + if (!NpuUtils::check_match(&self)) { + Tensor contiguousSelf = NpuUtils::format_contiguous(self); + Tensor result = adds_out_npu_nocheck(contiguousSelf, contiguousSelf, other, alpha); + NpuUtils::format_fresh_view(self, result); + } else { + adds_out_npu_nocheck(self, self, other, alpha); + } + + return self; + } + + ``` + + 3. 
**add\_out\_npu** implementation \(when the return value is used as the input\) + + ``` + Tensor& add_out_npu( + Tensor& result, + const Tensor& self, + const Tensor& other, + Scalar alpha) { + bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self); + + Tensor outputTensor; + if (not isSelfWrapped) { + outputTensor = self; + } else { + outputTensor = other; + } + auto outputSize = broadcast_ops_npu_output_size(self, other); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(result), + outputTensor.scalar_type(), + outputSize); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self, other}, {result}) + .Func([&self, &other, &alpha](Tensor& result){add_out_npu_nocheck(result, self, other, alpha);}) + .Call(result); + } + ``` + + + +>![](public_sys-resources/icon-note.gif) **NOTE:** +>For details about the implementation code of **AddKernelNpu.cpp**, see the **pytorch/aten/src/ATen/native/npu/AddKernelNpu.cpp** document. + +

<h2 id="compiling-and-installing-the-pytorch-framework.md">Compiling and Installing the PyTorch Framework</h2>

+ +## Compiling the PyTorch Framework + +1. Go to the PyTorch working directory **pytorch**. +2. Set permission for the script file. + + **chmod +x build.sh** + +3. Run the following command to perform compilation: + + **./build.sh** + + >![](public_sys-resources/icon-note.gif) **NOTE:** + >The first compilation takes a long time, which may exceed 30 minutes. You are advised not to run the **make clean** command unless necessary. + +4. After the compilation is successful, the **torch-**_\*_**.whl** package is generated in **pytorch/dist**, for example, **torch-1.5.0a0-cp37-cp37m-linux\_x86.whl**. + +## Installing the PyTorch Framework + +1. Upload the **torch-**_\*_**.whl** package generated in [Compiling and Installing the PyTorch Framework](#compiling-and-installing-the-pytorch-framework.md) to any path on the server. +2. Go to the directory where **torch-**_\*_**.whl** is located and run the **pip** command to install PyTorch. + + If the current user is the **root** user, run the following command: + + ``` + pip3.7 install torch-*.whl + ``` + + If the current user is a non-root user, run the following command: + + ``` + pip3.7 install torch-*.whl --user + ``` + + +>![](public_sys-resources/icon-note.gif) **NOTE:** +>- After the code has been modified, you need to re-compile and re-install PyTorch. +>- During the installation, the system may display a message indicating that the TorchVision 0.6.0 version does not match PyTorch. This problem has no impact and can be ignored. + +
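+After installation, you can optionally confirm that the newly built package is the one in use (a quick check, not part of the original procedure):
+
+```
+pip3.7 show torch                                      # check the installed version and location
+python3.7 -c "import torch; print(torch.__version__)"
+```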

<h2 id="operator-function-verification.md">Operator Function Verification</h2>

+ +- **[Overview](#overview.md)** + +- **[Implementation](#implementation.md)** + + +

<h2 id="overview.md">Overview</h2>

+ +## Introduction + +After operator adaptation is complete, you can run the PyTorch operator adapted to Ascend AI Processor to verify the operator running result. + +Operator verification involves all deliverables generated during operator development, including the implementation files, operator prototype definitions, operator information library, and operator plugins. This section describes only the verification method. + +## Test Cases and Records + +Use the PyTorch frontend to construct the custom operator function and run the function to verify the custom operator functions. + +The test cases and test tools are provided in the **pytorch/test/test\_npu/test\_network\_ops** directory at **https://gitee.com/ascend/pytorch-develop**. + +

<h2 id="implementation.md">Implementation</h2>

+
+## Introduction
+
+This section describes how to test the functions of a PyTorch operator.
+
+## Procedure
+
+1.  Set environment variables.
+
+    ```
+    # Set environment variables (the HwHiAiUser user is used as an example and the installation path is the default path).
+    . /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
+    ```
+
+    Replace **/home/HwHiAiUser/Ascend/ascend-toolkit** with the actual Toolkit installation path.
+
+2.  Write the test script. Take the add operator as an example. Create the test script file **test\_add.py** in the **pytorch/test/test\_npu/test\_network\_ops** directory.
+
+    The following is only a simple example. A complete test case must cover all usage scenarios in the operator definition to ensure that the function is basically correct.
+
+    ```
+    # Import the dependency libraries.
+    import sys
+    sys.path.append('..')
+    import torch
+    import numpy as np
+    from common_utils import TestCase, run_tests
+    from common_device_type import dtypes, instantiate_device_type_tests
+    from util_test import create_common_tensor
+
+    # Define the add test case class.
+    class TestAdd(TestCase):
+
+        # Define the functions that execute the add operator on the CPU and NPU.
+        def cpu_op_exec(self, input1, input2):
+            output = torch.add(input1, input2, alpha = 1)
+            output = output.numpy()
+            return output
+
+        def npu_op_exec_new(self, input1, input2):
+            output = torch.add(input1, input2, alpha = 1)
+            output = output.to("cpu")
+            output = output.numpy()
+            return output
+
+        # Define a general function for the add scenario. It compares the results returned by the CPU and NPU for the given input data.
+        def add_result(self, shape_format):
+            for item in shape_format:
+                cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+                cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+                if cpu_input1.dtype == torch.float16:
+                    cpu_input1 = cpu_input1.to(torch.float32)
+                    cpu_input2 = cpu_input2.to(torch.float32)
+                cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+                npu_output = self.npu_op_exec_new(npu_input1, npu_input2)
+                cpu_output = cpu_output.astype(npu_output.dtype)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+        # Define a test case for a specific add scenario. The test case function must start with test_.
+        def test_add_shape_format_fp32_2d(self, device):
+            format_list = [0, 3, 29]
+            shape_format = [
+                [np.float32, i, [5, 256]] for i in format_list
+            ]
+            self.add_result(shape_format)
+
+    instantiate_device_type_tests(TestAdd, globals(), except_for="cpu")
+    if __name__ == "__main__":
+        torch.npu.set_device("npu:0")
+        run_tests()
+    ```
+
+3.  Execute the test case script.
+
+    Go to the directory where **test\_add.py** is located, and run the following command:
+
+    ```
+    python3.7 test_add.py
+    ```
+
+
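+To run every operator test in the directory rather than a single script, a simple shell loop works (a sketch; adjust the interpreter and path as needed):
+
+```
+cd pytorch/test/test_npu/test_network_ops
+for f in test_*.py; do python3.7 "$f"; done
+```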

<h2 id="faqs.md">FAQs</h2>

+
+- **[Pillow==5.3.0 Installation Failed](#pillow-5-3-0-installation-failed.md)**
+
+- **[pip3.7 install torchvision Installation Failed](#pip3-7-install-torchvision-installation-failed.md)**
+
+- **["torch 1.5.0xxxx" and "torchvision" Do Not Match When torch-\*.whl Is Installed](#torch-1-5-0xxxx-and-torchvision-do-not-match-when-torch--whl-is-installed.md)**
+
+- **[How Do I View Test Run Logs?](#en-us_topic_0000001117914770.md)**
+
+- **[What Is the Meaning of the NPU Error Code Output During the Test? Is There Any Corresponding Explanation?](#what-is-the-meaning-of-the-npu-error-code-output-during-the-test-is-there-any-corresponding-explanat.md)**
+
+- **[Why Cannot the Custom TBE Operator Be Called?](#why-cannot-the-custom-tbe-operator-be-called.md)**
+
+- **[How Do I Determine Whether the TBE Operator Is Correctly Called for PyTorch Adaptation?](#how-do-i-determine-whether-the-tbe-operator-is-correctly-called-for-pytorch-adaptation.md)**
+
+- **[PyTorch Compilation Fails and the Message "error: ld returned 1 exit status" Is Displayed](#pytorch-compilation-fails-and-the-message-error-ld-returned-1-exit-status-is-displayed.md)**
+
+- **[PyTorch Compilation Fails and the Message "error: call of overload...." Is Displayed](#pytorch-compilation-fails-and-the-message-error-call-of-overload-is-displayed.md)**
+
+

<h2 id="pillow-5-3-0-installation-failed.md">Pillow==5.3.0 Installation Failed</h2>

+
+## Symptom
+
+**Pillow==5.3.0** installation failed.
+
+## Possible Cause
+
+Necessary dependencies are missing, such as libjpeg, python-devel, zlib-devel, and libjpeg-turbo-devel.
+
+## Solutions
+
+Install the missing dependencies. The package names vary by distribution. For CentOS, run:
+
+```
+yum install libjpeg python-devel zlib-devel libjpeg-turbo-devel
+```
+
+For Ubuntu, run:
+
+```
+apt-get install libjpeg-dev python3.7-dev zlib1g-dev libjpeg-turbo8-dev
+```
+

<h2 id="pip3-7-install-torchvision-installation-failed.md">pip3.7 install torchvision Installation Failed</h2>

+ +## Symptom + +**pip3.7 install torchvision** installation failed. + +## Possible Cause + +The versions of PyTorch and TorchVision do not match. + +## Solutions + +Run the following command: + +``` +pip3.7 install torchvision --no-deps +``` + +

"torch 1.5.0xxxx" and "torchvision" Do Not Match When torch-\*.whl Is Installed

+
+## Symptom
+
+During the installation of **torch-**_\*_**.whl**, the message "ERROR: torchvision 0.6.0 has requirement torch==1.5.0, but you'll have torch 1.5.0a0+1977093 which is incompatible" is displayed.
+
+![](figures/en-us_image_0000001172886189.png)
+
+However, the installation is successful.
+
+## Possible Cause
+
+When PyTorch is installed, the version check is automatically triggered. The version of the torchvision installed in the environment is 0.6.0. During the check, it is found that the version of **torch-**_\*_**.whl** is inconsistent with the required version 1.5.0. As a result, an error message is displayed.
+
+## Solutions
+
+This problem has no impact on the actual result, and no action is required.
+

<h2 id="what-is-the-meaning-of-the-npu-error-code-output-during-the-test-is-there-any-corresponding-explanat.md">What Is the Meaning of the NPU Error Code Output During the Test? Is There Any Corresponding Explanation?</h2>

+ +For details, see [aclError](https://support.huaweicloud.com/intl/en-us/adevg-A800_3000_3010/atlasdevelopment_01_0256.html). + +

<h2 id="why-cannot-the-custom-tbe-operator-be-called.md">Why Cannot the Custom TBE Operator Be Called?</h2>

+
+## Symptom
+
+The custom TBE operator has been developed and adapted to PyTorch. However, the newly developed operator cannot be called during test case execution.
+
+## Possible Cause
+
+- The environment variables are not set correctly.
+- An error occurs in the YAML file. As a result, the operator is not correctly dispatched.
+- The implementation of the custom TBE operator is incorrect. As a result, the operator cannot be called.
+
+## Solutions
+
+1.  Set the operating environment by referring to [Operator Function Verification](#operator-function-verification.md). Pay special attention to the following setting:
+
+    ```
+    . /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
+    ```
+
+2.  Check whether the dispatch configuration of the corresponding operator in the YAML file is correct and complete.
+3.  Analyze and check the code implementation. The recommended methods are as follows:
+    1.  Modify the operator adaptation implementation in PyTorch so that **test\_add.py** can call the TBE operator in the custom operator package.
+
+        "pytorch/aten/src/ATen/native/npu/AddKernelNpu.cpp"
+
+        ![](figures/en-us_image_0000001126846510.png)
+
+    2.  After the compilation and installation steps are complete, go to the directory where **test\_add.py** is stored and run **python3.7 test\_add.py** to perform the test.
+
+        There should be no error in this step. The log added in **add** should be displayed. If an error occurs, check the code to ensure that no newly developed code affects the test.
+
+    3.  Merge the newly developed custom TBE operator into CANN, and add logs at the operator entry as a running identifier.
+    4.  After the compilation and installation of CANN are complete, run **python3.7 test\_add.py** to perform the test.
+
+        >![](public_sys-resources/icon-note.gif) **NOTE:**
+        >According to the design logic of Ascend, the priority of the custom operator package is higher than that of the built-in operator package. During operator loading, the system preferentially loads the operators in the custom operator package. If the operator information file in the custom operator package fails to be parsed during this process, the custom operator package is skipped, and no operator in the custom operator package is loaded or scheduled.
+        >- If an error occurs in this step or the log added in **add** is not displayed, the newly developed custom TBE operator is incorrect, which affects the loading of the custom operator package. You are advised to **check whether the operator information definition in the newly developed custom TBE operator is correct**.
+        >- If this step is correct, **the operator information definition in the newly developed custom TBE operator does not affect the running**.
+
+    5.  Run **python3.7** _xxx_**\_testcase.py** to perform the test.
+
+        >![](public_sys-resources/icon-note.gif) **NOTE:**
+        >- If the logs added to the newly developed custom TBE operator are displayed on the screen, the newly developed operator is scheduled.
+        >- If the logs added to the newly developed custom TBE operator are not displayed on the screen, the problem may occur in the PyTorch adaptation. In this case, check the implementation code of the PyTorch adaptation. Most of these problems are caused by incorrect adaptation of the input and output in _xxxx_**KernelNpu.cpp**.
+
+
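+As a complement to the steps above, a minimal Python-side check can confirm that an adapted operator is reachable on the NPU at all (shown here for the add operator; the device index is an assumption):
+
+```
+import torch
+
+torch.npu.set_device("npu:0")
+x = torch.ones(2, 2).to("npu:0")
+y = torch.add(x, x)      # routed to add_npu when the dispatch is configured correctly
+print(y.to("cpu"))       # expect a 2x2 tensor of 2s
+```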

<h2 id="how-do-i-determine-whether-the-tbe-operator-is-correctly-called-for-pytorch-adaptation.md">How Do I Determine Whether the TBE Operator Is Correctly Called for PyTorch Adaptation?</h2>

+ +Both the custom and built-in operators are stored in the installation directory as .py source code after installation. Therefore, you can edit the source code and add logs at the API entry to print the input parameters, and determine whether the input parameters are correct. + +>![](public_sys-resources/icon-caution.gif) **CAUTION:** +>This operation may cause risks. You are advised to back up the file to be modified before performing this operation. If the files are not backed up and cannot be restored after being damaged, contact technical support. + +The following uses the **zn\_2\_nchw** operator in the built-in operator package as an example: + +1. Open the installation directory of the operator package in the user directory. + + ``` + cd ~/.local/Ascend/opp/op_impl/built-in/ai_core/tbe/impl + ll + ``` + + The .py source code file of the corresponding operator is read-only, that is, the file cannot be edited. + + ![](figures/en-us_image_0000001127006336.png) + +2. Modify the attributes of the .py source code file of the operator and add the write permission. + + ``` + sudo chmod +w zn_2_nchw.py + ll + ``` + + ![](figures/en-us_image_0000001172886191.png) + +3. Open the .py source code file of the operator, add logs, save the file, and exit. + + ``` + vi zn_2_nchw.py + ``` + + ![](figures/en-us_image_0000001173046109.png) + + In the preceding example, only an identifier is added. In actual commissioning, you can add the input parameters to be printed. + +4. Call and execute the test case to analyze the input parameter information. +5. After the test analysis is complete, open the .py source code file of the operator again, delete the added logs, save the file, and exit. +6. Modify the attributes of the .py source code file of the operator and remove the write permission. + + ``` + sudo chmod -w zn_2_nchw.py + ``` + + ![](figures/en-us_image_0000001126846512.png) + + +
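+The log added in step 3 can be as simple as a couple of print statements at the function entry. The lines below are illustrative only; the actual function name and parameters in **zn\_2\_nchw.py** may differ:
+
+```
+# Added at the top of the operator's entry function (names are illustrative):
+print("[debug] zn_2_nchw entered")
+print("[debug] input parameters:", locals())   # dumps the arguments received at entry
+```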

PyTorch Compilation Fails and the Message "error: ld returned 1 exit status" Is Displayed

+
+## Symptom
+
+PyTorch compilation fails and the message "error: ld returned 1 exit status" is displayed.
+
+![](figures/en-us_image_0000001127006338.png)
+
+## Possible Cause
+
+According to the log analysis, the likely cause is that the adaptation function implemented in _xxxx_**KernelNpu.cpp** does not match the parameter types of the dispatch declaration generated by the PyTorch framework for the operator. In the preceding example, the function is **binary\_cross\_entropy\_npu**. Open the corresponding _xxxx_**KernelNpu.cpp** file and find the adaptation function.
+
+![](figures/en-us_image_0000001172886193.png)
+
+In the implementation, the type of the last parameter is **int**, which does not match the required **long**.
+
+## Solutions
+
+Modify the adaptation function implemented in _xxxx_**KernelNpu.cpp**. In the preceding example, change the type of the last parameter in the **binary\_cross\_entropy\_npu** function to **int64\_t** \(use **int64\_t** instead of **long** in the .cpp file\).
+

PyTorch Compilation Fails and the Message "error: call of overload...." Is Displayed

+ +## Symptom + +PyTorch compilation fails and the message "error: call of overload...." is displayed. + +![](figures/en-us_image_0000001173046111.png) + +![](figures/en-us_image_0000001126846514.png) + +## Possible Cause + +According to the log analysis, the error is located in line 30 in the _xxxx_**KernelNpu.cpp** file, indicating that the **NPUAttrDesc** parameter is invalid. In the preceding example, the function is **binary\_cross\_entropy\_attr**. Open the corresponding _xxxx_**KernelNpu.cpp** file and find the adaptation function. + +![](figures/en-us_image_0000001127006340.png) + +In the implementation, the type of the second input parameter of **NPUAttrDesc** is **int**, which does not match the definition of **NPUAttrDesc**. + +## Solutions + +1. Replace the incorrect code line in the **binary\_cross\_entropy\_attr\(\)** function with the code in the preceding comment. + +2. Change the input parameter type of **binary\_cross\_entropy\_attr\(\)** to **int64\_t**. + +

<h2 id="appendixes.md">Appendixes</h2>

+ +- **[Installing CMake](#installing-cmake.md)** + +- **[Exporting a Custom Operator](#exporting-a-custom-operator.md)** + + +

<h2 id="installing-cmake.md">Installing CMake</h2>

+ +The following describes how to upgrade CMake to 3.12.1. + +1. Obtain the CMake software package. + + ``` + wget https://cmake.org/files/v3.12/cmake-3.12.1.tar.gz --no-check-certificate + ``` + +2. Decompress the package and go to the software package directory. + + ``` + tar -xf cmake-3.12.1.tar.gz + cd cmake-3.12.1/ + ``` + +3. Run the configuration, compilation, and installation commands. + + ``` + ./configure --prefix=/usr/local/cmake + make && make install + ``` + +4. Set the soft link. + + ``` + ln -s /usr/local/cmake/bin/cmake /usr/bin/cmake + ``` + +5. Run the following command to check whether CMake has been installed: + + ``` + cmake --version + ``` + + If the message "cmake version 3.12.1" is displayed, the installation is successful. + + +

<h2 id="exporting-a-custom-operator.md">Exporting a Custom Operator</h2>

+ +## Overview + +A PyTorch model contains a custom operator. You can export the custom operator as an ONNX single-operator model, which can be easily ported to other AI frameworks. Three types of custom operator export are available: NPU-adapted TBE operator export, C++ operator export, and pure Python operator export. + +## Prerequisites + +You have installed the PyTorch framework. + +## TBE Operator Export + +A TBE operator can be exported using either of the following methods: + +Method 1: + +1. Define and register an operator. + + ``` + # Define an operator. + @parse_args('v', 'v', 'f', 'i', 'i', 'i', 'i') + def symbolic_npu_roi_align(g, input, rois, spatial_scale, pooled_height, pooled_width, sample_num, roi_end_mode): + args = [input, rois] + kwargs = {"spatial_scale_f": spatial_scale, + "pooled_height_i": pooled_height, + "pooled_width_i": pooled_width, + "sample_num_i": sample_num, + "roi_end_mode_i": roi_end_mode} + + return g.op('torch::npu_roi_align',*args, **kwargs) + + # Register the operator. + import torch.onnx.symbolic_registry as sym_registry + def register_onnx_sym_npu_roi_align(): + sym_registry.register_op('npu_roi_align', symbolic_npu_roi_align, '', 11) + + register_onnx_sym_npu_roi_align() + ``` + +2. Customize a model. + + ``` + # Define a model. + class CustomModel_npu_op(torch.nn.Module): + def __init__(self,a,b): + super(CustomModel_npu_op, self).__init__() + + self.weight = Parameter(torch.Tensor(8,10,1024)) + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + + def forward(self, a, b, d): + spatial_scale=d[0].item() + pooled_height=d[1].item() + pooled_width=d[2].item() + sample_num=d[3].item() + roi_end_mode=d[4].item() + rtn = torch.npu_roi_align(a, self.weight, spatial_scale, pooled_height, pooled_width, sample_num,roi_end_mode) + + return rtn + ``` + +3. Export the ONNX file. + + ``` + # Define an export function. + def do_export(model, inputs, f, *args, **kwargs): + out = torch.onnx._export(model, inputs, f, verbose=True, export_params=True, do_constant_folding=True,*args, **kwargs) + + # Initialize the input. + """ + Initialize the input parameters a, b, and h1 of the model. For details, see the detailed code. + """ + + # Export the ONNX file. + model = CustomModel_npu_op(a,b) + model = model.npu() + model.eval() + do_export(model, (a, b, h1), f, input_names=["intput"]+["","","","","","","npu_roi_align.weight"],opset_version=11) + ``` + + +Method 2: + +1. Define a method class. + + ``` + # Implement the operator method class and symbol export method. + class CustomClassOp_Func_npu_roi_align(Function): + @staticmethod + def forward(ctx, input, rois, spatial_scale, pooled_height, pooled_width , sample_num, roi_end_mode): + rtn = torch.npu_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sample_num, roi_end_mode) + return rtn + + @staticmethod + def symbolic(g, input, rois, spatial_scale, pooled_height, pooled_width , sample_num, roi_end_mode): + args = [input, rois] + kwargs = {"spatial_scale_f": spatial_scale, + "pooled_height_i": pooled_height, + "pooled_width_i": pooled_width, + "sample_num_i": sample_num, + "roi_end_mode_i": roi_end_mode} + return g.op('torch::npu_roi_align',*args, **kwargs) + ``` + +2. Customize an operator model. + + ``` + # Implement an operator model. 
+    class NpuOp_npu_roi_align_Module(torch.nn.Module):
+        def __init__(self):
+            super(NpuOp_npu_roi_align_Module, self).__init__()
+
+            self.spatial_scale = torch.randn(10, dtype=torch.float32, requires_grad=False,device="cpu")[0].item()
+            self.pooled_height = 2
+            self.pooled_width = 0
+            self.sample_num = 1
+            self.roi_end_mode = 1
+
+            self.weight = Parameter(torch.Tensor(8,10,1024))
+            init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+
+            self.func = CustomClassOp_Func_npu_roi_align.apply
+            self.test_npu_op = 1
+
+        def forward(self, input):
+            rtn = self.func(input, self.weight, self.spatial_scale, self.pooled_height, self.pooled_width, self.sample_num, self.roi_end_mode)
+            return rtn
+    ```
+
+3.  Customize a model.
+
+    ```
+    # Create a custom model.
+    class CustomModel_Module_op(torch.nn.Module):
+        def __init__(self,a,b):
+            super(CustomModel_Module_op, self).__init__()
+            self.npu_roi_align = NpuOp_npu_roi_align_Module()
+        #@staticmethod
+        def forward(self, a):
+            rtn = self.npu_roi_align(a)
+            return rtn
+    ```
+
+4.  Export the ONNX file.
+
+    ```
+    # Build data.
+    a = torch.randn(5, 10, 1024, dtype=torch.float32, requires_grad=True,device=rnddata_device)
+    b = torch.randn(10, 10, 1024, dtype=torch.float32, requires_grad=True,device=rnddata_device)
+
+    # Instantiate the model.
+    model = CustomModel_Module_op(a,b)
+    model = model.npu()
+    model.eval()
+    a = a.to('npu:6')
+    b = b.to('npu:6')
+
+    # Export the ONNX file.
+    do_export(model, a, f=ONNX_NPU_OP_MODULE_FILENAME, input_names=["intput"]+["npu_roi_align.weight"],opset_version=11)
+    ```
+
+
+>![](public_sys-resources/icon-note.gif) **NOTE:**
+>For details about the implementation code, see [test\_custom\_ops\_npu\_demo.py](https://gitee.com/ascend/pytorch/blob/master/test/test_npu/test_onnx/torch.onnx/custom_ops_demo/test_custom_ops_npu_demo.py). If you do not have the permission to obtain the code, contact Huawei technical support to join the **Ascend** organization.
+
+## C++ Operator Export
+
+1.  Customize an operator.
+
+    ```
+    import torch
+    import torch.utils.cpp_extension
+    # Define a C++ operator.
+    def test_custom_add():
+        op_source = """
+        #include <torch/script.h>
+
+        torch::Tensor custom_add(torch::Tensor self, torch::Tensor other) {
+            return self + other;
+        }
+        static auto registry =
+            torch::RegisterOperators("custom_namespace::custom_add",&custom_add);
+        """
+        torch.utils.cpp_extension.load_inline(
+            name="custom_add",
+            cpp_sources=op_source,
+            is_python_module=False,
+            verbose=True,
+        )
+
+    test_custom_add()
+    ```
+
+2.  Register the custom operator.
+
+    ```
+    # Define the operator registration method and register the operator.
+    from torch.onnx import register_custom_op_symbolic
+
+    def symbolic_custom_add(g, self, other):
+        return g.op('custom_namespace::custom_add', self, other)
+
+    register_custom_op_symbolic('custom_namespace::custom_add', symbolic_custom_add, 9)
+    ```
+
+3.  Build a model.
+
+    ```
+    # Build an operator model.
+    class CustomAddModel(torch.nn.Module):
+        def forward(self, a, b):
+            return torch.ops.custom_namespace.custom_add(a, b)
+    ```
+
+4.  Export the operator as an ONNX model.
+
+    ```
+    # Export the operator as an ONNX model.
+    def do_export(model, inputs, *args, **kwargs):
+        out = torch.onnx._export(model, inputs, "custom_demo.onnx", *args, **kwargs)
+
+    x = torch.randn(2, 3, 4, requires_grad=False)
+    y = torch.randn(2, 3, 4, requires_grad=False)
+    model = CustomAddModel()
+    do_export(model, (x, y), opset_version=11)
+    ```
+
+
+>![](public_sys-resources/icon-note.gif) **NOTE:**
+>For details about the implementation code, see [test\_custom\_ops\_demo.py](https://gitee.com/ascend/pytorch/blob/master/test/test_npu/test_onnx/torch.onnx/custom_ops_demo/test_custom_ops_demo.py). If you do not have the permission to obtain the code, contact Huawei technical support to join the **Ascend** organization.
+
+## Pure Python Operator Export
+
+1.  Customize an operator.
+
+    ```
+    import torch
+    import torch.onnx.symbolic_registry as sym_registry
+
+    import torch.utils.cpp_extension
+    import torch.nn as nn
+    import torch.nn.modules as Module
+    from torch.autograd import Function
+    import numpy as np
+
+    from torch.nn.parameter import Parameter
+    import math
+    from torch.nn import init
+
+    # Define an operator class method.
+    class CustomClassOp_Add_F(Function):
+        @staticmethod
+        def forward(ctx, input1,input2):
+            rtn = torch.add(input1,input2)
+            return torch.add(input1,rtn)
+
+        @staticmethod
+        def symbolic(g,input1,input2):
+            rtn = g.op("Custom::CustomClassOp_Add", input1, input2,test_attr1_i=1,test_attr2_f=1.0)
+            rtn = g.op("ATen::CustomClassOp_Add", input1, rtn)
+            rtn = g.op("C10::CustomClassOp_Add", rtn, input2)
+            # error domain: rtn = g.op("onnx::CustomClassOp_Add", input1, input2)
+
+            return rtn
+    ```
+
+2.  Build a model.
+
+    ```
+    # Register the operator and build a model.
+    class CustomClassOp_Add(torch.nn.Module):
+        def __init__(self):
+            super(CustomClassOp_Add, self).__init__()
+            self.add = CustomClassOp_Add_F.apply
+
+            #graph(%0 : Float(1, 8, 10, 1024),
+            #      %1 : Float(8, 10, 1024))
+            self.weight = Parameter(torch.Tensor(8,10,1024))
+
+            #%1 : Float(8, 10, 1024) = onnx::Constant[value=<Tensor>]()
+            self.weight1 = torch.Tensor(8,10,1024)
+
+            init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+            init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
+        def forward(self, input):
+            rtn = torch.add(self.weight1, self.weight)
+
+            rtn = self.add(self.weight, rtn)
+            rtn1 = self.add(self.weight, self.weight1)
+            rtn1 = self.add(self.weight1,rtn1)
+            rtn = self.add(rtn,rtn1)
+
+            return rtn
+    ```
+
+3.  Export the operator as an ONNX model.
+
+    ```
+    ONNX_FILE_NAME = "./custom_python_module_demo.onnx"
+    def do_export(model, inputs, *args, **kwargs):
+        out = torch.onnx._export(model, inputs, ONNX_FILE_NAME, verbose=True,keep_initializers_as_inputs=True, *args, **kwargs)
+
+    def test_class_export():
+        model = CustomClassOp_Add()
+        model.eval()
+        input_x_shape = [1, 8, 10, 1024]
+        input = torch.randn(input_x_shape)
+        output = model(input)
+        do_export(model, input, opset_version=11)
+
+    # Export the operator as an ONNX model.
+    test_class_export()
+    ```
+
+
+>![](public_sys-resources/icon-note.gif) **NOTE:**
+>For details about the implementation code, see [test\_custom\_ops\_python\_module.py](https://gitee.com/ascend/pytorch/blob/master/test/test_npu/test_onnx/torch.onnx/custom_ops_demo/test_custom_ops_python_module.py). If you do not have the permission to obtain the code, contact Huawei technical support to join the **Ascend** organization.
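+After any of the exports above, you can optionally load the resulting file and inspect the graph to confirm that the custom operator was recorded. This assumes the **onnx** Python package is installed; the file name matches the C++ export example:
+
+```
+import onnx
+
+model = onnx.load("custom_demo.onnx")
+print([node.op_type for node in model.graph.node])   # the custom op should appear in this list
+```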
+ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846510.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846510.png new file mode 100644 index 0000000000000000000000000000000000000000..b29212c4dd9584c26367b4564157e1f9531adbdd Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846510.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846512.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846512.png new file mode 100644 index 0000000000000000000000000000000000000000..df8a3dc25ca13833cbf1f6a199b33d44896e993e Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846512.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846514.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846514.png new file mode 100644 index 0000000000000000000000000000000000000000..40100511c53279294c30a9d1721d8e5eac745de9 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001126846514.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006336.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006336.png new file mode 100644 index 0000000000000000000000000000000000000000..0174ee14d442b03f164c0223e00a3d6728de278f Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006336.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006338.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006338.png new file mode 100644 index 0000000000000000000000000000000000000000..f9526aafbca55cfb04df26df79147b0bf173d6fd Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006338.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006340.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006340.png new file mode 100644 index 0000000000000000000000000000000000000000..525161bd16193bd17140623477be173f81de7d50 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001127006340.png differ diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/zh-cn_image_0000001106176216.png" b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886189.png similarity index 100% rename from "docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/zh-cn_image_0000001106176216.png" rename to docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886189.png diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886191.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886191.png new file mode 100644 index 0000000000000000000000000000000000000000..2b0372f5d8a7018a8c33c80e73929bf9d25c39c0 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886191.png differ diff --git a/docs/en/PyTorch Operator Development 
Guide/figures/en-us_image_0000001172886193.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886193.png new file mode 100644 index 0000000000000000000000000000000000000000..f1841479dfb0355b4295931d96e18b5187d97e27 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001172886193.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001173046109.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001173046109.png new file mode 100644 index 0000000000000000000000000000000000000000..6550ee72574f98e3486b7887f8bdb011a6882be5 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001173046109.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001173046111.png b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001173046111.png new file mode 100644 index 0000000000000000000000000000000000000000..ee7a62ab9cefcc3349958f18853a313fb504da15 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/en-us_image_0000001173046111.png differ diff --git a/docs/en/PyTorch Operator Development Guide/figures/operator-adaptation-process-in-the-pytorch-framework.png b/docs/en/PyTorch Operator Development Guide/figures/operator-adaptation-process-in-the-pytorch-framework.png new file mode 100644 index 0000000000000000000000000000000000000000..df1369e2ffc15ee5f06cce22785cedd0a48283f5 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/figures/operator-adaptation-process-in-the-pytorch-framework.png differ diff --git a/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-caution.gif b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-caution.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-caution.gif differ diff --git a/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-danger.gif b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-danger.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-danger.gif differ diff --git a/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-note.gif b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-note.gif new file mode 100644 index 0000000000000000000000000000000000000000..6314297e45c1de184204098efd4814d6dc8b1cda Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-note.gif differ diff --git a/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-notice.gif b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-notice.gif new file mode 100644 index 0000000000000000000000000000000000000000..86024f61b691400bea99e5b1f506d9d9aef36e27 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-notice.gif differ diff --git a/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-tip.gif b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-tip.gif new file mode 100644 index 0000000000000000000000000000000000000000..93aa72053b510e456b149f36a0972703ea9999b7 
Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-tip.gif differ diff --git a/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-warning.gif b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-warning.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Operator Development Guide/public_sys-resources/icon-warning.gif differ diff --git a/docs/en/PyTorch Operator Support/PyTorch Operator Support.md b/docs/en/PyTorch Operator Support/PyTorch Operator Support.md new file mode 100644 index 0000000000000000000000000000000000000000..55dd5031e6e21332c6da0468607ed88c521abc8d --- /dev/null +++ b/docs/en/PyTorch Operator Support/PyTorch Operator Support.md @@ -0,0 +1,6002 @@ +# FrameworkPTAdapter 2.0.2 PyTorch Operator Support +- [Mapping Between PyTorch Native Operators and Ascend Adapted Operators](#mapping-between-pytorch-native-operators-and-ascend-adapted-operators.md) +- [PyTorch Operators Customized by Ascend](#pytorch-operators-customized-by-ascend.md) +

+## Mapping Between PyTorch Native Operators and Ascend Adapted Operators
+
+| No. | PyTorch Native Operator | Ascend Adapted Operator |
+| ---- | ---- | ---- |
+| 1 | dropout | dropout_npu |
+| 2 | dropout_ | dropout_npu_ |
+| 3 | abs | abs_npu |
+| 4 | abs_ | abs_npu_ |
+| 5 | abs.out | abs_out_npu |
+| 6 | acos | acos_npu |
+| 7 | acos_ | acos_npu_ |
+| 8 | acos.out | acos_out_npu |
+| 9 | adaptive_avg_pool1d | adaptive_avg_pool1d_npu |
+| 10 | add.Tensor | add_npu |
+| 11 | add_.Tensor | add_npu_ |
+| 12 | add.out | add_out_npu |
+| 13 | add.Scalar | add_npu |
+| 14 | add_.Scalar | add_npu_ |
+| 15 | addmv | addmv_npu |
+| 16 | addmv_ | addmv_npu_ |
+| 17 | addmv.out | addmv_out_npu |
+| 18 | addr | addr_npu |
+| 19 | addr_ | addr_npu_ |
+| 20 | addr.out | addr_out_npu |
+| 21 | affine_grid_generator | affine_grid_generator_npu |
+| 22 | affine_grid_generator_backward | affine_grid_generator_backward_npu |
+| 23 | all.dim | all_npu |
+| 24 | all.out | all_out_npu |
+| 25 | any.dim | any_npu |
+| 26 | any.out | any_out_npu |
+| 27 | arange | arange_npu |
+| 28 | arange.start | arange_npu |
+| 29 | arange.start_step | arange_npu |
+| 30 | arange.out | arange_out_npu |
+| 31 | arange.start_out | arange_out_npu |
+| 32 | _dim_arange | _dim_arange_npu |
+| 33 | argmax | argmax_npu |
+| 34 | argmin | argmin_npu |
+| 35 | as_strided | as_strided_npu |
+| 36 | as_strided_ | as_strided_npu_ |
+| 37 | asin | asin_npu |
+| 38 | asin_ | asin_npu_ |
+| 39 | asin.out | asin_out_npu |
+| 40 | atan | atan_npu |
+| 41 | atan_ | atan_npu_ |
+| 42 | atan.out | atan_out_npu |
+| 43 | baddbmm | baddbmm_npu |
+| 44 | baddbmm_ | baddbmm_npu_ |
+| 45 | baddbmm.out | baddbmm_out_npu |
+| 46 | bartlett_window | bartlett_window_npu |
+| 47 | bartlett_window.periodic | bartlett_window_npu |
+| 48 | batch_norm | batch_norm_npu_ |
+| 49 | _batch_norm_impl_index | _batch_norm_impl_index_npu |
+| 50 | _batch_norm_impl_index_backward | _batch_norm_impl_index_backward_npu |
+| 51 | bernoulli | bernoulli_npu |
+| 52 | bernoulli_.Tensor | bernoulli_npu_ |
+| 53 | bernoulli_.float | bernoulli_npu_ |
+| 54 | binary_cross_entropy | binary_cross_entropy_npu |
+| 55 | binary_cross_entropy.out | binary_cross_entropy_out_npu |
+| 56 | binary_cross_entropy_backward | binary_cross_entropy_backward_npu |
+| 57 | binary_cross_entropy_backward.grad_input | binary_cross_entropy_backward_out_npu |
+| 58 | binary_cross_entropy_with_logits | binary_cross_entropy_with_logits_npu |
+| 59 | binary_cross_entropy_with_logits_backward | binary_cross_entropy_with_logits_backward_npu |
+| 60 | bitwise_not | bitwise_not_npu |
+| 61 | bitwise_not_ | bitwise_not_npu_ |
+| 62 | bitwise_not.out | bitwise_not_out_npu |
+| 63 | logical_not | logical_not_npu |
+| 64 | logical_not_ | logical_not_npu_ |
+| 65 | logical_not.out | logical_not_out_npu |
+| 66 | logical_and | logical_and_npu |
+| 67 | logical_and_ | logical_and_npu_ |
+| 68 | logical_and.out | logical_and_out_npu |
+| 69 | logical_or | logical_or_npu |
+| 70 | logical_or_ | logical_or_npu_ |
+| 71 | logical_or.out | logical_or_out_npu |
+| 72 | blackman_window | blackman_window_npu |
+| 73 | blackman_window.periodic | blackman_window_npu |
+| 74 | bmm | bmm_npu |
+| 75 | bmm.out | bmm_out_npu |
+| 76 | cat | cat_npu |
+| 77 | cat.out | cat_out_npu |
+| 78 | cat.names | cat_npu |
+| 79 | cat.names_out | cat_out_npu |
+| 80 | ceil | ceil_npu |
+| 81 | ceil_ | ceil_npu_ |
+| 82 | ceil.out | ceil_out_npu |
+| 83 | clamp | clamp_npu |
+| 84 | clamp_ | clamp_npu_ |
+| 85 | clamp.out | clamp_out_npu |
+| 86 | clamp_max | clamp_max_npu |
+| 87 | clamp_max_ | clamp_max_npu_ |
+| 88 | clamp_max.out | clamp_max_out_npu |
+| 89 | clamp_min | clamp_min_npu |
+| 90 | clamp_min_ | clamp_min_npu_ |
+| 91 | clamp_min.out | clamp_min_out_npu |
+| 92 | constant_pad_nd | constant_pad_nd_npu |
+| 93 | contiguous | contiguous_npu |
+| 94 | convolution | convolution_npu |
+| 95 | _convolution | _convolution_npu |
+| 96 | _convolution_nogroup | _convolution_nogroup_npu |
+| 97 | conv2d | conv2d_npu_ |
+| 98 | conv3d | _conv3d_npu |
+| 99 | conv_tbc | conv_tbc_npu |
+| 100 | conv_tbc_backward | conv_tbc_backward_npu |
+| 101 | conv_transpose2d.input | conv_transpose2d_npu_ |
+| 102 | copy_ | copy_npu_ |
+| 103 | cos | cos_npu |
+| 104 | cos_ | cos_npu_ |
+| 105 | cos.out | cos_out_npu |
+| 106 | cosh | cosh_npu |
+| 107 | cosh_ | cosh_npu_ |
+| 108 | cosh.out | cosh_out_npu |
+| 109 | cummin | cummin_npu |
+| 110 | cummin.out | cummin_out_npu |
+| 111 | cummin.dimname | cummin_npu |
+| 112 | cummin.dimname_out | cummin_out_npu |
+| 113 | cumprod | cumprod_npu |
+| 114 | cumprod.out | cumprod_out_npu |
+| 115 | cumprod.dimname | cumprod_npu |
+| 116 | cumprod.dimname_out | cumprod_out_npu |
+| 117 | ctc_loss.IntList | ctc_loss_npu |
+| 118 | ctc_loss.Tensor | ctc_loss_npu |
+| 119 | _ctc_loss | ctc_loss_npu |
+| 120 | _ctc_loss_backward | ctc_loss_backward_npu |
+| 121 | fill_diagonal_ | fill_diagonal_npu_ |
+| 122 | div.Tensor | div_npu |
+| 123 | div_.Tensor | div_npu_ |
+| 124 | div.out | div_out_npu |
+| 125 | div.Scalar | div_npu |
+| 126 | div_.Scalar | div_npu_ |
+| 127 | dot | dot_npu |
+| 128 | dot.out | dot_out_npu |
+| 129 | embedding | embedding_npu |
+| 130 | embedding_backward | embedding_backward_npu |
+| 131 | embedding_dense_backward | embedding_dense_backward_npu |
+| 132 | embedding_renorm_ | embedding_renorm_npu_ |
+| 133 | _embedding_bag | _embedding_bag_npu |
+| 134 | empty.memory_format | empty_npu |
+| 135 | resize_ | resize_npu_ |
+| 136 | empty_like | empty_like_npu |
+| 137 | empty_strided | empty_strided_npu |
+| 138 | erf | erf_npu |
+| 139 | erf_ | erf_npu_ |
+| 140 | erf.out | erf_out_npu |
+| 141 | exp | exp_npu |
+| 142 | exp_ | exp_npu_ |
+| 143 | exp.out | exp_out_npu |
+| 144 | expm1 | expm1_npu |
+| 145 | expm1_ | expm1_npu_ |
+| 146 | expm1.out | expm1_out_npu |
+| 147 | eye | eye_npu |
+| 148 | eye.m | eye_npu |
+| 149 | eye.out | eye_out_npu |
+| 150 | eye.m_out | eye_out_npu |
+| 151 | fill_.Scalar | fill_npu_ |
+| 152 | fill_.Tensor | fill_npu_ |
+| 153 | floor | floor_npu |
+| 154 | floor_ | floor_npu_ |
+| 155 | floor.out | floor_out_npu |
+| 156 | floor_divide | floor_divide_npu |
+| 157 | floor_divide_.Tensor | floor_divide_npu_ |
+| 158 | floor_divide.out | floor_divide_out_npu |
+| 159 | floor_divide.Scalar | floor_divide_npu |
+| 160 | floor_divide_.Scalar | floor_divide_npu_ |
+| 161 | frac | frac_npu |
+| 162 | frac_ | frac_npu_ |
+| 163 | frac.out | frac_out_npu |
+| 164 | full.names | full_npu |
+| 165 | full | full_npu |
+| 166 | full.out | full_out_npu |
+| 167 | grid_sampler | grid_sampler_npu |
+| 168 | grid_sampler_3d | grid_sampler_3d_npu |
+| 169 | grid_sampler_3d_backward | grid_sampler_3d_backward_npu |
+| 170 | hann_window | hann_window_npu |
+| 171 | hann_window.periodic | hann_window_npu |
+| 172 | hamming_window | hamming_window_npu |
+| 173 | hamming_window.periodic | hamming_window_npu |
+| 174 | hamming_window.periodic_alpha | hamming_window_npu |
+| 175 | hamming_window.periodic_alpha_beta | hamming_window_npu |
+| 176 | ger | ger_npu |
+| 177 | ger.out | ger_out_npu |
+| 178 | index.Tensor | index_npu |
+| 179 | index_put_ | index_put_npu_ |
+| 180 | index_put | index_put_npu |
+| 181 | _index_put_impl_ | _index_put_impl_npu_ |
+| 182 | inverse | inverse_npu |
+| 183 | inverse.out | inverse_out_npu |
+| 184 | isclose | isclose_npu |
+| 185 | isnan | isnan_npu |
+| 186 | is_nonzero | is_nonzero_npu |
+| 187 | kl_div | kl_div_npu |
+| 188 | kl_div_backward | kl_div_backward_npu |
+| 189 | kthvalue | kthvalue_npu |
+| 190 | kthvalue.values | kthvalue_out_npu |
+| 191 | kthvalue.dimname | kthvalue_npu |
+| 192 | kthvalue.dimname_out | kthvalue_out_npu |
+| 193 | native_layer_norm | layer_norm_npu |
+| 194 | native_layer_norm_backward | layer_norm_backward_npu |
+| 195 | linspace | linspace_npu |
+| 196 | linspace.out | linspace_out_npu |
+| 197 | log | log_npu |
+| 198 | log_ | log_npu_ |
+| 199 | log.out | log_out_npu |
+| 200 | log10 | log10_npu |
+| 201 | log10_ | log10_npu_ |
+| 202 | log10.out | log10_out_npu |
+| 203 | log1p | log1p_npu |
+| 204 | log1p_ | log1p_npu_ |
+| 205 | log1p.out | log1p_out_npu |
+| 206 | log2 | log2_npu |
+| 207 | log2_ | log2_npu_ |
+| 208 | log2.out | log2_out_npu |
+| 209 | logspace | logspace_npu |
+| 210 | logspace.out | logspace_out_npu |
+| 211 | log_softmax.int | log_softmax_npu |
+| 212 | log_softmax.Dimname | log_softmax_npu |
+| 213 | _log_softmax | _log_softmax_npu |
+| 214 | _log_softmax_backward_data | _log_softmax_backward_npu |
+| 215 | logsumexp | logsumexp_npu |
+| 216 | logsumexp.out | logsumexp_out_npu |
+| 217 | logsumexp.names | logsumexp_npu |
+| 218 | logsumexp.names_out | logsumexp_out_npu |
+| 219 | matmul | matmul_npu |
+| 220 | matmul.out | matmul_out_npu |
+| 221 | matrix_power | matrix_power_npu |
+| 222 | max.dim | max_npu |
+| 223 | max.dim_max | max_out_npu |
+| 224 | max_values | max_npu |
+| 225 | max.names_dim | max_npu |
+| 226 | max.names_dim_max | max_out_npu |
+| 227 | max_values.names | max_npu |
+| 228 | max_pool2d | max_pool2d_npu |
+| 229 | quantized_max_pool2d | quantized_max_pool2d_npu |
+| 230 | mean | mean_npu |
+| 231 | mean.dim | mean_npu |
+| 232 | mean.out | mean_out_npu |
+| 233 | mean.names_dim | mean_npu |
+| 234 | mean.names_out | mean_out_npu |
+| 235 | median.dim | median_npu |
+| 236 | median.dim_values | median_out_npu |
+| 237 | median.names_dim | median_npu |
+| 238 | median.names_dim_values | median_out_npu |
+| 239 | min.dim | min_npu |
+| 240 | min.dim_min | min_out_npu |
+| 241 | min_values | min_npu |
+| 242 | min.names_dim | min_npu |
+| 243 | min.names_dim_min | min_out_npu |
+| 244 | min_values.names | min_npu |
+| 245 | mm | mm_npu |
+| 246 | mm.out | mm_out_npu |
+| 247 | mode | mode_npu |
+| 248 | mode.values | mode_out_npu |
+| 249 | mul.Tensor | mul_npu |
+| 250 | mul_.Tensor | mul_npu_ |
+| 251 | mul.out | mul_out_npu |
+| 252 | mul.Scalar | mul_npu |
+| 253 | mul_.Scalar | mul_npu_ |
+| 254 | mv | mv_npu |
+| 255 | mv.out | mv_out_npu |
+| 256 | narrow_copy | narrow_copy_npu |
+| 257 | native_batch_norm | batch_norm_npu |
+| 258 | native_batch_norm_backward | batch_norm_backward_npu |
+| 259 | _nnpack_spatial_convolution | _nnpack_spatial_convolution_npu |
+| 260 | ones.names | ones_npu |
+| 261 | ones | ones_npu |
+| 262 | ones.out | ones_out_npu |
+| 263 | ones_like | ones_like_npu |
+| 264 | cdist | cdist_npu |
+| 265 | _cdist_forward | _cdist_forward_npu |
+| 266 | _cdist_backward | _cdist_backward_npu |
+| 267 | pdist | pdist_npu |
+| 268 | _pdist_forward | _pdist_forward_npu |
+| 269 | randperm | randperm_npu |
+| 270 | randperm.generator | randperm_npu |
+| 271 | randperm.out | randperm_out_npu |
+| 272 | randperm.generator_out | randperm_out_npu |
+| 273 | range.step | range_npu |
+| 274 | range | range_npu |
+| 275 | range.out | range_out_npu |
+| 276 | reciprocal | reciprocal_npu |
+| 277 | reciprocal_ | reciprocal_npu_ |
+| 278 | reciprocal.out | reciprocal_out_npu |
+| 279 | neg | neg_npu |
+| 280 | neg_ | neg_npu_ |
+| 281 | neg.out | neg_out_npu |
+| 282 | repeat | repeat_npu |
+| 283 | repeat_interleave.self_int | repeat_interleave_npu |
+| 284 | round | round_npu |
+| 285 | round_ | round_npu_ |
+| 286 | round.out | round_out_npu |
+| 287 | relu | relu_npu |
+| 288 | relu_ | relu_npu_ |
+| 289 | prelu | prelu_npu |
+| 290 | prelu_backward | prelu_backward_npu |
+| 291 | gelu | gelu_npu |
+| 292 | gelu_backward | gelu_backward_npu |
+| 293 | hardshrink | hardshrink_npu |
+| 294 | hardshrink_backward | hardshrink_backward_npu |
+| 295 | rsqrt | rsqrt_npu |
+| 296 | rsqrt_ | rsqrt_npu_ |
+| 297 | rsqrt.out | rsqrt_out_npu |
+| 298 | selu | selu_npu |
+| 299 | selu_ | selu_npu_ |
+| 300 | celu | celu_npu |
+| 301 | celu_ | celu_npu_ |
+| 302 | sigmoid | sigmoid_npu |
+| 303 | sigmoid_ | sigmoid_npu_ |
+| 304 | sigmoid.out | sigmoid_out_npu |
+| 305 | sin | sin_npu |
+| 306 | sin_ | sin_npu_ |
+| 307 | sin.out | sin_out_npu |
+| 308 | sinh | sinh_npu |
+| 309 | sinh_ | sinh_npu_ |
+| 310 | sinh.out | sinh_out_npu |
+| 311 | slogdet | slogdet_npu |
+| 312 | softmax.int | softmax_npu |
+| 313 | softmax.Dimname | softmax_npu |
+| 314 | _softmax | _softmax_npu |
+| 315 | _softmax_backward_data | _softmax_backward_npu |
+| 316 | stack | stack_npu |
+| 317 | stack.out | stack_out_npu |
+| 318 | sum | sum_npu |
+| 319 | sum.dim_IntList | sum_npu |
+| 320 | sum.dim_DimnameList | sum_npu |
+| 321 | sum.IntList_out | sum_out_npu |
+| 322 | sum.DimnameList_out | sum_out_npu |
+| 323 | sqrt | sqrt_npu |
+| 324 | sqrt_ | sqrt_npu_ |
+| 325 | sqrt.out | sqrt_out_npu |
+| 326 | std | std_npu |
+| 327 | std.dim | std_dim_npu |
+| 328 | std_mean | std_mean_npu |
+| 329 | std_mean.dim | std_mean_dim_npu |
+| 330 | std_mean.names_dim | std_mean_names_npu |
+| 331 | std.out | std_out_npu |
+| 332 | std.names_dim | std_names_npu |
+| 333 | std.names_out | std_out_npu |
+| 334 | prod | prod_npu |
+| 335 | prod.dim_int | prod_npu |
+| 336 | prod.int_out | prod_out_npu |
+| 337 | prod.dim_Dimname | prod_npu |
+| 338 | prod.Dimname_out | prod_out_npu |
+| 339 | tan | tan_npu |
+| 340 | tan_ | tan_npu_ |
+| 341 | tan.out | tan_out_npu |
+| 342 | tanh | tanh_npu |
+| 343 | tanh_ | tanh_npu_ |
+| 344 | tanh.out | tanh_out_npu |
+| 345 | threshold | threshold_npu |
+| 346 | threshold_ | threshold_npu_ |
+| 347 | threshold.out | threshold_out_npu |
+| 348 | threshold_backward | threshold_backward_npu |
+| 349 | one_hot | one_hot_npu1 |
+| 350 | flip | flip_npu |
+| 351 | roll | roll_npu |
+| 352 | true_divide.Tensor | true_divide_npu |
+| 353 | true_divide_.Tensor | true_divide_npu_ |
+| 354 | true_divide.out | true_divide_out_npu |
+| 355 | true_divide.Scalar | true_divide_npu |
+| 356 | true_divide_.Scalar | true_divide_npu_ |
+| 357 | trunc | trunc_npu |
+| 358 | trunc_ | trunc_npu_ |
+| 359 | trunc.out | trunc_out_npu |
+| 360 | _unique2 | _unique2_npu |
+| 361 | var | var_npu |
+| 362 | var.dim | var_npu |
+| 363 | var.out | var_out_npu |
+| 364 | var.names_dim | var_npu |
+| 365 | var.names_out | var_out_npu |
+| 366 | var_mean | var_mean_npu |
+| 367 | var_mean.dim | var_mean_npu |
+| 368 | var_mean.names_dim | var_mean_npu |
+| 369 | where.self | where_npu |
+| 370 | where | where_npu |
+| 371 | _s_where | _s_where_npu |
+| 372 | zeros.names | zeros_npu |
+| 373 | zeros | zeros_npu |
+| 374 | zeros.out | zeros_out_npu |
+| 375 | zeros_like | zeros_like_npu |
+| 376 | norm.ScalarOpt_dtype | norm_npu |
+| 377 | norm.Scalar | norm_npu |
+| 378 | norm.ScalarOpt_dim_dtype | norm_npu |
+| 379 | norm.ScalarOpt_dim | norm_npu |
+| 380 | norm.dtype_out | norm_out_npu |
+| 381 | norm.out | norm_out_npu |
+| 382 | clone | clone_npu |
+| 383 | resize_as_ | resize_as_npu_ |
+| 384 | pow.Tensor_Scalar_out | pow_out_npu |
+| 385 | pow.Tensor_Scalar | pow_npu |
+| 386 | zero_ | zero_npu_ |
+| 387 | sub.out | sub_out_npu |
+| 388 | sub.Tensor | sub_npu |
+| 389 | sub_.Tensor | sub_npu_ |
+| 390 | sub.Scalar | sub_npu |
+| 391 | sub_.Scalar | sub_npu_ |
+| 392 | rsub.Tensor | rsub_npu |
+| 393 | rsub.Scalar | rsub_npu |
+| 394 | addmm.out | addmm_out_npu |
+| 395 | addmm | addmm_npu |
+| 396 | addmm_ | addmm_npu_ |
+| 397 | quantize_per_tensor | quantize_per_tensor_npu |
+| 398 | quantize_per_channel | quantize_per_channel_npu |
+| 399 | to.dtype_layout | to_npu |
+| 400 | to.device | to_device_npu |
+| 401 | to.dtype | to_dtype_npu |
+| 402 | to.other | to_other_npu |
+| 403 | _local_scalar_dense | _local_scalar_dense_npu |
+| 404 | lstm.input | lstm_npu |
+| 405 | lstm.data | lstm_npu |
+| 406 | gru.input | gru_npu_ |
+| 407 | _pack_padded_sequence | _pack_padded_sequence_npu |
+| 408 | _pad_packed_sequence | _pad_packed_sequence_npu |
+| 409 | set_.source_Storage | set_npu_ |
+| 410 | set_.source_Storage_storage_offset | set_npu_ |
+| 411 | set_.source_Tensor | set_npu_ |
+| 412 | set_ | set_npu_ |
+| 413 | masked_fill_.Scalar | masked_fill_npu_ |
+| 414 | masked_fill_.Tensor | masked_fill_npu_ |
+| 415 | masked_scatter_ | masked_scatter_npu_ |
+| 416 | view | view_npu |
+| 417 | put_ | put_npu_ |
+| 418 | index_add_ | index_add_npu_ |
+| 419 | index_add | index_add_npu |
+| 420 | index_add.dimname | index_add_npu |
+| 421 | index_fill_.int_Scalar | index_fill_npu_ |
+| 422 | index_fill.int_Scalar | index_fill_npu |
+| 423 | index_fill_.int_Tensor | index_fill_npu_ |
+| 424 | index_fill.int_Tensor | index_fill_npu |
+| 425 | scatter_.src | scatter_npu_ |
+| 426 | scatter_.value | scatter_npu_ |
+| 427 | scatter_add_ | scatter_add_npu_ |
+| 428 | scatter_add | scatter_add_npu |
+| 429 | scatter_add.dimname | scatter_add_npu |
+| 430 | lt_.Scalar | lt_npu_ |
+| 431 | lt_.Tensor | lt_npu_ |
+| 432 | gt_.Scalar | gt_npu_ |
+| 433 | gt_.Tensor | gt_npu_ |
+| 434 | le_.Scalar | le_npu_ |
+| 435 | le_.Tensor | le_npu_ |
+| 436 | ge_.Scalar | ge_npu_ |
+| 437 | ge_.Tensor | ge_npu_ |
+| 438 | eq_.Scalar | eq_npu_ |
+| 439 | eq_.Tensor | eq_npu_ |
+| 440 | ne_.Scalar | ne_npu_ |
+| 441 | ne_.Tensor | ne_npu_ |
+| 442 | bitwise_and.Tensor_out | bitwise_and_out_npu |
+| 443 | bitwise_and.Scalar_out | bitwise_and_out_npu |
+| 444 | bitwise_and.Scalar | bitwise_and_npu |
+| 445 | bitwise_and.Tensor | bitwise_and_npu |
+| 446 | bitwise_and_.Scalar | bitwise_and_npu_ |
+| 447 | bitwise_and_.Tensor | bitwise_and_npu_ |
+| 448 | __and__.Scalar | __and___npu |
+| 449 | __and__.Tensor | __and___npu |
+| 450 | bitwise_or.Tensor_out | bitwise_or_out_npu |
+| 451 | bitwise_or.Scalar_out | bitwise_or_out_npu |
+| 452 | bitwise_or.Scalar | bitwise_or_npu |
+| 453 | bitwise_or.Tensor | bitwise_or_npu |
+| 454 | bitwise_or_.Scalar | bitwise_or_npu_ |
+| 455 | bitwise_or_.Tensor | bitwise_or_npu_ |
+| 456 | __or__.Scalar | __or___npu |
+| 457 | __or__.Tensor | __or___npu |
+| 458 | __ior__.Scalar | __ior___npu |
+| 459 | __ior__.Tensor | __ior___npu |
+| 460 | bitwise_xor.Tensor_out | bitwise_xor_out_npu |
+| 461 | bitwise_xor.Scalar_out | bitwise_xor_out_npu |
+| 462 | bitwise_xor.Scalar | bitwise_xor_npu |
+| 463 | bitwise_xor.Tensor | bitwise_xor_npu |
+| 464 | bitwise_xor_.Scalar | bitwise_xor_npu_ |
+| 465 | bitwise_xor_.Tensor | bitwise_xor_npu_ |
+| 466 | __xor__.Scalar | __xor___npu |
+| 467 | __xor__.Tensor | __xor___npu |
+| 468 | atan2_ | atan2_npu_ |
+| 469 | tril_ | tril_npu_ |
+| 470 | triu_ | triu_npu_ |
+| 471 | renorm_ | renorm_npu_ |
+| 472 | pow_.Scalar | pow_npu_ |
+| 473 | pow_.Tensor | pow_npu_ |
+| 474 | lerp_.Scalar | lerp_npu_ |
+| 475 | lerp_.Tensor | lerp_npu_ |
+| 476 | fmod_.Scalar | fmod_npu_ |
+| 477 | fmod_.Tensor | fmod_npu_ |
+| 478 | remainder_.Scalar | remainder_npu_ |
+| 479 | remainder_.Tensor | remainder_npu_ |
+| 480 | addbmm_ | addbmm_npu_ |
+| 481 | addbmm.out | addbmm_out_npu |
+| 482 | addbmm | addbmm_npu |
+| 483 | addcdiv_ | addcdiv_npu_ |
+| 484 | random_.from | random_npu_ |
+| 485 | random_.to | random_npu_ |
+| 486 | random_ | random_npu_ |
+| 487 | uniform_ | uniform_npu_ |
+| 488 | diag.out | diag_out_npu |
+| 489 | diag | diag_npu |
+| 490 | cross.out | cross_out_npu |
+| 491 | cross | cross_npu |
+| 492 | triu.out | triu_out_npu |
+| 493 | triu | triu_npu |
+| 494 | tril.out | tril_out_npu |
+| 495 | tril | tril_npu |
+| 496 | ne.Scalar_out | ne_out_npu |
+| 497 | ne.Scalar | ne_npu |
+| 498 | ne.Tensor_out | ne_out_npu |
+| 499 | ne.Tensor | ne_npu |
+| 500 | eq.Scalar_out | eq_out_npu |
+| 501 | eq.Scalar | eq_npu |
+| 502 | eq.Tensor_out | eq_out_npu |
+| 503 | eq.Tensor | eq_npu |
+| 504 | ge.Scalar_out | ge_out_npu |
+| 505 | ge.Scalar | ge_npu |
+| 506 | ge.Tensor_out | ge_out_npu |
+| 507 | ge.Tensor | ge_npu |
+| 508 | le.Scalar_out | le_out_npu |
+| 509 | le.Scalar | le_npu |
+| 510 | le.Tensor_out | le_out_npu |
+| 511 | le.Tensor | le_npu |
+| 512 | gt.Scalar_out | gt_out_npu |
+| 513 | gt.Scalar | gt_npu |
+| 514 | gt.Tensor_out | gt_out_npu |
+| 515 | gt.Tensor | gt_npu |
+| 516 | lt.Scalar_out | lt_out_npu |
+| 517 | lt.Scalar | lt_npu |
+| 518 | lt.Tensor_out | lt_out_npu |
+| 519 | lt.Tensor | lt_npu |
+| 520 | take.out | take_out_npu |
+| 521 | take | take_npu |
+| 522 | index_select.out | index_select_out_npu |
+| 523 | index_select | index_select_npu |
+| 524 | index_select.dimname_out | index_select_out_npu |
+| 525 | index_select.dimname | index_select_npu |
+| 526 | masked_select.out | masked_select_out_npu |
+| 527 | masked_select | masked_select_npu |
+| 528 | nonzero.out | nonzero_out_npu |
+| 529 | nonzero | nonzero_npu |
+| 530 | gather.out | gather_out_npu |
+| 531 | gather | gather_npu |
+| 532 | gather.dimname_out | gather_out_npu |
+| 533 | gather.dimname | gather_npu |
+| 534 | addcmul.out | addcmul_out_npu |
+| 535 | addcmul | addcmul_npu |
+| 536 | addcmul_ | addcmul_npu_ |
+| 537 | addcdiv.out | addcdiv_out_npu |
+| 538 | addcdiv | addcdiv_npu |
+| 539 | qr.Q | qr_out_npu |
+| 540 | qr | qr_npu |
+| 541 | multinomial.out | multinomial_out_npu |
+| 542 | multinomial | multinomial_npu |
+| 543 | erfinv | erfinv_npu |
+| 544 | erfinv_ | erfinv_npu_ |
+| 545 | erfinv.out | erfinv_out_npu |
+| 546 | sign | sign_npu |
+| 547 | sign_ | sign_npu_ |
+| 548 | sign.out | sign_out_npu |
+| 549 | atan2.out | atan2_out_npu |
+| 550 | atan2 | atan2_npu |
+| 551 | lerp.Scalar_out | lerp_out_npu |
+| 552 | lerp.Tensor_out | lerp_out_npu |
+| 553 | lerp.Scalar | lerp_npu |
+| 554 | lerp.Tensor | lerp_npu |
+| 555 | histc.out | histc_out_npu |
+| 556 | histc | histc_npu |
+| 557 | fmod.Scalar_out | fmod_out_npu |
+| 558 | fmod.Scalar | fmod_npu |
+| 559 | fmod.Tensor_out | fmod_out_npu |
+| 560 | fmod.Tensor | fmod_npu |
+| 561 | remainder.Scalar_out | remainder_out_npu |
+| 562 | remainder.Scalar | remainder_npu |
+| 563 | remainder.Tensor_out | remainder_out_npu |
+| 564 | remainder.Tensor | remainder_npu |
+| 565 | min.out | min_out_npu |
+| 566 | min.other | min_npu |
+| 567 | min | min_npu |
+| 568 | max.out | max_out_npu |
+| 569 | max.other | max_npu |
+| 570 | max | max_npu |
+| 571 | median | median_npu |
+| 572 | sort.values | sort_out_npu |
+| 573 | sort | sort_npu |
+| 574 | sort.dimname_values | sort_out_npu |
+| 575 | sort.dimname | sort_npu |
+| 576 | argsort | argsort_npu |
+| 577 | argsort.dimname | argsort_npu |
+| 578 | topk.values | topk_out_npu |
+| 579 | topk | topk_npu |
+| 580 | all | all_npu |
+| 581 | any | any_npu |
+| 582 | renorm.out | renorm_out_npu |
+| 583 | renorm | renorm_npu |
+| 584 | unfold | unfold |
+| 585 | equal | equal_npu |
+| 586 | pow.Tensor_Tensor_out | pow_out_npu |
+| 587 | pow.Tensor_Tensor | pow_npu |
+| 588 | pow.Scalar_out | pow_out_npu |
+| 589 | pow.Scalar | pow_npu |
+| 590 | normal_ | normal_npu_ |
+| 591 | normal.Tensor_float_out | normal_out_npu |
+| 592 | normal.Tensor_float | normal_npu |
+| 593 | normal.float_Tensor_out | normal_out_npu |
+| 594 | normal.float_Tensor | normal_npu |
+| 595 | normal.Tensor_Tensor_out | normal_out_npu |
+| 596 | normal.Tensor_Tensor | normal_npu |
+| 597 | normal.float_float | normal_npu |
+| 598 | normal.float_float_out | normal_out_npu |
+| 599 | _addr | _addr_npu |
+| 600 | _addr_ | _addr_npu_ |
+| 601 | _addr.out | _addr_out_npu |
+| 602 | _cumsum | _cumsum_npu |
+| 603 | _cumsum.out | _cumsum_out_npu |
+| 604 | _cumprod | _cumprod_npu |
+| 605 | _cumprod.out | _cumprod_out_npu |
+| 606 | _var | _var_npu |
+| 607 | _amp_non_finite_check_and_unscale_ | _amp_non_finite_check_and_unscale_npu_ |
+| 608 | _cat | _cat_npu |
+| 609 | _cat.out | _cat_out_npu |
+| 610 | _max | _max_npu |
+| 611 | _max.max | _max_out_npu |
+| 612 | _min | _min_npu |
+| 613 | _min.min | _min_out_npu |
+| 614 | mse_loss.out | mse_loss_out_npu |
+| 615 | mse_loss | mse_loss_npu |
+| 616 | mse_loss_backward.grad_input | mse_loss_backward_out_npu |
+| 617 | mse_loss_backward | mse_loss_backward_npu |
+| 618 | l1_loss.out | l1_loss_out_npu |
+| 619 | l1_loss | l1_loss_npu |
+| 620 | l1_loss_backward.grad_input | l1_loss_backward_out_npu |
+| 621 | l1_loss_backward | l1_loss_backward_npu |
+| 622 | multilabel_margin_loss.out | multilabel_margin_loss_out_npu |
+| 623 | multilabel_margin_loss | multilabel_margin_loss_npu |
+| 624 | multilabel_margin_loss_forward.output | multilabel_margin_loss_forward_out_npu |
+| 625 | multilabel_margin_loss_forward | multilabel_margin_loss_forward_npu |
+| 626 | nll_loss.out | nll_loss_out_npu |
+| 627 | nll_loss | nll_loss_npu |
+| 628 | nll_loss_forward.output | nll_loss_forward_out_npu |
+| 629 | nll_loss_forward | nll_loss_forward_npu |
+| 630 | nll_loss_backward.grad_input | nll_loss_backward_out_npu |
+| 631 | nll_loss_backward | nll_loss_backward_npu |
+| 632 | nll_loss2d.out | nll_loss2d_out_npu |
+| 633 | nll_loss2d | nll_loss2d_npu |
+| 634 | nll_loss2d_forward.output | nll_loss2d_forward_out_npu |
+| 635 | nll_loss2d_forward | nll_loss2d_forward_npu |
+| 636 | nll_loss2d_backward.grad_input | nll_loss2d_backward_out_npu |
+| 637 | nll_loss2d_backward | nll_loss2d_backward_npu |
+| 638 | smooth_l1_loss.out | smooth_l1_loss_out_npu |
+| 639 | smooth_l1_loss | smooth_l1_loss_npu |
+| 640 | smooth_l1_loss_backward.grad_input | smooth_l1_loss_backward_out_npu |
+| 641 | smooth_l1_loss_backward | smooth_l1_loss_backward_npu |
+| 642 | soft_margin_loss.out | soft_margin_loss_out_npu |
+| 643 | soft_margin_loss | soft_margin_loss_npu |
+| 644 | soft_margin_loss_backward.grad_input | soft_margin_loss_backward_out_npu |
+| 645 | soft_margin_loss_backward | soft_margin_loss_backward_npu |
+| 646 | elu.out | elu_out_npu |
+| 647 | elu | elu_npu |
+| 648 | elu_backward.grad_input | elu_backward_out_npu |
+| 649 | elu_backward | elu_backward_npu |
+| 650 | elu_ | elu_npu_ |
+| 651 | glu.out | glu_out_npu |
+| 652 | glu | glu_npu |
+| 653 | glu_backward.grad_input | glu_backward_out_npu |
+| 654 | glu_backward | glu_backward_npu |
+| 655 | hardsigmoid.out | hardsigmoid_out_npu |
+| 656 | hardsigmoid | hardsigmoid_npu |
+| 657 | hardsigmoid_ | hardsigmoid_npu_ |
+| 658 | hardsigmoid_backward | hardsigmoid_backward_npu |
+| 659 | hardtanh.out | hardtanh_out_npu |
+| 660 | hardtanh | hardtanh_npu |
+| 661 | hardtanh_backward.grad_input | hardtanh_backward_out_npu |
+| 662 | hardtanh_backward | hardtanh_backward_npu |
+| 663 | hardtanh_ | hardtanh_npu_ |
+| 664 | leaky_relu.out | leaky_relu_out_npu |
+| 665 | leaky_relu | leaky_relu_npu |
+| 666 | leaky_relu_backward | leaky_relu_backward_npu |
+| 667 | leaky_relu_ | leaky_relu_npu_ |
+| 668 | log_sigmoid.out | log_sigmoid_out_npu |
+| 669 | log_sigmoid | log_sigmoid_npu |
+| 670 | log_sigmoid_forward.output | log_sigmoid_forward_out_npu |
+| 671 | log_sigmoid_forward | log_sigmoid_forward_npu |
+| 672 | log_sigmoid_backward.grad_input | log_sigmoid_backward_out_npu |
+| 673 | log_sigmoid_backward | log_sigmoid_backward_npu |
+| 674 | softplus.out | softplus_out_npu |
+| 675 | softplus | softplus_npu |
+| 676 | softplus_backward.grad_input | softplus_backward_out_npu |
+| 677 | softplus_backward | softplus_backward_npu |
+| 678 | softshrink.out | softshrink_out_npu |
+| 679 | softshrink | softshrink_npu |
+| 680 | softshrink_backward.grad_input | softshrink_backward_out_npu |
+| 681 | softshrink_backward | softshrink_backward_npu |
+| 682 | adaptive_avg_pool2d.out | adaptive_avg_pool2d_out_npu |
+| 683 | adaptive_avg_pool2d | adaptive_avg_pool2d_npu |
+| 684 | _adaptive_avg_pool2d | _adaptive_avg_pool2d_npu |
+| 685 | _adaptive_avg_pool2d_backward | adaptive_avg_pool2d_backward_npu |
+| 686 | adaptive_avg_pool3d.out | adaptive_avg_pool3d_out_npu |
+| 687 | adaptive_avg_pool3d | adaptive_avg_pool3d_npu |
+| 688 | adaptive_avg_pool3d_backward.grad_input | adaptive_avg_pool3d_backward_out_npu |
+| 689 | adaptive_avg_pool3d_backward | adaptive_avg_pool3d_backward_npu |
+| 690 | adaptive_max_pool2d.out | adaptive_max_pool2d_out_npu |
+| 691 | adaptive_max_pool2d | adaptive_max_pool2d_npu |
+| 692 | adaptive_max_pool2d_backward.grad_input | adaptive_max_pool2d_backward_out_npu |
+| 693 | adaptive_max_pool2d_backward | adaptive_max_pool2d_backward_npu |
+| 694 | avg_pool2d.out | avg_pool2d_out_npu |
+| 695 | avg_pool2d | avg_pool2d_npu |
+| 696 | avg_pool2d_backward.grad_input | avg_pool2d_backward_out_npu |
+| 697 | avg_pool2d_backward | avg_pool2d_backward_npu |
+| 698 | avg_pool3d.out | avg_pool3d_out_npu |
+| 699 | avg_pool3d | avg_pool3d_npu |
+| 700 | avg_pool3d_backward.grad_input | avg_pool3d_backward_out_npu |
+| 701 | avg_pool3d_backward | avg_pool3d_backward_npu |
+| 702 | max_pool2d_with_indices.out | max_pool2d_with_indices_out_npu |
+| 703 | max_pool2d_with_indices | max_pool2d_with_indices_npu |
+| 704 | max_pool2d_with_indices_backward.grad_input | max_pool2d_with_indices_backward_out_npu |
+| 705 | max_pool2d_with_indices_backward | max_pool2d_with_indices_backward_npu |
+| 706 | max_pool3d_with_indices.out | max_pool3d_with_indices_out_npu |
+| 707 | max_pool3d_with_indices | max_pool3d_with_indices_npu |
+| 708 | max_pool3d_with_indices_backward.grad_input | max_pool3d_with_indices_backward_out_npu |
+| 709 | max_pool3d_with_indices_backward | max_pool3d_with_indices_backward_npu |
+| 710 | reflection_pad2d.out | reflection_pad2d_out_npu |
+| 711 | reflection_pad2d | reflection_pad2d_npu |
+| 712 | replication_pad2d.out | replication_pad2d_out_npu |
+| 713 | replication_pad2d | replication_pad2d_npu |
+| 714 | upsample_linear1d.out | upsample_linear1d_out_npu |
+| 715 | upsample_linear1d | upsample_linear1d_npu |
+| 716 | upsample_linear1d_backward | upsample_linear1d_backward_npu |
+| 717 | upsample_bilinear2d.out | upsample_bilinear2d_out_npu |
+| 718 | upsample_bilinear2d | upsample_bilinear2d_npu |
+| 719 | upsample_bilinear2d_backward.grad_input | upsample_bilinear2d_backward_out_npu |
+| 720 | upsample_bilinear2d_backward | upsample_bilinear2d_backward_npu |
+| 721 | upsample_bicubic2d.out | upsample_bicubic2d_out_npu |
+| 722 | upsample_bicubic2d | upsample_bicubic2d_npu |
+| 723 | upsample_bicubic2d_backward.grad_input | upsample_bicubic2d_backward_out_npu |
+| 724 | upsample_bicubic2d_backward | upsample_bicubic2d_backward_npu |
+| 725 | upsample_trilinear3d.out | upsample_trilinear3d_out_npu |
+| 726 | upsample_trilinear3d | upsample_trilinear3d_npu |
+| 727 | upsample_trilinear3d_backward.grad_input | upsample_trilinear3d_backward_out_npu |
+| 728 | upsample_trilinear3d_backward | upsample_trilinear3d_backward_npu |
+| 729 | upsample_nearest1d.out | upsample_nearest1d_out_npu |
+| 730 | upsample_nearest1d | upsample_nearest1d_npu |
+| 731 | upsample_nearest1d_backward.grad_input | upsample_nearest1d_backward_out_npu |
+| 732 | upsample_nearest1d_backward | upsample_nearest1d_backward_npu |
+| 733 | upsample_nearest2d.out | upsample_nearest2d_out_npu |
+| 734 | upsample_nearest2d | upsample_nearest2d_npu |
+| 735 | upsample_nearest2d_backward.grad_input | upsample_nearest2d_backward_out_npu |
+| 736 | upsample_nearest2d_backward | upsample_nearest2d_backward_npu |
+| 737 | upsample_nearest3d.out | upsample_nearest3d_out_npu |
+| 738 | upsample_nearest3d | upsample_nearest3d_npu |
+| 739 | upsample_nearest3d_backward.grad_input | upsample_nearest3d_backward_out_npu |
+| 740 | upsample_nearest3d_backward | upsample_nearest3d_backward_npu |
+| 741 | sigmoid_backward.grad_input | sigmoid_backward_out_npu |
+| 742 | sigmoid_backward | sigmoid_backward_npu |
+| 743 | tanh_backward.grad_input | tanh_backward_out_npu |
+| 744 | tanh_backward | tanh_backward_npu |
+| 745 | slow_conv_transpose2d.out | slow_conv_transpose2d_out_npu |
+| 746 | slow_conv_transpose2d | slow_conv_transpose2d_npu |
+| 747 | slow_conv_transpose2d_backward.grad_output | slow_conv_transpose2d_backward_out_npu |
+| 748 | slow_conv_transpose2d_backward.output_mask | slow_conv_transpose2d_backward_npu |
+| 749 | thnn_conv2d.out | thnn_conv2d_out_npu |
+| 750 | thnn_conv2d | thnn_conv2d_npu |
+| 751 | thnn_conv2d_forward.output | thnn_conv2d_forward_out_npu |
+| 752 | thnn_conv2d_forward | thnn_conv2d_forward_npu |
+| 753 | thnn_conv2d_backward.output_mask | thnn_conv2d_backward_npu |
+| 754 | thnn_conv_depthwise2d.out | thnn_conv_depthwise2d_out_npu |
+| 755 | thnn_conv_depthwise2d | thnn_conv_depthwise2d_npu |
+| 756 | thnn_conv_depthwise2d_forward.out | thnn_conv_depthwise2d_forward_out_npu |
+| 757 | thnn_conv_depthwise2d_forward | thnn_conv_depthwise2d_forward_npu |
+| 758 | thnn_conv_depthwise2d_backward.grad_input | thnn_conv_depthwise2d_backward_out_npu |
+| 759 | thnn_conv_depthwise2d_backward.output_mask | thnn_conv_depthwise2d_backward_npu |
+| 760 | slow_conv_dilated2d | slow_conv_dilated2d_npu |
+| 761 | slow_conv_dilated2d_backward | slow_conv_dilated2d_backward_npu |
+| 762 | col2im.out | im2col_backward_out_npu |
+| 763 | col2im | im2col_backward_npu |
+| 764 | col2im_backward.grad_input | col2im_backward_out_npu |
+| 765 | col2im_backward | col2im_backward_npu |
+| 766 | im2col.out | im2col_out_npu |
+| 767 | im2col | im2col_npu |
+| 768 | im2col_backward.grad_input | im2col_backward_out_npu |
+| 769 | im2col_backward | im2col_backward_npu |
+| 770 | isfinite | isfinite_npu |
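+
+The mapping above is transparent at the Python layer: the adapter registers the right-hand implementations behind the native operator schemas, so existing model code dispatches to them once tensors live on an NPU device. The following is an illustrative sketch only; it assumes the Ascend PyTorch adapter is installed and an NPU device is visible (the device string is an assumption, not part of this list):
+
+```
+import torch
+
+# Minimal dispatch sketch (assumes the Ascend adapter and a visible NPU device).
+x = torch.randn(2, 3).to("npu:0")
+y = torch.randn(2, 3).to("npu:0")
+z = torch.add(x, y)          # routed to add_npu (No. 10 above)
+m = torch.matmul(x, y.t())   # routed to matmul_npu (No. 219 above)
+```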

+## PyTorch Operators Customized by Ascend
+
+| No. | PyTorch Operator (Developed by Ascend) | Ascend Adapted Operator |
+| ---- | ---- | ---- |
+| 1 | npu_convolution_transpose | npu_convolution_transpose |
+| 2 | npu_conv_transpose2d | convolution_transpose_npu |
+| 3 | npu_convolution_transpose_backward | convolution_transpose_backward_npu |
+| 4 | npu_convolution | npu_convolution |
+| 5 | npu_convolution_backward | npu_convolution_backward |
+| 6 | npu_conv2d | conv2d_npu |
+| 7 | npu_conv2d.out | conv2d_out_npu |
+| 8 | npu_conv2d_backward | conv2d_backward_npu |
+| 9 | npu_conv3d | conv3d_npu |
+| 10 | npu_conv3d.out | conv3d_out_npu |
+| 11 | npu_conv3d_backward | conv3d_backward_npu |
+| 12 | one_ | one_npu_ |
+| 13 | npu_sort_v2.out | sort_without_indices_out_npu |
+| 14 | npu_sort_v2 | sort_without_indices_npu |
+| 15 | npu_format_cast | format_cast_npu |
+| 16 | npu_format_cast_.acl_format | format_cast_npu_ |
+| 17 | npu_format_cast_.src | format_cast_npu_ |
+| 18 | npu_transpose_to_contiguous | transpose_to_contiguous_npu |
+| 19 | npu_transpose | transpose_npu |
+| 20 | npu_transpose.out | transpose_out_npu |
+| 21 | npu_broadcast | broadcast_npu |
+| 22 | npu_broadcast.out | broadcast_out_npu |
+| 23 | npu_dtype_cast | dtype_cast_npu |
+| 24 | npu_dtype_cast_.Tensor | dtype_cast_npu_ |
+| 25 | npu_roi_alignbk | roi_align_backward_npu |
+| 26 | empty_with_format | empty_with_format_npu |
+| 27 | empty_with_format.names | empty_with_format_npu |
+| 28 | copy_memory_ | copy_memory_npu_ |
+| 29 | npu_one_hot | one_hot_npu |
+| 30 | npu_stride_add | stride_add_npu |
+| 31 | npu_softmax_cross_entropy_with_logits | softmax_cross_entropy_with_logits_npu |
+| 32 | npu_softmax_cross_entropy_with_logits_backward | softmax_cross_entropy_with_logits_backward_npu |
+| 33 | npu_ps_roi_pooling | ps_roi_pooling_npu |
+| 34 | npu_ps_roi_pooling_backward | ps_roi_pooling_backward_npu |
+| 35 | npu_roi_align | roi_align_npu |
+| 36 | npu_nms_v4 | nms_v4_npu |
+| 37 | npu_lstm | lstm_npu |
+| 38 | npu_lstm_backward | lstm_backward_npu |
+| 39 | npu_iou | iou_npu |
+| 40 | npu_ptiou | ptiou_npu |
+| 41 | npu_nms_with_mask | nms_with_mask_npu |
+| 42 | npu_pad | pad_npu |
+| 43 | npu_bounding_box_encode | bounding_box_encode_npu |
+| 44 | npu_bounding_box_decode | bounding_box_decode_npu |
+| 45 | npu_gru | gru_npu |
+| 46 | npu_gru_backward | gru_backward_npu |
+| 47 | npu_set_.source_Storage_storage_offset_format | set_npu_ |
+| 48 | npu_random_choice_with_mask | random_choice_with_mask_npu |
+| 49 | npu_batch_nms | batch_nms_npu |
+| 50 | npu_slice | slice_npu |
+| 51 | npu_slice.out | slice_out_npu |
+| 52 | npu_dropoutV2 | dropout_v2_npu |
+| 53 | npu_dropoutV2_backward | dropout_v2_backward_npu |
+| 54 | _npu_dropout | _dropout_npu |
+| 55 | _npu_dropout_inplace | _dropout_npu_inplace |
+| 56 | npu_dropout_backward | dropout_backward_npu |
+| 57 | npu_indexing | indexing_npu |
+| 58 | npu_indexing.out | indexing_out_npu |
+| 59 | npu_ifmr | ifmr_npu |
+| 60 | npu_max.dim | max_v1_npu |
+| 61 | npu_max.names_dim | max_v1_npu |
+| 62 | npu_scatter | scatter_npu |
+| 63 | npu_max_backward | max_backward_npu |
+| 64 | npu_apply_adam | apply_adam_npu |
+| 65 | npu_layer_norm_eval | layer_norm_eval_npu |
+| 66 | npu_alloc_float_status | alloc_float_status_npu |
+| 67 | npu_get_float_status | get_float_status_npu |
+| 68 | npu_clear_float_status | clear_float_status_npu |
+| 69 | npu_confusion_transpose | confusion_transpose_npu |
+| 70 | npu_confusion_transpose_backward | confusion_transpose_backward_npu |
+| 71 | npu_bmmV2 | bmm_v2_npu |
+| 72 | fast_gelu | fast_gelu_npu |
+| 73 | fast_gelu_backward | fast_gelu_backward_npu |
+| 74 | npu_sub_sample | sub_sample_npu |
+| 75 | npu_deformable_conv2d | deformable_conv2d_npu |
+| 76 | npu_deformable_conv2dbk | deformable_conv2d_backward_npu |
+| 77 | npu_mish | mish_npu |
+| 78 | npu_anchor_response_flags | anchor_response_flags_npu |
+| 79 | npu_yolo_boxes_encode | yolo_boxes_encode_npu |
+| 80 | npu_grid_assign_positive | grid_assign_positive_npu |
+| 81 | npu_mish_backward | mish_backward_npu |
+| 82 | npu_normalize_batch | normalize_batch_npu |
+| 83 | npu_masked_fill_range | masked_fill_range_npu |
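+
+Unlike the native mappings in the previous section, these operators are Ascend-specific entry points rather than overrides of existing schemas. As a hedged sketch only — the exact argument names and defaults are defined by the adapter's operator declarations and may differ by version — they are called directly from the torch namespace:
+
+```
+import torch
+
+# Hypothetical illustration; the depth keyword and its value are assumptions
+# made for this sketch — consult the adapter's declarations for the schema.
+labels = torch.tensor([0, 2, 1]).to("npu:0")
+one_hot = torch.npu_one_hot(labels, depth=3)   # backed by one_hot_npu (No. 29)
+```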
+ diff --git a/docs/en/PyTorch Operator Support/public_sys-resources/icon-caution.gif b/docs/en/PyTorch Operator Support/public_sys-resources/icon-caution.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Operator Support/public_sys-resources/icon-caution.gif differ diff --git a/docs/en/PyTorch Operator Support/public_sys-resources/icon-danger.gif b/docs/en/PyTorch Operator Support/public_sys-resources/icon-danger.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Operator Support/public_sys-resources/icon-danger.gif differ diff --git a/docs/en/PyTorch Operator Support/public_sys-resources/icon-note.gif b/docs/en/PyTorch Operator Support/public_sys-resources/icon-note.gif new file mode 100644 index 0000000000000000000000000000000000000000..6314297e45c1de184204098efd4814d6dc8b1cda Binary files /dev/null and b/docs/en/PyTorch Operator Support/public_sys-resources/icon-note.gif differ diff --git a/docs/en/PyTorch Operator Support/public_sys-resources/icon-notice.gif b/docs/en/PyTorch Operator Support/public_sys-resources/icon-notice.gif new file mode 100644 index 0000000000000000000000000000000000000000..86024f61b691400bea99e5b1f506d9d9aef36e27 Binary files /dev/null and b/docs/en/PyTorch Operator Support/public_sys-resources/icon-notice.gif differ diff --git a/docs/en/PyTorch Operator Support/public_sys-resources/icon-tip.gif b/docs/en/PyTorch Operator Support/public_sys-resources/icon-tip.gif new file mode 100644 index 0000000000000000000000000000000000000000..93aa72053b510e456b149f36a0972703ea9999b7 Binary files /dev/null and b/docs/en/PyTorch Operator Support/public_sys-resources/icon-tip.gif differ diff --git a/docs/en/PyTorch Operator Support/public_sys-resources/icon-warning.gif b/docs/en/PyTorch Operator Support/public_sys-resources/icon-warning.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e90d7cfc2193e39e10bb58c38d01a23f045d571 Binary files /dev/null and b/docs/en/PyTorch Operator Support/public_sys-resources/icon-warning.gif differ diff --git a/docs/en/RELEASENOTE/RELEASENOTE.md b/docs/en/RELEASENOTE/RELEASENOTE.md new file mode 100644 index 0000000000000000000000000000000000000000..47a9de634802cca189dcca262576ac69dbba1115 --- /dev/null +++ b/docs/en/RELEASENOTE/RELEASENOTE.md @@ -0,0 +1,139 @@ +# PyTorch Release Notes 2.0.2 +- [Before You Start](#before-you-start.md) +- [New Features](#new-features.md) +- [Modified Features](#modified-features.md) +- [Resolved Issues](#resolved-issues.md) +- [Known Issues](#known-issues.md) +- [Compatibility](#compatibility.md) +

+## Before You Start
+
+This framework is modified based on the open-source PyTorch 1.5.0 primarily developed by Facebook. It inherits native PyTorch features and uses NPUs for dynamic graph training. Models are adapted at operator granularity, code can be reused, and existing networks can be ported to and run on NPUs with only the device types or data types modified.
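+
+As a concrete illustration of that porting claim, a CUDA script usually needs only its device strings changed. The following is a minimal sketch; it assumes the adapter is installed and registers the npu device type, and the set_device call is an assumption taken from the installation guide:
+
+```
+import torch
+
+# Minimal porting sketch: the CUDA version would use "cuda:0" here; on an
+# Ascend machine only the device string changes.
+device = "npu:0"
+torch.npu.set_device(device)   # assumed per the installation guide
+model = torch.nn.Linear(16, 4).to(device)
+data = torch.randn(8, 16).to(device)
+out = model(data)
+```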

+## New Features
+
+**Table 1** Features supported by PyTorch
+
+| Level-1 Feature | Level-2 Feature | Description |
+| ---- | ---- | ---- |
+| Adapted training models | YOLOV4 | - |
+| | YOLOV3 | - |
+| | DB | - |
+| | RFCN | - |
+| | CRNN | - |
+| | Densenet161 | - |
+| | Densenet191 | - |
+| PyTorch features adapted to NPUs | Basic framework functions | Added the function of adapted operator development. For details, see the operator list. |
+| | Model accuracy analyzer | Added model accuracy analyzers and supported training accuracy demarcation. |
+| | Ascend 710 AI Processor | Supported online inference on the Ascend 710 AI Processor. |
+| | OS compatibility | Supported Ubuntu 18.04.5 and openEuler 20.03 LTS. |

+## Modified Features
+
+N/A
+
+## Resolved Issues
+
+N/A

+## Known Issues
+
+| Known Issue | Description |
+| ---- | ---- |
+| Data type | The NPU does not support input or output of inf/nan data of the float16 type. |
+| Data format | Dimensions cannot be reduced for formats with more than four dimensions. |
+| Restrictions on collective communication | The graphs executed on different devices in a training job must be the same. Allocation of only 1, 2, 4, or 8 processors is supported. Only the int8, int32, float16, and float32 data types are supported. |
+| Apex function | In the current version, Apex is implemented mainly in Python; the customized CUDA kernel optimizations in Apex are not supported. |
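+
+For the float16 inf/nan restriction above, a common generic mitigation is to run numerically risky steps in float32 and cast back afterwards. This sketch uses only standard PyTorch calls and is offered as one possible workaround, not an official recommendation:
+
+```
+import torch
+
+# Upcast around operations that can overflow half precision, then downcast
+# once the values are finite again.
+def safe_logsumexp_half(x_half, dim=-1):
+    return torch.logsumexp(x_half.float(), dim=dim).half()
+```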

+## Compatibility
+
+Atlas 800 \(model 9010\): CentOS 7.6/Ubuntu 18.04/BC-Linux 7.6/Debian 9.9/Debian 10/openEuler 20.03 LTS
+
+Atlas 800 \(model 9000\): CentOS 7.6/Euler 2.8/Kylin v10/BC-Linux 7.6/openEuler 20.03 LTS
+
diff --git "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227.md"
index ff47abe64afcb7dcdfac419e988c99d5168fc1d9..a127b3865ac3d7d589bfc75e13e056b622c700e2 100644
--- "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227.md"
+++ "b/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227.md"
@@ -33,7 +33,7 @@

Prerequisites

-The PyTorch framework and the mixed precision module have been installed. For details about setting up the PyTorch operating environment, see the PyTorch Installation Guide.
+The PyTorch framework and the mixed precision module have been installed. For details about setting up the PyTorch operating environment, see the [PyTorch Installation Guide](https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97.md).

Online Inference Process

@@ -131,7 +131,7 @@ export TASK_QUEUE_ENABLE=0
 >![](public_sys-resources/icon-note.gif) **Note:**
->For more log information, see the CANN Log Reference.
+>For more log information, see the [CANN Log Reference](https://support.huawei.com/enterprise/zh/doc/EDOC1100206691?idPath=23710424%7C251366513%7C22892968%7C251168373).
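For orientation, a compact sketch of the online inference flow described in this section (the checkpoint path is hypothetical, and the "npu:0" device string follows the Ascend adapter):

```
import torch
import torchvision.models as models

CHECKPOINT = "./resnet50_checkpoint.pth"   # hypothetical path

model = models.resnet50()
model.load_state_dict(torch.load(CHECKPOINT, map_location="cpu"))
model = model.to("npu:0")
model.eval()                               # switch to inference mode

with torch.no_grad():                      # no gradients needed for inference
    output = model(torch.rand(1, 3, 224, 224).to("npu:0"))
```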

Sample Reference

@@ -445,7 +445,7 @@ if __name__ == '__main__': 基于NPU芯片的架构特性,模型运算会涉及到混合精度,即混合使用float16和float32数据类型的应用场景。使用float16代替float32有如下好处: - 对于中间变量的内存占用更少,节省内存的使用。 -- 因内存使用会减少,所以数据传出的时间也会减少。 +- 因内存使用会减少,所以数据传出的时间也会相应减少。 - float16的计算单元可以提供更快的计算性能。 但是,混合精度训练受限于float16表达的精度范围,单纯将float32转换成float16会影响训练收敛情况,为了保证部分计算使用float16来进行加速的同时能保证训练收敛,这里采用混合精度模块Apex来达到以上效果。混合精度模块Apex是一个集优化性能、精度收敛于一身的综合优化库。 @@ -540,7 +540,6 @@ if __name__ == '__main__': apt-get install bzip2 ``` - 4. 编译安装gcc。 1. 进入gcc-7.3.0.tar.gz源码包所在目录,解压源码包,命令为: diff --git "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" "b/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" index a6aebc6309a376c2a80c2dcb1de1072936233b9b..3e69dda120f04dc07f6f9e88d94c66ab822653ba 100644 Binary files "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" and "b/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" differ diff --git "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\346\235\203\351\207\215\346\233\264\346\226\260\346\265\201\347\250\213\347\244\272\346\204\217\345\233\276.png" "b/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\346\235\203\351\207\215\346\233\264\346\226\260\346\265\201\347\250\213\347\244\272\346\204\217\345\233\276.png" deleted file mode 100644 index 87b61f83d611b35ea15d0f069b71245d72ee7a0a..0000000000000000000000000000000000000000 Binary files "a/docs/zh/PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227/figures/\346\235\203\351\207\215\346\233\264\346\226\260\346\265\201\347\250\213\347\244\272\346\204\217\345\233\276.png" and /dev/null differ diff --git "a/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md" index ea7376bab905c5bb95a78068695e3ecf6fbd2638..052e0f8f6a655c49bf843387c1b4609bc6fbf5cc 100644 --- "a/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md" @@ -34,7 +34,7 @@ ## 前提条件 -- 需完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南》。 +- 需完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南](https://support.huawei.com/enterprise/zh/doc/EDOC1100206656?idPath=23710424%7C251366513%7C22892968%7C251168373)》。 - 需安装3.12.0以上版本的CMake,安装方法请参考[CMake安装方法](#CMake安装方法.md)。 - 需确保已安装7.3.0以上版本的gcc,7.3.0版本gcc具体安装及使用方式请参考[安装7.3.0版本gcc](#安装7-3-0版本gcc.md)。 - 需确保环境中已安装patch、git工具,以Ubuntu和CentOS系统为例,命令如下: @@ -118,7 +118,6 @@ git submodule update --init --recursive ``` - >![](public_sys-resources/icon-note.gif) **说明:** >受网络波动影响,源码获取时间可能较长,下载过程中请耐心等待。 下载完成之后若没有报错,即生成了PyTorch及其依赖的第三方代码。 @@ -141,7 +140,6 @@ 生成的二进制包在当前的dist目录下,即“pytorch/pytorch/dist”文件夹目录下。 - 5. 
安装PyTorch。 进入“pytorch/pytorch/dist“文件夹目录,执行如下命令安装。 @@ -341,7 +339,6 @@ export HCCL_IF_IP="1.1.1.1" # “1.1.1.1”为示例使用的host网卡IP,请 cd .. ``` - >![](public_sys-resources/icon-note.gif) **说明:** >受网络波动影响,源码获取时间可能较长,下载过程中请耐心等待。 @@ -364,7 +361,6 @@ export HCCL_IF_IP="1.1.1.1" # “1.1.1.1”为示例使用的host网卡IP,请 生成的二进制包在当前的dist目录下,即“apex/apex/dist”文件夹目录下。 - 4. 安装apex。 进入“apex/apex/dist“文件夹目录,执行如下命令安装。 @@ -391,7 +387,7 @@ export HCCL_IF_IP="1.1.1.1" # “1.1.1.1”为示例使用的host网卡IP,请 ## 前提条件 -- 已完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南》。 +- 已完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南](https://support.huawei.com/enterprise/zh/doc/EDOC1100206656?idPath=23710424%7C251366513%7C22892968%7C251168373)》。 - 宿主机上已安装Docker。 ## 获取并使用镜像 @@ -497,7 +493,6 @@ CMake版本升级为3.12.1的方法 apt-get install bzip2 ``` - 4. 编译安装gcc。 1. 进入gcc-7.3.0.tar.gz源码包所在目录,解压源码包,命令为: diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" index 32e95bc3714e23a93167f53da6e976936c743d41..a42c9403d325f7dcca01a1357de86383736ed0cf 100644 --- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -18,7 +18,6 @@ - [pip3.7 install torchvision安装失败](#pip3-7-install-torchvision安装失败.md) - [安装“torch-\*.whl ”提示“torch 1.5.0xxxx”与“torchvision”所依赖的版本不匹配](#安装-torch--whl-提示-torch-1-5-0xxxx-与-torchvision-所依赖的版本不匹配.md) - [如何查看测试的运行日志](#如何查看测试的运行日志.md) - - [测试运行输出的NPU错误码是什么意思,有无对应的含义解释?](#测试运行输出的NPU错误码是什么意思-有无对应的含义解释.md) - [为什么我实现的“自定义TBE算子”无法调用到?](#为什么我实现的-自定义TBE算子-无法调用到.md) - [如何确定“TBE算子”是否被“PyTorch适配”正确调用](#如何确定-TBE算子-是否被-PyTorch适配-正确调用.md) - [PyTorch编译失败,提示“error: ld returned 1 exit status”](#PyTorch编译失败-提示-error-ld-returned-1-exit-status.md) @@ -30,7 +29,7 @@ ## 概述 -为了实现PyTorch深度学习框架在昇腾AI处理器上运行,需要将框架算子用TBE自定义开发。用户通过完成TBE自定义算子适配PyTorch框架,实现PyTorch框架中算子在昇腾AI处理器上运行。 +为了实现PyTorch深度学习框架在昇腾AI处理器上运行,需要将框架算子用TBE自定义开发。

Operator Development Process

@@ -38,7 +37,7 @@ Pytorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 1. TBE算子开发:昇腾AI软件栈中不包含相应的算子,需要先完成TBE算子的开发,再进行PyTorch框架下的算子适配。 - TBE算子开发流程及方法请参见《CANN TBE自定义算子开发指南》。 + TBE算子开发流程及方法请参见《CANN TBE自定义算子开发指南](https://support.huawei.com/enterprise/zh/doc/EDOC1100206660)》。 2. PyTorch框架下的算子适配:昇腾AI软件栈中已实现了相应的TBE算子,可直接进行PyTorch框架适配。 @@ -131,9 +130,9 @@ Pytorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 ## 前提条件 -- 需完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南》。 +- 需完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南](https://support.huawei.com/enterprise/zh/doc/EDOC1100206656?idPath=23710424%7C251366513%7C22892968%7C251168373)》。 - 需安装3.12.0及以上版本的CMake,安装方法请参考[CMake安装方法](#CMake安装方法.md)。 -- 需确保已安装7.3.0以上版本的gcc,7.3.0版本gcc具体安装及使用方式请参见《CANN 软件安装指南》中的“安装7.3.0版本gcc”章节。 +- 需确保已安装7.3.0以上版本的gcc,7.3.0版本gcc具体安装及使用方式请参见《CANN 软件安装指南](https://support.huawei.com/enterprise/zh/doc/EDOC1100206656?idPath=23710424%7C251366513%7C22892968%7C251168373)》中的“安装7.3.0版本gcc”章节。 - 需确保环境中已安装git工具,以Ubuntu和CentOS系统为例,命令如下: - Ubuntu系统 @@ -173,10 +172,10 @@ pip3.7 install Pillow==5.3.0 昇腾AI处理器和PyTorch适配的算子查询方式如下。 - 当前昇腾AI处理器中支持的算子以及对应的算子约束可以通过以下两种方式查询。 - - 命令行开发方式下,您可以参见《CANN 算子清单 \(Ascend 910\)》进行离线查询。 - - MindStudio开发方式下,您可以通过MindStudio进行在线查询,详细查看方法可参见《MindStudio 用户指南》中的“算子&模型速查”章节。 + - 命令行开发方式下,您可以参见《CANN 算子清单 \(Ascend 910\)](https://support.huawei.com/enterprise/zh/doc/EDOC1100206663)》进行离线查询。 + - MindStudio开发方式下,您可以通过MindStudio进行在线查询,详细查看方法可参见《MindStudio 用户指南](https://support.huaweicloud.com/mindstudio302/)》中的“算子&模型速查”章节。 -- 当前PyTorch适配的算子列表可以参见《PyTorch适配算子清单》。 +- 当前PyTorch适配的算子列表可以参见《PyTorch适配算子清单](#https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E9%80%82%E9%85%8D%E7%AE%97%E5%AD%90%E6%B8%85%E5%8D%95/PyTorch%E9%80%82%E9%85%8D%E7%AE%97%E5%AD%90%E6%B8%85%E5%8D%95.md)》。

Operator Adaptation Development

@@ -194,7 +193,7 @@ pip3.7 install Pillow==5.3.0

Prerequisites

 - The development and operating environments and related dependencies have been installed. For details, see [Environment Setup](#环境准备.md).
-- The related TBE operators have been developed and deployed. For details, see the CANN TBE Custom Operator Development Guide.
+- The related TBE operators have been developed and deployed. For details, see the [CANN TBE Custom Operator Development Guide](https://support.huawei.com/enterprise/zh/doc/EDOC1100206660).

Obtaining the PyTorch Source Code

@@ -234,7 +233,6 @@ git clone https://gitee.com/ascend/pytorch-develop.git --deepth=1 - func:适配算子名称(输入参数信息) -> 返回类型 ``` - 3. 修改native\_functions.yaml文件,添加实现该算子相关函数的分发描述。 yaml 文件编写规范: @@ -253,7 +251,6 @@ git clone https://gitee.com/ascend/pytorch-develop.git --deepth=1 NPU: NPU_Adapt_Fun_Name ``` - >![](public_sys-resources/icon-note.gif) **说明:** >NPU\_Adapt\_Fun\_Name的格式为 : >- 如果原Fun\_Name无"\_"后缀,则格式:Fun\_Name + "\_" + "npu",如:add --\> add\_npu。 @@ -656,7 +653,7 @@ git clone https://gitee.com/ascend/pytorch-develop.git --deepth=1 format_list = [0, 3, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list - ] + ] self.add_result(shape_format) instantiate_device_type_tests(TestAdd, globals(), except_for="cpu") @@ -686,8 +683,6 @@ git clone https://gitee.com/ascend/pytorch-develop.git --deepth=1 - **[如何查看测试的运行日志](#如何查看测试的运行日志.md)** -- **[测试运行输出的NPU错误码是什么意思,有无对应的含义解释?](#测试运行输出的NPU错误码是什么意思-有无对应的含义解释.md)** - - **[为什么我实现的“自定义TBE算子”无法调用到?](#为什么我实现的-自定义TBE算子-无法调用到.md)** - **[如何确定“TBE算子”是否被“PyTorch适配”正确调用](#如何确定-TBE算子-是否被-PyTorch适配-正确调用.md)** @@ -772,10 +767,6 @@ pip3.7 install torchvision --no-deps ``` -

What Do the NPU Error Codes in the Test Output Mean? Is There Any Explanation of Them?

-
-See "[Error Code Definitions](https://support.huaweicloud.com/adevg-A300_3000_3010/atlasdevelopment_01_0256.html)".
-

Why Cannot the Custom TBE Operator I Implemented Be Called?

## 现象描述 diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md" index b9995f8b8f2099ed4d92f283013c2a69db8a6022..eaa6fcd9fd2efe92159efd4aa2f2924da61c08a9 100644 --- "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md" @@ -70,7 +70,7 @@ - [在模型运行时遇到报错“RuntimeError: Initialize.”](#在模型运行时遇到报错-RuntimeError-Initialize.md) - [在模型运行时遇到报错“TVM/te/cce error.”](#在模型运行时遇到报错-TVM-te-cce-error.md) - [在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed.md) - - [在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”1](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed-1.md) + - [在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed-7.md) - [在模型运行时将多任务下发关闭\(export TASK\_QUEUE\_ENABLE=0\)后仍然遇到报错“HelpACLExecute.”](#在模型运行时将多任务下发关闭(export-TASK_QUEUE_ENABLE-0)后仍然遇到报错-HelpACLExecute.md) - [在模型运行时遇到报错“55056 GetInputConstDataOut: ErrorNo: -1\(failed\)”](#在模型运行时遇到报错-55056-GetInputConstDataOut-ErrorNo--1(failed).md) - [模型调测常见问题](#模型调测常见问题.md) @@ -152,7 +152,7 @@

Operator Development

-

For details, see the PyTorch Operator Development Guide.

+

For details, see the PyTorch Operator Development Guide.

Environment Setup

@@ -172,7 +172,7 @@

Error Analysis

-

For details, see the CANN Log Reference and the "AI Core Error Analysis Tool Usage Guide" section in the CANN Auxiliary Development Tool Guide.

+

For details, see the CANN Log Reference and the "AI Core Error Analysis Tool Usage Guide" section in the CANN Auxiliary Development Tool Guide.

Performance Tuning and Analysis

@@ -192,7 +192,7 @@

Application Software Development

-

For details, see the CANN Application Software Development Guide (C&C++, Inference).

+

For details, see the CANN Application Software Development Guide (C&C++, Inference).

FAQ

@@ -206,7 +206,7 @@

Model Porting Evaluation

1. 在选取模型时,尽可能选取权威Pytorch模型实现仓作为标杆,包括但不限于Pytorch\([example](https://github.com/pytorch/examples/tree/master/imagenet)/[vision](https://github.com/pytorch/vision)等\)、facebookresearch\([Detectron](https://github.com/facebookresearch/Detectron)/[detectron2](https://github.com/facebookresearch/detectron2)等\)和open-mmlab\([mmdetection](https://github.com/open-mmlab/mmdetection)/[mmpose](https://github.com/open-mmlab/mmpose)等\)。 -2. 查看算子适配情况。将原始模型及训练脚本迁移到昇腾AI处理器上之前,可以将原始模型及训练脚本在CPU上进行训练,使用dump op方法获取算子信息,与《PyTorch适配算子清单》算子进行比较,查看是否支持。dump op方法参见[dump op方法](#dump-op方法.md),当有不支持算子时参见《PyTorch算子开发指南》进行算子开发。 +2. 查看算子适配情况。将原始模型及训练脚本迁移到昇腾AI处理器上之前,可以将原始模型及训练脚本在CPU上进行训练,使用dump op方法获取算子信息,与《PyTorch适配算子清单](#https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E9%80%82%E9%85%8D%E7%AE%97%E5%AD%90%E6%B8%85%E5%8D%95/PyTorch%E9%80%82%E9%85%8D%E7%AE%97%E5%AD%90%E6%B8%85%E5%8D%95.md)》算子进行比较,查看是否支持。dump op方法参见[dump op方法](#dump-op方法.md),当有不支持算子时参见《PyTorch算子开发指南](#https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E7%AE%97%E5%AD%90%E5%BC%80%E5%8F%91%E6%8C%87%E5%8D%97/PyTorch%E7%AE%97%E5%AD%90%E5%BC%80%E5%8F%91%E6%8C%87%E5%8D%97.md)》进行算子开发。 >![](public_sys-resources/icon-note.gif) **说明:** >查看算子适配情况也可以先将模型及训练脚本迁移到昇腾AI处理器(迁移方法参见下文)进行训练来查看报错信息。一般会提示不能在昇腾AI处理器的backend下运行某个算子(第一个不支持的算子)。 @@ -221,7 +221,7 @@

Preparing the Operating Environment

-Set up the PyTorch operating environment by referring to the PyTorch Installation Guide.
+Set up the PyTorch operating environment by referring to the [PyTorch Installation Guide](https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97.md).

Configuring Environment Variables

@@ -328,7 +328,7 @@ export HCCL_IF_IP="1.1.1.1" # “1.1.1.1”为示例使用的host网卡IP,请

HCCL_WHITELIST_DISABLE

Specifies whether to enable the communication whitelist when HCCL is used.

-
  • 0: Enable the whitelist. The HCCL communication whitelist does not need to be verified.
  • 1: Disable the whitelist. The HCCL communication whitelist needs to be verified.
+
  • 0: Enable the whitelist.
  • 1: Disable the whitelist.

The default value is 0, that is, the whitelist is enabled by default.

@@ -360,7 +360,7 @@ export HCCL_IF_IP="1.1.1.1" # “1.1.1.1”为示例使用的host网卡IP,请

Tool-Based Porting

-Ascend平台提供了脚本转换工具使用户能通过命令行方式将训练脚本迁移到昇腾AI处理器上进行训练,命令行方式工具详细使用说明参见下文。除命令行方式外,用户也可通过MindStudio中集成的PyTorch GPU2Ascend功能进行迁移,详情请参见《MindStudio 用户指南》。 +Ascend平台提供了脚本转换工具使用户能通过命令行方式将训练脚本迁移到昇腾AI处理器上进行训练,命令行方式工具详细使用说明参见下文。除命令行方式外,用户也可通过MindStudio中集成的PyTorch GPU2Ascend功能进行迁移,详情请参见《MindStudio 用户指南](https://support.huaweicloud.com/mindstudio302/)》。 - **[功能介绍](#功能介绍.md)** @@ -674,7 +674,7 @@ Ascend平台提供了脚本转换工具使用户能通过命令行方式将训 ## 环境准备 -详情请参考《CANN 软件安装指南》安装开发环境。 +详情请参考《CANN 软件安装指南](https://support.huawei.com/enterprise/zh/doc/EDOC1100206656?idPath=23710424%7C251366513%7C22892968%7C251168373)》安装开发环境。

Operation Guide

@@ -813,7 +813,8 @@ Ascend平台提供了脚本转换工具使用户能通过命令行方式将训

parent_module

-

Parent module name

+

Full name of the parent module

+

For example, for torch.cuda.amp, the full name of the parent module of amp is torch.cuda.

@@ -1110,7 +1111,7 @@ def main():
-For more APIs, see the PyTorch API Support List.
+For more APIs, see the [PyTorch API Support List](https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%20API%E6%94%AF%E6%8C%81%E6%B8%85%E5%8D%95/PyTorch%20API%E6%94%AF%E6%8C%81%E6%B8%85%E5%8D%95.md).

Mixed Precision
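As a hedged illustration of how the Apex mixed precision module described in this section is typically used (a minimal sketch based on the standard Apex amp API; the "npu:0" device string is an assumption based on the Ascend adapter):

```
import torch
from apex import amp

model = torch.nn.Linear(16, 4).to("npu:0")
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# opt_level="O1": Conv, Matmul, and similar ops run in float16,
# while ops such as Softmax and BN stay in float32 (see the mode table below).
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

inputs = torch.rand(8, 16).to("npu:0")
targets = torch.rand(8, 4).to("npu:0")

loss = torch.nn.functional.mse_loss(model(inputs), targets)
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()   # backward on the scaled loss to avoid float16 underflow
optimizer.step()
```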

@@ -1119,7 +1120,7 @@ def main(): 基于NPU芯片的架构特性,会涉及到混合精度训练,即混合使用float16和float32数据类型的应用场景。使用float16代替float32有如下好处: - 对于中间变量的内存占用更少,节省内存的使用。 -- 因内存使用会减少,所以数据传出的时间也会减半。 +- 因内存使用会减少,所以数据传出的时间也会相应减少。 - float16的计算单元可以提供更快的计算性能。 但是,混合精度训练受限于float16表达的精度范围,单纯将float32转换成float16会影响训练收敛情况,为了保证部分计算使用float16来进行加速的同时能保证训练收敛,这里采用混合精度模块Apex来达到以上效果。混合精度模块Apex是一个集优化性能、精度收敛于一身的综合优化库。 @@ -1144,7 +1145,7 @@ def main():

O1 configuration mode

-

Conv, Matmal, and similar operations are computed in float16; others such as Softmax and BN use float32

+

Conv, Matmul, and similar operations are computed in float16; others such as Softmax and BN use float32

O2 configuration mode

@@ -1322,7 +1323,6 @@ def main(): systemctl start cpupower ``` - 3. 设置CPU为performance模式。 ``` @@ -1434,7 +1434,7 @@ def main():

Prerequisites

 1. Rework the open-source code by referring to [Sample Description](#样例说明.md) so that the model runs properly, covering data preprocessing, forward computation, loss computation, mixed precision, backward computation, and parameter updates.
-2. In the model porting phase, focus first on whether the model can run end to end and whether the existing operators meet the requirements. If an operator is not supported, see the PyTorch Operator Development Guide to develop the operator adaptation.
+2. In the model porting phase, focus first on whether the model can run end to end and whether the existing operators meet the requirements. If an operator is not supported, see the [PyTorch Operator Development Guide](https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E7%AE%97%E5%AD%90%E5%BC%80%E5%8F%91%E6%8C%87%E5%8D%97/PyTorch%E7%AE%97%E5%AD%90%E5%BC%80%E5%8F%91%E6%8C%87%E5%8D%97.md) to develop the operator adaptation.
 3. Get single-device training working first, and then enable multi-device training.

Debugging Process

@@ -1536,7 +1536,7 @@ def main(): export ASCEND_SLOG_PRINT_TO_STDOUT=1 ``` -3. 设置日志级别为info,参考《CANN 日志参考》设置日志级别。 +3. 设置日志级别为info,参考《CANN 日志参考](https://support.huawei.com/enterprise/zh/doc/EDOC1100206691?idPath=23710424%7C251366513%7C22892968%7C251168373)》设置日志级别。 4. 执行训练脚本,进行模型训练,训练完成后获取host侧日志,默认位置为$HOME/ascend/log/plog目录下,$HOME表示Host侧用户根目录。 5. 解析host侧日志会在当前目录下得到OPInfo信息ascend\_op\_info\_summary.txt。 @@ -1675,7 +1675,6 @@ def main(): - 解决方案:改进算子精度或功能问题。 - 2. loss计算错误。 - 定位思路:由于Loss的特殊性和可以自定义,在判断Loss计算错误后建议dump网络中的loss的输入来测试而非随机同shape tensor,这样才能更好地复现证明。 @@ -1683,7 +1682,6 @@ def main(): - 解决方案:改进算子精度或功能问题(loss也是由算子构成)。 - 3. 参数更新错误。 - 定位思路:在每个optim.step\(\)前对网络中的参数逐个打印其grad进行排查判断是哪个地方有较大嫌疑,然后构建单算子用例逐渐缩小错误范围,证明该算子在当前网络场景下梯度计算有误,可以对比CPU或GPU结果证明。该项优先级应低于[1.](#li17755175510322)与[2.](#li25281726103316),因为1与2的错误同样可以造成grad异常。 @@ -1692,7 +1690,6 @@ def main(): - 解决方案:改进计算grad的算子精度或功能问题。 - 4. 多卡计算错误。 - 定位思路:在保证单卡精度OK的前提下,稳定复现多卡不收敛。 @@ -1778,7 +1775,7 @@ def main(): 如果想使用Auto Tune优化功能,请参考《CANN 开发辅助工具指南》手册中“Auto Tune工具使用指导”章节。 -离线推理应用构建请参考《CANN 应用软件开发指南\(C&C++, 推理\)》。整体流程如下: +离线推理应用构建请参考《CANN 应用软件开发指南\(C&C++, 推理\)](https://support.huawei.com/enterprise/zh/doc/EDOC1100206685?idPath=23710424%7C251366513%7C22892968%7C251168373)》。整体流程如下: ![](figures/zh-cn_image_0000001106176222.png) @@ -1802,7 +1799,7 @@ Pytorch在训练过程中,通常使用torch.save\(\)来保存Checkpoint文件 torch.save(net.state_dict(), PATH) ``` - 2. 加载模型以用于在线推理,示例如下,详情请参见《PyTorch在线推理指南》。 + 2. 加载模型以用于在线推理,示例如下,详情请参见《PyTorch在线推理指南](#https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E5%9C%A8%E7%BA%BF%E6%8E%A8%E7%90%86%E6%8C%87%E5%8D%97/PyTorch%E5%9C%A8%E7%BA%BF%E6%8E%A8%E7%90%86%E6%8C%87%E5%8D%97.md)》。 ``` # 模型文件保存路径 @@ -1813,7 +1810,6 @@ Pytorch在训练过程中,通常使用torch.save\(\)来保存Checkpoint文件 model.eval() ``` - >![](public_sys-resources/icon-notice.gif) **须知:** >保存.pth或.pt文件扩展名的文件时要提供模型定义文件,否则无法部署。 @@ -2061,10 +2057,10 @@ if __name__ == "__main__": # if not torch.cuda.is_available(): # print('using CPU, this will be slow') # elif args.distributed: - ############## npu modify begin ############# + ############## npu modify begin ############# # 迁移后为直接判断是否进行分布式训练,去掉判断是否在GPU上进行训练 if args.distributed: - ############## npu modify end ############# + ############## npu modify end ############# # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. @@ -2122,7 +2118,6 @@ if __name__ == "__main__": ############## npu modify end ############# ``` - - 代码位置:main.py文件中的validate\(\)函数(修改部分为字体加粗部分): ``` @@ -2142,7 +2137,6 @@ if __name__ == "__main__": ############## npu modify end ############# ``` - 6. 设置当前正在使用的device。 代码位置:main.py文件中的主函数入口(修改部分为字体加粗部分): @@ -2595,7 +2589,7 @@ python3.7 main.py /home/data/resnet50/imagenet --addr='1.1.1.1' \

Model Evaluation

-Model evaluation focuses on operator adaptation. Use the dump op method to obtain the operator information of the ShuffleNet network and compare it with the PyTorch Adapted Operator List. If an operator is currently unsupported, in simple scenarios it can be worked around either by temporarily substituting a similar operator or by running that operator on the CPU alone; unsupported operators in complex scenarios require operator development as described in the PyTorch Operator Development Guide.
+Model evaluation focuses on operator adaptation. Use the dump op method to obtain the operator information of the ShuffleNet network and compare it with the [PyTorch Adapted Operator List](https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E9%80%82%E9%85%8D%E7%AE%97%E5%AD%90%E6%B8%85%E5%8D%95/PyTorch%E9%80%82%E9%85%8D%E7%AE%97%E5%AD%90%E6%B8%85%E5%8D%95.md). If an operator is currently unsupported, in simple scenarios it can be worked around either by temporarily substituting a similar operator or by running that operator on the CPU alone; unsupported operators in complex scenarios require operator development as described in the [PyTorch Operator Development Guide](https://gitee.com/ascend/pytorch/blob/2.0.2.tr5/docs/zh/PyTorch%E7%AE%97%E5%AD%90%E5%BC%80%E5%8F%91%E6%8C%87%E5%8D%97/PyTorch%E7%AE%97%E5%AD%90%E5%BC%80%E5%8F%91%E6%8C%87%E5%8D%97.md).

Network Porting

@@ -3326,7 +3320,7 @@ torch.npu.finalize_dump() **export ASCEND\_SLOG\_PRINT\_TO\_STDOUT=1** -3. 设置日志级别,日志级别设置,信息从多到少分别是 debug --\> info --\> warning --\> error --\> null,一般设置为error,调试时使用info。请参考《CANN 日志参考》设置日志级别。 +3. 设置日志级别,日志级别设置,信息从多到少分别是 debug --\> info --\> warning --\> error --\> null,一般设置为error,调试时使用info。请参考《CANN 日志参考](https://support.huawei.com/enterprise/zh/doc/EDOC1100206691?idPath=23710424%7C251366513%7C22892968%7C251168373)》设置日志级别。 4. dump图,主要用于查看图结构。 **export DUMP\_GE\_GRAPH=2** @@ -3488,7 +3482,7 @@ pip3.7 install pillow==5.3.0安装失败。 - **[在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed.md)** -- **[在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”1](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed-1.md)** +- **[在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed-7.md)** - **[在模型运行时将多任务下发关闭\(export TASK\_QUEUE\_ENABLE=0\)后仍然遇到报错“HelpACLExecute.”](#在模型运行时将多任务下发关闭(export-TASK_QUEUE_ENABLE-0)后仍然遇到报错-HelpACLExecute.md)** @@ -3677,7 +3671,7 @@ shell报错是在同步操作中和AI CPU错误,而日志报错信息却是在 4. 打印stack所有参数的shape、dtype、npu\_format,通过构造单算子用例复现问题。定位到问题原因为减法计算输入参数数据类型不同,导致a-b和b-a结果的数据类型不一致,最终在stack算子中报错。 5. 将stack入参数据类型转换为一致即可临时规避问题。 -

Error "MemCopySync:drvMemcpy failed." Reported During Model Running 1

+

Error "MemCopySync:drvMemcpy failed." Reported During Model Running

## 现象描述 @@ -3884,7 +3878,7 @@ pytorch算子在npu上运行,通过ACL接口调用底层经过优化的算子 ## 现象描述 -![](figures/FAQ12.png) +![](figures/model_faq11_20210728.jpg) ## 可能原因 diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/model_faq11_20210728.jpg" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/model_faq11_20210728.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..ac24282446804eb5ee80070a09978910919d103a Binary files /dev/null and "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/model_faq11_20210728.jpg" differ diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/\347\216\257\345\242\203\345\207\206\345\244\207\346\265\201\347\250\213\345\233\276.png" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/\347\216\257\345\242\203\345\207\206\345\244\207\346\265\201\347\250\213\345\233\276.png" deleted file mode 100644 index cdda4fab2365a81d54807e9118cc617a25b8f4f2..0000000000000000000000000000000000000000 Binary files "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/\347\216\257\345\242\203\345\207\206\345\244\207\346\265\201\347\250\213\345\233\276.png" and /dev/null differ diff --git "a/docs/zh/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225.md" "b/docs/zh/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225.md" index 6b13eaeb75451e1ef70e93f46ff1ec2a4a031e07..1d286c4692957a56aa7dcb2ad01691b2cc95c76a 100644 --- "a/docs/zh/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225.md" +++ "b/docs/zh/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225/PyTorch\351\200\202\351\205\215\347\256\227\345\255\220\346\270\205\345\215\225.md" @@ -1,868 +1,6002 @@ -# **PyTorch适配算子清单** -- **[PyTorch原生API昇腾算子对应表](#PyTorch原生API昇腾算子对应表.md)** -- **[PyTorch昇腾自定义算子](#PyTorch昇腾自定义算子.md)** -

Mapping Between Native PyTorch APIs and Ascend Operators

- -| 序号 | PyTorch 原生算子 | 昇腾适配算子 | -| ---- | ------------------------------------------- | --------------------------------------------- | -| 1 | dropout | dropout_npu | -| 2 | dropout_ | dropout_npu_ | -| 3 | abs | abs_npu | -| 4 | abs_ | abs_npu_ | -| 5 | abs.out | abs_out_npu | -| 6 | acos | acos_npu | -| 7 | acos_ | acos_npu_ | -| 8 | acos.out | acos_out_npu | -| 9 | adaptive_avg_pool1d | adaptive_avg_pool1d_npu | -| 10 | add.Tensor | add_npu | -| 11 | add_.Tensor | add_npu_ | -| 12 | add.out | add_out_npu | -| 13 | add.Scalar | add_npu | -| 14 | add_.Scalar | add_npu_ | -| 15 | addmv | addmv_npu | -| 16 | addmv_ | addmv_npu_ | -| 17 | addmv.out | addmv_out_npu | -| 18 | addr | addr_npu | -| 19 | addr_ | addr_npu_ | -| 20 | addr.out | addr_out_npu | -| 21 | affine_grid_generator | affine_grid_generator_npu | -| 22 | affine_grid_generator_backward | affine_grid_generator_backward_npu | -| 23 | all.dim | all_npu | -| 24 | all.out | all_out_npu | -| 25 | any.dim | any_npu | -| 26 | any.out | any_out_npu | -| 27 | arange | arange_npu | -| 28 | arange.start | arange_npu | -| 29 | arange.start_step | arange_npu | -| 30 | arange.out | arange_out_npu | -| 31 | arange.start_out | arange_out_npu | -| 32 | _dim_arange | _dim_arange_npu | -| 33 | argmax | argmax_npu | -| 34 | argmin | argmin_npu | -| 35 | as_strided | as_strided_npu | -| 36 | as_strided_ | as_strided_npu_ | -| 37 | asin | asin_npu | -| 38 | asin_ | asin_npu_ | -| 39 | asin.out | asin_out_npu | -| 40 | atan | atan_npu | -| 41 | atan_ | atan_npu_ | -| 42 | atan.out | atan_out_npu | -| 43 | baddbmm | baddbmm_npu | -| 44 | baddbmm_ | baddbmm_npu_ | -| 45 | baddbmm.out | baddbmm_out_npu | -| 46 | bartlett_window | bartlett_window_npu | -| 47 | bartlett_window.periodic | bartlett_window_npu | -| 48 | batch_norm | batch_norm_npu_ | -| 49 | _batch_norm_impl_index | _batch_norm_impl_index_npu | -| 50 | _batch_norm_impl_index_backward | _batch_norm_impl_index_backward_npu | -| 51 | bernoulli | bernoulli_npu | -| 52 | bernoulli_.Tensor | bernoulli_npu_ | -| 53 | bernoulli_.float | bernoulli_npu_ | -| 54 | binary_cross_entropy | binary_cross_entropy_npu | -| 55 | binary_cross_entropy.out | binary_cross_entropy_out_npu | -| 56 | binary_cross_entropy_backward | binary_cross_entropy_backward_npu | -| 57 | binary_cross_entropy_backward.grad_input | binary_cross_entropy_backward_out_npu | -| 58 | binary_cross_entropy_with_logits | binary_cross_entropy_with_logits_npu | -| 59 | binary_cross_entropy_with_logits_backward | binary_cross_entropy_with_logits_backward_npu | -| 60 | bitwise_not | bitwise_not_npu | -| 61 | bitwise_not_ | bitwise_not_npu_ | -| 62 | bitwise_not.out | bitwise_not_out_npu | -| 63 | logical_not | logical_not_npu | -| 64 | logical_not_ | logical_not_npu_ | -| 65 | logical_not.out | logical_not_out_npu | -| 66 | logical_and | logical_and_npu | -| 67 | logical_and_ | logical_and_npu_ | -| 68 | logical_and.out | logical_and_out_npu | -| 69 | logical_or | logical_or_npu | -| 70 | logical_or_ | logical_or_npu_ | -| 71 | logical_or.out | logical_or_out_npu | -| 72 | blackman_window | blackman_window_npu | -| 73 | blackman_window.periodic | blackman_window_npu | -| 74 | bmm | bmm_npu | -| 75 | bmm.out | bmm_out_npu | -| 76 | cat | cat_npu | -| 77 | cat.out | cat_out_npu | -| 78 | cat.names | cat_npu | -| 79 | cat.names_out | cat_out_npu | -| 80 | ceil | ceil_npu | -| 81 | ceil_ | ceil_npu_ | -| 82 | ceil.out | ceil_out_npu | -| 83 | clamp | clamp_npu | -| 84 | clamp_ | clamp_npu_ | -| 85 | clamp.out | clamp_out_npu | -| 86 | clamp_max | 
clamp_max_npu | -| 87 | clamp_max_ | clamp_max_npu_ | -| 88 | clamp_max.out | clamp_max_out_npu | -| 89 | clamp_min | clamp_min_npu | -| 90 | clamp_min_ | clamp_min_npu_ | -| 91 | clamp_min.out | clamp_min_out_npu | -| 92 | constant_pad_nd | constant_pad_nd_npu | -| 93 | contiguous | contiguous_npu | -| 94 | convolution | convolution_npu | -| 95 | _convolution | _convolution_npu | -| 96 | _convolution_nogroup | _convolution_nogroup_npu | -| 97 | conv2d | conv2d_npu_ | -| 98 | conv3d | _conv3d_npu | -| 99 | conv_tbc | conv_tbc_npu | -| 100 | conv_tbc_backward | conv_tbc_backward_npu | -| 101 | conv_transpose2d.input | conv_transpose2d_npu_ | -| 102 | copy_ | copy_npu_ | -| 103 | cos | cos_npu | -| 104 | cos_ | cos_npu_ | -| 105 | cos.out | cos_out_npu | -| 106 | cosh | cosh_npu | -| 107 | cosh_ | cosh_npu_ | -| 108 | cosh.out | cosh_out_npu | -| 109 | cummin | cummin_npu | -| 110 | cummin.out | cummin_out_npu | -| 111 | cummin.dimname | cummin_npu | -| 112 | cummin.dimname_out | cummin_out_npu | -| 113 | cumprod | cumprod_npu | -| 114 | cumprod.out | cumprod_out_npu | -| 115 | cumprod.dimname | cumprod_npu | -| 116 | cumprod.dimname_out | cumprod_out_npu | -| 117 | ctc_loss.IntList | ctc_loss_npu | -| 118 | ctc_loss.Tensor | ctc_loss_npu | -| 119 | _ctc_loss | ctc_loss_npu | -| 120 | _ctc_loss_backward | ctc_loss_backward_npu | -| 121 | fill_diagonal_ | fill_diagonal_npu_ | -| 122 | div.Tensor | div_npu | -| 123 | div_.Tensor | div_npu_ | -| 124 | div.out | div_out_npu | -| 125 | div.Scalar | div_npu | -| 126 | div_.Scalar | div_npu_ | -| 127 | dot | dot_npu | -| 128 | dot.out | dot_out_npu | -| 129 | embedding | embedding_npu | -| 130 | embedding_backward | embedding_backward_npu | -| 131 | embedding_dense_backward | embedding_dense_backward_npu | -| 132 | embedding_renorm_ | embedding_renorm_npu_ | -| 133 | _embedding_bag | _embedding_bag_npu | -| 134 | empty.memory_format | empty_npu | -| 135 | resize_ | resize_npu_ | -| 136 | empty_like | empty_like_npu | -| 137 | empty_strided | empty_strided_npu | -| 138 | erf | erf_npu | -| 139 | erf_ | erf_npu_ | -| 140 | erf.out | erf_out_npu | -| 141 | exp | exp_npu | -| 142 | exp_ | exp_npu_ | -| 143 | exp.out | exp_out_npu | -| 144 | expm1 | expm1_npu | -| 145 | expm1_ | expm1_npu_ | -| 146 | expm1.out | expm1_out_npu | -| 147 | eye | eye_npu | -| 148 | eye.m | eye_npu | -| 149 | eye.out | eye_out_npu | -| 150 | eye.m_out | eye_out_npu | -| 151 | fill_.Scalar | fill_npu_ | -| 152 | fill_.Tensor | fill_npu_ | -| 153 | floor | floor_npu | -| 154 | floor_ | floor_npu_ | -| 155 | floor.out | floor_out_npu | -| 156 | floor_divide | floor_divide_npu | -| 157 | floor_divide_.Tensor | floor_divide_npu_ | -| 158 | floor_divide.out | floor_divide_out_npu | -| 159 | floor_divide.Scalar | floor_divide_npu | -| 160 | floor_divide_.Scalar | floor_divide_npu_ | -| 161 | frac | frac_npu | -| 162 | frac_ | frac_npu_ | -| 163 | frac.out | frac_out_npu | -| 164 | full.names | full_npu | -| 165 | full | full_npu | -| 166 | full.out | full_out_npu | -| 167 | grid_sampler | grid_sampler_npu | -| 168 | grid_sampler_3d | grid_sampler_3d_npu | -| 169 | grid_sampler_3d_backward | grid_sampler_3d_backward_npu | -| 170 | hann_window | hann_window_npu | -| 171 | hann_window.periodic | hann_window_npu | -| 172 | hamming_window | hamming_window_npu | -| 173 | hamming_window.periodic | hamming_window_npu | -| 174 | hamming_window.periodic_alpha | hamming_window_npu | -| 175 | hamming_window.periodic_alpha_beta | hamming_window_npu | -| 176 | ger | ger_npu | -| 177 | ger.out | 
ger_out_npu | -| 178 | index.Tensor | index_npu | -| 179 | index_put_ | index_put_npu_ | -| 180 | index_put | index_put_npu | -| 181 | _index_put_impl_ | _index_put_impl_npu_ | -| 182 | inverse | inverse_npu | -| 183 | inverse.out | inverse_out_npu | -| 184 | isclose | isclose_npu | -| 185 | isnan | isnan_npu | -| 186 | is_nonzero | is_nonzero_npu | -| 187 | kl_div | kl_div_npu | -| 188 | kl_div_backward | kl_div_backward_npu | -| 189 | kthvalue | kthvalue_npu | -| 190 | kthvalue.values | kthvalue_out_npu | -| 191 | kthvalue.dimname | kthvalue_npu | -| 192 | kthvalue.dimname_out | kthvalue_out_npu | -| 193 | native_layer_norm | layer_norm_npu | -| 194 | native_layer_norm_backward | layer_norm_backward_npu | -| 195 | linspace | linspace_npu | -| 196 | linspace.out | linspace_out_npu | -| 197 | log | log_npu | -| 198 | log_ | log_npu_ | -| 199 | log.out | log_out_npu | -| 200 | log10 | log10_npu | -| 201 | log10_ | log10_npu_ | -| 202 | log10.out | log10_out_npu | -| 203 | log1p | log1p_npu | -| 204 | log1p_ | log1p_npu_ | -| 205 | log1p.out | log1p_out_npu | -| 206 | log2 | log2_npu | -| 207 | log2_ | log2_npu_ | -| 208 | log2.out | log2_out_npu | -| 209 | logspace | logspace_npu | -| 210 | logspace.out | logspace_out_npu | -| 211 | log_softmax.int | log_softmax_npu | -| 212 | log_softmax.Dimname | log_softmax_npu | -| 213 | _log_softmax | _log_softmax_npu | -| 214 | _log_softmax_backward_data | _log_softmax_backward_npu | -| 215 | logsumexp | logsumexp_npu | -| 216 | logsumexp.out | logsumexp_out_npu | -| 217 | logsumexp.names | logsumexp_npu | -| 218 | logsumexp.names_out | logsumexp_out_npu | -| 219 | matmul | matmul_npu | -| 220 | matmul.out | matmul_out_npu | -| 221 | matrix_power | matrix_power_npu | -| 222 | max.dim | max_npu | -| 223 | max.dim_max | max_out_npu | -| 224 | max_values | max_npu | -| 225 | max.names_dim | max_npu | -| 226 | max.names_dim_max | max_out_npu | -| 227 | max_values.names | max_npu | -| 228 | max_pool2d | max_pool2d_npu | -| 229 | quantized_max_pool2d | quantized_max_pool2d_npu | -| 230 | mean | mean_npu | -| 231 | mean.dim | mean_npu | -| 232 | mean.out | mean_out_npu | -| 233 | mean.names_dim | mean_npu | -| 234 | mean.names_out | mean_out_npu | -| 235 | median.dim | median_npu | -| 236 | median.dim_values | median_out_npu | -| 237 | median.names_dim | median_npu | -| 238 | median.names_dim_values | median_out_npu | -| 239 | min.dim | min_npu | -| 240 | min.dim_min | min_out_npu | -| 241 | min_values | min_npu | -| 242 | min.names_dim | min_npu | -| 243 | min.names_dim_min | min_out_npu | -| 244 | min_values.names | min_npu | -| 245 | mm | mm_npu | -| 246 | mm.out | mm_out_npu | -| 247 | mode | mode_npu | -| 248 | mode.values | mode_out_npu | -| 249 | mul.Tensor | mul_npu | -| 250 | mul_.Tensor | mul_npu_ | -| 251 | mul.out | mul_out_npu | -| 252 | mul.Scalar | mul_npu | -| 253 | mul_.Scalar | mul_npu_ | -| 254 | mv | mv_npu | -| 255 | mv.out | mv_out_npu | -| 256 | narrow_copy | narrow_copy_npu | -| 257 | native_batch_norm | batch_norm_npu | -| 258 | native_batch_norm_backward | batch_norm_backward_npu | -| 259 | _nnpack_spatial_convolution | _nnpack_spatial_convolution_npu | -| 260 | ones.names | ones_npu | -| 261 | ones | ones_npu | -| 262 | ones.out | ones_out_npu | -| 263 | ones_like | ones_like_npu | -| 264 | cdist | cdist_npu | -| 265 | _cdist_forward | _cdist_forward_npu | -| 266 | _cdist_backward | _cdist_backward_npu | -| 267 | pdist | pdist_npu | -| 268 | _pdist_forward | _pdist_forward_npu | -| 269 | randperm | randperm_npu | -| 270 | 
randperm.generator | randperm_npu | -| 271 | randperm.out | randperm_out_npu | -| 272 | randperm.generator_out | randperm_out_npu | -| 273 | range.step | range_npu | -| 274 | range | range_npu | -| 275 | range.out | range_out_npu | -| 276 | reciprocal | reciprocal_npu | -| 277 | reciprocal_ | reciprocal_npu_ | -| 278 | reciprocal.out | reciprocal_out_npu | -| 279 | neg | neg_npu | -| 280 | neg_ | neg_npu_ | -| 281 | neg.out | neg_out_npu | -| 282 | repeat | repeat_npu | -| 283 | repeat_interleave.self_int | repeat_interleave_npu | -| 284 | round | round_npu | -| 285 | round_ | round_npu_ | -| 286 | round.out | round_out_npu | -| 287 | relu | relu_npu | -| 288 | relu_ | relu_npu_ | -| 289 | prelu | prelu_npu | -| 290 | prelu_backward | prelu_backward_npu | -| 291 | gelu | gelu_npu | -| 292 | gelu_backward | gelu_backward_npu | -| 293 | hardshrink | hardshrink_npu | -| 294 | hardshrink_backward | hardshrink_backward_npu | -| 295 | rsqrt | rsqrt_npu | -| 296 | rsqrt_ | rsqrt_npu_ | -| 297 | rsqrt.out | rsqrt_out_npu | -| 298 | selu | selu_npu | -| 299 | selu_ | selu_npu_ | -| 300 | celu | celu_npu | -| 301 | celu_ | celu_npu_ | -| 302 | sigmoid | sigmoid_npu | -| 303 | sigmoid_ | sigmoid_npu_ | -| 304 | sigmoid.out | sigmoid_out_npu | -| 305 | sin | sin_npu | -| 306 | sin_ | sin_npu_ | -| 307 | sin.out | sin_out_npu | -| 308 | sinh | sinh_npu | -| 309 | sinh_ | sinh_npu_ | -| 310 | sinh.out | sinh_out_npu | -| 311 | slogdet | slogdet_npu | -| 312 | softmax.int | softmax_npu | -| 313 | softmax.Dimname | softmax_npu | -| 314 | _softmax | _softmax_npu | -| 315 | _softmax_backward_data | _softmax_backward_npu | -| 316 | stack | stack_npu | -| 317 | stack.out | stack_out_npu | -| 318 | sum | sum_npu | -| 319 | sum.dim_IntList | sum_npu | -| 320 | sum.dim_DimnameList | sum_npu | -| 321 | sum.IntList_out | sum_out_npu | -| 322 | sum.DimnameList_out | sum_out_npu | -| 323 | sqrt | sqrt_npu | -| 324 | sqrt_ | sqrt_npu_ | -| 325 | sqrt.out | sqrt_out_npu | -| 326 | std | std_npu | -| 327 | std.dim | std_dim_npu | -| 328 | std_mean | std_mean_npu | -| 329 | std_mean.dim | std_mean_dim_npu | -| 330 | std_mean.names_dim | std_mean_names_npu | -| 331 | std.out | std_out_npu | -| 332 | std.names_dim | std_names_npu | -| 333 | std.names_out | std_out_npu | -| 334 | prod | prod_npu | -| 335 | prod.dim_int | prod_npu | -| 336 | prod.int_out | prod_out_npu | -| 337 | prod.dim_Dimname | prod_npu | -| 338 | prod.Dimname_out | prod_out_npu | -| 339 | tan | tan_npu | -| 340 | tan_ | tan_npu_ | -| 341 | tan.out | tan_out_npu | -| 342 | tanh | tanh_npu | -| 343 | tanh_ | tanh_npu_ | -| 344 | tanh.out | tanh_out_npu | -| 345 | threshold | threshold_npu | -| 346 | threshold_ | threshold_npu_ | -| 347 | threshold.out | threshold_out_npu | -| 348 | threshold_backward | threshold_backward_npu | -| 349 | one_hot | one_hot_npu1 | -| 350 | flip | flip_npu | -| 351 | roll | roll_npu | -| 352 | true_divide.Tensor | true_divide_npu | -| 353 | true_divide_.Tensor | true_divide_npu_ | -| 354 | true_divide.out | true_divide_out_npu | -| 355 | true_divide.Scalar | true_divide_npu | -| 356 | true_divide_.Scalar | true_divide_npu_ | -| 357 | trunc | trunc_npu | -| 358 | trunc_ | trunc_npu_ | -| 359 | trunc.out | trunc_out_npu | -| 360 | _unique2 | _unique2_npu | -| 361 | var | var_npu | -| 362 | var.dim | var_npu | -| 363 | var.out | var_out_npu | -| 364 | var.names_dim | var_npu | -| 365 | var.names_out | var_out_npu | -| 366 | var_mean | var_mean_npu | -| 367 | var_mean.dim | var_mean_npu | -| 368 | var_mean.names_dim | 
var_mean_npu | -| 369 | where.self | where_npu | -| 370 | where | where_npu | -| 371 | _s_where | _s_where_npu | -| 372 | zeros.names | zeros_npu | -| 373 | zeros | zeros_npu | -| 374 | zeros.out | zeros_out_npu | -| 375 | zeros_like | zeros_like_npu | -| 376 | norm.ScalarOpt_dtype | norm_npu | -| 377 | norm.Scalar | norm_npu | -| 378 | norm.ScalarOpt_dim_dtype | norm_npu | -| 379 | norm.ScalarOpt_dim | norm_npu | -| 380 | norm.dtype_out | norm_out_npu | -| 381 | norm.out | norm_out_npu | -| 382 | clone | clone_npu | -| 383 | resize_as_ | resize_as_npu_ | -| 384 | pow.Tensor_Scalar_out | pow_out_npu | -| 385 | pow.Tensor_Scalar | pow_npu | -| 386 | zero_ | zero_npu_ | -| 387 | sub.out | sub_out_npu | -| 388 | sub.Tensor | sub_npu | -| 389 | sub_.Tensor | sub_npu_ | -| 390 | sub.Scalar | sub_npu | -| 391 | sub_.Scalar | sub_npu_ | -| 392 | rsub.Tensor | rsub_npu | -| 393 | rsub.Scalar | rsub_npu | -| 394 | addmm.out | addmm_out_npu | -| 395 | addmm | addmm_npu | -| 396 | addmm_ | addmm_npu_ | -| 397 | quantize_per_tensor | quantize_per_tensor_npu | -| 398 | quantize_per_channel | quantize_per_channel_npu | -| 399 | to.dtype_layout | to_npu | -| 400 | to.device | to_device_npu | -| 401 | to.dtype | to_dtype_npu | -| 402 | to.other | to_other_npu | -| 403 | _local_scalar_dense | _local_scalar_dense_npu | -| 404 | lstm.input | lstm_npu | -| 405 | lstm.data | lstm_npu | -| 406 | gru.input | gru_npu_ | -| 407 | _pack_padded_sequence | _pack_padded_sequence_npu | -| 408 | _pad_packed_sequence | _pad_packed_sequence_npu | -| 409 | set_.source_Storage | set_npu_ | -| 410 | set_.source_Storage_storage_offset | set_npu_ | -| 411 | set_.source_Tensor | set_npu_ | -| 412 | set_ | set_npu_ | -| 413 | masked_fill_.Scalar | masked_fill_npu_ | -| 414 | masked_fill_.Tensor | masked_fill_npu_ | -| 415 | masked_scatter_ | masked_scatter_npu_ | -| 416 | view | view_npu | -| 417 | put_ | put_npu_ | -| 418 | index_add_ | index_add_npu_ | -| 419 | index_add | index_add_npu | -| 420 | index_add.dimname | index_add_npu | -| 421 | index_fill_.int_Scalar | index_fill_npu_ | -| 422 | index_fill.int_Scalar | index_fill_npu | -| 423 | index_fill_.int_Tensor | index_fill_npu_ | -| 424 | index_fill.int_Tensor | index_fill_npu | -| 425 | scatter_.src | scatter_npu_ | -| 426 | scatter_.value | scatter_npu_ | -| 427 | scatter_add_ | scatter_add_npu_ | -| 428 | scatter_add | scatter_add_npu | -| 429 | scatter_add.dimname | scatter_add_npu | -| 430 | lt_.Scalar | lt_npu_ | -| 431 | lt_.Tensor | lt_npu_ | -| 432 | gt_.Scalar | gt_npu_ | -| 433 | gt_.Tensor | gt_npu_ | -| 434 | le_.Scalar | le_npu_ | -| 435 | le_.Tensor | le_npu_ | -| 436 | ge_.Scalar | ge_npu_ | -| 437 | ge_.Tensor | ge_npu_ | -| 438 | eq_.Scalar | eq_npu_ | -| 439 | eq_.Tensor | eq_npu_ | -| 440 | ne_.Scalar | ne_npu_ | -| 441 | ne_.Tensor | ne_npu_ | -| 442 | bitwise_and.Tensor_out | bitwise_and_out_npu | -| 443 | bitwise_and.Scalar_out | bitwise_and_out_npu | -| 444 | bitwise_and.Scalar | bitwise_and_npu | -| 445 | bitwise_and.Tensor | bitwise_and_npu | -| 446 | bitwise_and_.Scalar | bitwise_and_npu_ | -| 447 | bitwise_and_.Tensor | bitwise_and_npu_ | -| 448 | and.Scalar | and_npu | -| 449 | and.Tensor | and_npu | -| 450 | bitwise_or.Tensor_out | bitwise_or_out_npu | -| 451 | bitwise_or.Scalar_out | bitwise_or_out_npu | -| 452 | bitwise_or.Scalar | bitwise_or_npu | -| 453 | bitwise_or.Tensor | bitwise_or_npu | -| 454 | bitwise_or_.Scalar | bitwise_or_npu_ | -| 455 | bitwise_or_.Tensor | bitwise_or_npu_ | -| 456 | or.Scalar | or_npu | -| 457 | or.Tensor | 
or_npu | -| 458 | ior.Scalar | ior_npu | -| 459 | ior.Tensor | ior_npu | -| 460 | bitwise_xor.Tensor_out | bitwise_xor_out_npu | -| 461 | bitwise_xor.Scalar_out | bitwise_xor_out_npu | -| 462 | bitwise_xor.Scalar | bitwise_xor_npu | -| 463 | bitwise_xor.Tensor | bitwise_xor_npu | -| 464 | bitwise_xor_.Scalar | bitwise_xor_npu_ | -| 465 | bitwise_xor_.Tensor | bitwise_xor_npu_ | -| 466 | xor.Scalar | xor_npu | -| 467 | xor.Tensor | xor_npu | -| 468 | atan2_ | atan2_npu_ | -| 469 | tril_ | tril_npu_ | -| 470 | triu_ | triu_npu_ | -| 471 | renorm_ | renorm_npu_ | -| 472 | pow_.Scalar | pow_npu_ | -| 473 | pow_.Tensor | pow_npu_ | -| 474 | lerp_.Scalar | lerp_npu_ | -| 475 | lerp_.Tensor | lerp_npu_ | -| 476 | fmod_.Scalar | fmod_npu_ | -| 477 | fmod_.Tensor | fmod_npu_ | -| 478 | remainder_.Scalar | remainder_npu_ | -| 479 | remainder_.Tensor | remainder_npu_ | -| 480 | addbmm_ | addbmm_npu_ | -| 481 | addbmm.out | addbmm_out_npu | -| 482 | addbmm | addbmm_npu | -| 483 | addcdiv_ | addcdiv_npu_ | -| 484 | random_.from | random_npu_ | -| 485 | random_.to | random_npu_ | -| 486 | random_ | random_npu_ | -| 487 | uniform_ | uniform_npu_ | -| 488 | diag.out | diag_out_npu | -| 489 | diag | diag_npu | -| 490 | cross.out | cross_out_npu | -| 491 | cross | cross_npu | -| 492 | triu.out | triu_out_npu | -| 493 | triu | triu_npu | -| 494 | tril.out | tril_out_npu | -| 495 | tril | tril_npu | -| 496 | ne.Scalar_out | ne_out_npu | -| 497 | ne.Scalar | ne_npu | -| 498 | ne.Tensor_out | ne_out_npu | -| 499 | ne.Tensor | ne_npu | -| 500 | eq.Scalar_out | eq_out_npu | -| 501 | eq.Scalar | eq_npu | -| 502 | eq.Tensor_out | eq_out_npu | -| 503 | eq.Tensor | eq_npu | -| 504 | ge.Scalar_out | ge_out_npu | -| 505 | ge.Scalar | ge_npu | -| 506 | ge.Tensor_out | ge_out_npu | -| 507 | ge.Tensor | ge_npu | -| 508 | le.Scalar_out | le_out_npu | -| 509 | le.Scalar | le_npu | -| 510 | le.Tensor_out | le_out_npu | -| 511 | le.Tensor | le_npu | -| 512 | gt.Scalar_out | gt_out_npu | -| 513 | gt.Scalar | gt_npu | -| 514 | gt.Tensor_out | gt_out_npu | -| 515 | gt.Tensor | gt_npu | -| 516 | lt.Scalar_out | lt_out_npu | -| 517 | lt.Scalar | lt_npu | -| 518 | lt.Tensor_out | lt_out_npu | -| 519 | lt.Tensor | lt_npu | -| 520 | take.out | take_out_npu | -| 521 | take | take_npu | -| 522 | index_select.out | index_select_out_npu | -| 523 | index_select | index_select_npu | -| 524 | index_select.dimname_out | index_select_out_npu | -| 525 | index_select.dimname | index_select_npu | -| 526 | masked_select.out | masked_select_out_npu | -| 527 | masked_select | masked_select_npu | -| 528 | nonzero.out | nonzero_out_npu | -| 529 | nonzero | nonzero_npu | -| 530 | gather.out | gather_out_npu | -| 531 | gather | gather_npu | -| 532 | gather.dimname_out | gather_out_npu | -| 533 | gather.dimname | gather_npu | -| 534 | addcmul.out | addcmul_out_npu | -| 535 | addcmul | addcmul_npu | -| 536 | addcmul_ | addcmul_npu_ | -| 537 | addcdiv.out | addcdiv_out_npu | -| 538 | addcdiv | addcdiv_npu | -| 539 | qr.Q | qr_out_npu | -| 540 | qr | qr_npu | -| 541 | multinomial.out | multinomial_out_npu | -| 542 | multinomial | multinomial_npu | -| 543 | erfinv | erfinv_npu | -| 544 | erfinv_ | erfinv_npu_ | -| 545 | erfinv.out | erfinv_out_npu | -| 546 | sign | sign_npu | -| 547 | sign_ | sign_npu_ | -| 548 | sign.out | sign_out_npu | -| 549 | atan2.out | atan2_out_npu | -| 550 | atan2 | atan2_npu | -| 551 | lerp.Scalar_out | lerp_out_npu | -| 552 | lerp.Tensor_out | lerp_out_npu | -| 553 | lerp.Scalar | lerp_npu | -| 554 | lerp.Tensor | lerp_npu | -| 
555 | histc.out | histc_out_npu | -| 556 | histc | histc_npu | -| 557 | fmod.Scalar_out | fmod_out_npu | -| 558 | fmod.Scalar | fmod_npu | -| 559 | fmod.Tensor_out | fmod_out_npu | -| 560 | fmod.Tensor | fmod_npu | -| 561 | remainder.Scalar_out | remainder_out_npu | -| 562 | remainder.Scalar | remainder_npu | -| 563 | remainder.Tensor_out | remainder_out_npu | -| 564 | remainder.Tensor | remainder_npu | -| 565 | min.out | min_out_npu | -| 566 | min.other | min_npu | -| 567 | min | min_npu | -| 568 | max.out | max_out_npu | -| 569 | max.other | max_npu | -| 570 | max | max_npu | -| 571 | median | median_npu | -| 572 | sort.values | sort_out_npu | -| 573 | sort | sort_npu | -| 574 | sort.dimname_values | sort_out_npu | -| 575 | sort.dimname | sort_npu | -| 576 | argsort | argsort_npu | -| 577 | argsort.dimname | argsort_npu | -| 578 | topk.values | topk_out_npu | -| 579 | topk | topk_npu | -| 580 | all | all_npu | -| 581 | any | any_npu | -| 582 | renorm.out | renorm_out_npu | -| 583 | renorm | renorm_npu | -| 584 | unfold | unfold | -| 585 | equal | equal_npu | -| 586 | pow.Tensor_Tensor_out | pow_out_npu | -| 587 | pow.Tensor_Tensor | pow_npu | -| 588 | pow.Scalar_out | pow_out_npu | -| 589 | pow.Scalar | pow_npu | -| 590 | normal_ | normal_npu_ | -| 591 | normal.Tensor_float_out | normal_out_npu | -| 592 | normal.Tensor_float | normal_npu | -| 593 | normal.float_Tensor_out | normal_out_npu | -| 594 | normal.float_Tensor | normal_npu | -| 595 | normal.Tensor_Tensor_out | normal_out_npu | -| 596 | normal.Tensor_Tensor | normal_npu | -| 597 | normal.float_float | normal_npu | -| 598 | normal.float_float_out | normal_out_npu | -| 599 | _addr | _addr_npu | -| 600 | _addr_ | _addr_npu_ | -| 601 | _addr.out | _addr_out_npu | -| 602 | _cumsum | _cumsum_npu | -| 603 | _cumsum.out | _cumsum_out_npu | -| 604 | _cumprod | _cumprod_npu | -| 605 | _cumprod.out | _cumprod_out_npu | -| 606 | _var | _var_npu | -| 607 | _amp_non_finite_check_and_unscale_ | _amp_non_finite_check_and_unscale_npu_ | -| 608 | _cat | _cat_npu | -| 609 | _cat.out | _cat_out_npu | -| 610 | _max | _max_npu | -| 611 | _max.max | _max_out_npu | -| 612 | _min | _min_npu | -| 613 | _min.min | _min_out_npu | -| 614 | mse_loss.out | mse_loss_out_npu | -| 615 | mse_loss | mse_loss_npu | -| 616 | mse_loss_backward.grad_input | mse_loss_backward_out_npu | -| 617 | mse_loss_backward | mse_loss_backward_npu | -| 618 | l1_loss.out | l1_loss_out_npu | -| 619 | l1_loss | l1_loss_npu | -| 620 | l1_loss_backward.grad_input | l1_loss_backward_out_npu | -| 621 | l1_loss_backward | l1_loss_backward_npu | -| 622 | multilabel_margin_loss.out | multilabel_margin_loss_out_npu | -| 623 | multilabel_margin_loss | multilabel_margin_loss_npu | -| 624 | multilabel_margin_loss_forward.output | multilabel_margin_loss_forward_out_npu | -| 625 | multilabel_margin_loss_forward | multilabel_margin_loss_forward_npu | -| 626 | nll_loss.out | nll_loss_out_npu | -| 627 | nll_loss | nll_loss_npu | -| 628 | nll_loss_forward.output | nll_loss_forward_out_npu | -| 629 | nll_loss_forward | nll_loss_forward_npu | -| 630 | nll_loss_backward.grad_input | nll_loss_backward_out_npu | -| 631 | nll_loss_backward | nll_loss_backward_npu | -| 632 | nll_loss2d.out | nll_loss2d_out_npu | -| 633 | nll_loss2d | nll_loss2d_npu | -| 634 | nll_loss2d_forward.output | nll_loss2d_forward_out_npu | -| 635 | nll_loss2d_forward | nll_loss2d_forward_npu | -| 636 | nll_loss2d_backward.grad_input | nll_loss2d_backward_out_npu | -| 637 | nll_loss2d_backward | nll_loss2d_backward_npu | -| 638 | 
smooth_l1_loss.out | smooth_l1_loss_out_npu | -| 639 | smooth_l1_loss | smooth_l1_loss_npu | -| 640 | smooth_l1_loss_backward.grad_input | smooth_l1_loss_backward_out_npu | -| 641 | smooth_l1_loss_backward | smooth_l1_loss_backward_npu | -| 642 | soft_margin_loss.out | soft_margin_loss_out_npu | -| 643 | soft_margin_loss | soft_margin_loss_npu | -| 644 | soft_margin_loss_backward.grad_input | soft_margin_loss_backward_out_npu | -| 645 | soft_margin_loss_backward | soft_margin_loss_backward_npu | -| 646 | elu.out | elu_out_npu | -| 647 | elu | elu_npu | -| 648 | elu_backward.grad_input | elu_backward_out_npu | -| 649 | elu_backward | elu_backward_npu | -| 650 | elu_ | elu_npu_ | -| 651 | glu.out | glu_out_npu | -| 652 | glu | glu_npu | -| 653 | glu_backward.grad_input | glu_backward_out_npu | -| 654 | glu_backward | glu_backward_npu | -| 655 | hardsigmoid.out | hardsigmoid_out_npu | -| 656 | hardsigmoid | hardsigmoid_npu | -| 657 | hardsigmoid_ | hardsigmoid_npu_ | -| 658 | hardsigmoid_backward | hardsigmoid_backward_npu | -| 659 | hardtanh.out | hardtanh_out_npu | -| 660 | hardtanh | hardtanh_npu | -| 661 | hardtanh_backward.grad_input | hardtanh_backward_out_npu | -| 662 | hardtanh_backward | hardtanh_backward_npu | -| 663 | hardtanh_ | hardtanh_npu_ | -| 664 | leaky_relu.out | leaky_relu_out_npu | -| 665 | leaky_relu | leaky_relu_npu | -| 666 | leaky_relu_backward | leaky_relu_backward_npu | -| 667 | leaky_relu_ | leaky_relu_npu_ | -| 668 | log_sigmoid.out | log_sigmoid_out_npu | -| 669 | log_sigmoid | log_sigmoid_npu | -| 670 | log_sigmoid_forward.output | log_sigmoid_forward_out_npu | -| 671 | log_sigmoid_forward | log_sigmoid_forward_npu | -| 672 | log_sigmoid_backward.grad_input | log_sigmoid_backward_out_npu | -| 673 | log_sigmoid_backward | log_sigmoid_backward_npu | -| 674 | softplus.out | softplus_out_npu | -| 675 | softplus | softplus_npu | -| 676 | softplus_backward.grad_input | softplus_backward_out_npu | -| 677 | softplus_backward | softplus_backward_npu | -| 678 | softshrink.out | softshrink_out_npu | -| 679 | softshrink | softshrink_npu | -| 680 | softshrink_backward.grad_input | softshrink_backward_out_npu | -| 681 | softshrink_backward | softshrink_backward_npu | -| 682 | adaptive_avg_pool2d.out | adaptive_avg_pool2d_out_npu | -| 683 | adaptive_avg_pool2d | adaptive_avg_pool2d_npu | -| 684 | _adaptive_avg_pool2d | _adaptive_avg_pool2d_npu | -| 685 | _adaptive_avg_pool2d_backward | adaptive_avg_pool2d_backward_npu | -| 686 | adaptive_avg_pool3d.out | adaptive_avg_pool3d_out_npu | -| 687 | adaptive_avg_pool3d | adaptive_avg_pool3d_npu | -| 688 | adaptive_avg_pool3d_backward.grad_input | adaptive_avg_pool3d_backward_out_npu | -| 689 | adaptive_avg_pool3d_backward | adaptive_avg_pool3d_backward_npu | -| 690 | adaptive_max_pool2d.out | adaptive_max_pool2d_out_npu | -| 691 | adaptive_max_pool2d | adaptive_max_pool2d_npu | -| 692 | adaptive_max_pool2d_backward.grad_input | adaptive_max_pool2d_backward_out_npu | -| 693 | adaptive_max_pool2d_backward | adaptive_max_pool2d_backward_npu | -| 694 | avg_pool2d.out | avg_pool2d_out_npu | -| 695 | avg_pool2d | avg_pool2d_npu | -| 696 | avg_pool2d_backward.grad_input | avg_pool2d_backward_out_npu | -| 697 | avg_pool2d_backward | avg_pool2d_backward_npu | -| 698 | avg_pool3d.out | avg_pool3d_out_npu | -| 699 | avg_pool3d | avg_pool3d_npu | -| 700 | avg_pool3d_backward.grad_input | avg_pool3d_backward_out_npu | -| 701 | avg_pool3d_backward | avg_pool3d_backward_npu | -| 702 | max_pool2d_with_indices.out | max_pool2d_with_indices_out_npu | 
-| 703 | max_pool2d_with_indices | max_pool2d_with_indices_npu | -| 704 | max_pool2d_with_indices_backward.grad_input | max_pool2d_with_indices_backward_out_npu | -| 705 | max_pool2d_with_indices_backward | max_pool2d_with_indices_backward_npu | -| 706 | max_pool3d_with_indices.out | max_pool3d_with_indices_out_npu | -| 707 | max_pool3d_with_indices | max_pool3d_with_indices_npu | -| 708 | max_pool3d_with_indices_backward.grad_input | max_pool3d_with_indices_backward_out_npu | -| 709 | max_pool3d_with_indices_backward | max_pool3d_with_indices_backward_npu | -| 710 | reflection_pad2d.out | reflection_pad2d_out_npu | -| 711 | reflection_pad2d | reflection_pad2d_npu | -| 712 | replication_pad2d.out | replication_pad2d_out_npu | -| 713 | replication_pad2d | replication_pad2d_npu | -| 714 | upsample_linear1d.out | upsample_linear1d_out_npu | -| 715 | upsample_linear1d | upsample_linear1d_npu | -| 716 | upsample_linear1d_backward | upsample_linear1d_backward_npu | -| 717 | upsample_bilinear2d.out | upsample_bilinear2d_out_npu | -| 718 | upsample_bilinear2d | upsample_bilinear2d_npu | -| 719 | upsample_bilinear2d_backward.grad_input | upsample_bilinear2d_backward_out_npu | -| 720 | upsample_bilinear2d_backward | upsample_bilinear2d_backward_npu | -| 721 | upsample_bicubic2d.out | upsample_bicubic2d_out_npu | -| 722 | upsample_bicubic2d | upsample_bicubic2d_npu | -| 723 | upsample_bicubic2d_backward.grad_input | upsample_bicubic2d_backward_out_npu | -| 724 | upsample_bicubic2d_backward | upsample_bicubic2d_backward_npu | -| 725 | upsample_trilinear3d.out | upsample_trilinear3d_out_npu | -| 726 | upsample_trilinear3d | upsample_trilinear3d_npu | -| 727 | upsample_trilinear3d_backward.grad_input | upsample_trilinear3d_backward_out_npu | -| 728 | upsample_trilinear3d_backward | upsample_trilinear3d_backward_npu | -| 729 | upsample_nearest1d.out | upsample_nearest1d_out_npu | -| 730 | upsample_nearest1d | upsample_nearest1d_npu | -| 731 | upsample_nearest1d_backward.grad_input | upsample_nearest1d_backward_out_npu | -| 732 | upsample_nearest1d_backward | upsample_nearest1d_backward_npu | -| 733 | upsample_nearest2d.out | upsample_nearest2d_out_npu | -| 734 | upsample_nearest2d | upsample_nearest2d_npu | -| 735 | upsample_nearest2d_backward.grad_input | upsample_nearest2d_backward_out_npu | -| 736 | upsample_nearest2d_backward | upsample_nearest2d_backward_npu | -| 737 | upsample_nearest3d.out | upsample_nearest3d_out_npu | -| 738 | upsample_nearest3d | upsample_nearest3d_npu | -| 739 | upsample_nearest3d_backward.grad_input | upsample_nearest3d_backward_out_npu | -| 740 | upsample_nearest3d_backward | upsample_nearest3d_backward_npu | -| 741 | sigmoid_backward.grad_input | sigmoid_backward_out_npu | -| 742 | sigmoid_backward | sigmoid_backward_npu | -| 743 | tanh_backward.grad_input | tanh_backward_out_npu | -| 744 | tanh_backward | tanh_backward_npu | -| 745 | slow_conv_transpose2d.out | slow_conv_transpose2d_out_npu | -| 746 | slow_conv_transpose2d | slow_conv_transpose2d_npu | -| 747 | slow_conv_transpose2d_backward.grad_output | slow_conv_transpose2d_backward_out_npu | -| 748 | slow_conv_transpose2d_backward.output_mask | slow_conv_transpose2d_backward_npu | -| 749 | thnn_conv2d.out | thnn_conv2d_out_npu | -| 750 | thnn_conv2d | thnn_conv2d_npu | -| 751 | thnn_conv2d_forward.output | thnn_conv2d_forward_out_npu | -| 752 | thnn_conv2d_forward | thnn_conv2d_forward_npu | -| 753 | thnn_conv2d_backward.output_mask | thnn_conv2d_backward_npu | -| 754 | thnn_conv_depthwise2d.out | 
thnn_conv_depthwise2d_out_npu | -| 755 | thnn_conv_depthwise2d | thnn_conv_depthwise2d_npu | -| 756 | thnn_conv_depthwise2d_forward.out | thnn_conv_depthwise2d_forward_out_npu | -| 757 | thnn_conv_depthwise2d_forward | thnn_conv_depthwise2d_forward_npu | -| 758 | thnn_conv_depthwise2d_backward.grad_input | thnn_conv_depthwise2d_backward_out_npu | -| 759 | thnn_conv_depthwise2d_backward.output_mask | thnn_conv_depthwise2d_backward_npu | -| 760 | slow_conv_dilated2d | slow_conv_dilated2d_npu | -| 761 | slow_conv_dilated2d_backward | slow_conv_dilated2d_backward_npu | -| 762 | col2im.out | im2col_backward_out_npu | -| 763 | col2im | im2col_backward_npu | -| 764 | col2im_backward.grad_input | col2im_backward_out_npu | -| 765 | col2im_backward | col2im_backward_npu | -| 766 | im2col.out | im2col_out_npu | -| 767 | im2col | im2col_npu | -| 768 | im2col_backward.grad_input | im2col_backward_out_npu | -| 769 | im2col_backward | im2col_backward_npu | -| 770 | isfinite | isfinite_npu | +# PyTorch适配算子清单 +- [PyTorch原生算子与昇腾算子对应表](#PyTorch原生算子与昇腾算子对应表.md) +- [PyTorch昇腾自定义算子](#PyTorch昇腾自定义算子.md) +

Mapping Between Native PyTorch Operators and Ascend Operators

+| No. | Native PyTorch Operator | Ascend Adapted Operator |
+| ---- | ---- | ---- |
+| 1 | dropout | dropout_npu |
+| 2 | dropout_ | dropout_npu_ |
+| 3 | abs | abs_npu |
+| 4 | abs_ | abs_npu_ |
+| 5 | abs.out | abs_out_npu |
+| 6 | acos | acos_npu |
+| 7 | acos_ | acos_npu_ |
+| 8 | acos.out | acos_out_npu |
+| 9 | adaptive_avg_pool1d | adaptive_avg_pool1d_npu |
+| 10 | add.Tensor | add_npu |
+| 11 | add_.Tensor | add_npu_ |
+| 12 | add.out | add_out_npu |
+| 13 | add.Scalar | add_npu |
+| 14 | add_.Scalar | add_npu_ |
+| 15 | addmv | addmv_npu |
+| 16 | addmv_ | addmv_npu_ |
+| 17 | addmv.out | addmv_out_npu |
+| 18 | addr | addr_npu |
+| 19 | addr_ | addr_npu_ |
+| 20 | addr.out | addr_out_npu |
+| 21 | affine_grid_generator | affine_grid_generator_npu |
+| 22 | affine_grid_generator_backward | affine_grid_generator_backward_npu |
+| 23 | all.dim | all_npu |
+| 24 | all.out | all_out_npu |
25

+

any.dim

+

any_npu

+

26

+

any.out

+

any_out_npu

+

27

+

arange

+

arange_npu

+

28

+

arange.start

+

arange_npu

+

29

+

arange.start_step

+

arange_npu

+

30

+

arange.out

+

arange_out_npu

+

31

+

arange.start_out

+

arange_out_npu

+

32

+

_dim_arange

+

_dim_arange_npu

+

33

+

argmax

+

argmax_npu

+

34

+

argmin

+

argmin_npu

+

35

+

as_strided

+

as_strided_npu

+

36

+

as_strided_

+

as_strided_npu_

+

37

+

asin

+

asin_npu

+

38

+

asin_

+

asin_npu_

+

39

+

asin.out

+

asin_out_npu

+

40

+

atan

+

atan_npu

+

41

+

atan_

+

atan_npu_

+

42

+

atan.out

+

atan_out_npu

+

43

+

baddbmm

+

baddbmm_npu

+

44

+

baddbmm_

+

baddbmm_npu_

+

45

+

baddbmm.out

+

baddbmm_out_npu

+

46

+

bartlett_window

+

bartlett_window_npu

+

47

+

bartlett_window.periodic

+

bartlett_window_npu

+

48

+

batch_norm

+

batch_norm_npu_

+

49

+

_batch_norm_impl_index

+

_batch_norm_impl_index_npu

+

50

+

_batch_norm_impl_index_backward

+

_batch_norm_impl_index_backward_npu

+

51

+

bernoulli

+

bernoulli_npu

+

52

+

bernoulli_.Tensor

+

bernoulli_npu_

+

53

+

bernoulli_.float

+

bernoulli_npu_

+

54

+

binary_cross_entropy

+

binary_cross_entropy_npu

+

55

+

binary_cross_entropy.out

+

binary_cross_entropy_out_npu

+

56

+

binary_cross_entropy_backward

+

binary_cross_entropy_backward_npu

+

57

+

binary_cross_entropy_backward.grad_input

+

binary_cross_entropy_backward_out_npu

+

58

+

binary_cross_entropy_with_logits

+

binary_cross_entropy_with_logits_npu

+

59

+

binary_cross_entropy_with_logits_backward

+

binary_cross_entropy_with_logits_backward_npu

+

60

+

bitwise_not

+

bitwise_not_npu

+

61

+

bitwise_not_

+

bitwise_not_npu_

+

62

+

bitwise_not.out

+

bitwise_not_out_npu

+

63

+

logical_not

+

logical_not_npu

+

64

+

logical_not_

+

logical_not_npu_

+

65

+

logical_not.out

+

logical_not_out_npu

+

66

+

logical_and

+

logical_and_npu

+

67

+

logical_and_

+

logical_and_npu_

+

68

+

logical_and.out

+

logical_and_out_npu

+

69

+

logical_or

+

logical_or_npu

+

70

+

logical_or_

+

logical_or_npu_

+

71

+

logical_or.out

+

logical_or_out_npu

+

72

+

blackman_window

+

blackman_window_npu

+

73

+

blackman_window.periodic

+

blackman_window_npu

+

74

+

bmm

+

bmm_npu

+

75

+

bmm.out

+

bmm_out_npu

+

76

+

cat

+

cat_npu

+

77

+

cat.out

+

cat_out_npu

+

78

+

cat.names

+

cat_npu

+

79

+

cat.names_out

+

cat_out_npu

+

80

+

ceil

+

ceil_npu

+

81

+

ceil_

+

ceil_npu_

+

82

+

ceil.out

+

ceil_out_npu

+

83

+

clamp

+

clamp_npu

+

84

+

clamp_

+

clamp_npu_

+

85

+

clamp.out

+

clamp_out_npu

+

86

+

clamp_max

+

clamp_max_npu

+

87

+

clamp_max_

+

clamp_max_npu_

+

88

+

clamp_max.out

+

clamp_max_out_npu

+

89

+

clamp_min

+

clamp_min_npu

+

90

+

clamp_min_

+

clamp_min_npu_

+

91

+

clamp_min.out

+

clamp_min_out_npu

+

92

+

constant_pad_nd

+

constant_pad_nd_npu

+

93

+

contiguous

+

contiguous_npu

+

94

+

convolution

+

convolution_npu

+

95

+

_convolution

+

_convolution_npu

+

96

+

_convolution_nogroup

+

_convolution_nogroup_npu

+

97

+

conv2d

+

conv2d_npu_

+

98

+

conv3d

+

_conv3d_npu

+

99

+

conv_tbc

+

conv_tbc_npu

+

100

+

conv_tbc_backward

+

conv_tbc_backward_npu

+

101

+

conv_transpose2d.input

+

conv_transpose2d_npu_

+

102

+

copy_

+

copy_npu_

+

103

+

cos

+

cos_npu

+

104

+

cos_

+

cos_npu_

+

105

+

cos.out

+

cos_out_npu

+

106

+

cosh

+

cosh_npu

+

107

+

cosh_

+

cosh_npu_

+

108

+

cosh.out

+

cosh_out_npu

+

109

+

cummin

+

cummin_npu

+

110

+

cummin.out

+

cummin_out_npu

+

111

+

cummin.dimname

+

cummin_npu

+

112

+

cummin.dimname_out

+

cummin_out_npu

+

113

+

cumprod

+

cumprod_npu

+

114

+

cumprod.out

+

cumprod_out_npu

+

115

+

cumprod.dimname

+

cumprod_npu

+

116

+

cumprod.dimname_out

+

cumprod_out_npu

+

117

+

ctc_loss.IntList

+

ctc_loss_npu

+

118

+

ctc_loss.Tensor

+

ctc_loss_npu

+

119

+

_ctc_loss

+

ctc_loss_npu

+

120

+

_ctc_loss_backward

+

ctc_loss_backward_npu

+

121

+

fill_diagonal_

+

fill_diagonal_npu_

+

122

+

div.Tensor

+

div_npu

+

123

+

div_.Tensor

+

div_npu_

+

124

+

div.out

+

div_out_npu

+

125

+

div.Scalar

+

div_npu

+

126

+

div_.Scalar

+

div_npu_

+

127

+

dot

+

dot_npu

+

128

+

dot.out

+

dot_out_npu

+

129

+

embedding

+

embedding_npu

+

130

+

embedding_backward

+

embedding_backward_npu

+

131

+

embedding_dense_backward

+

embedding_dense_backward_npu

+

132

+

embedding_renorm_

+

embedding_renorm_npu_

+

133

+

_embedding_bag

+

_embedding_bag_npu

+

134

+

empty.memory_format

+

empty_npu

+

135

+

resize_

+

resize_npu_

+

136

+

empty_like

+

empty_like_npu

+

137

+

empty_strided

+

empty_strided_npu

+

138

+

erf

+

erf_npu

+

139

+

erf_

+

erf_npu_

+

140

+

erf.out

+

erf_out_npu

+

141

+

exp

+

exp_npu

+

142

+

exp_

+

exp_npu_

+

143

+

exp.out

+

exp_out_npu

+

144

+

expm1

+

expm1_npu

+

145

+

expm1_

+

expm1_npu_

+

146

+

expm1.out

+

expm1_out_npu

+

147

+

eye

+

eye_npu

+

148

+

eye.m

+

eye_npu

+

149

+

eye.out

+

eye_out_npu

+

150

+

eye.m_out

+

eye_out_npu

+

151

+

fill_.Scalar

+

fill_npu_

+

152

+

fill_.Tensor

+

fill_npu_

+

153

+

floor

+

floor_npu

+

154

+

floor_

+

floor_npu_

+

155

+

floor.out

+

floor_out_npu

+

156

+

floor_divide

+

floor_divide_npu

+

157

+

floor_divide_.Tensor

+

floor_divide_npu_

+

158

+

floor_divide.out

+

floor_divide_out_npu

+

159

+

floor_divide.Scalar

+

floor_divide_npu

+

160

+

floor_divide_.Scalar

+

floor_divide_npu_

+

161

+

frac

+

frac_npu

+

162

+

frac_

+

frac_npu_

+

163

+

frac.out

+

frac_out_npu

+

164

+

full.names

+

full_npu

+

165

+

full

+

full_npu

+

166

+

full.out

+

full_out_npu

+

167

+

grid_sampler

+

grid_sampler_npu

+

168

+

grid_sampler_3d

+

grid_sampler_3d_npu

+

169

+

grid_sampler_3d_backward

+

grid_sampler_3d_backward_npu

+

170

+

hann_window

+

hann_window_npu

+

171

+

hann_window.periodic

+

hann_window_npu

+

172

+

hamming_window

+

hamming_window_npu

+

173

+

hamming_window.periodic

+

hamming_window_npu

+

174

+

hamming_window.periodic_alpha

+

hamming_window_npu

+

175

+

hamming_window.periodic_alpha_beta

+

hamming_window_npu

+

176

+

ger

+

ger_npu

+

177

+

ger.out

+

ger_out_npu

+

178

+

index.Tensor

+

index_npu

+

179

+

index_put_

+

index_put_npu_

+

180

+

index_put

+

index_put_npu

+

181

+

_index_put_impl_

+

_index_put_impl_npu_

+

182

+

inverse

+

inverse_npu

+

183

+

inverse.out

+

inverse_out_npu

+

184

+

isclose

+

isclose_npu

+

185

+

isnan

+

isnan_npu

+

186

+

is_nonzero

+

is_nonzero_npu

+

187

+

kl_div

+

kl_div_npu

+

188

+

kl_div_backward

+

kl_div_backward_npu

+

189

+

kthvalue

+

kthvalue_npu

+

190

+

kthvalue.values

+

kthvalue_out_npu

+

191

+

kthvalue.dimname

+

kthvalue_npu

+

192

+

kthvalue.dimname_out

+

kthvalue_out_npu

+

193

+

native_layer_norm

+

layer_norm_npu

+

194

+

native_layer_norm_backward

+

layer_norm_backward_npu

+

195

+

linspace

+

linspace_npu

+

196

+

linspace.out

+

linspace_out_npu

+

197

+

log

+

log_npu

+

198

+

log_

+

log_npu_

+

199

+

log.out

+

log_out_npu

+

200

+

log10

+

log10_npu

+

201

+

log10_

+

log10_npu_

+

202

+

log10.out

+

log10_out_npu

+

203

+

log1p

+

log1p_npu

+

204

+

log1p_

+

log1p_npu_

+

205

+

log1p.out

+

log1p_out_npu

+

206

+

log2

+

log2_npu

+

207

+

log2_

+

log2_npu_

+

208

+

log2.out

+

log2_out_npu

+

209

+

logspace

+

logspace_npu

+

210

+

logspace.out

+

logspace_out_npu

+

211

+

log_softmax.int

+

log_softmax_npu

+

212

+

log_softmax.Dimname

+

log_softmax_npu

+

213

+

_log_softmax

+

_log_softmax_npu

+

214

+

_log_softmax_backward_data

+

_log_softmax_backward_npu

+

215

+

logsumexp

+

logsumexp_npu

+

216

+

logsumexp.out

+

logsumexp_out_npu

+

217

+

logsumexp.names

+

logsumexp_npu

+

218

+

logsumexp.names_out

+

logsumexp_out_npu

+

219

+

matmul

+

matmul_npu

+

220

+

matmul.out

+

matmul_out_npu

+

221

+

matrix_power

+

matrix_power_npu

+

222

+

max.dim

+

max_npu

+

223

+

max.dim_max

+

max_out_npu

+

224

+

max_values

+

max_npu

+

225

+

max.names_dim

+

max_npu

+

226

+

max.names_dim_max

+

max_out_npu

+

227

+

max_values.names

+

max_npu

+

228

+

max_pool2d

+

max_pool2d_npu

+

229

+

quantized_max_pool2d

+

quantized_max_pool2d_npu

+

230

+

mean

+

mean_npu

+

231

+

mean.dim

+

mean_npu

+

232

+

mean.out

+

mean_out_npu

+

233

+

mean.names_dim

+

mean_npu

+

234

+

mean.names_out

+

mean_out_npu

+

235

+

median.dim

+

median_npu

+

236

+

median.dim_values

+

median_out_npu

+

237

+

median.names_dim

+

median_npu

+

238

+

median.names_dim_values

+

median_out_npu

+

239

+

min.dim

+

min_npu

+

240

+

min.dim_min

+

min_out_npu

+

241

+

min_values

+

min_npu

+

242

+

min.names_dim

+

min_npu

+

243

+

min.names_dim_min

+

min_out_npu

+

244

+

min_values.names

+

min_npu

+

245

+

mm

+

mm_npu

+

246

+

mm.out

+

mm_out_npu

+

247

+

mode

+

mode_npu

+

248

+

mode.values

+

mode_out_npu

+

249

+

mul.Tensor

+

mul_npu

+

250

+

mul_.Tensor

+

mul_npu_

+

251

+

mul.out

+

mul_out_npu

+

252

+

mul.Scalar

+

mul_npu

+

253

+

mul_.Scalar

+

mul_npu_

+

254

+

mv

+

mv_npu

+

255

+

mv.out

+

mv_out_npu

+

256

+

narrow_copy

+

narrow_copy_npu

+

257

+

native_batch_norm

+

batch_norm_npu

+

258

+

native_batch_norm_backward

+

batch_norm_backward_npu

+

259

+

_nnpack_spatial_convolution

+

_nnpack_spatial_convolution_npu

+

260

+

ones.names

+

ones_npu

+

261

+

ones

+

ones_npu

+

262

+

ones.out

+

ones_out_npu

+

263

+

ones_like

+

ones_like_npu

+

264

+

cdist

+

cdist_npu

+

265

+

_cdist_forward

+

_cdist_forward_npu

+

266

+

_cdist_backward

+

_cdist_backward_npu

+

267

+

pdist

+

pdist_npu

+

268

+

_pdist_forward

+

_pdist_forward_npu

+

269

+

randperm

+

randperm_npu

+

270

+

randperm.generator

+

randperm_npu

+

271

+

randperm.out

+

randperm_out_npu

+

272

+

randperm.generator_out

+

randperm_out_npu

+

273

+

range.step

+

range_npu

+

274

+

range

+

range_npu

+

275

+

range.out

+

range_out_npu

+

276

+

reciprocal

+

reciprocal_npu

+

277

+

reciprocal_

+

reciprocal_npu_

+

278

+

reciprocal.out

+

reciprocal_out_npu

+

279

+

neg

+

neg_npu

+

280

+

neg_

+

neg_npu_

+

281

+

neg.out

+

neg_out_npu

+

282

+

repeat

+

repeat_npu

+

283

+

repeat_interleave.self_int

+

repeat_interleave_npu

+

284

+

round

+

round_npu

+

285

+

round_

+

round_npu_

+

286

+

round.out

+

round_out_npu

+

287

+

relu

+

relu_npu

+

288

+

relu_

+

relu_npu_

+

289

+

prelu

+

prelu_npu

+

290

+

prelu_backward

+

prelu_backward_npu

+

291

+

gelu

+

gelu_npu

+

292

+

gelu_backward

+

gelu_backward_npu

+

293

+

hardshrink

+

hardshrink_npu

+

294

+

hardshrink_backward

+

hardshrink_backward_npu

+

295

+

rsqrt

+

rsqrt_npu

+

296

+

rsqrt_

+

rsqrt_npu_

+

297

+

rsqrt.out

+

rsqrt_out_npu

+

298

+

selu

+

selu_npu

+

299

+

selu_

+

selu_npu_

+

300

+

celu

+

celu_npu

+

301

+

celu_

+

celu_npu_

+

302

+

sigmoid

+

sigmoid_npu

+

303

+

sigmoid_

+

sigmoid_npu_

+

304

+

sigmoid.out

+

sigmoid_out_npu

+

305

+

sin

+

sin_npu

+

306

+

sin_

+

sin_npu_

+

307

+

sin.out

+

sin_out_npu

+

308

+

sinh

+

sinh_npu

+

309

+

sinh_

+

sinh_npu_

+

310

+

sinh.out

+

sinh_out_npu

+

311

+

slogdet

+

slogdet_npu

+

312

+

softmax.int

+

softmax_npu

+

313

+

softmax.Dimname

+

softmax_npu

+

314

+

_softmax

+

_softmax_npu

+

315

+

_softmax_backward_data

+

_softmax_backward_npu

+

316

+

stack

+

stack_npu

+

317

+

stack.out

+

stack_out_npu

+

318

+

sum

+

sum_npu

+

319

+

sum.dim_IntList

+

sum_npu

+

320

+

sum.dim_DimnameList

+

sum_npu

+

321

+

sum.IntList_out

+

sum_out_npu

+

322

+

sum.DimnameList_out

+

sum_out_npu

+

323

+

sqrt

+

sqrt_npu

+

324

+

sqrt_

+

sqrt_npu_

+

325

+

sqrt.out

+

sqrt_out_npu

+

326

+

std

+

std_npu

+

327

+

std.dim

+

std_dim_npu

+

328

+

std_mean

+

std_mean_npu

+

329

+

std_mean.dim

+

std_mean_dim_npu

+

330

+

std_mean.names_dim

+

std_mean_names_npu

+

331

+

std.out

+

std_out_npu

+

332

+

std.names_dim

+

std_names_npu

+

333

+

std.names_out

+

std_out_npu

+

334

+

prod

+

prod_npu

+

335

+

prod.dim_int

+

prod_npu

+

336

+

prod.int_out

+

prod_out_npu

+

337

+

prod.dim_Dimname

+

prod_npu

+

338

+

prod.Dimname_out

+

prod_out_npu

+

339

+

tan

+

tan_npu

+

340

+

tan_

+

tan_npu_

+

341

+

tan.out

+

tan_out_npu

+

342

+

tanh

+

tanh_npu

+

343

+

tanh_

+

tanh_npu_

+

344

+

tanh.out

+

tanh_out_npu

+

345

+

threshold

+

threshold_npu

+

346

+

threshold_

+

threshold_npu_

+

347

+

threshold.out

+

threshold_out_npu

+

348

+

threshold_backward

+

threshold_backward_npu

+

349

+

one_hot

+

one_hot_npu1

+

350

+

flip

+

flip_npu

+

351

+

roll

+

roll_npu

+

352

+

true_divide.Tensor

+

true_divide_npu

+

353

+

true_divide_.Tensor

+

true_divide_npu_

+

354

+

true_divide.out

+

true_divide_out_npu

+

355

+

true_divide.Scalar

+

true_divide_npu

+

356

+

true_divide_.Scalar

+

true_divide_npu_

+

357

+

trunc

+

trunc_npu

+

358

+

trunc_

+

trunc_npu_

+

359

+

trunc.out

+

trunc_out_npu

+

360

+

_unique2

+

_unique2_npu

+

361

+

var

+

var_npu

+

362

+

var.dim

+

var_npu

+

363

+

var.out

+

var_out_npu

+

364

+

var.names_dim

+

var_npu

+

365

+

var.names_out

+

var_out_npu

+

366

+

var_mean

+

var_mean_npu

+

367

+

var_mean.dim

+

var_mean_npu

+

368

+

var_mean.names_dim

+

var_mean_npu

+

369

+

where.self

+

where_npu

+

370

+

where

+

where_npu

+

371

+

_s_where

+

_s_where_npu

+

372

+

zeros.names

+

zeros_npu

+

373

+

zeros

+

zeros_npu

+

374

+

zeros.out

+

zeros_out_npu

+

375

+

zeros_like

+

zeros_like_npu

+

376

+

norm.ScalarOpt_dtype

+

norm_npu

+

377

+

norm.Scalar

+

norm_npu

+

378

+

norm.ScalarOpt_dim_dtype

+

norm_npu

+

379

+

norm.ScalarOpt_dim

+

norm_npu

+

380

+

norm.dtype_out

+

norm_out_npu

+

381

+

norm.out

+

norm_out_npu

+

382

+

clone

+

clone_npu

+

383

+

resize_as_

+

resize_as_npu_

+

384

+

pow.Tensor_Scalar_out

+

pow_out_npu

+

385

+

pow.Tensor_Scalar

+

pow_npu

+

386

+

zero_

+

zero_npu_

+

387

+

sub.out

+

sub_out_npu

+

388

+

sub.Tensor

+

sub_npu

+

389

+

sub_.Tensor

+

sub_npu_

+

390

+

sub.Scalar

+

sub_npu

+

391

+

sub_.Scalar

+

sub_npu_

+

392

+

rsub.Tensor

+

rsub_npu

+

393

+

rsub.Scalar

+

rsub_npu

+

394

+

addmm.out

+

addmm_out_npu

+

395

+

addmm

+

addmm_npu

+

396

+

addmm_

+

addmm_npu_

+

397

+

quantize_per_tensor

+

quantize_per_tensor_npu

+

398

+

quantize_per_channel

+

quantize_per_channel_npu

+

399

+

to.dtype_layout

+

to_npu

+

400

+

to.device

+

to_device_npu

+

401

+

to.dtype

+

to_dtype_npu

+

402

+

to.other

+

to_other_npu

+

403

+

_local_scalar_dense

+

_local_scalar_dense_npu

+

404

+

lstm.input

+

lstm_npu

+

405

+

lstm.data

+

lstm_npu

+

406

+

gru.input

+

gru_npu_

+

407

+

_pack_padded_sequence

+

_pack_padded_sequence_npu

+

408

+

_pad_packed_sequence

+

_pad_packed_sequence_npu

+

409

+

set_.source_Storage

+

set_npu_

+

410

+

set_.source_Storage_storage_offset

+

set_npu_

+

411

+

set_.source_Tensor

+

set_npu_

+

412

+

set_

+

set_npu_

+

413

+

masked_fill_.Scalar

+

masked_fill_npu_

+

414

+

masked_fill_.Tensor

+

masked_fill_npu_

+

415

+

masked_scatter_

+

masked_scatter_npu_

+

416

+

view

+

view_npu

+

417

+

put_

+

put_npu_

+

418

+

index_add_

+

index_add_npu_

+

419

+

index_add

+

index_add_npu

+

420

+

index_add.dimname

+

index_add_npu

+

421

+

index_fill_.int_Scalar

+

index_fill_npu_

+

422

+

index_fill.int_Scalar

+

index_fill_npu

+

423

+

index_fill_.int_Tensor

+

index_fill_npu_

+

424

+

index_fill.int_Tensor

+

index_fill_npu

+

425

+

scatter_.src

+

scatter_npu_

+

426

+

scatter_.value

+

scatter_npu_

+

427

+

scatter_add_

+

scatter_add_npu_

+

428

+

scatter_add

+

scatter_add_npu

+

429

+

scatter_add.dimname

+

scatter_add_npu

+

430

+

lt_.Scalar

+

lt_npu_

+

431

+

lt_.Tensor

+

lt_npu_

+

432

+

gt_.Scalar

+

gt_npu_

+

433

+

gt_.Tensor

+

gt_npu_

+

434

+

le_.Scalar

+

le_npu_

+

435

+

le_.Tensor

+

le_npu_

+

436

+

ge_.Scalar

+

ge_npu_

+

437

+

ge_.Tensor

+

ge_npu_

+

438

+

eq_.Scalar

+

eq_npu_

+

439

+

eq_.Tensor

+

eq_npu_

+

440

+

ne_.Scalar

+

ne_npu_

+

441

+

ne_.Tensor

+

ne_npu_

+

442

+

bitwise_and.Tensor_out

+

bitwise_and_out_npu

+

443

+

bitwise_and.Scalar_out

+

bitwise_and_out_npu

+

444

+

bitwise_and.Scalar

+

bitwise_and_npu

+

445

+

bitwise_and.Tensor

+

bitwise_and_npu

+

446

+

bitwise_and_.Scalar

+

bitwise_and_npu_

+

447

+

bitwise_and_.Tensor

+

bitwise_and_npu_

+

448

+

__and__.Scalar

+

__and___npu

+

449

+

__and__.Tensor

+

__and___npu

+

450

+

bitwise_or.Tensor_out

+

bitwise_or_out_npu

+

451

+

bitwise_or.Scalar_out

+

bitwise_or_out_npu

+

452

+

bitwise_or.Scalar

+

bitwise_or_npu

+

453

+

bitwise_or.Tensor

+

bitwise_or_npu

+

454

+

bitwise_or_.Scalar

+

bitwise_or_npu_

+

455

+

bitwise_or_.Tensor

+

bitwise_or_npu_

+

456

+

__or__.Scalar

+

__or___npu

+

457

+

__or__.Tensor

+

__or___npu

+

458

+

__ior__.Scalar

+

__ior___npu

+

459

+

__ior__.Tensor

+

__ior___npu

+

460

+

bitwise_xor.Tensor_out

+

bitwise_xor_out_npu

+

461

+

bitwise_xor.Scalar_out

+

bitwise_xor_out_npu

+

462

+

bitwise_xor.Scalar

+

bitwise_xor_npu

+

463

+

bitwise_xor.Tensor

+

bitwise_xor_npu

+

464

+

bitwise_xor_.Scalar

+

bitwise_xor_npu_

+

465

+

bitwise_xor_.Tensor

+

bitwise_xor_npu_

+

466

+

__xor__.Scalar

+

__xor___npu

+

467

+

__xor__.Tensor

+

__xor___npu

+

468

+

atan2_

+

atan2_npu_

+

469

+

tril_

+

tril_npu_

+

470

+

triu_

+

triu_npu_

+

471

+

renorm_

+

renorm_npu_

+

472

+

pow_.Scalar

+

pow_npu_

+

473

+

pow_.Tensor

+

pow_npu_

+

474

+

lerp_.Scalar

+

lerp_npu_

+

475

+

lerp_.Tensor

+

lerp_npu_

+

476

+

fmod_.Scalar

+

fmod_npu_

+

477

+

fmod_.Tensor

+

fmod_npu_

+

478

+

remainder_.Scalar

+

remainder_npu_

+

479

+

remainder_.Tensor

+

remainder_npu_

+

480

+

addbmm_

+

addbmm_npu_

+

481

+

addbmm.out

+

addbmm_out_npu

+

482

+

addbmm

+

addbmm_npu

+

483

+

addcdiv_

+

addcdiv_npu_

+

484

+

random_.from

+

random_npu_

+

485

+

random_.to

+

random_npu_

+

486

+

random_

+

random_npu_

+

487

+

uniform_

+

uniform_npu_

+

488

+

diag.out

+

diag_out_npu

+

489

+

diag

+

diag_npu

+

490

+

cross.out

+

cross_out_npu

+

491

+

cross

+

cross_npu

+

492

+

triu.out

+

triu_out_npu

+

493

+

triu

+

triu_npu

+

494

+

tril.out

+

tril_out_npu

+

495

+

tril

+

tril_npu

+

496

+

ne.Scalar_out

+

ne_out_npu

+

497

+

ne.Scalar

+

ne_npu

+

498

+

ne.Tensor_out

+

ne_out_npu

+

499

+

ne.Tensor

+

ne_npu

+

500

+

eq.Scalar_out

+

eq_out_npu

+

501

+

eq.Scalar

+

eq_npu

+

502

+

eq.Tensor_out

+

eq_out_npu

+

503

+

eq.Tensor

+

eq_npu

+

504

+

ge.Scalar_out

+

ge_out_npu

+

505

+

ge.Scalar

+

ge_npu

+

506

+

ge.Tensor_out

+

ge_out_npu

+

507

+

ge.Tensor

+

ge_npu

+

508

+

le.Scalar_out

+

le_out_npu

+

509

+

le.Scalar

+

le_npu

+

510

+

le.Tensor_out

+

le_out_npu

+

511

+

le.Tensor

+

le_npu

+

512

+

gt.Scalar_out

+

gt_out_npu

+

513

+

gt.Scalar

+

gt_npu

+

514

+

gt.Tensor_out

+

gt_out_npu

+

515

+

gt.Tensor

+

gt_npu

+

516

+

lt.Scalar_out

+

lt_out_npu

+

517

+

lt.Scalar

+

lt_npu

+

518

+

lt.Tensor_out

+

lt_out_npu

+

519

+

lt.Tensor

+

lt_npu

+

520

+

take.out

+

take_out_npu

+

521

+

take

+

take_npu

+

522

+

index_select.out

+

index_select_out_npu

+

523

+

index_select

+

index_select_npu

+

524

+

index_select.dimname_out

+

index_select_out_npu

+

525

+

index_select.dimname

+

index_select_npu

+

526

+

masked_select.out

+

masked_select_out_npu

+

527

+

masked_select

+

masked_select_npu

+

528

+

nonzero.out

+

nonzero_out_npu

+

529

+

nonzero

+

nonzero_npu

+

530

+

gather.out

+

gather_out_npu

+

531

+

gather

+

gather_npu

+

532

+

gather.dimname_out

+

gather_out_npu

+

533

+

gather.dimname

+

gather_npu

+

534

+

addcmul.out

+

addcmul_out_npu

+

535

+

addcmul

+

addcmul_npu

+

536

+

addcmul_

+

addcmul_npu_

+

537

+

addcdiv.out

+

addcdiv_out_npu

+

538

+

addcdiv

+

addcdiv_npu

+

539

+

qr.Q

+

qr_out_npu

+

540

+

qr

+

qr_npu

+

541

+

multinomial.out

+

multinomial_out_npu

+

542

+

multinomial

+

multinomial_npu

+

543

+

erfinv

+

erfinv_npu

+

544

+

erfinv_

+

erfinv_npu_

+

545

+

erfinv.out

+

erfinv_out_npu

+

546

+

sign

+

sign_npu

+

547

+

sign_

+

sign_npu_

+

548

+

sign.out

+

sign_out_npu

+

549

+

atan2.out

+

atan2_out_npu

+

550

+

atan2

+

atan2_npu

+

551

+

lerp.Scalar_out

+

lerp_out_npu

+

552

+

lerp.Tensor_out

+

lerp_out_npu

+

553

+

lerp.Scalar

+

lerp_npu

+

554

+

lerp.Tensor

+

lerp_npu

+

555

+

histc.out

+

histc_out_npu

+

556

+

histc

+

histc_npu

+

557

+

fmod.Scalar_out

+

fmod_out_npu

+

558

+

fmod.Scalar

+

fmod_npu

+

559

+

fmod.Tensor_out

+

fmod_out_npu

+

560

+

fmod.Tensor

+

fmod_npu

+

561

+

remainder.Scalar_out

+

remainder_out_npu

+

562

+

remainder.Scalar

+

remainder_npu

+

563

+

remainder.Tensor_out

+

remainder_out_npu

+

564

+

remainder.Tensor

+

remainder_npu

+

565

+

min.out

+

min_out_npu

+

566

+

min.other

+

min_npu

+

567

+

min

+

min_npu

+

568

+

max.out

+

max_out_npu

+

569

+

max.other

+

max_npu

+

570

+

max

+

max_npu

+

571

+

median

+

median_npu

+

572

+

sort.values

+

sort_out_npu

+

573

+

sort

+

sort_npu

+

574

+

sort.dimname_values

+

sort_out_npu

+

575

+

sort.dimname

+

sort_npu

+

576

+

argsort

+

argsort_npu

+

577

+

argsort.dimname

+

argsort_npu

+

578

+

topk.values

+

topk_out_npu

+

579

+

topk

+

topk_npu

+

580

+

all

+

all_npu

+

581

+

any

+

any_npu

+

582

+

renorm.out

+

renorm_out_npu

+

583

+

renorm

+

renorm_npu

+

584

+

unfold

+

unfold

+

585

+

equal

+

equal_npu

+

586

+

pow.Tensor_Tensor_out

+

pow_out_npu

+

587

+

pow.Tensor_Tensor

+

pow_npu

+

588

+

pow.Scalar_out

+

pow_out_npu

+

589

+

pow.Scalar

+

pow_npu

+

590

+

normal_

+

normal_npu_

+

591

+

normal.Tensor_float_out

+

normal_out_npu

+

592

+

normal.Tensor_float

+

normal_npu

+

593

+

normal.float_Tensor_out

+

normal_out_npu

+

594

+

normal.float_Tensor

+

normal_npu

+

595

+

normal.Tensor_Tensor_out

+

normal_out_npu

+

596

+

normal.Tensor_Tensor

+

normal_npu

+

597

+

normal.float_float

+

normal_npu

+

598

+

normal.float_float_out

+

normal_out_npu

+

599

+

_addr

+

_addr_npu

+

600

+

_addr_

+

_addr_npu_

+

601

+

_addr.out

+

_addr_out_npu

+

602

+

_cumsum

+

_cumsum_npu

+

603

+

_cumsum.out

+

_cumsum_out_npu

+

604

+

_cumprod

+

_cumprod_npu

+

605

+

_cumprod.out

+

_cumprod_out_npu

+

606

+

_var

+

_var_npu

+

607

+

_amp_non_finite_check_and_unscale_

+

_amp_non_finite_check_and_unscale_npu_

+

608

+

_cat

+

_cat_npu

+

609

+

_cat.out

+

_cat_out_npu

+

610

+

_max

+

_max_npu

+

611

+

_max.max

+

_max_out_npu

+

612

+

_min

+

_min_npu

+

613

+

_min.min

+

_min_out_npu

+

614

+

mse_loss.out

+

mse_loss_out_npu

+

615

+

mse_loss

+

mse_loss_npu

+

616

+

mse_loss_backward.grad_input

+

mse_loss_backward_out_npu

+

617

+

mse_loss_backward

+

mse_loss_backward_npu

+

618

+

l1_loss.out

+

l1_loss_out_npu

+

619

+

l1_loss

+

l1_loss_npu

+

620

+

l1_loss_backward.grad_input

+

l1_loss_backward_out_npu

+

621

+

l1_loss_backward

+

l1_loss_backward_npu

+

622

+

multilabel_margin_loss.out

+

multilabel_margin_loss_out_npu

+

623

+

multilabel_margin_loss

+

multilabel_margin_loss_npu

+

624

+

multilabel_margin_loss_forward.output

+

multilabel_margin_loss_forward_out_npu

+

625

+

multilabel_margin_loss_forward

+

multilabel_margin_loss_forward_npu

+

626

+

nll_loss.out

+

nll_loss_out_npu

+

627

+

nll_loss

+

nll_loss_npu

+

628

+

nll_loss_forward.output

+

nll_loss_forward_out_npu

+

629

+

nll_loss_forward

+

nll_loss_forward_npu

+

630

+

nll_loss_backward.grad_input

+

nll_loss_backward_out_npu

+

631

+

nll_loss_backward

+

nll_loss_backward_npu

+

632

+

nll_loss2d.out

+

nll_loss2d_out_npu

+

633

+

nll_loss2d

+

nll_loss2d_npu

+

634

+

nll_loss2d_forward.output

+

nll_loss2d_forward_out_npu

+

635

+

nll_loss2d_forward

+

nll_loss2d_forward_npu

+

636

+

nll_loss2d_backward.grad_input

+

nll_loss2d_backward_out_npu

+

637

+

nll_loss2d_backward

+

nll_loss2d_backward_npu

+

638

+

smooth_l1_loss.out

+

smooth_l1_loss_out_npu

+

639

+

smooth_l1_loss

+

smooth_l1_loss_npu

+

640

+

smooth_l1_loss_backward.grad_input

+

smooth_l1_loss_backward_out_npu

+

641

+

smooth_l1_loss_backward

+

smooth_l1_loss_backward_npu

+

642

+

soft_margin_loss.out

+

soft_margin_loss_out_npu

+

643

+

soft_margin_loss

+

soft_margin_loss_npu

+

644

+

soft_margin_loss_backward.grad_input

+

soft_margin_loss_backward_out_npu

+

645

+

soft_margin_loss_backward

+

soft_margin_loss_backward_npu

+

646

+

elu.out

+

elu_out_npu

+

647

+

elu

+

elu_npu

+

648

+

elu_backward.grad_input

+

elu_backward_out_npu

+

649

+

elu_backward

+

elu_backward_npu

+

650

+

elu_

+

elu_npu_

+

651

+

glu.out

+

glu_out_npu

+

652

+

glu

+

glu_npu

+

653

+

glu_backward.grad_input

+

glu_backward_out_npu

+

654

+

glu_backward

+

glu_backward_npu

+

655

+

hardsigmoid.out

+

hardsigmoid_out_npu

+

656

+

hardsigmoid

+

hardsigmoid_npu

+

657

+

hardsigmoid_

+

hardsigmoid_npu_

+

658

+

hardsigmoid_backward

+

hardsigmoid_backward_npu

+

659

+

hardtanh.out

+

hardtanh_out_npu

+

660

+

hardtanh

+

hardtanh_npu

+

661

+

hardtanh_backward.grad_input

+

hardtanh_backward_out_npu

+

662

+

hardtanh_backward

+

hardtanh_backward_npu

+

663

+

hardtanh_

+

hardtanh_npu_

+

664

+

leaky_relu.out

+

leaky_relu_out_npu

+

665

+

leaky_relu

+

leaky_relu_npu

+

666

+

leaky_relu_backward

+

leaky_relu_backward_npu

+

667

+

leaky_relu_

+

leaky_relu_npu_

+

668

+

log_sigmoid.out

+

log_sigmoid_out_npu

+

669

+

log_sigmoid

+

log_sigmoid_npu

+

670

+

log_sigmoid_forward.output

+

log_sigmoid_forward_out_npu

+

671

+

log_sigmoid_forward

+

log_sigmoid_forward_npu

+

672

+

log_sigmoid_backward.grad_input

+

log_sigmoid_backward_out_npu

+

673

+

log_sigmoid_backward

+

log_sigmoid_backward_npu

+

674

+

softplus.out

+

softplus_out_npu

+

675

+

softplus

+

softplus_npu

+

676

+

softplus_backward.grad_input

+

softplus_backward_out_npu

+

677

+

softplus_backward

+

softplus_backward_npu

+

678

+

softshrink.out

+

softshrink_out_npu

+

679

+

softshrink

+

softshrink_npu

+

680

+

softshrink_backward.grad_input

+

softshrink_backward_out_npu

+

681

+

softshrink_backward

+

softshrink_backward_npu

+

682

+

adaptive_avg_pool2d.out

+

adaptive_avg_pool2d_out_npu

+

683

+

adaptive_avg_pool2d

+

adaptive_avg_pool2d_npu

+

684

+

_adaptive_avg_pool2d

+

_adaptive_avg_pool2d_npu

+

685

+

_adaptive_avg_pool2d_backward

+

adaptive_avg_pool2d_backward_npu

+

686

+

adaptive_avg_pool3d.out

+

adaptive_avg_pool3d_out_npu

+

687

+

adaptive_avg_pool3d

+

adaptive_avg_pool3d_npu

+

688

+

adaptive_avg_pool3d_backward.grad_input

+

adaptive_avg_pool3d_backward_out_npu

+

689

+

adaptive_avg_pool3d_backward

+

adaptive_avg_pool3d_backward_npu

+

690

+

adaptive_max_pool2d.out

+

adaptive_max_pool2d_out_npu

+

691

+

adaptive_max_pool2d

+

adaptive_max_pool2d_npu

+

692

+

adaptive_max_pool2d_backward.grad_input

+

adaptive_max_pool2d_backward_out_npu

+

693

+

adaptive_max_pool2d_backward

+

adaptive_max_pool2d_backward_npu

+

694

+

avg_pool2d.out

+

avg_pool2d_out_npu

+

695

+

avg_pool2d

+

avg_pool2d_npu

+

696

+

avg_pool2d_backward.grad_input

+

avg_pool2d_backward_out_npu

+

697

+

avg_pool2d_backward

+

avg_pool2d_backward_npu

+

698

+

avg_pool3d.out

+

avg_pool3d_out_npu

+

699

+

avg_pool3d

+

avg_pool3d_npu

+

700

+

avg_pool3d_backward.grad_input

+

avg_pool3d_backward_out_npu

+

701

+

avg_pool3d_backward

+

avg_pool3d_backward_npu

+

702

+

max_pool2d_with_indices.out

+

max_pool2d_with_indices_out_npu

+

703

+

max_pool2d_with_indices

+

max_pool2d_with_indices_npu

+

704

+

max_pool2d_with_indices_backward.grad_input

+

max_pool2d_with_indices_backward_out_npu

+

705

+

max_pool2d_with_indices_backward

+

max_pool2d_with_indices_backward_npu

+

706

+

max_pool3d_with_indices.out

+

max_pool3d_with_indices_out_npu

+

707

+

max_pool3d_with_indices

+

max_pool3d_with_indices_npu

+

708

+

max_pool3d_with_indices_backward.grad_input

+

max_pool3d_with_indices_backward_out_npu

+

709

+

max_pool3d_with_indices_backward

+

max_pool3d_with_indices_backward_npu

+

710

+

reflection_pad2d.out

+

reflection_pad2d_out_npu

+

711

+

reflection_pad2d

+

reflection_pad2d_npu

+

712

+

replication_pad2d.out

+

replication_pad2d_out_npu

+

713

+

replication_pad2d

+

replication_pad2d_npu

+

714

+

upsample_linear1d.out

+

upsample_linear1d_out_npu

+

715

+

upsample_linear1d

+

upsample_linear1d_npu

+

716

+

upsample_linear1d_backward

+

upsample_linear1d_backward_npu

+

717

+

upsample_bilinear2d.out

+

upsample_bilinear2d_out_npu

+

718

+

upsample_bilinear2d

+

upsample_bilinear2d_npu

+

719

+

upsample_bilinear2d_backward.grad_input

+

upsample_bilinear2d_backward_out_npu

+

720

+

upsample_bilinear2d_backward

+

upsample_bilinear2d_backward_npu

+

721

+

upsample_bicubic2d.out

+

upsample_bicubic2d_out_npu

+

722

+

upsample_bicubic2d

+

upsample_bicubic2d_npu

+

723

+

upsample_bicubic2d_backward.grad_input

+

upsample_bicubic2d_backward_out_npu

+

724

+

upsample_bicubic2d_backward

+

upsample_bicubic2d_backward_npu

+

725

+

upsample_trilinear3d.out

+

upsample_trilinear3d_out_npu

+

726

+

upsample_trilinear3d

+

upsample_trilinear3d_npu

+

727

+

upsample_trilinear3d_backward.grad_input

+

upsample_trilinear3d_backward_out_npu

+

728

+

upsample_trilinear3d_backward

+

upsample_trilinear3d_backward_npu

+

729

+

upsample_nearest1d.out

+

upsample_nearest1d_out_npu

+

730

+

upsample_nearest1d

+

upsample_nearest1d_npu

+

731

+

upsample_nearest1d_backward.grad_input

+

upsample_nearest1d_backward_out_npu

+

732

+

upsample_nearest1d_backward

+

upsample_nearest1d_backward_npu

+

733

+

upsample_nearest2d.out

+

upsample_nearest2d_out_npu

+

734

+

upsample_nearest2d

+

upsample_nearest2d_npu

+

735

+

upsample_nearest2d_backward.grad_input

+

upsample_nearest2d_backward_out_npu

+

736

+

upsample_nearest2d_backward

+

upsample_nearest2d_backward_npu

+

737

+

upsample_nearest3d.out

+

upsample_nearest3d_out_npu

+

738

+

upsample_nearest3d

+

upsample_nearest3d_npu

+

739

+

upsample_nearest3d_backward.grad_input

+

upsample_nearest3d_backward_out_npu

+

740

+

upsample_nearest3d_backward

+

upsample_nearest3d_backward_npu

+

741

+

sigmoid_backward.grad_input

+

sigmoid_backward_out_npu

+

742

+

sigmoid_backward

+

sigmoid_backward_npu

+

743

+

tanh_backward.grad_input

+

tanh_backward_out_npu

+

744

+

tanh_backward

+

tanh_backward_npu

+

745

+

slow_conv_transpose2d.out

+

slow_conv_transpose2d_out_npu

+

746

+

slow_conv_transpose2d

+

slow_conv_transpose2d_npu

+

747

+

slow_conv_transpose2d_backward.grad_output

+

slow_conv_transpose2d_backward_out_npu

+

748

+

slow_conv_transpose2d_backward.output_mask

+

slow_conv_transpose2d_backward_npu

+

749

+

thnn_conv2d.out

+

thnn_conv2d_out_npu

+

750

+

thnn_conv2d

+

thnn_conv2d_npu

+

751

+

thnn_conv2d_forward.output

+

thnn_conv2d_forward_out_npu

+

752

+

thnn_conv2d_forward

+

thnn_conv2d_forward_npu

+

753

+

thnn_conv2d_backward.output_mask

+

thnn_conv2d_backward_npu

+

754

+

thnn_conv_depthwise2d.out

+

thnn_conv_depthwise2d_out_npu

+

755

+

thnn_conv_depthwise2d

+

thnn_conv_depthwise2d_npu

+

756

+

thnn_conv_depthwise2d_forward.out

+

thnn_conv_depthwise2d_forward_out_npu

+

757

+

thnn_conv_depthwise2d_forward

+

thnn_conv_depthwise2d_forward_npu

+

758

+

thnn_conv_depthwise2d_backward.grad_input

+

thnn_conv_depthwise2d_backward_out_npu

+

759

+

thnn_conv_depthwise2d_backward.output_mask

+

thnn_conv_depthwise2d_backward_npu

+

760

+

slow_conv_dilated2d

+

slow_conv_dilated2d_npu

+

761

+

slow_conv_dilated2d_backward

+

slow_conv_dilated2d_backward_npu

+

762

+

col2im.out

+

im2col_backward_out_npu

+

763

+

col2im

+

im2col_backward_npu

+

764

+

col2im_backward.grad_input

+

col2im_backward_out_npu

+

765

+

col2im_backward

+

col2im_backward_npu

+

766

+

im2col.out

+

im2col_out_npu

+

767

+

im2col

+

im2col_npu

+

768

+

im2col_backward.grad_input

+

im2col_backward_out_npu

+

769

+

im2col_backward

+

im2col_backward_npu

+

770

+

isfinite

+

isfinite_npu

+
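+The table above describes the standard dispatch path: once a tensor lives on the NPU device, calling the native PyTorch API routes to the corresponding `*_npu` kernel in the right-hand column, with `_` in-place and `.out` overloads mapping to the matching `*_npu_` and `*_out_npu` variants. Below is a minimal sketch of this in practice; it assumes an Ascend-adapted PyTorch build in which the `npu` device type, the `Tensor.npu()` method, and the `torch.npu` device-management module are available.
+
+```python
+import torch
+
+# Assumption: an Ascend-adapted PyTorch build with at least one NPU visible.
+torch.npu.set_device("npu:0")
+
+x = torch.randn(2, 3).npu()   # move the tensor onto the NPU
+y = torch.abs(x)              # abs    -> abs_npu    (row 3)
+x.abs_()                      # abs_   -> abs_npu_   (row 4)
+z = torch.matmul(x, x.t())    # matmul -> matmul_npu (row 219)
+print(y.device, z.device)     # both report npu:0
+```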

+## PyTorch Ascend Custom Operators

+| No. | PyTorch Operator (Developed by Ascend) | Ascend Adapted Operator |
+| ---- | ---- | ---- |
+| 1 | npu_convolution_transpose | npu_convolution_transpose |
+| 2 | npu_conv_transpose2d | convolution_transpose_npu |
+| 3 | npu_convolution_transpose_backward | convolution_transpose_backward_npu |
+| 4 | npu_convolution | npu_convolution |
+| 5 | npu_convolution_backward | npu_convolution_backward |
+| 6 | npu_conv2d | conv2d_npu |
+| 7 | npu_conv2d.out | conv2d_out_npu |
+| 8 | npu_conv2d_backward | conv2d_backward_npu |
+| 9 | npu_conv3d | conv3d_npu |
+| 10 | npu_conv3d.out | conv3d_out_npu |
+| 11 | npu_conv3d_backward | conv3d_backward_npu |
+| 12 | one_ | one_npu_ |
+| 13 | npu_sort_v2.out | sort_without_indices_out_npu |
+| 14 | npu_sort_v2 | sort_without_indices_npu |
+| 15 | npu_format_cast | format_cast_npu |
+| 16 | npu_format_cast_.acl_format | format_cast_npu_ |
+| 17 | npu_format_cast_.src | format_cast_npu_ |
+| 18 | npu_transpose_to_contiguous | transpose_to_contiguous_npu |
+| 19 | npu_transpose | transpose_npu |
+| 20 | npu_transpose.out | transpose_out_npu |
+| 21 | npu_broadcast | broadcast_npu |
+| 22 | npu_broadcast.out | broadcast_out_npu |
+| 23 | npu_dtype_cast | dtype_cast_npu |
+| 24 | npu_dtype_cast_.Tensor | dtype_cast_npu_ |
+| 25 | npu_roi_alignbk | roi_align_backward_npu |
+| 26 | empty_with_format | empty_with_format_npu |
+| 27 | empty_with_format.names | empty_with_format_npu |
+| 28 | copy_memory_ | copy_memory_npu_ |
+| 29 | npu_one_hot | one_hot_npu |
+| 30 | npu_stride_add | stride_add_npu |
+| 31 | npu_softmax_cross_entropy_with_logits | softmax_cross_entropy_with_logits_npu |
+| 32 | npu_softmax_cross_entropy_with_logits_backward | softmax_cross_entropy_with_logits_backward_npu |
+| 33 | npu_ps_roi_pooling | ps_roi_pooling_npu |
+| 34 | npu_ps_roi_pooling_backward | ps_roi_pooling_backward_npu |
+| 35 | npu_roi_align | roi_align_npu |
+| 36 | npu_nms_v4 | nms_v4_npu |
+| 37 | npu_lstm | lstm_npu |
+| 38 | npu_lstm_backward | lstm_backward_npu |
+| 39 | npu_iou | iou_npu |
+| 40 | npu_ptiou | ptiou_npu |
+| 41 | npu_nms_with_mask | nms_with_mask_npu |
+| 42 | npu_pad | pad_npu |
+| 43 | npu_bounding_box_encode | bounding_box_encode_npu |
+| 44 | npu_bounding_box_decode | bounding_box_decode_npu |
+| 45 | npu_gru | gru_npu |
+| 46 | npu_gru_backward | gru_backward_npu |
+| 47 | npu_set_.source_Storage_storage_offset_format | set_npu_ |
+| 48 | npu_random_choice_with_mask | random_choice_with_mask_npu |
+| 49 | npu_batch_nms | batch_nms_npu |
+| 50 | npu_slice | slice_npu |
+| 51 | npu_slice.out | slice_out_npu |
+| 52 | npu_dropoutV2 | dropout_v2_npu |
+| 53 | npu_dropoutV2_backward | dropout_v2_backward_npu |
+| 54 | _npu_dropout | _dropout_npu |
+| 55 | _npu_dropout_inplace | _dropout_npu_inplace |
+| 56 | npu_dropout_backward | dropout_backward_npu |
+| 57 | npu_indexing | indexing_npu |
+| 58 | npu_indexing.out | indexing_out_npu |
+| 59 | npu_ifmr | ifmr_npu |
+| 60 | npu_max.dim | max_v1_npu |
+| 61 | npu_max.names_dim | max_v1_npu |
+| 62 | npu_scatter | scatter_npu |
+| 63 | npu_max_backward | max_backward_npu |
+| 64 | npu_apply_adam | apply_adam_npu |
+| 65 | npu_layer_norm_eval | layer_norm_eval_npu |
+| 66 | npu_alloc_float_status | alloc_float_status_npu |
+| 67 | npu_get_float_status | get_float_status_npu |
+| 68 | npu_clear_float_status | clear_float_status_npu |
+| 69 | npu_confusion_transpose | confusion_transpose_npu |
+| 70 | npu_confusion_transpose_backward | confusion_transpose_backward_npu |
+| 71 | npu_bmmV2 | bmm_v2_npu |
+| 72 | fast_gelu | fast_gelu_npu |
+| 73 | fast_gelu_backward | fast_gelu_backward_npu |
+| 74 | npu_sub_sample | sub_sample_npu |
+| 75 | npu_deformable_conv2d | deformable_conv2d_npu |
+| 76 | npu_deformable_conv2dbk | deformable_conv2d_backward_npu |
+| 77 | npu_mish | mish_npu |
+| 78 | npu_anchor_response_flags | anchor_response_flags_npu |
+| 79 | npu_yolo_boxes_encode | yolo_boxes_encode_npu |
+| 80 | npu_grid_assign_positive | grid_assign_positive_npu |
+| 81 | npu_mish_backward | mish_backward_npu |
+| 82 | npu_normalize_batch | normalize_batch_npu |
+| 83 | npu_masked_fill_range | masked_fill_range_npu |
+
diff --git a/docs/zh/RELEASENOTE/RELEASENOTE.md b/docs/zh/RELEASENOTE/RELEASENOTE.md index 559b2b09b3b907bcf26d01d2a343da946506ba6c..485e1989729a6f8d15cf5c2fca486de6b9971279 100644 --- a/docs/zh/RELEASENOTE/RELEASENOTE.md +++ b/docs/zh/RELEASENOTE/RELEASENOTE.md @@ -1,4 +1,4 @@ -# PyTorch版本说明书 +# PyTorch版本说明书 2.0.2 - [用户须知](#用户须知.md) - [新增特性](#新增特性.md) - [特性修改](#特性修改.md)