From 90647cc225b39c6ba41fd69276d2b835826f2ab2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AE=A6=E6=99=93=E7=8E=B2?= <3174348550@qq.com>
Date: Tue, 18 Nov 2025 14:42:12 +0800
Subject: [PATCH] add en files

---
 docs/mindspore/source_en/features/amp.md     |  81 +++++++++
 docs/mindspore/source_en/features/index.rst  |   2 +
 docs/mindspore/source_en/features/mint.md    | 165 +++++++++++++++++++
 docs/mindspore/source_zh_cn/features/mint.md |  20 +--
 4 files changed, 258 insertions(+), 10 deletions(-)
 create mode 100644 docs/mindspore/source_en/features/amp.md
 create mode 100644 docs/mindspore/source_en/features/mint.md

diff --git a/docs/mindspore/source_en/features/amp.md b/docs/mindspore/source_en/features/amp.md
new file mode 100644
index 0000000000..eca148c34d
--- /dev/null
+++ b/docs/mindspore/source_en/features/amp.md
@@ -0,0 +1,81 @@
# Automatic Mixed Precision

[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_en/features/amp.md)

Mixed precision training is a strategy in which different numerical precisions are used for different operations of a neural network during training. Some operations are insensitive to numerical precision, and using lower precision for them (such as conv and matmul) delivers significant acceleration. Operations whose input and output values differ greatly in magnitude (such as log and softmax) require higher precision to ensure correct results.

## Mechanism

Floating-point data types include double-precision (FP64), single-precision (FP32), half-precision (FP16), and brain floating point (BF16). Each of them is represented by a sign bit, exponent bits, and fraction (mantissa) bits. FP64 uses 8 bytes (64 bits), FP32 uses 4 bytes (32 bits), and FP16 and BF16 each use 2 bytes (16 bits). For details, see the following figure.

![fp16-vs-fp32](./images/fp16_vs_fp32.png)

As shown in the figure, the storage space of FP16 is half of that of FP32, and the storage space of FP32 is in turn half of that of FP64. Therefore, low-precision computing offers the following advantages:

- Reduced memory usage: The bit width of FP16 or BF16 is half of that of FP32, so the memory occupied by parameters such as weights is also halved. The saved memory can be used to store larger network models or train more data.
- Higher computing efficiency: On dedicated AI acceleration chips, such as Huawei Atlas training series products and Atlas 200/300/500 inference series products, or NVIDIA Volta-architecture GPUs, FP16 and BF16 deliver faster operation performance than FP32.
- Higher communication efficiency: For distributed training, especially LLM training, the communication overhead restricts the overall performance. Using a smaller communication bit width improves communication performance, reduces waiting time, and accelerates data flow.

However, low-precision computing also encounters the following problems:

- Data overflow: The valid data range of FP16 is $[5.9\\times10^{-8},65504]$, and that of FP32 is $[1.4\\times10^{-45},1.7\\times10^{38}]$. The valid range of FP16 is much narrower than that of FP32, so replacing FP32 with FP16 can lead to overflow or underflow.
In deep learning, training relies on the gradients (first-order derivatives) of the weights, and these gradients are typically much smaller in magnitude than the weights themselves, so underflow occurs frequently.
- Rounding error: FP32 can represent small backward gradients accurately. After switching to FP16, however, values that fall between representable FP16 numbers cannot be expressed exactly and are forcibly rounded, introducing rounding error. For example, `0.00006666666` can be represented properly in FP32, but it becomes `0.000067` in FP16. Any value finer than the minimum representable interval of FP16 is rounded off. (A short numeric illustration of this rounding and overflow behavior is given at the end of this page.)

Therefore, when using mixed precision to accelerate training and save memory, the problems introduced by low precision must be addressed. Mixed precision is generally combined with loss scaling: the loss value is scaled up by a certain factor when it is computed, so that, by the chain rule, the gradients are scaled up as well; the gradients are then scaled back down by the same factor before the optimizer updates the weights, which avoids underflow.

The following figure shows the typical mixed-precision computing process.

![mix precision](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/images/mix_precision_fp16.png)

## Mixed Precision Usage Examples

```python
import mindspore
from mindspore import amp

# Net, Loss, Optimizer, and Dataset below are user-defined placeholders
# (network, loss function, optimizer, and dataset).
loss_scaler = amp.DynamicLossScaler(scale_value=1024, scale_factor=2, scale_window=1000)
ori_model = Net()
# Enable automatic mixed precision.
model = amp.auto_mixed_precision(ori_model, amp_level="auto", dtype=mindspore.float16)
loss_fn = Loss()
optimizer = Optimizer()

# Define the forward function: compute the logits and the scaled loss.
def forward_fn(data, label):
    logits = model(data)
    loss = loss_fn(logits, label)
    # Scale up the loss.
    loss = loss_scaler.scale(loss)
    return loss, logits

# Generate a gradient function that returns both the forward result and the gradients of the given function.
grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params(), has_aux=True)

# Build a training function.
def train_step(data, label):
    (loss, _), grads = grad_fn(data, label)
    # Scale the loss back down to its actual value.
    loss = loss_scaler.unscale(loss)
    # Check whether the gradients are finite (no overflow occurred).
    is_finite = amp.all_finite(grads)
    if is_finite:
        # If the gradients did not overflow, scale them back down to their actual values.
        # After unscaling, operations such as gradient clipping or gradient penalty can be applied.
        grads = loss_scaler.unscale(grads)
        # Update the model parameters using the optimizer.
        optimizer(grads)
    # Dynamically update the value of loss_scaler.
    loss_scaler.adjust(is_finite)
    return loss

# Build a data iterator.
train_dataset = Dataset()
train_dataset_iter = train_dataset.create_tuple_iterator()

for epoch in range(epochs):
    for data, label in train_dataset_iter:
        # Start training and obtain the loss.
        loss = train_step(data, label)
```

For more details about automatic mixed precision, see [amp.auto_mixed_precision](https://www.mindspore.cn/docs/en/master/api_python/amp/mindspore.amp.auto_mixed_precision.html).
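To make the rounding and overflow behavior described in the Mechanism section concrete, the following minimal sketch casts a few values to IEEE binary16 (the same format as FP16) using NumPy. It is only an illustration and is independent of the training example above; the exact printed values may vary slightly by NumPy version.

```python
import numpy as np

# A value FP32 can hold but FP16 cannot represent exactly:
# it is rounded to the nearest representable FP16 value.
print(np.float16(0.00006666666))

# A value above the FP16 maximum (65504) overflows to infinity.
print(np.float16(70000.0))

# A value below the FP16 minimum (~5.9e-08) underflows to zero.
print(np.float16(1e-9))
```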
diff --git a/docs/mindspore/source_en/features/index.rst b/docs/mindspore/source_en/features/index.rst
index 1b9f7d6041..3484e5a43c 100644
--- a/docs/mindspore/source_en/features/index.rst
+++ b/docs/mindspore/source_en/features/index.rst
@@ -15,5 +15,7 @@ Developer Notes
    runtime/memory_manager
    runtime/multilevel_pipeline
    runtime/multistream_concurrency
+   mint
    view
    data_engine
+   amp
diff --git a/docs/mindspore/source_en/features/mint.md b/docs/mindspore/source_en/features/mint.md
new file mode 100644
index 0000000000..e540575049
--- /dev/null
+++ b/docs/mindspore/source_en/features/mint.md
@@ -0,0 +1,165 @@
# Introduction to mint API

[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_en/features/mint.md)

## Overview

With the introduction of aclnn operators in CANN, existing MindSpore APIs such as ops and nn require adaptation and optimization. To preserve the behavior of the existing APIs while ensuring compatibility, we have created a new API directory for this purpose. The name "mint" for this directory draws inspiration from the "Linux is not Unix" naming convention. Under mindspore.mint, common PyTorch-like APIs are provided for tensor creation, computation, neural networks, communication, and more. This article primarily introduces the scope of support for the mint APIs and the differences in their input parameters. This set of APIs mainly covers tensor creation, random sampling, mathematical computation, neural networks, and cluster communication.

### Tensor Creation

Let's examine the key differences using the `empty` API:

| torch.empty | mindspore.mint.empty | Explanation |
|:---: | :---: | :---:|
| `*size` (int...) | `*size` (int...) | Required |
| `dtype` | `dtype` | Optional |
| `device` | `device` | Optional |
| `layout` | - | Optional |
| `requires_grad` | - | Optional |
| `pin_memory` | - | Optional |
| `memory_format` | - | Optional |
| `out` | - | Optional |

#### Description of Currently Unsupported Parameters

- `layout`: When torch creates a tensor, the default layout is strided (torch.strided), i.e., a dense tensor. MindSpore also creates dense tensors by default, identical to torch, so developers do not need to set this.
- `memory_format`: The default memory layout for tensors is NCHW. Torch additionally provides the channels_last format (NHWC), which may offer performance improvements in certain scenarios, although developers should test and verify its generalizability and compatibility themselves. When developing with MindSpore, this parameter does not need to be set.
- `requires_grad`: Due to differences in the frameworks' automatic differentiation mechanisms, MindSpore does not include this attribute on its Tensor. To control whether gradients are computed, the commonly used `Parameter` class provides this parameter. If gradient computation is unnecessary, refer to [mindspore.ops.stop_gradient](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.stop_gradient.html).
- `pin_memory`: Allocates the returned tensor in pinned memory. Support for this feature is planned and scheduled for release in version 2.7.1.
- `out`: Specifies the output tensor, used for in-place writes and memory optimization. When `out` is provided, the result of the operation is written directly into the given tensor instead of a newly created one. Support for this parameter is currently not planned (see the migration sketch below).
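As a hedged illustration of how code that relies on torch's `out=` keyword can typically be migrated (the `result` name below is only an example, not part of either API):

```diff
- import torch
+ import mindspore

- result = torch.empty(3)
- torch.mul(torch.ones(3), 50, out=result)
+ # mint APIs return a new tensor; assign the return value instead of passing `out`.
+ result = mindspore.mint.mul(mindspore.mint.ones(3), 50)
```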
**Code Example**:

```diff
- import torch
+ import mindspore

- x = torch.empty(2, 3, dtype=torch.float32)
+ x = mindspore.mint.empty(2, 3, dtype=mindspore.float32)
```

Summary: The optional tensor-related parameters differ because of the frameworks' implementation mechanisms. We will continue to refine them based on developer feedback, for example the planned enhancement of tensor storage capabilities.

### Random Sampling

Take `bernoulli` as an example:

| torch.bernoulli | mindspore.mint.bernoulli | Explanation |
|:---: | :---: | :---:|
| `input` (Tensor) | `input` (Tensor) | Required |
| `generator` | `generator` | Optional |
| `out` | - | Optional |

For the difference in the `out` parameter, refer to the Tensor Creation section.

**Code Example**:

```diff
- import torch
+ import mindspore.mint

- a = torch.ones(3, 3)
+ a = mindspore.mint.ones(3, 3)

- torch.bernoulli(a)
+ mindspore.mint.bernoulli(a)
```

### Mathematical Calculation

All basic arithmetic operations are supported. For example, the multiplication operation:

| torch.mul | mindspore.mint.mul | Explanation |
|:---: | :---: | :---:|
| `input` (Tensor) | `input` (Tensor) | Required |
| `other` (Tensor or Number) | `other` (Tensor or Number) | Required |
| `out` | - | Optional |

The parameters that computational operators do not yet support are similar to those for tensor creation, which again relates to the underlying tensor implementation mechanism. For example, `out`:

**Code Example**:

```diff
- import torch
+ import mindspore.mint

- a = torch.randn(3)
+ a = mindspore.mint.randn(3)

- torch.mul(a, 50)
+ mindspore.mint.mul(a, 50)
```

### Neural Network

Common nn classes, such as Conv2d, share identical parameters:

| torch.nn.Conv2d | mindspore.mint.nn.Conv2d | Explanation |
|:---: | :---: | :---:|
| `in_channels` (int) | `in_channels` (int) | Required |
| `out_channels` (int) | `out_channels` (int) | Required |
| `kernel_size` (int or tuple) | `kernel_size` (int or tuple) | Required |
| `stride` (int or tuple) | `stride` (int or tuple) | Optional |
| `padding` (int, tuple or str) | `padding` (int, tuple or str) | Optional |
| `padding_mode` (str) | `padding_mode` (str) | Optional |
| `dilation` (int or tuple) | `dilation` (int or tuple) | Optional |
| `groups` (int) | `groups` (int) | Optional |
| `bias` (bool) | `bias` (bool) | Optional |

**Code Example**:

```diff
- import torch
+ import mindspore

in_channels = 16
out_channels = 33
kernel_size = (3, 5)
stride = (2, 1)
padding = (4, 2)
dilation = (3, 1)

- input = torch.rand(20, 16, 50, 100)
+ input = mindspore.mint.rand(20, 16, 50, 100)

- model = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)
+ model = mindspore.mint.nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)

output = model(input)
```
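As a quick sanity check on the example above, the output spatial size follows from the standard convolution formula; the following minimal sketch is added here for illustration and is not part of the original example:

```python
# H_out = floor((H + 2*padding_h - dilation_h*(kernel_h - 1) - 1) / stride_h) + 1
# For the values above: floor((50 + 8 - 6 - 1) / 2) + 1 = 26
# W_out = floor((100 + 4 - 4 - 1) / 1) + 1 = 100
print(output.shape)  # expected: (20, 33, 26, 100)
```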
Functions containing the `inplace` parameter are not yet fully supported. For example:

| API | Args |
| :--- | :--- |
| torch.nn.functional.dropout2d | input, p=0.5, training=True, inplace=False |
| mindspore.mint.nn.functional.dropout2d | input, p=0.5, training=True |

Parameters deprecated in torch are not supported, for example:

| torch.nn.MSELoss | Deprecated or Not | mindspore.mint.nn.MSELoss | Explanation |
|:---: | :--- | :---: | :---:|
| `size_average` | Yes | N/A | Not supported |
| `reduce` | Yes | N/A | Not supported |
| `reduction` | No | `reduction` | Supported |

### Cluster Communication

Common operations such as `all_gather`, `all_reduce`, and `all_to_all` are supported, with consistent parameters. For example:

| torch.distributed.all_gather | mindspore.mint.distributed.all_gather | Explanation |
|:---: | :---: | :---:|
| `tensor_list` (list[Tensor]) | `tensor_list` (list[Tensor]) | Required |
| `tensor` (Tensor) | `tensor` (Tensor) | Required |
| `group` (ProcessGroup) | `group` (ProcessGroup) | Optional |
| `async_op` (bool) | `async_op` (bool) | Optional |

| torch.distributed.all_reduce | mindspore.mint.distributed.all_reduce | Explanation |
|:---: | :---: | :---:|
| `tensor` (Tensor) | `tensor` (Tensor) | Required |
| `op` | `op` | Optional |
| `group` (ProcessGroup) | `group` (ProcessGroup) | Optional |
| `async_op` (bool) | `async_op` (bool) | Optional |

For more API support details, please refer to the [mint support list](https://www.mindspore.cn/docs/en/master/api_python/mindspore.mint.html).
diff --git a/docs/mindspore/source_zh_cn/features/mint.md b/docs/mindspore/source_zh_cn/features/mint.md
index 9cc38148fe..a8aa8044c6 100644
--- a/docs/mindspore/source_zh_cn/features/mint.md
+++ b/docs/mindspore/source_zh_cn/features/mint.md
@@ -16,18 +16,18 @@
 | `dtype` | `dtype` | 可选 |
 | `device` | `device` | 可选 |
 | `layout` | 无 | 可选 |
-`requires_grad` | 无 | 可选 |
+|`requires_grad` | 无 | 可选 |
 | `pin_memory` | 无 | 可选 |
 |`memory_format` | 无 | 可选 |
 | `out` | 无 | 可选 |

 #### 当前不支持的参数说明

-- `layout`: 创建torch tensor时,一般默认layout是stride,即dense tensor。mindspore创建tensor时,默认是dense tensor,与torch 无差异。开发者无需设置。
-- `memory_format`: tensor的内存排布,默认都是NCHW格式。torch 提供channel_last格式即NHWC,在一些场景中,这样会有性能提升,但是泛化性和兼容性需要开发者实际测试和验证。使用mindspore开发,可不设置此参数。
-- `requires_grad`: 由于框架自动微分求导机制不同,mindspore在tensor的属性中没有设置此参数。对于是否需要计算梯度,常用的parameter类提供了此参数。如果无需计算梯度,可参考[mindspore.ops.stop_gradient](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.stop_gradient.html)。
-- `pin_memory`: 返回的tensor被分配到pinned memory,我们已经规划支持此功能。计划在2.7.1版本推出。
-- `out`: 指定输出张量,用于原地操作和内存优化。当提供 `out` 参数时,操作结果会直接写入到指定的张量中,而不是创建新的张量。当前未规划支持此参数。
+- `layout`:创建torch tensor时,一般默认layout是stride,即dense tensor。mindspore创建tensor时,默认是dense tensor,与torch 无差异。开发者无需设置。
+- `memory_format`:tensor的内存排布,默认都是NCHW格式。torch 提供channel_last格式即NHWC,在一些场景中,这样会有性能提升,但是泛化性和兼容性需要开发者实际测试和验证。使用mindspore开发,可不设置此参数。
+- `requires_grad`:由于框架自动微分求导机制不同,mindspore在tensor的属性中没有设置此参数。对于是否需要计算梯度,常用的parameter类提供了此参数。如果无需计算梯度,可参考[mindspore.ops.stop_gradient](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.stop_gradient.html)。
+- `pin_memory`:返回的tensor被分配到pinned memory,我们已经规划支持此功能。计划在2.7.1版本推出。
+- `out`:指定输出张量,用于原地操作和内存优化。当提供 `out` 参数时,操作结果会直接写入到指定的张量中,而不是创建新的张量。当前未规划支持此参数。

 **代码示例**:

@@ -51,7 +51,7 @@
 | `generator` | `generator`| 可选 |
 | `out` | 无 | 可选 |

-out参数差异参考张量创建
+out参数差异参考张量创建。

 **代码示例**:

@@ -101,7 +101,7 @@ out参数差异参考张量创建
 | `out_channels`(int) | `out_channels`(int) | 必选 |
 | `kernel_size` (int or tuple) | `kernel_size` (int or tuple) | 必选 |
 | `stride` (int or tuple) | `stride` (int or tuple) | 可选 |
-| `padding`(int,tuple or str) | `padding`(int,tuple or str) | 可选 |
+| `padding`(int, tuple or str) | `padding`(int, tuple or str) | 可选 |
 | `padding_mode` (str) | `padding_mode` (str) | 可选 |
 | `dilation`(int or tuple) | `dilation`(int or tuple) | 可选 |
 | `groups` (int) | `groups` (int) | 可选 |
@@ -134,7 +134,7 @@ output = model(input)
 | API | Args |
 | :--- | :--- |
 | torch.nn.functional_dropout2d | input, p=0.5, training=True, inplace=False |
-| mindspore.mint.nn.functional_dropout2d | input, p=0.5, training=True
+| mindspore.mint.nn.functional_dropout2d | input, p=0.5, training=True|

 torch废弃的参数,不支持,例如:

@@ -162,4 +162,4 @@ torch废弃的参数,不支持,例如:
 | `group`(ProcessGroup) | `group` (ProcessGroup) | 可选 |
 | `async_op` (bool) | `async_op` (bool) | 可选 |

-更多API支持情况请查阅[mint支持列表](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore.mint.html)。
\ No newline at end of file
+更多API支持情况请查阅[mint支持列表](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore.mint.html)。
--
Gitee