name | about | labels |
---|---|---|
Bug Report | Use this template for reporting a bug | kind/bug |
分布式训练报如下错误:
Hardware Environment (Ascend/GPU/CPU) / 硬件环境:
Please delete the backend not involved / 请删除不涉及的后端:
/device ascend
Software Environment / 软件环境 (Mandatory / 必填):
-- MindSpore version (e.g.,r1.6 commit_id=xxxx) : 1.5
-- Python version (e.g., Python 3.7.5) : 3.7.5
Execute Mode / 执行模式 (Mandatory / 必填) (PyNative/Graph):
Please delete the mode not involved / 请删除不涉及的模式:
/mode graph (编译模式)
无
import mindspore.nn as nn
from mindspore import ops
from src.model_utils.config import config
class AlexNet(nn.Cell):
    """
    Minimal two-layer dense network used to reproduce the parallel compile error.

    Despite its name, the cell only holds two bias-free ``nn.Dense`` layers
    (128 -> 256 -> 512). The MatMul primitive of each layer is given an
    explicit shard strategy built from ``config.dp`` and ``config.mp``.
    """

    def __init__(self):
        super().__init__()
        # One strategy tuple per MatMul input — presumably (activation, weight);
        # verify against the MindSpore ``shard`` documentation.
        self.fc1 = nn.Dense(128, 256, has_bias=False)
        self.fc1.matmul.shard(((config.dp, 1), (config.mp, 1)))
        self.fc2 = nn.Dense(256, 512, has_bias=False)
        self.fc2.matmul.shard(((config.dp, config.mp), (1, config.mp)))

    def construct(self, x):
        """Apply ``fc1`` then ``fc2`` and reduce axis 0 twice with ``ops.reduce_mean``."""
        x = self.fc1(x)
        x = self.fc2(x)
        # Two successive mean reductions over axis 0 — presumably collapsing a
        # 2-D (batch, 512) output down to a scalar; TODO confirm input rank.
        x = ops.reduce_mean(x, 0)
        x = ops.reduce_mean(x, 0)
        return x
2. 分布式训练脚本:
```python
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
######################## train alexnet example ########################
train alexnet and get network model files(.ckpt) :
python train.py --data_path /YourDataPath
"""
import os
from mindspore.nn import Cell
from mindspore.profiler import Profiler
from mindspore.train.callback import LossMonitor, TimeMonitor, CheckpointConfig, ModelCheckpoint
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id, get_rank_id, get_job_id
from src.dataset import create_dataset_cifar10
from src.generator_lr import get_lr_cifar10, get_lr_imagenet
from src.alexnet import AlexNet
from src.get_param_groups import get_param_groups
import mindspore.nn as nn
from mindspore.communication.management import init, get_rank
from mindspore import dataset as de
from mindspore import context
from mindspore import Tensor, ops
from mindspore.train import Model
from mindspore.context import ParallelMode
from mindspore.common import set_seed
import numpy as np
from termcolor import cprint
# Fix both the MindSpore seed (mindspore.common.set_seed) and the
# mindspore.dataset seed to 1 — presumably so repeated runs are reproducible.
set_seed(1)
de.config.set_seed(1)
class OpsReduceSum(Cell):
    """
    Cell handed to ``Model`` as ``loss_fn`` in ``train_alexnet``.

    NOTE(review): despite the name, no reduction is performed here —
    ``construct`` only forwards both inputs to ``ops.add``.
    """

    def __init__(self):
        super().__init__()

    def construct(self, x, y):
        """Return ``ops.add(x, y)``."""
        return ops.add(x, y)
def modelarts_pre_process():
    """No-op hook passed to ``moxing_wrapper`` as ``pre_process``; always returns ``None``."""
    return None
@moxing_wrapper(pre_process=modelarts_pre_process)
def train_alexnet():
    """
    Build the sharded ``AlexNet`` and compile its training graph once.

    Steps, in order:
      1. Print the config and the device / rank / job ids.
      2. Set a graph-mode Ascend context with graph dumping enabled.
      3. Set semi-auto-parallel mode with ``full_batch=True`` and
         ``global_rank`` fixed to 0, for ``config.dp * config.mp`` devices.
      4. Create the profiler, call ``init()``, and build the network,
         Momentum optimizer, loss-scale manager and ``Model`` (amp level O2).
      5. Call ``compile`` on ``model.train_network`` with all-ones dummy
         inputs, then run ``profiler.analyse()``.

    Only ``compile`` is invoked; no training step is executed even though the
    banner printed before it says "Starting Training".
    """
    print(config)
    cprint(f'device id: {get_device_id()}', on_color='on_red')
    cprint(f'rank id: {get_rank_id()}', on_color='on_red')
    cprint(f'job id: {get_job_id()}', on_color='on_red')

    # The parallel device count is derived from the two shard dimensions.
    device_num = config.dp * config.mp
    # NOTE(review): the dump directory name uses config.device_num, not the
    # locally computed device_num — confirm the two are expected to agree.
    context.set_context(
        mode=context.GRAPH_MODE,
        save_graphs=True,
        device_target='Ascend',
        device_id=get_device_id(),
        save_graphs_path=f"bs{config.batch_size}_dn{config.device_num}_dp{config.dp}_mp{config.mp}",
    )
    print("set save_graphs ....")
    cprint(f"device_num={device_num}, device_id={get_device_id()}", on_color='on_red')

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        device_num=device_num,
        full_batch=True,
        global_rank=0,
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        gradients_mean=True,
    )

    # The profiler is created before init(), in the same order as the original script.
    profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
    init()

    network = AlexNet()
    cprint(network.trainable_params(), on_color='on_red')

    metrics = None
    step_per_epoch = 10000
    lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch))
    opt = nn.Momentum(
        params=network.trainable_params(),
        learning_rate=lr,
        momentum=config.momentum,
    )

    # Kept as a function-local import, as in the original script.
    from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
    if config.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    model = Model(
        network,
        loss_fn=OpsReduceSum(),
        optimizer=opt,
        metrics=metrics,
        amp_level="O2",
        keep_batchnorm_fp32=False,
        loss_scale_manager=loss_scale_manager,
    )

    cprint("============== Starting Training ==============", on_color='on_red')
    # Compile once with dummy inputs: a (batch_size, 128) float32 tensor of
    # ones as data and a single int32 one as the second input.
    model.train_network.compile(
        Tensor(np.ones([config.batch_size, 128]).astype(np.float32)),
        Tensor(np.ones([1]).astype(np.int32)),
    )
    profiler.analyse()
if __name__ == "__main__":
    # Run the compile-only reproduction when the file is executed as a script.
    train_alexnet()
```
3. 执行训练任务
```shell
export RANK_TABLE_FILE=/home/z00448363/mindspore/model_zoo/utils/hccl_tools/hccl_1p_6_0.0.0.22.json
export DEVICE_ID=6
export RANK_ID=0
python train.py --device_id=6 --dp=2 --mp=1
```
### Describe the expected behavior / 预期结果 (Mandatory / 必填)
预期顺利编译成功
### Related log / screenshot / 日志 / 截图 (Mandatory / 必填)
打印了部分结果:
![输入图片说明](https://images.gitee.com/uploads/images/2022/0224/113927_8ecbf9ea_78977.png "屏幕截图.png")
### Special notes for this issue/备注 (Optional / 选填)
Please assign maintainer to check this issue.
请为此issue分配处理人。
@fangwenyi @chengxiaoli
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。
Please add labels (comp or sig), also you can visit https://gitee.com/mindspore/community/blob/master/sigs/dx/docs/labels.md to find more.
为了让代码尽快被审核,请您为Pull Request打上 组件(comp)或兴趣组(sig) 标签,打上标签的PR可以直接推送给责任人进行审核。
更多的标签可以查看https://gitee.com/mindspore/community/blob/master/sigs/dx/docs/labels.md
以组件相关代码提交为例,如果你提交的是data组件代码,你可以这样评论:
//comp/data
当然你也可以邀请data SIG组来审核代码,可以这样写:
//sig/data
另外你还可以给这个PR标记类型,例如是bugfix或者是特性需求:
//kind/bug or //kind/feature
恭喜你,你已经学会了使用命令来打标签,接下来就在下面的评论里打上标签吧!
@stepbystep 可以提供下脚本吗?复现一下报错。
您好,问题已经解决了,我采用的是虚拟编译的方式,该方式无法适用于常量的allreduce算子,在ms1.5版本上报错是正常的,为避免该报错,需要修改ms源码。感谢yaoyifan老哥的支持!
您好,问题已经解决了,我采用的是虚拟编译的方式,该方式无法适用于常量的allreduce算子,在ms1.5版本上报错是正常的,为避免该报错,需要修改ms源码。感谢yaoyifan老哥的支持!
@stepbystep 好的
登录 后才可以发表评论