| name | about | labels |
| --- | --- | --- |
| Bug Report | Use this template for reporting a bug | kind/bug |
Device (`Ascend`/`GPU`/`CPU`): Uncomment only one `/device <>` line, hit enter to put that in a new line, and remove leading whitespaces from that line:
/device gpu
test_parallel_switch_layer_dataparallel
```python
import os

import numpy as np

# FakeData, DataParallelMixedprecisionFactory and DynamicLossScaleManager are
# MindSpore / CI test-harness helpers assumed importable in the CI environment.

def test_parallel_switch_layer_dataparallel():
    """TEST_SUMMARY: train network with switch_layer using data_parallel"""
    inputs_np = np.random.randn(128, 3, 2, 2).astype(np.float32)
    # Standalone baseline run.
    standalone_dataset = FakeData(size=32, batch_size=32, image_size=(3, 2, 2), num_classes=12)
    fact = DataParallelMixedprecisionFactory(in_channel=3, out_channel=12, opt_flag="Momentum", level="O2",
                                             loss_scale_manager=DynamicLossScaleManager(), learning_rate_flag="list")
    fact.mindspore_standalone_impl(dataset=standalone_dataset, epoch=1)
    # Data-parallel run across RANK_SIZE devices.
    parallel_dataset = FakeData(size=32, batch_size=4, image_size=(3, 2, 2), num_classes=12, use_parallel=True)
    rank_size = int(os.environ['RANK_SIZE'])
    fact.mindspore_data_parallel_impl(dataset=parallel_dataset, epoch=1, device_num=rank_size)
    # Compare checkpoints from the two runs on the same input.
    fact.checkpoint_cmp(inputs_np=inputs_np)
```
Describe the current behavior: the switch_layer data_parallel case fails during graph compilation with `RuntimeError: First input node of call node is not switch`.
Describe the expected behavior: the case passes.
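For context, here is a minimal sketch of the construct this case exercises, assuming the usual switch_layer pattern of indexing an `nn.CellList` with a Tensor index in graph mode. `SwitchLayerNet` is illustrative only, not the reporter's actual `DataParallelNet` (though the locals dump below shows that net also contains ReLU and ReLU6 branches):

```python
# Illustrative sketch only; not the failing DataParallelNet from this report.
import mindspore.nn as nn

class SwitchLayerNet(nn.Cell):
    """Picks one of several layers at runtime from a Tensor index."""
    def __init__(self):
        super().__init__()
        self.layers = nn.CellList([nn.ReLU(), nn.ReLU6()])

    def construct(self, x, index):
        # In GRAPH_MODE, indexing a CellList with a Tensor index is
        # compiled into a switch_layer control-flow node.
        return self.layers[index](x)
```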
```
    fact = DataParallelMixedprecisionFactory(in_channel=3, out_channel=12, opt_flag="Momentum", level="O2",
                                             loss_scale_manager=DynamicLossScaleManager(), learning_rate_flag="list")
>   fact.mindspore_standalone_impl(dataset=standalone_dataset, epoch=1)

../test_parallel_switch_layer.py:189:
../test_parallel_switch_layer.py:109: in mindspore_standalone_impl
    dataset=dataset)
../../../share/meta.py:315: in _mindspore_standalone_impl
    eval_network=eval_network)
../../../share/meta.py:232: in __mindspore_impl
    eval_network=eval_network)
../test_parallel_switch_layer.py:165: in _model_train_and_save_ckpt
    dataset_sink_mode=self.dataset_sink_mode)
/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/train/model.py:637: in train
    sink_size=sink_size)
/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/train/model.py:431: in _train
    self._train_process(epoch, train_dataset, list_callback, cb_params)
/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/train/model.py:556: in _train_process
    outputs = self._train_network(*next_element)
/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/nn/cell.py:381: in __call__
    out = self.compile_and_run(*inputs)
/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/nn/cell.py:639: in compile_and_run
    self.compile(*inputs)
/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/nn/cell.py:626: in compile
    _executor.compile(self, *inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode)

self = <mindspore.common.api._Executor object at 0x7fe5e80efbd0>
obj = TrainOneStepCell<
  (network): WithLossCell<
    (_backbone): DataParallelNet<
      (relu): ReLU<>
      (relu6): ReL...: SoftmaxCrossEntropyWithLogits<>
  >
  (optimizer): Momentum<
    (learning_rate): _IteratorLearningRate<>
  >
phase = '0train.1625082867569332992', do_convert = True
auto_parallel_mode = False
args = (Tensor(shape=[32, 3, 2, 2], dtype=Float32, value=
[[[[ 1.76405239e+00,  4.00157213e-01],
   [ 9.78738010e-01,  2.2408...000e+00],
  [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]))

    def compile(self, obj, *args, phase='predict', do_convert=True, auto_parallel_mode=False):
        """
        Compiles graph.

        Args:
            obj (Function/Cell): The function or cell instance need compile.
            args (tuple): Function or cell input arguments.
            phase (str): The name of compile phase. Default: 'predict'.
            do_convert (bool): When set to True, convert ME graph to GE graph after compiling graph.
            auto_parallel_mode: When set to True, use auto parallel mode to compile graph.

        Return:
            Str, the full phase of the cell.
            Bool, if the graph has been compiled before, return False, else return True.
        """
        args_names, args_list = _generate_pip_args(obj, *args)
        dic = dict(zip(args_names, args_list))
        key = generate_key(phase, dic)
        obj.phase_prefix = str(key[1])
        if 'export' in phase:
            phase = phase + '.' + obj.phase_prefix + '.' + str(obj.create_time)
        else:
            phase = obj.phase_prefix + phase + '.' + str(obj.create_time)

        if phase in self.compile_cache.keys():
            logger.debug("%r graph has existed.", phase)
            return phase, False

        obj.check_names()
        _check_full_batch()
        self._set_dataset_mode(args_list)

        is_sink_mode = args and isinstance(args[0], Tensor) and args[0].virtual_flag
        if auto_parallel_mode and _need_to_full() and not is_sink_mode and obj.auto_parallel_compile_and_run():
            args_full = _to_full_tensor(args, _get_device_num(), _get_global_rank())
            _, args_list = _generate_pip_args(obj, *args_full)

        enable_debug_runtime = context.get_context("enable_debug_runtime")
        enable_ge = context.get_context("enable_ge")
        use_vm = not enable_ge or (enable_debug_runtime and context.get_context("mode") == context.PYNATIVE_MODE)
>       result = self._executor.compile(obj, args_list, phase, use_vm, self.queue_name)
E       RuntimeError: mindspore/ccsrc/runtime/framework/control_node_parser.cc:882 FetchFrontToFrontParameter] First input node of call node is not switch, node:299_288_277_100_1_construct_wrapper.171:[CNode]127{[0]: [CNode]127, [1]: equivx, [2]: equiv178}
E
E       #

/root/miniconda3/envs/ci3.7/lib/python3.7/site-packages/mindspore/common/api.py:531: RuntimeError
```
Please add labels (comp or sig). To get this issue triaged faster, tag it with a **component (comp) or special interest group (sig)** label; labeled issues are pushed directly to the responsible owner. The full list of labels is at https://gitee.com/mindspore/community/blob/master/sigs/dx/docs/labels.md

Taking a component issue as an example: if you find the problem is caused by the data component, you can comment:

//comp/data

You can also ask the data SIG for help by writing:

//comp/data
//sig/data

If it is a simple issue, you can leave it for newcomers to the community to answer, in which case you can write:

//good-first-issue

Congratulations, you now know how to add labels with commands. Go ahead and add labels in the comments below!
Appearance & Root Cause:
The parser did not handle the scenario where one call node is connected directly to another call node (a call chained to a call), so `FetchFrontToFrontParameter` raised "First input node of call node is not switch".
Fix Solution:
In the switch actor, when a call is followed by another call, directly invoke the funcgraph of the second-level call.
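A rough sketch of the described handling, written as Python-style pseudocode; the actual fix lives in C++ in `mindspore/ccsrc/runtime/framework/control_node_parser.cc`, and every helper name below (`is_switch`, `is_call`, `get_func_graph`, `handle_switch_branches`) is hypothetical, not a MindSpore API:

```python
# Hypothetical pseudocode of the fix described above; illustrative only.
def fetch_front_to_front_parameter(call_node):
    first_input = call_node.inputs[0]
    if is_switch(first_input):
        # Original path: the call is driven by a switch/switch_layer node.
        handle_switch_branches(first_input)
    elif is_call(first_input):
        # New path: a call chained to another call. Instead of raising,
        # descend into the second-level call's funcgraph and parse that.
        inner_graph = get_func_graph(first_input)
        fetch_front_to_front_parameter(inner_graph.output)
    else:
        raise RuntimeError("First input node of call node is not switch")
```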
verified