diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 9d63f6e9afa5438464c2234d915b62bd24356146..3ecdb38296607042050562844b6cfa6973e4effc 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,2 +1,2 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202411/20241107/master_20241107010033_82dd840960481006a493ef35a632979cdf4619d6_newest/' \ No newline at end of file + 'https://repo.mindspore.cn/mindspore/mindspore/version/202411/20241126/master_20241126101834_a95514d8ad1bfa5c2cfb72a7142df7d19a638dbd_newest/' \ No newline at end of file diff --git a/mindformers/modules/transformer/transformer.py b/mindformers/modules/transformer/transformer.py index 377481eb63deb2603871d9a3f0db01252ecbdeb6..7df34b200b194e360ada589b39f758b46e571b39 100644 --- a/mindformers/modules/transformer/transformer.py +++ b/mindformers/modules/transformer/transformer.py @@ -1001,7 +1001,10 @@ class LowerTriangularMaskWithDynamic(Cell): self.one = Tensor([1.0], dtype=compute_type) if use_past: if self.is_dynamic: - self.lower_triangle_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1)) + mask_coeff = 1.0 if compute_type is mstype.bfloat16 else -10000.0 + self.lower_triangle_mask = Tensor( + np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1) * mask_coeff, dtype=compute_type + ) else: self.lower_triangle_mask = None else: diff --git a/research/qwen/qwen_model.py b/research/qwen/qwen_model.py index b61090592a9273d7df266c6bce88af9ae732df45..e30ebcd39875d8691a029fa041f266aa8bdacf45 100644 --- a/research/qwen/qwen_model.py +++ b/research/qwen/qwen_model.py @@ -493,7 +493,10 @@ class CausalMaskForQwen(nn.Cell): self.multiply_data = Tensor([-10000.0], dtype=compute_type) self.one = Tensor([1.0], dtype=compute_type) if self.is_dynamic: - self.lower_triangle_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1)) + mask_coeff = 1.0 if compute_type is mstype.bfloat16 else -10000.0 + self.lower_triangle_mask = Tensor( + np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1) * mask_coeff, dtype=compute_type + ) else: self.lower_triangle_mask = Tensor(np.tril(np.ones(shape=(seq_length, seq_length))), mstype.float32) self.shape = P.Shape() diff --git a/research/qwenvl/qwen/qwen_model.py b/research/qwenvl/qwen/qwen_model.py index 995de17a4c9b853c558057398f1c293c63743bc6..a60abd1dba42a60d53b8ed478765b7cd110e8e56 100644 --- a/research/qwenvl/qwen/qwen_model.py +++ b/research/qwenvl/qwen/qwen_model.py @@ -568,7 +568,10 @@ class CausalMaskForQwen(nn.Cell): self.one = Tensor([1.0], dtype=compute_type) if use_past: if self.is_dynamic: - self.lower_triangle_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1)) + mask_coeff = 1.0 if compute_type is mstype.bfloat16 else -10000.0 + self.lower_triangle_mask = Tensor( + np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1) * mask_coeff, dtype=compute_type + ) else: self.lower_triangle_mask = None else: diff --git a/tests/st/test_distri_core/test_mixtral/test_mixtral.py b/tests/st/test_distri_core/test_mixtral/test_mixtral.py index 6ef7b6e2b6e8abf4bee6f34d165a242de88a1ffd..172ab64609853b4d6894015a02c678d2c750c8b5 100644 --- a/tests/st/test_distri_core/test_mixtral/test_mixtral.py +++ b/tests/st/test_distri_core/test_mixtral/test_mixtral.py @@ -223,61 +223,6 @@ class TestMixtral: "please check your code." - @pytest.mark.level0 - @pytest.mark.platform_arm_ascend910b_training - @pytest.mark.env_single - @pytest.mark.run(order=2) - def test_mixtral_pynative_ep2tp2pp2(self): - """ - Feature: test mixtral pynative - Description: run pynative mode mixtral to generate pynative loss - Expectation: test success - """ - os.environ['HCCL_BUFFSIZE'] = "200" - scripts_name = "run_mixtral.py" - device_num = 8 - postfix = "_ep2tp2pp2" - - rm_list = ["npy_pynative*", f"msrun_log_pynative{postfix}*", "kernel_meta*"] - print("") - for rm_path in rm_list: - rm_path = os.path.join(os.getcwd(), rm_path) - print(f"removing {rm_path}") - os.system(f"rm -rf {rm_path}") - - sh_path = os.path.split(os.path.realpath(__file__))[0] - scripts_path = os.path.join(sh_path, scripts_name) - - scripts_cmd = f"{scripts_path} --config_path=./config_mixtral_small.yaml --ep=2 --tp=2 --pp=2 --sp" - cmd = f"msrun --worker_num={device_num} "+\ - f"--local_worker_num={device_num} "+\ - f"--master_port=8119 "+\ - f"--log_dir=msrun_log_pynative{postfix} "+\ - f"--join=True "+\ - f"--cluster_time_out=300 "+\ - f"{scripts_cmd}" - ret = os.system(cmd) - os.system(f"grep -E 'ERROR|error' {sh_path}/msrun_log_pynative{postfix}/worker_0.log -C 3") - assert ret == 0, f"msrun failed, please check msrun_log_pynative{postfix}/worker_*.log" - - # check loss with golden loss - pynative_log_path = f'msrun_log_pynative{postfix}/worker_4.log' - pynative_loss = self.extract_loss_from_log(pynative_log_path) - print(f"pynative_loss are:\n{pynative_loss}") - - golden_loss = [4.1485944, 4.1479816, 4.1473684, 4.146756, 4.146144, - 4.145531, 4.144919, 4.144307, 4.143695, 4.1430836] - - golden_loss = np.array(golden_loss) - print(f"golden_loss are:\n{golden_loss}") - - assert np.allclose(golden_loss, pynative_loss, atol=1.e-4, rtol=1e-4), \ - f"Expect relative error between pynative and golden loss below 1e-4,\n" + \ - f"but got pynative loss:\n{pynative_loss},\n" + \ - f"and golden loss:\n{golden_loss},\n" + \ - "please check your code." - - @pytest.mark.level1 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single