apply_ada_max_cpu_kernel.h
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADA_MAX_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADA_MAX_CPU_KERNEL_H_
#include <map>
#include <memory>
#include <vector>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"
namespace mindspore {
namespace kernel {
class ApplyAdaMaxCpuKernelMod : public NativeCpuKernelMod {
public:
ApplyAdaMaxCpuKernelMod() = default;
~ApplyAdaMaxCpuKernelMod() override = default;
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
int Resize(
const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost = std::map<uint32_t, tensor::TensorPtr>()) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) override;
protected:
std::vector<KernelAttr> GetOpSupport() override {
static std::vector<KernelAttr> support_list = {KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)};
return support_list;
}
private:
TypeId dtype_{kTypeUnknown};
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADA_MAX_CPU_KERNEL_H_
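For orientation, the element-wise update that LaunchKernel in the .cc file below implements is the AdaMax rule, written out here from the loop body (beta1_power corresponds to \beta_1^{t}, the accumulated power of beta1):

\begin{aligned}
m_i &\leftarrow \beta_1 m_i + (1 - \beta_1)\, g_i \\
v_i &\leftarrow \max(\beta_2 v_i,\ |g_i|) \\
var_i &\leftarrow var_i - \frac{lr}{1 - \beta_1^{t}} \cdot \frac{m_i}{v_i + \epsilon}
\end{aligned}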
apply_ada_max_cpu_kernel.cc
#include <algorithm>
#include "plugin/device/cpu/kernel/apply_ada_max_cpu_kernel.h"
#include "plugin/device/cpu/kernel/nnacl/errorcode.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/adam_fp32.h"
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace {
const size_t kZero = 0;
const size_t kOne = 1;
const size_t kTwo = 2;
constexpr size_t kScalarIndex = 0;
constexpr size_t kIndexVar = 0;
constexpr size_t kIndexM = 1;
constexpr size_t kIndexV = 2;
constexpr size_t kIndexBeta1Power = 3;
constexpr size_t kIndexLr = 4;
constexpr size_t kIndexBeta1 = 5;
constexpr size_t kIndexBeta2 = 6;
constexpr size_t kIndexEpsilon = 7;
constexpr size_t kIndexGrad = 8;
constexpr size_t kApplyAdaMaxInputsNum = 9;
constexpr size_t kApplyAdaMaxOutputsNum = 3;
} // namespace
namespace mindspore {
namespace kernel {
bool ApplyAdaMaxCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->name();
dtype_ = inputs[0]->GetDtype();
return true;
}
int ApplyAdaMaxCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
int ret = 0;
if ((ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost)) != 0) {
return ret;
}
return ret;
}
bool ApplyAdaMaxCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyAdaMaxInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kApplyAdaMaxOutputsNum, kernel_name_);
if (inputs[kIndexVar]->size != inputs[kIndexM]->size) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dtype and shape of 'm' and 'var' must be the same, but got the memory size of 'm': "
<< inputs[kIndexM]->size << " and 'var': " << inputs[kIndexVar]->size;
}
if (inputs[kIndexVar]->size != inputs[kIndexV]->size) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dtype and shape of 'v' and 'var' must be the same, but got the memory size of 'v': "
<< inputs[kIndexV]->size << " and 'var': " << inputs[kIndexVar]->size;
}
if (inputs[kIndexVar]->size != inputs[kIndexGrad]->size) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dtype and shape of 'grad' and 'var' must be the same, "
"but got the memory size of 'grad': "
<< inputs[kIndexGrad]->size << " and 'var': " << inputs[kIndexVar]->size;
}
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else {
MS_EXCEPTION(TypeError) << "For '" << kernel_name_ << "', input dtype only support float16 and float32, but got ["
<< dtype_ << "].";
}
return true;
}
template <typename T>
void ApplyAdaMaxCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
T *var = reinterpret_cast<T *>(inputs[kIndexVar]->addr);
T *m = reinterpret_cast<T *>(inputs[kIndexM]->addr);
T *v = reinterpret_cast<T *>(inputs[kIndexV]->addr);
T beta1_power = static_cast<T>(reinterpret_cast<float *>(inputs[kIndexBeta1Power]->addr)[kScalarIndex]);
T lr = static_cast<T>(reinterpret_cast<float *>(inputs[kIndexLr]->addr)[kScalarIndex]);
T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[kIndexBeta1]->addr)[kScalarIndex]);
T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[kIndexBeta2]->addr)[kScalarIndex]);
T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[kIndexEpsilon]->addr)[kScalarIndex]);
T *grad = reinterpret_cast<T *>(inputs[kIndexGrad]->addr);
auto one = static_cast<T>(1);
if (beta1_power == one) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the 'beta1_power' can't be set 1.";
}
// multithreading
size_t length = inputs[kZero]->size / sizeof(T);
auto task = [this, &var, &m, &v, &beta1_power, &lr, &beta1, &beta2, &epsilon, &grad](size_t start, size_t end) {
T one = static_cast<T>(1.0);
for (size_t i = start; i < end; i++) {
m[i] = static_cast<T>(beta1 * m[i] + (one - beta1) * grad[i]);
auto zero = static_cast<T>(0);
auto grad_abs = (grad[i] > zero) ? grad[i] : -grad[i];
v[i] = std::max(beta2 * v[i], grad_abs);
var[i] = var[i] - (lr / (one - beta1_power)) * (m[i] / (v[i] + epsilon));
}
};
CPUKernelUtils::ParallelForAutoSearch(task, length, &parallel_search_info_);
// Copy result to output tensor
auto output_var = reinterpret_cast<T *>(outputs[kZero]->addr);
auto output_m = reinterpret_cast<T *>(outputs[kOne]->addr);
auto output_v = reinterpret_cast<T *>(outputs[kTwo]->addr);
auto ret = memcpy_s(output_var, outputs[kZero]->size, var, inputs[kZero]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
ret = memcpy_s(output_m, outputs[kOne]->size, m, inputs[kOne]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
ret = memcpy_s(output_v, outputs[kTwo]->size, v, inputs[kTwo]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, ApplyAdaMax, ApplyAdaMaxCpuKernelMod);
} // namespace kernel
} // namespace mindspore
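For cross-checking the kernel, a compact NumPy mirror of the same update can be handy; the test below inlines this exact computation. This is only an illustrative sketch (the function name and signature are hypothetical, not part of the kernel or the test):

import numpy as np

def ada_max_reference(var, m, v, beta1_power, lr, beta1, beta2, epsilon, grad):
    """NumPy mirror of the per-element update in LaunchKernel, for verification only."""
    m = beta1 * m + (1.0 - beta1) * grad
    v = np.maximum(beta2 * v, np.abs(grad))
    var = var - (lr / (1.0 - beta1_power)) * (m / (v + epsilon))
    return var, m, v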
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P
import mindspore.common.dtype as mstype
class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.apply_ada_max = P.ApplyAdaMax()
        self.var = Parameter(Tensor(np.array([[0.6, 0.4],
                                              [0.1, 0.5]]).astype(np.float32)), name="var")
        self.m = Parameter(Tensor(np.array([[0.6, 0.5],
                                            [0.2, 0.6]]).astype(np.float32)), name="m")
        self.v = Parameter(Tensor(np.array([[0.9, 0.1],
                                            [0.7, 0.8]]).astype(np.float32)), name="v")

    def construct(self, beta1_power, lr, beta1, beta2, epsilon, grad):
        out = self.apply_ada_max(self.var, self.m, self.v, beta1_power, lr, beta1, beta2, epsilon, grad)
        return out
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_cpu
@pytest.mark.env_onecard
def test_apply_ada_max():
    """
    Feature: ApplyAdaMax Operator on CPU
    Description: Test ApplyAdaMax Operator
    Expectation: Consistent with the results calculated using numpy
    """
    # ms
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    net = Net()
    beta1_power = Tensor(0.9, mstype.float32)
    lr = Tensor(0.001, mstype.float32)
    beta1 = Tensor(0.9, mstype.float32)
    beta2 = Tensor(0.99, mstype.float32)
    epsilon = Tensor(1e-10, mstype.float32)
    grad = Tensor(np.array([[0.3, 0.7], [0.1, 0.8]]).astype(np.float32))
    output = net(beta1_power, lr, beta1, beta2, epsilon, grad)
    # numpy
    np_var = np.array([[0.6, 0.4], [0.1, 0.5]])
    np_m = np.array([[0.6, 0.5], [0.2, 0.6]])
    np_v = np.array([[0.9, 0.1], [0.7, 0.8]])
    np_beta1_power = 0.9
    np_lr = 0.001
    np_beta1 = 0.9
    np_beta2 = 0.99
    np_epsilon = 1e-10
    np_grad = np.array([[0.3, 0.7], [0.1, 0.8]])
    np_m = np_beta1 * np_m + (1.0 - np_beta1) * np_grad
    np_v = np.maximum(np_beta2 * np_v, abs(np_grad))
    np_var = np_var - (np_lr / (1 - np_beta1_power)) * (np_m / (np_v + np_epsilon))
    ms_m = output[1].asnumpy()
    ms_v = output[2].asnumpy()
    ms_var = output[0].asnumpy()
    eps = np.array([1e-6 for i in range(4)]).reshape(2, 2)
    assert np.all(abs(np_m - ms_m) < eps)
    assert np.all(abs(np_v - ms_v) < eps)
    assert np.all(abs(np_var - ms_var) < eps)
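As a quick arithmetic sanity check (not part of the test), the element at [0][0] works out as follows with the values above:

m00 = 0.9 * 0.6 + (1 - 0.9) * 0.3                           # = 0.57
v00 = max(0.99 * 0.9, abs(0.3))                             # = 0.891
var00 = 0.6 - (0.001 / (1 - 0.9)) * (m00 / (v00 + 1e-10))   # ~= 0.59360

These values should match ms_m[0][0], ms_v[0][0] and ms_var[0][0] within the 1e-6 tolerance used by the asserts.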