diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/.gitignore b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/LICENSE b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..09d493bf1fc257505c1336f3f87425568ab9da3c
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/README.md b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..898d4b8c24a4922cc8eb17de45a338a65a31a7f2
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/README.md
@@ -0,0 +1,44 @@
+# Multi-task models
+## Advantages of multi-task learning
+* Compared with a single task, a multi-task model is less prone to overfitting: the loss function is constrained by the losses of several tasks at once, although this also caps the performance of each individual task;
+* Multi-task training is more economical. If we split 3 tasks into 3 independent single-task models, we have to train three models on the dataset, which consumes far more resources and lowers utilization;
+* It mitigates sparsity. The data of a single task usually has a severely imbalanced positive/negative sample ratio, which hurts training. With joint training and a shared bottom, the underlying embeddings can be corrected with the other tasks' data, which relieves the sparsity to some extent;
+* It corrects sample selection bias. Each task has a different goal, so the samples they select are inconsistent: a CTR task uses impression-to-click samples while a CVR task uses click-to-purchase samples, so the CVR samples are only a subset of the CTR samples. Training them independently biases each model, whereas multi-task learning trains in one sample space and makes this bias easier to correct.
+
+## Some issues with multi-task learning
+* The performance of a single task is easily limited: the overall loss is a weighted sum of the task losses, so an individual task's loss may not be trained sufficiently;
+* There is a seesaw effect between tasks, i.e. one task trains very well while another performs poorly or shows no particular improvement;
+* With a joint loss there is no guarantee that the current loss is a weighting of the best achievable loss of each individual task.
+
+## ESMM model
+![avatar](./pic/ESMM.png)
+
+### Model walkthrough
+The model uses a two-tower layout to build the user-side and item-side features separately, embeds and concatenates them, and finally feeds the result into each task's NN tower. The overall structure is simple and clear, so I will not go through every detail; a few points worth noting and optimizing:
+* How the user-field and item-field features are fused is drawn rather vaguely in the figure. My reading: vectorize each user feature separately, e.g. embed each categorical feature into a vector and fuse each sequence feature into one embedding (the fusion is up to your own experiments: max pooling, average pooling and sum are all worth trying), then combine the per-feature embeddings with an element-wise sum to get an overall user embedding; item-side features are handled similarly. Note that numeric features are not covered by this; if there are any, they have to be appended at the concatenate layer.
+* The actual training loss is loss = loss_ctr + loss_ctcvr. In the paper the term to the right of the plus sign is written over the product p_ctr * p_cvr, which is easy to get confused by: CVR cannot be trained on the whole sample space, so a standalone cvr loss cannot be computed at all. The given input is the full sample space, i.e. impression data, so impression-to-click data defines CTR and impression-to-purchase data defines CTCVR; this point needs to be understood clearly (see the sketch below).
+* Points worth trying to optimize: 1. tricks for handling sequence features, such as adding attention; 2. the loss weighting needs to be tuned against actual training results: the positive/negative ratio differs across tasks and so does each task's loss, and if the two losses differ by an order of magnitude, the larger one dominates the optimization and the smaller one is under-trained.
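+
+A minimal sketch of this joint loss (illustrative only, not the training code in this repo; `ctr_logit` and `cvr_logit` stand for the raw outputs of the two towers):
+```python
+import torch
+import torch.nn.functional as F
+
+def esmm_loss(ctr_logit, cvr_logit, click, purchase):
+    p_ctr = torch.sigmoid(ctr_logit)   # pCTR, supervised by impression-to-click labels
+    p_cvr = torch.sigmoid(cvr_logit)   # pCVR, never supervised directly
+    p_ctcvr = p_ctr * p_cvr            # pCTCVR = pCTR * pCVR, defined over the full impression space
+    loss_ctr = F.binary_cross_entropy(p_ctr, click)
+    loss_ctcvr = F.binary_cross_entropy(p_ctcvr, purchase)
+    # the CVR tower only receives gradient through pCTCVR
+    return loss_ctr + loss_ctcvr
+```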
+
+## MMOE model
+![avatar](./pic/MMOE.png)
+
+### Model walkthrough
+The overall framework is basically the same as ESMM; the main difference, which the figure shows fairly intuitively, is that traditional ESMM directly shares one shared-bottom structure, usually the embedding layer. That kind of sharing puts strong mutual constraints on the tasks and keeps each one from reaching its full single-task potential. The improvement here is an expert mechanism: several experts are combined by weighted fusion to represent each individual task, which to some extent relieves the shared-bottom constraint of ESMM.
+Going a bit deeper: after obtaining the fused input embedding (understood here as the overall embedding formed by concatenating the user embedding and the item embedding), we attach several DNNs with identical structure. Although the structures are identical, the parameters differ at initialization and through back-propagation, so it is as if several same-shaped DNNs were trained on the same batch in parallel. A gating mechanism then weights each DNN's output into a combined output (see the sketch below), which is finally fed into each task tower for the final result.
+My personal take is that MMOE is essentially an ensembled version of ESMM, but it is an ensemble of internal modules rather than of different models, so its benefit remains to be verified. Anyone familiar with ensembling knows that diversity between models brings a clear ensemble gain, while ensembling identical models with different parameters usually gains very little.
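+
+A small sketch of the gate-weighted expert fusion described above (shapes and names are illustrative; `mmoe.py` below is the actual implementation):
+```python
+import torch
+
+batch, hidden_size, expert_dim, n_expert = 4, 16, 8, 3
+x = torch.randn(batch, hidden_size)        # fused input embedding
+experts = torch.randn(hidden_size, expert_dim, n_expert)
+gate = torch.randn(hidden_size, n_expert)  # one gate per task
+
+experts_out = torch.einsum('ij,jkl->ikl', x, experts)      # batch * expert_dim * n_expert
+gate_out = torch.softmax(x @ gate, dim=-1)                  # batch * n_expert
+task_input = (experts_out * gate_out.unsqueeze(1)).sum(2)   # batch * expert_dim, fed to the task tower
+```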
+
+
+## Experiments
+### Data
+The UCI census income dataset is used: https://archive.ics.uci.edu/ml/datasets/census+income
+
+### Feature processing
+Numeric features are normalized and categorical features are embedded. Since the dataset does not distinguish user and item features, the first 7 features are simply taken as user features and the remaining ones as item features, then vectorized and concatenated.
+The prediction targets are **income_50k** and **marital_status**.
+
+### Results
+| Model | auc_income | auc_marital |
+| ---- | ---- | ---- |
+| ESMM | 90.2% | 96.2% |
+| MMOE | 89.6% | 96.1% |
+On this small dataset the gap between ESMM and MMOE is very small; larger datasets have not been tried yet.
\ No newline at end of file
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/esmm.py b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/esmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..744820280beef3ff4e831c30bd32ac03a6109d8d
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/esmm.py
@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+# @Time : 2021-04-13 14:42
+# @Author : WenYi
+# @Contact : 1244058349@qq.com
+# @Description : script description
+
+
+import torch
+import torch.nn as nn
+import torch.npu
+import os
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+
+
+class ESMM(nn.Module):
+    def __init__(self, user_feature_dict, item_feature_dict, emb_dim=128, hidden_dim=[128, 64], dropouts=[0.5, 0.5],
+                 output_size=1, num_task=2):
+        """
+        esmm model input parameters
+        :param user_feature_dict: user feature dict include: {feature_name: (feature_unique_num, feature_index)}
+        :param item_feature_dict: item feature dict include: {feature_name: (feature_unique_num, feature_index)}
+        :param emb_dim: int, embedding size
+        :param hidden_dim: list of ctr and ctcvr dnn hidden sizes
+        :param dropouts: list of ctr and ctcvr dnn dropout probabilities
+        :param output_size: int, output size
+        :param num_task: int, number of tasks, default 2
+        """
+        super(ESMM, self).__init__()
+
+        # check input parameters
+        if user_feature_dict is None or item_feature_dict is None:
+            raise Exception("input parameter user_feature_dict and item_feature_dict must not be None")
+        if isinstance(user_feature_dict, dict) is False or isinstance(item_feature_dict, dict) is False:
+            raise Exception("input parameter user_feature_dict and item_feature_dict must be dict")
+
+        self.user_feature_dict = user_feature_dict
+        self.item_feature_dict = item_feature_dict
+        self.num_task = num_task
+
+        # embedding initialization
+        user_cate_feature_nums, item_cate_feature_nums = 0, 0
+        for user_cate, num in self.user_feature_dict.items():
+            if num[0] > 1:
+                user_cate_feature_nums += 1
+                setattr(self, user_cate, nn.Embedding(num[0], emb_dim))
+        for item_cate, num in self.item_feature_dict.items():
+            if num[0] > 1:
+                item_cate_feature_nums += 1
+                setattr(self, item_cate, nn.Embedding(num[0], emb_dim))
+
+        # user embedding + item embedding
+        hidden_size = emb_dim * (user_cate_feature_nums + item_cate_feature_nums) + \
+                      (len(user_feature_dict) - user_cate_feature_nums) + (len(item_feature_dict) - item_cate_feature_nums)
+
+        # independent DNN tower for each task
+        for i in range(self.num_task):
+            setattr(self, 'task_{}_dnn'.format(i + 1), nn.ModuleList())
+            hid_dim = [hidden_size] + hidden_dim
+            for j in range(len(hid_dim) - 1):
+                getattr(self, 'task_{}_dnn'.format(i + 1)).add_module('ctr_hidden_{}'.format(j), nn.Linear(hid_dim[j], hid_dim[j + 1]))
+                getattr(self, 'task_{}_dnn'.format(i + 1)).add_module('ctr_batchnorm_{}'.format(j), nn.BatchNorm1d(hid_dim[j + 1]))
+                getattr(self, 'task_{}_dnn'.format(i + 1)).add_module('ctr_dropout_{}'.format(j), nn.Dropout(dropouts[j]))
+            getattr(self, 'task_{}_dnn'.format(i + 1)).add_module('task_last_layer', nn.Linear(hid_dim[-1], output_size))
+
+    def forward(self, x):
+        assert x.size()[1] == len(self.item_feature_dict) + len(self.user_feature_dict)
+        # embedding
+        user_embed_list, item_embed_list = list(), list()
+        for user_feature, num in self.user_feature_dict.items():
+            if num[0] > 1:
+                user_embed_list.append(getattr(self, user_feature)(x[:, num[1]].to(torch.int32)))
+            else:
+                user_embed_list.append(x[:, num[1]].unsqueeze(1).to(torch.float16))
+        for item_feature, num in self.item_feature_dict.items():
+            if num[0] > 1:
+                item_embed_list.append(getattr(self, item_feature)(x[:, num[1]].to(torch.int32)))
+            else:
+                item_embed_list.append(x[:, num[1]].unsqueeze(1).to(torch.float16))
+
+        # embedding fusion
+        user_embed = torch.cat(user_embed_list, axis=1)
+        item_embed = torch.cat(item_embed_list, axis=1)
+
+        # hidden layer
+        hidden = torch.cat([user_embed, item_embed], axis=1)
+
+        # task towers
+        task_outputs = list()
+        for i in range(self.num_task):
+            x = hidden
+            for mod in getattr(self, 'task_{}_dnn'.format(i + 1)):
+                x = mod(x)
+            task_outputs.append(x)
+
+        return task_outputs
+
+
+if __name__ == "__main__":
+    import numpy as np
+    a = torch.from_numpy(np.array([[1, 2, 4, 2, 0.5, 0.1],
+                                   [4, 5, 3, 8, 0.6, 0.43],
+                                   [6, 3, 2, 9, 0.12, 0.32],
+                                   [9, 1, 1, 1, 0.12, 0.45],
+                                   [8, 3, 1, 4, 0.21, 0.67]]))
+    user_cate_dict = {'user_id': (11, 0), 'user_list': (12, 3), 'user_num': (1, 4)}
+    item_cate_dict = {'item_id': (8, 1), 'item_cate': (6, 2), 'item_num': (1, 5)}
+    esmm = ESMM(user_cate_dict, item_cate_dict)
+    tasks = esmm(a)
+    print(tasks)
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/main.py b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..78fd474c33b3a46112579f6420646120a4577fc8
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/main.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+# @Time : 2021-04-19 17:25
+# @Author : WenYi
+# @Contact : 1244058349@qq.com
+# @Description : script description
+
+
+from utils import data_preparation, TrainDataSet
+from torch.utils.data import DataLoader
+from model_train import train_model
+from esmm import ESMM
+from mmoe import MMOE
+import torch
+import torch.nn as nn
+import torch.npu
+import os
+from apex import amp
+import apex
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+
+
+def main():
+    train_data, test_data, user_feature_dict, item_feature_dict = data_preparation()
+    train_dataset = (train_data.iloc[:, :-2].values, train_data.iloc[:, -2].values, train_data.iloc[:, -1].values)
+    # val_dataset = (val_data.iloc[:, :-2].values, val_data.iloc[:, -2].values, val_data.iloc[:, -1].values)
+    test_dataset = (test_data.iloc[:, :-2].values, test_data.iloc[:, -2].values, test_data.iloc[:, -1].values)
+    train_dataset = TrainDataSet(train_dataset)
+    # val_dataset = TrainDataSet(val_dataset)
+    test_dataset = TrainDataSet(test_dataset)
+
+    # dataloader
+    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True, pin_memory=True)
+    # val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)
+    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)
+
+    # pytorch training hyperparameters
+    learn_rate = 0.01
+    bce_loss = nn.BCEWithLogitsLoss()
+    early_stop = 3
+    epoch = 10
+
+    # train model
+    # esmm Epoch 17 val loss is 1.164, income auc is 0.875 and marry auc is 0.953
+    esmm = ESMM(user_feature_dict, item_feature_dict, emb_dim=64).to(f'npu:{NPU_CALCULATE_DEVICE}')
+    optimizer = apex.optimizers.NpuFusedAdam(esmm.parameters(), lr=learn_rate)
+    esmm, optimizer = amp.initialize(esmm, optimizer, opt_level='O2', loss_scale=128.0, combine_grad=True)
+    train_model(esmm, train_dataloader, test_dataloader, epoch, bce_loss, optimizer, 'model/model_esmm_{}', early_stop)
+
+    # mmoe
+    #mmoe = MMOE(user_feature_dict, item_feature_dict, emb_dim=64)
+    #optimizer = torch.optim.Adam(mmoe.parameters(), lr=learn_rate)
+    #train_model(mmoe, train_dataloader, test_dataloader, epoch, bce_loss, optimizer, 'model/model_mmoe_{}', early_stop)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/mmoe.py b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/mmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f021c8f2aa78c50a0fcf7a0b95a801870a653a2
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/mmoe.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+# @Time : 2021-04-19 12:12
+# @Author : WenYi
+# @Contact : 1244058349@qq.com
+# @Description : script description
+
+import torch
+import torch.nn as nn
+import torch.npu
+import os
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+
+
+class MMOE(nn.Module):
+    """
+    MMOE for CTCVR problem
+    """
+    def __init__(self, user_feature_dict, item_feature_dict, emb_dim=128, n_expert=3, mmoe_hidden_dim=128,
+                 hidden_dim=[128, 64], dropouts=[0.5, 0.5], output_size=1, expert_activation=None, num_task=2):
+        """
+        MMOE model input parameters
+        :param user_feature_dict: user feature dict include: {feature_name: (feature_unique_num, feature_index)}
+        :param item_feature_dict: item feature dict include: {feature_name: (feature_unique_num, feature_index)}
+        :param emb_dim: int, embedding dimension
+        :param n_expert: int, number of experts in mmoe
+        :param mmoe_hidden_dim: mmoe layer output dimension
+        :param hidden_dim: list, task tower hidden dimensions
+        :param dropouts: list of task dnn dropout probabilities
+        :param output_size: int, task output size
+        :param expert_activation: activation function like 'relu' or 'sigmoid'
+        :param num_task: int, number of tasks, default 2
+        """
+        super(MMOE, self).__init__()
+        # check input parameters
+        if user_feature_dict is None or item_feature_dict is None:
+            raise Exception("input parameter user_feature_dict and item_feature_dict must not be None")
+        if isinstance(user_feature_dict, dict) is False or isinstance(item_feature_dict, dict) is False:
+            raise Exception("input parameter user_feature_dict and item_feature_dict must be dict")
+
+        self.user_feature_dict = user_feature_dict
+        self.item_feature_dict = item_feature_dict
+        self.expert_activation = expert_activation
+        self.num_task = num_task
+
+        # embedding initialization
+        user_cate_feature_nums, item_cate_feature_nums = 0, 0
+        for user_cate, num in self.user_feature_dict.items():
+            if num[0] > 1:
+                user_cate_feature_nums += 1
+                setattr(self, user_cate, nn.Embedding(num[0], emb_dim))
+        for item_cate, num in self.item_feature_dict.items():
+            if num[0] > 1:
+                item_cate_feature_nums += 1
+                setattr(self, item_cate, nn.Embedding(num[0], emb_dim))
+
+        # user embedding + item embedding
+        hidden_size = emb_dim * (user_cate_feature_nums + item_cate_feature_nums) + \
+                      (len(self.user_feature_dict) - user_cate_feature_nums) + (
+                              len(self.item_feature_dict) - item_cate_feature_nums)
+
+        # experts
+        self.experts = torch.nn.Parameter(torch.rand(hidden_size, mmoe_hidden_dim, n_expert), requires_grad=True)
+        self.experts.data.normal_(0, 1)
+        self.experts_bias = torch.nn.Parameter(torch.rand(mmoe_hidden_dim, n_expert), requires_grad=True)
+        # gates: wrapped in nn.ParameterList so the per-task gates are registered
+        # with the module and follow it across .to(device) / parameters();
+        # a plain Python list of Parameters would not be
+        self.gates = nn.ParameterList(
+            [torch.nn.Parameter(torch.rand(hidden_size, n_expert), requires_grad=True) for _ in range(num_task)])
+        for gate in self.gates:
+            gate.data.normal_(0, 1)
+        self.gates_bias = nn.ParameterList(
+            [torch.nn.Parameter(torch.rand(n_expert), requires_grad=True) for _ in range(num_task)])
+
+        # independent DNN towers for the ctr and ctcvr tasks
+        for i in range(self.num_task):
+            setattr(self, 'task_{}_dnn'.format(i+1), nn.ModuleList())
+            hid_dim = [mmoe_hidden_dim] + hidden_dim
+            for j in range(len(hid_dim) - 1):
+                getattr(self, 'task_{}_dnn'.format(i+1)).add_module('ctr_hidden_{}'.format(j), nn.Linear(hid_dim[j], hid_dim[j + 1]))
+                getattr(self, 'task_{}_dnn'.format(i+1)).add_module('ctr_batchnorm_{}'.format(j), nn.BatchNorm1d(hid_dim[j + 1]))
+                getattr(self, 'task_{}_dnn'.format(i+1)).add_module('ctr_dropout_{}'.format(j), nn.Dropout(dropouts[j]))
+            getattr(self, 'task_{}_dnn'.format(i+1)).add_module('task_last_layer', nn.Linear(hid_dim[-1], output_size))
+
+    def forward(self, x):
+        assert x.size()[1] == len(self.item_feature_dict) + len(self.user_feature_dict)
+        # embedding
+        user_embed_list, item_embed_list = list(), list()
+        for user_feature, num in self.user_feature_dict.items():
+            if num[0] > 1:
+                user_embed_list.append(getattr(self, user_feature)(x[:, num[1]].long()))
+            else:
+                user_embed_list.append(x[:, num[1]].unsqueeze(1))
+        for item_feature, num in self.item_feature_dict.items():
+            if num[0] > 1:
+                item_embed_list.append(getattr(self, item_feature)(x[:, num[1]].long()))
+            else:
+                item_embed_list.append(x[:, num[1]].unsqueeze(1))
+
+        # embedding fusion
+        user_embed = torch.cat(user_embed_list, axis=1)
+        item_embed = torch.cat(item_embed_list, axis=1)
+
+        # hidden layer
+        hidden = torch.cat([user_embed, item_embed], axis=1).float()  # batch * hidden_size
+
+        # mmoe
+        experts_out = torch.einsum('ij, jkl -> ikl', hidden, self.experts)  # batch * mmoe_hidden_size * num_experts
+        experts_out += self.experts_bias
+        if self.expert_activation is not None:
+            experts_out = self.expert_activation(experts_out)
+
+        gates_out = list()
+        for idx, gate in enumerate(self.gates):
+            gate_out = torch.einsum('ab, bc -> ac', hidden, gate)  # batch * num_experts
+            if self.gates_bias:
+                gate_out += self.gates_bias[idx]
+            gate_out = nn.Softmax(dim=-1)(gate_out)
+            gates_out.append(gate_out)
+
+        outs = list()
+        for gate_output in gates_out:
+            expanded_gate_output = torch.unsqueeze(gate_output, 1)  # batch * 1 * num_experts
+            weighted_expert_output = experts_out * expanded_gate_output.expand_as(experts_out)  # batch * mmoe_hidden_size * num_experts
+            outs.append(torch.sum(weighted_expert_output, 2))  # batch * mmoe_hidden_size
+
+        # task towers
+        task_outputs = list()
+        for i in range(self.num_task):
+            x = outs[i]
+            for mod in getattr(self, 'task_{}_dnn'.format(i+1)):
+                x = mod(x)
+            task_outputs.append(x)
+
+        return task_outputs
+
+
+if __name__ == "__main__":
+    import numpy as np
+
+    a = torch.from_numpy(np.array([[1, 2, 4, 2, 0.5, 0.1],
+                                   [4, 5, 3, 8, 0.6, 0.43],
+                                   [6, 3, 2, 9, 0.12, 0.32],
+                                   [9, 1, 1, 1, 0.12, 0.45],
+                                   [8, 3, 1, 4, 0.21, 0.67]]))
+    user_cate_dict = {'user_id': (11, 0), 'user_list': (12, 3), 'user_num': (1, 4)}
+    item_cate_dict = {'item_id': (8, 1), 'item_cate': (6, 2), 'item_num': (1, 5)}
+    mmoe = MMOE(user_cate_dict, item_cate_dict)
+    outs = mmoe(a)
+    print(outs)
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/model/.keep b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/model/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/model_train.py b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/model_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..efdba3e60424d7570b7b922c756cfc6449b26f64
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/model_train.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+# @Time : 2021-04-19 17:10
+# @Author : WenYi
+# @Contact : 1244058349@qq.com
+# @Description : model train function
+
+import torch
+from tqdm import tqdm
+from sklearn.metrics import roc_auc_score
+import time
+import torch.npu
+import os
+from apex import amp
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+
+def train_model(model, train_loader, val_loader, epoch, loss_function, optimizer, path, early_stop):
+    """
+    pytorch model train function
+    :param model: pytorch model
+    :param train_loader: dataloader, train data loader
+    :param val_loader: dataloader, val data loader
+    :param epoch: int, number of training epochs
+    :param loss_function: loss function of train model
+    :param optimizer: pytorch optimizer
+    :param path: save path
+    :param early_stop: int, early stop number
+    :return: None
+    """
+    # use NPU
+    device = torch.device(f'npu:{NPU_CALCULATE_DEVICE}')
+    model.to(f'npu:{NPU_CALCULATE_DEVICE}')
+
+    # stop early if the validation loss has not decreased within `early_stop` epochs
+    patience, eval_loss = 0, 0
+
+    # train
+    for i in range(epoch):
+        y_train_income_true = []
+        y_train_income_predict = []
+        y_train_marry_true = []
+        y_train_marry_predict = []
+        total_loss, count = 0, 0
+        for idx, (x, y1, y2) in tqdm(enumerate(train_loader), total=len(train_loader)):
+            start_time = time.time()
+            x, y1, y2 = x.to(f'npu:{NPU_CALCULATE_DEVICE}', non_blocking=True), y1.to(f'npu:{NPU_CALCULATE_DEVICE}', non_blocking=True), y2.to(f'npu:{NPU_CALCULATE_DEVICE}', non_blocking=True)
+            predict = model(x)
+            loss_1 = loss_function(predict[0], y1.unsqueeze(1).float())
+            loss_2 = loss_function(predict[1], y2.unsqueeze(1).float())
+            loss = loss_1 + loss_2
+            optimizer.zero_grad()
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+            optimizer.step()
+            total_loss += float(loss)
+            count += 1
+            step_time = time.time() - start_time
+            print("Epoch:{}, Step:{}, Loss:{:.4f}, time/step:{:.4f}".format(i + 1, count, total_loss / count, step_time))
+            y_train_income_true += list(y1.squeeze().cpu().numpy())
+            y_train_marry_true += list(y2.squeeze().cpu().numpy())
+            y_train_income_predict += list(predict[0].squeeze().cpu().detach().numpy())
+            y_train_marry_predict += list(predict[1].squeeze().cpu().detach().numpy())
+        torch.save(model.state_dict(), path.format(i + 1))
+        income_auc = roc_auc_score(y_train_income_true, y_train_income_predict)
+        marry_auc = roc_auc_score(y_train_marry_true, y_train_marry_predict)
+        print("Epoch %d train loss is %.3f, income auc is %.3f and marry auc is %.3f" % (i + 1, total_loss / count,
+                                                                                         income_auc, marry_auc))
+
+        # validation
+        total_eval_loss = 0
+        model.eval()
+        count_eval = 0
+        y_val_income_true = []
+        y_val_marry_true = []
+        y_val_income_predict = []
+        y_val_marry_predict = []
+        for idx, (x, y1, y2) in tqdm(enumerate(val_loader), total=len(val_loader)):
+            x, y1, y2 = x.to(f'npu:{NPU_CALCULATE_DEVICE}'), y1.to(f'npu:{NPU_CALCULATE_DEVICE}'), y2.to(f'npu:{NPU_CALCULATE_DEVICE}')
+            predict = model(x)
+            y_val_income_true += list(y1.squeeze().cpu().numpy())
+            y_val_marry_true += list(y2.squeeze().cpu().numpy())
+            y_val_income_predict += list(predict[0].squeeze().cpu().detach().numpy())
+            y_val_marry_predict += list(predict[1].squeeze().cpu().detach().numpy())
+            loss_1 = loss_function(predict[0], y1.unsqueeze(1).float())
+            loss_2 = loss_function(predict[1], y2.unsqueeze(1).float())
+            loss = loss_1 + loss_2
+            total_eval_loss += float(loss)
+            count_eval += 1
+        income_auc = roc_auc_score(y_val_income_true, y_val_income_predict)
+        marry_auc = roc_auc_score(y_val_marry_true, y_val_marry_predict)
+        print("Epoch %d val loss is %.3f, income auc is %.3f and marry auc is %.3f" % (i + 1,
+                                                                                       total_eval_loss / count_eval,
+                                                                                       income_auc, marry_auc))
+
+        # early stopping
+        if i == 0:
+            eval_loss = total_eval_loss / count_eval
+        else:
+            if total_eval_loss / count_eval < eval_loss:
+                eval_loss = total_eval_loss / count_eval
+            else:
+                if patience < early_stop:
+                    patience += 1
+                else:
+                    print("val loss has not decreased for %d epochs, stopping training" % patience)
+                    break
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/modelzoo_level b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/modelzoo_level
new file mode 100644
index 0000000000000000000000000000000000000000..405b26618a0c92027927a9c583a4b47f640bcf7b
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/modelzoo_level
@@ -0,0 +1,3 @@
+FuncStatus:OK
+PerfStatus:POK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/requirements.txt b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2fc60bf31b398438bb3d3c204d6a3e45a05793d4
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_full_1p.sh
@@ -0,0 +1,188 @@
+#!/bin/bash
+
+#current path, no modification needed
+cur_path=`pwd`
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID
+
+#collective communication parameters, no modification needed
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+
+
+# dataset path, keep empty by default, no modification needed
+data_path=""
+
+#basic parameters, review and modify per model
+#network name, same as the directory name
+Network="ESMM_ID2839_for_PyTorch"
+#training epochs
+train_epochs=11
+#training batch_size
+batch_size=64
+#training steps
+#train_steps=`expr 1281167 / ${batch_size}`
+#learning rate
+learning_rate=0.495
+
+#TF2.X only, no modification needed
+#export NPU_LOOP_SIZE=${train_steps}
+
+#debug parameters; precision_mode needs per-model review
+precision_mode="allow_mix_precision"
+#maintenance parameters, no modification needed below
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# help message, no modification needed
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+#parameter validation, no modification needed
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+#check that data_path was passed in, no modification needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+#training start time, no modification needed
+start_time=$(date +%s)
+
+#enter the training script directory, review per model
+cd $cur_path/../
+
+
+sed -i "s|./data|$data_path|g" utils.py
+cp $data_path/model* $cur_path/../model
+#sed -i "s|epoch = 10|epoch = 1|g" main.py
+#sed -i "s|pass|break|g" main.py
+
+#python3 setup.py install
+#mkdir -p checkpoints
+#mkdir -p /root/.cache/torch/hub/checkpoints
+#cp $data_path/fcn_* /root/.cache/torch/hub/checkpoints
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    #set environment variables, no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+
+    #create the DeviceID output directory, no modification needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    #core binding: delete for models that do not need it, adjust per machine otherwise
+    #cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+    #cpustep=`expr $cpucount / 8`
+    #echo "taskset c steps:" $cpustep
+    #let a=RANK_ID*$cpustep
+    #let b=RANK_ID+1
+    #let c=b*$cpustep-1
+
+    #run the training script; the arguments below need no modification, others need per-model review
+    nohup python3 main.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+#restore parameters
+sed -i "s|$data_path|./data|g" utils.py
+rm -rf $cur_path/../model/*
+#sed -i "s|epoch = 1|epoch = 10|g" main.py
+#sed -i "s|break|pass|g" main.py
+
+#conda deactivate
+#training end time, no modification needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#print results, no modification needed
+echo "------------------ Final result ------------------"
+#output performance FPS, review per model
+time=`grep Epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time/step:" '{print $2}'|tail -n +3|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${time}'}'`
+
+
+#print, no modification needed
+echo "Final Performance images/sec : $FPS"
+
+#output training accuracy, review per model
+train_accuracy=`grep 'val loss' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk -F "marry auc is " '{print $2}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'`
+#print, no modification needed
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#summary of results for stability/precision monitoring
+#training case information, no modification needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##collect performance data
+#throughput, no modification needed
+ActualFPS=${FPS}
+#training time per iteration, no modification needed
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+#extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep "Loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss:" '{print $2}'|awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#loss value of the last iteration, no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#print key information into ${CaseName}.log, no modification needed
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ad95a40a51ba48d3bebf69574bbbbb798b8dca10
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_performance_1p.sh
@@ -0,0 +1,189 @@
+#!/bin/bash
+
+#current path, no modification needed
+cur_path=`pwd`
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID
+
+#collective communication parameters, no modification needed
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+
+
+# dataset path, keep empty by default, no modification needed
+data_path=""
+
+#basic parameters, review and modify per model
+#network name, same as the directory name
+Network="ESMM_ID2839_for_PyTorch"
+#training epochs
+train_epochs=1
+#training batch_size
+batch_size=64
+#training steps
+#train_steps=`expr 1281167 / ${batch_size}`
+#learning rate
+learning_rate=0.495
+
+#TF2.X only, no modification needed
+#export NPU_LOOP_SIZE=${train_steps}
+
+#debug parameters; precision_mode needs per-model review
+precision_mode="allow_mix_precision"
+#maintenance parameters, no modification needed below
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# help message, no modification needed
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+#parameter validation, no modification needed
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+#check that data_path was passed in, no modification needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+#training start time, no modification needed
+start_time=$(date +%s)
+
+#enter the training script directory, review per model
+cd $cur_path/../
+
+
+sed -i "s|./data|$data_path|g" utils.py
+sed -i "s|epoch = 10|epoch = 1|g" main.py
+cp $data_path/model* $cur_path/../model
+#sed -i "s|pass|break|g" main.py
+
+#python3 setup.py install
+#mkdir -p checkpoints
+#mkdir -p /root/.cache/torch/hub/checkpoints
+#cp $data_path/fcn_* /root/.cache/torch/hub/checkpoints
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    #set environment variables, no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+
+    #create the DeviceID output directory, no modification needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    #core binding: delete for models that do not need it, adjust per machine otherwise
+    #cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+    #cpustep=`expr $cpucount / 8`
+    #echo "taskset c steps:" $cpustep
+    #let a=RANK_ID*$cpustep
+    #let b=RANK_ID+1
+    #let c=b*$cpustep-1
+
+    #run the training script; the arguments below need no modification, others need per-model review
+    nohup python3 main.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+#restore parameters
+sed -i "s|$data_path|./data|g" utils.py
+sed -i "s|epoch = 1|epoch = 10|g" main.py
+rm -rf $cur_path/../model/*
+#sed -i "s|break|pass|g" main.py
+
+#conda deactivate
+#training end time, no modification needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#print results, no modification needed
+echo "------------------ Final result ------------------"
+#output performance FPS, review per model
+time=`grep Epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time/step:" '{print $2}'|tail -n +3|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${time}'}'`
+
+
+#print, no modification needed
+echo "Final Performance images/sec : $FPS"
+
+#output training accuracy, review per model
+train_accuracy=`grep 'val loss' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk -F "marry auc is " '{print $2}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'`
+#print, no modification needed
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#summary of results for stability/precision monitoring
+#training case information, no modification needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##collect performance data
+#throughput, no modification needed
+ActualFPS=${FPS}
+#training time per iteration, no modification needed
+#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+TrainingTime=${time}
+
+#extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep "Loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss:" '{print $2}'|awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#loss value of the last iteration, no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#print key information into ${CaseName}.log, no modification needed
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_performance_success_1p.sh b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_performance_success_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c6694abd0f6b20c9fdae1158259fbd2aa74425cf
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/test/train_performance_success_1p.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+
+#current path, no modification needed
+cur_path=`pwd`
+export ASCEND_SLOG_PRINT_TO_STDOUT=1
+export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID
+
+#collective communication parameters, no modification needed
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+
+
+# dataset path, keep empty by default, no modification needed
+data_path=""
+
+#basic parameters, review and modify per model
+#network name, same as the directory name
+Network="ESMM_ID2839_for_PyTorch"
+#training epochs
+train_epochs=1
+#training batch_size
+batch_size=64
+#training steps
+#train_steps=`expr 1281167 / ${batch_size}`
+#learning rate
+learning_rate=0.495
+
+#TF2.X only, no modification needed
+#export NPU_LOOP_SIZE=${train_steps}
+
+#debug parameters; precision_mode needs per-model review
+precision_mode="allow_mix_precision"
+#maintenance parameters, no modification needed below
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# help message, no modification needed
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+#parameter validation, no modification needed
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+#check that data_path was passed in, no modification needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+#training start time, no modification needed
+start_time=$(date +%s)
+
+#enter the training script directory, review per model
+cd $cur_path/../
+
+
+sed -i "s|./data|$data_path|g" utils.py
+sed -i "s|epoch = 10|epoch = 1|g" main.py
+#sed -i "s|pass|break|g" main.py
+
+#python3 setup.py install
+#mkdir -p checkpoints
+#mkdir -p /root/.cache/torch/hub/checkpoints
+#cp $data_path/fcn_* /root/.cache/torch/hub/checkpoints
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    #set environment variables, no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+
+    #create the DeviceID output directory, no modification needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    #core binding: delete for models that do not need it, adjust per machine otherwise
+    #cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+    #cpustep=`expr $cpucount / 8`
+    #echo "taskset c steps:" $cpustep
+    #let a=RANK_ID*$cpustep
+    #let b=RANK_ID+1
+    #let c=b*$cpustep-1
+
+    #run the training script; the arguments below need no modification, others need per-model review
+    nohup python3 main.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+#restore parameters
+sed -i "s|$data_path|./data|g" utils.py
+sed -i "s|epoch = 1|epoch = 10|g" main.py
+#sed -i "s|break|pass|g" main.py
+
+#conda deactivate
+#training end time, no modification needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#print results, no modification needed
+echo "------------------ Final result ------------------"
+#output performance FPS, review per model
+time=`grep FPS $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time/step:" '{print $2}'|tail -n +2|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${time}'}'`
+
+
+#print, no modification needed
+echo "Final Performance images/sec : $FPS"
+
+#output training accuracy, review per model
+#train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'| sed 's/,//g' |cut -c 1-5`
+#print, no modification needed
+#echo "Final Train Accuracy : ${train_accuracy}"
+#echo "E2E Training Duration sec : $e2e_time"
+
+#summary of results for stability/precision monitoring
+#training case information, no modification needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##collect performance data
+#throughput, no modification needed
+ActualFPS=${FPS}
+#training time per iteration, no modification needed
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+#extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep "FPS" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss :" '{print $2}'|awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#loss value of the last iteration, no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#print key information into ${CaseName}.log, no modification needed
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/utils.py b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3006d73ccb2b9666853ff0697c03337b25b1bdf
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ESMM_ID2839_for_PyTorch/utils.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+# @Time : 2021-04-20 10:36
+# @Author : WenYi
+# @Contact : 1244058349@qq.com
+# @Description : script description
+
+
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler
+from sklearn.model_selection import train_test_split
+from torch.utils.data import Dataset, DataLoader
+import torch.npu
+import os
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+
+
+# data process
+def data_preparation():
+    # The column names are from the UCI census income dataset description
+    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
+                    'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
+                    'income_50k']
+
+    # Load the dataset in Pandas
+    train_df = pd.read_csv(
+        './data/adult.data',
+        delimiter=',',
+        header=None,
+        index_col=None,
+        names=column_names
+    )
+    other_df = pd.read_csv(
+        './data/adult.test',
+        delimiter=',',
+        header=None,
+        index_col=None,
+        names=column_names
+    )
+
+    train_df['tag'] = 1
+    other_df['tag'] = 0
+    other_df.dropna(inplace=True)
+    # labels in adult.test carry a trailing '.', strip it
+    other_df['income_50k'] = other_df['income_50k'].apply(lambda x: x[:-1])
+    data = pd.concat([train_df, other_df])
+    data.dropna(inplace=True)
+    # First group of tasks according to the paper
+    label_columns = ['income_50k', 'marital_status']
+
+    # categorical columns
+    categorical_columns = ['workclass', 'education', 'occupation', 'relationship', 'race', 'sex', 'native_country']
+    for col in label_columns:
+        if col == 'income_50k':
+            data[col] = data[col].apply(lambda x: 0 if x == ' <=50K' else 1)
+        else:
+            data[col] = data[col].apply(lambda x: 0 if x == ' Never-married' else 1)
+
+    # feature engineering
+    for col in column_names:
+        if col not in label_columns + ['tag']:
+            if col in categorical_columns:
+                le = LabelEncoder()
+                data[col] = le.fit_transform(data[col])
+            else:
+                mm = MinMaxScaler()
+                data[col] = mm.fit_transform(data[[col]]).reshape(-1)
+    data = data[['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'occupation',
+                 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
+                 'income_50k', 'marital_status', 'tag']]
+
+    # user feature, item feature
+    user_feature_dict, item_feature_dict = dict(), dict()
+    for idx, col in enumerate(data.columns):
+        if col not in label_columns + ['tag']:
+            if idx < 7:
+                if col in categorical_columns:
+                    user_feature_dict[col] = (len(data[col].unique())+1, idx)
+                else:
+                    user_feature_dict[col] = (1, idx)
+            else:
+                if col in categorical_columns:
+                    item_feature_dict[col] = (len(data[col].unique())+1, idx)
+                else:
+                    item_feature_dict[col] = (1, idx)
+
+    # split back into train (adult.data) and test (adult.test) by tag
+    train_data, test_data = data[data['tag'] == 1], data[data['tag'] == 0]
+    train_data.drop('tag', axis=1, inplace=True)
+    test_data.drop('tag', axis=1, inplace=True)
+
+    # val data
+    # train_data, val_data = train_test_split(train_data, test_size=0.5, random_state=2021)
+    return train_data, test_data, user_feature_dict, item_feature_dict
+
+
+class TrainDataSet(Dataset):
+    def __init__(self, data):
+        self.feature = data[0]
+        self.label1 = data[1]
+        self.label2 = data[2]
+
+    def __getitem__(self, index):
+        feature = self.feature[index]
+        label1 = self.label1[index]
+        label2 = self.label2[index]
+        return feature, label1, label2
+
+    def __len__(self):
+        return len(self.feature)
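+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only, mirroring main.py and the demo
+    # blocks in esmm.py/mmoe.py): build the splits, wrap the train split in
+    # TrainDataSet, and read one batch through a DataLoader.
+    train_data, test_data, user_feature_dict, item_feature_dict = data_preparation()
+    train_set = TrainDataSet((train_data.iloc[:, :-2].values,
+                              train_data.iloc[:, -2].values,
+                              train_data.iloc[:, -1].values))
+    loader = DataLoader(train_set, batch_size=64, shuffle=True)
+    feature, label1, label2 = next(iter(loader))
+    print(feature.shape, label1.shape, label2.shape)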