diff --git a/examples/diffusers/flux/README.md b/examples/diffusers/flux/README.md index d8cc92e18b5a3dc628bc7c86ea642a227095f907..c33281ac0d534ac7d6f835d9ef13fd19ea184912 100644 --- a/examples/diffusers/flux/README.md +++ b/examples/diffusers/flux/README.md @@ -179,35 +179,6 @@ --validation_prompt="a validation prompt based on your own dataset" \ ``` - 【因模型较大 如不需要`checkpointing_steps`,请设置其大于`max_train_steps`, 避免开启】 - - ```shell - --checkpointing_steps=50000 \ # 修改50000步为所需要步数 - ``` - - 【如需保存checkpointing请修改代码】 - - ```shell - vim examples/dreambooth/train_dreambooth_flux.py #(1669行附近) - vim examples/dreambooth/train_dreambooth_lora_flux.py #(1788行附近) - ``` - - - 在文件上方的import栏增加`DistributedType`在`from accelerate import Acceleratore`后 (30行附近) - - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED` - - ```python - from accelerate import Accelerator, DistributedType - if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: - ``` - - 更改shell脚本: - - ```shell - export HCCL_CONNECT_TIMEOUT=1200 # 大幅调高HCCL_CONNECT_TIMEOUT (如5000) - export HCCL_EXEC_TIMEOUT=17000 - --checkpointing_steps=50000 \ # 修改50000步为所需要步数 - ``` - 2. 【配置 FLUX 微调脚本】 联网情况下,微调模型可通过以下步骤下载。无网络时,用户可访问huggingface官网自行下载[FLUX.1-dev模型](https://huggingface.co/black-forest-labs/FLUX.1-dev) `model_name`模型 @@ -236,9 +207,12 @@ vim src/diffusers/models/embeddings.py ``` - 2. 打开`train_dreambooth_flux.py`或`train_dreambooth_lora_flux.py`文件 + ```python + freqs_dtype = torch.float32 # 760行附近 + # freqs_dtype = torch.float32 if is_mps else torch.float64 # 原代码 + ``` - - 在62行附近添加代码 + 2. 
打开`train_dreambooth_flux.py`或`train_dreambooth_lora_flux.py`文件 ```shell cd examples/dreambooth/ # 从diffusers目录进入dreambooth目录 @@ -246,11 +220,16 @@ vim train_dreambooth_lora_flux.py # 进入Python文件 ``` + - 在import栏/`if is_wandb_available():`上方(62行附近添加代码) + ```python # 添加代码到train_dreambooth_flux.py 62行附近 - from patch_flux import TorchPatcher, config_gc + from patch_flux import TorchPatcher, config_gc, create_save_model_hook TorchPatcher.apply_patch() config_gc() + + if is_wandb_available(): # 原代码 + import wandb ``` - 在log_validation里修改`pipeline = pipeline.to(accelerator.device)`,`train_dreambooth_flux.py`在171行附近`train_dreambooth_lora_flux.py`在180行附近 @@ -258,17 +237,70 @@ ```python # 修改pipeline为: pipeline = pipeline.to(accelerator.device, dtype=torch_dtype) + # pipeline = pipeline.to(accelerator.device) # 原代码 ``` 3. 【Optional】Ubuntu系统需在1701行附近 添加 `accelerator.print("")` ```python - if global_step >= args.max_train_steps: + if global_step >= args.max_train_steps: # 原代码 break - accelerator.print("") + accelerator.print("") # 添加 + ``` + + 4. 
【Optional】模型checkpoint saving保存 + + 【因模型较大 如不需要`checkpointing_steps`,请设置其大于`max_train_steps`, 避免开启】 + + ```shell + --checkpointing_steps=50000 \ # 修改50000步为所需要步数 + ``` + + 【如需保存checkpointing请修改代码】 + + ```shell + vim examples/dreambooth/train_dreambooth_flux.py #(1669行附近) + vim examples/dreambooth/train_dreambooth_lora_flux.py #(1788行附近) + ``` + + - 在文件上方的import栏增加`DistributedType`在`from accelerate import Accelerator`后 (30行附近) + - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED` (1669/1788行附近) + + ```python + from accelerate import Accelerator, DistributedType + # from accelerate import Accelerator # 原代码 + + if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: + # if accelerator.is_main_process: # 原代码 + ``` + + Lora任务需调用patch任务进行权重保存: + 在`train_dreambooth_lora_flux.py`文件中找到代码`accelerator.register_save_state_pre_hook(save_model_hook)`进行修改(1308行附近),复制粘贴以下代码: + + ```python + # 添加 + save_Model_Hook = create_save_model_hook( + accelerator=accelerator, + unwrap_model=unwrap_model, + transformer=transformer, + text_encoder_one=text_encoder_one, + args=args, + weight_dtype=weight_dtype + ) + accelerator.register_save_state_pre_hook(save_Model_Hook) # 修改 + # accelerator.register_save_state_pre_hook(save_model_hook) # 原代码 + accelerator.register_load_state_pre_hook(load_model_hook) # 原代码 不修改 + ``` + + 更改shell脚本: + + ```shell + export HCCL_CONNECT_TIMEOUT=1200 # 大幅调高HCCL_CONNECT_TIMEOUT (如5000) + export HCCL_EXEC_TIMEOUT=17000 + --checkpointing_steps=50000 \ # 修改50000步为所需要步数 + ``` -3. 【启动 FLUX 微调脚本】 +4. 
【启动 FLUX 微调脚本】 本任务主要提供flux_dreambooth与flux_dreambooth_lora微调脚本,支持多卡训练。 diff --git a/examples/diffusers/flux/patch_flux.py b/examples/diffusers/flux/patch_flux.py index b16371ac60fc55575e7f077715fb18f3e2d7cbac..7368ed98f1168f90fac92ac681c5a8f07c2b0929 100644 --- a/examples/diffusers/flux/patch_flux.py +++ b/examples/diffusers/flux/patch_flux.py @@ -1,8 +1,25 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import gc import torch +from diffusers import FluxPipeline +from peft.utils import get_peft_model_state_dict from torch.distributed._shard.sharded_tensor.api import ShardedTensor @@ -31,5 +48,59 @@ class TorchPatcher: def config_gc(): - # set gc threshold + # set gc threshold, best range from experiments gc.set_threshold(700, 50, 1000) + + +# Save Lora weights for checkpointing steps +def create_save_model_hook( + accelerator, unwrap_model, transformer, text_encoder_one, args, weight_dtype +): + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + transformer_lora_layers_to_save = None + text_encoder_one_lora_layers_to_save = None + + for model in models: + if isinstance(unwrap_model(model), type(unwrap_model(transformer))): + transformer_model = unwrap_model(model) + if args.upcast_before_saving: + transformer_model = transformer_model.to(torch.float32) + else: + transformer_model = transformer_model.to(weight_dtype) + transformer_lora_layers_to_save = get_peft_model_state_dict( + transformer_model + ) + + elif ( + isinstance( + unwrap_model(model), type(unwrap_model(text_encoder_one)) + ) + and args.train_text_encoder + ): + text_encoder_one_lora_layers_to_save = get_peft_model_state_dict( + model.to(torch.float32) + ) + + elif ( + isinstance( + unwrap_model(model), type(unwrap_model(text_encoder_one)) + ) + and not args.train_text_encoder + ): + text_encoder_one_lora_layers_to_save = None + + else: + raise ValueError(f"unexpected save model: {model.__class__}") + + # make sure to pop weight so that corresponding model is not saved again + if weights: + weights.pop() + + FluxPipeline.save_lora_weights( + output_dir, + transformer_lora_layers=transformer_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, + ) + + return save_model_hook diff --git a/examples/diffusers/sd3/README.md b/examples/diffusers/sd3/README.md index 
26d800775559657dd7eba7d979c74c3c0c325f34..e32829e1e0179209a8a8c4a697fe9479546ebbfd 100644 --- a/examples/diffusers/sd3/README.md +++ b/examples/diffusers/sd3/README.md @@ -223,9 +223,9 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw 如下: ```python - if global_step >= args.max_train_steps: + if global_step >= args.max_train_steps: # 原代码 break - accelerator.print("") + accelerator.print("") # 添加 ``` 4. 【如需保存checkpointing请修改代码】 @@ -236,14 +236,39 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw vim examples/dreambooth/train_dreambooth_lora_sd3.py ``` + - 在文件上方的import栏增加`DistributedType`在`from accelerate import Accelerator`后 (30行附近),并增加patch引用`from patch_sd3 import create_save_model_hook` - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED`(dreambooth在1681行附近,lora在1833行附近) - - 在文件上方的import栏增加`DistributedType`在`from accelerate import Acceleratore`后 (30行附近) ```python from accelerate import Accelerator, DistributedType + # from accelerate import Accelerator # 原代码 + from patch_sd3 import create_save_model_hook # 添加此行patch引用代码 + from accelerate.logging import get_logger # 原代码 + if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: + # if accelerator.is_main_process: # 原代码 1681/1833行附近 ``` + Lora任务需调用patch任务进行权重保存: + + 在`train_dreambooth_lora_sd3.py`文件中找到代码`accelerator.register_save_state_pre_hook(save_model_hook)`进行修改(1368行附近),修改如下: + + ```python + # 添加 + save_Model_Hook = create_save_model_hook( + accelerator=accelerator, + unwrap_model=unwrap_model, + transformer=transformer, + text_encoder_one=text_encoder_one, + text_encoder_two=text_encoder_two, + args=args, + weight_dtype=weight_dtype + ) + accelerator.register_save_state_pre_hook(save_Model_Hook) # 修改 + # accelerator.register_save_state_pre_hook(save_model_hook) # 原代码 + accelerator.register_load_state_pre_hook(load_model_hook) # 原代码 不修改 + ``` + 5. 
【修改文件】 ```shell @@ -257,6 +282,7 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw ```python # 修改pipeline为: pipeline = pipeline.to(accelerator.device, dtype=torch_dtype) + # pipeline = pipeline.to(accelerator.device) # 原代码 ``` 6. 【启动 SD3 微调脚本】 diff --git a/examples/diffusers/sd3/patch_sd3.py b/examples/diffusers/sd3/patch_sd3.py new file mode 100644 index 0000000000000000000000000000000000000000..42ac7a01a6fca2689a491554916afe02e73a50be --- /dev/null +++ b/examples/diffusers/sd3/patch_sd3.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import torch +from diffusers import StableDiffusion3Pipeline +from peft.utils import get_peft_model_state_dict + + +# Save Lora weights for checkpointing steps +def create_save_model_hook( + accelerator, + unwrap_model, + transformer, + text_encoder_one, + text_encoder_two, + args, + weight_dtype, +): + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + transformer_lora_layers_to_save = None + text_encoder_one_lora_layers_to_save = None + text_encoder_two_lora_layers_to_save = None + + for model in models: + if isinstance(unwrap_model(model), type(unwrap_model(transformer))): + transformer_model = unwrap_model(model) + if args.upcast_before_saving: + transformer_model = transformer_model.to(torch.float32) + else: + transformer_model = transformer_model.to(weight_dtype) + transformer_lora_layers_to_save = get_peft_model_state_dict( + transformer_model + ) + + elif ( + isinstance( + unwrap_model(model), type(unwrap_model(text_encoder_one)) + ) + and args.train_text_encoder + ): + # both text encoders are of the same class + hidden_size = unwrap_model(model).config.hidden_size + if hidden_size == 768: + text_encoder_one_lora_layers_to_save = ( + get_peft_model_state_dict(model.to(torch.float32)) + ) + elif hidden_size == 1280: + text_encoder_two_lora_layers_to_save = ( + get_peft_model_state_dict(model.to(torch.float32)) + ) + + elif ( + isinstance( + unwrap_model(model), type(unwrap_model(text_encoder_one)) + ) + and not args.train_text_encoder + ): + text_encoder_one_lora_layers_to_save = None + text_encoder_two_lora_layers_to_save = None + + else: + raise ValueError(f"unexpected save model: {model.__class__}") + + # make sure to pop weight so that corresponding model is not saved again + if weights: + weights.pop() + + StableDiffusion3Pipeline.save_lora_weights( + output_dir, + transformer_lora_layers=transformer_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, + 
text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, + ) + + return save_model_hook