diff --git a/examples/wan2.2/A14B/i2v/data.json b/examples/wan2.2/A14B/i2v/data.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d83ab3a7aeeed1636026b00deb48d04a0a27323
--- /dev/null
+++ b/examples/wan2.2/A14B/i2v/data.json
@@ -0,0 +1,103 @@
+{
+    "dataset_param": {
+        "dataset_type": "t2v",
+        "basic_parameters": {
+            "data_path": "./examples/wan2.2/data.txt",
+            "data_storage_mode": "combine"
+        },
+        "preprocess_parameters": {
+            "video_processor_type": "OpensoraplanVideoProcessor",
+            "video_reader_type": "DecordVideo",
+            "image_reader_type": "Image",
+            "num_frames": 81,
+            "min_num_frames": 1,
+            "frame_interval": 1,
+            "auto_interval": false,
+            "max_height": 480,
+            "max_width": 832,
+            "max_hxw": 399360,
+            "train_fps": 24,
+            "speed_factor": 1.0,
+            "drop_short_ratio": 0.0,
+            "cfg": 0.0,
+            "hw_stride": 8,
+            "force_resolution": false,
+            "hw_aspect_thr": 5.0,
+            "train_pipeline": {
+                "video": [
+                    {
+                        "trans_type": "ResizeVideo",
+                        "param": {
+                            "transform_size": "auto",
+                            "interpolation_mode": "bilinear",
+                            "antialias": true,
+                            "mode": "shortside"
+                        }
+                    },
+                    {
+                        "trans_type": "ToTensorVideo"
+                    },
+                    {
+                        "trans_type": "norm_fun",
+                        "param": {
+                            "mean": 0.5,
+                            "std": 0.5
+                        }
+                    },
+                    {
+                        "trans_type": "CenterCropResizeVideo",
+                        "param": {
+                            "use_short_edge": false,
+                            "top_crop": false,
+                            "align_corners": false,
+                            "antialias": true,
+                            "transform_size": "auto"
+                        }
+                    }
+                ],
+                "first_fram_vae": [
+                    {
+                        "trans_type": "ResizeVideo",
+                        "param": {
+                            "transform_size": "auto",
+                            "mode": "resize",
+                            "interpolation_mode": "bilinear",
+                            "align_corners": false,
+                            "antialias": false
+                        }
+                    },
+                    {
+                        "trans_type": "norm_fun",
+                        "param": {
+                            "mean": 0.5,
+                            "std": 0.5
+                        }
+                    }
+                ]
+            }
+        },
+        "use_text_processer": true,
+        "enable_text_preprocessing": true,
+        "text_preprocess_methods": {
+            "method": "whitespace_clean"
+        },
+        "tokenizer_config":
+        {
+            "autotokenizer_name": "AutoTokenizer",
+            "hub_backend": "hf",
+            "from_pretrained": "Wan2.2-I2V-A14B-Diffusers/tokenizer",
+            "model_max_length": 512
+        }
+    },
+    "dataloader_param": {
+        "dataloader_mode": "sampler",
+        "sampler_type": "LengthGroupedSampler",
+        "shuffle": true,
+        "drop_last": true,
+        "pin_memory": true,
+        "group_frame": false,
+        "group_resolution": false,
+        "group_data": true,
+        "initial_global_step_for_sampler": 0
+    }
+}
\ No newline at end of file
diff --git a/examples/wan2.2/A14B/i2v/inference.sh b/examples/wan2.2/A14B/i2v/inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..710df0f4702d7b1e562fc277cbe3fb180674c09f
--- /dev/null
+++ b/examples/wan2.2/A14B/i2v/inference.sh
@@ -0,0 +1,61 @@
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL=3
+export TASK_QUEUE_ENABLE=1
+export COMBINED_ENABLE=1
+export CPU_AFFINITY_CONF=1
+export HCCL_CONNECT_TIMEOUT=1200
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES))
+
+TP=1
+PP=1
+CP=1
+MBS=1
+GBS=$(($WORLD_SIZE*$MBS/$CP/$TP))
+
+MM_MODEL="examples/wan2.2/A14B/i2v/inference_model.json"
+LOAD_PATH="./weights/Wan-AI/Wan2.2-I2V-A14B-Diffusers/transformer/" # ensure the wandit weights have been converted
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+MM_ARGS="
+    --mm-model $MM_MODEL
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --context-parallel-size ${CP} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --lr 5e-6 \
+    --min-lr 5e-6 \
+    --train-iters 5010 \
+    --weight-decay 0 \
+    --clip-grad 0.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --no-save-optim \
+    --no-save-rng \
+    --bf16 \
+    --load $LOAD_PATH \
+    --context-parallel-algo ulysses_cp_algo \
+"
+
+torchrun $DISTRIBUTED_ARGS inference_sora.py $MM_ARGS $GPT_ARGS
\ No newline at end of file
diff --git a/examples/wan2.2/A14B/i2v/inference_model.json b/examples/wan2.2/A14B/i2v/inference_model.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e28e2c49420fe2a00d0c95e6777179a8a29528d
--- /dev/null
+++ b/examples/wan2.2/A14B/i2v/inference_model.json
@@ -0,0 +1,88 @@
+{
+    "pipeline_class": "WanPipeline",
+    "prompt": "examples/wan2.2/samples_i2v_prompts.txt",
+    "image": "examples/wan2.2/samples_i2v_images.txt",
+    "save_path": "examples/wan2.2/samples/A14B/i2v/",
+    "use_prompt_preprocess": false,
+    "dtype": "bf16",
+    "device": "npu",
+    "frame_interval": 1,
+    "fps": 16,
+    "boundary_ratio": 0.875,
+    "pipeline_config": {
+        "input_size": [81, 480, 832],
+        "cpu_offload": true
+    },
+    "ae": {
+        "model_id": "wan_video_vae",
+        "from_pretrained": "./weights/Wan-AI/Wan2.2-I2V-A14B-480P-Diffusers/vae/",
+        "dtype": "bf16",
+        "do_sample": false,
+        "enable_tiling": false,
+        "norm_latents": true,
+        "norm_mode": "channel_specified_shift_scale"
+    },
+    "tokenizer": {
+        "autotokenizer_name": "AutoTokenizer",
+        "hub_backend": "hf",
+        "from_pretrained": "./weights/Wan-AI/Wan2.2-I2V-A14B-480P-Diffusers/tokenizer/"
+    },
+    "text_encoder": {
+        "model_id": "UMT5",
+        "hub_backend": "hf",
+        "from_pretrained": "./weights/Wan-AI/Wan2.2-I2V-A14B-480P-Diffusers/text_encoder/",
+        "dtype": "bf16"
+    },
+    "predictor": {
+        "model_id": "wandit",
+        "dtype": "bf16",
+        "model_type": "wan2.2-i2v",
+        "patch_size": [1, 2, 2],
+        "text_len": 512,
+        "in_dim": 36,
+        "hidden_size": 5120,
+        "ffn_dim": 13824,
+        "freq_dim": 256,
+        "text_dim": 4096,
+        "img_dim": 1280,
+        "out_dim": 16,
+        "num_heads": 40,
+        "num_layers": 40,
+        "qk_norm": true,
+        "qk_norm_type": "rmsnorm",
+        "cross_attn_norm": true,
+        "eps": 1e-6,
+        "max_seq_len": 1024,
+        "use_fused_rmsnorm": true
+    },
+    "low_noise_predictor": "./weights/Wan-AI/Wan2.2-I2V-A14B-Diffusers/transformer_2/",
+    "diffusion": {
+        "model_id": "UniPCMultistepScheduler",
+        "num_train_steps": 1000,
+        "num_inference_steps": 40,
+        "guidance_scale": [4.0, 3.0],
+        "beta_end": 0.02,
+        "beta_schedule": "linear",
+        "beta_start": 0.0001,
+        "disable_corrector": [],
+        "dynamic_thresholding_ratio": 0.995,
+        "final_sigmas_type": "zero",
+        "flow_shift": 3.0,
+        "lower_order_final": true,
+        "predict_x0": true,
+        "prediction_type": "flow_prediction",
+        "rescale_betas_zero_snr": false,
+        "sample_max_value": 1.0,
+        "solver_order": 2,
+        "solver_p": null,
+        "solver_type": "bh2",
+        "steps_offset": 0,
+        "thresholding": false,
+        "timestep_spacing": "linspace",
+        "trained_betas": null,
+        "use_beta_sigmas": false,
+        "use_exponential_sigmas": false,
+        "use_flow_sigmas": true,
+        "use_karras_sigmas": false
+    }
+}
diff --git a/examples/wan2.2/A14B/i2v/pretrain.sh b/examples/wan2.2/A14B/i2v/pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..502b5be37bfac4a3c1ea03b05ba7159341debbe5
--- /dev/null
+++ b/examples/wan2.2/A14B/i2v/pretrain.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
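+
+# Batch sizing below is derived rather than hard-coded: DP = WORLD_SIZE / TP / PP / CP
+# and GBS = MBS * GRAD_ACC_STEP * DP. With the defaults in this script
+# (8 NPUs, TP=PP=CP=1, MBS=1, GRAD_ACC_STEP=1) that gives DP=8 and GBS=8.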
+# This variable only exists to bypass Megatron's validation of it; it has no effect on NPU
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL=3
+export TASK_QUEUE_ENABLE=1
+export COMBINED_ENABLE=1
+export CPU_AFFINITY_CONF=1
+export HCCL_CONNECT_TIMEOUT=1200
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+TP=1
+PP=1
+VP=1
+CP=1
+MBS=1
+GRAD_ACC_STEP=1
+DP=$(($WORLD_SIZE/$TP/$PP/$CP))
+GBS=$(($MBS*$GRAD_ACC_STEP*$DP))
+layerzero_config="./examples/wan2.2/zero_config.yaml"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --virtual-pipeline-model-parallel-size ${VP} \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ulysses_cp_algo \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --num-workers 8 \
+    --lr 1e-5 \
+    --min-lr 1e-5 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr-decay-style constant \
+    --weight-decay 1e-2 \
+    --lr-warmup-init 0 \
+    --lr-warmup-iters 0 \
+    --clip-grad 1.0 \
+    --train-iters 5000 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --no-save-optim \
+    --no-save-rng \
+    --bf16 \
+    --recompute-granularity full \
+    --recompute-method block \
+    --recompute-num-layers 40 \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --use-fused-rmsnorm \
+    --layerzero \
+    --layerzero-config ${layerzero_config} \
+"
+
+# ======== train the high-noise model ========
+
+MM_DATA="./examples/wan2.2/A14B/i2v/data.json"
+MM_MODEL="./examples/wan2.2/A14B/i2v/pretrain_model_high.json"
+MM_TOOL="./mindspeed_mm/tools/tools.json"
+LOAD_PATH="./weights/Wan-AI/Wan2.2-I2V-A14B-Diffusers/transformer/" # ensure the wandit weights have been converted
+SAVE_PATH="path to save your wandit weight"
+layerzero_config="./examples/wan2.2/zero_config.yaml"
+
+MM_ARGS="
+    --mm-data $MM_DATA \
+    --mm-model $MM_MODEL \
+    --mm-tool $MM_TOOL
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 10 \
+    --load $LOAD_PATH \
+    --save $SAVE_PATH \
+    --ckpt-format torch \
+"
+
+logfile=wan_$(date +%Y%m%d)_$(date +%H%M%S)
+mkdir -p logs
+torchrun $DISTRIBUTED_ARGS pretrain_sora.py \
+    $GPT_ARGS \
+    $MM_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    2>&1 | tee logs/train_${logfile}.log
+
+chmod 440 logs/train_${logfile}.log
+find $SAVE_PATH -type d -exec chmod 750 {} \;
+find $SAVE_PATH -type f -exec chmod 640 {} \;
+STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'`
+SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'`
+echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS"
+
+# ======== train the low-noise model ========
+
+MM_DATA="./examples/wan2.2/A14B/i2v/data.json"
+MM_MODEL="./examples/wan2.2/A14B/i2v/pretrain_model_low.json"
+MM_TOOL="./mindspeed_mm/tools/tools.json"
+LOAD_PATH="./weights/Wan-AI/Wan2.2-I2V-A14B-Diffusers/transformer_2/" # ensure the wandit weights have been converted
+SAVE_PATH="path to save your wandit weight"
+
+MM_ARGS="
+    --mm-data $MM_DATA \
+    --mm-model $MM_MODEL \
+    --mm-tool $MM_TOOL
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 10 \
+    --load $LOAD_PATH \
+    --save $SAVE_PATH \
+    --ckpt-format torch \
+"
+
+logfile=wan_$(date +%Y%m%d)_$(date +%H%M%S)
+mkdir -p logs
+torchrun $DISTRIBUTED_ARGS pretrain_sora.py \
+    $GPT_ARGS \
+    $MM_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    2>&1 | tee logs/train_${logfile}.log
+
+chmod 440 logs/train_${logfile}.log
+find $SAVE_PATH -type d -exec chmod 750 {} \;
+find $SAVE_PATH -type f -exec chmod 640 {} \;
+STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'`
+SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'`
+echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS"
\ No newline at end of file
diff --git a/examples/wan2.2/A14B/i2v/pretrain_model_high.json b/examples/wan2.2/A14B/i2v/pretrain_model_high.json
new file mode 100644
index 0000000000000000000000000000000000000000..2af08c715f1f4a1171eb334632fc7dc95c354cb8
--- /dev/null
+++ b/examples/wan2.2/A14B/i2v/pretrain_model_high.json
@@ -0,0 +1,64 @@
+{
+    "load_video_features": false,
+    "load_text_features": false,
+    "task": "i2v",
+    "diffusion": {
+        "model_id": "wan_flow_match_scheduler",
+        "num_train_timesteps": 1000,
+        "shift": 5,
+        "sigma_min": 0.0,
+        "extra_one_step": true,
+        "max_timestep_boundary": 1.0,
+        "min_timestep_boundary": 0.875
+    },
+    "predictor": {
+        "model_id": "wandit",
+        "dtype": "bf16",
+        "model_type": "wan2.2-i2v",
+        "patch_size": [1, 2, 2],
+        "text_len": 512,
+        "in_dim": 32,
+        "hidden_size": 5120,
+        "ffn_dim": 13824,
+        "freq_dim": 256,
+        "text_dim": 4096,
+        "img_dim": 1280,
+        "out_dim": 16,
+        "num_heads": 40,
+        "num_layers": 40,
+        "pipeline_num_layers": [10,10,10,10],
+        "qk_norm": true,
+        "qk_norm_type": "rmsnorm",
+        "cross_attn_norm": true,
+        "eps": 1e-6,
+        "max_seq_len": 1024,
+        "attention_async_offload": false,
+        "use_fused_rmsnorm": true
+    },
+    "patch": {
+        "clip_grad_async": true
+    },
+    "ae": {
+        "model_id": "wan_video_vae",
+        "from_pretrained": "Wan2.2-I2V-A14B-Diffusers/vae",
+        "dtype": "bf16",
+        "enable_tiling": true,
+        "tiling_param": {
+            "tile_size": [34, 34],
+            "tile_stride": [18, 16]
+        },
+        "norm_latents": true,
+        "norm_mode": "channel_specified_shift_scale",
+        "do_sample": false,
+        "i2v_processor": {
+            "processor_id": "wan_i2v_processor",
+            "i2v_vae_encod_tiling": false
+        }
+    },
+    "text_encoder": {
+        "model_id": "UMT5",
+        "dtype": "bf16",
+        "hub_backend": "hf",
+        "from_pretrained": "Wan2.2-I2V-A14B-Diffusers/text_encoder"
+    }
+}
\ No newline at end of file
diff --git a/examples/wan2.2/A14B/i2v/pretrain_model_low.json b/examples/wan2.2/A14B/i2v/pretrain_model_low.json
new file mode 100644
index 0000000000000000000000000000000000000000..9dd39dce0e11682f4ffea5072994b06feacd9b7f
--- /dev/null
+++ b/examples/wan2.2/A14B/i2v/pretrain_model_low.json
@@ -0,0 +1,64 @@
+{
+    "load_video_features": false,
+    "load_text_features": false,
+    "task": "i2v",
+    "diffusion": {
+        "model_id": "wan_flow_match_scheduler",
+        "num_train_timesteps": 1000,
+        "shift": 5,
+        "sigma_min": 0.0,
+        "extra_one_step": true,
+        "max_timestep_boundary": 0.875,
+        "min_timestep_boundary": 0.0
+    },
+    "predictor": {
+        "model_id": "wandit",
+        "dtype": "bf16",
+        "model_type": "wan2.2-i2v",
+        "patch_size": [1, 2, 2],
+        "text_len": 512,
+        "in_dim": 32,
+        "hidden_size": 5120,
+        "ffn_dim": 13824,
+        "freq_dim": 256,
+        "text_dim": 4096,
+        "img_dim": 1280,
+        "out_dim": 16,
+        "num_heads": 40,
+        "num_layers": 40,
+        "pipeline_num_layers": [10,10,10,10],
+        "qk_norm": true,
+        "qk_norm_type": "rmsnorm",
+        "cross_attn_norm": true,
+        "eps": 1e-6,
+        "max_seq_len": 1024,
+        "attention_async_offload": false,
+        "use_fused_rmsnorm": true
+    },
+    "patch": {
+        "clip_grad_async": true
+    },
+    "ae": {
+        "model_id": "wan_video_vae",
+        "from_pretrained": "Wan2.2-I2V-A14B-Diffusers/vae",
+        "dtype": "bf16",
+        "enable_tiling": true,
+        "tiling_param": {
+            "tile_size": [34, 34],
+            "tile_stride": [18, 16]
+        },
+        "norm_latents": true,
+        "norm_mode": "channel_specified_shift_scale",
+        "do_sample": false,
+        "i2v_processor": {
+            "processor_id": "wan_i2v_processor",
+            "i2v_vae_encod_tiling": false
+        }
+    },
+    "text_encoder": {
+        "model_id": "UMT5",
+        "dtype": "bf16",
+        "hub_backend": "hf",
+        "from_pretrained": "Wan2.2-I2V-A14B-Diffusers/text_encoder"
+    }
+}
\ No newline at end of file
diff --git a/examples/wan2.2/i2v_input.jpg b/examples/wan2.2/i2v_input.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b2c8d3aa43cf57c7ab32f806b9818465aea2c779
Binary files /dev/null and b/examples/wan2.2/i2v_input.jpg differ
diff --git a/examples/wan2.2/samples_i2v_images.txt b/examples/wan2.2/samples_i2v_images.txt
new file mode 100644
index 0000000000000000000000000000000000000000..31ded6a6a387ba0c182acd2ae764e8600d848c74
--- /dev/null
+++ b/examples/wan2.2/samples_i2v_images.txt
@@ -0,0 +1 @@
+examples/wan2.2/i2v_input.jpg
\ No newline at end of file
diff --git a/examples/wan2.2/samples_i2v_prompts.txt b/examples/wan2.2/samples_i2v_prompts.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6a47e199aa75e85df06e1ae9a0ed57bbdd73021f
--- /dev/null
+++ b/examples/wan2.2/samples_i2v_prompts.txt
@@ -0,0 +1 @@
+An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot.
\ No newline at end of file