diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/LICENSE b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/LICENSE deleted file mode 100755 index e6e77b08909f2e34c57dce5b47021a315d1ee70e..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. 
We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/README.md b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/README.md
deleted file mode 100755
index 7b4f7d78e0531df5dd0fd7b355516049dcbc299e..0000000000000000000000000000000000000000
--- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/README.md
+++ /dev/null
@@ -1,516 +0,0 @@
-# Stable Diffusion Model - Inference Guide
-
-
-- [Overview](#ZH-CN_TOPIC_0000001172161501)
-
-  - [Input and Output Data](#section540883920406)
-
-- [Inference Environment](#ZH-CN_TOPIC_0000001126281702)
-
-- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
-
-  - [Getting the Source Code](#section4622531142816)
-  - [Preparing the Dataset](#section183221994411)
-  - [Model Inference](#section741711594517)
-
-- [Inference Performance & Accuracy](#ZH-CN_TOPIC_0000001172201573)
-
-
-# Overview
-
-   Stable Diffusion is a text-to-image diffusion model that can generate photo-realistic images from any text input. For more information, see the [Stable Diffusion blog](https://huggingface.co/blog/stable_diffusion).
-
-- Reference implementations:
-  ```bash
-  # StableDiffusion v1.5
-  https://huggingface.co/runwayml/stable-diffusion-v1-5
-
-  # StableDiffusion v2.1
-  https://huggingface.co/stabilityai/stable-diffusion-2-1-base
-  ```
-
-## Input and Output Data
-
-- Input data
-
-  | Input   | Size   | Data Type | Layout |
-  | ------- | ------ | --------- | ------ |
-  | input   | 1 x 77 | FLOAT32   | ND     |
-
-
-- Output data
-
-  | Output  | Size              | Data Type | Layout |
-  | ------- | ----------------- | --------- | ------ |
-  | output1 | 1 x 512 x 512 x 3 | FLOAT32   | NHWC   |
-
-# Inference Environment
-
-- The model requires the following software stack.
-
-  **Table 1** Version compatibility
-  | Component         | Version          | Setup Guide |
-  | ----------------- | ---------------- | ----------- |
-  | Firmware & driver | 24.1.RC1         | [PyTorch inference environment setup](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) |
-  | CANN (+MindIE-RT) | 8.0.RC1(1.0.RC1) | -           |
-  | Python            | 3.10             | -           |
-
-  If the --FA, --TOME_num, or --faster_gelu options are used when optimizing the model, install the MindIE package matching the CANN version.
-
-- Performance of this model is sensitive to the CPU; setting the CPU to performance mode is recommended for best results.
-
-
-# Quick Start
-
-## Getting the Source Code
-1. Clone this repository.
-
-   ```
-   git clone https://gitee.com/ascend/ModelZoo-PyTorch.git
-   cd ModelZoo-PyTorch/ACL_PyTorch/built-in/foundation_models/stable_diffusion
-   ```
-
-2. Install the dependencies.
-   ```bash
-   pip3 install -r requirements.txt
-   ```
-
-3. Patch the CLIP code.
-
-   Run:
-
-   ```bash
-   python3 stable_diffusion_clip_patch.py
-   ```
-
-4. Install the Ascend inference tools.
-
-   1. Install the [ais_bench inference tool](https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench) following its README.
-
-   2. Install the debug surgeon tool from the [msit repository](https://gitee.com/ascend/msit/tree/master/msit/) following its README.
-
-## Preparing the Dataset
-
-1. Obtain the original dataset.
-
-   This model generates images from input text, so no dataset is required.
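-
-   The only input is a plain-text prompt file such as `prompts.txt` in this repository, with one prompt per line. As a rough illustration (this helper is hypothetical and not part of the repository scripts), such a file can be read and grouped into batches like this:
-
-   ```python
-   # Hypothetical helper: read one prompt per line and group them into batches of size batch_size.
-   def load_prompts(path, batch_size):
-       with open(path, "r", encoding="utf-8") as f:
-           prompts = [line.strip() for line in f if line.strip()]
-       return [prompts[i:i + batch_size] for i in range(0, len(prompts), batch_size)]
-
-
-   if __name__ == "__main__":
-       for batch in load_prompts("./prompts.txt", 1):
-           print(batch)
-   ```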
-## Model Inference
-
-1. Model conversion.
-   Use PyTorch to export the model weights (.pth) to .onnx files, then use the ATC tool to convert the .onnx files into offline inference models (.om).
-
-   1. Obtain the weights (optional).
-
-      The weights can be downloaded in advance to avoid download failures in the later steps.
-
-      ```bash
-      # git-lfs is required (https://git-lfs.com)
-      git lfs install
-
-      # v1.5
-      git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
-
-      # v2.1
-      git clone https://huggingface.co/stabilityai/stable-diffusion-2-1-base
-      ```
-
-   2. Export the ONNX models.
-
-      Set the model name or path:
-      ```bash
-      # v1.5 (download weights at run time)
-      model_base="runwayml/stable-diffusion-v1-5"
-
-      # v1.5 (use the weights downloaded in the previous step)
-      model_base="./stable-diffusion-v1-5"
-
-      # v2.1 (download weights at run time)
-      model_base="stabilityai/stable-diffusion-2-1-base"
-
-      # v2.1 (use the weights downloaded in the previous step)
-      model_base="./stable-diffusion-2-1-base"
-      ```
-
-      Note: if the hardware allows, this model can run inference on two chips in parallel for a shorter end-to-end latency. The command differences are pointed out separately in the following steps.
-
-      Run:
-
-      ```bash
-      # set the model batch size
-      bs=1
-
-      python3 stable_diffusion_2_onnx.py --model ${model_base} --output_dir ./models_bs${bs} --batch_size ${bs}
-
-      # parallel scheme
-      python3 stable_diffusion_2_onnx.py --model ${model_base} --output_dir ./models_bs${bs} --batch_size ${bs} --parallel
-      ```
-
-      Parameters:
-      - --model: model name or path to the local model directory
-      - --output_dir: output directory for the ONNX models
-      - --batch_size: model batch size
-      - --parallel: export models for the parallel scheme
-
-      The following ONNX models are generated on success:
-      - models_bs${bs}/clip/clip.onnx
-      - models_bs${bs}/unet/unet.onnx
-      - models_bs${bs}/vae/vae.onnx
-
-   3. Optimize the ONNX models.
-
-      1. Quantization (optional; it can improve performance on Duo/Pro cards but may reduce accuracy).
-
-         See the [quantization guide](./Readme_quant.md).
-
-      2. Model optimization.
-
-         Run the modify_onnx.py script.
-
-         Without quantization, set TOME_num to 5 for the best performance gain. With quantization, TOME_num of 4 is recommended for a good balance of accuracy and performance.
-         ```bash
-         # non-quantized model
-         python3 modify_onnx.py \
-             --model models_bs${bs}/unet/unet.onnx \
-             --new_model models_bs${bs}/unet/unet_md.onnx \
-             --FA_soc Duo \
-             --TOME_num 5 \
-             --faster_gelu
-
-         # quantized model
-         python3 modify_onnx.py \
-             --model models_bs${bs}/unet_quant/unet.onnx \
-             --new_model models_bs${bs}/unet/unet_md.onnx \
-             --FA_soc Duo \
-             --TOME_num 4 \
-             --faster_gelu
-         ```
-         Parameters:
-         - --model: path of the ONNX model.
-         - --new_model: path of the optimized ONNX model to generate.
-         - --FA_soc: hardware type for the FA operator. FlashAttention currently supports Atlas 300I Duo/Pro and Atlas 800I A2; set it to Duo or A2 according to your hardware, and to None on unsupported hardware. Default: None.
-         - --TOME_num: number of TOME plugins to insert, valid range [0, 5]. The TOME plugin currently supports Atlas 300I Duo/Pro and Atlas 800I A2; set it to 0 on unsupported hardware. Default: 0.
-         - --faster_gelu: use the fused slice+gelu operator.
-
-         The FA, TOME, and Gelu fusion operators are provided by the inference engine package (MindIE) matching your CANN version. If the inference engine is not installed, or your version does not support the FA, TOME, or SliceGelu operators, leave FA_soc and TOME_num at their defaults and do not set faster_gelu.
-
-      3. Use the cache scheme (optional; it can improve performance but may reduce accuracy).
-
-         Run the unet_cache.py script.
-         ```bash
-         python3 unet_cache.py --model models_bs${bs}/unet/unet_md.onnx --save_dir models_bs${bs}/unet/
-         ```
-         Parameters:
-         - --model: path of the optimized ONNX model.
-         - --save_dir: directory to save the cache models.
-
-         After it completes, unet_cache.onnx and unet_skip.onnx are generated under save_dir.
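-
-         Conceptually, the cache scheme splits the UNet into a full model (unet_cache) that also emits intermediate features, and a shallow model (unet_skip) that reuses those features on "skipped" denoising steps (see pipeline_ascend_stable_diffusion.py later in this diff). The sketch below is schematic only, with stand-in functions rather than the repository's actual session API:
-
-         ```python
-         import numpy as np
-
-         def run_cache_unet(latent, t, emb):
-             # stand-in for the full UNet (unet_cache.om): returns noise prediction and a feature cache
-             return np.zeros_like(latent), np.ones((1, 4), dtype=np.float32)
-
-         def run_skip_unet(latent, t, emb, cache):
-             # stand-in for the shallow UNet (unet_skip.om): reuses the cached features
-             return np.zeros_like(latent)
-
-         def denoise(skip_status, timesteps, latent, emb):
-             cache = None
-             for skip, t in zip(skip_status, timesteps):
-                 if skip and cache is not None:
-                     noise_pred = run_skip_unet(latent, t, emb, cache)
-                 else:
-                     noise_pred, cache = run_cache_unet(latent, t, emb)
-                 latent = latent - 0.1 * noise_pred  # stand-in for the scheduler update
-             return latent
-
-         if __name__ == "__main__":
-             latent = np.zeros((1, 4, 64, 64), dtype=np.float32)
-             out = denoise([0, 1, 0, 1], range(4), latent, emb=None)
-             print(out.shape)
-         ```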
-   4. Convert the ONNX models to OM models using the ATC tool.
-
-      1. Configure the environment variables.
-
-         ```bash
-         source /usr/local/Ascend/ascend-toolkit/set_env.sh
-
-         # if the inference engine operator package is installed, also source its environment
-         source /usr/local/Ascend/mindie-rt/set_env.sh
-         ```
-
-         > **Note:**
-         > The environment variables above are for reference only; configure them according to your actual installation. For details, see the [CANN Auxiliary Development Tools Guide (Inference)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools).
-
-      2. Query the chip name (${chip_name}).
-
-         ```
-         npu-smi info
-         # the chip of this device is Ascend310P3 (replace with your own)
-         Example output:
-         +-------------------+-----------------+------------------------------------------------------+
-         | NPU   Name        | Health          | Power(W)   Temp(C)           Hugepages-Usage(page)   |
-         | Chip  Device      | Bus-Id          | AICore(%)  Memory-Usage(MB)                          |
-         +===================+=================+======================================================+
-         | 0     310P3       | OK              | 15.8       42                0    / 0                |
-         | 0     0           | 0000:82:00.0    | 0          1074 / 21534                              |
-         +===================+=================+======================================================+
-         | 1     310P3       | OK              | 15.4       43                0    / 0                |
-         | 0     1           | 0000:89:00.0    | 0          1070 / 21534                              |
-         +===================+=================+======================================================+
-         ```
-
-      3. Run the ATC commands.
-
-         ```bash
-         # clip
-         atc --framework=5 \
-             --model=./models_bs${bs}/clip/clip.onnx \
-             --output=./models_bs${bs}/clip/clip \
-             --input_format=ND \
-             --log=error \
-             --soc_version=Ascend${chip_name}
-
-         # unet
-         cd ./models_bs${bs}/unet/
-
-         # without the cache scheme
-         atc --framework=5 \
-             --model=./unet_md.onnx \
-             --output=./unet \
-             --input_format=NCHW \
-             --log=error \
-             --optypelist_for_implmode="Gelu,Sigmoid" \
-             --op_select_implmode=high_performance \
-             --soc_version=Ascend${chip_name}
-
-         # with the cache scheme
-         atc --framework=5 \
-             --model=./unet_cache.onnx \
-             --output=./unet_cache \
-             --input_format=NCHW \
-             --log=error \
-             --optypelist_for_implmode="Gelu,Sigmoid" \
-             --op_select_implmode=high_performance \
-             --soc_version=Ascend${chip_name}
-
-         atc --framework=5 \
-             --model=./unet_skip.onnx \
-             --output=./unet_skip \
-             --input_format=NCHW \
-             --log=error \
-             --optypelist_for_implmode="Gelu,Sigmoid" \
-             --op_select_implmode=high_performance \
-             --soc_version=Ascend${chip_name}
-
-         cd ../../
-
-         # vae
-         atc --framework=5 \
-             --model=./models_bs${bs}/vae/vae.onnx \
-             --output=./models_bs${bs}/vae/vae \
-             --input_format=NCHW \
-             --log=error \
-             --soc_version=Ascend${chip_name}
-         ```
-
-         Parameters:
-         - --model: the ONNX model file.
-         - --output: the output OM model.
-         - --framework: 5 means an ONNX model.
-         - --log: log level.
-         - --soc_version: processor model.
-         - --input_shape: input shape information of the model.
-
-
-         The following OM models are generated on success:
-
-         - models_bs${bs}/clip/clip.om
-         - models_bs${bs}/unet/unet.om
-         - models_bs${bs}/unet/unet_cache.om
-         - models_bs${bs}/unet/unet_skip.om
-         - models_bs${bs}/vae/vae.om
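-
-         As an optional sanity check, a converted model can be fed dummy data through the ais_bench Python API used elsewhere in this repository. The input shape and dtype below follow the I/O table above and may need adjusting to match the actual export:
-
-         ```python
-         import numpy as np
-         from ais_bench.infer.interface import InferSession
-
-         # load clip.om on device 0 (path follows the directory layout created above)
-         session = InferSession(0, "./models_bs1/clip/clip.om")
-
-         # dummy token ids with the documented 1 x 77 input shape
-         dummy_ids = np.zeros((1, 77), dtype=np.int64)
-         outputs = session.infer([dummy_ids])
-         print([o.shape for o in outputs])
-         ```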
-2. Run inference and verification.
-
-   1. Install the core-binding tool and bind the task process to its affine NUMA node to rule out CPU-side effects.
-
-      Install the binding tool:
-
-      ```shell
-      yum install numactl
-      ```
-      Query the NUMA node of the card:
-
-      ```shell
-      lspci -vs bus-id
-      ```
-      The bus-id can be obtained from npu-smi info. Once the NUMA node is known, prefix the inference command with the corresponding core number.
-
-      The CPU cores belonging to each NUMA node can be obtained with lscpu:
-
-      ```shell
-      NUMA node0: 0-23
-      NUMA node1: 24-47
-      NUMA node2: 48-71
-      NUMA node3: 72-95
-      ```
-      Here the queried NUMA node is 0, which maps to cores 0-23; binding to a single core among them is recommended for better performance.
-
-   2. Run the inference script.
-
-      ```bash
-      # normal mode
-      numactl -C 0 python3 stable_diffusion_ascend_infer.py \
-              --model ${model_base} \
-              --model_dir ./models_bs${bs} \
-              --prompt_file ./prompts.txt \
-              --device 0 \
-              --save_dir ./results \
-              --batch_size ${bs} \
-              --steps 50 \
-              --use_cache
-
-      # parallel mode
-      numactl -C 0 python3 stable_diffusion_ascend_infer.py \
-              --model ${model_base} \
-              --model_dir ./models_bs${bs} \
-              --prompt_file ./prompts.txt \
-              --device 0,1 \
-              --save_dir ./results \
-              --batch_size ${bs} \
-              --steps 50 \
-              --use_cache
-      ```
-
-      Parameters:
-      - --model: model name or path to the local model directory.
-      - --model_dir: directory containing the exported models.
-      - --prompt_file: input text file, one prompt per line.
-      - --save_dir: directory for the generated images.
-      - --batch_size: model batch size.
-      - --steps: number of denoising iterations.
-      - --device: inference device ID; two IDs separated by a comma enable the parallel scheme.
-      - --use_cache: use the cache scheme during inference.
-      - --cache_steps: number of cached iterations; more cached steps improve performance, but too many may reduce accuracy.
-
-      When it finishes, the generated images are stored in `./results` and the inference time is printed, for example:
-
-      ```
-      [info] infer number: 16; use time: 292.648s; average time: 18.290s
-      ```
-      *Note*:
-      - On ARM machines, if an error like `*torch*.so*: cannot allocate memory in static TLS block` occurs, add an environment variable pointing to the reported .so:
-      ```bash
-      export LD_PRELOAD=/path/of/the/reported.so:$LD_PRELOAD
-      ```
-
-   3. Sample images are shown in the `./test_results` directory. Note that generated images differ from run to run. Some of the test results:
-
-      ![](./test_results/illustration_0.png)
-      Prompt: "Beautiful illustration of The ocean. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper"
-
-      ![](./test_results/illustration_1.png)
-      Prompt: "Beautiful illustration of Islands in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper"
-
-      ![](./test_results/illustration_2.png)
-      Prompt: "Beautiful illustration of Seaports in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper"
-
-## Accuracy Validation
-
-   Because the generated images are non-deterministic, accuracy is evaluated with the CLIP score, which measures the correlation between an image and its input text. Scores range over [-1, 1]; higher is better.
-
-   Note that a full accuracy validation generates a large number of images and therefore takes a long time.
-
-   1. Download the Parti dataset.
-
-      ```bash
-      wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate
-      ```
-
-   2. Download the CLIP model weights.
-
-      ```bash
-      GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
-      cd ./CLIP-ViT-H-14-laion2B-s32B-b79K
-
-      # download with git-lfs
-      git lfs pull
-
-      # or visit https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin, download the weights and place them in this directory
-      ```
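-
-      As an optional check that the weights downloaded correctly, they can be loaded (from the repository root) with the same open_clip calls that clip_score.py uses; the model name and path below match that script's defaults:
-
-      ```python
-      import open_clip
-      import torch
-
-      # load the CLIP model and its tokenizer from the downloaded weights
-      model, _, preprocess = open_clip.create_model_and_transforms(
-          "ViT-H-14",
-          pretrained="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin",
-      )
-      tokenizer = open_clip.get_tokenizer("ViT-H-14")
-
-      with torch.no_grad():
-          text_features = model.encode_text(tokenizer(["a test prompt"]))
-      print(text_features.shape)  # 1 x 1024 for ViT-H-14
-      ```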
-   3. Use the inference script to read the Parti dataset and generate images.
-      ```bash
-      # normal mode
-      numactl -C 0 python3 stable_diffusion_ascend_infer.py \
-              --model ${model_base} \
-              --model_dir ./models_bs${bs} \
-              --prompt_file ./PartiPrompts.tsv \
-              --prompt_file_type parti \
-              --num_images_per_prompt 4 \
-              --max_num_prompts 0 \
-              --device 0 \
-              --save_dir ./results \
-              --batch_size ${bs} \
-              --steps 50 \
-              --use_cache
-
-      # parallel mode
-      numactl -C 0 python3 stable_diffusion_ascend_infer.py \
-              --model ${model_base} \
-              --model_dir ./models_bs${bs} \
-              --prompt_file ./PartiPrompts.tsv \
-              --prompt_file_type parti \
-              --num_images_per_prompt 4 \
-              --max_num_prompts 0 \
-              --device 0,1 \
-              --save_dir ./results \
-              --batch_size ${bs} \
-              --steps 50 \
-              --use_cache
-      ```
-
-      Parameters:
-      - --model: model name or path to the local model directory.
-      - --model_dir: directory containing the exported models.
-      - --prompt_file: input text file, one prompt per line.
-      - --prompt_file_type: prompt file type, used to select the reader.
-      - --num_images_per_prompt: number of images generated per prompt.
-      - --max_num_prompts: limit processing to the first X prompts; 0 means no limit.
-      - --save_dir: directory for the generated images.
-      - --batch_size: model batch size.
-      - --steps: number of denoising iterations.
-      - --device: inference device ID; two IDs separated by a comma enable the parallel scheme.
-      - --use_cache: use the cache scheme during inference; more cached steps improve performance, but too many may reduce accuracy.
-
-      When it finishes, the generated images are stored in `./results` and an `image_info.json` file recording the mapping between images and prompts is written to the current directory.
-
-   4. Compute the CLIP score.
-
-      ```bash
-      python clip_score.py \
-             --device=cpu \
-             --image_info="image_info.json" \
-             --model_name="ViT-H-14" \
-             --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
-      ```
-
-      Parameters:
-      - --device: inference device.
-      - --image_info: the `image_info.json` file generated in the previous step.
-      - --model_name: CLIP model name.
-      - --model_weights_path: path of the CLIP model weights.
-
-      The accuracy results are printed to the screen when the script finishes.
-
-
-# Inference Performance & Accuracy
-
-Inference is performed through the ACL interface; reference performance is listed below.
-
-### StableDiffusion v2.1
-
-| Accelerator | Server | Scheme | Optimizations | Steps | Average Latency |
-| :------: | :--: | :--: | :--: | :--: | :--------: |
-| Atlas 300I Duo | Atlas 800 3000 + 2 processors, 48 cores @ 3.0 GHz each | parallel | FA+TOME*5+faster_gelu+cache | 50 | 1.513 s |
-
-Reference accuracy at 50 iterations:
-
- ```
- average score: 0.379
- category average scores:
- [Abstract], average score: 0.285
- [Vehicles], average score: 0.379
- [Illustrations], average score: 0.378
- [Arts], average score: 0.425
- [World Knowledge], average score: 0.388
- [People], average score: 0.382
- [Animals], average score: 0.389
- [Artifacts], average score: 0.374
- [Food & Beverage], average score: 0.367
- [Produce & Plants], average score: 0.367
- [Outdoor Scenes], average score: 0.372
- [Indoor Scenes], average score: 0.382
- ```
-
-# Public Network Address Statement
-The public network addresses referenced in the code are listed in public_address_statement.md
diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/Readme_quant.md b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/Readme_quant.md
deleted file mode 100644
index 42017fed8f3e2aca09e75c02925d43f29d726ffa..0000000000000000000000000000000000000000
--- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/Readme_quant.md
+++ /dev/null
@@ -1,120 +0,0 @@
-# UNet Model Quantization Guide
-
-## Environment Setup
-```bash
-# device used for quantization
-export DEVICE_ID=0
-
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
-```
-
-> **Note:**
-> The environment variables above are for reference only; configure them according to your actual installation. For details, see the [CANN Auxiliary Development Tools Guide (Inference)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools).
-
-## Quantization
-
-Calibration can use either dummy data or real data. Real-data calibration gives higher quantization accuracy, but requires one inference pass to collect the real data.
-
-### Dummy-Data Calibration
-
-Run the quant_unet.py script to quantize:
-
-```bash
-python3 quant_unet.py \
-        --model ${model_base} \
-        --model_dir ./models_bs${bs} \
-        --prompt_file ./prompts.txt \
-        --save_path unet_quant \
-        --data_free
-```
-Parameters:
-- --model: model name or path to the local model directory.
-- --model_dir: directory containing the exported models.
-- --prompt_file: input text file, one prompt per line.
-- --save_path: directory for the quantized model, given as a sub-folder name under model_dir.
-- --data_free: use dummy calibration data.
-
-On success, the `models_bs${bs}/unet_quant` folder is generated, containing the unet.onnx model and its weights.
-
-### Real-Data Calibration
-1. Convert the ONNX models to OM models using the ATC tool.
-
-   1. Query the chip name (${chip_name}).
-
-      ```
-      npu-smi info
-      # the chip of this device is Ascend310P3 (replace with your own)
-      Example output:
-      +-------------------+-----------------+------------------------------------------------------+
-      | NPU   Name        | Health          | Power(W)   Temp(C)           Hugepages-Usage(page)   |
-      | Chip  Device      | Bus-Id          | AICore(%)  Memory-Usage(MB)                          |
-      +===================+=================+======================================================+
-      | 0     310P3       | OK              | 15.8       42                0    / 0                |
-      | 0     0           | 0000:82:00.0    | 0          1074 / 21534                              |
-      +===================+=================+======================================================+
-      | 1     310P3       | OK              | 15.4       43                0    / 0                |
-      | 0     1           | 0000:89:00.0    | 0          1070 / 21534                              |
-      +===================+=================+======================================================+
-      ```
-
-   2. Run the ATC commands.
-
-      ```bash
-      # clip
-      atc --framework=5 \
-          --model=./models_bs${bs}/clip/clip.onnx \
-          --output=./models_bs${bs}/clip/clip \
-          --input_format=ND \
-          --log=error \
-          --soc_version=Ascend${chip_name}
-
-      # unet
-      cd ./models_bs${bs}/unet/
-
-      atc --framework=5 \
-          --model=./unet.onnx \
-          --output=./unet \
-          --input_format=NCHW \
-          --log=error \
-          --soc_version=Ascend${chip_name}
-
-      cd ../../
-      ```
-      Parameters:
-      - --model: the ONNX model file.
-      - --output: the output OM model.
-      - --framework: 5 means an ONNX model.
-      - --log: log level.
-      - --soc_version: processor model.
-
-      On success, the `models_bs${bs}/clip/clip.om` and `models_bs${bs}/unet/unet.om` files are generated.
-
-2. Run the quantization.
-
-   Run the quant_unet.py script:
-
-   ```bash
-   # normal mode
-   python3 quant_unet.py \
-           --model ${model_base} \
-           --model_dir ./models_bs${bs} \
-           --prompt_file ./prompts.txt \
-           --device 0 \
-           --save_path unet_quant
-
-   # parallel mode
-   python3 quant_unet.py \
-           --model ${model_base} \
-           --model_dir ./models_bs${bs} \
-           --prompt_file ./prompts.txt \
-           --device 0,1 \
-           --save_path unet_quant
-   ```
-   Parameters:
-   - --model: model name or path to the local model directory.
-   - --model_dir: directory containing the exported models.
-   - --prompt_file: input text file, one prompt per line.
-   - --save_path: folder for the quantized model, given as a sub-folder name under model_dir.
-   - --device: inference device ID; two IDs separated by a comma enable the parallel scheme.
-
-   On success, the `models_bs${bs}/unet_quant` folder is generated, containing the unet.onnx model and its weights.
diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/background_session.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/background_session.py
deleted file mode 100644
index 30f1e52d3a0de7999bd9ad2aa04cc57bb83bfc0d..0000000000000000000000000000000000000000
--- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/background_session.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright 2023 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
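-
-# NOTE: this module runs a second OM session in a separate worker process so that,
-# in the parallel scheme, the two classifier-free-guidance branches can execute on
-# two devices at the same time. The parent and the worker exchange tensors through
-# shared-memory buffers (mp.RawArray) and synchronize over a duplex Pipe; the
-# 'cache'/'skip' messages select which UNet session (unet_cache / unet_skip) the
-# worker executes.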
- -import multiprocessing as mp -from dataclasses import dataclass -from typing import List, Optional - -import numpy as np -import aclruntime -from ais_bench.infer.interface import InferSession - - -@dataclass -class SessionIOInfo: - input_shapes: List[tuple] - input_dtypes: List[type] - output_shapes: List[tuple] - output_dtypes: List[type] - - -@dataclass -class BackgroundInferSessionOptions: - device_id: int - model_path: List[str] - io_info: SessionIOInfo - acl_json_path: Optional[str] = None - debug: Optional[bool] = False - loop: Optional[int] = 1 - - -class BackgroundInferSession: - def __init__( - self, - device_id: int, - model_path: str, - io_info: SessionIOInfo, - ): - # Create a pipe for process synchronization - self.sync_pipe, sync_pipe_peer = mp.Pipe(duplex=True) - - # Create shared buffers - input_spaces = self.create_shared_buffers(io_info.input_shapes, io_info.input_dtypes) - output_spaces = self.create_shared_buffers(io_info.output_shapes, io_info.output_dtypes) - - # Build numpy arrays on the shared buffers - self.input_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(input_spaces, io_info.input_shapes, io_info.input_dtypes)] - self.output_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(output_spaces, io_info.output_shapes, io_info.output_dtypes)] - - mp.set_start_method('forkserver', force=True) - self.p = mp.Process( - target=self.run_session, - args=[sync_pipe_peer, input_spaces, output_spaces, - io_info, device_id, model_path] - ) - self.p.start() - - # Wait until the sub process is ready - self.wait() - - def infer_asyn(self, feeds: List[np.ndarray], skip=0) -> None: - for i in range(len(self.input_arrays)): - self.input_arrays[i][:] = feeds[i][:] - - if skip: - self.sync_pipe.send('skip') - else: - self.sync_pipe.send('cache') - - def wait(self) -> None: - self.sync_pipe.recv() - - def get_outputs(self) -> List[np.ndarray]: - return self.output_arrays - - def wait_and_get_outputs(self) -> List[np.ndarray]: - self.wait() - return self.get_outputs() - - def infer(self, feeds: List[np.ndarray]) -> List[np.ndarray]: - # This function should work as same as InferSession.infer() - self.infer_asyn(feeds) - return self.wait_and_get_outputs() - - def stop(self): - # Stop the sub process - self.p.terminate() - - @classmethod - def clone( - cls, - session: InferSession, - device_id: int, - model_path: List[str]) -> 'BackgroundInferSession': - # Get shapes, datatypes, and model path from an existed InferSession, - # then use them to create a BackgroundInferSession - io_info = cls.get_io_info_from_session(session) - io_info.output_shapes = [io_info.output_shapes[0]] - io_info.output_dtypes = [io_info.output_dtypes[0]] - - return cls(device_id, model_path, io_info) - - @staticmethod - def get_io_info_from_session(session: InferSession) -> SessionIOInfo: - # Map aclruntime datatype to numpy datatype - np_types = (np.float32, np.float16, np.int8, np.int32, - np.uint8, '', np.int16, np.uint16, np.uint32, - np.int64, np.uint64) - - # Get input shapes and datatypes - inputs = session.get_inputs() - input_shapes = [t.shape for t in inputs] - input_dtypes = [np_types[t.datatype] for t in inputs] - - # Get output shapes and datatypes - outputs = session.get_outputs() - output_shapes = [t.shape for t in outputs] - output_dtypes = [np_types[t.datatype] for t in outputs] - - return SessionIOInfo(input_shapes, input_dtypes, - output_shapes, output_dtypes) - - @staticmethod - def create_shared_buffers(shapes: List[tuple], dtypes: List[type]) -> 
List[mp.RawArray]: - buffers = [] - for shape, dtype in zip(shapes, dtypes): - size = 1 - for x in shape: - size *= x - - raw_array = mp.RawArray(np.ctypeslib.as_ctypes_type(dtype), size) - buffers.append(raw_array) - - return buffers - - @staticmethod - def run_session( - sync_pipe: mp.connection.Connection, - input_spaces: List[np.ndarray], - output_spaces: List[np.ndarray], - io_info: SessionIOInfo, - device_id: int, - model_path: list, - ) -> None: - # The sub process function - - # Create an InferSession - session_cache = aclruntime.InferenceSession( - model_path[0], - device_id, - aclruntime.session_options() - ) - if model_path[1]: - session_skip = aclruntime.InferenceSession( - model_path[1], - device_id, - aclruntime.session_options() - ) - - # Build numpy arrays on the shared buffers - input_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(input_spaces, io_info.input_shapes, io_info.input_dtypes)] - - output_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(output_spaces, io_info.output_shapes, io_info.output_dtypes)] - - # Tell the main function that we are ready - sync_pipe.send('') - - # Keep looping until recived a 'STOP' - while True: - flag = sync_pipe.recv() - if flag == 'cache': - feeds = {} - inputs = session_cache.get_inputs() - for i in range(len(input_arrays)): - feed = aclruntime.Tensor(input_arrays[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - out_names = [out.name for out in session_cache.get_outputs()] - - outputs = session_cache.run(out_names, feeds) - if len(outputs) > 1: - cache = outputs[1] - else: - feeds = {} - inputs = session_skip.get_inputs() - for i in range(len(input_arrays)): - feed = aclruntime.Tensor(input_arrays[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - feeds[inputs[-1].name] = cache - out_names = [out.name for out in session_skip.get_outputs()] - - outputs = session_skip.run(out_names, feeds) - outputs[0].to_host() - output = np.array(outputs[0]) - for i in range(len(output_arrays)): - output_arrays[i][:] = output[:] - - sync_pipe.send('') diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/clip.patch b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/clip.patch deleted file mode 100644 index e3e4719b66f771ebb660f25151c33d140566c3f3..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/clip.patch +++ /dev/null @@ -1,10 +0,0 @@ -22a23 -> import numpy as np -760c761,762 -< mask.triu_(1) # zero out the lower diagonal ---- -> # mask.triu_(1) # zero out the lower diagonal -> mask = torch.from_numpy(np.triu(mask.numpy(), 1)) -1324a1327 -> - diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/clip_score.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/clip_score.py deleted file mode 100644 index 069f5d6e9a9baaa61b9a3537bcab6f637605858e..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/clip_score.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import argparse - -import open_clip -import numpy as np -from PIL import Image -import torch -import torch.nn.functional as F - - -def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): - imgs = [] - texts = [] - for image_file in image_files: - img = preprocess(Image.open(image_file)).unsqueeze(0).to(device) - imgs.append(img) - text = tokenizer([prompt]).to(device) - texts.append(text) - - img = torch.cat(imgs) # [bs, 3, 224, 224] - text = torch.cat(texts) # [bs, 77] - - with torch.no_grad(): - text_ft = model_clip.encode_text(text).float() - img_ft = model_clip.encode_image(img).float() - score = F.cosine_similarity(img_ft, text_ft).squeeze() - - return score.cpu() - - -def main(): - args = parse_arguments() - - if args.device is None: - device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') - else: - device = torch.device(args.device) - - t_b = time.time() - print(f"Load clip model...") - model_clip, _, preprocess = open_clip.create_model_and_transforms( - args.model_name, pretrained=args.model_weights_path, device=device) - model_clip.eval() - print(f">done. elapsed time: {(time.time() - t_b):.3f} s") - - tokenizer = open_clip.get_tokenizer(args.model_name) - - with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: - image_info = json.load(f) - - t_b = time.time() - print(f"Calc clip score...") - all_scores = [] - cat_scores = {} - - for i, info in enumerate(image_info): - image_files = info['images'] - category = info['category'] - prompt = info['prompt'] - - print(f"[{i + 1}/{len(image_info)}] {prompt}") - - image_scores = clip_score(model_clip, - tokenizer, - preprocess, - prompt, - image_files, - device) - if len(image_files) > 1: - best_score = max(image_scores) - else: - best_score = image_scores - - print(f"image scores: {image_scores}") - print(f"best score: {best_score}") - - all_scores.append(best_score) - if category not in cat_scores: - cat_scores[category] = [] - cat_scores[category].append(best_score) - print(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") - - average_score = np.average(all_scores) - print(f"====================================") - print(f"average score: {average_score:.3f}") - print(f"category average scores:") - cat_average_scores = {} - for category, scores in cat_scores.items(): - cat_average_scores[category] = np.average(scores) - print(f"[{category}], average score: {cat_average_scores[category]:.3f}") - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--device", - type=str, - default="cpu", - choices=["cpu", "cuda"], - help="device for torch.", - ) - parser.add_argument( - "--image_info", - type=str, - default="./image_info.json", - help="Image_info.json file.", - ) - parser.add_argument( - "--model_name", - type=str, - default="ViT-H-14", - help="open clip model name", - ) - parser.add_argument( - "--model_weights_path", - type=str, - default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", - help="open clip model weights", - ) - return parser.parse_args() - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/modelzoo_level.txt b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/modelzoo_level.txt deleted file mode 100755 index bab92903cfc388d00deb4af63d1c4b19033ab4f8..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/modelzoo_level.txt +++ /dev/null @@ -1,4 +0,0 @@ -FuncStatus:OK -PerfStatus:PERFECT -PrecisionStatus:OK -ModelConvert:OK diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/modify_onnx.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/modify_onnx.py deleted file mode 100644 index a36fbcd62b681007e0cf25508b2ad3967c2151e4..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/modify_onnx.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -import numpy as np -from auto_optimizer import OnnxGraph - - -def del_add(model): - init = [n.name for n in model.get_nodes('Initializer')] - for node in model.get_nodes('Add'): - if 'attn' in node.name and node.inputs[1] in init: - value = model[node.inputs[1]].value - if (value == 0).all(): - model.remove(node.name) - - -def add_flash_attention(model, fa_name, soc_type): - for node in model.get_nodes('Mul'): - name = node.name - if soc_type == 1: - flag = 'attn' in name - else: - flag = 'attn1' in name - if flag: - matmul = model[name[:-3] + 'to_q/MatMul'] - reshape = model[name[:-3] + 'Reshape'] - if soc_type == 2 and model[reshape.inputs[1]].value[1] != 4096: - continue - softmax_node = model.get_next_nodes(node.outputs[0])[0] - if soc_type == 1: - # move mul to q - softmax_node.inputs[0] = node.inputs[0] - node.inputs[0] = matmul.outputs[0] - reshape.inputs[0] = node.outputs[0] - - # add flashattention - new_node = model.add_node(name[:-3] + fa_name, fa_name) - inputs = [None, None, None] - # input 0: q - if soc_type == 1: - matmul_node = model.get_prev_node(softmax_node.inputs[0]) - if soc_type == 2: - matmul_node = model.get_prev_node(node.inputs[0]) - inputs[0] = matmul_node.inputs[0] - # input 1: k - transpose_node = model.get_prev_node(matmul_node.inputs[1]) - inputs[1] = transpose_node.inputs[0] - # input 2: v - cast_node = model.get_next_nodes(softmax_node.outputs[0])[0] - last_node = model.get_next_nodes(cast_node.outputs[0])[0] - inputs[2] = last_node.inputs[1] - # output - outputs = last_node.outputs - # update link - new_node.inputs = inputs - new_node.outputs = outputs - - model.remove(matmul_node.name, {}) - model.remove(transpose_node.name, {}) - model.remove(softmax_node.name, {}) - model.remove(cast_node.name, {}) - model.remove(last_node.name, {}) - model.update_map() - for node in model.get_nodes(fa_name): - for _ in range(soc_type): - for i in range(3): - prev_node = model.get_prev_node(node.inputs[i]) - model.remove(prev_node.name) - next_node = model.get_next_nodes(node.outputs[0])[0] - model.remove(next_node.name) - if soc_type == 2: - name = node.name.replace(fa_name, 'Cast') - cast = model.add_node(name, 'Cast', attrs={'to': 1}) - model.insert_node(node.name, cast) - - -def change_input_type(model): - model.remove('t') - model.add_input('t', 'int32', [1]) - model.inputs[1], model.inputs[2] = model.inputs[2], model.inputs[1] - - -def get_index(model, init, name): - if name in init: - return model[name].value - else: - return name - - -def replace_slice(model, fast): - # find pairs of slice - slice_pair = [] - for node in model.get_nodes('Slice'): - if node.name[-2:] == '_1': - slice_pair.append((model[node.name[:-2]], model[node.name])) - # replace - init = [n.name for n in model.get_nodes('Initializer')] - for pair in slice_pair: - next_node = model.get_next_nodes(pair[0].outputs[0])[0] - if fast and next_node.op_type == 'Mul': - name = pair[0].name[:-5] + 'SliceTransGeluMul' - model.add_node(name, 'SliceTransGeluMul', inputs=[pair[0].inputs[0]], outputs=next_node.outputs) - model.remove(next_node.name, {}) - else: - name = pair[0].name[:-5] + 'Split' - data = pair[0].inputs[0] - start_0 = get_index(model, init, pair[0].inputs[1]) - end_0 = get_index(model, init, pair[0].inputs[2]) - start_1 = get_index(model, init, pair[1].inputs[1]) - end_1 = get_index(model, init, pair[1].inputs[2]) - if start_1 == end_0: - outputs = pair[0].outputs + pair[1].outputs - elif start_0 == end_1: - outputs = pair[1].outputs + pair[0].outputs - - axes = 
pair[0].inputs[3] - axis = model[axes].value[0] - model.add_node(name, 'Split', inputs=[data], outputs=outputs, attrs={'axis': axis}) - model.remove(pair[0].name, {}) - model.remove(pair[1].name, {}) - model.update_map() - - -def build_index(h, w, sy=2, sx=2): - # random select one from a 2x2 block - hsy = h // sy - wsx = w // sx - rand_idx = np.random.randint(sy * sx, size=(hsy, wsx)) - - idx = np.ones((hsy, wsx, sy * sx), dtype=np.int64) - for i in range(hsy): - for j in range(wsx): - idx[i, j][rand_idx[i, j]] = 0 - idx = idx.reshape(hsy, wsx, sy, sx).transpose(0, 2, 1, 3) - idx_rand = idx.reshape(-1).argsort() - index_a = np.sort(idx_rand[hsy * wsx:]) - index_b = np.sort(idx_rand[:hsy * wsx]) - return index_a, index_b - - -def get_block(model): - # find self-attention block - norms = [] - for node in model.get_nodes('Add'): - next_nodes = model.get_next_nodes(node.outputs[0]) - if len(next_nodes) != 3: - continue - op_type = set(n.op_type for n in next_nodes) - if len(op_type) == 1 and 'MatMul' in op_type: - if model[node.inputs[1]].value.shape[0] == 320: - norms.append(node) - return norms - - -def find_nodes(model, node): - prev_node = model.get_prev_node(node.inputs[0]) - while prev_node.op_type != 'Sub': - prev_node = model.get_prev_node(prev_node.inputs[0]) - inp = prev_node.inputs[0] - next_nodes = model.get_next_nodes(inp) - for next_node in next_nodes: - if next_node.op_type == 'Add': - if next_node.inputs[0] == inp: - out = next_node.inputs[1] - else: - out = next_node.inputs[0] - return inp, out - - -def build_tome_block(model, name, inputs, inputs_un): - # link merge to attn - for node in model.get_next_nodes(inputs[1]): - ind = 0 - for inp in node.inputs: - if inp == inputs[1]: - node.inputs[ind] = name + 'Concat_output' - ind += 1 - # norm block - model.add_node( - name + 'Mul', - 'Mul', - inputs=[inputs[0], inputs[0]], - outputs=[name + 'Mul_output'] - ) - model.add_node( - name + 'ReduceSum', - 'ReduceSum', - inputs=[name + 'Mul_output'], - outputs=[name + 'ReduceSum_output'], - attrs={'axes': [-1], 'keepdims': 1} - ) - model.add_node( - name + 'Sqrt', - 'Sqrt', - inputs=[name + 'ReduceSum_output'], - outputs=[name + 'Sqrt_output'] - ) - model.add_node( - name + 'Div', - 'Div', - inputs=[inputs[0], name + 'Sqrt_output'], - outputs=[name + 'Div_output'] - ) - # compute similarity - model.add_node( - name + 'Gather_0', - 'Gather', - inputs=[name + 'Div_output', 'tome/Gather_index_a'], - outputs=[name + 'Gather_0_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Gather_1', - 'Gather', - inputs=[name + 'Div_output', 'tome/Gather_index_b'], - outputs=[name + 'Gather_1_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Transpose', - 'Transpose', - inputs=[name + 'Gather_1_output'], - outputs=[name + 'Transpose_output'], - attrs={'perm': [0, 2, 1]} - ) - model.add_node( - name + 'MatMul', - 'MatMul', - inputs=[name + 'Gather_0_output', name + 'Transpose_output'], - outputs=[name + 'MatMul_output'] - ) - model.add_node( - name + 'FindMax', - 'FindMax', - inputs=[name + 'MatMul_output'], - outputs=[name + 'FindMax_output_0', name + 'FindMax_output_1'], - attrs={} - ) - model.add_node( - name + 'TopK', - 'TopK', - inputs=[name + 'FindMax_output_0', 'tome/Topk_k'], - outputs=[name + 'TopK_output_0', name + 'TopK_output_1'], - attrs={'axis': -1, 'largest': 1} - ) - # split token - model.add_node( - name + 'Gather_2', - 'Gather', - inputs=[inputs[1], 'tome/Gather_index_a'], - outputs=[name + 'Gather_2_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 
'Gather_3', - 'Gather', - inputs=[inputs[1], 'tome/Gather_index_b'], - outputs=[name + 'Gather_3_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Cast_0', - 'Cast', - inputs=[name + 'Gather_2_output'], - outputs=[name + 'Cast_0_output'], - attrs={'to': 1} - ) - model.add_node( - name + 'Cast_1', - 'Cast', - inputs=[name + 'Gather_3_output'], - outputs=[name + 'Cast_1_output'], - attrs={'to': 1} - ) - # tome merge - merge_inputs = [ - name + 'Cast_0_output', - name + 'Cast_1_output', - name + 'TopK_output_1', - name + 'FindMax_output_1' - ] - merge_outputs = [ - name + 'TomeMerged_output_0', - name + 'TomeMerged_output_1', - name + 'TomeMerged_output_2' - ] - model.add_node( - name + 'TomeMerged', - 'TomeMerged', - inputs=merge_inputs, - outputs=merge_outputs - ) - model.add_node( - name + 'ReduceSum_1', - 'ReduceSum', - inputs=[name + 'TomeMerged_output_1'], - outputs=[name + 'ReduceSum_1_output'], - attrs={'axes': [1], 'keepdims': 0} - ) - model.add_node( - name + 'ReduceSum_2', - 'ReduceSum', - inputs=[name + 'TomeMerged_output_2'], - outputs=[name + 'ReduceSum_2_output'], - attrs={'axes': [1], 'keepdims': 0} - ) - model.add_node( - name + 'Unsqueeze', - 'Unsqueeze', - inputs=[name + 'ReduceSum_2_output'], - outputs=[name + 'Unsqueeze_output'], - attrs={'axes': [2]} - ) - model.add_node( - name + 'Div_1', - 'Div', - inputs=[name + 'ReduceSum_1_output', name + 'Unsqueeze_output'], - outputs=[name + 'Div_1_output'] - ) - model.add_node( - name + 'Concat', - 'Concat', - inputs=[name + 'TomeMerged_output_0', name + 'Div_1_output'], - outputs=[name + 'Concat_output'], - attrs={'axis': 1} - ) - # link unmerge to norm - for node in model.get_next_nodes(inputs_un[0]): - ind = 0 - for inp in node.inputs: - if inp == inputs_un[0]: - node.inputs[ind] = name + 'TomeUngerme_output' - ind += 1 - # add unmerge node - unmerge_inputs = inputs_un + [name + 'TopK_output_1', name + 'FindMax_output_1'] - model.add_node( - name + 'tome/TomeUnmerge', - 'TomeUnmerged', - inputs=unmerge_inputs, - outputs=[name + 'TomeUngerme_output'] - ) - model.update_map() - - -def insert_tome_block(model, max_num): - bs = model['latent_model_input'].shape[0] - h, w = model['latent_model_input'].shape[2:] - index_a, index_b = build_index(h, w) - # add initializer - model.add_initializer('tome/Gather_index_a', index_a) - model.add_initializer('tome/Gather_index_b', index_b) - bs_index_a = np.tile(index_a.reshape(1, -1), [bs, 1]) - bs_index_b = np.tile(index_b.reshape(1, -1), [bs, 1]) - model.add_initializer('tome/index_a', bs_index_a) - model.add_initializer('tome/index_b', bs_index_b) - model.add_initializer('tome/Topk_k', np.array([3072])) - # get reshape nodes - reshapes = model.get_nodes('Reshape') - # find inputs - norm_outs = get_block(model)[:max_num] - for node in norm_outs: - name = node.name.rsplit('/', 2)[0] + '/attn1/' - norm_input, sa_output = find_nodes(model, node) - inputs_0 = [norm_input] + node.outputs - inputs_1 = [sa_output] + ['tome/index_a', 'tome/index_b'] - # add tome block - build_tome_block(model, name.replace('attn', 'tome'), inputs_0, inputs_1) - # change shape of reshape - for reshape in reshapes: - if name in reshape.name: - shape = model[reshape.inputs[1]].value.copy() - ind = 0 - for size in shape: - if size == 4096: - shape[ind] = '-1' - ind += 1 - model[reshape.inputs[1]].value = shape - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - type=str, - default="models/unet/unet.onnx", - help="Path of the unet onnx model.", - ) - 
parser.add_argument( - "--new_model", - type=str, - default="models/unet/unet_md.onnx", - help="Path to save the modified model", - ) - parser.add_argument( - "--FA_soc", - choices=["None", "Duo", "A2"], - default="None", - help="Type of FA operator.", - ) - parser.add_argument( - "--TOME_num", - type=int, - default=0, - help="Number of TOME used in the model", - ) - parser.add_argument( - "--faster_gelu", - action="store_true", - help="Use specific gelu operation" - ) - return parser.parse_args() - - -def main(): - model = OnnxGraph.parse(args.model) - del_add(model) - if args.FA_soc == 'Duo': - add_flash_attention(model, 'FlashAttentionTik', soc_type=1) - elif args.FA_soc == 'A2': - add_flash_attention(model, 'UnpadFlashAttentionMix', soc_type=2) - if args.TOME_num: - insert_tome_block(model, args.TOME_num) - change_input_type(model) - replace_slice(model, args.faster_gelu) - model.remove_unused_nodes() - model.save(args.new_model) - - -if __name__ == '__main__': - args = parse_arguments() - main() - \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/pipeline_ascend_stable_diffusion.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/pipeline_ascend_stable_diffusion.py deleted file mode 100644 index 81256738bb9ccc3266c18fc21d2824ded77b12d2..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/pipeline_ascend_stable_diffusion.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, List, Optional, Union - -import torch -import numpy as np -import aclruntime -from diffusers import StableDiffusionPipeline -from ais_bench.infer.interface import InferSession - - -class AscendStableDiffusionPipeline(StableDiffusionPipeline): - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - clip_session, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `list(int)`): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). 
- """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - text_inputs = self.tokenizer(prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt") - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - print("[warning] The following part of your input was truncated" - " because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - - text_embeddings = clip_session.infer([text_input_ids.numpy()]) - text_embeddings = [torch.from_numpy(text) for text in text_embeddings] - text_embeddings = text_embeddings[0] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1) - text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError(f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError(f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer(uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt") - - uncond_embeddings = clip_session.infer([uncond_input.input_ids.numpy()]) - uncond_embeddings = [torch.from_numpy(text) for text in uncond_embeddings] - uncond_embeddings = uncond_embeddings[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1) - uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) - - return text_embeddings - - - @torch.no_grad() - def ascend_infer( - self, - prompt: Union[str, List[str]], - clip_session: InferSession, - unet_sessions: list, - vae_session: InferSession, - skip_status: List[int], - device_id: int = 0, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (畏) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator`, *optional*): - A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation - deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. 
- callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt(prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - clip_session) - - text_embeddings_dtype = text_embeddings.dtype - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents(batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings_dtype, - device, - generator, - latents) - - # 6. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - unet_session, unet_session_bg = unet_sessions - use_parallel_inferencing = unet_session_bg is not None - if use_parallel_inferencing and do_classifier_free_guidance: - # Split embeddings - text_embeddings, text_embeddings_2 = text_embeddings.chunk(2) - text_embeddings_2 = text_embeddings_2.numpy() - - text_embeddings = text_embeddings.numpy() - cache = None - - for i, t in enumerate(self.progress_bar(timesteps)): - t_numpy = t[None].numpy().astype(np.int32) - - # expand the latents if we are doing classifier free guidance - if not use_parallel_inferencing and do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t).numpy() - - # predict the noise residual - if use_parallel_inferencing and do_classifier_free_guidance: - unet_session_bg.infer_asyn( - [ - latent_model_input, - t_numpy, - text_embeddings_2, - ], - skip_status[i] - ) - - if skip_status[i]: - inputs = [ - latent_model_input, - t_numpy, - text_embeddings, - cache, - ] - noise_pred = torch.from_numpy( - np.array(self.unet_infer(unet_session[1], inputs, device_id)[0]) - ) - else: - inputs = [ - latent_model_input, - t_numpy, - text_embeddings, - ] - outputs = self.unet_infer(unet_session[0], inputs, device_id) - noise_pred = torch.from_numpy(np.array(outputs[0])) - if len(outputs) > 1: - cache = outputs[1] - - # perform guidance - if do_classifier_free_guidance: - if use_parallel_inferencing: - noise_pred_text = torch.from_numpy(unet_session_bg.wait_and_get_outputs()[0]) - else: - noise_pred, noise_pred_text = noise_pred.chunk(2) - - noise_pred = noise_pred + guidance_scale * (noise_pred_text - noise_pred) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs - ).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - - # 8. Post-processing - latents = 1 / self.vae.config.scaling_factor * latents - - latents = self.vae.post_quant_conv(latents) - image = torch.from_numpy(vae_session.infer([latents.numpy()])[0]) - - image = (image / 2 + 0.5).clamp(0, 1) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - return (image, None) - - - def unet_infer(self, session, data, device_id): - feeds = {} - inputs = session.get_inputs() - for i in range(3): - feed = aclruntime.Tensor(data[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - if len(inputs) > 3: - feeds[inputs[3].name] = data[3] - out_names = [out.name for out in session.get_outputs()] - - outputs = session.run(out_names, feeds) - outputs[0].to_host() - return outputs diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/prompts.txt b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/prompts.txt deleted file mode 100644 index a375a0bb63931d0d5da6c6d91df1e14f870f47d0..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/prompts.txt +++ /dev/null @@ -1,16 +0,0 @@ -Beautiful illustration of The ocean. 
in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Islands in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Seaports in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of The waves. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Grassland. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Wheat. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Hut Tong. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of The boat. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Pine trees. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Bamboo. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of The temple. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Cloud in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Sun in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Spring. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Lotus. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Snow piles. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/public_address_statement.md b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/public_address_statement.md deleted file mode 100644 index 44a78e5880e57df0d547582b77a3de20f72994c1..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/public_address_statement.md +++ /dev/null @@ -1,8 +0,0 @@ -| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | -| ---- | ------------ | ------ | ------------------------------------ | -------- | -|开源代码引入| https://huggingface.co/stabilityai/stable-diffusion-2-1-base | pipeline_ascend_stable_diffusion.py |[Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 
|论文地址| -|开源代码引入| https://huggingface.co/stabilityai/stable-diffusion-2-1-base | pipeline_ascend_stable_diffusion.py |[Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). |论文地址| -|开源代码引入| https://huggingface.co/stabilityai/stable-diffusion-2-1-base | pipeline_ascend_stable_diffusion.py | DDIM paper: https://arxiv.org/abs/2010.02502. |论文地址| -|开源代码引入| https://huggingface.co/stabilityai/stable-diffusion-2-1-base |pipeline_ascend_stable_diffusion.py |[torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) |论文地址| -|开源代码引入| https://huggingface.co/stabilityai/stable-diffusion-2-1-base | pipeline_ascend_stable_diffusion.py |[PIL](https://pillow.readthedocs.io/en/stable/) |论文地址| -|开源代码引入| https://huggingface.co/stabilityai/stable-diffusion-2-1-base | pipeline_ascend_stable_diffusion.py |Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . |论文地址| \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/quant_unet.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/quant_unet.py deleted file mode 100644 index 804c53d5106c990c85c6be26d50d89e37686dcaf..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/quant_unet.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from typing import Callable, List, Optional, Union - -import onnx -import torch -import numpy as np -from ais_bench.infer.interface import InferSession -from modelslim.onnx.squant_ptq.onnx_quant_tools import OnnxCalibrator -from modelslim.onnx.squant_ptq.quant_config import QuantConfig -from diffusers import DPMSolverMultistepScheduler, EulerDiscreteScheduler, DDIMScheduler - -from background_session import BackgroundInferSession -from pipeline_ascend_stable_diffusion import AscendStableDiffusionPipeline -from stable_diffusion_ascend_infer import check_device_range_valid - - -class StableDiffusionDumpPipeline(AscendStableDiffusionPipeline): - @torch.no_grad() - def dump_data( - self, - prompt: Union[str, List[str]], - clip_session: InferSession, - unet_sessions: list, - dump_num: int = 10, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # 2. 
Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - device = self._execution_device - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt(prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - clip_session) - - text_embeddings_dtype = text_embeddings.dtype - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents(batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings_dtype, - device, - generator, - latents) - - # 6. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - unet_session, unet_session_bg = unet_sessions - use_parallel_inferencing = unet_session_bg is not None - if use_parallel_inferencing and do_classifier_free_guidance: - # Split embeddings - text_embeddings, text_embeddings_2 = text_embeddings.chunk(2) - text_embeddings_2 = text_embeddings_2.numpy() - - text_embeddings = text_embeddings.numpy() - - dump_data = [] - start_id = num_inference_steps // 2 - dump_num // 2 - end_id = start_id + dump_num - - for i, t in enumerate(self.progress_bar(timesteps)): - t_numpy = t[None].numpy() - - # expand the latents if we are doing classifier free guidance - if not use_parallel_inferencing and do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t).numpy() - if start_id <= i < end_id: - dump_data.append([latent_model_input, t_numpy, text_embeddings]) - - # predict the noise residual - if use_parallel_inferencing and do_classifier_free_guidance: - unet_session_bg.infer_asyn( - [ - latent_model_input, - t_numpy, - text_embeddings_2, - ] - ) - - noise_pred = torch.from_numpy( - unet_session.infer( - [ - latent_model_input, - t_numpy, - text_embeddings, - ] - )[0] - ) - - # perform guidance - if do_classifier_free_guidance: - if use_parallel_inferencing: - noise_pred_text = torch.from_numpy(unet_session_bg.wait_and_get_outputs()[0]) - else: - noise_pred, noise_pred_text = noise_pred.chunk(2) - - noise_pred = noise_pred + guidance_scale * (noise_pred_text - noise_pred) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs - ).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - return dump_data - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-2-1-base", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "--prompt_file", - type=str, - default="prompts.txt", - help="A prompt file used to generate images.", - ) - parser.add_argument( - "--model_dir", - type=str, - default="./models", - help="Base path of om models.", - ) - parser.add_argument( - "--save_path", - type=str, - default="unet_quant", - help="Path to save result images.", - ) - parser.add_argument( - "--scheduler", - choices=["DDIM", "Euler", "DPM"], - default="DDIM", - help="Type of Sampling methods. 
Can choose from DDIM, Euler, DPM", - ) - parser.add_argument( - "--device", - type=check_device_range_valid, - default=0, - help="NPU device id. Give 2 ids to enable parallel inferencing." - ) - parser.add_argument( - "--steps", - type=int, - default=50, - help="Number of inference steps.", - ) - parser.add_argument( - "--data_num", - type=int, - default=10, - help="the number of real data used in quant process" - ) - parser.add_argument( - "--data_free", - action='store_true', - help="do not use real data" - ) - - return parser.parse_args() - - -def main(): - args = parse_arguments() - - unet_onnx = os.path.join(args.model_dir, "unet", "unet.onnx") - - if args.data_free: - data = [[]] - - input_shape = '' - model = onnx.load(unet_onnx) - inputs = model.graph.input - - for inp in inputs: - dims = inp.type.tensor_type.shape.dim - shape = [str(x.dim_value) for x in dims] - input_shape += inp.name + ':' + ','.join(shape) + ';' - if args.data_free: - dtype = inp.type.tensor_type.elem_type - data_size = [x.dim_value for x in dims] - if dtype == 1: - data[0].append(np.random.random(data_size).astype(np.float32)) - if dtype == 7: - data[0].append(np.random.randint(10, size=data_size).astype(np.int64)) - - if not args.data_free: - device = None - device_2 = None - - if isinstance(args.device, list): - device, device_2 = args.device - else: - device = args.device - - batch_size = inputs[0].type.tensor_type.shape.dim[0].dim_value - if not device_2: - batch_size = batch_size // 2 - - pipe = StableDiffusionDumpPipeline.from_pretrained(args.model).to("cpu") - - if args.scheduler == "DDIM": - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - if args.scheduler == "Euler": - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - if args.scheduler == "DPM": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - - clip_om = os.path.join(args.model_dir, "clip", "clip.om") - unet_om = os.path.join(args.model_dir, "unet", "unet.om") - - clip_session = InferSession(device, clip_om) - unet_session = InferSession(device, unet_om) - - unet_session_bg = None - if device_2: - unet_session_bg = BackgroundInferSession.clone(unet_session, device_2, [unet_om, ""]) - - with os.fdopen(os.open(args.prompt_file, os.O_RDONLY), "r") as f: - prompts = [line.strip() for line in f] - - data = pipe.dump_data( - prompts[:batch_size], - clip_session, - [unet_session, unet_session_bg], - args.data_num, - num_inference_steps=args.steps - ) - - if unet_session_bg: - unet_session_bg.stop() - - config = QuantConfig( - disable_names=[], - quant_mode=0, - amp_num=0, - use_onnx=False, - disable_first_layer=True, - quant_param_ops=['Conv'], - atc_input_shape=input_shape[:-1], - num_input=len(inputs) - ) - - calib = OnnxCalibrator(unet_onnx, config, calib_data=data) - calib.run() - quant_path = os.path.join(args.model_dir, args.save_path) - if not os.path.exists(quant_path): - os.makedirs(quant_path, mode=0o744) - quant_onnx = os.path.join(quant_path, 'unet.onnx') - calib.export_quant_onnx(quant_onnx, use_external=True) - - -if __name__ == "__main__": - main() diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/requirements.txt b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/requirements.txt deleted file mode 100755 index dd40de4cf81cd94366e8ee0acc9008fe90673eba..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch==1.13.0 -diffusers==0.18.0 
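-# transformers is pinned to the version expected by clip.patch (applied via stable_diffusion_clip_patch.py).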
-transformers==4.26.1 -open_clip_torch==2.20.0 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_2_onnx.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_2_onnx.py deleted file mode 100755 index 79d1e8951b76b61b90eddf7fc2896eee9ef8f371..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_2_onnx.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from argparse import Namespace - -import torch -from diffusers import StableDiffusionPipeline - - -def parse_arguments() -> Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "-o", - "--output_dir", - type=str, - default="./models", - help="Path of directory to save ONNX models.", - ) - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-2-1-base", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "-bs", - "--batch_size", - type=int, - default=1, - help="Batch size." - ) - parser.add_argument( - "-p", - "--parallel", - action="store_true", - help="Export the unet of bs=1 for parallel inferencing.", - ) - - return parser.parse_args() - - -def export_clip(sd_pipeline: StableDiffusionPipeline, save_dir: str, batch_size:int) -> None: - print("Exporting the text encoder...") - clip_path = os.path.join(save_dir, "clip") - if not os.path.exists(clip_path): - os.makedirs(clip_path, mode=0o744) - - clip_model = sd_pipeline.text_encoder - - max_position_embeddings = clip_model.config.max_position_embeddings - dummy_input = torch.ones([batch_size, max_position_embeddings], dtype=torch.int64) - - torch.onnx.export( - clip_model, - dummy_input, - os.path.join(clip_path, "clip.onnx"), - input_names=["prompt"], - output_names=["text_embeddings"], - opset_version=11, - ) - - -def export_unet(sd_pipeline: StableDiffusionPipeline, save_dir: str, batch_size: int) -> None: - print("Exporting the image information creater...") - unet_path = os.path.join(save_dir, "unet") - if not os.path.exists(unet_path): - os.makedirs(unet_path, mode=0o744) - - unet_model = sd_pipeline.unet - clip_model = sd_pipeline.text_encoder - - sample_size = unet_model.config.sample_size - in_channels = unet_model.config.in_channels - encoder_hidden_size = clip_model.config.hidden_size - max_position_embeddings = clip_model.config.max_position_embeddings - - dummy_input = ( - torch.ones([batch_size, in_channels, sample_size, sample_size], dtype=torch.float32), - torch.ones([1], dtype=torch.int64), - torch.ones( - [batch_size, max_position_embeddings, encoder_hidden_size], dtype=torch.float32 - ), - ) - - torch.onnx.export( - unet_model, - dummy_input, - os.path.join(unet_path, f"unet.onnx"), - input_names=["latent_model_input", "t", "encoder_hidden_states"], - output_names=["sample"], - opset_version=11, - ) - - -def export_vae(sd_pipeline: 
StableDiffusionPipeline, save_dir: str, batch_size: int) -> None: - print("Exporting the image decoder...") - - vae_path = os.path.join(save_dir, "vae") - if not os.path.exists(vae_path): - os.makedirs(vae_path, mode=0o744) - - vae_model = sd_pipeline.vae - unet_model = sd_pipeline.unet - - sample_size = unet_model.config.sample_size - in_channels = unet_model.config.out_channels - - dummy_input = torch.ones([batch_size, in_channels, sample_size, sample_size]) - - torch.onnx.export( - vae_model.decoder, - dummy_input, - os.path.join(vae_path, "vae.onnx"), - input_names=["latents"], - output_names=["image"], - opset_version=11, - ) - - -def export_onnx(model_path: str, save_dir: str, batch_size:int, parallel: bool=False) -> None: - pipeline = StableDiffusionPipeline.from_pretrained(model_path).to("cpu") - - export_clip(pipeline, save_dir, batch_size) - - if parallel: - export_unet(pipeline, save_dir, batch_size) - else: - export_unet(pipeline, save_dir, batch_size * 2) - - export_vae(pipeline, save_dir, batch_size) - - -def main(): - args = parse_arguments() - export_onnx(args.model, args.output_dir, args.batch_size, args.parallel) - print("Done.") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_ascend_infer.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_ascend_infer.py deleted file mode 100755 index f960657d9a3cab7e07bd00471797cb9616ebf6e1..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_ascend_infer.py +++ /dev/null @@ -1,354 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
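- -# Example invocation (paths are illustrative; every flag below is defined in parse_arguments): -#   python3 stable_diffusion_ascend_infer.py --model ./stable-diffusion-2-1-base --model_dir ./models \ -#       --prompt_file ./prompts.txt --device 0,1 --batch_size 1 --steps 50 --use_cache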
- -import os -import csv -import time -import json -import argparse - -import aclruntime -from ais_bench.infer.interface import InferSession -from diffusers import DPMSolverMultistepScheduler, EulerDiscreteScheduler, DDIMScheduler - -from background_session import BackgroundInferSession -from pipeline_ascend_stable_diffusion import AscendStableDiffusionPipeline - - -class PromptLoader: - def __init__( - self, - prompt_file: str, - prompt_file_type: str, - batch_size: int, - num_images_per_prompt: int=1, - max_num_prompts: int=0 - ): - self.prompts = [] - self.catagories = ['Not_specified'] - self.batch_size = batch_size - self.num_images_per_prompt = num_images_per_prompt - - if prompt_file_type == 'plain': - self.load_prompts_plain(prompt_file, max_num_prompts) - - elif prompt_file_type == 'parti': - self.load_prompts_parti(prompt_file, max_num_prompts) - - self.current_id = 0 - self.inner_id = 0 - - def __len__(self): - return len(self.prompts) * self.num_images_per_prompt - - def __iter__(self): - return self - - def __next__(self): - if self.current_id == len(self.prompts): - raise StopIteration - - ret = { - 'prompts': [], - 'catagories': [], - 'save_names': [], - 'n_prompts': self.batch_size, - } - for _ in range(self.batch_size): - if self.current_id == len(self.prompts): - ret['prompts'].append('') - ret['save_names'].append('') - ret['catagories'].append('') - ret['n_prompts'] -= 1 - - else: - prompt, catagory_id = self.prompts[self.current_id] - ret['prompts'].append(prompt) - ret['catagories'].append(self.catagories[catagory_id]) - ret['save_names'].append(f'{self.current_id}_{self.inner_id}') - - self.inner_id += 1 - if self.inner_id == self.num_images_per_prompt: - self.inner_id = 0 - self.current_id += 1 - - return ret - - def load_prompts_plain(self, file_path: str, max_num_prompts: int): - with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: - for i, line in enumerate(f): - if max_num_prompts and i == max_num_prompts: - break - - prompt = line.strip() - self.prompts.append((prompt, 0)) - - def load_prompts_parti(self, file_path: str, max_num_prompts: int): - with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: - # Skip the first line - next(f) - tsv_file = csv.reader(f, delimiter="\t") - for i, line in enumerate(tsv_file): - if max_num_prompts and i == max_num_prompts: - break - - prompt = line[0] - catagory = line[1] - if catagory not in self.catagories: - self.catagories.append(catagory) - - catagory_id = self.catagories.index(catagory) - self.prompts.append((prompt, catagory_id)) - - -def check_device_range_valid(value): - # if contain , split to int list - min_value = 0 - max_value = 255 - if ',' in value: - ilist = [ int(v) for v in value.split(',') ] - for ivalue in ilist[:2]: - if ivalue < min_value or ivalue > max_value: - raise argparse.ArgumentTypeError("{} of device:{} is invalid. valid value range is [{}, {}]".format( - ivalue, value, min_value, max_value)) - return ilist[:2] - else: - # default as single int value - ivalue = int(value) - if ivalue < min_value or ivalue > max_value: - raise argparse.ArgumentTypeError("device:{} is invalid. 
valid value range is [{}, {}]".format( - ivalue, min_value, max_value)) - return ivalue - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-2-1-base", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "--prompt_file", - type=str, - required=True, - help="A prompt file used to generate images.", - ) - parser.add_argument( - "--prompt_file_type", - choices=["plain", "parti"], - default="plain", - help="Type of prompt file.", - ) - parser.add_argument( - "--model_dir", - type=str, - default="./models", - help="Base path of om models.", - ) - parser.add_argument( - "--save_dir", - type=str, - default="./results", - help="Path to save result images.", - ) - parser.add_argument( - "--info_file_save_path", - type=str, - default="./image_info.json", - help="Path to save image information file.", - ) - parser.add_argument( - "--steps", - type=int, - default=50, - help="Number of inference steps.", - ) - parser.add_argument( - "--num_images_per_prompt", - default=1, - type=int, - help="Number of images generated for each prompt.", - ) - parser.add_argument( - "--max_num_prompts", - default=0, - type=int, - help="Limit the number of prompts (0: no limit).", - ) - parser.add_argument( - "--scheduler", - choices=["DDIM", "Euler", "DPM"], - default="DDIM", - help="Type of Sampling methods. Can choose from DDIM, Euler, DPM", - ) - parser.add_argument( - "--device", - type=check_device_range_valid, - default=0, - help="NPU device id. Give 2 ids to enable parallel inferencing." - ) - parser.add_argument( - "-bs", - "--batch_size", - type=int, - default=1, - help="Batch size." - ) - parser.add_argument( - "--use_cache", - action="store_true", - help="Use cache during inference." - ) - parser.add_argument( - "--cache_steps", - type=str, - default="1,2,3,5,6,7,9,10,12,13,14,16,18,19,21,23,24,26,27,29,\ - 30,31,33,34,36,37,39,40,41,43,44,45,47,48,49", - help="Steps to use cache data." 
- ) - - return parser.parse_args() - - -def main(): - args = parse_arguments() - save_dir = args.save_dir - device = None - device_2 = None - - if isinstance(args.device, list): - device, device_2 = args.device - else: - device = args.device - - pipe = AscendStableDiffusionPipeline.from_pretrained(args.model).to("cpu") - - if args.scheduler == "DDIM": - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - if args.scheduler == "Euler": - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - if args.scheduler == "DPM": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - - - clip_om = os.path.join(args.model_dir, "clip", "clip.om") - vae_om = os.path.join(args.model_dir, "vae", "vae.om") - - clip_session = InferSession(device, clip_om) - vae_session = InferSession(device, vae_om) - - skip_status = [0] * args.steps - if args.use_cache: - for i in args.cache_steps.split(','): - if int(i) >= args.steps: - continue - skip_status[int(i)] = 1 - unet_cache_om = os.path.join(args.model_dir, "unet", "unet_cache.om") - unet_skip_om = os.path.join(args.model_dir, "unet", "unet_skip.om") - unet_session = [ - aclruntime.InferenceSession(unet_cache_om, device, aclruntime.session_options()), - aclruntime.InferenceSession(unet_skip_om, device, aclruntime.session_options()), - ] - else: - unet_cache_om = os.path.join(args.model_dir, "unet", "unet.om") - unet_skip_om = "" - unet_session = [ - aclruntime.InferenceSession(unet_cache_om, device, aclruntime.session_options()), - None, - ] - - unet_session_bg = None - if device_2: - unet_session_bg = BackgroundInferSession.clone( - unet_session[0], - device_2, - [unet_cache_om, unet_skip_om] - ) - - if not os.path.exists(save_dir): - os.makedirs(save_dir, mode=0o744) - - use_time = 0 - - prompt_loader = PromptLoader(args.prompt_file, - args.prompt_file_type, - args.batch_size, - args.num_images_per_prompt, - args.max_num_prompts) - - infer_num = 0 - image_info = [] - current_prompt = None - for i, input_info in enumerate(prompt_loader): - prompts = input_info['prompts'] - catagories = input_info['catagories'] - save_names = input_info['save_names'] - n_prompts = input_info['n_prompts'] - - print(f"[{infer_num + n_prompts}/{len(prompt_loader)}]: {prompts}") - infer_num += args.batch_size - - start_time = time.time() - images = pipe.ascend_infer( - prompts, - clip_session, - [unet_session, unet_session_bg], - vae_session, - skip_status, - device_id=device, - num_inference_steps=args.steps, - guidance_scale=7.5, - ) - use_time += time.time() - start_time - - for j in range(n_prompts): - image_save_path = os.path.join(save_dir, f"{save_names[j]}.png") - image = images[0][j] - image.save(image_save_path) - - if current_prompt != prompts[j]: - current_prompt = prompts[j] - image_info.append({'images': [], 'prompt': current_prompt, 'category': catagories[j]}) - - image_info[-1]['images'].append(image_save_path) - - if unet_session_bg: - unet_session_bg.stop() - - # Save image information to a json file - if os.path.exists(args.info_file_save_path): - os.remove(args.info_file_save_path) - - with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR|os.O_CREAT, 0o644), "w") as f: - json.dump(image_info, f) - - print( - f"[info] infer number: {infer_num}; use time: {use_time:.3f}s; " - f"average time: {use_time/infer_num:.3f}s" - ) - - # free npu resource - clip_session.free_resource() - vae_session.free_resource() - unet_session[0].free_resource() - if args.use_cache: - unet_session[1].free_resource() - 
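- # finalize shared inference resources only after every session has been freed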
InferSession.finalize() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_clip_patch.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_clip_patch.py deleted file mode 100755 index f29696fd8b60f06d0b655689f6ce0abfcc9f66cf..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/stable_diffusion_clip_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.26.1', "transformers==4.26.1 is expected" - os.system(f'patch -p0 {transformers_path[0]}/models/clip/modeling_clip.py clip.patch') - - -if __name__ == '__main__': - main() diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_0.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_0.png deleted file mode 100644 index 164421dc0df51b694d2afe950f61cce7526cf7fe..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_0.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_1.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_1.png deleted file mode 100644 index 7526a25c652f52ddaf5d14edaca564d222f7e443..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_1.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_10.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_10.png deleted file mode 100644 index 48154477530478f6ec8df629a4afacb64ab5c96c..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_10.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_11.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_11.png deleted file mode 100644 index d47a9851f852e56a18010a75279ebd81aa1d3896..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_11.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_12.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_12.png deleted file mode 100644 index a59d24199e7dcf181d4e108121ca72ba2b8477f7..0000000000000000000000000000000000000000 Binary files
a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_12.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_13.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_13.png deleted file mode 100644 index 2e143e4157425ed17fa5cd288ab9ad6e5e541380..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_13.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_14.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_14.png deleted file mode 100644 index 6c92bedae327543da3fa5e34f0ce06ad733ab583..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_14.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_15.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_15.png deleted file mode 100644 index c22814f11313122240f813b684f305bee567b72d..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_15.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_2.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_2.png deleted file mode 100644 index 58ebdd87d1bb426fb75fa6674a0fa33061222273..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_2.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_3.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_3.png deleted file mode 100644 index a53c8d3427ab668a182927f69f8731856359ad1f..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_3.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_4.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_4.png deleted file mode 100644 index 11c982df1a27f13cd6e2b483e53c5195bda83922..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_4.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_5.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_5.png deleted file mode 100644 index 2fbbb3dee1021fdb6ed372aad57bbb0d147414ac..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_5.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_6.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_6.png deleted file mode 100644 index 60b8df1707c27d4dfa6535822f20a99cd45cb90c..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_6.png and /dev/null differ 
diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_7.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_7.png deleted file mode 100644 index 61efefa823f017f57fd06bfb7b8cdb0e2ed0e96e..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_7.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_8.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_8.png deleted file mode 100644 index 4eec5fe63c5cef3b7401386238d0d41db82b73ee..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_8.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_9.png b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_9.png deleted file mode 100644 index c47302720d9f301dbcea85cf06ceb1d41cb56749..0000000000000000000000000000000000000000 Binary files a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/test_results/illustration_9.png and /dev/null differ diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/unet_cache.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusion/unet_cache.py deleted file mode 100644 index de6b2bc8cd1c613c27985bdba757324b561f1814..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusion/unet_cache.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
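- -# What this script produces (based on cache_unet/skip_unet below): unet_cache.onnx is the original UNet graph -# with the feature map '/up_blocks.2/upsamplers.0/conv/Conv_output_0' added as an extra output, while -# unet_skip.onnx takes that tensor as a new 'cache' input and removes the nodes that would recompute it; -# stable_diffusion_ascend_infer.py switches between the two models on the steps selected by --cache_steps.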
- -import os -import argparse - -from auto_optimizer import OnnxGraph - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - type=str, - default="models/unet/unet.onnx", - help="Path of the unet onnx model.", - ) - parser.add_argument( - "--save_dir", - type=str, - default="models/unet", - help="Path to save the modified model", - ) - return parser.parse_args() - - -def cache_unet(model_path, new_model_path, data): - model = OnnxGraph.parse(model_path) - model.add_output(data, dtype='float32', shape=[]) - model.save(new_model_path) - return - - -def skip_unet(model_path, new_model_path, data): - model = OnnxGraph.parse(model_path) - node = model.get_next_nodes(data)[0] - batch_size = model.inputs[0].shape[0] - model.add_input('cache', dtype='float32', shape=[batch_size, 640, 64, 64]) - node.inputs[0] = 'cache' - model.remove_unused_nodes() - model.save(new_model_path) - return - - -def main(args): - cache_path = os.path.join(args.save_dir, "unet_cache.onnx") - skip_path = os.path.join(args.save_dir, "unet_skip.onnx") - cache_name = '/up_blocks.2/upsamplers.0/conv/Conv_output_0' - cache_unet(args.model, cache_path, cache_name) - skip_unet(args.model, skip_path, cache_name) - return - - -if __name__ =="__main__": - main(parse_arguments()) diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/README.md b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/README.md deleted file mode 100644 index c0c6356c4cda98d5afec19a9c6dc2b21e93d3813..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/README.md +++ /dev/null @@ -1,571 +0,0 @@ -# stable-diffusionxl模型-推理指导 - - -- [概述](#ZH-CN_TOPIC_0000001172161501) - - - [输入输出数据](#section540883920406) - -- [推理环境准备](#ZH-CN_TOPIC_0000001126281702) - -- [快速上手](#ZH-CN_TOPIC_0000001126281700) - - - [获取源码](#section4622531142816) - - [模型推理](#section741711594517) - -- [模型推理性能&精度](#ZH-CN_TOPIC_0000001172201573) - - -# 概述 - - SDXL 由一组用于潜在扩散的专家管道组成: 在第一步中,使用基础模型生成(噪声)潜伏, 然后使用专门用于最终降噪步骤的细化模型[此处获得](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/) - - **说明:**后续更新请参考[MindIE-Torch](../../../../MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/README.md)(0711) - -- 参考实现: - ```bash - # StableDiffusionxl - https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 - ``` - -## 输入输出数据 - -- 输入数据 - - | 输入数据 | 大小 | 数据类型 | 数据排布格式 | - | -------- | -------- | ------------------------- | ------------ | - | prompt | 1 x 77 | INT64| ND| - - -- 输出数据 - - | 输出数据 | 大小 | 数据类型 | 数据排布格式 | - | -------- | -------- | -------- | ------------ | - | output1 | 1 x 3 x 1024 x 1024 | FLOAT32 | NCHW | - -# 推理环境准备 - -- 该模型需要以下插件与驱动 - - **表 1** 版本配套表 - | 配套 | 版本 | 环境准备指导 | - | ------------------------------------------------------------ | ------- | ------------------------------------------------------------ | - | 固件与驱动 | 24.1.rc1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | - | CANN(+MindIE) | 8.0.RC2(1.0.RC2) | - | - | Python | 3.10 | - | | -如在优化模型时使用了--FA、--TOME_num、--faster_gelu参数,需要安装与CANN包配套版本的MindIE - -该模型性能受CPU规格影响,建议使用64核CPU(arm)以复现性能 - - -# 快速上手 - -## 获取源码 -1. 获取本仓源码 - - ``` - git clone https://gitee.com/ascend/ModelZoo-PyTorch.git - cd ModelZoo-PyTorch/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl - ``` - -1. 安装依赖。 - ```bash - pip3 install -r requirements.txt - - git clone https://github.com/tgxs002/HPSv2.git - cd HPSv2 - pip3 install -e . - ``` - -2. 
代码修改 - - 执行命令: - - ```bash - TRANSFORMERS_PATH=`python3 -c "import transformers; print(transformers.__path__[0])"` - patch -p0 ${TRANSFORMERS_PATH}/models/clip/modeling_clip.py clip.patch - ``` - -3. 安装昇腾推理工具 - - 请访问[msit代码仓](https://gitee.com/ascend/msit/tree/master/msit/),根据readme文档进行工具安装。可只安装需要的组件:debug surgeon,其他组件为可选安装。 - - 请访问[ais_bench](https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench),根据readme文件进行工具安装,建议使用whl包进行安装。 - - -## 模型推理 - -1. 模型转换。 - 使用PyTorch将模型权重文件转换为.onnx文件,再使用ATC工具将.onnx文件转为离线推理模型文件.om文件。 - - 0. 获取权重(可选) - - 可提前下载权重,放到代码同级目录下,以避免执行后面步骤时可能会出现下载失败。 - - ```bash - # 需要使用 git-lfs (https://git-lfs.com) - git lfs install - - # 下载权重 - git clone https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 - ``` - - 1. 导出ONNX模型 - - 设置模型名称或路径 - ```bash - # base (执行时下载权重) - model_base="stabilityai/stable-diffusion-xl-base-1.0" - - # base (下载的权重路径) - model_base="./stable-diffusion-xl-base-1.0" - ``` - - 执行命令: - - ```bash - python3 stable_diffusionxl_2_onnx.py --model ${model_base} --output_dir ./models - - ``` - - 参数说明: - - --model:模型权重路径 - - --output_dir: ONNX模型输出目录 - - - 执行成功后生成onnx模型: - ``` - |—— models - |—— text_encoder - |—— text_encoder.onnx - |—— text_encoder_2.onnx - |—— unet - |—— unet.onnx - |—— vae - |—— vae.onnx - |—— ddim - |—— ddim.onnx - ``` - - 2. 优化onnx模型 - - 不建议同时使用量化与unet cache方案,精度可能会下降超过10%。 - - 1. 量化(可选,可提升性能但可能导致精度下降) - - 量化步骤请参考[量化指导](./README_quant.md) - - 2. 模型优化 - - 运行modify_onnx.py脚本。 - ```bash - bs=1 - # 量化模型 - unet_model="models/unet_quant/unet_fuse.onnx" - # 非量化模型 - unet_model="models/unet/unet.onnx" - - # 非并行方案 - python3 modify_onnx.py \ - --model ${unet_model} \ - --new_model models/unet/unet_md.onnx \ - --FA_soc A2 \ - --TOME_num 10 \ - --faster_gelu \ - --batch_size ${bs} - - # 并行方案 - python3 modify_onnx.py \ - --model ${unet_model} \ - --new_model models/unet/unet_md.onnx \ - --FA_soc A2 \ - --TOME_num 10 \ - --faster_gelu \ - --batch_size ${bs} \ - --parallel - ``` - 参数说明: - - --model:onnx模型路径。 - - --new_model:优化后生成的onnx模型路径。 - - --FA_soc:使用FA算子的硬件形态。目前FlashAttention算子支持Atlas 300I Duo/Pro和Atlas 800I A2,请根据硬件设置参数为Duo或A2,其他不支持硬件请设置为None。 - - --TOME_num:插入TOME插件的数量,有效取值为[0, 10]。如果设置这个参数对精度造成影响,建议调小此值。目前支持Atlas 300I Duo/Pro和Atlas 800I A2,其他不支持硬件请设置为0。默认选取10。 - - --faster_gelu:使用slice+gelu的融合算子。 - - --batch_size:生成适用于指定batch_size的模型,默认值为1。 - - --parallel:生成适用于并行方案的模型 - - FA、TOME、Gelu融合算子需通过安装与CANN版本对应的推理引擎包(MindIE)来获取,如未安装推理引擎或使用的版本不支持FA、TOME、SliceGelu算子,FA_soc和TOME_num参数请使用默认配置、不设置faster_gelu参数。 - - - 3. 适配cache方案(可选,可提升性能但可能导致精度下降) - - 运行unet_cache.py脚本 - ```bash - python3 unet_cache.py --model models/unet/unet_md.onnx --save_dir models/unet/ - ``` - - - 3. 使用ATC工具将ONNX模型转OM模型。 - - 1. 配置环境变量。 - - ```bash - source /usr/local/Ascend/ascend-toolkit/set_env.sh - - # 如果安装了推理引擎算子包,需配置推理引擎路径 - source /usr/local/Ascend/mindie/set_env.sh - ``` - - > **说明:** - >该脚本中环境变量仅供参考,请以实际安装环境配置环境变量。详细介绍请参见《[CANN 开发辅助工具指南 \(推理\)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools)》。 - - 2. 
执行命令查看芯片名称($\{chip\_name\})。 - - ``` - npu-smi info - #该设备芯片名为Ascend310P3 (自行替换) - 回显如下: - +-------------------+-----------------+------------------------------------------------------+ - | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page) | - | Chip Device | Bus-Id | AICore(%) Memory-Usage(MB) | - +===================+=================+======================================================+ - | 0 310P3 | OK | 15.8 42 0 / 0 | - | 0 0 | 0000:82:00.0 | 0 1074 / 21534 | - +===================+=================+======================================================+ - | 1 310P3 | OK | 15.4 43 0 / 0 | - | 0 1 | 0000:89:00.0 | 0 1070 / 21534 | - +===================+=================+======================================================+ - ``` - - 3. 执行ATC命令。 - - ```bash - # text_encoder - cd ./models/text_encoder - atc --framework=5 \ - --model=./text_encoder.onnx \ - --output=./text_encoder \ - --input_format=ND \ - --input_shape="prompt:${bs},77" \ - --log=error \ - --soc_version=Ascend${chip_name} - atc --framework=5 \ - --model=./text_encoder_2.onnx \ - --output=./text_encoder_2 \ - --input_format=ND \ - --input_shape="prompt:${bs},77" \ - --log=error \ - --soc_version=Ascend${chip_name} - - # unet - cd ../unet/ - - # 不使用cache方案 - atc --framework=5 \ - --model=./unet_md.onnx \ - --output=./unet \ - --input_format=NCHW \ - --log=error \ - --optypelist_for_implmode="Gelu,Sigmoid" \ - --op_select_implmode=high_performance \ - --soc_version=Ascend${chip_name} - - # 使用cache方案 - atc --framework=5 \ - --model=./unet_cache.onnx \ - --output=./unet_cache \ - --input_format=NCHW \ - --log=error \ - --optypelist_for_implmode="Gelu,Sigmoid" \ - --op_select_implmode=high_performance \ - --soc_version=Ascend${chip_name} - - atc --framework=5 \ - --model=./unet_skip.onnx \ - --output=./unet_skip \ - --input_format=NCHW \ - --log=error \ - --optypelist_for_implmode="Gelu,Sigmoid" \ - --op_select_implmode=high_performance \ - --soc_version=Ascend${chip_name} - - cd ../../ - - # vae - atc --framework=5 \ - --model=./models/vae/vae.onnx \ - --output=./models/vae/vae \ - --input_format=NCHW \ - --input_shape="latents:${bs},4,128,128" \ - --log=error \ - --soc_version=Ascend${chip_name} - - # 如果使用ddim采样器 - atc --framework=5 \ - --model=./models/ddim/ddim.onnx \ - --output=./models/ddim/ddim \ - --input_format=ND \ - --input_shape="noise_pred:${bs},4,128,128;latents:${bs},4,128,128" \ - --log=error \ - --soc_version=Ascend${chip_name} - ``` - - 参数说明: - - --model:为ONNX模型文件。 - - --output:输出的OM模型。 - - --framework:5代表ONNX模型。 - - --log:日志级别。 - - --soc_version:处理器型号。 - - --input_shape: 模型的输入shape信息。 - - - 执行成功后生成om模型列表: - ``` - |—— models - |—— text_encoder - |—— text_encoder.om - |—— text_encoder_2.om - |—— unet - |—— unet.om - |—— vae - |—— vae.om - |—— ddim - |—— ddim.om - ``` - -2. 开始推理验证。 - - 1. 安装绑核工具并根据NUMA亲和性配置任务进程与NUMA node 的映射关系是为了排除cpu的影响 - - 安装绑核工具 - ``` - yum install numactl - ``` - 通过`npu-smi info`查询device的bus-id,并根据bus-id通过`lspci -vs bus-id`查询卡的NUMA node。 - - 查到NUMA node后,使用`lscpu`获得NUMA node对应的CPU核,推荐绑定其中单核以获得更好的性能。 - ```bash - NUMA node0: 0-23 - NUMA node1: 24-47 - NUMA node2: 48-71 - NUMA node3: 72-95 - ``` - 例如,device对应的NUMA node为3,则在NUMA node3对应的CPU核中选择一个,比如72 - - 2. 
执行推理脚本。 - - ```bash - # 非并行方案 - numactl -C 72 python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --prompt_file ./prompts.txt \ - --device 0 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - - # 并行方案 - numactl -C 72 python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --prompt_file ./prompts.txt \ - --device 0,1 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - ``` - - 参数说明: - - --model:模型名称或本地模型目录的路径。 - - --model_dir:存放导出模型的目录。 - - --prompt_file:提示词文件。 - - --save_dir:生成图片的存放目录。 - - --batch_size:模型batch size。 - - --steps:生成图片迭代次数。 - - --device:推理设备ID;可用逗号分割传入两个设备ID,此时会使用并行方式进行推理。 - - --use_cache: 在推理过程中使用cache。 - - --cache_steps: 使用cache的迭代次数,迭代次数越多性能越好,但次数过多可能会导致精度下降。取值范围为[1, stpes-1]。 - - --scheduler:采样器。可选None、DDIM、Euler、DPM、EulerAncestral、DPM++SDEKarras。None即为默认scheduler。 - - 执行完成后在`./results`目录下生成推理图片。并在终端显示推理时间,参考如下: - - ``` - [info] infer number: 16; use time: 104.6s; average time: 6.542s - ``` - *注意*: - - 如果使用arm机器,出现`*torch*.so*: cannot allocate memory in static TLS block`报错,则增加环境变量指向报错路径 - ```bash - export LD_PRELOAD=报错.so路径:$LD_PRELOAD - ``` - - -## 精度验证 - - 由于生成的图片存在随机性,提供两种精度验证方法: - 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 - 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 - - 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 - - 1. 下载数据集 - - 1. 下载Parti数据集 - - ```bash - wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate - ``` - 2. 下载HPSv2数据集 - - 下载[HPSv2数据集](https://huggingface.co/datasets/zhwang/HPDv2/tree/main/benchmark)中的anime.json, concept-art.json, paintings.json, photo.json文件,并放在数据集路径`dataset`下 - ```bash - mkdir dataset - ``` - 得到的hpsv2数据集目录(文件名称需与以下目录结构中文件名称保持一致) - ``` - |—— dataset - |—— anime.json - |—— concept-art.json - |—— paintings.json - |—— photo.json - ``` - - 2. 下载模型权重 - - ```bash - # Clip Score 和 HPSv2 均需使用的权重 - GIT_LFS_SKIP_SMUDGE=1 - git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K - - # HPSv2权重 - wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate - ``` - 也可手动下载[CLIP权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) - 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 - - - 3. 
使用推理脚本生成图片 - - ```bash - # Clip Score - # 非并行方案 - python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --prompt_file ./PartiPrompts.tsv \ - --prompt_file_type parti \ - --num_images_per_prompt 4 \ - --max_num_prompts 0 \ - --device 0 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - - # 并行方案 - python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --prompt_file ./PartiPrompts.tsv \ - --prompt_file_type parti \ - --num_images_per_prompt 4 \ - --max_num_prompts 0 \ - --device 0,1 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - - # HPSv2 - # 非并行方案 - python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --prompt_file_type hpsv2 \ - --prompt_file ./dataset \ - --max_num_prompts 0 \ - --device 0 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - - # 并行方案 - python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --prompt_file_type hpsv2 \ - --prompt_file ./dataset \ - --max_num_prompts 0 \ - --device 0,1 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - ``` - - 参数说明: - - --model:模型名称或本地模型目录的路径。 - - --model_dir:存放导出模型的目录。 - - --prompt_file:提示词文件路径。如果是hpsv2数据集,提供提示词文件所在目录路径。 - - --prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 - - --num_images_per_prompt: 每个prompt生成的图片数量。 - - --max_num_prompts:限制prompt数量为前X个,0表示不限制。 - - --save_dir:生成图片的存放目录。 - - --batch_size:模型batch size。 - - --steps:生成图片迭代次数。 - - --device:推理设备ID;可用逗号分割传入两个设备ID,此时会使用并行方式进行推理。 - - --use_cache: 在推理过程中使用cache。 - - --cache_steps: 使用cache的迭代次数,迭代次数越多性能越好,但次数过多可能会导致精度下降。 - - 执行完成后会在`./results`目录下生成推理图片,并且会在当前目录生成一个`image_info.json`文件,记录着图片和prompt的对应关系。 - - 4. 计算精度指标 - - 1. CLIP-score - - ```bash - python3 clip_score.py \ - --device=cpu \ - --image_info="image_info.json" \ - --model_name="ViT-H-14" \ - --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin" - ``` - - 参数说明: - - --device: 推理设备。 - - --image_info: 上一步生成的`image_info.json`文件。 - - --model_name: Clip模型名称。 - - --model_weights_path: Clip模型权重文件路径。 - - 执行完成后会在屏幕打印出精度计算结果。 - - 2. 
-   2. HPSv2
-
-      ```bash
-      python3 hpsv2_score.py \
-             --image_info="image_info.json" \
-             --HPSv2_checkpoint="./HPS_v2_compressed.pt" \
-             --clip_checkpoint="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
-      ```
-
-      Parameter description:
-      - --image_info: the `image_info.json` file generated in the previous step.
-      - --HPSv2_checkpoint: path to the HPSv2 model weights.
-      - --clip_checkpoint: path to the CLIP model weights.
-
-      After the run completes, the accuracy result is printed to the screen.
-
-# Model Inference Performance & Accuracy
-
-Inference is run through the ACL interface; the following numbers are for reference.
-
-### StableDiffusionxl
-
-| Hardware | batch size | Steps | Average latency | Optimization scheme | Accuracy | Sampler |
-| :------: | :-----: | :----: | :--------: | :--------: | :----: | :----: |
-| A2 | 1 | 50 | 4.88s | non-parallel, FA+TOME+faster_gelu, unet_cache | 0.376 | ddim |
-| DUO | 1 | 50 | 10.44s | parallel, FA+TOME+faster_gelu, unet_cache | 0.376 | ddim |
-
-Performance measurements require exclusive use of the NPU and CPU.
diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/README_quant.md b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/README_quant.md
deleted file mode 100644
index 150d1d5036447d14ad90b88ac18bc08befa49fc6..0000000000000000000000000000000000000000
--- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/README_quant.md
+++ /dev/null
@@ -1,141 +0,0 @@
-# UNet Model Quantization Guide
-
-## Environment Setup
-```bash
-# Specify the device used for quantization
-export DEVICE_ID=0
-
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
-```
-
-> **Note:**
->The environment variables in this script are for reference only; set them according to your actual installation. For details, see the [CANN Auxiliary Development Tools Guide (Inference)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools).
-
-## Quantization
-
-Calibration can use either dummy data or real data. Real-data calibration yields higher quantization accuracy, but requires one inference pass to collect the real data.
-
-### Dummy-data calibration
-
-Run the quant_unet.py script to quantize the model.
-
-```bash
-python3 quant_unet.py \
-    --model ${model_base} \
-    --model_dir ./models \
-    --prompt_file ./prompts.txt \
-    --save_path unet_quant \
-    --data_free
-```
-Parameter description:
-- --model: model name or path to a local model directory.
-- --model_dir: directory that holds the exported models.
-- --prompt_file: input text file, one prompt per line.
-- --save_path: directory for the quantized model, a subfolder name under model_dir.
-- --data_free: use dummy data.
-
-On success, a `models_bs${bs}/unet_quant` folder is generated, containing the unet.onnx model, the unet_fuse.onnx model (with MatMul and dequant operators fused) and their weights.
-
-### Real-data calibration
-1. Convert the ONNX models to OM models with the ATC tool.
-
-   1. Run the following command to check the chip name ($\{chip\_name\}).
-
-      ```
-      npu-smi info
-      # The chip name of this device is Ascend310P3 (replace with your own)
-      The output looks like:
-      +-------------------+-----------------+------------------------------------------------------+
-      | NPU     Name      | Health          | Power(W)    Temp(C)           Hugepages-Usage(page)  |
-      | Chip    Device    | Bus-Id          | AICore(%)   Memory-Usage(MB)                          |
-      +===================+=================+======================================================+
-      | 0       310P3     | OK              | 15.8        42                0    / 0               |
-      | 0       0         | 0000:82:00.0    | 0           1074 / 21534                              |
-      +===================+=================+======================================================+
-      | 1       310P3     | OK              | 15.4        43                0    / 0               |
-      | 0       1         | 0000:89:00.0    | 0           1070 / 21534                              |
-      +===================+=================+======================================================+
-      ```
-
-   2. Run the ATC commands.
-
-      ```bash
-      # To reduce quantization time, quantization must be run with bs=1
-      bs=1
-      # text_encoder
-      cd ./models/text_encoder
-      atc --framework=5 \
-          --model=./text_encoder.onnx \
-          --output=./text_encoder \
-          --input_format=ND \
-          --input_shape="prompt:${bs},77" \
-          --log=error \
-          --soc_version=Ascend${chip_name}
-      atc --framework=5 \
-          --model=./text_encoder_2.onnx \
-          --output=./text_encoder_2 \
-          --input_format=ND \
-          --input_shape="prompt:${bs},77" \
-          --log=error \
-          --soc_version=Ascend${chip_name}
-
-      # unet
-      cd ../unet/
-      atc --framework=5 \
-          --model=./unet.onnx \
-          --output=./unet \
-          --input_format=NCHW \
-          --log=error \
-          --optypelist_for_implmode="Gelu,Sigmoid" \
-          --op_select_implmode=high_performance \
-          --soc_version=Ascend${chip_name}
-
-      cd ../../
-
-      # If the DDIM sampler is used
-      atc --framework=5 \
-          --model=./models/ddim/ddim.onnx \
-          --output=./models/ddim/ddim \
-          --input_format=ND \
-          --input_shape="noise_pred:${bs},4,128,128;latents:${bs},4,128,128" \
-          --log=error \
-          --soc_version=Ascend${chip_name}
-      ```
-      Parameter description:
-      - --model: the ONNX model file.
-      - --output: the output OM model.
-      - --framework: 5 means the input is an ONNX model.
-      - --log: log level.
-      - --soc_version: processor model.
-
-      On success, the following OM models are generated:
-      ```
-      |—— models
-          |—— text_encoder
-              |—— text_encoder.om
-              |—— text_encoder_2.om
-          |—— unet
-              |—— unet.om
-          |—— ddim
-              |—— ddim.om
-      ```
-
-3. Run the quantization.
-
-   Run the quant_unet.py script to quantize the model.
-
-   ```bash
-   python3 quant_unet.py \
-       --model ${model_base} \
-       --model_dir ./models \
-       --prompt_file ./prompts.txt \
-       --device 0,1 \
-       --save_path unet_quant
-   ```
-   Parameter description:
-   - --model: model name or path to a local model directory.
-   - --model_dir: directory that holds the exported models.
-   - --prompt_file: input text file, one prompt per line.
-   - --save_path: folder for the quantized model, a subfolder name under model_dir.
-   - --device: inference device IDs; two device IDs separated by a comma can be passed, and the same device may be specified twice.
-
-   On success, a `models/unet_quant` folder is generated, containing the unet.onnx model and its weights.
diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/background_session.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/background_session.py
deleted file mode 100644
index 30f1e52d3a0de7999bd9ad2aa04cc57bb83bfc0d..0000000000000000000000000000000000000000
--- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/background_session.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright 2023 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
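The `background_session.py` module deleted below is what drives the second device in the parallel scheme. A rough usage sketch follows; the class and method names are taken from the code below, while the OM paths and device IDs are illustrative, and real Ascend hardware plus `ais_bench` are assumed.

```python
import numpy as np
from ais_bench.infer.interface import InferSession

from background_session import BackgroundInferSession

# Foreground session on device 0; a background worker mirrors it on device 1.
# The two paths are illustrative: the regular UNet model and its cache-skip variant.
unet_session = InferSession(0, "./models/unet/unet.om")
unet_session_bg = BackgroundInferSession.clone(
    unet_session,
    device_id=1,
    model_path=["./models/unet/unet.om", "./models/unet/unet_skip.om"],
)

# Build dummy feeds that match the model's declared input shapes and dtypes.
io_info = BackgroundInferSession.get_io_info_from_session(unet_session)
feeds = [np.zeros(shape, dtype=dtype)
         for shape, dtype in zip(io_info.input_shapes, io_info.input_dtypes)]

unet_session_bg.infer_asyn(feeds)                  # start inference on device 1
outputs = unet_session_bg.wait_and_get_outputs()   # block until the worker finishes
print([out.shape for out in outputs])

unet_session_bg.stop()                             # terminate the background process
```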
- -import multiprocessing as mp -from dataclasses import dataclass -from typing import List, Optional - -import numpy as np -import aclruntime -from ais_bench.infer.interface import InferSession - - -@dataclass -class SessionIOInfo: - input_shapes: List[tuple] - input_dtypes: List[type] - output_shapes: List[tuple] - output_dtypes: List[type] - - -@dataclass -class BackgroundInferSessionOptions: - device_id: int - model_path: List[str] - io_info: SessionIOInfo - acl_json_path: Optional[str] = None - debug: Optional[bool] = False - loop: Optional[int] = 1 - - -class BackgroundInferSession: - def __init__( - self, - device_id: int, - model_path: str, - io_info: SessionIOInfo, - ): - # Create a pipe for process synchronization - self.sync_pipe, sync_pipe_peer = mp.Pipe(duplex=True) - - # Create shared buffers - input_spaces = self.create_shared_buffers(io_info.input_shapes, io_info.input_dtypes) - output_spaces = self.create_shared_buffers(io_info.output_shapes, io_info.output_dtypes) - - # Build numpy arrays on the shared buffers - self.input_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(input_spaces, io_info.input_shapes, io_info.input_dtypes)] - self.output_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(output_spaces, io_info.output_shapes, io_info.output_dtypes)] - - mp.set_start_method('forkserver', force=True) - self.p = mp.Process( - target=self.run_session, - args=[sync_pipe_peer, input_spaces, output_spaces, - io_info, device_id, model_path] - ) - self.p.start() - - # Wait until the sub process is ready - self.wait() - - def infer_asyn(self, feeds: List[np.ndarray], skip=0) -> None: - for i in range(len(self.input_arrays)): - self.input_arrays[i][:] = feeds[i][:] - - if skip: - self.sync_pipe.send('skip') - else: - self.sync_pipe.send('cache') - - def wait(self) -> None: - self.sync_pipe.recv() - - def get_outputs(self) -> List[np.ndarray]: - return self.output_arrays - - def wait_and_get_outputs(self) -> List[np.ndarray]: - self.wait() - return self.get_outputs() - - def infer(self, feeds: List[np.ndarray]) -> List[np.ndarray]: - # This function should work as same as InferSession.infer() - self.infer_asyn(feeds) - return self.wait_and_get_outputs() - - def stop(self): - # Stop the sub process - self.p.terminate() - - @classmethod - def clone( - cls, - session: InferSession, - device_id: int, - model_path: List[str]) -> 'BackgroundInferSession': - # Get shapes, datatypes, and model path from an existed InferSession, - # then use them to create a BackgroundInferSession - io_info = cls.get_io_info_from_session(session) - io_info.output_shapes = [io_info.output_shapes[0]] - io_info.output_dtypes = [io_info.output_dtypes[0]] - - return cls(device_id, model_path, io_info) - - @staticmethod - def get_io_info_from_session(session: InferSession) -> SessionIOInfo: - # Map aclruntime datatype to numpy datatype - np_types = (np.float32, np.float16, np.int8, np.int32, - np.uint8, '', np.int16, np.uint16, np.uint32, - np.int64, np.uint64) - - # Get input shapes and datatypes - inputs = session.get_inputs() - input_shapes = [t.shape for t in inputs] - input_dtypes = [np_types[t.datatype] for t in inputs] - - # Get output shapes and datatypes - outputs = session.get_outputs() - output_shapes = [t.shape for t in outputs] - output_dtypes = [np_types[t.datatype] for t in outputs] - - return SessionIOInfo(input_shapes, input_dtypes, - output_shapes, output_dtypes) - - @staticmethod - def create_shared_buffers(shapes: List[tuple], dtypes: List[type]) -> 
List[mp.RawArray]: - buffers = [] - for shape, dtype in zip(shapes, dtypes): - size = 1 - for x in shape: - size *= x - - raw_array = mp.RawArray(np.ctypeslib.as_ctypes_type(dtype), size) - buffers.append(raw_array) - - return buffers - - @staticmethod - def run_session( - sync_pipe: mp.connection.Connection, - input_spaces: List[np.ndarray], - output_spaces: List[np.ndarray], - io_info: SessionIOInfo, - device_id: int, - model_path: list, - ) -> None: - # The sub process function - - # Create an InferSession - session_cache = aclruntime.InferenceSession( - model_path[0], - device_id, - aclruntime.session_options() - ) - if model_path[1]: - session_skip = aclruntime.InferenceSession( - model_path[1], - device_id, - aclruntime.session_options() - ) - - # Build numpy arrays on the shared buffers - input_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(input_spaces, io_info.input_shapes, io_info.input_dtypes)] - - output_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(output_spaces, io_info.output_shapes, io_info.output_dtypes)] - - # Tell the main function that we are ready - sync_pipe.send('') - - # Keep looping until recived a 'STOP' - while True: - flag = sync_pipe.recv() - if flag == 'cache': - feeds = {} - inputs = session_cache.get_inputs() - for i in range(len(input_arrays)): - feed = aclruntime.Tensor(input_arrays[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - out_names = [out.name for out in session_cache.get_outputs()] - - outputs = session_cache.run(out_names, feeds) - if len(outputs) > 1: - cache = outputs[1] - else: - feeds = {} - inputs = session_skip.get_inputs() - for i in range(len(input_arrays)): - feed = aclruntime.Tensor(input_arrays[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - feeds[inputs[-1].name] = cache - out_names = [out.name for out in session_skip.get_outputs()] - - outputs = session_skip.run(out_names, feeds) - outputs[0].to_host() - output = np.array(outputs[0]) - for i in range(len(output_arrays)): - output_arrays[i][:] = output[:] - - sync_pipe.send('') diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/clip.patch b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/clip.patch deleted file mode 100644 index 7c6cb785636263f8dc44758bfe6266201e66ea67..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/clip.patch +++ /dev/null @@ -1,7 +0,0 @@ -22a23 -> import numpy as np -760c761,762 -< mask.triu_(1) # zero out the lower diagonal ---- -> # mask.triu_(1) # zero out the lower diagonal -> mask = torch.from_numpy(np.triu(mask.numpy(), 1)) diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/clip_score.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/clip_score.py deleted file mode 100644 index e0987baac799142a4ca0e051c3f67b6ac5fede8c..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/clip_score.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import argparse - -import open_clip -import numpy as np -from PIL import Image -import torch -import torch.nn.functional as F - - -def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): - imgs = [] - texts = [] - for image_file in image_files: - img = preprocess(Image.open(image_file)).unsqueeze(0).to(device) - imgs.append(img) - text = tokenizer([prompt]).to(device) - texts.append(text) - - img = torch.cat(imgs) # [bs, 3, 224, 224] - text = torch.cat(texts) # [bs, 77] - - with torch.no_grad(): - text_ft = model_clip.encode_text(text).float() - img_ft = model_clip.encode_image(img).float() - score = F.cosine_similarity(img_ft, text_ft).squeeze() - - return score.cpu() - - -def main(): - args = parse_arguments() - - if args.device is None: - device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') - else: - device = torch.device(args.device) - - t_b = time.time() - print(f"Load clip model...") - model_clip, _, preprocess = open_clip.create_model_and_transforms( - args.model_name, pretrained=args.model_weights_path, device=device) - model_clip.eval() - print(f">done. elapsed time: {(time.time() - t_b):.3f} s") - - tokenizer = open_clip.get_tokenizer(args.model_name) - - with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: - image_info = json.load(f) - - t_b = time.time() - print(f"Calc clip score...") - all_scores = [] - cat_scores = {} - - for i, info in enumerate(image_info): - image_files = info['images'] - category = info['category'] - prompt = info['prompt'] - - print(f"[{i + 1}/{len(image_info)}] {prompt}") - - image_scores = clip_score(model_clip, - tokenizer, - preprocess, - prompt, - image_files, - device) - if len(image_files) > 1: - best_score = max(image_scores) - else: - best_score = image_scores - - print(f"image scores: {image_scores}") - print(f"best score: {best_score}") - - all_scores.append(best_score) - if category not in cat_scores: - cat_scores[category] = [] - cat_scores[category].append(best_score) - print(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") - - average_score = np.average(all_scores) - print(f"====================================") - print(f"average score: {average_score:.3f}") - print(f"category average scores:") - cat_average_scores = {} - for category, scores in cat_scores.items(): - cat_average_scores[category] = np.average(scores) - print(f"[{category}], average score: {cat_average_scores[category]:.3f}") - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--device", - type=str, - default="cpu", - choices=["cpu", "cuda"], - help="device for torch.", - ) - parser.add_argument( - "--image_info", - type=str, - default="./image_info.json", - help="Image_info.json file.", - ) - parser.add_argument( - "--model_name", - type=str, - default="ViT-H-14", - help="open clip model name", - ) - parser.add_argument( - "--model_weights_path", - type=str, - default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", - help="open clip model weights", - ) - return parser.parse_args() - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/hpsv2_score.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/hpsv2_score.py deleted file mode 100644 index 04e9bd8d8f82ece84c642520b001b62901286eda..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/hpsv2_score.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -from typing import Union -import json - -from clint.textui import progress -import hpsv2 -from hpsv2.utils import root_path, hps_version_map -from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer -import huggingface_hub -from PIL import Image -import requests -import torch - - -def initialize_model(pretrained_path, device): - model, _, preprocess_val = create_model_and_transforms( - "ViT-H-14", pretrained=pretrained_path, precision='amp', - device=device, - jit=False, - force_quick_gelu=False, - force_custom_text=False, - force_patch_dropout=False, - force_image_size=None, - pretrained_image=False, - image_mean=None, - image_std=None, - light_augmentation=True, - aug_cfg={}, - output_dict=True, - with_score_predictor=False, - with_region_predictor=False - ) - return model, preprocess_val - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--image_info", - type=str, - default="./image_info.json", - help="Image_info.json file.", - ) - parser.add_argument( - "--HPSv2_checkpoint", - type=str, - default="./HPS_v2_compressed.pt", - help="HPS_v2 model weights", - ) - parser.add_argument( - "--clip_checkpoint", - type=str, - default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", - help="open clip model weights", - ) - return parser.parse_args() - - -def main(): - args = parse_arguments() - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - model, preprocess_val = initialize_model(args.clip_checkpoint, device) - - checkpoint = torch.load(args.HPSv2_checkpoint, map_location=device) - model.load_state_dict(checkpoint['state_dict']) - tokenizer = get_tokenizer('ViT-H-14') - model = model.to(device) - model.eval() - - with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: - image_info = json.load(f) - - result = [] - for i, info in enumerate(image_info): - image_file = info['images'][0] - prompt = info['prompt'] - - # Load your image and prompt - with torch.no_grad(): - # Process the image - if isinstance(image_file, str): - image = preprocess_val(Image.open(image_file)) - elif isinstance(image_file, Image.Image): - image = preprocess_val(image_file) - else: - raise TypeError('The type of parameter img_path is illegal.') - image = image.unsqueeze(0).to(device=device, non_blocking=True) - # Process the prompt - text = tokenizer([prompt]).to(device=device, non_blocking=True) - # Calculate the HPS - with torch.cuda.amp.autocast(): - outputs = model(image, text) - image_features = outputs["image_features"] - text_features = outputs["text_features"] - logits_per_image = image_features @ text_features.T - - hps_score = torch.diagonal(logits_per_image).cpu().numpy() - print(f"image {i} hps_score: ", hps_score[0]) - - result.append(hps_score[0]) - - print('avg HPSv2 score:', sum(result) / len(result)) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/modify_onnx.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/modify_onnx.py deleted file mode 100644 index e0a82946beda796772ff8699f1f959a70ed9350a..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/modify_onnx.py +++ /dev/null @@ -1,493 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import numpy as np -from auto_optimizer import OnnxGraph - - -def del_add(model): - init = [n.name for n in model.get_nodes('Initializer')] - for node in model.get_nodes('Add'): - if 'attn' in node.name and node.inputs[1] in init: - value = model[node.inputs[1]].value - if (value == 0).all(): - model.remove(node.name) - - -def add_flash_attention(model, fa_name, soc_type): - for node in model.get_nodes('Mul'): - name = node.name - if soc_type == 1: - flag = 'attn' in name - else: - flag = 'attn1' in name - if flag: - matmul = model[name[:-3] + 'to_q/MatMul'] - reshape = model[name[:-3] + 'Reshape'] - seqlen = 4096 - if soc_type == 3 and model[reshape.inputs[1]].value[1] != seqlen: - continue - softmax_node = model.get_next_nodes(node.outputs[0])[0] - if soc_type == 1: - # move mul to q - softmax_node.inputs[0] = node.inputs[0] - node.inputs[0] = matmul.outputs[0] - reshape.inputs[0] = node.outputs[0] - - # add flashattention - new_node = model.add_node(name[:-3] + fa_name, fa_name) - if soc_type == 3: - new_node.attrs = { - 'input_layout': 'BSH', - 'num_heads': 10, - 'scale_value': 0.125, - 'next_tokens': 65535 - } - inputs = [None, None, None] - # input 0: q - if soc_type == 1: - matmul_node = model.get_prev_node(softmax_node.inputs[0]) - if soc_type == 3: - matmul_node = model.get_prev_node(node.inputs[0]) - inputs[0] = matmul_node.inputs[0] - # input 1: k - transpose_node = model.get_prev_node(matmul_node.inputs[1]) - inputs[1] = transpose_node.inputs[0] - # input 2: v - cast_node = model.get_next_nodes(softmax_node.outputs[0])[0] - last_node = model.get_next_nodes(cast_node.outputs[0])[0] - inputs[2] = last_node.inputs[1] - # output - outputs = last_node.outputs - # update link - new_node.inputs = inputs - new_node.outputs = outputs - - model.remove(matmul_node.name, {}) - model.remove(transpose_node.name, {}) - model.remove(softmax_node.name, {}) - model.remove(cast_node.name, {}) - model.remove(last_node.name, {}) - model.update_map() - for node in model.get_nodes(fa_name): - for _ in range(soc_type): - for i in range(3): - prev_node = model.get_prev_node(node.inputs[i]) - model.remove(prev_node.name) - next_node = model.get_next_nodes(node.outputs[0])[0] - model.remove(next_node.name) - - -def change_input(model, bs): - inputs = [inp.name for inp in model.inputs] - for inp in inputs: - shape = model[inp].shape - dtype = model[inp].dtype - if inp == 't': - dtype = 'int32' - else: - shape[0] *= bs - model.remove(inp) - model.add_input(inp, shape=shape, dtype=dtype) - - -def get_index(model, init, name): - if name in init: - return model[name].value - else: - return name - - -def replace_slice(model, fast): - # find pairs of slice - slice_pair = [] - for node in model.get_nodes('Slice'): - if node.name[-2:] == '_1': - slice_pair.append((model[node.name[:-2]], model[node.name])) - # replace - init = [n.name for n in model.get_nodes('Initializer')] - for pair in slice_pair: - next_node = model.get_next_nodes(pair[0].outputs[0])[0] - if fast and next_node.op_type == 'Mul': - name = pair[0].name[:-5] + 'SliceTransGeluMul' - model.add_node(name, 
'SliceTransGeluMul', inputs=[pair[0].inputs[0]], outputs=next_node.outputs) - model.remove(next_node.name, {}) - else: - name = pair[0].name[:-5] + 'Split' - data = pair[0].inputs[0] - start_0 = get_index(model, init, pair[0].inputs[1]) - end_0 = get_index(model, init, pair[0].inputs[2]) - start_1 = get_index(model, init, pair[1].inputs[1]) - end_1 = get_index(model, init, pair[1].inputs[2]) - if start_1 == end_0: - outputs = pair[0].outputs + pair[1].outputs - elif start_0 == end_1: - outputs = pair[1].outputs + pair[0].outputs - - axes = pair[0].inputs[3] - axis = model[axes].value[0] - model.add_node(name, 'Split', inputs=[data], outputs=outputs, attrs={'axis': axis}) - model.remove(pair[0].name, {}) - model.remove(pair[1].name, {}) - model.update_map() - - -def build_index(h, w, sy=2, sx=2): - # random select one from a 2x2 block - hsy = h // sy - wsx = w // sx - rand_idx = np.random.randint(sy * sx, size=(hsy, wsx)) - - idx = np.ones((hsy, wsx, sy * sx), dtype=np.int64) - for i in range(hsy): - for j in range(wsx): - idx[i, j][rand_idx[i, j]] = 0 - idx = idx.reshape(hsy, wsx, sy, sx).transpose(0, 2, 1, 3) - idx_rand = idx.reshape(-1).argsort() - index_a = np.sort(idx_rand[hsy * wsx:]) - index_b = np.sort(idx_rand[:hsy * wsx]) - return index_a, index_b - - -def get_block(model): - # find self-attention block - norms = [] - for node in model.get_nodes('Add'): - next_nodes = model.get_next_nodes(node.outputs[0]) - if next_nodes[0].op_type == 'AscendQuant': - next_nodes = model.get_next_nodes(next_nodes[0].outputs[0]) - if len(next_nodes) != 3: - continue - op_type = set(n.op_type for n in next_nodes) - if len(op_type) == 1 and 'MatMul' in op_type: - if model[node.inputs[1]].value.shape[0] == 640: - norms.append(node) - return norms - - -def find_nodes(model, node): - prev_node = model.get_prev_node(node.inputs[0]) - while prev_node.op_type != 'Sub': - prev_node = model.get_prev_node(prev_node.inputs[0]) - inp = prev_node.inputs[0] - next_nodes = model.get_next_nodes(inp) - for next_node in next_nodes: - if next_node.op_type == 'Add': - if next_node.inputs[0] == inp: - out = next_node.inputs[1] - else: - out = next_node.inputs[0] - return inp, out - - -def build_tome_block(model, name, inputs, inputs_un): - # link merge to attn - for node in model.get_next_nodes(inputs[1]): - ind = 0 - for inp in node.inputs: - if inp == inputs[1]: - node.inputs[ind] = name + 'Concat_output' - ind += 1 - # norm block - model.add_node( - name + 'Mul', - 'Mul', - inputs=[inputs[0], inputs[0]], - outputs=[name + 'Mul_output'] - ) - model.add_node( - name + 'ReduceSum', - 'ReduceSum', - inputs=[name + 'Mul_output'], - outputs=[name + 'ReduceSum_output'], - attrs={'axes': [-1], 'keepdims': 1} - ) - model.add_node( - name + 'Sqrt', - 'Sqrt', - inputs=[name + 'ReduceSum_output'], - outputs=[name + 'Sqrt_output'] - ) - model.add_node( - name + 'Div', - 'Div', - inputs=[inputs[0], name + 'Sqrt_output'], - outputs=[name + 'Div_output'] - ) - # compute similarity - model.add_node( - name + 'Gather_0', - 'Gather', - inputs=[name + 'Div_output', 'tome/Gather_index_a'], - outputs=[name + 'Gather_0_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Gather_1', - 'Gather', - inputs=[name + 'Div_output', 'tome/Gather_index_b'], - outputs=[name + 'Gather_1_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Transpose', - 'Transpose', - inputs=[name + 'Gather_1_output'], - outputs=[name + 'Transpose_output'], - attrs={'perm': [0, 2, 1]} - ) - model.add_node( - name + 'MatMul', - 'MatMul', - inputs=[name 
+ 'Gather_0_output', name + 'Transpose_output'], - outputs=[name + 'MatMul_output'] - ) - model.add_node( - name + 'FindMax', - 'FindMax', - inputs=[name + 'MatMul_output'], - outputs=[name + 'FindMax_output_0', name + 'FindMax_output_1'], - attrs={} - ) - model.add_node( - name + 'TopK', - 'TopK', - inputs=[name + 'FindMax_output_0', 'tome/Topk_k'], - outputs=[name + 'TopK_output_0', name + 'TopK_output_1'], - attrs={'axis': -1, 'largest': 1} - ) - # split token - model.add_node( - name + 'Gather_2', - 'Gather', - inputs=[inputs[1], 'tome/Gather_index_a'], - outputs=[name + 'Gather_2_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Gather_3', - 'Gather', - inputs=[inputs[1], 'tome/Gather_index_b'], - outputs=[name + 'Gather_3_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Cast_0', - 'Cast', - inputs=[name + 'Gather_2_output'], - outputs=[name + 'Cast_0_output'], - attrs={'to': 1} - ) - model.add_node( - name + 'Cast_1', - 'Cast', - inputs=[name + 'Gather_3_output'], - outputs=[name + 'Cast_1_output'], - attrs={'to': 1} - ) - # tome merge - merge_inputs = [ - name + 'Cast_0_output', - name + 'Cast_1_output', - name + 'TopK_output_1', - name + 'FindMax_output_1' - ] - merge_outputs = [ - name + 'TomeMerged_output_0', - name + 'TomeMerged_output_1', - name + 'TomeMerged_output_2' - ] - model.add_node( - name + 'TomeMerged', - 'TomeMerged', - inputs=merge_inputs, - outputs=merge_outputs - ) - model.add_node( - name + 'ReduceSum_1', - 'ReduceSum', - inputs=[name + 'TomeMerged_output_1'], - outputs=[name + 'ReduceSum_1_output'], - attrs={'axes': [1], 'keepdims': 0} - ) - model.add_node( - name + 'ReduceSum_2', - 'ReduceSum', - inputs=[name + 'TomeMerged_output_2'], - outputs=[name + 'ReduceSum_2_output'], - attrs={'axes': [1], 'keepdims': 0} - ) - model.add_node( - name + 'Unsqueeze', - 'Unsqueeze', - inputs=[name + 'ReduceSum_2_output'], - outputs=[name + 'Unsqueeze_output'], - attrs={'axes': [2]} - ) - model.add_node( - name + 'Div_1', - 'Div', - inputs=[name + 'ReduceSum_1_output', name + 'Unsqueeze_output'], - outputs=[name + 'Div_1_output'] - ) - model.add_node( - name + 'Concat', - 'Concat', - inputs=[name + 'TomeMerged_output_0', name + 'Div_1_output'], - outputs=[name + 'Concat_output'], - attrs={'axis': 1} - ) - # link unmerge to norm - for node in model.get_next_nodes(inputs_un[0]): - ind = 0 - for inp in node.inputs: - if inp == inputs_un[0]: - node.inputs[ind] = name + 'TomeUngerme_output' - ind += 1 - # add unmerge node - unmerge_inputs = inputs_un + [name + 'TopK_output_1', name + 'FindMax_output_1'] - model.add_node( - name + 'tome/TomeUnmerge', - 'TomeUnmerged', - inputs=unmerge_inputs, - outputs=[name + 'TomeUngerme_output'] - ) - model.update_map() - - -def insert_tome_block(model, max_num): - bs = model['latent_model_input'].shape[0] - h, w = model['latent_model_input'].shape[2:] - h = h // 2 - w = w // 2 - index_a, index_b = build_index(h, w) - # add initializer - model.add_initializer('tome/Gather_index_a', index_a) - model.add_initializer('tome/Gather_index_b', index_b) - bs_index_a = np.tile(index_a.reshape(1, -1), [bs, 1]) - bs_index_b = np.tile(index_b.reshape(1, -1), [bs, 1]) - model.add_initializer('tome/index_a', bs_index_a) - model.add_initializer('tome/index_b', bs_index_b) - model.add_initializer('tome/Topk_k', np.array([3072])) - # get reshape nodes - reshapes = model.get_nodes('Reshape') - # find inputs - norm_outs = get_block(model)[:max_num] - for node in norm_outs: - name = node.name.rsplit('/', 2)[0] + '/attn1/' - norm_input, 
sa_output = find_nodes(model, node) - inputs_0 = [norm_input] + node.outputs - inputs_1 = [sa_output] + ['tome/index_a', 'tome/index_b'] - # add tome block - build_tome_block(model, name.replace('attn', 'tome'), inputs_0, inputs_1) - # change shape of reshape - for reshape in reshapes: - if name in reshape.name: - shape = model[reshape.inputs[1]].value.copy() - ind = 0 - for size in shape: - if size == 4096: - shape[ind] = '-1' - ind += 1 - model[reshape.inputs[1]].value = shape - - -def change_bs(model, bs): - node = model.get_nodes('Expand')[0] - node.inputs[1] = 'bs' - model.add_initializer('bs', value=np.array([bs])) - - inits = [init.name for init in model.initializers] - shapes = [] - for node in model.get_nodes('Reshape'): - shape = node.inputs[1] - if shape in inits and shape not in shapes: - shapes.append(shape) - value = model[shape].value.copy() - value[0] *= bs - model[shape].value = value - - model.update_map() - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - type=str, - default="models/unet/unet.onnx", - help="Path of the unet onnx model.", - ) - parser.add_argument( - "--new_model", - type=str, - default="models/unet/unet_md.onnx", - help="Path to save the modified model", - ) - parser.add_argument( - "--FA_soc", - choices=["None", "Duo", "A2"], - default="None", - help="Type of FA operator.", - ) - parser.add_argument( - "--TOME_num", - type=int, - default=0, - help="Number of TOME used in the model", - ) - parser.add_argument( - "--faster_gelu", - action="store_true", - help="Use specific gelu operation" - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="Batch size" - ) - parser.add_argument( - "--parallel", - action="store_true", - help="Use parallel unet model" - ) - return parser.parse_args() - - -def main(): - model = OnnxGraph.parse(args.model) - del_add(model) - if args.parallel: - batch_size = args.batch_size - else: - batch_size = args.batch_size * 2 - if batch_size > 1: - change_bs(model, batch_size) - change_input(model, batch_size) - if args.FA_soc == 'Duo': - add_flash_attention(model, 'FlashAttentionTik', soc_type=1) - elif args.FA_soc == 'A2': - add_flash_attention(model, 'NPUPromptFlashAttention', soc_type=3) - if args.TOME_num: - insert_tome_block(model, args.TOME_num) - replace_slice(model, args.faster_gelu) - model.remove_unused_nodes() - model.save(args.new_model) - - -if __name__ == '__main__': - args = parse_arguments() - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/pipeline_ascend_stable_diffusionxl.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/pipeline_ascend_stable_diffusionxl.py deleted file mode 100644 index 7a295a2cb2e5ee8667b5ae11951284abf933e37e..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/pipeline_ascend_stable_diffusionxl.py +++ /dev/null @@ -1,589 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import aclruntime -import numpy as np -import torch -from ais_bench.infer.interface import InferSession -from diffusers import StableDiffusionXLPipeline -from diffusers.loaders import TextualInversionLoaderMixin - - -class AscendStableDiffusionXLPipeline(StableDiffusionXLPipeline): - def encode_prompt( - self, - prompt, - prompt_2, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - negative_prompt_2, - lora_scale, - clip_skip, - encode_session, - encode_session_2 - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
- """ - prompt = [prompt] if isinstance(prompt, str) else prompt - - if prompt is not None: - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [encode_session, encode_session_2] if encode_session is not None else [encode_session_2] - ) - - prompt_2 = prompt_2 or prompt - prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 - - # textual inversion: procecss multi-vector tokens if necessary - prompt_embeds_list = [] - prompts = [prompt, prompt_2] - for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, tokenizer) - - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - print("[warning] The following part of your input was truncated" - " because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - - prompt_embeds = text_encoder.infer([text_input_ids.to("cpu").numpy()]) - - prompt_embeds = [torch.from_numpy(text) for text in prompt_embeds] - - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = prompt_embeds[0] - if clip_skip is None: - prompt_embeds = prompt_embeds[-2] - else: - # "2" because SDXL always indexes from the penultimate layer. - prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] - - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt - if do_classifier_free_guidance and zero_out_negative_prompt: - negative_prompt_embeds = torch.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance: - negative_prompt = negative_prompt or "" - negative_prompt_2 = negative_prompt_2 or negative_prompt - - # normalize str to list - negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - negative_prompt_2 = ( - batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 - ) - - uncond_tokens: List[str] - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = [negative_prompt, negative_prompt_2] - - negative_prompt_embeds_list = [] - for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): - if isinstance(self, TextualInversionLoaderMixin): - negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - negative_prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - negative_prompt_embeds = text_encoder.infer( - uncond_input.input_ids.to(device).numpy() - ) - # We are only ALWAYS interested in the pooled output of the final text encoder - negative_prompt_embeds = [torch.from_numpy(text) for text in negative_prompt_embeds] - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] - - negative_prompt_embeds_list.append(negative_prompt_embeds) - - negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) - - prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device="cpu") - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device="cpu") - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( - bs_embed * num_images_per_prompt, -1 - ) - if do_classifier_free_guidance: - negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( - bs_embed * num_images_per_prompt, -1 - ) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - - @torch.no_grad() - def ascend_infer( - self, - prompt: Union[str, List[str]], - prompt_2: Optional[Union[str, List[str]]], - encode_session: InferSession, - encode_session_2: InferSession, - unet_sessions: List[list], - scheduler_session: InferSession, - vae_session: InferSession, - skip_status: List[int], - device_id: int = 0, - use_npu_scheduler: bool = False, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - denoising_end: Optional[float] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: 
Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - denoising_end (`float`, *optional*): - When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be - completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise as determined by the discrete timesteps selected by the - scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a - "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) - guidance_scale (`float`, *optional*, defaults to 5.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. 
Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - """ - # 0. Default height and width to unet - height = height or self.default_sample_size * self.vae_scale_factor - width = width or self.default_sample_size * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - height, - width, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) - - # 2. 
Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt - lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - lora_scale=lora_scale, - clip_skip=clip_skip, - encode_session=encode_session, - encode_session_2=encode_session_2 - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare added time ids & embeddings - unet_session, unet_session_bg = unet_sessions - use_parallel_inferencing = unet_session_bg is not None - add_text_embeds = pooled_prompt_embeds - - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - dtype=prompt_embeds.dtype, - ) - if negative_original_size is not None and negative_target_size is not None: - negative_add_time_ids = self._get_add_time_ids( - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - ) - else: - negative_add_time_ids = add_time_ids - - if do_classifier_free_guidance and not use_parallel_inferencing: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) - - add_text_embeds = add_text_embeds.numpy() - add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1).numpy() - - # 8. 
Denoising loop - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - prompt_embeds = prompt_embeds.numpy() - # 8.1 Apply denoising_end - if ( - denoising_end is not None - and isinstance(denoising_end, float) - and denoising_end > 0 - and denoising_end < 1 - ): - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (denoising_end * self.scheduler.config.num_train_timesteps) - ) - ) - num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) - timesteps = timesteps[:num_inference_steps] - - cache = None - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - t_numpy = t[None].numpy() - if not use_parallel_inferencing and do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - if use_parallel_inferencing and do_classifier_free_guidance: - unet_session_bg.infer_asyn( - [ - latent_model_input, - t_numpy.astype(np.int32), - negative_prompt_embeds.numpy(), - negative_pooled_prompt_embeds.numpy(), - negative_add_time_ids.numpy(), - ], - skip_status[i] - ) - - if skip_status[i]: - inputs = [ - latent_model_input.numpy(), - t_numpy.astype(np.int32), - prompt_embeds, - add_text_embeds, - add_time_ids, - cache, - ] - noise_pred = torch.from_numpy( - np.array(self.unet_infer(unet_session[1], inputs, device_id)[0]) - ) - else: - inputs = [ - latent_model_input.numpy(), - t_numpy.astype(np.int32), - prompt_embeds, - add_text_embeds, - add_time_ids, - ] - outputs = self.unet_infer(unet_session[0], inputs, device_id) - noise_pred = torch.from_numpy(np.array(outputs[0])) - if len(outputs) > 1: - cache = outputs[1] - - if do_classifier_free_guidance: - if use_parallel_inferencing: - noise_pred_uncond = torch.from_numpy(unet_session_bg.wait_and_get_outputs()[0]) - else: - noise_pred_uncond, noise_pred = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) - - # perform guidance - if use_npu_scheduler: - latents = torch.from_numpy( - scheduler_session.infer( - [ - noise_pred.numpy(), - t_numpy, - latents.numpy(), - np.array(i) - ] - )[0] - ) - - else: - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs, return_dict=False, - )[0] - - if not output_type == "latent": - latents = latents / self.vae.config.scaling_factor - latents = self.vae.post_quant_conv(latents) - image = torch.from_numpy(vae_session.infer([latents.numpy()])[0]) - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - - else: - image = latents - - if output_type == "pil": - image = self.numpy_to_pil(image) - - return (image, None) - - def unet_infer(self, session, data, device_id): - feeds = {} - inputs = session.get_inputs() - for i, inp in enumerate(inputs): - if inp.name == 'cache': - feeds[inp.name] = data[i] - continue - feed = aclruntime.Tensor(data[i]) - feed.to_device(device_id) - feeds[inp.name] = feed - out_names = [out.name for out in session.get_outputs()] - - outputs = session.run(out_names, feeds) - outputs[0].to_host() - return outputs \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/prompts.txt b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/prompts.txt deleted file mode 100644 index 
2eeaefc73f76d2bdb5edd4d03e298f066c931527..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/prompts.txt +++ /dev/null @@ -1,16 +0,0 @@ -Beautiful illustration of The ocean. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Islands in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Seaports in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of The waves. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Grassland. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Wheat. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Hut Tong. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of The boat. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Pine trees. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Bamboo. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of The temple. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Cloud in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Sun in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Spring. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Lotus. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper -Beautiful illustration of Snow piles. in a serene landscape, magic realism, narrative realism, beautiful matte painting, heavenly lighting, retrowave, 4 k hd wallpaper \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/quant_unet.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/quant_unet.py deleted file mode 100644 index a9c2d8f90ecf6fd61327ab113d309c8e8e3bba77..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/quant_unet.py +++ /dev/null @@ -1,513 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
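The denoising loop in the pipeline deleted above alternates between two UNet graphs when `--use_cache` is enabled: on most steps the full graph (`unet_cache.om`) runs and returns, besides the noise prediction, an intermediate up-block feature map; on the steps flagged in `skip_status`, a reduced graph (`unet_skip.om`) consumes that cached feature map as an extra input instead of recomputing the deep blocks (the two graphs are built by `unet_cache.py` later in this diff). A minimal sketch of the per-step dispatch, with `infer_full`/`infer_skip` standing in for the aclruntime-backed `unet_infer` calls shown above:

```python
from typing import Callable, List, Optional

import numpy as np


def run_denoising_with_cache(steps_inputs: List[List[np.ndarray]],
                             skip_status: List[int],
                             infer_full: Callable[[list], list],
                             infer_skip: Callable[[list], list]) -> List[np.ndarray]:
    """Sketch of the cache/skip dispatch used in the loop above.

    infer_full / infer_skip stand in for sessions built from unet_cache.om and
    unet_skip.om; each returns a list whose first element is the noise prediction.
    """
    noise_preds = []
    cache: Optional[np.ndarray] = None
    for inputs, skip in zip(steps_inputs, skip_status):
        if skip and cache is not None:
            # reduced graph: reuse the cached up-block feature map as an extra input
            noise_pred = infer_skip(inputs + [cache])[0]
        else:
            # full graph: also returns the feature map to cache for later steps
            outputs = infer_full(inputs)
            noise_pred = outputs[0]
            if len(outputs) > 1:
                cache = outputs[1]
        noise_preds.append(noise_pred)
    return noise_preds
```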
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -from ais_bench.infer.interface import InferSession -from diffusers import DPMSolverMultistepScheduler, EulerDiscreteScheduler, DDIMScheduler -from modelslim.onnx.squant_ptq.onnx_quant_tools import OnnxCalibrator -from modelslim.onnx.squant_ptq.quant_config import QuantConfig -from auto_optimizer import OnnxGraph -import numpy as np -import onnx -import torch - -from background_session import BackgroundInferSession -from pipeline_ascend_stable_diffusionxl import AscendStableDiffusionXLPipeline -from stable_diffusionxl_ascend_infer import check_device_range_valid - - -class StableDiffusionXLDumpPipeline(AscendStableDiffusionXLPipeline): - @torch.no_grad() - def dump_data( - self, - prompt: Union[str, List[str]], - prompt_2: Optional[Union[str, List[str]]], - encode_session: InferSession, - encode_session_2: InferSession, - unet_sessions: List[List[InferSession]], - scheduler_session: InferSession, - dump_num: int = 10, - use_npu_scheduler: bool = False, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - denoising_end: Optional[float] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - ): - # 0. Default height and width to unet - height = height or self.default_sample_size * self.vae_scale_factor - width = width or self.default_sample_size * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - height, - width, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) - - # 2. 
Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt - lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - lora_scale=lora_scale, - clip_skip=clip_skip, - encode_session=encode_session, - encode_session_2=encode_session_2 - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare added time ids & embeddings - unet_session, unet_session_bg = unet_sessions - use_parallel_inferencing = unet_session_bg is not None - add_text_embeds = pooled_prompt_embeds - - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - dtype=prompt_embeds.dtype, - ) - if negative_original_size is not None and negative_target_size is not None: - negative_add_time_ids = self._get_add_time_ids( - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - ) - else: - negative_add_time_ids = add_time_ids - - if do_classifier_free_guidance and not use_parallel_inferencing: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) - - add_text_embeds = add_text_embeds.numpy() - add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1).numpy() - - # 8. 
Denoising loop - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - prompt_embeds = prompt_embeds.numpy() - # 8.1 Apply denoising_end - if ( - denoising_end is not None - and isinstance(denoising_end, float) - and denoising_end > 0 - and denoising_end < 1 - ): - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (denoising_end * self.scheduler.config.num_train_timesteps) - ) - ) - num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) - timesteps = timesteps[:num_inference_steps] - - dump_data = [] - start_id = num_inference_steps // 2 - dump_num // 2 - end_id = start_id + dump_num - - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - t_numpy = t[None].numpy() - if not use_parallel_inferencing and do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - if start_id <= i < end_id: - dump_data.append([latent_model_input, t_numpy, prompt_embeds, add_text_embeds, add_time_ids]) - elif i == end_id: - break - - if use_parallel_inferencing and do_classifier_free_guidance: - unet_session_bg.infer_asyn( - [ - latent_model_input, - t_numpy, - negative_prompt_embeds.numpy(), - negative_pooled_prompt_embeds.numpy(), - negative_add_time_ids.numpy(), - ], - ) - - inputs = [ - latent_model_input.numpy(), - t_numpy, - prompt_embeds, - add_text_embeds, - add_time_ids, - ] - noise_pred = torch.from_numpy(unet_session.infer(inputs)[0]) - - if do_classifier_free_guidance: - if use_parallel_inferencing: - noise_pred_uncond = torch.from_numpy(unet_session_bg.wait_and_get_outputs()[0]) - else: - noise_pred_uncond, noise_pred = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) - - # perform guidance - if use_npu_scheduler: - latents = torch.from_numpy( - scheduler_session.infer( - [ - noise_pred.numpy(), - t_numpy, - latents.numpy(), - np.array(i) - ] - )[0] - ) - - else: - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs, return_dict=False, - )[0] - - return dump_data - - -def get_quant_data(node, param, graph): - input_scale = param.input_scale - weight_scale = param.weight_scale - input_offset = param.input_offset - quant_weight = param.quant_weight - node_name = '_'.join(node.inputs[1].split('_')[:-1]) - scale = input_scale[node_name] * weight_scale[node_name] - packed_weight_np_data = scale.squeeze() - float32_scale_deq = np.array(packed_weight_np_data, np.float32) - uint32_scale_deq = np.frombuffer(float32_scale_deq, np.uint32) - uint64_result = np.zeros(float32_scale_deq.shape, np.int64) - if len(uint64_result.shape) == 0: - uint64_result = np.expand_dims(uint64_result, axis=0) - uint64_result |= np.int64(uint32_scale_deq) - graph.add_initializer('_'.join([node.name, 'scale']), uint64_result) - graph.add_initializer('_'.join([node.name, 'offset']), np.array(0).astype(np.float32)) - correction = quant_weight[node_name].astype(np.float32).sum(axis=0)*input_offset[node_name].astype(np.float32) - - return scale, correction - - -def modify_quant_fuse(unet, quant, param): - quant_graph = OnnxGraph.parse(quant) - unet_graph = OnnxGraph.parse(unet) - quant_op_type = "AscendDequant" - quant_list = quant_graph.get_nodes(quant_op_type) - input_scale = param.input_scale - weight_scale = 
param.weight_scale - input_offset = param.input_offset - quant_weight = param.quant_weight - for node in quant_list: - pre_node = quant_graph.get_prev_node(node.inputs[0]) - if pre_node.op_type == "MatMul": - _, _ = get_quant_data(pre_node, param, quant_graph) - x = pre_node.inputs[1] - w = quant_graph[x].value - quant_graph[x].value = w.transpose(1,0) - node_name = pre_node.name - pre_input = pre_node.inputs[0] - quant_graph.remove(pre_node.name, mapping={}) - quant_graph.add_node(node_name, - "QuantBatchMatMul", - inputs=[pre_input, x, '_'.join([node_name, 'scale']), - '_'.join([node_name, 'offset'])], - outputs=[node.outputs[0]], - attrs={"dtype":0, "transpose_x2":True}) - quant_graph.remove(node.name, mapping={}) - quant_graph.update_map() - elif pre_node.op_type == "Add": - matmul_node = quant_graph.get_prev_node(pre_node.inputs[0]) - scale, correction = get_quant_data(matmul_node, param, quant_graph) - x = matmul_node.inputs[1] - w = quant_graph[x].value - quant_graph[x].value = w.transpose(1,0) - ori_bias = np.round(unet_graph[unet_graph[pre_node.name].inputs[0]].value / scale - correction).astype(np.int32) - quant_graph.add_initializer('_'.join([matmul_node.name, 'bias']), ori_bias) - node_name = matmul_node.name - matmul_input = matmul_node.inputs[0] - quant_graph.remove(matmul_node.name, mapping={}) - quant_graph.add_node(node_name, - "QuantBatchMatMul", - inputs=[matmul_input, x, - '_'.join([node_name, 'scale']), - '_'.join([node_name, 'offset']), - '_'.join([node_name, 'bias'])], - outputs=[node.outputs[0]], - attrs={"dtype":0, "transpose_x2":True}) - quant_graph.remove(pre_node.name, mapping={}) - quant_graph.remove(node.name, mapping={}) - quant_graph.update_map() - - return quant_graph - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-2-1-base", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "--prompt_file", - type=str, - default="prompts.txt", - help="A prompt file used to generate images.", - ) - parser.add_argument( - "--model_dir", - type=str, - default="./models", - help="Base path of om models.", - ) - parser.add_argument( - "--save_path", - type=str, - default="unet_quant", - help="Path to save result images.", - ) - parser.add_argument( - "--scheduler", - choices=["DDIM", "Euler", "DPM", "EulerAncestral", "DPM++SDEKarras"], - default="DDIM", - help="Type of Sampling methods. Can choose from DDIM, Euler, DPM", - ) - parser.add_argument( - "--device", - type=check_device_range_valid, - default=0, - help="NPU device id. Give 2 ids to enable parallel inferencing." 
- ) - parser.add_argument( - "--steps", - type=int, - default=50, - help="Number of inference steps.", - ) - parser.add_argument( - "--data_num", - type=int, - default=10, - help="the number of real data used in quant process" - ) - parser.add_argument( - "--data_free", - action='store_true', - help="do not use real data" - ) - - return parser.parse_args() - - -def main(): - args = parse_arguments() - - unet_onnx = os.path.join(args.model_dir, "unet", "unet.onnx") - - if args.data_free: - data = [[]] - - input_shape = '' - model = onnx.load(unet_onnx) - inputs = model.graph.input - - for inp in inputs: - dims = inp.type.tensor_type.shape.dim - shape = [str(x.dim_value) for x in dims] - input_shape += inp.name + ':' + ','.join(shape) + ';' - if args.data_free: - dtype = inp.type.tensor_type.elem_type - data_size = [x.dim_value for x in dims] - if dtype == 1: - data[0].append(np.random.random(data_size).astype(np.float32)) - if dtype == 7: - data[0].append(np.random.randint(10, size=data_size).astype(np.int64)) - - if not args.data_free: - device = None - device_2 = None - - if isinstance(args.device, list): - device, device_2 = args.device - else: - device = args.device - - batch_size = inputs[0].type.tensor_type.shape.dim[0].dim_value - if not device_2: - batch_size = batch_size // 2 - - pipe = StableDiffusionXLDumpPipeline.from_pretrained(args.model).to("cpu") - - use_npu_scheduler = False - - if args.scheduler == "DDIM": - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - use_npu_scheduler = True - - elif args.scheduler == "Euler": - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "DPM": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "EulerAncestral": - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "DPM++SDEKarras": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.scheduler.config.algorithm_type = 'sde-dpmsolver++' - pipe.scheduler.config.use_karras_sigmas = True - - encoder_om = os.path.join(args.model_dir, "text_encoder", "text_encoder.om") - encoder_om_2 = os.path.join(args.model_dir, "text_encoder", "text_encoder_2.om") - unet_om = os.path.join(args.model_dir, "unet", "unet.om") - - encoder_session = InferSession(device, encoder_om) - encoder_session_2 = InferSession(device, encoder_om_2) - unet_session = InferSession(device, unet_om) - - if use_npu_scheduler: - scheduler_om = os.path.join(args.model_dir, "ddim", "ddim.om") - scheduler_session = InferSession(device, scheduler_om) - else: - scheduler_session = None - - unet_session_bg = None - if device_2: - unet_session_bg = BackgroundInferSession.clone(unet_session, device_2, [unet_om, ""]) - - with os.fdopen(os.open(args.prompt_file, os.O_RDONLY), "r") as f: - prompts = [line.strip() for line in f] - - data = pipe.dump_data( - prompts[:batch_size], - "", - encoder_session, - encoder_session_2, - [unet_session, unet_session_bg], - scheduler_session, - args.data_num, - num_inference_steps=args.steps, - guidance_scale=5.0, - use_npu_scheduler=use_npu_scheduler, - ) - - if unet_session_bg: - unet_session_bg.stop() - - config = QuantConfig( - disable_names=[], - quant_mode=0, - amp_num=0, - use_onnx=False, - disable_first_layer=True, - quant_param_ops=['Conv', 'MatMul'], - atc_input_shape=input_shape[:-1], - num_input=len(inputs), - ) - - calib = OnnxCalibrator(unet_onnx, config, calib_data=data) - 
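# The calibration pass below: calib.run() performs the modelslim SQuant
# post-training quantization over calib_data -- either the real UNet step inputs
# dumped by StableDiffusionXLDumpPipeline.dump_data above, or the random tensors
# built in the --data_free branch. export_quant_onnx() then writes the quantized
# graph, and modify_quant_fuse() folds the per-layer dequant scales (packed into
# int64 by get_quant_data) into QuantBatchMatMul nodes before the fused model is
# saved as unet_fuse.onnx.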
calib.run() - quant_path = os.path.join(args.model_dir, args.save_path) - if not os.path.exists(quant_path): - os.makedirs(quant_path, mode=0o744) - quant_onnx = os.path.join(quant_path, 'unet.onnx') - calib.export_quant_onnx(quant_onnx, use_external=True) - quant_numpy = calib._get_quant_params() - graph = modify_quant_fuse(unet_onnx, quant_onnx, quant_numpy) - fuse_path = os.path.join(quant_path, 'unet_fuse.onnx') - graph.save(fuse_path) - -if __name__ == "__main__": - main() diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/requirements.txt b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/requirements.txt deleted file mode 100644 index b6055abcc57b3691689c1082cd8aca579d088445..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch==1.13.0 -diffusers==0.21.0 -transformers==4.26.1 -open_clip_torch==2.20.0 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/stable_diffusionxl_2_onnx.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/stable_diffusionxl_2_onnx.py deleted file mode 100644 index 15d8f1569b3e64fe619977fb152c91411a109a76..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/stable_diffusionxl_2_onnx.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from argparse import Namespace - -import torch -import torch.nn as nn -from diffusers import DDIMScheduler -from diffusers import StableDiffusionXLPipeline - - -def parse_arguments() -> Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "-o", - "--output_dir", - type=str, - default="./models", - help="Path of directory to save ONNX models.", - ) - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-xl-base-1.0", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "-steps", - "--steps", - type=int, - default=50, - help="steps." 
- ) - parser.add_argument( - "-guid", - "--guidance_scale", - type=float, - default=5.0, - help="guidance_scale" - ) - - return parser.parse_args() - - -class NewDdim(nn.Module): - def __init__(self, num_train_timesteps=1000, num_inference_steps=50, alphas_cumprod=None, - guidance_scale=7.5, alpha_prod_t_prev_cache=None): - super(NewDdim, self).__init__() - self.num_train_timesteps = num_train_timesteps - self.num_inference_steps = num_inference_steps - self.alphas_cumprod = alphas_cumprod - self.guidance_scale = guidance_scale - self.alpha_prod_t_prev_cache = alpha_prod_t_prev_cache - - def forward( - self, - model_output: torch.FloatTensor, - timestep: int, - sample: torch.FloatTensor, - step_index: int): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alpha_prod_t_prev_cache[step_index] - beta_prod_t = 1 - alpha_prod_t - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - pred_epsilon = model_output - pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction - return(prev_sample,) - - -def export_ddim(sd_pipeline: StableDiffusionXLPipeline, save_dir: str, steps: int, guidance_scale: float) -> None: - print("Exporting the ddim...") - ddim_path = os.path.join(save_dir, "ddim") - if not os.path.exists(ddim_path): - os.makedirs(ddim_path, mode=0o744) - - dummy_input = ( - torch.randn(1, 4, 128, 128), - torch.tensor(981), - torch.randn(1, 4, 128, 128), - torch.tensor(0) - ) - scheduler = DDIMScheduler.from_config(sd_pipeline.scheduler.config) - scheduler.set_timesteps(steps, device="cpu") - - timesteps = scheduler.timesteps[:steps] - alpha_prod_t_prev_cache = [] - for timestep in timesteps: - prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps - alpha_prod_t_prev = scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod - alpha_prod_t_prev_cache.append(alpha_prod_t_prev) - - new_ddim = NewDdim( - num_train_timesteps=scheduler.config.num_train_timesteps, - num_inference_steps=scheduler.num_inference_steps, - alphas_cumprod=scheduler.alphas_cumprod, - guidance_scale=guidance_scale, - alpha_prod_t_prev_cache=torch.tensor(alpha_prod_t_prev_cache) - ) - - new_ddim.eval() - torch.onnx.export( - new_ddim, - dummy_input, - os.path.join(ddim_path, "ddim.onnx"), - input_names=["noise_pred", "timestep", "latents", "step_index"], - output_names=["out_latents"], - dynamic_axes={ - "noise_pred": {0: 'bs'}, - "latents": {0: 'bs'}, - }, - opset_version=11, - verbose=False, - ) - - -def export_encoder(sd_pipeline: StableDiffusionXLPipeline, save_dir: str) -> None: - print("Exporting the text encoder...") - encoder_path = os.path.join(save_dir, "text_encoder") - if not os.path.exists(encoder_path): - os.makedirs(encoder_path, mode=0o744) - - encoder_model = sd_pipeline.text_encoder - - max_position_embeddings = encoder_model.config.max_position_embeddings - dummy_input = ( - torch.ones([1, max_position_embeddings], dtype=torch.int64), - None, - None, - None, - True - ) - - torch.onnx.export( - encoder_model, - dummy_input, - os.path.join(encoder_path, "text_encoder.onnx"), - input_names=["prompt"], - output_names=["text_embeddings"], - dynamic_axes={"prompt": {0: 'bs'}}, - opset_version=11, - ) - - print("Exporting the text encoder 2...") - encoder_2_model = sd_pipeline.text_encoder_2 - - torch.onnx.export( - encoder_2_model, - dummy_input, - 
os.path.join(encoder_path, "text_encoder_2.onnx"), - input_names=["prompt"], - output_names=["text_embeddings"], - dynamic_axes={"prompt": {0: 'bs'}}, - opset_version=11, - ) - - -def export_unet(sd_pipeline: StableDiffusionXLPipeline, save_dir: str) -> None: - print("Exporting the image information creater...") - unet_path = os.path.join(save_dir, "unet") - if not os.path.exists(unet_path): - os.makedirs(unet_path, mode=0o744) - - unet_model = sd_pipeline.unet - encoder_model = sd_pipeline.text_encoder - encoder_model_2 = sd_pipeline.text_encoder_2 - - sample_size = unet_model.config.sample_size - in_channels = unet_model.config.in_channels - encoder_hidden_size_2 = encoder_model_2.config.hidden_size - encoder_hidden_size = encoder_model.config.hidden_size + encoder_hidden_size_2 - max_position_embeddings = encoder_model.config.max_position_embeddings - - dummy_input = ( - torch.ones([1, in_channels, sample_size, sample_size], dtype=torch.float32), - torch.ones([1], dtype=torch.int64), - torch.ones( - [1, max_position_embeddings, encoder_hidden_size], dtype=torch.float32 - ), - None, - None, - None, - None, - { - "text_embeds": torch.ones([1, encoder_hidden_size_2], dtype=torch.float32), - "time_ids": torch.ones([1, 6], dtype=torch.float32) - }, - {} - ) - - torch.onnx.export( - unet_model, - dummy_input, - os.path.join(unet_path, f"unet.onnx"), - input_names=["latent_model_input", "t", "encoder_hidden_states", "text_embeds", "time_ids"], - output_names=["sample"], - opset_version=11, - ) - - -def export_vae(sd_pipeline: StableDiffusionXLPipeline, save_dir: str) -> None: - print("Exporting the image decoder...") - - vae_path = os.path.join(save_dir, "vae") - if not os.path.exists(vae_path): - os.makedirs(vae_path, mode=0o744) - - vae_model = sd_pipeline.vae - unet_model = sd_pipeline.unet - - sample_size = unet_model.config.sample_size - in_channels = unet_model.config.out_channels - - dummy_input = torch.ones([1, in_channels, sample_size, sample_size]) - - torch.onnx.export( - vae_model.decoder, - dummy_input, - os.path.join(vae_path, "vae.onnx"), - input_names=["latents"], - output_names=["image"], - dynamic_axes={"latents": {0: 'bs'}}, - opset_version=11, - ) - - -def export_onnx(model_path: str, save_dir: str, steps:int, guidance_scale:float) -> None: - pipeline = StableDiffusionXLPipeline.from_pretrained(model_path).to("cpu") - - export_encoder(pipeline, save_dir) - - export_unet(pipeline, save_dir) - - export_vae(pipeline, save_dir) - - export_ddim(pipeline, save_dir, steps, guidance_scale) - - -def main(): - args = parse_arguments() - export_onnx(args.model, args.output_dir, args.steps, args.guidance_scale) - print("Done.") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/stable_diffusionxl_ascend_infer.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/stable_diffusionxl_ascend_infer.py deleted file mode 100644 index 1f47d5b514e04f419aa36661547c275fbc00df60..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/stable_diffusionxl_ascend_infer.py +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
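The NewDdim module exported above precomputes `alpha_prod_t_prev` for every inference step at export time, so the resulting ONNX/OM scheduler only needs `(noise_pred, timestep, latents, step_index)` and avoids the data-dependent `prev_timestep >= 0` branch at runtime. For reference, the update it performs is the plain eta = 0, epsilon-prediction DDIM step; a numpy restatement (assuming scalar alphas and an array-valued sample):

```python
import numpy as np


def ddim_step(noise_pred: np.ndarray,
              sample: np.ndarray,
              alpha_prod_t: float,
              alpha_prod_t_prev: float) -> np.ndarray:
    """eta = 0, epsilon-prediction DDIM update, mirroring NewDdim.forward above."""
    # x0_hat = (x_t - sqrt(1 - a_t) * eps) / sqrt(a_t)
    pred_original_sample = (sample - (1.0 - alpha_prod_t) ** 0.5 * noise_pred) / alpha_prod_t ** 0.5
    # x_{t-1} = sqrt(a_prev) * x0_hat + sqrt(1 - a_prev) * eps
    return alpha_prod_t_prev ** 0.5 * pred_original_sample + (1.0 - alpha_prod_t_prev) ** 0.5 * noise_pred
```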
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import csv -import time -import json -import argparse - -import aclruntime -from ais_bench.infer.interface import InferSession -from diffusers.schedulers import * -import hpsv2 - -from background_session import BackgroundInferSession -from pipeline_ascend_stable_diffusionxl import AscendStableDiffusionXLPipeline - - -class PromptLoader: - def __init__( - self, - prompt_file: str, - prompt_file_type: str, - batch_size: int, - num_images_per_prompt: int=1, - max_num_prompts: int=0 - ): - self.prompts = [] - self.catagories = ['Not_specified'] - self.batch_size = batch_size - self.num_images_per_prompt = num_images_per_prompt - - if prompt_file_type == 'plain': - self.load_prompts_plain(prompt_file, max_num_prompts) - - elif prompt_file_type == 'parti': - self.load_prompts_parti(prompt_file, max_num_prompts) - - elif prompt_file_type == 'hpsv2': - self.load_prompts_hpsv2(prompt_file, max_num_prompts) - - self.current_id = 0 - self.inner_id = 0 - - def __len__(self): - return len(self.prompts) * self.num_images_per_prompt - - def __iter__(self): - return self - - def __next__(self): - if self.current_id == len(self.prompts): - raise StopIteration - - ret = { - 'prompts': [], - 'catagories': [], - 'save_names': [], - 'n_prompts': self.batch_size, - } - for _ in range(self.batch_size): - if self.current_id == len(self.prompts): - ret['prompts'].append('') - ret['save_names'].append('') - ret['catagories'].append('') - ret['n_prompts'] -= 1 - - else: - prompt, catagory_id = self.prompts[self.current_id] - ret['prompts'].append(prompt) - ret['catagories'].append(self.catagories[catagory_id]) - ret['save_names'].append(f'{self.current_id}_{self.inner_id}') - - self.inner_id += 1 - if self.inner_id == self.num_images_per_prompt: - self.inner_id = 0 - self.current_id += 1 - - return ret - - def load_prompts_plain(self, file_path: str, max_num_prompts: int): - with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: - for i, line in enumerate(f): - if max_num_prompts and i == max_num_prompts: - break - - prompt = line.strip() - self.prompts.append((prompt, 0)) - - def load_prompts_parti(self, file_path: str, max_num_prompts: int): - with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: - # Skip the first line - next(f) - tsv_file = csv.reader(f, delimiter="\t") - for i, line in enumerate(tsv_file): - if max_num_prompts and i == max_num_prompts: - break - - prompt = line[0] - catagory = line[1] - if catagory not in self.catagories: - self.catagories.append(catagory) - - catagory_id = self.catagories.index(catagory) - self.prompts.append((prompt, catagory_id)) - - def load_prompts_hpsv2(self, root_path: str, max_num_prompts: int): - hpsv2_style = ['anime', 'concept-art', 'paintings', 'photo'] - count = 0 - for style in hpsv2_style: - file_path = os.path.join(root_path, f'{style}.json') - with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: - prompts = json.load(f) - - for prompt in prompts: - count += 1 - if max_num_prompts and count >= max_num_prompts: - break - - if style not in self.catagories: - self.catagories.append(style) - - catagory_id = 
self.catagories.index(style) - self.prompts.append((prompt, catagory_id)) - - -def check_device_range_valid(value): - # if contain , split to int list - min_value = 0 - max_value = 255 - if ',' in value: - ilist = [ int(v) for v in value.split(',') ] - for ivalue in ilist[:2]: - if ivalue < min_value or ivalue > max_value: - raise argparse.ArgumentTypeError("{} of device:{} is invalid. valid value range is [{}, {}]".format( - ivalue, value, min_value, max_value)) - return ilist[:2] - else: - # default as single int value - ivalue = int(value) - if ivalue < min_value or ivalue > max_value: - raise argparse.ArgumentTypeError("device:{} is invalid. valid value range is [{}, {}]".format( - ivalue, min_value, max_value)) - return ivalue - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-2-1-base", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "--prompt_file", - type=str, - default='prompts.txt', - help="A prompt file used to generate images.", - ) - parser.add_argument( - "--prompt_file_type", - choices=["plain", "parti", 'hpsv2'], - default="plain", - help="Type of prompt file.", - ) - parser.add_argument( - "--model_dir", - type=str, - default="./models", - help="Base path of om models.", - ) - parser.add_argument( - "--save_dir", - type=str, - default="./results", - help="Path to save result images.", - ) - parser.add_argument( - "--info_file_save_path", - type=str, - default="./image_info.json", - help="Path to save image information file.", - ) - parser.add_argument( - "--steps", - type=int, - default=50, - help="Number of inference steps.", - ) - parser.add_argument( - "--num_images_per_prompt", - default=1, - type=int, - help="Number of images generated for each prompt.", - ) - parser.add_argument( - "--max_num_prompts", - default=0, - type=int, - help="Limit the number of prompts (0: no limit).", - ) - parser.add_argument( - "--scheduler", - choices=["None", "DDIM", "Euler", "DPM", "EulerAncestral", "DPM++SDEKarras"], - default="DDIM", - help="Type of Sampling methods. Can choose from DDIM, Euler, DPM", - ) - parser.add_argument( - "--device", - type=check_device_range_valid, - default=0, - help="NPU device id." - ) - parser.add_argument( - "-bs", - "--batch_size", - type=int, - default=1, - help="Batch size." - ) - parser.add_argument( - "--use_cache", - action="store_true", - help="Use cache during inference." - ) - parser.add_argument( - "--cache_steps", - type=str, - default="1,2,4,6,7,9,10,12,13,14,16,18,19,21,23,24,26,27,29,\ - 30,31,33,34,36,37,39,40,42,43,45,47,48,49", - help="Steps to use cache data." 
- ) - - - return parser.parse_args() - - -def main(): - args = parse_arguments() - save_dir = args.save_dir - device = None - device_2 = None - - if isinstance(args.device, list): - device, device_2 = args.device - else: - device = args.device - - pipe = AscendStableDiffusionXLPipeline.from_pretrained(args.model).to("cpu") - use_npu_scheduler = False - - if args.scheduler == "DDIM": - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - use_npu_scheduler = True - elif args.scheduler == "Euler": - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "DPM": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "EulerAncestral": - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "DPM++SDEKarras": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.scheduler.config.algorithm_type = 'sde-dpmsolver++' - pipe.scheduler.config.use_karras_sigmas = True - - encoder_om = os.path.join(args.model_dir, "text_encoder", "text_encoder.om") - encoder_om_2 = os.path.join(args.model_dir, "text_encoder", "text_encoder_2.om") - vae_om = os.path.join(args.model_dir, "vae", "vae.om") - - encoder_session = InferSession(device, encoder_om) - encoder_session_2 = InferSession(device, encoder_om_2) - vae_session = InferSession(device, vae_om) - - if use_npu_scheduler: - scheduler_om = os.path.join(args.model_dir, "ddim", "ddim.om") - scheduler_session = InferSession(device, scheduler_om) - else: - scheduler_session = None - - skip_status = [0] * args.steps - if args.use_cache: - for i in args.cache_steps.split(','): - if int(i) >= args.steps: - continue - skip_status[int(i)] = 1 - unet_cache_om = os.path.join(args.model_dir, "unet", "unet_cache.om") - unet_skip_om = os.path.join(args.model_dir, "unet", "unet_skip.om") - unet_session = [ - aclruntime.InferenceSession(unet_cache_om, device, aclruntime.session_options()), - aclruntime.InferenceSession(unet_skip_om, device, aclruntime.session_options()), - ] - else: - unet_cache_om = os.path.join(args.model_dir, "unet", "unet.om") - unet_skip_om = "" - unet_session = [ - aclruntime.InferenceSession(unet_cache_om, device, aclruntime.session_options()), - None, - ] - - unet_session_bg = None - if device_2: - unet_session_bg = BackgroundInferSession.clone( - unet_session[0], - device_2, - [unet_cache_om, unet_skip_om] - ) - - if not os.path.exists(save_dir): - os.makedirs(save_dir, mode=0o744) - - use_time = 0 - - prompt_loader = PromptLoader(args.prompt_file, - args.prompt_file_type, - args.batch_size, - args.num_images_per_prompt, - args.max_num_prompts) - - prompts_2 = "" - infer_num = 0 - image_info = [] - current_prompt = None - for _, input_info in enumerate(prompt_loader): - prompts = input_info['prompts'] - catagories = input_info['catagories'] - save_names = input_info['save_names'] - n_prompts = input_info['n_prompts'] - - print(f"[{infer_num + n_prompts}/{len(prompt_loader)}]: {prompts}") - infer_num += args.batch_size - - start_time = time.time() - images = pipe.ascend_infer( - prompts, - prompts_2, - encoder_session, - encoder_session_2, - [unet_session, unet_session_bg], - scheduler_session, - vae_session, - skip_status, - device_id=device, - num_inference_steps=args.steps, - guidance_scale=5.0, - use_npu_scheduler=use_npu_scheduler, - ) - - use_time += time.time() - start_time - - for j in range(n_prompts): - image_save_path = os.path.join(save_dir, 
f"{save_names[j]}.png") - image = images[0][j] - image.save(image_save_path) - - if current_prompt != prompts[j]: - current_prompt = prompts[j] - image_info.append({'images': [], 'prompt': current_prompt, 'category': catagories[j]}) - - image_info[-1]['images'].append(image_save_path) - - if unet_session_bg: - unet_session_bg.stop() - - # Save image information to a json file - if os.path.exists(args.info_file_save_path): - os.remove(args.info_file_save_path) - - with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR|os.O_CREAT, 0o644), "w") as f: - json.dump(image_info, f) - - print( - f"[info] infer number: {infer_num}; use time: {use_time:.3f}s; " - f"average time: {use_time/infer_num:.3f}s" - ) - - -if __name__ == "__main__": - main() diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/unet_cache.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/unet_cache.py deleted file mode 100644 index 8335caab61c9580253ec0c5ec432cff9801b646b..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl/unet_cache.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse - -from auto_optimizer import OnnxGraph - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - type=str, - default="models/unet/unet.onnx", - help="Path of the unet onnx model.", - ) - parser.add_argument( - "--save_dir", - type=str, - default="models/unet", - help="Path to save the modified model", - ) - return parser.parse_args() - - -def cache_unet(model_path, new_model_path, data): - model = OnnxGraph.parse(model_path) - model.add_output(data, dtype='float32', shape=[]) - model.save(new_model_path) - - -def skip_unet(model_path, new_model_path, data): - model = OnnxGraph.parse(model_path) - node = model.get_next_nodes(data)[0] - batch_size = model.inputs[0].shape[0] - model.add_input('cache', dtype='float32', shape=[batch_size, 1280, 64, 64]) - node.inputs[0] = 'cache' - model.remove_unused_nodes() - model.save(new_model_path) - - -def main(args): - cache_path = os.path.join(args.save_dir, "unet_cache.onnx") - skip_path = os.path.join(args.save_dir, "unet_skip.onnx") - cache_name = '/up_blocks.0/upsamplers.0/conv/Conv_output_0' - cache_unet(args.model, cache_path, cache_name) - skip_unet(args.model, skip_path, cache_name) - - -if __name__ == "__main__": - main(parse_arguments()) diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/README.md b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/README.md deleted file mode 100644 index a9bd9eed5f5e620372fd56478af4d26e769eaafe..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/README.md +++ /dev/null @@ -1,486 +0,0 @@ -# stable-diffusionxl_refiner模型-推理指导 - - -- [概述](#ZH-CN_TOPIC_0000001172161501) - - - [输入输出数据](#section540883920406) - -- 
[推理环境准备](#ZH-CN_TOPIC_0000001126281702) - -- [快速上手](#ZH-CN_TOPIC_0000001126281700) - - - [获取源码](#section4622531142816) - - [模型推理](#section741711594517) - -- [模型推理性能&精度](#ZH-CN_TOPIC_0000001172201573) - - -# 概述 - - SDXL 由一组用于潜在扩散的专家管道组成: 在第一步中,使用基础模型生成(噪声)潜伏, 然后使用专门用于最终降噪步骤的细化模型[此处获得](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/),该模型提供SDXL的图生图功能 - -- 参考实现: - ```bash - # StableDiffusionxl - https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 - ``` - -## 输入输出数据 - -- 输入数据 - - | 输入数据 | 大小 | 数据类型 | 数据排布格式 | - | -------- | -------- | ------------------------- | ------------ | - | prompt | 1 x 77 | INT64| ND| - - -- 输出数据 - - | 输出数据 | 大小 | 数据类型 | 数据排布格式 | - | -------- | -------- | -------- | ------------ | - | output1 | 1 x 3 x 1024 x 1024 | FLOAT32 | NCHW | - -# 推理环境准备 - -- 该模型需要以下插件与驱动 - - **表 1** 版本配套表 - | 配套 | 版本 | 环境准备指导 | - | ------------------------------------------------------------ | ------- | ------------------------------------------------------------ | - | 固件与驱动 | 24.1.rc1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | - | CANN(+MindIE) | 8.0.RC1(1.0.RC1) | - | - | Python | 3.10 | - | | -如在优化模型时使用了--FA_soc、--TOME_num、--faster_gelu参数,需要安装与CANN包配套版本的MindIE - -该模型性能受CPU规格影响,建议使用64核CPU(arm)以复现性能 - - -# 快速上手 - -## 获取源码 -1. 获取本仓源码 - - ``` - git clone https://gitee.com/ascend/ModelZoo-PyTorch.git - cd ModelZoo-PyTorch/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner - ``` - -1. 安装依赖。 - ```bash - pip3 install -r requirements.txt - - git clone https://github.com/tgxs002/HPSv2.git - cd HPSv2 - pip3 install -e . - ``` - -2. 代码修改 - - 执行命令: - - ```bash - TRANSFORMERS_PATH=`python3 -c "import transformers; print(transformers.__path__[0])"` - patch -p0 ${TRANSFORMERS_PATH}/models/clip/modeling_clip.py clip.patch - ``` - -3. 安装昇腾统一推理工具(AIT) - - 请访问[AIT代码仓](https://gitee.com/ascend/ait/tree/master/ait#ait),根据readme文档进行工具安装。可只安装需要的组件:debug surgeon,其他组件为可选安装。 - - 请访问[ais_bench](https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench),根据readme文件进行工具安装。 - - -## 模型推理 - -1. 模型转换。 - 使用PyTorch将模型权重文件转换为.onnx文件,再使用ATC工具将.onnx文件转为离线推理模型文件.om文件。 - - 0. 获取权重(可选) - - 可提前下载权重,放到代码同级目录下,以避免执行后面步骤时可能会出现下载失败。 - - ```bash - # 需要使用 git-lfs (https://git-lfs.com) - git lfs install - - # 下载权重 - git clone https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0 - ``` - - 1. 导出ONNX模型 - - 设置模型名称或路径 - ```bash - # base (执行时下载权重) - model_base="stabilityai/stable-diffusion-xl-refiner-1.0" - - # base (下载的权重路径) - model_base="./stable-diffusion-xl-refiner-1.0" - ``` - - 执行命令: - - ```bash - python3 stable_diffusionxl_2_onnx.py --model ${model_base} --output_dir ./models - - ``` - - 参数说明: - - --model:模型权重路径 - - --output_dir: ONNX模型输出目录 - - - 执行成功后生成onnx模型: - ``` - |—— models - |—— text_encoder - |—— text_encoder_2.onnx - |—— unet - |—— unet.onnx - |—— vae - |—— vae.onnx - |—— ddim - |—— ddim.onnx - ``` - - 2. 优化onnx模型 - - 1. 
模型优化 - - 运行modify_onnx.py脚本。 - ```bash - bs=1 - - # 非并行方案 - python3 modify_onnx.py \ - --model models/unet/unet.onnx \ - --new_model models/unet/unet_md.onnx \ - --FA_soc Duo \ - --faster_gelu \ - --batch_size ${bs} - - # 并行方案 - python3 modify_onnx.py \ - --model models/unet/unet.onnx \ - --new_model models/unet/unet_md.onnx \ - --FA_soc Duo \ - --faster_gelu \ - --batch_size ${bs} \ - --parallel - ``` - 参数说明: - - --model:onnx模型路径。 - - --new_model:优化后生成的onnx模型路径。 - - --FA_soc:使用FA算子的硬件形态。目前FlashAttention算子支持Atlas 300I Duo/Pro和Atlas 800I A2,请根据硬件设置参数为Duo或A2,其他不支持硬件请设置为None。 - - --faster_gelu:使用slice+gelu的融合算子。 - - --batch_size:生成适用于指定batch_size的模型,默认值为1。 - - --parallel:生成适用于并行方案的模型 - - FA、SliceGelu融合算子需通过安装与CANN版本对应的推理引擎包(MindIE)来获取,如未安装推理引擎或使用的版本不支持FA、SliceGelu算子,FA_soc参数请使用默认配置、不设置faster_gelu参数。 - - 多batch场景限制:A2场景下暂不支持FA算子优化,FA_soc参数请设置为None。 - - 3. 使用ATC工具将ONNX模型转OM模型。 - - 1. 配置环境变量。 - - ```bash - source /usr/local/Ascend/ascend-toolkit/set_env.sh - - # 如果安装了推理引擎算子包,需配置推理引擎路径 - source /usr/local/Ascend/mindie/set_env.sh - ``` - - > **说明:** - >该脚本中环境变量仅供参考,请以实际安装环境配置环境变量。详细介绍请参见《[CANN 开发辅助工具指南 \(推理\)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools)》。 - - 2. 执行命令查看芯片名称($\{chip\_name\})。 - - ``` - npu-smi info - #该设备芯片名为Ascend310P3 (自行替换) - 回显如下: - +-------------------+-----------------+------------------------------------------------------+ - | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page) | - | Chip Device | Bus-Id | AICore(%) Memory-Usage(MB) | - +===================+=================+======================================================+ - | 0 310P3 | OK | 15.8 42 0 / 0 | - | 0 0 | 0000:82:00.0 | 0 1074 / 21534 | - +===================+=================+======================================================+ - | 1 310P3 | OK | 15.4 43 0 / 0 | - | 0 1 | 0000:89:00.0 | 0 1070 / 21534 | - +===================+=================+======================================================+ - ``` - - 3. 
执行ATC命令。 - - ```bash - # text_encoder - cd ./models/text_encoder - atc --framework=5 \ - --model=./text_encoder_2.onnx \ - --output=./text_encoder_2 \ - --input_format=ND \ - --input_shape="prompt:${bs},77" \ - --log=error \ - --soc_version=Ascend${chip_name} - - # unet - cd ../unet/ - - atc --framework=5 \ - --model=./unet_md.onnx \ - --output=./unet \ - --input_format=NCHW \ - --log=error \ - --optypelist_for_implmode="Gelu,Sigmoid" \ - --op_select_implmode=high_performance \ - --soc_version=Ascend${chip_name} - - cd ../../ - - # vae - atc --framework=5 \ - --model=./models/vae/vae_encoder.onnx \ - --output=./models/vae/vae_encoder \ - --input_format=NCHW \ - --input_shape="image:${bs},3,1024,1024" \ - --log=error \ - --soc_version=Ascend${chip_name} - - atc --framework=5 \ - --model=./models/vae/vae_decoder.onnx \ - --output=./models/vae/vae_decoder \ - --input_format=NCHW \ - --input_shape="latents:${bs},4,128,128" \ - --log=error \ - --soc_version=Ascend${chip_name} - - # 如果使用ddim采样器 - atc --framework=5 \ - --model=./models/ddim/ddim.onnx \ - --output=./models/ddim/ddim \ - --input_format=ND \ - --input_shape="noise_pred:${bs},4,128,128;latents:${bs},4,128,128" \ - --log=error \ - --soc_version=Ascend${chip_name} - ``` - - 参数说明: - - --model:为ONNX模型文件。 - - --output:输出的OM模型。 - - --framework:5代表ONNX模型。 - - --log:日志级别。 - - --soc_version:处理器型号。 - - --input_shape: 模型的输入shape信息。 - - - 执行成功后生成om模型列表: - ``` - |—— models - |—— text_encoder - |—— text_encoder_2.om - |—— unet - |—— unet.om - |—— vae - |—— vae.om - |—— ddim - |—— ddim.om - ``` - -2. 开始推理验证。 - - 1. 安装绑核工具并根据NUMA亲和性配置任务进程与NUMA node 的映射关系是为了排除cpu的影响 - - 安装绑核工具 - ``` - yum install numactl - ``` - 通过`npu-smi info`查询device的bus-id,并根据bus-id通过`lspci -vs bus-id`查询卡的NUMA node。 - - 查到NUMA node后,使用`lscpu`获得NUMA node对应的CPU核,推荐绑定其中单核以获得更好的性能。 - ```bash - NUMA node0: 0-23 - NUMA node1: 24-47 - NUMA node2: 48-71 - NUMA node3: 72-95 - ``` - 例如,device对应的NUMA node为3,则在NUMA node3对应的CPU核中选择一个,比如72 - - 2. 
执行推理脚本。 - - 推理前需要先准备推理所需的文本和图片,并将信息保存在json文件中,生成方法可参考[SDXL_Base](../stable_diffusion/README.md) - - json文件中保存的image路径是与json文件的相对路径。 - - ```bash - # 非并行方案 - numactl -C 72 python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --image_info image_info.json \ - --info_file_save_path refiner_image_info.json \ - --device 0 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 - - # 并行方案 - numactl -C 72 python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --image_info image_info.json \ - --info_file_save_path refiner_image_info.json \ - --device 0,1 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 - ``` - - 参数说明: - - --model:模型名称或本地模型目录的路径。 - - --model_dir:存放导出模型的目录。 - - --image_info:存放输入的prompt和image路径的json文件。 - - --info_file_save_path:存放输出的prompt和image路径的json文件。 - - --save_dir:生成图片的存放目录。 - - --max_num_prompts:限制prompt数量为前X个,0表示不限制。 - - --batch_size:模型batch size。 - - --steps:生成图片迭代次数。 - - --device:推理设备ID;可用逗号分割传入两个设备ID,此时会使用并行方式进行推理。 - - --use_cache: 在推理过程中使用cache。 - - --cache_steps: 使用cache的迭代次数,迭代次数越多性能越好,但次数过多可能会导致精度下降。取值范围为[1, stpes-1]。 - - --scheduler:采样器。可选None、DDIM、Euler、DPM、EulerAncestral、DPM++SDEKarras。None即为默认scheduler。 - - 执行完成后在`./results`目录下生成推理图片。并在终端显示推理时间,参考如下: - - ``` - [info] infer number: 16; use time: 104.6s; average time: 6.542s - ``` - *注意*: - - 如果使用arm机器,出现`*torch*.so*: cannot allocate memory in static TLS block`报错,则增加环境变量指向报错路径 - ```bash - export LD_PRELOAD=报错.so路径:$LD_PRELOAD - ``` - -## 精度验证 - - 由于生成的图片存在随机性,提供两种精度验证方法: - 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 - 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 - - 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 - - 1. 下载Parti数据集 - - ```bash - wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate - ``` - - 2. 下载模型权重 - - ```bash - # Clip Score 和 HPSv2 均需使用的权重 - GIT_LFS_SKIP_SMUDGE=1 - git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K - - # HPSv2权重 - wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate - ``` - 也可手动下载[CLIP权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) - 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 - - - 2. 
使用推理脚本生成图片 - - ```bash - # 非并行方案 - python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --image_info image_info.json \ - --info_file_save_path refiner_image_info.json \ - --max_num_prompts 0 \ - --device 0 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - - # 并行方案 - python3 stable_diffusionxl_ascend_infer.py \ - --model ${model_base} \ - --model_dir ./models \ - --image_info image_info.json \ - --info_file_save_path refiner_image_info.json \ - --max_num_prompts 0 \ - --device 0,1 \ - --save_dir ./results \ - --batch_size ${bs} \ - --steps 50 \ - --use_cache - ``` - - 参数说明: - - --model:模型名称或本地模型目录的路径。 - - --model_dir:存放导出模型的目录。 - - --image_info:存放输入的prompt和image路径的json文件。 - - --info_file_save_path:存放输出的prompt和image路径的json文件。 - - --num_images_per_prompt: 每个prompt生成的图片数量。 - - --max_num_prompts:限制prompt数量为前X个,0表示不限制。 - - --save_dir:生成图片的存放目录。 - - --batch_size:模型batch size。 - - --steps:生成图片迭代次数。 - - --device:推理设备ID;可用逗号分割传入两个设备ID,此时会使用并行方式进行推理。 - - --use_cache: 在推理过程中使用cache。 - - --cache_steps: 使用cache的迭代次数,迭代次数越多性能越好,但次数过多可能会导致精度下降。 - - 执行完成后会在`./results`目录下生成推理图片,并且会在当前目录生成一个`image_info.json`文件,记录着图片和prompt的对应关系。 - - 4. 计算精度指标 - - 1. CLIP-score - - ```bash - python3 clip_score.py \ - --device=cpu \ - --image_info="refiner_image_info.json" \ - --model_name="ViT-H-14" \ - --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin" - ``` - - 参数说明: - - --device: 推理设备。 - - --image_info: 上一步生成的`refiner_image_info.json`文件。 - - --model_name: Clip模型名称。 - - --model_weights_path: Clip模型权重文件路径。 - - 执行完成后会在屏幕打印出精度计算结果。 - - 2. HPSv2 - - ```bash - python3 hpsv2_score.py \ - --image_info="refiner_image_info.json" \ - --HPSv2_checkpoint="./HPS_v2_compressed.pt" \ - --clip_checkpoint="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin" - ``` - - 参数说明: - - --image_info: 上一步生成的`refiner_image_info.json`文件。 - - --HPSv2_checkpoint: HPSv2模型权重文件路径。 - - --clip_checkpointh: Clip模型权重文件路径。 - - 执行完成后会在屏幕打印出精度计算结果。 - -# 模型推理性能&精度 - -调用ACL接口推理计算,性能参考下列数据。 - -### StableDiffusionxl - -| 硬件形态 | batch size | 迭代次数 | 平均耗时 | 优化方案 | clip score | 采样器 | -| :------: | :-----: | :----: | :--------: | :--------: | :----: | :----: | -| DUO | 1 | 50 | 7.54s | 并行,FA+faster_gelu | 0.372 | ddim | - -性能测试需要独占npu和cpu diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/background_session.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/background_session.py deleted file mode 100644 index 30f1e52d3a0de7999bd9ad2aa04cc57bb83bfc0d..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/background_session.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
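The `--device 0,1` parallel mode referenced in the README above is implemented by the `BackgroundInferSession` class that begins here: a forkserver subprocess owns a second UNet session on the other NPU and exchanges inputs and outputs with the main process through shared-memory buffers and a pipe. A minimal sketch of how the pipelines drive it for one classifier-free-guidance step (the session objects and per-step input lists are assumed to be prepared as in the code above):

```python
import numpy as np
from ais_bench.infer.interface import InferSession

from background_session import BackgroundInferSession


def cfg_step_parallel(unet_session: InferSession,
                      bg_session: BackgroundInferSession,
                      cond_inputs: list,
                      uncond_inputs: list,
                      guidance_scale: float) -> np.ndarray:
    """One classifier-free-guidance step split across two NPUs (a sketch)."""
    bg_session.infer_asyn(uncond_inputs)                      # unconditional branch, non-blocking, second device
    noise_pred_text = unet_session.infer(cond_inputs)[0]      # conditional branch, local device
    noise_pred_uncond = bg_session.wait_and_get_outputs()[0]  # synchronize with the background process
    # Standard classifier-free guidance combination, as in the pipelines above.
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```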
- -import multiprocessing as mp -from dataclasses import dataclass -from typing import List, Optional - -import numpy as np -import aclruntime -from ais_bench.infer.interface import InferSession - - -@dataclass -class SessionIOInfo: - input_shapes: List[tuple] - input_dtypes: List[type] - output_shapes: List[tuple] - output_dtypes: List[type] - - -@dataclass -class BackgroundInferSessionOptions: - device_id: int - model_path: List[str] - io_info: SessionIOInfo - acl_json_path: Optional[str] = None - debug: Optional[bool] = False - loop: Optional[int] = 1 - - -class BackgroundInferSession: - def __init__( - self, - device_id: int, - model_path: str, - io_info: SessionIOInfo, - ): - # Create a pipe for process synchronization - self.sync_pipe, sync_pipe_peer = mp.Pipe(duplex=True) - - # Create shared buffers - input_spaces = self.create_shared_buffers(io_info.input_shapes, io_info.input_dtypes) - output_spaces = self.create_shared_buffers(io_info.output_shapes, io_info.output_dtypes) - - # Build numpy arrays on the shared buffers - self.input_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(input_spaces, io_info.input_shapes, io_info.input_dtypes)] - self.output_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(output_spaces, io_info.output_shapes, io_info.output_dtypes)] - - mp.set_start_method('forkserver', force=True) - self.p = mp.Process( - target=self.run_session, - args=[sync_pipe_peer, input_spaces, output_spaces, - io_info, device_id, model_path] - ) - self.p.start() - - # Wait until the sub process is ready - self.wait() - - def infer_asyn(self, feeds: List[np.ndarray], skip=0) -> None: - for i in range(len(self.input_arrays)): - self.input_arrays[i][:] = feeds[i][:] - - if skip: - self.sync_pipe.send('skip') - else: - self.sync_pipe.send('cache') - - def wait(self) -> None: - self.sync_pipe.recv() - - def get_outputs(self) -> List[np.ndarray]: - return self.output_arrays - - def wait_and_get_outputs(self) -> List[np.ndarray]: - self.wait() - return self.get_outputs() - - def infer(self, feeds: List[np.ndarray]) -> List[np.ndarray]: - # This function should work as same as InferSession.infer() - self.infer_asyn(feeds) - return self.wait_and_get_outputs() - - def stop(self): - # Stop the sub process - self.p.terminate() - - @classmethod - def clone( - cls, - session: InferSession, - device_id: int, - model_path: List[str]) -> 'BackgroundInferSession': - # Get shapes, datatypes, and model path from an existed InferSession, - # then use them to create a BackgroundInferSession - io_info = cls.get_io_info_from_session(session) - io_info.output_shapes = [io_info.output_shapes[0]] - io_info.output_dtypes = [io_info.output_dtypes[0]] - - return cls(device_id, model_path, io_info) - - @staticmethod - def get_io_info_from_session(session: InferSession) -> SessionIOInfo: - # Map aclruntime datatype to numpy datatype - np_types = (np.float32, np.float16, np.int8, np.int32, - np.uint8, '', np.int16, np.uint16, np.uint32, - np.int64, np.uint64) - - # Get input shapes and datatypes - inputs = session.get_inputs() - input_shapes = [t.shape for t in inputs] - input_dtypes = [np_types[t.datatype] for t in inputs] - - # Get output shapes and datatypes - outputs = session.get_outputs() - output_shapes = [t.shape for t in outputs] - output_dtypes = [np_types[t.datatype] for t in outputs] - - return SessionIOInfo(input_shapes, input_dtypes, - output_shapes, output_dtypes) - - @staticmethod - def create_shared_buffers(shapes: List[tuple], dtypes: List[type]) -> 
List[mp.RawArray]: - buffers = [] - for shape, dtype in zip(shapes, dtypes): - size = 1 - for x in shape: - size *= x - - raw_array = mp.RawArray(np.ctypeslib.as_ctypes_type(dtype), size) - buffers.append(raw_array) - - return buffers - - @staticmethod - def run_session( - sync_pipe: mp.connection.Connection, - input_spaces: List[np.ndarray], - output_spaces: List[np.ndarray], - io_info: SessionIOInfo, - device_id: int, - model_path: list, - ) -> None: - # The sub process function - - # Create an InferSession - session_cache = aclruntime.InferenceSession( - model_path[0], - device_id, - aclruntime.session_options() - ) - if model_path[1]: - session_skip = aclruntime.InferenceSession( - model_path[1], - device_id, - aclruntime.session_options() - ) - - # Build numpy arrays on the shared buffers - input_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(input_spaces, io_info.input_shapes, io_info.input_dtypes)] - - output_arrays = [np.frombuffer(b, dtype=t).reshape(s) for ( - b, s, t) in zip(output_spaces, io_info.output_shapes, io_info.output_dtypes)] - - # Tell the main function that we are ready - sync_pipe.send('') - - # Keep looping until recived a 'STOP' - while True: - flag = sync_pipe.recv() - if flag == 'cache': - feeds = {} - inputs = session_cache.get_inputs() - for i in range(len(input_arrays)): - feed = aclruntime.Tensor(input_arrays[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - out_names = [out.name for out in session_cache.get_outputs()] - - outputs = session_cache.run(out_names, feeds) - if len(outputs) > 1: - cache = outputs[1] - else: - feeds = {} - inputs = session_skip.get_inputs() - for i in range(len(input_arrays)): - feed = aclruntime.Tensor(input_arrays[i]) - feed.to_device(device_id) - feeds[inputs[i].name] = feed - feeds[inputs[-1].name] = cache - out_names = [out.name for out in session_skip.get_outputs()] - - outputs = session_skip.run(out_names, feeds) - outputs[0].to_host() - output = np.array(outputs[0]) - for i in range(len(output_arrays)): - output_arrays[i][:] = output[:] - - sync_pipe.send('') diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/clip.patch b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/clip.patch deleted file mode 100644 index a07c10fc20a05b33d9ed614132fecf89b76e33b0..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/clip.patch +++ /dev/null @@ -1,7 +0,0 @@ -22a23 -> import numpy as np -760c761,762 -< mask.triu_(1) # zero out the lower diagonal ---- -> # mask.triu_(1) # zero out the lower diagonal -> mask = torch.from_numpy(np.triu(mask.numpy(), 1)) diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/clip_score.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/clip_score.py deleted file mode 100644 index 069f5d6e9a9baaa61b9a3537bcab6f637605858e..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/clip_score.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import argparse - -import open_clip -import numpy as np -from PIL import Image -import torch -import torch.nn.functional as F - - -def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): - imgs = [] - texts = [] - for image_file in image_files: - img = preprocess(Image.open(image_file)).unsqueeze(0).to(device) - imgs.append(img) - text = tokenizer([prompt]).to(device) - texts.append(text) - - img = torch.cat(imgs) # [bs, 3, 224, 224] - text = torch.cat(texts) # [bs, 77] - - with torch.no_grad(): - text_ft = model_clip.encode_text(text).float() - img_ft = model_clip.encode_image(img).float() - score = F.cosine_similarity(img_ft, text_ft).squeeze() - - return score.cpu() - - -def main(): - args = parse_arguments() - - if args.device is None: - device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') - else: - device = torch.device(args.device) - - t_b = time.time() - print(f"Load clip model...") - model_clip, _, preprocess = open_clip.create_model_and_transforms( - args.model_name, pretrained=args.model_weights_path, device=device) - model_clip.eval() - print(f">done. elapsed time: {(time.time() - t_b):.3f} s") - - tokenizer = open_clip.get_tokenizer(args.model_name) - - with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: - image_info = json.load(f) - - t_b = time.time() - print(f"Calc clip score...") - all_scores = [] - cat_scores = {} - - for i, info in enumerate(image_info): - image_files = info['images'] - category = info['category'] - prompt = info['prompt'] - - print(f"[{i + 1}/{len(image_info)}] {prompt}") - - image_scores = clip_score(model_clip, - tokenizer, - preprocess, - prompt, - image_files, - device) - if len(image_files) > 1: - best_score = max(image_scores) - else: - best_score = image_scores - - print(f"image scores: {image_scores}") - print(f"best score: {best_score}") - - all_scores.append(best_score) - if category not in cat_scores: - cat_scores[category] = [] - cat_scores[category].append(best_score) - print(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") - - average_score = np.average(all_scores) - print(f"====================================") - print(f"average score: {average_score:.3f}") - print(f"category average scores:") - cat_average_scores = {} - for category, scores in cat_scores.items(): - cat_average_scores[category] = np.average(scores) - print(f"[{category}], average score: {cat_average_scores[category]:.3f}") - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--device", - type=str, - default="cpu", - choices=["cpu", "cuda"], - help="device for torch.", - ) - parser.add_argument( - "--image_info", - type=str, - default="./image_info.json", - help="Image_info.json file.", - ) - parser.add_argument( - "--model_name", - type=str, - default="ViT-H-14", - help="open clip model name", - ) - parser.add_argument( - "--model_weights_path", - type=str, - default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", - help="open clip model weights", - ) - return parser.parse_args() - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/hpsv2_score.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/hpsv2_score.py deleted file mode 100644 index 04e9bd8d8f82ece84c642520b001b62901286eda..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/hpsv2_score.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
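Note: both evaluation scripts removed here (clip_score.py above and hpsv2_score.py below) build on open_clip's ViT-H-14 encoder. For reference, a minimal, self-contained sketch of the underlying CLIP-score step follows; the weight path matches the script's default, while the image file and prompt are placeholder assumptions.

# Minimal sketch of the cosine-similarity scoring used by the removed clip_score.py.
# Assumptions: local ViT-H-14 weights at the script's default path, plus a sample
# image ("sample.png") and prompt chosen purely for illustration.
import open_clip
import torch
import torch.nn.functional as F
from PIL import Image

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-H-14",
    pretrained="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin",
)
tokenizer = open_clip.get_tokenizer("ViT-H-14")
model.eval()

image = preprocess(Image.open("sample.png").convert("RGB")).unsqueeze(0)  # [1, 3, 224, 224]
text = tokenizer(["a photo of an astronaut riding a horse"])              # [1, 77]

with torch.no_grad():
    image_features = model.encode_image(image).float()
    text_features = model.encode_text(text).float()
    score = F.cosine_similarity(image_features, text_features)  # higher = better match
print(f"CLIP score: {score.item():.3f}")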
- -import argparse -import os -from typing import Union -import json - -from clint.textui import progress -import hpsv2 -from hpsv2.utils import root_path, hps_version_map -from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer -import huggingface_hub -from PIL import Image -import requests -import torch - - -def initialize_model(pretrained_path, device): - model, _, preprocess_val = create_model_and_transforms( - "ViT-H-14", pretrained=pretrained_path, precision='amp', - device=device, - jit=False, - force_quick_gelu=False, - force_custom_text=False, - force_patch_dropout=False, - force_image_size=None, - pretrained_image=False, - image_mean=None, - image_std=None, - light_augmentation=True, - aug_cfg={}, - output_dict=True, - with_score_predictor=False, - with_region_predictor=False - ) - return model, preprocess_val - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--image_info", - type=str, - default="./image_info.json", - help="Image_info.json file.", - ) - parser.add_argument( - "--HPSv2_checkpoint", - type=str, - default="./HPS_v2_compressed.pt", - help="HPS_v2 model weights", - ) - parser.add_argument( - "--clip_checkpoint", - type=str, - default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", - help="open clip model weights", - ) - return parser.parse_args() - - -def main(): - args = parse_arguments() - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - model, preprocess_val = initialize_model(args.clip_checkpoint, device) - - checkpoint = torch.load(args.HPSv2_checkpoint, map_location=device) - model.load_state_dict(checkpoint['state_dict']) - tokenizer = get_tokenizer('ViT-H-14') - model = model.to(device) - model.eval() - - with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: - image_info = json.load(f) - - result = [] - for i, info in enumerate(image_info): - image_file = info['images'][0] - prompt = info['prompt'] - - # Load your image and prompt - with torch.no_grad(): - # Process the image - if isinstance(image_file, str): - image = preprocess_val(Image.open(image_file)) - elif isinstance(image_file, Image.Image): - image = preprocess_val(image_file) - else: - raise TypeError('The type of parameter img_path is illegal.') - image = image.unsqueeze(0).to(device=device, non_blocking=True) - # Process the prompt - text = tokenizer([prompt]).to(device=device, non_blocking=True) - # Calculate the HPS - with torch.cuda.amp.autocast(): - outputs = model(image, text) - image_features = outputs["image_features"] - text_features = outputs["text_features"] - logits_per_image = image_features @ text_features.T - - hps_score = torch.diagonal(logits_per_image).cpu().numpy() - print(f"image {i} hps_score: ", hps_score[0]) - - result.append(hps_score[0]) - - print('avg HPSv2 score:', sum(result) / len(result)) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/modify_onnx.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/modify_onnx.py deleted file mode 100644 index 7321e682c82a34c34d46c053dcb06a7d1a8b7cb5..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/modify_onnx.py +++ /dev/null @@ -1,492 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import numpy as np -from auto_optimizer import OnnxGraph - - -def del_add(model): - init = [n.name for n in model.get_nodes('Initializer')] - for node in model.get_nodes('Add'): - if 'attn' in node.name and node.inputs[1] in init: - value = model[node.inputs[1]].value - if (value == 0).all(): - model.remove(node.name) - - -def add_flash_attention(model, fa_name, soc_type): - for node in model.get_nodes('Mul'): - name = node.name - if soc_type == 1: - flag = 'attn' in name - else: - flag = 'attn1' in name - if flag: - matmul = model[name[:-3] + 'to_q/MatMul'] - reshape = model[name[:-3] + 'Reshape'] - seqlen = 4096 - if soc_type == 2 and model[reshape.inputs[1]].value[1] != seqlen: - continue - softmax_node = model.get_next_nodes(node.outputs[0])[0] - if soc_type == 1: - # move mul to q - softmax_node.inputs[0] = node.inputs[0] - node.inputs[0] = matmul.outputs[0] - reshape.inputs[0] = node.outputs[0] - - # add flashattention - new_node = model.add_node(name[:-3] + fa_name, fa_name) - inputs = [None, None, None] - # input 0: q - if soc_type == 1: - matmul_node = model.get_prev_node(softmax_node.inputs[0]) - if soc_type == 2: - matmul_node = model.get_prev_node(node.inputs[0]) - inputs[0] = matmul_node.inputs[0] - # input 1: k - transpose_node = model.get_prev_node(matmul_node.inputs[1]) - inputs[1] = transpose_node.inputs[0] - # input 2: v - cast_node = model.get_next_nodes(softmax_node.outputs[0])[0] - last_node = model.get_next_nodes(cast_node.outputs[0])[0] - inputs[2] = last_node.inputs[1] - # output - outputs = last_node.outputs - # update link - new_node.inputs = inputs - new_node.outputs = outputs - - model.remove(matmul_node.name, {}) - model.remove(transpose_node.name, {}) - model.remove(softmax_node.name, {}) - model.remove(cast_node.name, {}) - model.remove(last_node.name, {}) - model.update_map() - for node in model.get_nodes(fa_name): - for _ in range(soc_type): - for i in range(3): - prev_node = model.get_prev_node(node.inputs[i]) - model.remove(prev_node.name) - next_node = model.get_next_nodes(node.outputs[0])[0] - model.remove(next_node.name) - if soc_type == 2: - name = node.name.replace(fa_name, 'Cast') - cast = model.add_node(name, 'Cast', attrs={'to': 1}) - model.insert_node(node.name, cast) - - -def change_input(model, bs): - inputs = [inp.name for inp in model.inputs] - for inp in inputs: - shape = model[inp].shape - dtype = model[inp].dtype - if inp == 't': - dtype = 'int32' - else: - shape[0] *= bs - model.remove(inp) - model.add_input(inp, shape=shape, dtype=dtype) - - -def get_index(model, init, name): - if name in init: - return model[name].value - else: - return name - - -def replace_slice(model, fast): - # find pairs of slice - slice_pair = [] - for node in model.get_nodes('Slice'): - if node.name[-2:] == '_1': - slice_pair.append((model[node.name[:-2]], model[node.name])) - # replace - init = [n.name for n in model.get_nodes('Initializer')] - for pair in slice_pair: - next_node = model.get_next_nodes(pair[0].outputs[0])[0] - if fast and next_node.op_type == 'Mul': - name = pair[0].name[:-5] + 'SliceTransGeluMul' - 
model.add_node(name, 'SliceTransGeluMul', inputs=[pair[0].inputs[0]], outputs=next_node.outputs) - model.remove(next_node.name, {}) - else: - name = pair[0].name[:-5] + 'Split' - data = pair[0].inputs[0] - start_0 = get_index(model, init, pair[0].inputs[1]) - end_0 = get_index(model, init, pair[0].inputs[2]) - start_1 = get_index(model, init, pair[1].inputs[1]) - end_1 = get_index(model, init, pair[1].inputs[2]) - if start_1 == end_0: - outputs = pair[0].outputs + pair[1].outputs - elif start_0 == end_1: - outputs = pair[1].outputs + pair[0].outputs - - axes = pair[0].inputs[3] - axis = model[axes].value[0] - model.add_node(name, 'Split', inputs=[data], outputs=outputs, attrs={'axis': axis}) - model.remove(pair[0].name, {}) - model.remove(pair[1].name, {}) - model.update_map() - - -def build_index(h, w, sy=2, sx=2): - # random select one from a 2x2 block - hsy = h // sy - wsx = w // sx - rand_idx = np.random.randint(sy * sx, size=(hsy, wsx)) - - idx = np.ones((hsy, wsx, sy * sx), dtype=np.int64) - for i in range(hsy): - for j in range(wsx): - idx[i, j][rand_idx[i, j]] = 0 - idx = idx.reshape(hsy, wsx, sy, sx).transpose(0, 2, 1, 3) - idx_rand = idx.reshape(-1).argsort() - index_a = np.sort(idx_rand[hsy * wsx:]) - index_b = np.sort(idx_rand[:hsy * wsx]) - return index_a, index_b - - -def get_block(model): - # find self-attention block - norms = [] - for node in model.get_nodes('Add'): - next_nodes = model.get_next_nodes(node.outputs[0]) - if len(next_nodes) != 3: - continue - op_type = set(n.op_type for n in next_nodes) - if len(op_type) == 1 and 'MatMul' in op_type: - if model[node.inputs[1]].value.shape[0] == 768: - norms.append(node) - return norms - - -def find_nodes(model, node): - prev_node = model.get_prev_node(node.inputs[0]) - while prev_node.op_type != 'Sub': - prev_node = model.get_prev_node(prev_node.inputs[0]) - inp = prev_node.inputs[0] - next_nodes = model.get_next_nodes(inp) - for next_node in next_nodes: - if next_node.op_type == 'Add': - if next_node.inputs[0] == inp: - out = next_node.inputs[1] - else: - out = next_node.inputs[0] - return inp, out - - -def build_tome_block(model, name, inputs, inputs_un): - # link merge to attn - for node in model.get_next_nodes(inputs[1]): - ind = 0 - for inp in node.inputs: - if inp == inputs[1]: - node.inputs[ind] = name + 'Concat_output' - ind += 1 - # norm block - model.add_node( - name + 'Mul', - 'Mul', - inputs=[inputs[0], inputs[0]], - outputs=[name + 'Mul_output'] - ) - model.add_node( - name + 'ReduceSum', - 'ReduceSum', - inputs=[name + 'Mul_output'], - outputs=[name + 'ReduceSum_output'], - attrs={'axes': [-1], 'keepdims': 1} - ) - model.add_node( - name + 'Sqrt', - 'Sqrt', - inputs=[name + 'ReduceSum_output'], - outputs=[name + 'Sqrt_output'] - ) - model.add_node( - name + 'Div', - 'Div', - inputs=[inputs[0], name + 'Sqrt_output'], - outputs=[name + 'Div_output'] - ) - # compute similarity - model.add_node( - name + 'Gather_0', - 'Gather', - inputs=[name + 'Div_output', 'tome/Gather_index_a'], - outputs=[name + 'Gather_0_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Gather_1', - 'Gather', - inputs=[name + 'Div_output', 'tome/Gather_index_b'], - outputs=[name + 'Gather_1_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Transpose', - 'Transpose', - inputs=[name + 'Gather_1_output'], - outputs=[name + 'Transpose_output'], - attrs={'perm': [0, 2, 1]} - ) - model.add_node( - name + 'MatMul', - 'MatMul', - inputs=[name + 'Gather_0_output', name + 'Transpose_output'], - outputs=[name + 'MatMul_output'] - 
) - model.add_node( - name + 'FindMax', - 'FindMax', - inputs=[name + 'MatMul_output'], - outputs=[name + 'FindMax_output_0', name + 'FindMax_output_1'], - attrs={} - ) - model.add_node( - name + 'TopK', - 'TopK', - inputs=[name + 'FindMax_output_0', 'tome/Topk_k'], - outputs=[name + 'TopK_output_0', name + 'TopK_output_1'], - attrs={'axis': -1, 'largest': 1} - ) - # split token - model.add_node( - name + 'Gather_2', - 'Gather', - inputs=[inputs[1], 'tome/Gather_index_a'], - outputs=[name + 'Gather_2_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Gather_3', - 'Gather', - inputs=[inputs[1], 'tome/Gather_index_b'], - outputs=[name + 'Gather_3_output'], - attrs={'axis': 1} - ) - model.add_node( - name + 'Cast_0', - 'Cast', - inputs=[name + 'Gather_2_output'], - outputs=[name + 'Cast_0_output'], - attrs={'to': 1} - ) - model.add_node( - name + 'Cast_1', - 'Cast', - inputs=[name + 'Gather_3_output'], - outputs=[name + 'Cast_1_output'], - attrs={'to': 1} - ) - # tome merge - merge_inputs = [ - name + 'Cast_0_output', - name + 'Cast_1_output', - name + 'TopK_output_1', - name + 'FindMax_output_1' - ] - merge_outputs = [ - name + 'TomeMerged_output_0', - name + 'TomeMerged_output_1', - name + 'TomeMerged_output_2' - ] - model.add_node( - name + 'TomeMerged', - 'TomeMerged', - inputs=merge_inputs, - outputs=merge_outputs - ) - model.add_node( - name + 'ReduceSum_1', - 'ReduceSum', - inputs=[name + 'TomeMerged_output_1'], - outputs=[name + 'ReduceSum_1_output'], - attrs={'axes': [1], 'keepdims': 0} - ) - model.add_node( - name + 'ReduceSum_2', - 'ReduceSum', - inputs=[name + 'TomeMerged_output_2'], - outputs=[name + 'ReduceSum_2_output'], - attrs={'axes': [1], 'keepdims': 0} - ) - model.add_node( - name + 'Unsqueeze', - 'Unsqueeze', - inputs=[name + 'ReduceSum_2_output'], - outputs=[name + 'Unsqueeze_output'], - attrs={'axes': [2]} - ) - model.add_node( - name + 'Div_1', - 'Div', - inputs=[name + 'ReduceSum_1_output', name + 'Unsqueeze_output'], - outputs=[name + 'Div_1_output'] - ) - model.add_node( - name + 'Concat', - 'Concat', - inputs=[name + 'TomeMerged_output_0', name + 'Div_1_output'], - outputs=[name + 'Concat_output'], - attrs={'axis': 1} - ) - # link unmerge to norm - for node in model.get_next_nodes(inputs_un[0]): - ind = 0 - for inp in node.inputs: - if inp == inputs_un[0]: - node.inputs[ind] = name + 'TomeUngerme_output' - ind += 1 - # add unmerge node - unmerge_inputs = inputs_un + [name + 'TopK_output_1', name + 'FindMax_output_1'] - model.add_node( - name + 'tome/TomeUnmerge', - 'TomeUnmerged', - inputs=unmerge_inputs, - outputs=[name + 'TomeUngerme_output'] - ) - model.update_map() - - -def insert_tome_block(model, max_num): - bs = model['latent_model_input'].shape[0] - h, w = model['latent_model_input'].shape[2:] - h = h // 2 - w = w // 2 - index_a, index_b = build_index(h, w) - # add initializer - model.add_initializer('tome/Gather_index_a', index_a) - model.add_initializer('tome/Gather_index_b', index_b) - bs_index_a = np.tile(index_a.reshape(1, -1), [bs, 1]) - bs_index_b = np.tile(index_b.reshape(1, -1), [bs, 1]) - model.add_initializer('tome/index_a', bs_index_a) - model.add_initializer('tome/index_b', bs_index_b) - model.add_initializer('tome/Topk_k', np.array([3072])) - # get reshape nodes - reshapes = model.get_nodes('Reshape') - # find inputs - norm_outs = get_block(model)[:max_num] - for node in norm_outs: - name = node.name.rsplit('/', 2)[0] + '/attn1/' - norm_input, sa_output = find_nodes(model, node) - inputs_0 = [norm_input] + node.outputs - inputs_1 = 
[sa_output] + ['tome/index_a', 'tome/index_b'] - # add tome block - build_tome_block(model, name.replace('attn', 'tome'), inputs_0, inputs_1) - # change shape of reshape - for reshape in reshapes: - if name in reshape.name: - shape = model[reshape.inputs[1]].value.copy() - ind = 0 - for size in shape: - if size == 4096: - shape[ind] = '-1' - ind += 1 - model[reshape.inputs[1]].value = shape - - -def change_bs(model, bs): - node = model.get_nodes('Expand')[0] - node.inputs[1] = 'bs' - model.add_initializer('bs', value=np.array([bs])) - - inits = [init.name for init in model.initializers] - shapes = [] - for node in model.get_nodes('Reshape'): - shape = node.inputs[1] - if shape in inits and shape not in shapes: - shapes.append(shape) - value = model[shape].value.copy() - value[0] *= bs - model[shape].value = value - - model.update_map() - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - type=str, - default="models/unet/unet.onnx", - help="Path of the unet onnx model.", - ) - parser.add_argument( - "--new_model", - type=str, - default="models/unet/unet_md.onnx", - help="Path to save the modified model", - ) - parser.add_argument( - "--FA_soc", - choices=["None", "Duo", "A2"], - default="None", - help="Type of FA operator.", - ) - parser.add_argument( - "--TOME_num", - type=int, - default=0, - help="Number of TOME used in the model", - ) - parser.add_argument( - "--faster_gelu", - action="store_true", - help="Use specific gelu operation" - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="Batch size" - ) - parser.add_argument( - "--parallel", - action="store_true", - help="Use parallel unet model" - ) - return parser.parse_args() - - -def main(): - model = OnnxGraph.parse(args.model) - del_add(model) - if args.parallel: - batch_size = args.batch_size - else: - batch_size = args.batch_size * 2 - if batch_size > 1: - change_bs(model, batch_size) - change_input(model, batch_size) - if args.FA_soc == 'Duo': - add_flash_attention(model, 'FlashAttentionTik', soc_type=1) - elif args.FA_soc == 'A2': - if batch_size > 2: - print('A2 does not support FA in multi-batch case! The FA modification does not effect.') - else: - add_flash_attention(model, 'UnpadFlashAttentionMix', soc_type=2) - if args.TOME_num: - insert_tome_block(model, args.TOME_num) - replace_slice(model, args.faster_gelu) - model.remove_unused_nodes() - model.save(args.new_model) - - -if __name__ == '__main__': - args = parse_arguments() - main() - \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/pipeline_ascend_stable_diffusionxl.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/pipeline_ascend_stable_diffusionxl.py deleted file mode 100644 index f5b45691d6a453cfff9e5a300664a84726aeadd7..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/pipeline_ascend_stable_diffusionxl.py +++ /dev/null @@ -1,658 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import aclruntime -import numpy as np -from PIL import Image -import torch - -from ais_bench.infer.interface import InferSession -from diffusers import StableDiffusionXLImg2ImgPipeline -from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import TextualInversionLoaderMixin -from diffusers.models.vae import DiagonalGaussianDistribution -from diffusers.utils.torch_utils import randn_tensor - - -class AscendStableDiffusionXLImg2ImgPipeline(StableDiffusionXLImg2ImgPipeline): - def encode_prompt( - self, - prompt: str, - prompt_2: Optional[str] = None, - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Optional[str] = None, - negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - encode_session: InferSession = None, - encode_session_2: InferSession = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [encode_session, encode_session_2] if encode_session is not None else [encode_session_2] - ) - - if prompt_embeds is None: - prompt_2 = prompt_2 or prompt - # textual inversion: procecss multi-vector tokens if necessary - prompt_embeds_list = [] - prompts = [prompt, prompt_2] - for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, tokenizer) - - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - - text_input_ids = text_inputs.input_ids - prompt_embeds = text_encoder.infer([text_input_ids.numpy()]) - - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = torch.from_numpy(prompt_embeds[0]) - prompt_embeds = torch.from_numpy(prompt_embeds[-2]) - - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = torch.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - negative_prompt_2 = negative_prompt_2 or negative_prompt - - uncond_tokens: List[str] - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt, negative_prompt_2] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = [negative_prompt, negative_prompt_2] - - negative_prompt_embeds_list = [] - for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): - if isinstance(self, TextualInversionLoaderMixin): - negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - negative_prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - negative_prompt_embeds = text_encoder.infer( - [uncond_input.input_ids.numpy()] - ) - # We are only ALWAYS interested in the pooled output of the final text encoder - negative_pooled_prompt_embeds = torch.from_numpy(negative_prompt_embeds[0]) - negative_prompt_embeds = torch.from_numpy(negative_prompt_embeds[-2]) - - negative_prompt_embeds_list.append(negative_prompt_embeds) - - negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) - - prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype) - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( - bs_embed * num_images_per_prompt, -1 - ) - if do_classifier_free_guidance: - negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( - bs_embed * num_images_per_prompt, -1 - ) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - def prepare_latents( - self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - encoder_session, - generator=None, - add_noise=True, - ): - if not isinstance(image, (torch.Tensor, Image.Image, list)): - raise ValueError( - f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - # Offload text encoder if `enable_model_cpu_offload` was enabled - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.text_encoder_2.to("cpu") - torch.cuda.empty_cache() - - image = image.to(dtype=dtype) - - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - - else: - # make sure the VAE is in float32 mode, as it overflows in float16 - if self.vae.config.force_upcast: - image = image.float() - self.vae.to(dtype=torch.float32) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective" - f" batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - elif isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - h = torch.from_numpy(encoder_session.infer([image.numpy()])[0]) - - moments = self.vae.quant_conv(h) - posterior = DiagonalGaussianDistribution(moments) - init_latents = posterior.sample(generator) - - if self.vae.config.force_upcast: - self.vae.to(dtype) - - init_latents = init_latents.to(dtype) - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = torch.cat([init_latents], dim=0) - - if add_noise: - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, dtype=dtype) - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - - latents = init_latents - - return latents - - @torch.no_grad() - def ascend_infer( - self, - prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, - image: PipelineImageInput = None, - strength: float = 0.3, - encode_session: InferSession = None, - encode_session_2: InferSession = None, - unet_sessions: List[list] = None, - scheduler_session: InferSession = None, - vae_encoder_session: InferSession = None, - vae_decoder_session: InferSession = None, - skip_status: List[int] = None, - device_id: int = 0, - use_npu_scheduler: bool = False, - num_inference_steps: int = 50, - denoising_start: Optional[float] = None, - denoising_end: Optional[float] = None, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: [Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: [Tuple[int, int]] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - aesthetic_score: float = 6.0, - negative_aesthetic_score: float = 2.5, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is - used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): - The image(s) to modify with the pipeline. - strength (`float`, *optional*, defaults to 0.3): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of - `denoising_start` being declared as an integer, the value of `strength` will be ignored. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - denoising_start (`float`, *optional*): - When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be - bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and - it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, - strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline - is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). - denoising_end (`float`, *optional*): - When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be - completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be - denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the - final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline - forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.7): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. 
Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - aesthetic_score (`float`, *optional*, defaults to 6.0): - Used to simulate an aesthetic score of the generated image by influencing the positive text condition. - Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_aesthetic_score (`float`, *optional*, defaults to 2.5): - Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to - simulate an aesthetic score of the generated image by influencing the negative text condition. - """ - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - strength, - num_inference_steps, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. 
Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - encode_session=encode_session, - encode_session_2=encode_session_2, - ) - - # 4. Prepare image - image = self.image_processor.preprocess(image) - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps( - num_inference_steps, strength, device, None - ) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - - add_noise = True if denoising_start is None else False - - # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - vae_encoder_session, - generator, - add_noise, - ) - - # 7. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - height, width = latents.shape[-2:] - height = height * self.vae_scale_factor - width = width * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 8. Prepare added time ids & embeddings - unet_session, unet_session_bg = unet_sessions - use_parallel_inferencing = unet_session_bg is not None - - if negative_original_size is None: - negative_original_size = original_size - else: - negative_original_size = target_size - add_text_embeds = pooled_prompt_embeds - - add_time_ids, add_neg_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - ) - - add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) - add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) - - if do_classifier_free_guidance and not use_parallel_inferencing: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) - elif do_classifier_free_guidance: - add_neg_time_ids = add_neg_time_ids.numpy() - negative_prompt_embeds = negative_prompt_embeds.numpy() - negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.numpy() - - prompt_embeds = prompt_embeds.numpy() - add_text_embeds = add_text_embeds.numpy() - add_time_ids = add_time_ids.numpy() - - # 9. 
Denoising loop - # 9.1 Apply denoising_end - cache = None - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - t_numpy = t[None].numpy() - if not use_parallel_inferencing and do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - if use_parallel_inferencing and do_classifier_free_guidance: - unet_session_bg.infer_asyn( - [ - latent_model_input, - t_numpy.astype(np.int32), - negative_prompt_embeds, - negative_pooled_prompt_embeds, - add_neg_time_ids, - ], - skip_status[i] - ) - - if skip_status[i]: - inputs = [ - latent_model_input.numpy(), - t_numpy.astype(np.int32), - prompt_embeds, - add_text_embeds, - add_time_ids, - cache, - ] - noise_pred = torch.from_numpy( - np.array(self.unet_infer(unet_session[1], inputs, device_id)[0]) - ) - else: - inputs = [ - latent_model_input.numpy(), - t_numpy.astype(np.int32), - prompt_embeds, - add_text_embeds, - add_time_ids, - ] - outputs = self.unet_infer(unet_session[0], inputs, device_id) - noise_pred = torch.from_numpy(np.array(outputs[0])) - if len(outputs) > 1: - cache = outputs[1] - - if do_classifier_free_guidance: - if use_parallel_inferencing: - noise_pred_uncond = torch.from_numpy(unet_session_bg.wait_and_get_outputs()[0]) - else: - noise_pred_uncond, noise_pred = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) - - # perform guidance - if use_npu_scheduler: - latents = torch.from_numpy( - scheduler_session.infer( - [ - noise_pred.numpy(), - t_numpy, - latents.numpy(), - np.array(i) - ] - )[0] - ) - - else: - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs, return_dict=False, - )[0] - - if not output_type == "latent": - latents = latents / self.vae.config.scaling_factor - latents = self.vae.post_quant_conv(latents) - image = torch.from_numpy(vae_decoder_session.infer([latents.numpy()])[0]) - - else: - image = latents - return (image,) - - image = self.image_processor.postprocess(image, output_type=output_type) - - return (image, ) - - def unet_infer(self, session, data, device_id): - feeds = {} - inputs = session.get_inputs() - for i, inp in enumerate(inputs): - if inp.name == 'cache': - feeds[inp.name] = data[i] - continue - feed = aclruntime.Tensor(data[i]) - feed.to_device(device_id) - feeds[inp.name] = feed - out_names = [out.name for out in session.get_outputs()] - - outputs = session.run(out_names, feeds) - outputs[0].to_host() - return outputs - \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/requirements.txt b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/requirements.txt deleted file mode 100644 index c51d9deb2976e34f043f96c8453e5a0c5439766f..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch==1.13.0 -diffusers==0.21.0 -transformers==4.26.1 -open_clip_torch==2.20.0 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/stable_diffusionxl_2_onnx.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/stable_diffusionxl_2_onnx.py deleted file mode 100644 index c7347b1e3b716baf0e0f87be1759c7a7bbc4ebd1..0000000000000000000000000000000000000000 --- 
a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/stable_diffusionxl_2_onnx.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from argparse import Namespace - -import torch -import torch.nn as nn -from diffusers import DDIMScheduler -from diffusers import StableDiffusionXLImg2ImgPipeline - - -def parse_arguments() -> Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "-o", - "--output_dir", - type=str, - default="./models", - help="Path of directory to save ONNX models.", - ) - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-xl-base-1.0", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "-steps", - "--steps", - type=int, - default=50, - help="steps." - ) - parser.add_argument( - "-guid", - "--guidance_scale", - type=float, - default=5.0, - help="guidance_scale" - ) - parser.add_argument( - "--strength", - type=float, - default=0.3, - help="Must be between 0 and 1." - ) - - return parser.parse_args() - - -class NewDdim(nn.Module): - def __init__(self, num_train_timesteps=1000, num_inference_steps=50, alphas_cumprod=None, - guidance_scale=7.5, alpha_prod_t_prev_cache=None): - super(NewDdim, self).__init__() - self.num_train_timesteps = num_train_timesteps - self.num_inference_steps = num_inference_steps - self.alphas_cumprod = alphas_cumprod - self.guidance_scale = guidance_scale - self.alpha_prod_t_prev_cache = alpha_prod_t_prev_cache - - def forward( - self, - model_output: torch.FloatTensor, - timestep: int, - sample: torch.FloatTensor, - step_index: int): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alpha_prod_t_prev_cache[step_index] - beta_prod_t = 1 - alpha_prod_t - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - pred_epsilon = model_output - pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction - return(prev_sample,) - - -def export_ddim( - sd_pipeline: StableDiffusionXLImg2ImgPipeline, - save_dir: str, - steps: int, - strength: float, - guidance_scale: float - ) -> None: - print("Exporting the ddim...") - ddim_path = os.path.join(save_dir, "ddim") - if not os.path.exists(ddim_path): - os.makedirs(ddim_path, mode=0o744) - - dummy_input = ( - torch.randn(1, 4, 128, 128), - torch.tensor(981), - torch.randn(1, 4, 128, 128), - torch.tensor(0) - ) - scheduler = DDIMScheduler.from_config(sd_pipeline.scheduler.config) - sd_pipeline.scheduler = scheduler - scheduler.set_timesteps(steps, device="cpu") - - timesteps, _ = sd_pipeline.get_timesteps(steps, strength, None, None) - alpha_prod_t_prev_cache = [] - for timestep in timesteps: - prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps - alpha_prod_t_prev = scheduler.alphas_cumprod[prev_timestep] if 
prev_timestep >= 0 else scheduler.final_alpha_cumprod - alpha_prod_t_prev_cache.append(alpha_prod_t_prev) - - new_ddim = NewDdim( - num_train_timesteps=scheduler.config.num_train_timesteps, - num_inference_steps=scheduler.num_inference_steps, - alphas_cumprod=scheduler.alphas_cumprod, - guidance_scale=guidance_scale, - alpha_prod_t_prev_cache=torch.tensor(alpha_prod_t_prev_cache) - ) - - new_ddim.eval() - torch.onnx.export( - new_ddim, - dummy_input, - os.path.join(ddim_path, "ddim.onnx"), - input_names=["noise_pred", "timestep", "latents", "step_index"], - output_names=["out_latents"], - dynamic_axes={ - "noise_pred": {0: 'bs'}, - "latents": {0: 'bs'}, - }, - opset_version=11, - verbose=False, - ) - - -def export_encoder(sd_pipeline: StableDiffusionXLImg2ImgPipeline, save_dir: str) -> None: - encoder_path = os.path.join(save_dir, "text_encoder") - if not os.path.exists(encoder_path): - os.makedirs(encoder_path, mode=0o744) - - encoder_model = sd_pipeline.text_encoder - encoder_model_2 = sd_pipeline.text_encoder_2 - max_position_embeddings = encoder_model_2.config.max_position_embeddings - dummy_input = ( - torch.ones([1, max_position_embeddings], dtype=torch.int64), - None, - None, - None, - True - ) - - if encoder_model: - print("Exporting the text encoder...") - - torch.onnx.export( - encoder_model, - dummy_input, - os.path.join(encoder_path, "text_encoder.onnx"), - input_names=["prompt"], - output_names=["text_embeddings"], - dynamic_axes={"prompt": {0: 'bs'}}, - opset_version=11, - ) - - print("Exporting the text encoder 2...") - encoder_2_model = sd_pipeline.text_encoder_2 - - torch.onnx.export( - encoder_2_model, - dummy_input, - os.path.join(encoder_path, "text_encoder_2.onnx"), - input_names=["prompt"], - output_names=["text_embeddings"], - dynamic_axes={"prompt": {0: 'bs'}}, - opset_version=11, - ) - - -def export_unet(sd_pipeline: StableDiffusionXLImg2ImgPipeline, save_dir: str) -> None: - print("Exporting the image information creater...") - unet_path = os.path.join(save_dir, "unet") - if not os.path.exists(unet_path): - os.makedirs(unet_path, mode=0o744) - - unet_model = sd_pipeline.unet - encoder_model = sd_pipeline.text_encoder - encoder_model_2 = sd_pipeline.text_encoder_2 - - sample_size = unet_model.config.sample_size - in_channels = unet_model.config.in_channels - encoder_hidden_size_1 = 0 - if encoder_model: - encoder_hidden_size_1 = encoder_model.config.hidden_size - encoder_hidden_size_2 = encoder_model_2.config.hidden_size - encoder_hidden_size = encoder_hidden_size_1 + encoder_hidden_size_2 - max_position_embeddings = encoder_model_2.config.max_position_embeddings - - dummy_input = ( - torch.ones([1, in_channels, sample_size, sample_size], dtype=torch.float32), - torch.ones([1], dtype=torch.int64), - torch.ones( - [1, max_position_embeddings, encoder_hidden_size], dtype=torch.float32 - ), - None, - None, - None, - None, - { - "text_embeds": torch.ones([1, encoder_hidden_size_2], dtype=torch.float32), - "time_ids": torch.ones([1, 5], dtype=torch.float32) - }, - {} - ) - - torch.onnx.export( - unet_model, - dummy_input, - os.path.join(unet_path, f"unet.onnx"), - input_names=["latent_model_input", "t", "encoder_hidden_states", "text_embeds", "time_ids"], - output_names=["sample"], - opset_version=11, - ) - - -def export_vae(sd_pipeline: StableDiffusionXLImg2ImgPipeline, save_dir: str) -> None: - vae_path = os.path.join(save_dir, "vae") - if not os.path.exists(vae_path): - os.makedirs(vae_path, mode=0o744) - - vae_model = sd_pipeline.vae - unet_model = sd_pipeline.unet 
- - print("Exporting the image encoder...") - sample_size = vae_model.config.sample_size - - dummy_input = torch.ones([1, 3, sample_size, sample_size]) - - torch.onnx.export( - vae_model.encoder, - dummy_input, - os.path.join(vae_path, "vae_encoder.onnx"), - input_names=["image"], - output_names=["init_latents"], - dynamic_axes={"image": {0: 'bs'}}, - opset_version=11, - ) - - print("Exporting the image decoder...") - sample_size = unet_model.config.sample_size - in_channels = unet_model.config.out_channels - - dummy_input = torch.ones([1, in_channels, sample_size, sample_size]) - - torch.onnx.export( - vae_model.decoder, - dummy_input, - os.path.join(vae_path, "vae_decoder.onnx"), - input_names=["latents"], - output_names=["image"], - dynamic_axes={"latents": {0: 'bs'}}, - opset_version=11, - ) - - -def main(): - args = parse_arguments() - pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(args.model).to("cpu") - - export_encoder(pipeline, args.output_dir) - - export_unet(pipeline, args.output_dir) - - export_vae(pipeline, args.output_dir) - - export_ddim(pipeline, args.output_dir, args.steps, args.strength, args.guidance_scale) - - print("Done.") - - -if __name__ == "__main__": - main() - \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/stable_diffusionxl_ascend_infer.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/stable_diffusionxl_ascend_infer.py deleted file mode 100644 index bbc5f0b7ab8511f99c7b0e4c14b655fc7f0e4249..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/stable_diffusionxl_ascend_infer.py +++ /dev/null @@ -1,372 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import csv -import time -import json -import argparse - -import aclruntime -from ais_bench.infer.interface import InferSession -from diffusers.schedulers import * -from diffusers.utils import load_image - -from background_session import BackgroundInferSession -from pipeline_ascend_stable_diffusionxl import AscendStableDiffusionXLImg2ImgPipeline - - -class DataLoader: - def __init__( - self, - info_file: str, - batch_size: int, - num_images_per_prompt: int=1, - max_num_prompts: int=0 - ): - self.prompts = [] - self.batch_size = batch_size - self.num_images_per_prompt = num_images_per_prompt - self.categories = [] - self.root_path = os.path.dirname(info_file) - - self.current_id = 0 - self.inner_id = 0 - self.load_data(info_file, max_num_prompts) - - def __len__(self): - return len(self.prompts) * self.num_images_per_prompt - - def __iter__(self): - return self - - def __next__(self): - if self.current_id == len(self.prompts): - raise StopIteration - - ret = { - 'prompts': [], - 'images': [], - 'categories': [], - 'save_names': [], - 'n_prompts': self.batch_size, - } - for _ in range(self.batch_size): - if self.current_id == len(self.prompts): - ret['prompts'].append('') - ret['images'].append(image) - ret['save_names'].append('') - ret['categories'].append('') - ret['n_prompts'] -= 1 - - else: - prompt, image_file, category_id = self.prompts[self.current_id] - image_path = os.path.join(self.root_path, image_file) - image = load_image(image_path).convert('RGB') - save_path = os.path.basename(image_file).split('.')[0] - ret['prompts'].append(prompt) - ret['images'].append(image) - ret['categories'].append(self.categories[category_id]) - ret['save_names'].append(f'{save_path}_{self.inner_id}') - - self.inner_id += 1 - if self.inner_id == self.num_images_per_prompt: - self.inner_id = 0 - self.current_id += 1 - - return ret - - def load_data(self, file_path: str, max_num_prompts: int): - with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: - image_info = json.load(f) - count = 0 - for info in image_info: - image_files = info['images'] - category = info['category'] - prompt = info['prompt'] - - if category not in self.categories: - self.categories.append(category) - category_id = self.categories.index(category) - for image_file in image_files: - self.prompts.append((prompt, image_file, category_id)) - count += 1 - if max_num_prompts and count == max_num_prompts: - break - - -def check_device_range_valid(value): - # if contain , split to int list - min_value = 0 - max_value = 255 - if ',' in value: - ilist = [ int(v) for v in value.split(',') ] - for ivalue in ilist[:2]: - if ivalue < min_value or ivalue > max_value: - raise argparse.ArgumentTypeError("{} of device:{} is invalid. valid value range is [{}, {}]".format( - ivalue, value, min_value, max_value)) - return ilist[:2] - else: - # default as single int value - ivalue = int(value) - if ivalue < min_value or ivalue > max_value: - raise argparse.ArgumentTypeError("device:{} is invalid. 
valid value range is [{}, {}]".format( - ivalue, min_value, max_value)) - return ivalue - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", - "--model", - type=str, - default="stabilityai/stable-diffusion-2-1-base", - help="Path or name of the pre-trained model.", - ) - parser.add_argument( - "--image_info", - type=str, - default="./image_info.json", - help="Image_info json file.", - ) - parser.add_argument( - "--model_dir", - type=str, - default="./models", - help="Base path of om models.", - ) - parser.add_argument( - "--save_dir", - type=str, - default="./results", - help="Path to save result images.", - ) - parser.add_argument( - "--info_file_save_path", - type=str, - default="./refiner_image_info.json", - help="Path to save image information file.", - ) - parser.add_argument( - "--steps", - type=int, - default=50, - help="Number of inference steps.", - ) - parser.add_argument( - "--num_images_per_prompt", - default=1, - type=int, - help="Number of images generated for each prompt.", - ) - parser.add_argument( - "--max_num_prompts", - default=0, - type=int, - help="Limit the number of prompts (0: no limit).", - ) - parser.add_argument( - "--scheduler", - choices=["None", "DDIM", "Euler", "DPM", "EulerAncestral", "DPM++SDEKarras"], - default="DDIM", - help="Type of Sampling methods. Can choose from DDIM, Euler, DPM", - ) - parser.add_argument( - "--device", - type=check_device_range_valid, - default=0, - help="NPU device id." - ) - parser.add_argument( - "-bs", - "--batch_size", - type=int, - default=1, - help="Batch size." - ) - parser.add_argument( - "--strength", - type=float, - default=0.3, - help="Must be between 0 and 1." - ) - parser.add_argument( - "--use_cache", - action="store_true", - help="Use cache during inference." - ) - parser.add_argument( - "--cache_steps", - type=str, - default="1,2,4,6,7,9,10,12,13,14,16,18,19,21,23,24,26,27,29,\ - 30,31,33,34,36,37,39,40,42,43,45,47,48,49", - help="Steps to use cache data." 
- ) - - return parser.parse_args() - - -def main(): - args = parse_arguments() - save_dir = args.save_dir - device = None - device_2 = None - - if isinstance(args.device, list): - device, device_2 = args.device - else: - device = args.device - - pipe = AscendStableDiffusionXLImg2ImgPipeline.from_pretrained(args.model).to("cpu") - use_npu_scheduler = False - - if args.scheduler == "DDIM": - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - use_npu_scheduler = True - elif args.scheduler == "Euler": - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "DPM": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "EulerAncestral": - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - elif args.scheduler == "DPM++SDEKarras": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.scheduler.config.algorithm_type = 'sde-dpmsolver++' - pipe.scheduler.config.use_karras_sigmas = True - - if pipe.text_encoder: - encoder_om = os.path.join(args.model_dir, "text_encoder", "text_encoder.om") - encoder_session = InferSession(device, encoder_om) - else: - encoder_session = None - encoder_om_2 = os.path.join(args.model_dir, "text_encoder", "text_encoder_2.om") - encoder_session_2 = InferSession(device, encoder_om_2) - vae_encoder_om = os.path.join(args.model_dir, "vae", "vae_encoder.om") - vae_encoder_session = InferSession(device, vae_encoder_om) - vae_decoder_om = os.path.join(args.model_dir, "vae", "vae_decoder.om") - vae_decoder_session = InferSession(device, vae_decoder_om) - - if use_npu_scheduler: - scheduler_om = os.path.join(args.model_dir, "ddim", "ddim.om") - scheduler_session = InferSession(device, scheduler_om) - else: - scheduler_session = None - - skip_status = [0] * args.steps - if args.use_cache: - for i in args.cache_steps.split(','): - if int(i) >= args.steps: - continue - skip_status[int(i)] = 1 - unet_cache_om = os.path.join(args.model_dir, "unet", "unet_cache.om") - unet_skip_om = os.path.join(args.model_dir, "unet", "unet_skip.om") - unet_session = [ - aclruntime.InferenceSession(unet_cache_om, device, aclruntime.session_options()), - aclruntime.InferenceSession(unet_skip_om, device, aclruntime.session_options()), - ] - else: - unet_cache_om = os.path.join(args.model_dir, "unet", "unet.om") - unet_skip_om = "" - unet_session = [ - aclruntime.InferenceSession(unet_cache_om, device, aclruntime.session_options()), - None, - ] - - unet_session_bg = None - if device_2: - unet_session_bg = BackgroundInferSession.clone( - unet_session[0], - device_2, - [unet_cache_om, unet_skip_om] - ) - - if not os.path.exists(save_dir): - os.makedirs(save_dir, mode=0o744) - - use_time = 0 - - infer_num = 0 - refiner_image_info = [] - current_prompt = None - - data_loader = DataLoader(args.image_info, - args.batch_size, - args.num_images_per_prompt, - args.max_num_prompts) - - infer_num = 0 - image_info = [] - current_prompt = None - negative_prompt = [""] * args.batch_size - for _, input_info in enumerate(data_loader): - prompts = input_info['prompts'] - images = input_info['images'] - categories = input_info['categories'] - save_names = input_info['save_names'] - n_prompts = input_info['n_prompts'] - - print(f"[{infer_num + n_prompts}/{len(data_loader)}]: {prompts}") - infer_num += args.batch_size - - start_time = time.time() - images = pipe.ascend_infer( - prompt=prompts, - negative_prompt=negative_prompt, - image=images, 
- strength=args.strength, - encode_session=encoder_session, - encode_session_2=encoder_session_2, - unet_sessions=[unet_session, unet_session_bg], - scheduler_session=scheduler_session, - vae_encoder_session=vae_encoder_session, - vae_decoder_session=vae_decoder_session, - skip_status=skip_status, - device_id=device, - num_inference_steps=args.steps, - guidance_scale=5.0, - use_npu_scheduler=use_npu_scheduler, - ) - - use_time += time.time() - start_time - - for j in range(n_prompts): - image_save_path = os.path.join(save_dir, f"{save_names[j]}.png") - image = images[0][j] - image.save(image_save_path) - - if current_prompt != prompts[j]: - current_prompt = prompts[j] - image_info.append({'images': [], 'prompt': current_prompt, 'category': categories[j]}) - - image_info[-1]['images'].append(image_save_path) - - if unet_session_bg: - unet_session_bg.stop() - - # Save image information to a json file - if os.path.exists(args.info_file_save_path): - os.remove(args.info_file_save_path) - - with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR|os.O_CREAT, 0o644), "w") as f: - json.dump(image_info, f) - - print( - f"[info] infer number: {infer_num}; use time: {use_time:.3f}s; " - f"average time: {use_time/infer_num:.3f}s" - ) - - -if __name__ == "__main__": - main() diff --git a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/unet_cache.py b/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/unet_cache.py deleted file mode 100644 index 8335caab61c9580253ec0c5ec432cff9801b646b..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/stable_diffusionxl_refiner/unet_cache.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import os
-import argparse
-
-from auto_optimizer import OnnxGraph
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="models/unet/unet.onnx",
-        help="Path of the unet onnx model.",
-    )
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        default="models/unet",
-        help="Path to save the modified model",
-    )
-    return parser.parse_args()
-
-
-def cache_unet(model_path, new_model_path, data):
-    model = OnnxGraph.parse(model_path)
-    model.add_output(data, dtype='float32', shape=[])
-    model.save(new_model_path)
-
-
-def skip_unet(model_path, new_model_path, data):
-    model = OnnxGraph.parse(model_path)
-    node = model.get_next_nodes(data)[0]
-    batch_size = model.inputs[0].shape[0]
-    model.add_input('cache', dtype='float32', shape=[batch_size, 1280, 64, 64])
-    node.inputs[0] = 'cache'
-    model.remove_unused_nodes()
-    model.save(new_model_path)
-
-
-def main(args):
-    cache_path = os.path.join(args.save_dir, "unet_cache.onnx")
-    skip_path = os.path.join(args.save_dir, "unet_skip.onnx")
-    cache_name = '/up_blocks.0/upsamplers.0/conv/Conv_output_0'
-    cache_unet(args.model, cache_path, cache_name)
-    skip_unet(args.model, skip_path, cache_name)
-
-
-if __name__ == "__main__":
-    main(parse_arguments())
diff --git a/MindIE/LLM/Pangu/openPangu-Embedded-1B-OrangePi/README.md b/MindIE/LLM/Pangu/openPangu-Embedded-1B-OrangePi/README.md
index 61c8fdd5ddfa4cbaa81e49dfa150974f7577eb5e..fc22fca967c58f03882fa9c77e172ab9538d8354 100644
--- a/MindIE/LLM/Pangu/openPangu-Embedded-1B-OrangePi/README.md
+++ b/MindIE/LLM/Pangu/openPangu-Embedded-1B-OrangePi/README.md
@@ -133,12 +133,11 @@ pip install torch_npu-2.1.0.post13-cp310-cp310-manylinux_2_17_aarch64.manylinux2
 ### 5. Install the model repository
 Install using the pre-built package
- - Download the pre-built package [link](https://support.huawei.com/enterprise/zh/ascend-computing/mindie-pid-261803968/software/266130647?idAbPath=fixnode01|23710424|251366513|254884019|261408772|261803968)
+ - Download the pre-built package [link](https://mindie.obs.cn-north-4.myhuaweicloud.com/artifact/ATB-Models/2.2.T10/Ascend-mindie-atb-models_2.2.T10_linux-aarch64_py310_torch2.1.0-abi0.tar.gz)
 | Package name |
 | ------------------------------------------------------------ |
 | Ascend-mindie-atb-models_2.1.RC1_linux-aarch64_py310_torch2.1.0-abi0.tar.gz |
- | Ascend-mindie-atb-models_2.1.RC1_linux-aarch64_py310_torch2.1.0-abi1.tar.gz |
 - Place the file under the \${working_dir} path
 - Extract it
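A minimal sketch of this step, assuming `wget` and `tar` are available and using the abi0 tarball named in the download link above (substitute the package you actually downloaded and your own `${working_dir}`):

```bash
# Download the pre-built ATB-Models package into the working directory and unpack it there
cd ${working_dir}
wget https://mindie.obs.cn-north-4.myhuaweicloud.com/artifact/ATB-Models/2.2.T10/Ascend-mindie-atb-models_2.2.T10_linux-aarch64_py310_torch2.1.0-abi0.tar.gz
tar -xf Ascend-mindie-atb-models_2.2.T10_linux-aarch64_py310_torch2.1.0-abi0.tar.gz
```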