diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/1. Profiling\350\204\232\346\234\254\344\270\216\346\225\260\346\215\256\350\216\267\345\217\226.docx" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/1. Profiling\350\204\232\346\234\254\344\270\216\346\225\260\346\215\256\350\216\267\345\217\226.docx"
deleted file mode 100644
index e0cb867c28be21e755ac03020025c9f0fc8ed3f7..0000000000000000000000000000000000000000
Binary files "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/1. Profiling\350\204\232\346\234\254\344\270\216\346\225\260\346\215\256\350\216\267\345\217\226.docx" and /dev/null differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/2. NPU\344\270\216GPU\344\270\213Profilig\346\225\260\346\215\256\345\210\206\346\236\220.docx" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/2. NPU\344\270\216GPU\344\270\213Profilig\346\225\260\346\215\256\345\210\206\346\236\220.docx"
deleted file mode 100644
index a206b973dbdfe30dc6e0270cfb114c9a658d20ee..0000000000000000000000000000000000000000
Binary files "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/2. NPU\344\270\216GPU\344\270\213Profilig\346\225\260\346\215\256\345\210\206\346\236\220.docx" and /dev/null differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/3. NPU\344\270\213OM\346\250\241\345\236\213Autotune\350\260\203\344\274\230.docx" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/3. NPU\344\270\213OM\346\250\241\345\236\213Autotune\350\260\203\344\274\230.docx"
deleted file mode 100644
index 2d2ab8c2d2ed8b5d82ea8956ff7c1ac0b68e2cf3..0000000000000000000000000000000000000000
Binary files "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/3. NPU\344\270\213OM\346\250\241\345\236\213Autotune\350\260\203\344\274\230.docx" and /dev/null differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/4. ONNX\346\216\250\347\220\206\346\250\241\345\236\213\346\200\247\350\203\275\344\274\230\345\214\226\344\271\213UB\350\236\215\345\220\210.docx" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/4. ONNX\346\216\250\347\220\206\346\250\241\345\236\213\346\200\247\350\203\275\344\274\230\345\214\226\344\271\213UB\350\236\215\345\220\210.docx"
deleted file mode 100644
index 79fdc3f537d8bc701cbaa29db31fa974b91ce2e5..0000000000000000000000000000000000000000
Binary files "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/4. ONNX\346\216\250\347\220\206\346\250\241\345\236\213\346\200\247\350\203\275\344\274\230\345\214\226\344\271\213UB\350\236\215\345\220\210.docx" and /dev/null differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/5. ONNX\346\216\250\347\220\206\346\250\241\345\236\213\346\200\247\350\203\275\344\274\230\345\214\226\344\271\213Graph\350\236\215\345\220\210.docx" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/5. ONNX\346\216\250\347\220\206\346\250\241\345\236\213\346\200\247\350\203\275\344\274\230\345\214\226\344\271\213Graph\350\236\215\345\220\210.docx"
deleted file mode 100644
index ff38d55fb6fe5f3cfb86f70293e6bc35b7bf6eca..0000000000000000000000000000000000000000
Binary files "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/5. ONNX\346\216\250\347\220\206\346\250\241\345\236\213\346\200\247\350\203\275\344\274\230\345\214\226\344\271\213Graph\350\236\215\345\220\210.docx" and /dev/null differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/README.md" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/README.md"
new file mode 100644
index 0000000000000000000000000000000000000000..75592016de14480c465f774f007c22df8ff658f9
--- /dev/null
+++ "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/README.md"
@@ -0,0 +1,107 @@
+# 1. 性能调优工具及流程介绍
+
+本文档主要解决：Ascend310上推理om模型的性能不达标问题
+
+- [1. 性能调优工具及流程介绍](#1-性能调优工具及流程介绍)
+- [2. 根据profiling数据识别性能瓶颈点](#2-根据profiling数据识别性能瓶颈点)
+  - [2.1. 使用trtexec生成profiling结果](#21-使用trtexec生成profiling结果)
+  - [2.2. 生成NPU的profiling结果](#22-生成npu的profiling结果)
+  - [2.3. 识别性能瓶颈点](#23-识别性能瓶颈点)
+- [3. 性能优化手段](#3-性能优化手段)
+  - [3.1. 计算瓶颈类的性能提升手段](#31-计算瓶颈类的性能提升手段)
+  - [3.2. 内存瓶颈类的性能提升手段](#32-内存瓶颈类的性能提升手段)
+  - [3.3. 其他性能优化手段](#33-其他性能优化手段)
+
+# 2. 根据profiling数据识别性能瓶颈点
+
+## 2.1. 使用trtexec生成profiling结果
+
+在正常使用 `trtexec` 工具进行纯推理基础上，加上 `--dumpProfile` 或者 `--exportProfile=<file>` 得到GPU上的profiling结果。
+
+![gpu_profiling](./img/performence_01.png)
+
+- 分析GPU profiling的目的：
+  - 通过对比识别性能差的算子，进行单算子性能优化；
+  - 通过对比参考GPU算子融合方法，利用子图融合提升模型性能；
+
+## 2.2. 生成NPU的profiling结果
+
+```shell
+# profile.sh
+export install_path=/usr/local/Ascend/ascend-toolkit/latest
+export profile_path=${install_path}/toolkit/tools/profiler
+export PATH=/usr/local/python3.7.5/bin:${install_path}/atc/ccec_compiler/bin:${install_path}/atc/bin:${profile_path}/bin:$PATH
+export PYTHONPATH=${install_path}/atc/python/site-packages:$PYTHONPATH
+export LD_LIBRARY_PATH=${install_path}/atc/lib64:${install_path}/acllib/lib64:$LD_LIBRARY_PATH
+export ASCEND_OPP_PATH=${install_path}/opp
+
+# profiling数据采集
+msprof --output=$1 \
+       --application=$2 \
+       --sys-hardware-mem=on \
+       --sys-cpu-profiling=on \
+       --sys-profiling=on
+
+# profiling数据解析
+python3.7.5 ${profile_path}/profiler_tool/analysis/msprof/msprof.py export summary -dir $1 --format csv
+python3.7.5 ${profile_path}/profiler_tool/analysis/msprof/msprof.py export timeline -dir $1
+```
+> 使用方法：`bash profile.sh dir_to_profiling_result path_to_runfile`，其中 `path_to_runfile` 内容为可以正常运行的 benchmark 或者 msame 推理 om 模型的命令
+
+## 2.3. 识别性能瓶颈点
+
+在 `dir_to_profiling_result` 路径下会生成 `op_summary_*.csv` 文件，该文件即为 om 模型中每个算子的性能统计表格。
+<table border="1">
+<tr>
+  <th>类别</th>
+  <td>说明</td>
+</tr>
+<tr>
+  <th>aicore_time/Task Duration</th>
+  <td>对应算子执行耗时，可通过降序排列该数值找到耗时最严重算子</td>
+</tr>
+<tr>
+  <th rowspan="3">compute bound</th>
+  <td>vec_ratio：表示向量类运算指令耗时占比</td>
+</tr>
+<tr>
+  <td>mac_ratio：表示矩阵类运算指令耗时占比</td>
+</tr>
+<tr>
+  <td>scalar_ratio：表示标量类运算指令耗时占比</td>
+</tr>
+<tr>
+  <th rowspan="3">memory bound</th>
+  <td>mte1_ratio：表示L1->L0A/L0B数据搬运类指令耗时占比</td>
+</tr>
+<tr>
+  <td>mte2_ratio：表示DDR->AICORE数据搬运类指令耗时占比</td>
+</tr>
+<tr>
+  <td>mte3_ratio：表示AICORE->DDR数据搬运类指令耗时占比</td>
+</tr>
+</table>
+
+- **compute bound** 表示计算瓶颈类算子，即该算子没有最大限度发挥硬件能力，可以理解成“数据搬运到对应硬件单元花费1个单位时间，但该算子计算花费N个单位时间”
+- **memory bound** 表示内存瓶颈类算子，即该算子大量时间都在等待数据搬运，可以理解成“数据搬运到对应硬件单元花费N个单位时间，但该算子计算花费1个单位时间”
+
+# 3. 性能优化手段
+
+## 3.1. 计算瓶颈类的性能提升手段
+这类问题优化思路是修改算子实现方式，更改算法实现或者接口调用来提升性能。
+- [softmax算子案例](优化在最后一根轴做softmax性能低的问题.md)
+  `softmax` 算子在最后一根轴上计算时性能低，但经过算子同事分析目前无法从算子侧进行优化，因此可以参考本案例进行等价转换提升性能。
+
+## 3.2. 内存瓶颈类的性能提升手段
+
+这类问题的常见优化思路是消除transdata算子，避免冗余的数据搬运和转换。
+> 为了达到极限性能，昇腾芯片内置了专用的数据排布格式；transdata算子用于实现不同格式之间的数据转换，如 `NCHW<==>Ascend_format` 或 `Ascend_format_1 <==> Ascend_format_2`
+- [Conv1D性能优化案例](./案例-Conv1D算子优化.md)
+  直接使用Conv1D会导致产生多余transdata算子，因此可以参考本案例消除transdata算子来提升性能
+
+## 3.3. 其他性能优化手段
+
+- autotune
+
+  在原 ATC 模型转换命令中添加 `--auto_tune_mode="RL,GA"` 即可实现 autotune 调优。autotune的作用是控制TBE算子编译时能在昇腾AI处理器上寻找最好的性能配置。详细介绍参见 [Auto Tune工具使用指南](https://support.huaweicloud.com/auxiliarydevtool-cann504alpha5infer/atlasautotune_16_0003.html)
+
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/performence_01.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/performence_01.png"
new file mode 100644
index 0000000000000000000000000000000000000000..5098d03c8fa83272aee4d6830fc237c33fc112c1
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/performence_01.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_01.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_01.png"
new file mode 100644
index 0000000000000000000000000000000000000000..31b731bcd6c3dc700d9858396af53a202b3cf437
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_01.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_02.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_02.png"
new file mode 100644
index 0000000000000000000000000000000000000000..38159e419472a49a59e67b14c1cb7b3a9334a22f
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_02.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_03.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_03.png"
new file mode 100644
index 0000000000000000000000000000000000000000..92ec1cd3226aab2b0f184cbae2b171affdd5e55a
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_03.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_04.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_04.png"
new file mode 100644
index 0000000000000000000000000000000000000000..e2644e91b98b9e9f03e5e87141fa17f161522859
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_04.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_05.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_05.png"
new file mode 100644
index 0000000000000000000000000000000000000000..99f0ecb98e12ce7e130e256f0d9f26084e523ea1
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_05.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_06.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_06.png"
new file mode 100644
index 0000000000000000000000000000000000000000..a23c75b06905f574af57f45be84ce6e29a7e3155
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_06.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_07.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_07.png"
new file mode 100644
index 0000000000000000000000000000000000000000..f12f76bbed4d58bf07965f4b84600f09efa4a67c
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_07.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_08.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_08.png"
new file mode 100644
index 0000000000000000000000000000000000000000..2e30473caaf60dd58ac2a0763f35f55c3235c954
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_08.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_09.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_09.png"
new file mode 100644
index 0000000000000000000000000000000000000000..f25cee2f6d80cc060b1113772c05b4c2571e2d12
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_09.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_10.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_10.png"
new file mode 100644
index 0000000000000000000000000000000000000000..cdff29397c798a680b9fc89d4a6658190c358653
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_10.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_11.png" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_11.png"
new file mode 100644
index 0000000000000000000000000000000000000000..2ebb440e660346efed2c9409eefa986171950a48
Binary files /dev/null and "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/img/transpose_11.png" differ
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-PCB\346\250\241\345\236\213\347\262\276\345\272\246\350\260\203\350\257\225.md" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-PCB\346\250\241\345\236\213\346\200\247\350\203\275\350\260\203\344\274\230.md"
similarity index 100%
rename from "Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-PCB\346\250\241\345\236\213\347\262\276\345\272\246\350\260\203\350\257\225.md"
rename to "Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-PCB\346\250\241\345\236\213\346\200\247\350\203\275\350\260\203\344\274\230.md"
diff --git "a/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-transpose\344\274\230\345\214\226\346\261\207\346\200\273.md" "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-transpose\344\274\230\345\214\226\346\261\207\346\200\273.md"
new file mode 100644
index 0000000000000000000000000000000000000000..878fb8493c508615b9ffae22b22c015b78194417
--- /dev/null
+++ "b/Ascend-PyTorch\347\246\273\347\272\277\346\216\250\347\220\206\346\214\207\345\257\274/\344\270\223\351\242\230\346\241\210\344\276\213/\346\200\247\350\203\275\350\260\203\344\274\230/\346\241\210\344\276\213-transpose\344\274\230\345\214\226\346\261\207\346\200\273.md"
@@ -0,0 +1,166 @@
+# 1. transpose优化汇总
+
+本文档主要提供transpose性能提升思路和解决案例
+
+- [1. transpose优化汇总](#1-transpose优化汇总)
+- [2. transpose性能提升的建议](#2-transpose性能提升的建议)
+- [3. 目前已识别并解决的案例](#3-目前已识别并解决的案例)
+  - [3.1. 案例一：使用tile代替transpose，整网性能提升270%](#31-案例一使用tile代替transpose整网性能提升270)
+  - [3.2. 案例二：使用transpose组合代替单个transpose，单算子性能提升20~60倍](#32-案例二使用transpose组合代替单个transpose单算子性能提升2060倍)
+  - [3.3. 案例三：连续排列的维度可以合并成一个维度](#33-案例三连续排列的维度可以合并成一个维度)
+
+# 2. transpose性能提升的建议
+
+当前阶段，CANN中的transpose算子已经经过了很多优化尝试，但是部分场景仍然性能比较差，这种情况下与其给算子同事施压还不如想想能不能从模型层面优化。
+
+根据经验有如下建议可以参考：
+- `broadcast_OP + transpose` 组合可以使用一个 `broadcast_OP` 等价实现，性能会提升很多。参见案例1。
+- 尽可能让transpose一次操作更多内存连续的数据。参见案例2。
+- 连续排列的维度可以合并成一个维度(大小为1的维度可以直接去除)。参见案例3。
+
+
+# 3. 目前已识别并解决的案例
+
+## 3.1. 案例一：使用tile代替transpose，整网性能提升270%
+
+经验：广播操作类算子运行效率会比数据搬运类算子运行效率高很多，所以使用tile算子代替transpose算子提升性能。
+
+- 问题现象
+
+  om推理性能不达标，做完profiling后，查看op_summary文件，对aicore_time进行降序排列，发现transpose耗时占比较高。可以看出transposeD是其它算子耗时的14倍之多。
+  ![transpose_profiling](./img/transpose_01.png)
+
+- 原因分析
+
+  在ONNX模型中找到这些transpose出现的地方，发现是对数据量最大的两轴做transpose，进而导致性能差。
+  ![transpose_src](./img/transpose_02.png)
+
+- 解决方案
+
+  找到pytorch源码中的transpose出现地方，用于实现算子的等价替换。因为transpose的输入是expand得到的，且transpose互换的两轴是相等的，所以想到了tile算子。最终通过实验使用reshape和tile实现同样功能。
+  ```python
+  import torch
+  from torch import nn
+
+  class SrcCode(nn.Module):
+      def forward(self,x):
+          y = x.expand(1024, 1024, 3).transpose(0, 1)
+          return y
+
+  class Optimizer(nn.Module):
+      def forward(self,x):
+          t = x.reshape(1024,1,3).repeat((1,1024,1))
+          return t
+
+  src = SrcCode()
+  opt = Optimizer()
+  src.eval()
+  opt.eval()
+
+  input1 = torch.randn(1024,3)
+  out = src(input1)
+  out2 = opt(input1)
+  # 精度对比
+  print(((out2-out).abs()>1e-6).sum())
+  torch.onnx.export(opt,input1,'step_tile.onnx',opset_version=10)
+  ```
+  pytorch侧精度对比通过，om侧精度对比也通过。导出的样例onnx对比如下
+
+  <img src=./img/transpose_03.png width=40% height=40%/>
+
+  性能提升结果：整网性能提升270%，性能达标。
+  ![transpose_opt](./img/transpose_04.png)
+  ![opt_profiling](./img/transpose_05.png)
+
+## 3.2. 案例二：使用transpose组合代替单个transpose，单算子性能提升20~60倍
+
+经验：一次尽可能操作更多数据，连续排列的维度可以合并成一个维度。
+
+- 问题现象
+
+  om推理性能不达标，做完profiling后，查看op_summary文件，对aicore_time进行降序排列，发现transpose耗时占比较高。可以看出transposeD是其它算子耗时的15倍之多。
+  ![transpose_profiling](./img/transpose_06.png)
+
+- 原因分析
+
+  从整网中分析有4个循环结构，每个循环结构中有两个十分耗时transpose算子。这里以 `batch=1` 为例进行分析，多batch收益可能更大：
+  - 第一类transpose是将 `x.shape=[1, 6, 32, 9, 196]` 转换成 `y.shape=[1,6,196,9,32]`。transpose 使用的是 vector 指令，一次操作大量连续内存数据时性能会很好，但这里将 `[32, 9, 196]` 换成 `[196,9,32]`，一次操作的数据量太少，所以性能不好。
+  - 第二类transpose是将 `x.shape=[1,6,196,9,32]` 转换成 `y.shape=[1,6,32,9,196]`，与上述场景同理。
+
+- 解决方案
+
+  找到pytorch源码中的transpose出现地方，用于实现算子的等价替换。核心思想就是一次操作尽可能多的数据，剩下的就是尝试，毫无技巧可言。
+  ```python
+  import torch
+
+  class src1(torch.nn.Module):
+      def forward(self, x):
+          return x.reshape(1, 6, 32, 9, 196).permute(0, 1, 4, 3, 2)
+
+  class dst1(torch.nn.Module):
+      def forward(self, x):
+          return x.reshape(1*6, 32*9, 196).permute(0, 2, 1).reshape(
+                      1*6*196, 32, 9).permute(0,2,1).reshape(1, 6, 196, 9, 32)
+
+  class src2(torch.nn.Module):
+      def forward(self, x):
+          return x.permute(0, 1, 4, 3, 2).reshape(1, 6*32, 9, 196)
+
+  class dst2(torch.nn.Module):
+      def forward(self, x):
+          return x.reshape(1*6, 9*196, 32).permute(0, 2, 1).reshape(
+                    1*6*32, 196, 9).permute(0,2,1).reshape(1, 6*32, 9, 196)
+
+  # case 1
+  x1 = torch.randn(1, 6*32*9, 196, dtype=torch.float32)
+  msrc1 = src1()
+  msrc1.eval()
+  ysrc1 = msrc1(x1)
+  mdst1 = dst1()
+  mdst1.eval()
+  ydst1 = mdst1(x1)
+  # 精度对比
+  print(((ysrc1 - ydst1).abs() > 1e-6).sum())
+
+  # case 2
+  x2 = torch.randn(1, 6, 196, 9, 32, dtype=torch.float32)
+  msrc2 = src2()
+  msrc2.eval()
+  ysrc2 = msrc2(x2)
+  mdst2 = dst2()
+  mdst2.eval()
+  ydst2 = mdst2(x2)
+  # 精度对比
+  print(((ysrc2 - ydst2).abs() > 1e-6).sum())
+
+
+  torch.onnx.export(msrc1, x1, 'src1.onnx', opset_version=11)
+  torch.onnx.export(mdst1, x1, 'dst1.onnx', opset_version=11)
+  torch.onnx.export(msrc2, x2, 'src2.onnx', opset_version=11)
+  torch.onnx.export(mdst2, x2, 'dst2.onnx', opset_version=11)
+  ```
+  pytorch侧精度对比通过，om侧精度对比也通过。导出的样例onnx对比如下
+
+  <img src=./img/transpose_07.png width=60% height=40%/>
+
+  **`src1 VS dst1`** 性能提升结果：性能提升`59020/(813+1194) = 29`，收益可观。
+  ![src1_profiling](./img/transpose_08.png)
+  ![dst1_profiling](./img/transpose_09.png)
+
+  **`src2 VS dst2`** 性能提升结果：性能提升`55657/(417+529) = 59`，收益可观。
+  ![src2_profiling](./img/transpose_10.png)
+  ![dst2_profiling](./img/transpose_11.png)
+
+## 3.3. 案例三：连续排列的维度可以合并成一个维度
+
+本案例摘自[知乎博文](https://zhuanlan.zhihu.com/p/425587014)，仅供参考。
+```python
+import torch
+x = torch.randn(3, 4, 5, 6)
+# src
+y = x.permute(2, 3, 0, 1)     # y.shape = (5, 6, 3, 4)
+
+# dst
+y = x.reshape(3*4, 5*6
+              ).permute(1, 0).reshape(5, 6, 3, 4)
+```