From 00226dbc94c536fcbbe3cfd68e45cd8ad5e311be Mon Sep 17 00:00:00 2001 From: huanglei_Sorry Date: Wed, 22 Mar 2023 11:08:59 +0800 Subject: [PATCH 01/11] origin glm init --- PyTorch/contrib/nlp/GLM-130B/LICENSE | 201 ++++++++++ PyTorch/contrib/nlp/GLM-130B/MODEL_LICENSE | 33 ++ PyTorch/contrib/nlp/GLM-130B/README.md | 202 ++++++++++ PyTorch/contrib/nlp/GLM-130B/README_zh.md | 379 ++++++++++++++++++ PyTorch/contrib/nlp/GLM-130B/benchmark.py | 20 + .../nlp/GLM-130B/configs/model_glm_130b.sh | 15 + .../GLM-130B/configs/model_glm_130b_int4.sh | 16 + .../GLM-130B/configs/model_glm_130b_int8.sh | 16 + .../GLM-130B/configs/model_glm_130b_v100.sh | 17 + PyTorch/contrib/nlp/GLM-130B/cuda/Makefile | 22 + .../contrib/nlp/GLM-130B/cuda/quantization.cu | 81 ++++ .../GLM-130B/docs/evaluate-your-own-tasks.md | 86 ++++ .../docs/inference-with-fastertransformer.md | 156 +++++++ .../GLM-130B/docs/low-resource-inference.md | 28 ++ .../contrib/nlp/GLM-130B/docs/quantization.md | 66 +++ PyTorch/contrib/nlp/GLM-130B/evaluate.py | 67 ++++ .../nlp/GLM-130B/evaluation/__init__.py | 12 + .../nlp/GLM-130B/evaluation/configs.py | 59 +++ .../nlp/GLM-130B/evaluation/dataset.py | 371 +++++++++++++++++ .../nlp/GLM-130B/evaluation/metrics.py | 140 +++++++ .../contrib/nlp/GLM-130B/evaluation/model.py | 202 ++++++++++ .../contrib/nlp/GLM-130B/evaluation/tasks.py | 220 ++++++++++ .../contrib/nlp/GLM-130B/evaluation/utils.py | 67 ++++ PyTorch/contrib/nlp/GLM-130B/generate.py | 219 ++++++++++ .../nlp/GLM-130B/generation/__init__.py | 1 + .../nlp/GLM-130B/generation/strategies.py | 193 +++++++++ PyTorch/contrib/nlp/GLM-130B/initialize.py | 116 ++++++ .../contrib/nlp/GLM-130B/kernels/__init__.py | 99 +++++ .../nlp/GLM-130B/kernels/quantization.fatbin | Bin 0 -> 70696 bytes PyTorch/contrib/nlp/GLM-130B/logs/README.md | 5 + .../contrib/nlp/GLM-130B/logs/main-log-en.md | 251 ++++++++++++ PyTorch/contrib/nlp/GLM-130B/logs/main-log.md | 251 ++++++++++++ .../nlp/GLM-130B/quantization/__init__.py | 63 +++ .../nlp/GLM-130B/quantization/functional.py | 26 ++ .../nlp/GLM-130B/quantization/layers.py | 87 ++++ PyTorch/contrib/nlp/GLM-130B/requirements.txt | 6 + .../nlp/GLM-130B/resources/WechatGroup.jpeg | Bin 0 -> 551455 bytes .../nlp/GLM-130B/resources/multitask_list.txt | 70 ++++ .../contrib/nlp/GLM-130B/scripts/benchmark.sh | 20 + .../contrib/nlp/GLM-130B/scripts/evaluate.sh | 23 ++ .../scripts/evaluate_multiple_node.sh | 28 ++ .../contrib/nlp/GLM-130B/scripts/generate.sh | 38 ++ .../nlp/GLM-130B/tasks/bloom/glue_cola.yaml | 6 + .../nlp/GLM-130B/tasks/bloom/glue_mnli.yaml | 7 + .../nlp/GLM-130B/tasks/bloom/glue_qnli.yaml | 6 + .../nlp/GLM-130B/tasks/bloom/glue_wnli.yaml | 6 + .../nlp/GLM-130B/tasks/bloom/math_qa.yaml | 7 + .../nlp/GLM-130B/tasks/bloom/mc_taco.yaml | 6 + .../nlp/GLM-130B/tasks/bloom/openbook_qa.yaml | 7 + .../nlp/GLM-130B/tasks/bloom/pubmed_qa.yaml | 6 + .../GLM-130B/tasks/bloom/superglue_axb.yaml | 6 + .../GLM-130B/tasks/bloom/superglue_axg.yaml | 6 + .../GLM-130B/tasks/chinese/clue/afqmc.yaml | 4 + .../nlp/GLM-130B/tasks/chinese/clue/c3.yaml | 4 + .../GLM-130B/tasks/chinese/clue/cluewsc.yaml | 4 + .../GLM-130B/tasks/chinese/clue/cmnli.yaml | 4 + .../GLM-130B/tasks/chinese/clue/cmrc2018.yaml | 3 + .../nlp/GLM-130B/tasks/chinese/clue/csl.yaml | 4 + .../nlp/GLM-130B/tasks/chinese/clue/drcd.yaml | 3 + .../GLM-130B/tasks/chinese/clue/ocnli.yaml | 4 + .../GLM-130B/tasks/chinese/fewclue/bustm.yaml | 7 + .../GLM-130B/tasks/chinese/fewclue/chidf.yaml | 7 + .../tasks/chinese/fewclue/cluewscf.yaml | 7 + 
.../GLM-130B/tasks/chinese/fewclue/cslf.yaml | 7 + .../tasks/chinese/fewclue/eprstmt.yaml | 7 + .../tasks/chinese/fewclue/ocnlif.yaml | 7 + .../tasks/ethnic/crows-pair/crows-pair.yaml | 8 + .../GLM-130B/tasks/ethnic/crows-pair/tasks.py | 114 ++++++ .../ethnic/ethos/ethos-fewshot-multi.yaml | 7 + .../ethnic/ethos/ethos-fewshot-single.yaml | 7 + .../tasks/ethnic/ethos/ethos-oneshot.yaml | 7 + .../tasks/ethnic/ethos/ethos-zeroshot.yaml | 7 + .../tasks/ethnic/stereoset/stereoset.yaml | 9 + .../GLM-130B/tasks/ethnic/stereoset/tasks.py | 126 ++++++ .../tasks/lambada/lambada-unidirectional.yaml | 13 + .../nlp/GLM-130B/tasks/lambada/lambada.yaml | 12 + .../nlp/GLM-130B/tasks/lambada/strategy.py | 21 + .../nlp/GLM-130B/tasks/lambada/task.py | 61 +++ .../GLM-130B/tasks/language-modeling/pile.py | 83 ++++ .../tasks/language-modeling/pile.yaml | 10 + .../GLM-130B/tasks/language-modeling/ptb.yaml | 8 + .../tasks/language-modeling/wikitext-103.yaml | 8 + .../tasks/language-modeling/wikitext-2.yaml | 8 + .../contrib/nlp/GLM-130B/tasks/mmlu/mmlu.yaml | 10 + .../contrib/nlp/GLM-130B/tasks/mmlu/task.py | 78 ++++ .../contrib/nlp/GLM-130B/tools/__init__.py | 0 .../contrib/nlp/GLM-130B/tools/convert_tp.py | 154 +++++++ .../nlp/GLM-130B/tools/tokenize_pile.py | 24 ++ 88 files changed, 4882 insertions(+) create mode 100644 PyTorch/contrib/nlp/GLM-130B/LICENSE create mode 100644 PyTorch/contrib/nlp/GLM-130B/MODEL_LICENSE create mode 100644 PyTorch/contrib/nlp/GLM-130B/README.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/README_zh.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/benchmark.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int4.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int8.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_v100.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/cuda/Makefile create mode 100644 PyTorch/contrib/nlp/GLM-130B/cuda/quantization.cu create mode 100644 PyTorch/contrib/nlp/GLM-130B/docs/evaluate-your-own-tasks.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/docs/inference-with-fastertransformer.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/docs/low-resource-inference.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/docs/quantization.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluate.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/__init__.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/configs.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/dataset.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/metrics.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/model.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/tasks.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/evaluation/utils.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/generate.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/generation/__init__.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/generation/strategies.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/initialize.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/kernels/__init__.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/kernels/quantization.fatbin create mode 100644 PyTorch/contrib/nlp/GLM-130B/logs/README.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/logs/main-log-en.md create mode 100644 PyTorch/contrib/nlp/GLM-130B/logs/main-log.md create mode 100644 
PyTorch/contrib/nlp/GLM-130B/quantization/__init__.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/quantization/functional.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/quantization/layers.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/requirements.txt create mode 100644 PyTorch/contrib/nlp/GLM-130B/resources/WechatGroup.jpeg create mode 100644 PyTorch/contrib/nlp/GLM-130B/resources/multitask_list.txt create mode 100644 PyTorch/contrib/nlp/GLM-130B/scripts/benchmark.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/scripts/evaluate.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/scripts/evaluate_multiple_node.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/scripts/generate.sh create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/glue_cola.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/glue_mnli.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/glue_qnli.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/glue_wnli.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/math_qa.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/mc_taco.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/openbook_qa.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/pubmed_qa.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/superglue_axb.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/bloom/superglue_axg.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/afqmc.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/c3.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/cluewsc.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/cmnli.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/cmrc2018.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/csl.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/drcd.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/clue/ocnli.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/fewclue/bustm.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/fewclue/chidf.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/fewclue/cluewscf.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/fewclue/cslf.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/fewclue/eprstmt.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/chinese/fewclue/ocnlif.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/crows-pair/crows-pair.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/crows-pair/tasks.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/ethos/ethos-fewshot-multi.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/ethos/ethos-fewshot-single.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/ethos/ethos-oneshot.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/ethos/ethos-zeroshot.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/stereoset/stereoset.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/ethnic/stereoset/tasks.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/lambada/lambada-unidirectional.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/lambada/lambada.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/lambada/strategy.py create mode 100644 
PyTorch/contrib/nlp/GLM-130B/tasks/lambada/task.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/language-modeling/pile.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/language-modeling/pile.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/language-modeling/ptb.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/language-modeling/wikitext-103.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/language-modeling/wikitext-2.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/mmlu/mmlu.yaml create mode 100644 PyTorch/contrib/nlp/GLM-130B/tasks/mmlu/task.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tools/__init__.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tools/convert_tp.py create mode 100644 PyTorch/contrib/nlp/GLM-130B/tools/tokenize_pile.py diff --git a/PyTorch/contrib/nlp/GLM-130B/LICENSE b/PyTorch/contrib/nlp/GLM-130B/LICENSE new file mode 100644 index 0000000000..dffead34fd --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright Aohan Zeng + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/PyTorch/contrib/nlp/GLM-130B/MODEL_LICENSE b/PyTorch/contrib/nlp/GLM-130B/MODEL_LICENSE new file mode 100644 index 0000000000..d1eb47b011 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/MODEL_LICENSE @@ -0,0 +1,33 @@ +The GLM-130B License + +1. Definitions + +“Licensor” means the GLM-130B Model Team that distributes its Software. + +“Software” means the GLM-130B model parameters made available under this license. + +2. License Grant + +Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +3. Restriction + +You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. + +You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. + +4. Disclaimer + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +5. Limitation of Liability + +EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +6. Dispute Resolution + +This license shall be governed and construed in accordance with the laws of People’s Republic of China. 
Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. + +Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com. diff --git a/PyTorch/contrib/nlp/GLM-130B/README.md b/PyTorch/contrib/nlp/GLM-130B/README.md new file mode 100644 index 0000000000..fd00a1287e --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/README.md @@ -0,0 +1,202 @@ + + +

+ 🌐 Blog • ⏬ Download Model • 🪧 Demo • ✉️ Email • 📃 Paper [ICLR 2023]
+

+ +

+ 💬 Google Group (Updates) or Wechat Group or Slack channel (Discussions) +

+
+# GLM-130B: An Open Bilingual Pre-Trained Model
+
+GLM-130B is an open bilingual (English & Chinese) bidirectional dense model with 130 billion parameters, pre-trained using the algorithm of [General Language Model (GLM)](https://aclanthology.org/2022.acl-long.26). It is designed to support inference with the full 130B parameters on **a single A100 (40G * 8)** or **V100 (32G * 8) server**. With INT4 quantization, the hardware requirements can be further reduced to **a single server with 4 * RTX 3090 (24G)** with **almost no performance degradation**. As of July 3rd, 2022, GLM-130B has been trained on over 400 billion text tokens (200B each for Chinese and English) and it has the following unique features:
+
+- **Bilingual:** supports both English and Chinese.
+- **Performance (EN):** better than GPT-3 175B (+4.0%), OPT-175B (+5.5%), and BLOOM-176B (+13.0%) on LAMBADA and slightly better than GPT-3 175B (+0.9%) on MMLU.
+- **Performance (CN):** significantly better than ERNIE TITAN 3.0 260B on 7 zero-shot CLUE datasets (+24.26%) and 5 zero-shot FewCLUE datasets (+12.75%).
+- **Fast Inference:** supports fast inference on both [SAT](https://github.com/THUDM/SwissArmyTransformer) and [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) (up to 2.5X faster) with a single A100 server.
+- **Reproducibility:** all results (30+ tasks) can be easily reproduced with the open-sourced code and model checkpoints.
+- **Cross-Platform:** supports training and inference on NVIDIA, Hygon DCU, Ascend 910, and Sunway (will be released soon).
+
+This repository mainly focuses on the evaluation of GLM-130B; the training part is open for research purposes, so please send an email to glm-130b@googlegroups.com to apply for access. If you find our work and our open-sourced efforts useful, please give the project a ⭐️ to encourage our future development! :)
+
+## News
+
+- **[2023.01.21]** GLM-130B has been accepted to [ICLR 2023](https://iclr.cc/Conferences/2023)!
+- **[2022.10.06]** Our [paper](http://arxiv.org/abs/2210.02414) for GLM-130B is out!
+- **[2022.08.24]** We are proud to publish the quantized version of GLM-130B. While preserving the activation precision as FP16, the model weights can be quantized to as low as **INT4 with almost no degradation of performance**, further reducing the hardware requirements of GLM-130B to **a single server with 4 * RTX 3090 (24G)**! See [Quantization of GLM-130B](docs/quantization.md) for details.
+
+For smaller models, please find [monolingual GLMs](https://github.com/THUDM/GLM) (English: 10B/2B/515M/410M/335M/110M, Chinese: 10B/335M) and a [1B multilingual GLM](https://github.com/THUDM/Multilingual-GLM) (104 languages).
+
+## Getting Started
+
+### Environment Setup
+
+#### Hardware
+
+| **Hardware**    | **GPU Memory** | **Quantization** | **Weight Offload** |
+| --------------- | -------------- | ---------------- | ------------------ |
+| 8 * A100        | 40 GB          | No               | No                 |
+| 8 * V100        | 32 GB          | No               | Yes (BMInf)        |
+| 8 * V100        | 32 GB          | INT8             | No                 |
+| 8 * RTX 3090    | 24 GB          | INT8             | No                 |
+| 4 * RTX 3090    | 24 GB          | INT4             | No                 |
+| 8 * RTX 2080 Ti | 11 GB          | INT4             | No                 |
+
+It is recommended to use an A100 (40G * 8) server, as all reported GLM-130B evaluation results (~30 tasks) can be easily reproduced with a single A100 server in about half a day. With INT8/INT4 quantization, efficient inference on **a single server with 4 * RTX 3090 (24G)** is possible; see [Quantization of GLM-130B](docs/quantization.md) for details.
Combining quantization and weight offloading techniques, GLM-130B can also run inference on servers with even smaller GPU memory; see [Low-Resource Inference](docs/low-resource-inference.md) for details.
+
+#### Software
+
+The GLM-130B code is built on top of [SAT](https://github.com/THUDM/SwissArmyTransformer). We recommend using [Miniconda](https://docs.conda.io/en/latest/miniconda.html) to manage your environment and installing additional dependencies via `pip install -r requirements.txt`. Here are the recommended environment configurations:
+
+- Python 3.9+ / CUDA 11+ / PyTorch 1.10+ / DeepSpeed 0.6+ / Apex (**installation with CUDA and C++ extensions is required, see [here](https://github.com/NVIDIA/apex/#linux)**)
+- SwissArmyTransformer>=0.2.11 is required for quantization
+
+#### Model weights
+
+Download the GLM-130B model checkpoint from [here](https://docs.google.com/forms/d/e/1FAIpQLSehr5Dh_i3TwACmFFi8QEgIVNYGmSPwV0GueIcsUev0NEfUug/viewform?usp=sf_link), make sure all 60 chunks are downloaded completely, then use the following commands to merge them into a single archive file and extract it:
+
+```bash
+cat glm-130b-sat.tar.part_* > glm-130b-sat.tar
+tar xvf glm-130b-sat.tar
+```
+
+Set `CHECKPOINT_PATH` in `configs/model_glm_130b.sh` to the path of the extracted folder. Since the checkpoint file is up to 260G, it is recommended to use an SSD or a RAM disk to reduce the checkpoint loading time. Since the checkpoint we distribute is split with 8-way tensor parallelism, a conversion script is also provided if you need to change the tensor parallel dimension:
+
+```bash
+python tools/convert_tp.py \
+    --input-folder <SRC_FOLDER> \
+    --output-folder <DST_FOLDER> \
+    --target-tp <TARGET_TP>
+```
+
+### Left-To-Right Generation / Blank Filling
+
+```bash
+bash scripts/generate.sh --input-source interactive
+```
+
+You can also specify an input file by `--input-source input.txt`.
+
+GLM-130B uses two different mask tokens: `[MASK]` for short blank filling and `[gMASK]` for left-to-right long text generation. When the input does not contain any MASK token, `[gMASK]` will be automatically appended to the end of the text.
+
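+For a quick non-interactive test, you can put prompts into a plain text file and pass it to the same script. The file name and prompts below are illustrative only (the prompts are taken from the examples in this README), and we assume one prompt per line:
+
+```bash
+# Write two example prompts: a short blank-filling prompt using [MASK]
+# and a left-to-right generation prompt using [gMASK].
+cat > input.txt <<'EOF'
+Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ).
+Who is the greatest artist? The greatest artist is [gMASK]
+EOF
+
+# Run generation on the file instead of the interactive CLI.
+bash scripts/generate.sh --input-source input.txt
+```
+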
+Left-To-Right Generation Examples + +#### Example 1 +Input: Who is the greatest artist? The greatest artist is [gMASK] + +Output: Who is the greatest artist? The greatest artist is not the one who paints the most beautiful picture, or writes the best poem, or plays the best symphony. The greatest artist is the one whose work has the greatest impact on the lives of others. The Greatest Artist is the artist who inspires us to be our best selves. The Great Artist is not the artist whose work is flawless. The great artist is not one who is perfect. The true great artist has the ability to make us feel something. The artist who makes us feel is the great artist. The art of the great artists is to make you feel something that you have never felt before. The greatness of an artist is in the ability of their work to touch the lives and hearts of others, and to make them feel something they have never experienced before. This is the true greatness of the artist. 2. The Significance of the Great Artist The significance of the greatest artist cannot be overstated. The significance and impact of the art of a great artist cannot even be measured. The impact of a single work of art can change the course of history. A single work can change a person’s life. A work of great art can make you laugh, make you cry, make your heart race, or make you think. A great work of artistic expression has the power to change the way you look at the world. The work of a truly great artist can change your life forever. 3. The Impact of the Greatest Artist The greatest artists have the ability and the power of their art to change people’s lives. They have the power and the ability, through their art, to make people feel something new. The power of the work of the truly great artists cannot be measured, but it can be felt. The greater the impact of an artists work, the greater the power it has to change lives. 4. The Influence of the Art of the Artist The art and the work created by the greatest artists has the potential to change a life. The influence of the artists work cannot be quantified, but is felt by those who experience it. The truly great work has an ability to touch people’ lives in a way that no other work can. + +#### Example 2 (Chinese) +Input: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] + +Output: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:避寒的话,建议去海南三亚。 三亚位于海南岛的最南端,是中国最南部的热带滨海旅游城市,中国海滨城市,是中国空气质量最好的城市、全国最长寿地区(平均寿命80岁)。三亚市别称鹿城,又被称为"东方夏威夷",位居中国四大一线旅游城市"三威杭厦"之首,拥有全岛最美丽的海滨风光。三亚拥有被无数城市嫉妒的清新空气,柔和海滩。在沙滩悠闲散步、沐浴傍晚温和阳光,在海边玩耍,在雨林里面呼吸健康,欣赏自然奇观,一切都是那么令人享受。 气候 三亚地处热带,属热带季风气候区域,终年气温高,寒暑变化不大,年平均气温25.5度。三亚最热的月份是6月,平均气温为28.5°C,极端最高气温为35.7°C ,三亚没有明显的冬季,而且冬季更是到海滨旅游的最好时候,所以三亚是全年全天候避寒、消暑、度假、旅游的好地方。不过,三亚最旺的旅游时间是从10月份到农历春节,这个时候的人特别多,房间也不好订,建议最好避开高峰。三亚的三个旅游高峰期是"五一"、"十一"、"春节",这三个节日期间的房价都会上浮几倍,如果你选择了这三个假期来三亚的话要提前一个星期定房和咨询。 +
+ +
+Blank Filling Examples + +#### Example 1 +Input: Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai. + +Output: Ng is an adjunct professor at Stanford University (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.
+ +#### Example 2 (Chinese) +Input: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。 + +Output: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念拿破仑胜利而建,门高25米,顶上矗立两武士青铜古兵车铸像。 +
+ +
+
+Arguments Useful in Generation
+
+- `--input-source [path] or "interactive"` The input file's path. It can also be "interactive", which will launch an interactive CLI.
+- `--output-path [path]` The folder to which results are saved.
+- `--out-seq-length [int]` The maximum sequence length for generation (including the context).
+- `--min-gen-length [int]` The minimum generation length for each MASK.
+- `--sampling-strategy "BaseStrategy" or "BeamSearchStrategy"` The sampling strategy used.
+  - For BeamSearchStrategy:
+    - `--num-beams [int]` The number of beams.
+    - `--length-penalty [float]` The length penalty applied to the generation (including the context), in the range [0, 1]; larger values encourage longer outputs.
+    - `--no-repeat-ngram-size [int]` Prohibit repeated n-grams of this size in the generation.
+    - `--print-all-beam` Print the generated results for all beams.
+  - For BaseStrategy:
+    - `--top-k [int]` Top-k sampling.
+    - `--top-p [float]` Top-p (nucleus) sampling.
+    - `--temperature [float]` The sampling temperature.
+
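+As a concrete illustration of how these flags fit together, here is a hypothetical invocation; the flag values are arbitrary examples chosen only to demonstrate the options documented above, not recommended settings:
+
+```bash
+# Beam-search generation from an input file, blocking repeated 3-grams.
+bash scripts/generate.sh \
+    --input-source input.txt \
+    --output-path samples \
+    --out-seq-length 512 \
+    --min-gen-length 32 \
+    --sampling-strategy BeamSearchStrategy \
+    --num-beams 4 \
+    --length-penalty 0.9 \
+    --no-repeat-ngram-size 3
+```
+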
+
+### Evaluation
+
+We use YAML files to define tasks. Specifically, you can add multiple tasks or folders at a time for evaluation, and the evaluation script will automatically collect all YAML files under those folders recursively.
+
+```
+bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ...
+```
+
+Download our evaluation dataset [here](https://cloud.tsinghua.edu.cn/f/826f0df4356f4022a264/), and set `DATA_PATH` in `scripts/evaluate.sh` to your local dataset directory. The `tasks` folder contains the YAML files for the 30+ tasks we evaluated for GLM-130B. Take the [CoLA](https://nyu-mll.github.io/CoLA/) task as an example: run `bash scripts/evaluate.sh tasks/bloom/glue_cola.yaml`, which outputs an accuracy of ~65% for the best prompt and ~57% for the median.
+
+Expected Output + +```plain +MultiChoiceTaskConfig(name='glue_cola', type=, path='/thudm/LargeScale/data/zeroshot/bloom/glue_cola', module=None, metrics=['Accuracy'], use_task_mask=False, use_multitask_encoding=False, unidirectional=False, max_seq_length=2048, file_pattern={'validation': '**/validation.jsonl'}, micro_batch_size=8) +Evaluating task glue_cola: + Evaluating group validation: + Finish Following_sentence_acceptable/mul/validation.jsonl, Accuracy = 42.665 + Finish Make_sense_yes_no/mul/validation.jsonl, Accuracy = 56.951 + Finish Previous_sentence_acceptable/mul/validation.jsonl, Accuracy = 65.197 + Finish editing/mul/validation.jsonl, Accuracy = 57.622 + Finish is_this_correct/mul/validation.jsonl, Accuracy = 65.197 +Evaluation results of task glue_cola: + Group validation Accuracy: max = 65.197, median = 57.622, average = 57.526 +Finish task glue_cola in 101.2s. +``` +
+
+Multi-node evaluation can be configured by setting `HOST_FILE_PATH` (required by the [DeepSpeed launcher](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node)) in `scripts/evaluate_multiple_node.sh`. Set `DATA_PATH` in `scripts/evaluate_multiple_node.sh` and run the following command to evaluate all the tasks in the `./tasks` directory.
+
+```
+bash scripts/evaluate_multiple_node.sh ./tasks
+```
+
+See [Evaluate Your Own Tasks](docs/evaluate-your-own-tasks.md) for details on how to add new tasks.
+
+### 2.5X Faster Inference Using FasterTransformer
+
+By adapting the GLM-130B model to [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), a highly optimized transformer model library by NVIDIA, we can reach up to a 2.5X speedup on generation; see [Inference with FasterTransformer](docs/inference-with-fastertransformer.md) for details.
+
+
+
+## License
+
+This repository is licensed under the [Apache-2.0 license](LICENSE). The use of GLM-130B model weights is subject to the [Model License](MODEL_LICENSE).
+
+## Citation
+
+If you find our work useful, please consider citing GLM-130B:
+
+```
+@inproceedings{
+  zeng2023glm-130b,
+  title={{GLM}-130B: An Open Bilingual Pre-trained Model},
+  author={Aohan Zeng and Xiao Liu and Zhengxiao Du and Zihan Wang and Hanyu Lai and Ming Ding and Zhuoyi Yang and Yifan Xu and Wendi Zheng and Xiao Xia and Weng Lam Tam and Zixuan Ma and Yufei Xue and Jidong Zhai and Wenguang Chen and Zhiyuan Liu and Peng Zhang and Yuxiao Dong and Jie Tang},
+  booktitle={The Eleventh International Conference on Learning Representations (ICLR)},
+  year={2023},
+  url={https://openreview.net/forum?id=-Aw0rrrPUF}
+}
+```
+
+You may also consider citing GLM's original work:
+
+```
+@inproceedings{du2022glm,
+  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages={320--335},
+  year={2022}
+}
+```
diff --git a/PyTorch/contrib/nlp/GLM-130B/README_zh.md b/PyTorch/contrib/nlp/GLM-130B/README_zh.md
new file mode 100644
index 0000000000..4a57b91c65
--- /dev/null
+++ b/PyTorch/contrib/nlp/GLM-130B/README_zh.md
@@ -0,0 +1,379 @@
+
+
+

+ 🌐 博客 • ⏬ 下载模型 • 🪧 样例演示 • 💬 讨论 • ✉️ 邮箱 • 💬 谷歌群组 or 微信群 + • 📃 论文(敬请期待)
+

+ +# GLM-130B:开放的中英双语预训练模型 + +## 摘要:何为 GLM-130B? + +GLM-130B 是一个开源开放的双语(中文和英文)双向稠密模型,拥有 1300 亿个参数,模型架构采用通用语言模型(GLM)。它旨在支持在**一台 A100(40G * 8)** 或 **V100(32G * 8)服务器**上对千亿规模的参数进行推理。截至 2022 年 7 月 3 日,GLM-130B 已经对超过 4000 亿个文本标识符(中文和英文各 2000 亿)进行了训练,它有以下独特优势: + +* **双语**:同时支持中文和英文。 +* **任务表现(英文)**: 在 LAMBADA 上优于 GPT-3 175B(+4.0%)、OPT-175B(+5.5%)和 BLOOM-176B(+13.0%),在 MMLU 上略优于GPT-3 175B(+0.9%)。 +* **任务表现(中文)**:在 7 个零样本 CLUE 数据集(+24.26%)和 5 个零样本 FewCLUE 数据集(+12.75%)上明显优于 ERNIE TITAN 3.0 260B。 +* **快速推理**:支持用一台 A100 服务器使用 [SAT](https://github.com/THUDM/SwissArmyTransformer) 和 [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) 进行快速推理(速度最高可达2.5倍)。 +* **可复现性**:所有的结果(超过30个任务)都可以用我们开源的代码和模型参数轻松复现。 +* **多平台**:支持在 NVIDIA、Hygon DCU、Ascend 910 和 Sunway 处理器上进行训练与推理(代码即将开源)。 + +## 快速上手 + +### 环境配置 + +我们的代码是建立在 [SAT](https://github.com/THUDM/SwissArmyTransformer) 之上的。我们推荐使用 Miniconda 来管理环境并通过 `pip install -r requirements.txt` 来安装额外的依赖包。以下是我们推荐的环境配置: + +- Python 3.9+ / PyTorch 1.10+ / DeepSpeed 0.6+ / Apex(**需要安装包含 CUDA 和 C++ 扩展的版本,[参考资料](https://github.com/NVIDIA/apex/#linux)**) + +建议使用 A100(40G * 8)服务器,因为所有报告的评估结果(约30个任务)都可以用一台 A100 服务器在大约半天内轻松再现。GLM-130B 也可以在具有较小 GPU 内存的服务器上进行推断,例如具有 V100(32G * 8)的服务器。详见 [Low-resource Inference](docs/low-resource-inference.md)。 + +从 [这里](https://models.aminer.cn/glm/zh-CN/download/GLM-130B) 申请下载 GLM-130B 的模型检查点,确保所有 60 个块都已完全下载,然后使用以下命令将它们合并为一个存档文件并解压缩: + +```bash +cat glm-130b-sat.tar.part_* > glm-130b-sat.tar +tar xvf glm-130b-sat.tar +``` + +将 `configs/model_glm_130b.sh` 中的 `CHECKPOINT_PATH` 设置为解压文件夹的路径。 由于 checkpoint 文件最大 260G,建议使用 SSD 或 RAM 盘来减少 checkpoint 加载时间。 + +### 自回归文本生成 / 中间文本填空 + +```bash +bash scripts/generate.sh --input-source interactive +``` + +你也可以通过 `--input-source input.txt` 指定一个输入文件。 + +GLM-130B 使用两种不同的掩码标记:`[MASK]` 用于简短的填空任务,`[gMASK]` 用于从左到右的长文本生成。当输入不包含任何 MASK 标记时,`[gMASK]` 将被自动附加到文本的末尾。 + +
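+
+作为补充,也可以把若干条输入写入一个纯文本文件,再以非交互方式运行生成脚本。下面的文件名与输入内容仅为示例(取自本文档中的样例),这里假设每行一条输入:
+
+```bash
+# 写入两条示例输入:一条用 [MASK] 做短文本填空,一条用 [gMASK] 做长文本生成。
+cat > input.txt <<'EOF'
+凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。
+问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK]
+EOF
+
+# 从文件读取输入进行生成
+bash scripts/generate.sh --input-source input.txt
+```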
+长文本生成样例 + +#### Example 1 +Input: Who is the greatest artist? The greatest artist is [gMASK] + +Output: Who is the greatest artist? The greatest artist is not the one who paints the most beautiful picture, or writes the best poem, or plays the best symphony. The greatest artist is the one whose work has the greatest impact on the lives of others. The Greatest Artist is the artist who inspires us to be our best selves. The Great Artist is not the artist whose work is flawless. The great artist is not one who is perfect. The true great artist has the ability to make us feel something. The artist who makes us feel is the great artist. The art of the great artists is to make you feel something that you have never felt before. The greatness of an artist is in the ability of their work to touch the lives and hearts of others, and to make them feel something they have never experienced before. This is the true greatness of the artist. 2. The Significance of the Great Artist The significance of the greatest artist cannot be overstated. The significance and impact of the art of a great artist cannot even be measured. The impact of a single work of art can change the course of history. A single work can change a person’s life. A work of great art can make you laugh, make you cry, make your heart race, or make you think. A great work of artistic expression has the power to change the way you look at the world. The work of a truly great artist can change your life forever. 3. The Impact of the Greatest Artist The greatest artists have the ability and the power of their art to change people’s lives. They have the power and the ability, through their art, to make people feel something new. The power of the work of the truly great artists cannot be measured, but it can be felt. The greater the impact of an artists work, the greater the power it has to change lives. 4. The Influence of the Art of the Artist The art and the work created by the greatest artists has the potential to change a life. The influence of the artists work cannot be quantified, but is felt by those who experience it. The truly great work has an ability to touch people’ lives in a way that no other work can. + +#### Example 2 (Chinese) +Input: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] + +Output: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:避寒的话,建议去海南三亚。 三亚位于海南岛的最南端,是中国最南部的热带滨海旅游城市,中国海滨城市,是中国空气质量最好的城市、全国最长寿地区(平均寿命80岁)。三亚市别称鹿城,又被称为"东方夏威夷",位居中国四大一线旅游城市"三威杭厦"之首,拥有全岛最美丽的海滨风光。三亚拥有被无数城市嫉妒的清新空气,柔和海滩。在沙滩悠闲散步、沐浴傍晚温和阳光,在海边玩耍,在雨林里面呼吸健康,欣赏自然奇观,一切都是那么令人享受。 气候 三亚地处热带,属热带季风气候区域,终年气温高,寒暑变化不大,年平均气温25.5度。三亚最热的月份是6月,平均气温为28.5°C,极端最高气温为35.7°C ,三亚没有明显的冬季,而且冬季更是到海滨旅游的最好时候,所以三亚是全年全天候避寒、消暑、度假、旅游的好地方。不过,三亚最旺的旅游时间是从10月份到农历春节,这个时候的人特别多,房间也不好订,建议最好避开高峰。三亚的三个旅游高峰期是"五一"、"十一"、"春节",这三个节日期间的房价都会上浮几倍,如果你选择了这三个假期来三亚的话要提前一个星期定房和咨询。 +
+ +
+文本填空样例 + +#### Example 1 +Input: Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai. + +Output: Ng is an adjunct professor at Stanford University (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.
+ +#### Example 2 (Chinese) +Input: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。 + +Output: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念拿破仑胜利而建,门高25米,顶上矗立两武士青铜古兵车铸像。 +
+ + +
+
+控制生成的主要超参数
+
+- `--input-source [path] or "interactive"`. 输入文件的路径。当设为 "interactive" 时,将会启动交互式 CLI。
+- `--output-path [path]`. 结果输出路径。
+- `--out-seq-length [int]`. (包括输入内容在内的)最大输出序列长度。
+- `--min-gen-length [int]`. 每个 MASK 标识符位置的最小生成长度。
+- `--sampling-strategy "BaseStrategy" or "BeamSearchStrategy"`. 生成的采样策略。
+  - 对于 BeamSearchStrategy(集束搜索):
+    - `--num-beams [int]`. 集束数目。
+    - `--length-penalty [float]`. (包括输入内容在内的)生成长度惩罚项;数值范围 [0, 1],数值越大生成长度越长。
+    - `--no-repeat-ngram-size [int]`. 禁止重复生成的 n-gram 长度。
+    - `--print-all-beam`. 是否打印每一束搜索结果。
+  - 对于 BaseStrategy:
+    - `--top-k [int]`. Top-k 采样。
+    - `--top-p [float]`. Top-p 采样。
+    - `--temperature [float]`. 采样时设置的温度项。
+
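+下面给出一个组合使用上述参数的示意性命令;各参数取值只是示例,仅用于说明写法,并非推荐配置:
+
+```bash
+# 使用 BaseStrategy(top-k / top-p 采样)从输入文件生成
+bash scripts/generate.sh \
+    --input-source input.txt \
+    --output-path samples \
+    --out-seq-length 256 \
+    --sampling-strategy BaseStrategy \
+    --top-k 100 \
+    --top-p 0.7 \
+    --temperature 0.9
+```
+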
+ +### 评估 + +我们使用YAML文件来定义任务。具体来说,你可以一次添加多个任务或文件夹进行评估,评估脚本会自动递归地收集这些文件夹下的所有YAML文件。 + +``` +bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... +``` + +[从这里](https://cloud.tsinghua.edu.cn/f/9257ee84045644b8ac06/)下载我们的评估数据集,并在 `scripts/evaluate.sh` 中设置 `DATA_PATH` 为你的本地数据集目录。任务文件夹包含我们为 GLM-130B 评估的 30 多个任务的 YAML 文件。以 [CoLA](https://nyu-mll.github.io/CoLA/) 任务为例,运行 `bash scripts/evaluate.sh tasks/bloom/glue_cola.yaml`,其输出的最佳提示准确率约为 65%,中值约为 57%。 + +
+预期输出 + +```plain +MultiChoiceTaskConfig(name='glue_cola', type=, path='/thudm/LargeScale/data/zeroshot/bloom/glue_cola', module=None, metrics=['Accuracy'], use_task_mask=False, use_multitask_encoding=False, unidirectional=False, max_seq_length=2048, file_pattern={'validation': '**/validation.jsonl'}, micro_batch_size=8) +Evaluating task glue_cola: + Evaluating group validation: + Finish Following_sentence_acceptable/mul/validation.jsonl, Accuracy = 42.665 + Finish Make_sense_yes_no/mul/validation.jsonl, Accuracy = 56.951 + Finish Previous_sentence_acceptable/mul/validation.jsonl, Accuracy = 65.197 + Finish editing/mul/validation.jsonl, Accuracy = 57.622 + Finish is_this_correct/mul/validation.jsonl, Accuracy = 65.197 +Evaluation results of task glue_cola: + Group validation Accuracy: max = 65.197, median = 57.622, average = 57.526 +Finish task glue_cola in 101.2s. +``` +
+ +可以通过在 `scripts/evaluate_multiple_node.sh` 中设置 `HOST_FILE_PATH`([DeepSpeed lanucher](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) 要求)来配置多节点评估。在 `scripts/evaluate_multiple_node.sh` 中设置 `DATA_PATH` 并运行以下命令来评估`./task`目录中的所有任务。 + +``` +bash scripts/evaluate_multiple_node.sh ./tasks +``` + +关于如何添加新任务的细节,请参见 [评估你自己的任务](docs/evaluate-your-own-tasks.md)。 + +### 使用 FasterTransformer 加速推理速度(高达 2.5 倍) + +- 通过将 GLM-130B 模型与 [FasterTransfomer](https://github.com/NVIDIA/FasterTransformer)(NVIDIA 高度优化的 Transformer 模型库)相适应,我们可以在生成时达到 2.5 倍的速度,详见 [Inference with FasterTransformer](docs/inference-with-fastertransformer.md) 。 + + +## 何为GLM-130B? + +GLM-130B是一个开放的双语(中文与英文)双向语言模型,含1300亿个参数。截至2022年7月,它已经训练了超过4000亿个文本标记。它的底层架构基于[通用语言模型(GLM)](https://aclanthology.org/2022.acl-long.26/),在语言理解和语言生成任务上均展示出强大的性能。 + +### 架构 + +GLM-130B将BERT和GPT的目标进行了统一,并与最近提出的一些技术进行结合以提升语言模型的性能表现。 + +#### 1\. 训练目标:自回归文本填空 + +GLM利用自回归文本填空作为其主要的预训练目标。它掩盖了随机的连续跨度(例如,下面的例子中的 "complete unknown"),并对其进行自回归预测。上下文之间的注意力(例如,"like a [MASK], like a rolling stone")是双向的。相反,被掩盖的标记之间的注意力,和从上下文到被掩盖的标识符的注意力是自回归掩码的。 + + + +在GLM-130B的实现中,有两种不同的MASK标识符,表示两个不同的目的: + +* `[MASK]`根据[泊松分布](https://en.wikipedia.org/wiki/Poisson_distribution) (λ=3)对输入中标识符进行短跨度的采样; +* `[gMASK]`掩盖一个长的跨度,从其位置到整个文本的结束。 + +`[sop]`标识符表示一个片断的开始,`[eop]`表示一个片断的结束。这两个目标在GLM-130B的预训练中是混合的,分别占预训练标记的30%和70%。 + +| | +|:--:| +| *例如:GLM-130B是如何对 `"like a complete unknown, like a rolling stone"`进行预训练的* | + +#### 2\. 位置编码:旋转位置编码 + +GLM-130B使用[旋转位置编码(RoPE)](https://arxiv.org/abs/2104.09864),谷歌的[PaLM](https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html)和[ElutherAI](https://www.eleuther.ai/)的GPT-*系列也采用这种编码。RoPE是一种相对位置编码,它利用复数空间的正交投影矩阵来表示标识符的相对距离。还有其他的相对位置编码选项,如Bigscience的[BLOOM](https://huggingface.co/bigscience/bloom)所使用的[AliBi](https://arxiv.org/abs/2108.12409)。但在我们的初步实验中,我们发现。 + +* 当序列长度增长时,RoPE的实现速度更快。 +* RoPE对双向注意力更友好,在下游微调实验中效果更好 + +因此,对于GLM-130B,RoPE是一种有效的、高效的位置编码。 + +#### 3\. 归一化:使用DeepNet的Post-LN + +层归一化(LayerNorm,或LN)是transformer中的一个重要组成部分,其应用可以大大影响训练的稳定性和性能。BERT应用了Post-LN,这意味着LayerNorm是在添加残余分支后应用的。然而,[后续工作](https://arxiv.org/abs/2002.04745)表明,单纯的Post-LN会导致预训练的不稳定,因此现有的大规模模型都选择Pre-LN架构,即在添加残差分支之前应用LayerNorm。 + +| | +|:--:| +| *(a) Post-LN在下游任务中表现更佳;(b) Post-LN + DeepNorm 比 Sandwich-LN 要更加稳定* | + +尽管如此,在现有的实践中,Pre-LN在用FP16训练大规模模型时仍然可能不稳定。[OPT-175B](https://arxiv.org/abs/2205.01068)在训练崩溃时手动调整学习率;[BLOOM](https://huggingface.co/bigscience/bloom)使用BF16(仅适用于NVIDIA Ampere GPU:A100s和3090s)以获得更好的浮点精度来避免崩溃。[CogView](https://proceedings.neurips.cc/paper/2021/file/a4d92e2cd541fca87e4620aba658316d-Paper.pdf)提出了Sandwich-LN作为一种补救措施。更重要的是,[近期工作](https://aclanthology.org/2021.findings-acl.81.pdf)表明,与Post-LN相比,Pre-LN的下游微调性能更差。 + +考虑到所有这些因素,在GLM-130B中,我们决定使用Post-LN,并使用新提出的[DeepNorm](https://arxiv.org/abs/2203.00555)来克服不稳定性。DeepNorm的重点是改进初始化,可以帮助Post-LN变换器扩展到1000层以上。在我们的初步实验中,模型扩展到130B,Sandwich-LN的梯度在大约2.5k步时就会出现损失突变(导致损失发散),而带有DeepNorm的Post-Ln则保持健康并呈现出较小的梯度大小(即更稳定)。 + +#### 4\. 
前馈网络:Gated Linear Unit (GLU) + GeLU 激活 + +最近一些改进transformer结构的努力集中在前馈网络(FFN)上,包括用[GLU](https://arxiv.org/abs/1612.08083)(在PaLM中采用)和新提出的[门控注意单元(GAU)](https://arxiv.org/abs/2202.10447)取代它。 + +| | RTE | COPA | BoolQ | WSC | Average | +|------------------------------|------------|------------|------------|------------|---------| +| GLM-base (GeGLU-Sandwich_LN) | 71.00±0.61 | 77.00±1.63 | 77.24±0.43 | 78.21±1.81 | 75.08 | +| GLM-base (GAU-Pre_LN) | | | _diverged_ | | | +| GLM-base (GAU-Sandwich_LN) | 69.92±0.61 | 75.67±0.94 | 77.00±0.15 | 72.44±1.81 | 74.20 | +| GLN-base (FFN-Sandwich_LN) | 71.00±0.74 | 72.33±1.70 | 76.75±0.05 | 73.72±2.40 | 73.36 | + +我们在初步实验中通过对随机的50G中英文混合语料库进行GLM-base(110M)的预训练来测试它们。我们发现,虽然GLU和GAU可以比原始FFN实现更好,但GLU在训练中可以更好、更稳定。 + +因此,在GLM-130B的实现中,我们选择带有GeLU激活的GLU,即GeGLU。GeGLU需要三个投影矩阵;为了保持相同数量的参数,与只利用两个矩阵的FFN相比,我们将其隐藏状态减少到2/3。 + +#### 总结 + +基于以上所有设计,GLM-130B的参数配置为: + +| 层数 | 隐层维度 | GeGLU 隐层维度 | 注意力头数量 | 最大序列长度 | 词表大小 | +|--------|--------------|--------------------|-----------------|---------------------|-------------| +| 70 | 12,288 | 32,768 | 96 | 2,048 | 150,000 | + +该词表和分词器是基于[icetk](https://github.com/THUDM/icetk)实现的。icetk是一个统一的图像、中文和英文的多模态标记器。 + +### 训练 +训练大规模语言模型的最关键挑战是**训练的稳定性**,无一例外。GLM-130B的预训练持续了60天,使用96个DGX-A100(40G)节点,等价花费490万美元的云服务费用;如果训练在半路上失败,并无法恢复训练,那将是一个巨大的损失。 + +| | +|:--:| +| *所有模型都面临训练不稳定,它可能发生在预训练的开始、中间或结束阶段(图(a)和(b)分别取自OPT和BLOOM)* | + +不幸的是,据我们观察,大模型比我们认为的那些小模型更容易受到不可避免的噪音数据和意外涌现的梯度影响。原因是,在训练效率和稳定性之间存在着权衡: + +* **效率**:我们需要一个低精度的浮点格式(如FP16),以减少内存和计算成本; +* **稳定性**:低精度浮点格式容易出现溢出和下溢。 + +而为了平衡这两个要素,我们以及最近的开放性大型模型(如[OPT-175B](https://arxiv.org/abs/2205.01068)、[BLOOM](https://huggingface.co/bigscience/bloom))都付出了巨大的努力来寻找解决方案。在此,我们提出我们的答案。 + +#### 1\. 浮点数格式:FP16 混合精度 + +FP16混合精度已经成为主流大规模模型训练框架的默认选项,用于训练十亿到百亿规模的模型。但其仍太容易遇到精度问题。作为补救措施,NVIDIA Ampere GPU提供了BF16浮点格式(被[BLOOM](https://huggingface.co/bigscience/bloom)采用)来缓解这个问题。然而,BF16在其他平台上不被支持,这大大缩小了它在更广泛的应用中的潜力。 + +为了让更多开发者使用,GLM-130B仍然选择FP16作为其训练浮点格式。同时,这意味着GLM-130B将面临着更多的稳定性挑战。幸运的是,经过多次尝试,我们发现以下的训练策略最终有助于稳定GLM-130B的训练。 + +#### 2\. 嵌入层:梯度缩减 + +我们观察到,在训练的早期阶段,嵌入层的梯度范数明显比其他层大。根据经验,我们发现大多数训练崩溃都发生在其梯度范数激增之后。为了解决这个问题,[BLOOM](https://huggingface.co/bigscience/bloom)汇报了使用[嵌入归一化](https://openreview.net/pdf?id=rI7BL3fHIZq)(我们也发现它能稳定训练),但同时,其牺牲了相对较大的下游性能。 + +由于根本问题是输入嵌入层的急剧梯度,我们建议缩小输入嵌入层的梯度。实现起来相当简单。 + +```python +word_embedding = word_embedding * α + word_embedding.detach() * (1 - α) +``` + +这就把梯度缩小到`α`。在我们的实践中,我们发现`α=0.1`对GLM-130B是最好的。 + +| ![EmbeddingShrink.png](resources/03DF31017FE184DB45D41DFFC6F80EF0.png) | +|:--:| +| *(a) 嵌入层的梯度范数在早期阶段比其他部分大得多
(b) 嵌入梯度缩减的初步实验 (alpha=0.1)* | + +在我们的初步实验中,我们观察到,对于早期阶段的训练来说,缩小嵌入梯度并没有减缓收敛速度;相反,没有缩小梯度的模型会出现意外的尖峰,并在5k步左右出现训练崩溃的情况。 + +#### 3\. 注意力计算:FP32 Softmax + +梯度收缩是一种避免训练崩溃的事后技术。从本质上讲,崩溃是由异常的损失 "梯度"形成的,要么是由于噪声数据,要么是正向计算中的精度上溢或者下溢。 + +| ![scale.png](resources/7CB441707D1035B2890AA2164C5B6EAC.png) | +|:--:| +| *每个注意力头计算出的注意力得分有非常不同的数值范围(摘自[CogView](https://proceedings.neurips.cc/paper/2021/file/a4d92e2cd541fca87e4620aba658316d-Paper.pdf))* | + +我们观察到,在大型语言模型中,注意力的计算操作是最容易上溢或下溢的。[CogView](https://proceedings.neurips.cc/paper/2021/file/a4d92e2cd541fca87e4620aba658316d-Paper.pdf)显示,不同的注意力头对其注意力分数有非常不同的数值范围,有些注意力头计算出的平均分数可以达到+1e4或-1e-3。这种不同的数值范围会导致在softmax计算中FP16下的频繁上溢或下溢。CogView提出了精度瓶颈放松(PB-Relax)来缓解这个问题,它在做softmax之前扣除了每个头的注意力得分矩阵中的最大绝对值。 + +然而,事实证明,PB-Relax在GLM-130B的训练中很慢,可能是因为在96个大小为2048*2048的注意分数矩阵中寻找最大值和操作标量对CUDA内核不友好。最后,经过几周的艰苦探索,我们发现避免这一问题的最快和最简单的方法是在softmax计算中使用FP32。与完全的FP16计算相比,它几乎没有任何速度上的损失,但明显提高了训练的稳定性。 + + + +### 预训练数据 + +#### 自监督预训练 + +我们在2.5T网络爬取的语料上,对GLM-130B进行了预训练,包括英文1.2T来自Pile的语料和1.3T中文语料. + +#### 多任务指令预训练(Multi-Task Instruction Pre-Training,MIP) + +同时,[FLAN](https://arxiv.org/pdf/2109.01652.pdf)和[T0](https://arxiv.org/pdf/2110.08207.pdf)的最新进展表明,大规模语言模型的多提示多任务指令微调可以促进更好的零样本学习能力。此外,正如[T5](https://www.jmlr.org/papers/volume21/20-074/20-074.pdf?ref=https://githubhelp.com)和[ExT5](https://arxiv.org/pdf/2111.10952.pdf)所指出的,将多任务的下游数据合并到预训练中,甚至比多任务微调更有帮助。 + +因此,在GLM-130B的预训练中,我们包括了许多从自然语言理解到生成的提示数据集,作为自监督预训练的补充。我们设定95%的标记来自自监督的预训练语料,5%的训练标记来自MIP数据集。这些数据集是从[T0](https://arxiv.org/pdf/2110.08207.pdf)和[DeepStruct](https://arxiv.org/pdf/2205.10475.pdf)中收集和转换的。按照T0的做法,每个多提示数据集中的样本都应被截断到最大数量(一般来说,T0数据集为100k,DeepStruct数据集为200k)。 + +不幸的是,由于数据准备中的一个错误,在前20k个预训练步骤中,我们意外地包括了T0++的所有数据集(其中包括最初用于评估T0中零样本任务泛化的任务)、没有调成权重进行截断、并排除了所有DeepStruct数据集。虽然我们把这个问题在20000步时进行了修正,但GLM-130B似乎对训练样本的记忆非常好,直到50000步也没有出现大量遗忘的现象,因此我们在此提醒所有用户***切勿在这个[列表](resources/multitask_list.txt)的数据集上评估GLM-130B在零样本或少样本学习的性能。 + +## GLM-130B表现如何? + +众所周知,像[GPT-3](https://arxiv.org/pdf/2005.14165.pdf)这样的大规模语言模型是优秀的少样本和零样本学习器。与GPT-3和OPT-175B的零样本学习相比,GLM-130B有一些架构上的劣势。首先,它是一个双语语言模型,不能像GPT-3(350B tokens)和OPT-175B(350B tokens)那样看到很多英语标记(GLM-130B大概见到了200B 英文tokens)。第二,GLM-130B的参数比GPT-3(175B)和OPT-175B少。 + +尽管有这些缺点,GLM-130B仍有上述的许多技术改进,这可能会弥补其在零点学习性能方面的差距。 + +* **双向注意力**。GLM-130B是一个类似于BERT的双向模型,而现有的大型语言模型主要是GPT(单向的)。双向模型在语言理解和条件生成方面远远优于GPT。 +* **改进的架构设计**。GLM-130B采用了新的架构设计,包括GeGLU、RoPE和DeepNorm。这些技术已被证明可以提高语言模型的性能。 +* **多任务指令预训练**。正如[FLAN](https://arxiv.org/pdf/2109.01652.pdf)和[T0](https://arxiv.org/pdf/2110.08207.pdf)所指出的,多任务指令预训练有助于提高零样本学习性能。 + +从目前的中间结果来看,GLM-130B在中文与英文中都是一个强大的零样本学习器。具体来说,它的表现是 + +* 在英语中与GPT-3 175B相当。 +* 在英语中优于BLOOM-176B和OPT-175B。 +* 在中文方面比ERNIE 3.0 Titan(260B)更好。 + +```diff +- 请注意,本节中的所有结果目前都是中间结果,不代表最终性能。 +``` + +### 讨论:GLM-130B的零样本学习设置 + +由于GLM-130B利用了多任务指令预训练(MIP),我们认为有必要澄清我们对零样本学习的设定。该问题似乎没有官方认可的定义,而社区中也存在许多不同的解释。我们参考了影响力较大的零样本学习[综述](https://ieeexplore.ieee.org/abstract/document/8413121)中的定义,其指出。 + +``` +At test time, in zero-shot learning setting, the aim is to assign a test image to an unseen class label, and in generalized zero-shot learning setting, the test image can be assigned either to seen or unseen classes. 
+``` + +其中,被评估的任务是否涉及未见过的类标签是一个关键。考虑到NLP的实际情况,我们为GLM-130B零样本学习评估挑选数据集的原则如下。 + +* 英文 + + 对于有固定标签的任务(如自然语言推理):同一任务中的任何数据集都不应该被评估。 + + 对于没有固定标签的任务(例如,问题回答,主题分类):只应考虑:1)相比MIP中数据集具有明显的领域转移,且 2)与MIP中的标签不同的数据集 +* 中文:所有的数据集都可以被评估 + +我们欢迎更多关于这个话题的讨论,以促进整个社区对零样本学习的研究。 + +### 零样本学习:英文 + +我们在各种不同的下游任务中测试GLM-130B。请注意,我们仍在经历评估阶段;这些结果不是最终结果,而是**中间结果**。 + +#### 语言建模(LAMBADA) +语言建模测试的是语言模型在给定其前缀语境下预测下一个单词的内在能力。我们以[LAMBADA](https://aclanthology.org/P16-1144/)为例,它是一项具有挑战性的零样本末位单词预测任务,在评估现有大规模语言模型时被广泛采用。 + +我们绘制了GLM-130B的零样本LAMBADA(En)性能,以及GPT-3 175B、OPT 175B和BLOOM 176B(OPT和BLOOM的中间结果取自[BLOOM的评估库](https://github.com/bigscience-workshop/evaluation-results/tree/676f6a8cf27d4df30b073fb490deb9e359da64aa))。与其他三个使用上下文自回归的GPT式模型相比,我们提出了GLM-130B的两个版本。 + +* **GLM-130B (bi)**对前缀上下文有双向的关注。 +* **GLM-130B (uni)**遵循传统的GPT风格,对前缀语境进行自回归注意力。 + +如图所示,双向注意力可以用较少的模型参数达到更好的性能。 + +|

| +|:--:| +| *与其他大规模语言模型相比,GLM-130B的零样本 LAMBADA(En)性能* | + +#### MMLU(大规模多任务语言理解) + +[MMLU](https://arxiv.org/pdf/2009.03300.pdf) 是一个多样化的基准数据集,包括57个关于人类知识的多选题回答任务,范围从高中水平到专家水平。它可以作为大规模语言模型少样本学习性能的理想测试平台。 + +我们绘制了GLM-130B在其训练过程上的少样本学习(5-shot)性能。GLM-130B在学习了大约3000亿个tokens后,接近GPT-3的可比性能43.9。随着训练的进行,它的能力继续增长,在学习了4000亿个tokens后达到了44.8。当我们的训练终止时,它似乎并没有饱和,这与[Chinchilla](https://arxiv.org/pdf/2203.15556.pdf)中的观察相一致,即现有的大规模语言模型仍然远远没有得到充分的训练。 + +|

| +|:--:| +| *与其他大规模语言模型相比,GLM-130B的少样本学习(5-shot)MMLU性能* | + +### 零样本学习:中文 + +由于GLM-130B是一个双语语言模型,我们也评估了它在既有的中文NLP基准上的零样本性能:[CLUE](https://arxiv.org/pdf/2004.05986.pdf) 和[FewCLUE](https://arxiv.org/pdf/2107.07498.pdf)。请注意,我们在多任务指令预训练(MIP)中不包括任何中文下游数据集。由于仍在评估阶段,我们目前仅评估了7个CLUE数据集和5个FewCLUE数据集。更多数据集上的结果会在之后公布。 + +我们将GLM-130B与现有最大的中文单语语言模型ERNIE Titan 3.0进行比较,后者有260B的参数。如图所示,GLM-130B的表现优于ERNIE Titan 3.0,尤其是在生成式阅读理解数据集DRCD和CMRC2018上。 + +| | +|:--:| +*部分CLUE和FewCLUE基准数据集的零点性能。跟随ERNIE Titan 3.0的做法,我们报告了开发数据集的结果。除了DRCD和CMRC2018的报告EM外,其他数据集报告Acc.* | + +
+## 致谢 
+ 
+这一项目由国家自然科学基金国家杰出青年科学基金项目(No. 61825602)支持。 
+ 
+### 学生负责人 
+[曾奥涵(清华大学计算机系知识工程实验室)](https://github.com/Sengxian),[刘潇(清华大学计算机系知识工程实验室)](https://github.com/xiao9905) 
+ 
+### 技术贡献 
+#### 清华大学计算机系知识工程实验室——the Knowledge Engineering Group at Tsinghua 
+杜政晓,丁铭,郑勤锴,赖瀚宇,汪子涵,杨卓毅,于济凡,张笑涵,郑问迪,夏箫,徐逸凡,谭咏霖,东昱晓,唐杰 
+ 
+#### 清华大学计算机系PACMAN实验室——the Parallel Architecture & Compiler technology of Mobile, Accelerated, and Networked systems Group at Tsinghua 
+马子轩,何家傲,孙桢波,翟季冬,陈文光 
+ 
+#### 清华大学计算机系自然语言处理实验室(BMInf)——the Natural Language Processing Group at Tsinghua 
+曾国洋,韩旭,赵威霖,刘知远 
+ 
+#### 智谱AI——an AI startup that aims to teach machines to think like humans 
+薛宇飞,王山,陕杰才,姜皓瀚,郭振钢,张鹏 
+ 
+### 计算资源赞助 
+智谱AI 
+ 
+### 项目总负责 
+[唐杰(清华大学计算机系知识工程实验室 & 北京智源人工智能研究院)](http://keg.cs.tsinghua.edu.cn/jietang/) 
+ 
+ diff --git a/PyTorch/contrib/nlp/GLM-130B/benchmark.py b/PyTorch/contrib/nlp/GLM-130B/benchmark.py new file mode 100644 index 0000000000..c23e8069cf --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/benchmark.py @@ -0,0 +1,20 @@ +import torch +import time +from initialize import initialize, initialize_model_and_tokenizer + +if __name__ == "__main__": + args = initialize(extra_args_provider=lambda parser: None) + model, tokenizer = initialize_model_and_tokenizer(args) + + for seq_len in [512, 1024, 2048]: + torch.distributed.barrier() + start = time.time() + with torch.no_grad(): + _, *_ = model( + torch.ones(1, seq_len, device=torch.cuda.current_device(), dtype=torch.int64), + torch.arange(seq_len, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), + torch.randn(1, 1, seq_len, seq_len, device=torch.cuda.current_device()) < 0.5, + ) + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(f"Encode {seq_len}: {(time.time() - start) * 1000:.2f} ms") diff --git a/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b.sh b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b.sh new file mode 100644 index 0000000000..e3f13db2b6 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b.sh @@ -0,0 +1,15 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=8 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16" diff --git a/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int4.sh b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int4.sh new file mode 100644 index 0000000000..391e1cbef7 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int4.sh @@ -0,0 +1,16 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=4 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --quantization-bit-width 4 \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16" diff --git a/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int8.sh b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int8.sh new file mode 100644 index 0000000000..5eb4ec8841 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_int8.sh @@ -0,0 +1,16 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=8 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --quantization-bit-width 8 \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16" diff --git a/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_v100.sh b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_v100.sh new file mode 100644 index 0000000000..0b33485e7f --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/configs/model_glm_130b_v100.sh @@ -0,0 +1,17 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=8 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + 
--max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16 \ + --bminf \ + --bminf-memory-limit 25" diff --git a/PyTorch/contrib/nlp/GLM-130B/cuda/Makefile b/PyTorch/contrib/nlp/GLM-130B/cuda/Makefile new file mode 100644 index 0000000000..46df11b608 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/cuda/Makefile @@ -0,0 +1,22 @@ +NVCC=nvcc +OPTIONS=-gencode arch=compute_61,code=sm_61 \ + -gencode arch=compute_62,code=sm_62 \ + -gencode arch=compute_70,code=sm_70 \ + -gencode arch=compute_72,code=sm_72 \ + -gencode arch=compute_75,code=sm_75 \ + -gencode arch=compute_80,code=sm_80 \ + -gencode arch=compute_86,code=sm_86 + +TARGETS=$(patsubst %.cu, %.fatbin, $(wildcard *.cu)) + +all: $(TARGETS) + +%.fatbin: %.cu + $(NVCC) -fatbin $^ $(OPTIONS) -o $@ + +.PHONY : clean, copy +clean: + rm $(TARGETS) + +copy: + cp $(TARGETS) ../kernels/ diff --git a/PyTorch/contrib/nlp/GLM-130B/cuda/quantization.cu b/PyTorch/contrib/nlp/GLM-130B/cuda/quantization.cu new file mode 100644 index 0000000000..36ac67d63c --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/cuda/quantization.cu @@ -0,0 +1,81 @@ +#include + +template +__device__ void +int4WeightExtractionDevice(const int8_t* weight, + const T* scale_list, + T* output, + const int n, + const int k) +{ + for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ + int8_t original = weight[i]; + int8_t high = original >> 4; + int8_t low = original << 4; low = low >> 4; + output[i * 2] = T(high) * scale_list[blockIdx.x]; + output[i * 2 + 1] = T(low) * scale_list[blockIdx.x]; + } +} + +__device__ void +int4WeightCompressionDevice(const int8_t* input, + int8_t* output, + const int n, + const int k) +{ + for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ + output[i] = (input[i * 2] << 4) | (input[i * 2 + 1] & 0b00001111); + } +} + +template +__device__ void +int8WeightExtractionDevice(const int8_t* weight, + const T* scale_list, + T* output, + const int n, + const int k) +{ + for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ + output[i] = T(weight[i]) * scale_list[blockIdx.x]; + } +} + +extern "C" __global__ void int4WeightExtractionHalf(const int8_t* weight, + const half* scale_list, + half* output, + const int n, + const int k){ + int4WeightExtractionDevice(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int4WeightExtractionFloat(const int8_t* weight, + const float* scale_list, + float* output, + const int n, + const int k){ + int4WeightExtractionDevice(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int8WeightExtractionHalf(const int8_t* weight, + const half* scale_list, + half* output, + const int n, + const int k){ + int8WeightExtractionDevice(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int8WeightExtractionFloat(const int8_t* weight, + const float* scale_list, + float* output, + const int n, + const int k){ + int8WeightExtractionDevice(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int4WeightCompression(const int8_t* input, + int8_t* output, + const int n, + const int k){ + int4WeightCompressionDevice(input, output, n, k); + } diff --git a/PyTorch/contrib/nlp/GLM-130B/docs/evaluate-your-own-tasks.md b/PyTorch/contrib/nlp/GLM-130B/docs/evaluate-your-own-tasks.md new file mode 100644 index 0000000000..396035f9bd --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/docs/evaluate-your-own-tasks.md @@ 
-0,0 +1,86 @@ +# Evaluate Your Own Tasks + +## YAML file for tasks + +We use the YAML file to define tasks, this allows us to easily evaluate multiple tasks at a single run and configure them independently. Specifically, you can add multiple tasks or folders at a time for evaluation, and the script will automatically collect all YAML files under those folders recursively. + +``` +# Single node +bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... +# Multi node +bash scripts/evaluate_multiple_node.sh task1.yaml task2.yaml dir1 dir2 ... +``` + +We support two types of evaluation tasks: multi-choice and generation. The YAML config options for both tasks are defined in `evaluation/configs.py`. Basically, all types of tasks share common configs defining task information: + +```yaml +name: 'glue_cola' # Task Name +type: 'mul' # Task type, 'gen' (generate) or 'mul' (multiple choice) +path: 'bloom/glue_cola' # task data path relative to DATA_PATH in 'evaluate.sh' +use_task_mask: False # Whether use [gMASK] for evaluation +unidirectional: False # Whether use unidirectional attention +max_seq_length: 2048 # Max sequence length +file-pattern: # Organize jsonl file in groups + validation: "**/validation.jsonl" # Will search for all file named 'validation.jsonl' in `DATA_PATH/bloom/glue_cola` using glob.glob() +micro-batch-size: 30 # 'gen' task only support mbs = 1 for now +``` + +See configuration details for multi-choice and generation tasks in `evaluation/configs.py`. + +## Data format for tasks + +We recommend organizing the task data in the following structure and setup up two groups named "validation" and "test" in the `file-pattern` config so that it becomes very easy to evaluate different prompts on both validation and test sets independently. + +```bash +DATA_PATH +└── task_name + ├── prompt_1 + │   ├── test.jsonl + │   └── val.jsonl + ├── prompt_2 + │   ├── test.jsonl + │   └── val.jsonl + └── prompt_3 + ├── test.jsonl + └── val.jsonl +``` + +The evaluation data for each prompt are organized into jsonline format. For multi-choice tasks, the format of each line of JSON should be + +```json +{ + "inputs_pretokenized": "Context and question here", + "choices_pretokenized": ["Choice 1", "Choice 2", "Choice 3"], + "label": int +} +``` + +The default metric for the multi-choice task is Accuracy. + +For the generation task, the format of each line of JSON should be + +```json +{ + "inputs_pretokenized": "Context and question here", + "targets_pretokenized": ["Target 1", "Target 2", "Target 3"], + "label": int +} +``` + +The default metrics for the generation task are EM(Exact-Match) and F1. Given inputs, the sequence generated by the model will be metricized separately from all targets and the highest value will be taken. + + +## Implement Your Metrics + +You can customize your evaluation metrics function and add it to `DEFAULT_METRICS` in `evaluation/metrics.py`, and then you can specify `metric: ['Your metric name']` in the task YAML file. + +## Fully customize the evaluation process + +By default, we implement classes named `MultiChoiceTask` and `GenerationTask` in `evaluation/tasks.py` for multi-choice tasks and generation tasks, respectively. + +You can implement a new task class and inherit from one of these two classes, and implement the `process_single_batch` function to define how to process a batch of inputs and get the predictions. 
Following [Big-Bench](https://github.com/google/BIG-bench/#creating-the-task), we implemented two methods you can use for your evaluation: + +- `model.cond_log_prob()`: Compute the probabilities of provided model outputs for given inputs. +- `model.generate_text()`: Generate text for given inputs. + +Once you have created the new task class, you need to specify the relative path to import the class in the `module` field of the task YAML file. See `tasks/lambada/tasks.py` and `tasks/lambada/lambada.yaml` for how we customize the beam search generation strategy for LAMBADA tasks and configure the YAML file. diff --git a/PyTorch/contrib/nlp/GLM-130B/docs/inference-with-fastertransformer.md b/PyTorch/contrib/nlp/GLM-130B/docs/inference-with-fastertransformer.md new file mode 100644 index 0000000000..7c55d56ec3 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/docs/inference-with-fastertransformer.md @@ -0,0 +1,156 @@ +# Inference with FasterTransformer + +[FasterTransformer](https://github.com/NVIDIA/FasterTransformer) provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. + +We adapted the GLM-130B based on Fastertransformer for fast inference, with details in [benchmark](#benchmark) section. + +## Download the Model + +See [Get Model](/README.md#environment-setup). + +## Recommend: Run With Docker + +Use Docker to quickly build a Flask API application for GLM-130B. + +### Requirements + +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) + +### Build Container Image + +```bash +git clone https://github.com/THUDM/FasterTransformer.git +cd FasterTransformer +bash docker/build.sh +``` + +### Run API With Checkpoints + +Set MPSIZE to the number of gpus needed for the checkpoints, and DATA_TYPE to checkpoints precision. The checkpoint we distribute is in 8-way tensor parallel in FP16 precision, a conversion scripts is also provided if you need to change the tensor parallel dimension and the weight precision. + +```bash +# Convert the checkpoint to MP=4, DATA_TYPE=INT4 +python tools/convert_tp.py \ + --input-folder \ + --output-folder \ + --target-tp 8 \ + --quantization-bit-width 4 \ +# Run API +docker run -it --rm --gpus all --shm-size=10g -p 5000:5000 \ + -v /49300:/checkpoints:ro \ + -e MPSIZE=4 -e DATA_TYPE=int4 \ + ftglm:latest +``` + +### Test + +#### Benchmark + +```bash +python3 examples/pytorch/glm/glm_server_test.py +``` + +#### Web Demo + +```bash +pip install gradio +python3 examples/pytorch/glm/glm_server_frontend_test.py +``` + +## Manual Configuration + +### Requirements + +- CMake >= 3.13 for PyTorch +- CUDA 11.0 or newer version +- NCCL 2.10 or newer version +- Python 3 is recommended because some features are not supported in python 2 +- PyTorch: Verify on 1.10.1, >= 1.8.0 should work. + +### Setup Using Docker + +```bash +docker run -it --rm --gpus all nvcr.io/nvidia/pytorch:22.09-py3 /bin/bash +conda install -y pybind11 +``` + +### Setup Using Conda + +As another way, all the packages can be installed using conda. + +> Some of our current [structure](https://github.com/THUDM/FasterTransformer/blob/main/src/fastertransformer/th_op/glm/GlmOp.h#L30) requires that `g++` and `libtorch` produce the same results, so a pre-compiled `libtorch` may only work with `g++-7` or `g++-9`. And although GLM-130B itself does not rely on openmpi, FasterTransformer requires it during the build process. We are working on these issues. 
+ +```bash +conda install -y cmake pybind11 +conda install -y -c conda-forge cudatoolkit-dev cudnn +cp -r $CONDA_PREFIX/lib/libcudnn* /usr/local/cuda/lib64/ +cp -r $CONDA_PREFIX/include/cudnn*.h /usr/local/cuda/include/ +``` + +If it's hard to install cudatoolkit-dev and cudnn by conda, just install them from [NVIDIA Developer](https://developer.nvidia.com/cuda-downloads), and make sure cmake is able to find cudnn. + +```bash +cp cudnn/include/cudnn*.h /usr/local/cuda/include +cp cudnn/lib/libcudnn* /usr/local/cuda/lib64 +chmod a+r /usr/local/cuda/include/cudnn*.h +chmod a+r /usr/local/cuda/lib64/libcudnn* +``` + +GLM-130B is trained with FP16 precision, a total of 260G of GPU memory is required to store model weights. The model is tested with 8 * 40G A100s. + +### Build + +Get the code and install all dependencies: + +```bash +git clone https://github.com/THUDM/FasterTransformer.git +mkdir -p FasterTransformer/build +cd FasterTransformer/build +pip3 install icetk transformers +``` + +Note: the `xx` of `-DSM=xx` in following scripts means the compute capability of your GPU. For example, 60 (P40) or 61 (P4) or 70 (V100) or 75(T4) or 80 (A100) or 86(RTX 3090). Default setting is including 70, 75, 80 and 86. + +```bash +cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. +make -j +``` + +### Run GLM-130B + +Generate the `gemm_config.in` file. + +```bash +# ./bin/gpt_gemm +./bin/gpt_gemm 1 1 128 96 128 49152 150528 1 8 +``` + +Running GLM_130B in Pytorch and Flask. + +```bash +bash ../examples/pytorch/glm/glm-server.sh +``` + +You need to check and edit this file to set arguments such as `CHECKPOINT_PATH`. + +## Optimization methods + +Optimization in GLM_130B are similar to optimization in GPT and GPT-J, describing in the [FasterTransformer/gpt_guide.md](https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md). Meanwhile, some of the operators are differ from GPT, such as the implementation of RotaryEmbedding, and the use of GeGLU, so we add them additionally into FasterTransformer. + +## Benchmark + +- Hardware: DGX-A100(8 * 40G) + +## Encode + +| **Sequence Len** | 512 | 1024 | 2048 | +| ---------- | ------ | ------ | ------ | +| Megatron | 145 ms | 250 ms | 453 ms | +| FasterTransformer | 120 ms | 220 ms | OOM | + +## Decode + +| **Sequence Len** | 512 | 1024 | 2048 | +| ---------- | ------- | ------- | -------- | +| Megatron | 45.21 s | 89.00 s | 179.22 s | +| FasterTransformer | 18.77 s | 39.81 s | 89.88 s | diff --git a/PyTorch/contrib/nlp/GLM-130B/docs/low-resource-inference.md b/PyTorch/contrib/nlp/GLM-130B/docs/low-resource-inference.md new file mode 100644 index 0000000000..5dbea23393 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/docs/low-resource-inference.md @@ -0,0 +1,28 @@ +# Low-resource Inference with BMInf + +GLM-130B is trained with 4-way tensor parallel and 8-way pipeline parallel for efficiency. Then the checkpoint is converted into a 8-way tensor parallel one in order to inference the model in a single node. GLM-130B has 130 billion parameters in FP16 precision, a total of 260G of GPU memory is required to store model weights. The DGX-A100 server has 8 A100s and provides an amount of 320G of GPU memory (640G for 80G A100 version) so it suits GLM-130B well. + +However, a server with 8 * 32G V100 only provides an amount of 256G of GPU memory, which indicates that the full loading of model weights is not possible. 
Fortunately, with the swap-in-and-out feature between CPU and GPU memory provided by the [BMInf](https://github.com/OpenBMB/BMInf) library, GLM-130B can still run on servers with a smaller amount of GPU memory. After joint debugging with the BMInf team, we achieved a resonable evaluation efficiency on DGX-1 servers with 8 * 32G V100 by carefully overlapping computation and communication, see the [benchmark section](#benchmark) for details. + +We have integrated BMInf into our codebase, just install BMInf via `pip install bminf`, and change the model configuration file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_v100.sh` in your launch shell script. The default BMInf config is for V100 servers, you can also adjust the maximum memory the model weights can occupy on one GPU by setting `--bminf-memory-limit` according to your GPU memory in the model config file. + +## Benchmark + +### Evaluation + +- CoLA task on the validation set +- Micro Batch Size = 30 +- BMInf: 25GB model weights in GPU memory limit by: `--bminf-memory-limit 25` + +| | Peak GPU Memory | Time | +| -------------- | ---------- | ------ | +| A100-SAT | 40.3 G | 74.6 s | +| V100-SAT | OOM | OOM | +| V100-SAT-BMInf | 32.3 G | 196.0 s | + +The `micro-batch-size` config in task YAML files is configured according to the maximum utilization of the DGX-A100 server. If you encounter an OOM error on the V100 server, please adjust the `micro-batch-size` appropriately. + +### Text generation + +In text generation, due to the small amount of calculation per model forward (usually <10 tokens/forward using beam search strategy), the communication between the CPU and GPU memory becomes the bottleneck. With the help of the BMInf team, we did an in-depth profile on our V100 server. Given a 25GB model weight limit per GPU, a total of 13 layers need to be copied from CPU to GPU for a single forward, each layer will take about 75ms on IO, indicating that the real IO speed between CPU and GPU is `260GB / 70 / 8 / 75ms = 6.19GB/s`. Our V100 server uses PCI-E 3.0 and two V100s share a switch, so the theoretical bandwidth for each GPU is 8GB/s, close to our profiling results. A server with PCI-E 4.0 will greatly reduce the IO time. Even that, long text generation tokens can still take several minutes so **we do not recommend using V100 servers in text generation scenario**. For this, we are working on INT8 quantization so that GLM-130B can even fit a single RTX-3090 server (24G * 8). + diff --git a/PyTorch/contrib/nlp/GLM-130B/docs/quantization.md b/PyTorch/contrib/nlp/GLM-130B/docs/quantization.md new file mode 100644 index 0000000000..3fc869c0cb --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/docs/quantization.md @@ -0,0 +1,66 @@ +# Quantization of GLM-130B + +## Usage + +> Please note that SwissArmyTransformer>=0.2.11 is required for quantization + +Set `CHECKPOINT_PATH` in `configs/model_glm_130b_{int4/int8}.sh` to your local checkpoint folder. The model will be first initialized from the FP16 checkpoint on the CPU memory, then dynamically quantized and transferred to the GPU memory. So please make sure you have enough CPU memory (>260GB) to store the FP16 model weights. + +You need to pay attention to the tensor parallel dimension of the model checkpoint, we only provide the checkpoint in 8-way tensor parallel, i.e. 8 GPUs store a whole model. If you need to do inference on a small number of GPUs, e.g. 
4 * RTX 3090 GPUs with INT4 precision, you first need to convert the checkpoint to 4-way tensor parallel using the following command and modify `MP_SIZE` in corresponding model config file. + +```bash +python tools/convert_tp.py \ + --input-folder \ + --output-folder \ + --target-tp 4 +``` + +Finally, change the model config file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_{int4/int8}.sh` in your scripts (e.g. `scripts/generate.sh`), then run your scripts just as normal. + +By default, the full precision checkpoint is expected to be loaded. Run the conversion script with `--quantization-bit-width <4 or 8>` will produce quantized model weights. To load from a quantized checkpoint, you should add `--from-quantized-checkpoint` in your model config file. + +## Evaluation Results + +| | **MMLU(Accuracy↑)** | **LAMBADA(Accuracy↑ )** | **WikiText-2(PPL↓)** | **WikiText-103(PPL↓)** | **PTB(PPL↓)** | +| ---- | -------- | ----------- | ------------------- | --------------------- | ------------ | +| FP16 | 44.751 | 80.206 | 10.901 | 10.759 | 18.964 | +| INT8 | 44.709 | 80.206 | 10.904 | 10.763 | 18.994 | +| INT4 | 44.801 | 79.468 | 11.167 | 11.046 | 19.535 | + +## Space and Speed Benchmark + +| **Hardware** | **GPU Memory** | **Precison** | **512** | **1024** | **2048** | +| ------------ | -------------- | ------------ | -------- | -------- | -------- | +| 8 * A100 | 40 GB | FP16 | 45.21 s | 89.00 s | 179.22 s | +| 8 * V100 | 32 GB | INT8 | 106.35 s | 216.50 s | 449.17 s | +| 4 * RTX 3090 | 24 GB | INT4 | 138.66 s | 292.69 s | 649.64 s | +| 8 * RTX 2080 Ti | 11 GB | INT4 | 117.39 s | 240.96 s | 528.66 s | + + +The above results in the table is tests with SAT. Using FasterTransformer can speed up more than 2X, as shown in the table below, and the detailed usage is shown in [Inference with FasterTransformer](../docs/inference-with-fastertransformer.md). + +| **Hardware** | **GPU Memory** | **Precison** | **128** Encode / Decode | **512** Encode / Decode | **1024** Encode / Decode | **2048** Encode / Decode | +| --------------- | -------------- | ------------ | ----------------------- | ----------------------- | ------------------------ | ------------------------ | +| 8 * A100 | 40 GB | INT4 | 145 ms / 4.29 s | 183 ms / 17.7 s | 313 ms / 37.8 s | 495 ms / 86.0 s | +| 4 * A100 | 80 GB | INT4 | 174 ms / 6.62 s | 272 ms / 27.1 s | 439 ms / 56.2 s | 810 ms / 123 s | +| 8 * V100 | 32 GB | INT4 | 309 ms / 6.97 s | 666 ms / 28.1 s | 1208 ms / 58.4 s | 2304 ms / 125 s | +| 4 * V100 | 32 GB | INT4 | 448 ms / 11.4 s | 843 ms / 45.87 s | 1488 ms / 93.5 s | 2803 ms / 196 s | +| 8 * RTX 3090 | 24 GB | INT4 | 283 ms / 5.07 s | 915 ms / 20.5 s | 1793 ms / 42.7 s | 3477 ms / 90.3 s | +| 4 * RTX 3090 | 24 GB | INT4 | 374 ms / 8.16 s | 1300 ms / 32.3 s | OOM / 66.5 s | OOM / 150 s | +| 8 * RTX 2080 Ti | 11 GB | INT4 | 392 ms / 6.77 s | 1044 ms / 27.29 s | OOM / 56.02 s | OOM / OOM | + +## Details + +Typical methods quantize both model weights and activations to INT8, enabling the INT8 matrix multiplication kernel for efficiency. However, we found that there are outliers in GLM-130B's activations, making it hard to reduce the precision of activations. + +Concurrently, researchers from [Meta AI](https://arxiv.org/abs/2208.07339) also found the emergent outliers issue in large-scale transformers (>6.8B), which is consistent with our observations on GLM-130B. 
They conducted an in-depth analysis and found that the outliers make up only about 0.1% of all feature dimensions, so it's possible to make a decomposition for matrix multiplication that focuses on high precision multiplication for these particular dimensions. + +| ![](media/16613396005977.jpg) | +|:--:| +| *Distribution of outliers (the white ones) in GLM-130B's activation* | + +Unfortunately, the outliers in GLM-130B can sometimes make up at most 30% of the feature dimension, possibly because we used GLU as a variant of FFN. Therefore, a mixed-precision decomposition for matmul can be much less efficient than a single FP16 matmul. After a few weeks of trial, we finally decided to keep the precision of activations to FP16 and only consider the quantization of model weights. In that case, the quantized model parameters are dynamically converted to FP16 precision at runtime, introducing a small computational overhead but greatly reducing GPU memory requirements for storing model weights. + +We quantized all linear layers as they take up most of the model parameters. All model weights, excluding input/output embedding, layernorm and bias terms are quantized using vector-wise symmetric quantization. At the quantization precision of INT4, two INT4 weights are compressed into one INT8 weight for saving GPU memory usage, so that only 70GB of GPU memory approximately is required for INT4 model weights. + + diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluate.py b/PyTorch/contrib/nlp/GLM-130B/evaluate.py new file mode 100644 index 0000000000..fef259f8aa --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluate.py @@ -0,0 +1,67 @@ +import time +import importlib + +from os.path import join, isdir, isfile, relpath +from glob import glob + +from evaluation import BaseConfig, ModelForEvaluation, DEFAULT_CLASS, print_rank_0 +from initialize import initialize, initialize_model_and_tokenizer + + +def add_evaluation_specific_args(parser): + """Arguments for evaluation""" + group = parser.add_argument_group("evaluation", "Evaluation configurations") + + # Task + group.add_argument("--task", nargs="+", default=[], help="All task config to evaluation") + group.add_argument("--data-path", type=str, required=True, help="Data dir path for all tasks") + return parser + + +def find_all_tasks(all_task_config_path): + tasks = [] + for task in all_task_config_path: + if isdir(task): + tasks += [relpath(path, ".") for path in glob(join(task, "**/*.yaml"), recursive=True)] + elif isfile(task): + tasks.append(task) + return tasks + + +def evaluate_all_tasks(data_path, model, tokenizer, all_task_config_path, task_classes): + for config_path, task_class in zip(all_task_config_path, task_classes): + config = task_class.config_class().from_yaml_file(config_path) + config.path = join(data_path, config.path) + task = task_class(model, tokenizer, config) + task.evaluate() + + +def main(): + args = initialize(extra_args_provider=add_evaluation_specific_args) + args.task = find_all_tasks(args.task) + + task_classes = [] + print_rank_0("> Loading task configs") + for task_config_path in args.task: + config = BaseConfig.from_yaml_file(task_config_path) + if config.module: + path = ".".join(config.module.split(".")[:-1]) + module = importlib.import_module(path) + class_name = config.module.split(".")[-1] + task_class = getattr(module, class_name) + task_classes.append(task_class) + else: + task_classes.append(DEFAULT_CLASS[config.type]) + print_rank_0(f" Task {config.name} loaded from config {task_config_path}") + print_rank_0(f"> 
Successfully load {len(task_classes)} task{'s' if len(task_classes) > 1 else ''}") + + model, tokenizer = initialize_model_and_tokenizer(args) + model = ModelForEvaluation(model) + + start = time.time() + evaluate_all_tasks(args.data_path, model, tokenizer, args.task, task_classes) + print_rank_0(f"Finish {len(task_classes)} task{'s' if len(task_classes) > 1 else ''} in {time.time() - start:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/__init__.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/__init__.py new file mode 100644 index 0000000000..a9a28fec72 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/__init__.py @@ -0,0 +1,12 @@ +from .configs import * +from .model import ModelForEvaluation +from .tasks import BaseTask, GenerationTask, MultiChoiceTask, LanguageModelTask +from .dataset import GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset +from .metrics import qa_evaluate +from .utils import print_rank_0 + +DEFAULT_CLASS = { + TaskType.GENERATION: GenerationTask, + TaskType.MULTICHOICE: MultiChoiceTask, + TaskType.LANGUAGE_MODEL: LanguageModelTask, +} diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/configs.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/configs.py new file mode 100644 index 0000000000..e2982c0fe0 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/configs.py @@ -0,0 +1,59 @@ +from __future__ import annotations +from dataclass_wizard import YAMLWizard +from dataclasses import dataclass, field +from enum import Enum +from typing import Optional, List, Dict + + +class TaskType(Enum): + MULTICHOICE = "mul" + GENERATION = "gen" + LANGUAGE_MODEL = "lm" + OTHER = "other" + + +@dataclass +class BaseConfig(YAMLWizard): + name: str # Task name + type: TaskType # Task type + path: str # task data path relative to DATA_PATH + + module: Optional[str] = None # Custom task module file, optional + metrics: List[str] = field(default_factory=list) # Evaluation metrics + + use_task_mask: bool = False # Whether to use [gMASK] for evaluation + use_multitask_encoding: bool = False # Not supported now + unidirectional: bool = False # Whether to use unidirectional attention + max_seq_length: int = 2048 # Max sequence length + file_pattern: str | Dict[str, str] = "**/*.json*" # Organize data file in groups + + micro_batch_size: int = 1 # 'gen' task only support mbs = 1 for now + + def __post_init__(self): + assert self.use_task_mask or not self.unidirectional, "[MASK] doesn't support unidirectional attention" + + +@dataclass +class MultiChoiceTaskConfig(BaseConfig): + module = "evaluation.MultiChoiceTask" + metrics: List[str] = field(default_factory=lambda: ["Accuracy"]) + + +@dataclass +class GenerationTaskConfig(BaseConfig): + module = "evaluation.GenerationTask" + metrics: List[str] = field(default_factory=lambda: ["EM", "F1"]) + sampling_strategy: str = "BaseStrategy" + num_beams: int = 4 + length_penalty: float = 1.0 + no_repeat_ngram_size: int = 3 + min_gen_length: int = 0 + max_gen_length: int = 128 + + +@dataclass +class LanguageModelTaskConfig(BaseConfig): + module = "evaluation.LanguageModelTask" + metrics: List[str] = field(default_factory=lambda: ["PPL"]) + + generation_length: int = 256 # Generated length in each window diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/dataset.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/dataset.py new file mode 100644 index 0000000000..ac9520d1b0 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/dataset.py @@ -0,0 +1,371 @@ +import os 
+import math +import json + +import numpy as np +import torch + +from typing import List, Union +from abc import ABC, abstractmethod +from scipy.linalg import block_diag +from itertools import accumulate +from bisect import bisect_right + +from SwissArmyTransformer import get_tokenizer + +from .configs import BaseConfig, MultiChoiceTaskConfig, GenerationTaskConfig, LanguageModelTaskConfig +from .utils import get_tokenized_input + + +def pad_batch(tokens, position_ids, attention_mask, max_seq_length): + attention_mask = np.pad( + attention_mask, + pad_width=((0, max_seq_length - len(tokens)),), + mode="constant", + constant_values=0, + ) + tokens = np.concatenate((tokens, np.zeros(max_seq_length - len(tokens), dtype=np.int64))) + position_ids = np.concatenate((position_ids, np.zeros(max_seq_length - len(position_ids), dtype=np.int64))) + return tokens, position_ids, attention_mask + + +class EvaluationDataset(torch.utils.data.Dataset, ABC): + """ + Jsonlines of { + "text": context + "choices": [choice_id1,...], if not None, len(target) == 1 + "label": If generation task -1, else [0, len(choices)) + } + If [MASK] not in context, will append [MASK] after text + """ + + def __init__(self, path: Union[str, List[str]], config: BaseConfig): + self.path = path if isinstance(path, list) else [path] + self.config = config + self.max_seq_length = self.config.max_seq_length + self.dtype = np.int64 + + self.tokenizer = get_tokenizer() + self.mask_id = self.tokenizer.get_command("[MASK]") + self.gmask_id = self.tokenizer.get_command("[gMASK]") + + self.data = [] + for p in self.path: + self.process_single_file(p) + + @property + def has_collate_fn(self) -> bool: + return False + + def collate_fn(self, samples): + return None + + def process_single_file(self, path): + with open(os.path.join(path), "r", encoding="utf-8") as file: + for line in file: + item = json.loads(line) + self.data.append(self.process_single_item(item)) + + @abstractmethod + def process_single_item(self, item) -> dict: + pass + + def __len__(self): + return len(self.data) + + +class GenerationTaskDataset(EvaluationDataset): + config: GenerationTaskConfig + + def process_single_item(self, item): + text, targets = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "targets") + if len(text) + self.config.max_gen_length + 2 > self.config.max_seq_length: + text_length = self.config.max_seq_length - self.config.max_gen_length - 2 + text = text[len(text) - text_length : len(text)] + return {"text": text, "targets": targets} + + @property + def has_collate_fn(self) -> bool: + return True + + def collate_fn(self, samples): + TILE = 32 + length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + context_length_batch, target_position_id_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = pad_batch( + sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + context_length_batch.append(sample['context_length']) + target_position_id_batch.append(sample['target_position_id']) + return { + "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), + "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), + "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, + "context_length": 
torch.tensor(context_length_batch, dtype=torch.int64), + "target_position_ids": torch.tensor(np.array(target_position_id_batch), dtype=torch.int64), + } + + @staticmethod + def build_generation_sample(text, max_gen_length, use_task_mask, unidirectional=True): + tokenizer = get_tokenizer() + + sop_id = tokenizer.get_command("sop") + mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") + + token = np.array(text, dtype=np.int64) + + blank_filling = mask_id in text + if blank_filling: + assert not unidirectional, "Unidirectional attention doesn't support blank filling" + assert not use_task_mask, "Unidirectional attention doesn't support task mask" + mask_position = text.index(mask_id) + token = np.concatenate((token, [sop_id])) + else: + mask_position = len(token) + if unidirectional: + token = np.concatenate(([mask_id, sop_id], token)) + else: + token = np.concatenate((token, [mask_id, sop_id])) + context_length = len(token) + + position_id = np.arange(0, context_length, dtype=np.int64) + target_position_id = np.arange(context_length, context_length + max_gen_length, dtype=np.int64) + if not use_task_mask: + position_id[context_length - 1:] = mask_position + target_position_id[:] = mask_position + + attention_mask = np.tril(np.ones((context_length, context_length), dtype=np.int64)) + if not unidirectional: + attention_mask[: context_length - 1, : context_length - 1] = 1 + + item = { + "token": token, + "position_id": position_id, + "target_position_id": target_position_id, + "attention_mask": attention_mask, + "context_length": context_length, + } + return item + + def __getitem__(self, idx): + item = self.data[idx] + sample = self.build_generation_sample( + item["text"], + max_gen_length=self.config.max_gen_length, + use_task_mask=self.config.use_task_mask, + unidirectional=self.config.unidirectional, + ) + sample["targets"] = [np.array(target, dtype=self.dtype) for target in item["targets"]] + return sample + + +class MultiChoiceTaskDataset(EvaluationDataset): + config: MultiChoiceTaskConfig + + def __init__(self, path, config: MultiChoiceTaskConfig): + self.is_single_token = True # set to False later in process_single_item func + super().__init__(path, config) + + @property + def has_collate_fn(self) -> bool: + return True + + def collate_fn(self, samples): + TILE = 32 + length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + choices_batch, choice_target_ids_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = pad_batch( + sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + choices_batch.append(sample["choices"]) + choice_target_ids_batch.append(sample["choice_target_ids"]) + + return { + "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), + "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), + "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, + "choices": choices_batch, + "choice_target_ids": choice_target_ids_batch, + "is_single_token": self.is_single_token, + } + + def process_single_item(self, item): + text, choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"] + + tgt_seq_length = sum([len(choice) for choice in choices]) + if 
tgt_seq_length == len(choices): + # For single token, we only insert one [sop] + tgt_seq_length = 1 + + assert tgt_seq_length < self.config.max_seq_length + if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: + text_length = self.config.max_seq_length - tgt_seq_length - 2 + text = text[len(text) - text_length : len(text)] + + assert not ( + self.mask_id in text and self.config.use_multitask_encoding + ), "Unified multitask encoding don't support blank filling" + + if tgt_seq_length != 1: + self.is_single_token = False + + return { + "text": text, + "choices": choices, + "label": label, + } + + @staticmethod + def build_multiple_choice_sample( + text, choices, is_single_token, unified_multitask_encoding=False, use_task_mask=False + ): + tokenizer = get_tokenizer() + + sop_id = tokenizer.get_command("sop") + mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") + + token = np.array(text, dtype=np.int64) + target = np.array(text, dtype=np.int64) + position_id = np.arange(len(text), dtype=np.int64) + choice_target_id = [] + + blank_filling = mask_id in text + if not blank_filling: + mask_position = len(token) + token = np.concatenate((token, [mask_id])) + target = np.concatenate((target, [mask_id])) + position_id = np.concatenate((position_id, [mask_position])) + else: + mask_position = text.index(mask_id) + + division = len(token) + attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)] + + for choice in choices: + if use_task_mask == False: + position_id = np.concatenate( + ( + position_id, + [mask_position] * len(choice) + if blank_filling or not unified_multitask_encoding + else np.arange(mask_position, mask_position + len(choice), dtype=np.int64), + ) + ) + else: + position_id = np.concatenate( + ( + position_id, + np.arange(division, division + len(choice), dtype=np.int64), + ) + ) + + choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64)) + attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64))) + token = np.concatenate((token, [sop_id], choice[:-1])) + target = np.concatenate((target, choice)) + + if is_single_token: + break + + attention_mask = block_diag(*attention_mask) + attention_mask[: len(token), :division] = 1 + + if is_single_token: + choices = np.array(choices, dtype=np.int64).squeeze().tolist() + + item = { + "token": token, + "position_id": position_id, + "attention_mask": attention_mask, + "choices": choices, + "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id, + } + return item + + def __getitem__(self, idx): + item = self.data[idx] + sample = self.build_multiple_choice_sample( + item["text"], + item["choices"], + is_single_token=self.is_single_token, + unified_multitask_encoding=self.config.use_multitask_encoding, + use_task_mask=self.config.use_task_mask, + ) + sample["label"] = item["label"] + return sample + + +class LanguageModelTaskDataset(EvaluationDataset): + config: LanguageModelTaskConfig + left_weights: List[int] + weights: List[int] + + def process_single_file(self, path): + num_sequences = [] + with open(os.path.join(path), "r", encoding="utf-8") as file: + raw_text = file.read() + tokens = self.tokenizer.tokenize(raw_text) + self.data.append( + { + "raw_text": tokens, + "num_original_tokens": len(raw_text.strip().split(" ")), + "num_sequences": max( + math.ceil( + max(len(tokens) - (self.config.max_seq_length - 1), 0) / self.config.generation_length + ) + + 1, + 1, + ), + } + ) + 
num_sequences.append(self.data[-1]["num_sequences"]) + self.weights = list(accumulate(num_sequences)) + self.left_weights = [0] + self.weights[:-1] + + def process_single_item(self, item): + pass + + def __len__(self): + return self.data[0]["num_sequences"] + + def __getitem__(self, idx): + document_idx = bisect_right(self.weights, idx) + idx = idx - self.left_weights[document_idx] + start_idx = idx * self.config.generation_length + end_idx = start_idx + self.config.max_seq_length - 1 # for additional [gMASK] + tokens = self.data[document_idx]["raw_text"][start_idx:end_idx] + + mask_id = self.gmask_id if self.config.use_task_mask else self.mask_id + sop_id = self.tokenizer.get_command("sop") + + if idx == 0 or self.config.unidirectional: + prompt, text = [], tokens + else: + prompt_length = self.config.max_seq_length - 1 - self.config.generation_length + prompt, text = tokens[:prompt_length], tokens[prompt_length:] + + seq_length = len(prompt) + len(text) + 1 + attention_mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64)) + attention_mask[: len(prompt) + 1, : len(prompt) + 1] = 1 + + return { + "tokens": np.array(prompt + [mask_id, sop_id] + text[:-1], dtype=np.int64), + "targets": np.array(prompt + [mask_id] + text, dtype=np.int64), + "position_ids": np.arange(0, seq_length, dtype=np.int64), + "attention_mask": attention_mask < 0.5, + "loss_masks": np.array([0] * (len(prompt) + 1) + [1] * len(text), dtype=np.int64), + } diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/metrics.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/metrics.py new file mode 100644 index 0000000000..af1b05b17f --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/metrics.py @@ -0,0 +1,140 @@ +import re +import math +import string +import functools + +import torch +import numpy as np + +from typing import Tuple, List +from collections import Counter +from collections import defaultdict +from SwissArmyTransformer import get_tokenizer + +from .utils import print_rank_0 + + +def accuracy_metric(predictions, examples): + count = 0 + num_predictions = max(len(predictions), 1) + assert len(predictions) == len(examples) + for prediction, example in zip(predictions, examples): + count += prediction == example["label"] + return count * 100.0 / num_predictions + + +def F1_metric(predictions, examples): + assert len(predictions) == len(examples) + from sklearn.metrics import f1_score + + truth = [] + for prediction, example in zip(predictions, examples): + truth.append(example["label"]) + return f1_score(truth, predictions, average="micro") * 100.0 + + +def precision_metric(predictions, examples): + assert len(predictions) == len(examples) + from sklearn.metrics import precision_score + + truth = [] + for prediction, example in zip(predictions, examples): + truth.append(example["label"]) + return precision_score(truth, predictions, average="micro") * 100.0 + + +def recall_metric(predictions, examples): + assert len(predictions) == len(examples) + from sklearn.metrics import recall_score + + truth = [] + for prediction, example in zip(predictions, examples): + truth.append(example["label"]) + return recall_score(truth, predictions, average="micro") * 100.0 + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in 
exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + if not ground_truths: + return 0.0 + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def qa_evaluate(predictions, examples, metric): + assert len(examples) == len(predictions) + tokenizer = get_tokenizer() + + score = 0.0 + for example, prediction in zip(examples, predictions): + ground_truths = [tokenizer.tokenizer.decode(target) for target in example["targets"]] + prediction = tokenizer.tokenizer.decode(prediction) + if ground_truths: + score += metric_max_over_ground_truths(metric, prediction, ground_truths) + score = 100.0 * score / len(predictions) + return score + + +qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) +qa_f1 = functools.partial(qa_evaluate, metric=f1_score) + + +def calculate_perplexity(loss: List[float], data): + return math.exp(min(20, np.sum(loss) / data[0]["num_original_tokens"])) + + +def special_for_dataset(predictions, examples): + print_rank_0("Metrics not found, maybe dataset special metric or metric name error") + return True + + +DEFAULT_METRICS = defaultdict(lambda: special_for_dataset) +DEFAULT_METRICS.update( + { + "EM": qa_exact_match, + "F1": qa_f1, + "Accuracy": accuracy_metric, + "PPL": calculate_perplexity, + "Precision": precision_metric, + "Recall": recall_metric, + "F1_mul": F1_metric, + } +) diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/model.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/model.py new file mode 100644 index 0000000000..07e657ae23 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/model.py @@ -0,0 +1,202 @@ +import torch + +from typing import List, Union + +from SwissArmyTransformer.generation.autoregressive_sampling import update_mems, get_masks_and_position_ids_default +from SwissArmyTransformer.mpu import vocab_parallel_cross_entropy + + +def batch_filling_sequence( + model, + seqs, + context_lengths, + strategy, + max_memory_length=100000, + get_masks_and_position_ids=get_masks_and_position_ids_default, + mems=None, + **kw_args + ): + ''' + seq: [2, 3, 5, ..., -1(to be generated), -1, ...] + mems: [num_layers, batch_size, len_mems(index), mem_hidden_size] + cache, should be first mems.shape[1] parts of context_tokens. + mems are the first-level citizens here, but we don't assume what is memorized. + input mems are used when multi-phase generation. 
+ ''' + assert len(seqs.shape) == 2 + + # building the initial tokens, attention_mask, and position_ids + batch_size, context_length = seqs.shape + # seqs: [1, max_gen_len]; attention_mask: [1, 1, max_gen_len, max_gen_len]; position_ids: [1, max_seq_len] + seqs, attention_mask, position_ids = get_masks_and_position_ids(seqs) + # tokens: [1, context_len] + tokens = seqs[..., :context_length] + if attention_mask.dtype != torch.bool: + attention_mask = attention_mask.type_as(next(model.parameters())) # if fp16 + # initialize generation + counter = context_length - 1 # Last fixed index is ``counter'' + index = 0 if mems is None else mems.shape[2] # Next forward starting index, also the length of cache. + num_beams = 1 + # step-by-step generation + while counter < seqs.shape[1] - 1: + # Now, we want to generate seq[counter + 1], + # token[:, index: counter+1] needs forwarding. + # forward + # first: index=0; other: index=counter + tokens = tokens.reshape(batch_size * num_beams, -1) + mems = mems.reshape(mems.shape[0], batch_size * num_beams, mems.shape[-2], mems.shape[-1]) if mems is not None else None + logits, *output_per_layers = model( + tokens[:, index:], # first: [1, 0:counter]; other: [1, counter:counter+1] + position_ids[..., index: counter+1], # first: [1, 0:counter+1]; other: [1, counter:counter+1] + attention_mask[..., index: counter+1, :counter+1], # TODO memlen # first: [1,1,0:counter+1,0:counter+1]; other: [1,1,counter:counter+1,0:counter+1] + mems=mems, + **kw_args + ) + mem_kv = [o['mem_kv'] for o in output_per_layers] + mems = update_mems(mem_kv, mems, max_memory_length=max_memory_length) + if counter == context_length - 1: + logits = logits[torch.arange(batch_size), context_lengths - 1] + else: + logits = logits[:, -1] + counter += 1 + index = counter + # if torch.distributed.get_rank() == 0: + # print(f"counter: {counter}: logits: {logits.float().abs().mean()}") + # sampling + logits = logits.reshape(batch_size, num_beams, -1) + tokens = tokens.reshape(batch_size, num_beams, -1) + mems = mems.reshape(mems.shape[0], batch_size, num_beams, mems.shape[-2], mems.shape[-1]) + tokens, mems = strategy.forward(logits, tokens, mems) + if len(tokens.shape) == 3 and num_beams == 1: + num_beams = tokens.shape[1] + position_ids = position_ids.unsqueeze(1).expand(batch_size, num_beams, -1).reshape(batch_size * num_beams, -1) + attention_mask_shape = attention_mask.shape[-3:] + attention_mask = attention_mask.unsqueeze(1).expand(batch_size, num_beams, -1, -1, -1).reshape( + batch_size * num_beams, *attention_mask_shape) + if strategy.is_done: + break + return strategy.finalize(tokens, mems) + + +class ModelForEvaluation(torch.nn.Module): + def __init__(self, model): + super().__init__() + + self.model = model + self.device = next(self.model.parameters()).device + + @staticmethod + def process_data(batch, device): + return ( + batch["tokens"].to(device=device).long(), + batch["position_ids"].to(device=device).long(), + batch["attention_mask"].to(device=device).bool().unsqueeze(1), + ) + + def cond_log_prob(self, batch) -> List[List[float]]: + """ + @return: Conditional log probability of each option + """ + tokens, position_ids, attention_mask = self.process_data(batch, self.device) + choices_batch, choice_target_ids_batch = batch["choices"], batch["choice_target_ids"] + is_single_token = batch["is_single_token"] + + self.model.eval() + with torch.no_grad(): + logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) + logits_batch = 
torch.nn.functional.log_softmax(logits, dim=-1) + + # output: [b, sq, vocab] + log_probs = [] + + if is_single_token: # Single token + for logits, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): + log_probs.append(logits[choice_target_ids[0], choices].tolist()) + else: # Multi token + for output, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): + log_probs_single = [] + for choice, choice_target_id in zip(choices, choice_target_ids): + tmp = output[choice_target_id, choice] + log_probs_single.append(tmp.sum().tolist()) + log_probs.append(log_probs_single) + return log_probs + + def generate_text(self, sample, strategy, return_all_beams=False) -> Union[ + List[List[int]], List[List[List[int]]]]: + """ + @return: A list of text model generated, sorted by score in descending order + """ + + seqs = sample["tokens"].to(device=self.device).long() + context_lengths = sample["context_length"].long() + + def get_masks_and_position_ids(seq): + batch_size = seq.shape[0] + max_gen_length = sample['target_position_ids'].shape[-1] + tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode='constant', value=-1) + position_ids = torch.cat((sample['position_ids'], sample['target_position_ids']), dim=-1) + position_ids = position_ids.to(device=self.device).long() + attention_mask = sample["attention_mask"].to(device=self.device) + context_mask = attention_mask[torch.arange(batch_size), context_lengths - 1].unsqueeze(1).repeat(1, + max_gen_length, + 1) + causal_mask = torch.tril(context_mask.new_ones((batch_size, max_gen_length, max_gen_length))) < 0.5 + generation_mask = torch.cat( + (context_mask, causal_mask), dim=-1) + attention_mask = torch.nn.functional.pad(attention_mask, (0, max_gen_length), mode='constant', value=1) + attention_mask = torch.cat((attention_mask, generation_mask), dim=1) + attention_mask = attention_mask.bool().unsqueeze(1) + return tokens, attention_mask, position_ids + + self.model.eval() + with torch.no_grad(): + output = batch_filling_sequence( + self.model, + seqs, + context_lengths, + get_masks_and_position_ids=get_masks_and_position_ids, + strategy=strategy, + )[0] + + if isinstance(output, torch.Tensor): # different strategies + output = output.tolist() + + output_targets = [] + context_length = seqs.shape[1] + for lines in output: + lines = lines.tolist() if isinstance(lines, torch.Tensor) else lines + output_target = [] + if not isinstance(lines, list): + lines = [lines] + for line in lines: + unfinished = line.index(-1) if -1 in line else len(line) + if line[unfinished - 1] in strategy.end_tokens: + unfinished -= 1 + line = line[context_length:unfinished] + output_target.append(line) + if not return_all_beams: + output_targets.append(output_target[0]) + else: + output_targets.append(output_target) + return output_targets + + + def calculate_loss(self, batch) -> List[float]: + tokens, position_ids, attention_mask = self.process_data(batch, self.device) + targets, loss_masks = ( + batch["targets"].to(device=self.device).long(), + batch["loss_masks"].to(device=self.device).long(), + ) + + original_parallel_output = self.model.transformer.parallel_output + self.model.transformer.parallel_output = True + self.model.eval() + + with torch.no_grad(): + logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) + losses = vocab_parallel_cross_entropy(logits.contiguous().float(), targets) + loss = torch.sum(losses * loss_masks, dim=-1) + + 
self.model.transformer.parallel_output = original_parallel_output + + return loss.tolist() diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/tasks.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/tasks.py new file mode 100644 index 0000000000..cdba83bd15 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/tasks.py @@ -0,0 +1,220 @@ +import torch +import time +import numpy as np +import torch.distributed as dist + +from typing import Dict, Callable, Type, Tuple, List, Any +from abc import ABC, abstractmethod +from glob import glob +from os.path import join, relpath +from collections import defaultdict + +from SwissArmyTransformer.tokenization.icetk_glm_130B.ice_tokenizer import _IceTokenizer + +from generation import BaseStrategy, BeamSearchStrategy +from .configs import BaseConfig, GenerationTaskConfig, MultiChoiceTaskConfig, LanguageModelTaskConfig +from .model import ModelForEvaluation +from .dataset import EvaluationDataset, GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset +from .utils import build_data_loader, gather_result, print_rank_0 +from .metrics import DEFAULT_METRICS + + +class BaseTask(ABC): + model: ModelForEvaluation + tokenizer: _IceTokenizer + config: BaseConfig + file_groups: Dict[str, List[str]] + + @classmethod + def config_class(cls) -> Type[BaseConfig]: + return BaseConfig + + @property + def metrics(self) -> Dict[str, Callable]: + return {metric: DEFAULT_METRICS[metric] for metric in self.config.metrics} + + def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: BaseConfig): + self.model = model + self.tokenizer = tokenizer + self.config = config + self.config.metrics = list(self.metrics.keys()) + + self.file_groups = self.get_file_groups() + self.verbose = dist.get_rank() == 0 + + def get_file_groups(self): + pattern_group = {} + if isinstance(self.config.file_pattern, str): + pattern_group["all"] = self.config.file_pattern + else: + pattern_group = self.config.file_pattern + return { + name: [ + relpath(path, start=self.config.path) + for path in sorted(glob(join(self.config.path, pattern), recursive=True)) + ] + for name, pattern in pattern_group.items() + } + + def evaluate(self): + dist.barrier() + start = time.time() + print_rank_0("\n") + print_rank_0(f"{self.config}") + print_rank_0(f"Evaluating task {self.config.name}:") + + result_dict_all = {} + + for group_name, filelist in self.file_groups.items(): + print_rank_0(f" Evaluating group {group_name}:") + + result_dict_group = {} + for file in filelist: + dataset = self.build_dataset(file) + dataloader = build_data_loader( + dataset, + micro_batch_size=self.config.micro_batch_size, + num_workers=1, + drop_last=False, + collate_fn=dataset.collate_fn if dataset.has_collate_fn else None, + ) + + prediction = [] + with torch.no_grad(): + for _, batch in enumerate(dataloader): + prediction.append(self.predict_single_batch(batch)) + + prediction = gather_result(prediction, len(dataset), self.config.micro_batch_size) + result_dict = {key: metric(prediction, dataset.data) for key, metric in self.metrics.items()} + result_dict_group[file] = (result_dict, len(dataset)) + + if self.verbose: + self.report_single_metrics(file, result_dict) + + result_dict_all[group_name] = result_dict_group + + print_rank_0(f"Evaluation results of task {self.config.name}:") + + if self.verbose: + for group_name, result_dict_group in result_dict_all.items(): + self.report_group_metrics(group_name, result_dict_group) + self.report_overall_metrics( + {k: v for result_dict_group in 
result_dict_all.values() for k, v in result_dict_group.items()}, + ) + + print_rank_0(f"Finish task {self.config.name} in {time.time() - start:.1f}s.") + + def report_single_metrics(self, file: str, result_dict: Dict[str, float]): + output_str = f" Finish {file}" + for key, value in result_dict.items(): + output_str += f", {key} = {value:.3f}" + print_rank_0(output_str) + + @staticmethod + def calc_group_metrics(result_dict_group: Dict[str, Tuple[Dict[str, float], int]]): + metrics_dict = defaultdict(lambda: []) + weight = [] + for file, (result_dict, length) in result_dict_group.items(): + for key, value in result_dict.items(): + metrics_dict[key].append(value) + weight.append(length) + return { + name: { + "max": np.max(value), + "median": np.median(value), + "average": np.average(value, weights=weight), + } + for name, value in metrics_dict.items() + } + + def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): + stats_dict = self.calc_group_metrics(result_dict_group) + if len(stats_dict) == 1: + name, stats = next(iter(stats_dict.items())) + print_rank_0( + " " * level + f"Group {group_name} {name}: max = {stats['max']:.3f}, " + f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" + ) + else: + print_rank_0(" " * level + f" Group {group_name}: ") + for name, stats in stats_dict.items(): + print( + " " * (level + 1) + f"Metric {name}: max = {stats['max']:.3f}, " + f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" + ) + + def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): + pass + + @abstractmethod + def predict_single_batch(self, batch) -> List[Any]: + pass + + @abstractmethod + def build_dataset(self, relative_path: str) -> EvaluationDataset: + pass + + +class GenerationTask(BaseTask, ABC): + config: GenerationTaskConfig + + @classmethod + def config_class(cls): + return GenerationTaskConfig + + def build_dataset(self, relative_path): + return GenerationTaskDataset(join(self.config.path, relative_path), self.config) + + def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: GenerationTaskConfig): + super(GenerationTask, self).__init__(model, tokenizer, config) + + end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] + if self.config.sampling_strategy == "BaseStrategy": + self.strategy = BaseStrategy(batch_size=self.config.micro_batch_size, temperature=1.0, top_k=1, + end_tokens=end_tokens) + elif self.config.sampling_strategy == "BeamSearchStrategy": + self.strategy = BeamSearchStrategy( + self.config.micro_batch_size, + self.config.num_beams, + length_penalty=self.config.length_penalty, + consider_end=True, + end_tokens=end_tokens, + no_repeat_ngram_size=self.config.no_repeat_ngram_size, + min_gen_length=self.config.min_gen_length, + deterministic=True, # For evaluation, we need a determined generation strategy + ) + else: + raise ValueError(f"unknown strategy {self.config.sampling_strategy}") + + def predict_single_batch(self, batch) -> List[List[int]]: + output = self.model.generate_text(batch, self.strategy, return_all_beams=False) + return output + + +class MultiChoiceTask(BaseTask, ABC): + config: MultiChoiceTaskConfig + + @classmethod + def config_class(cls): + return MultiChoiceTaskConfig + + def build_dataset(self, relative_path): + return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config) + + def predict_single_batch(self, batch) -> List[int]: + log_probs = 
self.model.cond_log_prob(batch) + return [np.argmax(log_probs_single).item() for log_probs_single in log_probs] + + +class LanguageModelTask(BaseTask, ABC): + config: LanguageModelTaskConfig + + @classmethod + def config_class(cls): + return LanguageModelTaskConfig + + def build_dataset(self, relative_path): + return LanguageModelTaskDataset(join(self.config.path, relative_path), self.config) + + def predict_single_batch(self, batch) -> List[float]: + return self.model.calculate_loss(batch) diff --git a/PyTorch/contrib/nlp/GLM-130B/evaluation/utils.py b/PyTorch/contrib/nlp/GLM-130B/evaluation/utils.py new file mode 100644 index 0000000000..a1be20638b --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/evaluation/utils.py @@ -0,0 +1,67 @@ +import torch +import torch.distributed as dist + +from SwissArmyTransformer import mpu, get_tokenizer + + +def print_rank_0(*args, **kwargs): + if torch.distributed.get_rank() == 0: + print(*args, **kwargs) + + +def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, collate_fn=None): + # Sampler. + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=False + ) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=collate_fn, + ) + + return data_loader + + +def gather_result(prediction, total_length, micro_batch_size): + """ + @param prediction: Local predictions with order defined by distributed sampler + @param total_length: Total sample num + @return: [sample_0, sample_1, ..., sample_{total_length-1}] + """ + torch.cuda.empty_cache() + world_size = mpu.get_data_parallel_world_size() + prediction_gathered = [None for _ in range(world_size)] + dist.all_gather_object(prediction_gathered, prediction, group=mpu.get_data_parallel_group()) + prediction = [] + for i in range(len(prediction_gathered[0])): + for j in range(micro_batch_size): + for k in range(world_size): + if j < len(prediction_gathered[k][i]): + prediction.append(prediction_gathered[k][i][j]) + prediction = prediction[:total_length] + return prediction + + +def get_tokenized_input(item, key): + if key in item: + return item[key] + tokenizer = get_tokenizer() + pretokenized_key = key + "_pretokenized" + assert pretokenized_key in item + if isinstance(item[pretokenized_key], list): + result = [] + for raw in item[pretokenized_key]: + result.append(tokenizer.tokenize(raw)) + return result + else: + return tokenizer.tokenize(item[pretokenized_key]) diff --git a/PyTorch/contrib/nlp/GLM-130B/generate.py b/PyTorch/contrib/nlp/GLM-130B/generate.py new file mode 100644 index 0000000000..a56030ab4e --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/generate.py @@ -0,0 +1,219 @@ +import os +import torch +import stat +import re + +from functools import partial +from typing import List, Tuple + +from SwissArmyTransformer import mpu +from evaluation.model import batch_filling_sequence +from generation import BeamSearchStrategy, BaseStrategy +from SwissArmyTransformer.generation.utils import timed_name, generate_continually +from initialize import initialize, initialize_model_and_tokenizer + + +def add_generation_specific_args(parser): + parser.add_argument("--sampling-strategy", type=str, default="BaseStrategy", help="Type of sampling 
strategy.") + parser.add_argument("--min-gen-length", type=int, default=0, help="The minimum length each blank should generate.") + parser.add_argument( + "--print-all-beams", action="store_true", help="Print all output generated by beam search strategy." + ) + + +def isEnglish(s): + try: + s.encode(encoding="utf-8").decode("ascii") + except UnicodeDecodeError: + return False + else: + return True + + +def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False): + context_length = seq.shape[1] + # [1, max_gen_len] + tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode="constant", value=-1) + # [1, max_gen_len, max_gen_len] + attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device) + attention_mask.tril_() + attention_mask[..., : context_length - 1] = 1 + # [1, 1, max_gen_len, max_gen_len] + attention_mask.unsqueeze_(1) + attention_mask = (attention_mask < 0.5).bool() + + # [1, max_seq_len] + position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device) + if not gmask: + position_ids[context_length - 1 :] = mask_position + + position_ids = position_ids.unsqueeze(0) + + return tokens, attention_mask, position_ids + + +def fill_blanks(raw_text: str, model, tokenizer, strategy) -> Tuple[List[str], List[str], List[List[str]]]: + # add MASK + generation_mask = "[gMASK]" + if "[MASK]" in raw_text: + generation_mask = "[MASK]" + elif "[sMASK]" in raw_text: + generation_mask = "[sMASK]" + use_gmask = "[MASK]" not in raw_text and "[sMASK]" not in raw_text + + mask_pattern = r"\[[sg]?MASK\]" + text_list = re.split(mask_pattern, raw_text) + pattern_list = re.compile(mask_pattern).findall(raw_text) + seq = [] + for i in range(len(pattern_list)): + pattern = pattern_list[i] + sub_text = text_list[i] + seq.extend(tokenizer.tokenize(sub_text)) + seq.append(tokenizer.get_command(pattern)) + + seq.extend(tokenizer.tokenize(text_list[-1])) + + if "MASK]" not in raw_text: + seq += [tokenizer.get_command(generation_mask)] + raw_text += " " + generation_mask + if not raw_text.endswith("MASK]"): + seq = seq + [tokenizer.get_command("eos")] + if mpu.get_model_parallel_rank() == 0: + print("\nInput: {}\n".format(raw_text)) + if len(seq) > args.max_sequence_length: + raise ValueError("text too long.") + + # generation + is_english = isEnglish(raw_text) + output_list = [seq] + num_output = args.num_beams if args.sampling_strategy == "BeamSearchStrategy" else 1 + last_pos, answers, answers_with_style, blanks = ( + [0] * num_output, + ["" for _ in range(num_output)], + ["" for _ in range(num_output)], + [[] for _ in range(num_output)], + ) + + # continually detect the first mark position + while True: + seq = output_list[0] + # detect mask position + mask_token = tokenizer.get_command(generation_mask) + if mask_token not in seq: + break + mask_position = seq.index(mask_token) + + output_list = [] + + input_seq = torch.cuda.LongTensor( + [seq + [tokenizer.get_command("sop")]], + device=args.device, + ) + output, _ = batch_filling_sequence( + model, + input_seq, + torch.cuda.LongTensor([input_seq.shape[-1]], device=args.device), + strategy=strategy, + get_masks_and_position_ids=partial( + get_masks_and_position_ids, + mask_position=mask_position, + max_gen_length=args.out_seq_length - input_seq.shape[-1], + gmask=use_gmask, + ), + ) + if isinstance(output, torch.Tensor): # different strategies + output = output.tolist() + output = output[0] # batch_size = 1 + output_list.extend(output) + + # clip -1s and fill back generated things 
into seq + for i in range(len(output_list)): + output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i] + try: + unfinished = output.index(-1) + except ValueError: + unfinished = len(output) + if output[unfinished - 1] in strategy.end_tokens: + unfinished -= 1 + bog = output.index(tokenizer.get_command("sop")) + + prefix = tokenizer.detokenize(output[last_pos[i] : mask_position]) + blank = tokenizer.detokenize(output[bog + 1 : unfinished]) + answers_with_style[i] += ( + prefix + + (" " if is_english else "") + + ("\033[4m" if use_gmask else "\x1b[0;32m\033[4m") + + blank + + ("\033[0m" if use_gmask else "\033[0m\x1b[0m") + + (" " if is_english else "") + ) + blanks[i].append(blank) + last_pos[i] = mask_position + unfinished - (bog + 1) + output_list[i] = output[:mask_position] + output[bog + 1 : unfinished] + output[mask_position + 1 : bog] + + for i, output in enumerate(output_list): + if output[-1] == tokenizer.get_command("eos"): + output = output[:-1] + answers_with_style[i] += tokenizer.detokenize(output[last_pos[i] :]) + answers[i] = tokenizer.detokenize(output) + + return answers, answers_with_style, blanks + + +def main(args): + model, tokenizer = initialize_model_and_tokenizer(args) + + end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] + + if args.sampling_strategy == "BaseStrategy": + strategy = BaseStrategy( + batch_size=1, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, end_tokens=end_tokens + ) + elif args.sampling_strategy == "BeamSearchStrategy": + strategy = BeamSearchStrategy( + 1, + args.num_beams, + length_penalty=args.length_penalty, + consider_end=True, + end_tokens=end_tokens, + no_repeat_ngram_size=args.no_repeat_ngram_size, + min_gen_length=args.min_gen_length, + ) + else: + raise ValueError(f"unknown strategy {args.sampling_strategy}") + + def process(raw_text): + if args.with_id: + query_id, raw_text = raw_text.split("\t") + + answers, answers_with_style, blanks = fill_blanks(raw_text, model, tokenizer, strategy) + + # save + if args.with_id: + full_path = os.path.join(args.output_path, query_id + ".txt") + else: + prefix = raw_text.replace("/", "")[:20] + full_path = timed_name(prefix, ".txt", args.output_path) + if mpu.get_model_parallel_rank() == 0: + if args.print_all_beams and len(answers) > 1: + for idx, answer_with_style in enumerate(answers_with_style): + print(f"Output beam {idx}:", answer_with_style) # print the first. + if len(answer_with_style) > 120: + print("") + else: + print(f"Output:", answers_with_style[0]) # print the first. 
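+            # `answers_with_style` is only for terminal display (it embeds ANSI colour codes);
+            # the plain `answers` are what get written to the output file below.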
+ with open(full_path, "w", encoding="utf-8") as fout: + for answer in answers: + fout.write(answer + "\n") + + os.chmod(full_path, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU) + + os.makedirs(args.output_path, exist_ok=True) + generate_continually(process, args.input_source) + + +if __name__ == "__main__": + args = initialize(extra_args_provider=add_generation_specific_args) + + with torch.no_grad(): + main(args) diff --git a/PyTorch/contrib/nlp/GLM-130B/generation/__init__.py b/PyTorch/contrib/nlp/GLM-130B/generation/__init__.py new file mode 100644 index 0000000000..540346c645 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/generation/__init__.py @@ -0,0 +1 @@ +from .strategies import BaseStrategy, BeamSearchStrategy diff --git a/PyTorch/contrib/nlp/GLM-130B/generation/strategies.py b/PyTorch/contrib/nlp/GLM-130B/generation/strategies.py new file mode 100644 index 0000000000..92525bdb87 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/generation/strategies.py @@ -0,0 +1,193 @@ +import numpy as np +import torch +import torch.nn.functional as F +from SwissArmyTransformer.generation.sampling_strategies.base_strategy import top_k_logits + +class BaseStrategy: + def __init__(self, batch_size, invalid_slices=[], temperature=1., top_k=200, eps=1e-4, top_p=0.0, end_tokens=None): + self.batch_size = batch_size + self.invalid_slices = invalid_slices + self.temperature = temperature + self.topk = top_k + self.top_p = top_p + self.eps = eps + if end_tokens is None: + end_tokens = [] + self.end_tokens = end_tokens + self._is_done = np.zeros(self.batch_size, dtype=np.bool) + + @property + def is_done(self) -> bool: + return self._is_done.all() + + def forward(self, logits, tokens, mems, temperature=None): + logits = logits.view(-1, logits.size(-1)) + batch_size = tokens.shape[0] + if temperature is None: + temperature = self.temperature + logits = logits / temperature + for invalid_slice in self.invalid_slices: + logits[..., invalid_slice] = -65504 + + logits = top_k_logits(logits, self.topk, self.top_p) + probs = F.softmax(logits.float(), dim=-1) # float is essetial, due to a bug in Pytorch + pred = torch.multinomial(probs, num_samples=1) + for i in range(self.batch_size): + if i >= batch_size: + self._is_done[i] = True + elif self._is_done[i]: + pred[i] = -1 + elif pred[i].item() in self.end_tokens: + self._is_done[i] = True + tokens = torch.cat((tokens, pred.view(tokens.shape[:-1] + (1,))), dim=-1) + return tokens, mems + + def finalize(self, tokens, mems): + self._is_done = np.zeros(self.batch_size, dtype=np.bool) + return tokens, mems + + +class BeamSearchStrategy: + def __init__( + self, + batch_size, + num_beams, + length_penalty=1.0, + consider_end=False, + end_tokens=[], + invalid_slices=[], + no_repeat_ngram_size=0, + min_gen_length=0, + deterministic=False, + ): + self.batch_size = batch_size + self.num_beams = num_beams + self.length_penalty = length_penalty + self.end_tokens = end_tokens + self.ngram = no_repeat_ngram_size + self.min_gen_length = min_gen_length + self.invalid_slices = invalid_slices + self.consider_end = consider_end + self.deterministic = deterministic + self._init_cache() + + def _init_cache(self): + self.end_beams = [[] for _ in range(self.batch_size)] # list of LongTensors + self.end_beams_penalized_scores = [[] for _ in range(self.batch_size)] # list of LongTensors + self.cached_beam_scores = 0 # [batch_size] + self.cached_beam_ngram_bans = [[{} for _ in range(self.num_beams)] for _ in range(self.batch_size)] + self.length_generated = 0 + self._is_done = 
np.zeros(self.batch_size, dtype=np.bool) + + def _add_end_beams(self, score, beam, batch_idx): + score = score / ((5.0 + len(beam)) / 6) ** self.length_penalty # Magic number for OpenNMT + for i in range(len(self.end_beams[batch_idx]), -1, -1): + if i == 0 or score < self.end_beams_penalized_scores[batch_idx][i - 1]: + break + self.end_beams[batch_idx].insert(i, beam) + self.end_beams_penalized_scores[batch_idx].insert(i, score) + + self.end_beams[batch_idx] = self.end_beams[batch_idx][: self.num_beams] + self.end_beams_penalized_scores[batch_idx] = self.end_beams_penalized_scores[batch_idx][: self.num_beams] + + @property + def is_done(self) -> bool: + return self._is_done.all() + + def forward(self, logits, tokens, mems): + batch_size, num_beams, vocab_size = logits.shape + seq_len = tokens.shape[-1] + logits = logits.float() + for invalid_slice in self.invalid_slices: + logits[..., invalid_slice] = -65504 + if self.min_gen_length > self.length_generated: + for end_token in self.end_tokens: + logits[..., end_token] = -65504 + if self.ngram > 0 and seq_len > self.ngram: + for batch_idx in range(batch_size): + for i in range(num_beams): + ngram_prefix = tokens[batch_idx, i, -(self.ngram - 1) :].tolist() # TODO ngram=1 + for banned_index in self.cached_beam_ngram_bans[batch_idx][i].get(tuple(ngram_prefix), []): + logits[batch_idx, i, banned_index] = -65504 + + next_token_scores = F.log_softmax(logits, dim=-1) # [batch_size, vocab_size] + prev_scores = self.cached_beam_scores + if isinstance(prev_scores, torch.Tensor): + prev_scores = prev_scores[..., None].expand_as(next_token_scores) + next_token_scores = next_token_scores + prev_scores + + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + probs = F.softmax(next_token_scores, dim=-1) + if num_beams < self.num_beams: # First token + probs = probs[..., :vocab_size] + if self.deterministic: + next_tokens = torch.topk(probs, k=(max(1, len(self.end_tokens)) + 1) * self.num_beams).indices # [2*nb] + else: + next_tokens = torch.multinomial( + probs, num_samples=(max(1, len(self.end_tokens)) + 1) * self.num_beams + ) # [2*nb] + next_token_scores = next_token_scores[torch.arange(batch_size).unsqueeze(1), next_tokens] + next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) + next_tokens = next_tokens[torch.arange(batch_size).unsqueeze(1), _indices] + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="trunc") + next_tokens = next_tokens % vocab_size + + # select out end beams or continue beams + beam_continue_batch, score_continue_batch, mems_continue_batch = [], [], [] + for batch_idx in range(batch_size): + beam_continue = [] + scores_continue = [] + bans_continue = [] + mems_contiue = [] + for i in range(len(next_tokens[batch_idx])): + beam = torch.cat((tokens[batch_idx, next_indices[batch_idx, i]], next_tokens[batch_idx, i : i + 1])) + if not self._is_done[batch_idx] and int(next_tokens[batch_idx, i]) in self.end_tokens: + self._add_end_beams(next_token_scores[batch_idx, i], beam, batch_idx) + elif len(beam_continue) < self.num_beams: + beam_continue.append(beam) + mems_contiue.append(mems[:, batch_idx, next_indices[batch_idx, i]]) + # update caches + scores_continue.append(next_token_scores[batch_idx, i]) + if self.ngram > 0: + bans = self.cached_beam_ngram_bans[batch_idx][next_indices[batch_idx, i]].copy() + # TODO ngram=1 + ngram_prefix = tuple(tokens[batch_idx, next_indices[batch_idx, i], -(self.ngram - 1):].tolist()) + bans[ngram_prefix] = bans.get(ngram_prefix, 
tuple()) + (next_tokens[batch_idx, i],) + bans_continue.append(bans) + else: + break + beam_continue_batch.append(torch.stack(beam_continue)) + mems_continue_batch.append(torch.stack(mems_contiue, dim=1)) + score_continue_batch.append(scores_continue) + self.cached_beam_ngram_bans[batch_idx] = bans_continue + tokens = torch.stack(beam_continue_batch) + mems = torch.stack(mems_continue_batch, dim=1) + self.cached_beam_scores = torch.tensor(score_continue_batch, device=logits.device) + self.length_generated += 1 + for batch_idx in range(self.batch_size): + if batch_idx >= batch_size: + self._is_done[batch_idx] = True + elif ( + len(self.end_beams[batch_idx]) == self.num_beams + and self.end_beams_penalized_scores[batch_idx][-1] + >= self.cached_beam_scores[batch_idx].max() / ((5.0 + (seq_len + 1)) / 6) ** self.length_penalty + ): # We're done if none of current tokens will better than the worst in end_beams + self._is_done[batch_idx] = True + + return tokens, mems + + def finalize(self, tokens, mems): + if self.consider_end: + batch_size, num_beams = tokens.shape[:2] + for batch_idx in range(batch_size): + if not self._is_done[batch_idx]: + for i in range(num_beams): + self._add_end_beams(self.cached_beam_scores[batch_idx, i], tokens[batch_idx, i], batch_idx) + mems = None + ret = self.end_beams[:batch_size] + else: + ret = tokens + self._init_cache() + return ret, mems diff --git a/PyTorch/contrib/nlp/GLM-130B/initialize.py b/PyTorch/contrib/nlp/GLM-130B/initialize.py new file mode 100644 index 0000000000..6adf4fc6a0 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/initialize.py @@ -0,0 +1,116 @@ +import argparse +import torch +import time + +from quantization import quantize + +from SwissArmyTransformer import get_args, get_tokenizer +from SwissArmyTransformer.arguments import initialize_distributed +from SwissArmyTransformer.training import load_checkpoint +from SwissArmyTransformer.model import GLM130B +from SwissArmyTransformer.mpu import get_model_parallel_world_size, get_model_parallel_rank, get_model_parallel_group + + +def add_bminf_args(parser): + """Arguments for BMInf""" + group = parser.add_argument_group("BMInf") + + group.add_argument("--bminf", action="store_true", help="Use BMInf to support low resource evaluation") + group.add_argument("--bminf-memory-limit", type=int, default=20, help="Max memory for model per GPU (in GB)") + return parser + + +def add_quantization_args(parser): + group = parser.add_argument_group("Quantization") + + group.add_argument("--quantization-bit-width", type=int, default=None) + group.add_argument("--from-quantized-checkpoint", action="store_true", help="Loading from a quantized checkpoint") + + +def add_initialization_args(parser): + group = parser.add_argument_group("Initialization") + + group.add_argument( + "--sequential-initialization", + action="store_true", + help="Initialize sequentially in tensor parallel group (reduce CPU RAM for initialization)", + ) + + +def initialize(extra_args_provider): + parser = argparse.ArgumentParser(add_help=False) + add_bminf_args(parser) + add_quantization_args(parser) + add_initialization_args(parser) + GLM130B.add_model_specific_args(parser) + extra_args_provider(parser) + known, args_list = parser.parse_known_args() + args = get_args(args_list) + args = argparse.Namespace(**vars(args), **vars(known)) + args.do_train = False + initialize_distributed(args) + return args + + +def initialize_model_and_tokenizer(args): + tokenizer = get_tokenizer(args) + + torch.distributed.barrier() + start = time.time() + + 
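+    # Ranks in a tensor-parallel group take turns below: with --sequential-initialization,
+    # the barrier at the end of the loop body makes each rank build and load its weight
+    # shard only after the previous rank has finished, which caps peak host RAM.
+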
for i in range(get_model_parallel_world_size()): + if get_model_parallel_rank() == i: + # Initialize model + model = GLM130B(args).half() + + if args.from_quantized_checkpoint: + assert args.quantization_bit_width is not None + # Quantize model before moving to GPU + model = quantize(model, args.quantization_bit_width) + + # Load checkpoint + load_checkpoint(model, args) + + if args.quantization_bit_width is not None and not args.from_quantized_checkpoint: + # Quantize model before moving to GPU + model = quantize(model, args.quantization_bit_width) + + if args.bminf: + import bminf + + if torch.distributed.get_rank() == 0: + print(f"> BMInf activated, memory limit: {args.bminf_memory_limit} GB") + with torch.cuda.device(args.device): + model = bminf.wrapper(model, quantization=False, memory_limit=args.bminf_memory_limit << 30) + else: + model = model.to(args.device) + if args.sequential_initialization: + torch.distributed.barrier(group=get_model_parallel_group()) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(f"> Model initialized in {time.time() - start:.1f}s") + + torch.cuda.empty_cache() + model.eval() + + # generate rotary embedding cache + original_parallel_output = model.transformer.parallel_output + model.transformer.parallel_output = True + with torch.no_grad(): + _, *_ = model( + torch.ones(1, args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64), + torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), + torch.randn( + 1, + 1, + args.max_sequence_length, + args.max_sequence_length, + device=torch.cuda.current_device(), + ) + < 0.5, + ) + model.transformer.parallel_output = original_parallel_output + torch.distributed.barrier() + + return model, tokenizer diff --git a/PyTorch/contrib/nlp/GLM-130B/kernels/__init__.py b/PyTorch/contrib/nlp/GLM-130B/kernels/__init__.py new file mode 100644 index 0000000000..6037536fa5 --- /dev/null +++ b/PyTorch/contrib/nlp/GLM-130B/kernels/__init__.py @@ -0,0 +1,99 @@ +import pkg_resources +import torch +import ctypes + +from typing import List +from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + +RESOURCE_PACKAGE_NAME = __name__ + + +class Kernel: + def __init__(self, filename: str, function_names: List[str]): + filename = filename + ".fatbin" + if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename): + raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME)) + self.filename = filename + self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename) + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + +kernels = Kernel( + "quantization", + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], +) + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + 
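+        # gridDim launches one thread block per weight row; the kernel packs two adjacent
+        # 4-bit values into each int8 of `out`, which is why `m` was halved above.
+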
        return out
+
+
+def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
+    if source_bit_width == 8:
+        func = kernels.int8WeightExtractionHalf
+    elif source_bit_width == 4:
+        func = kernels.int4WeightExtractionHalf
+    else:
+        assert False, "Unsupported bit-width"
+
+    with torch.cuda.device(weight.device):
+        n, m = weight.size(0), weight.size(1)
+        out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda")
+        stream = torch.cuda.current_stream()
+
+        gridDim = (n, 1, 1)
+        blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+        func(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [
+                ctypes.c_void_p(weight.data_ptr()),
+                ctypes.c_void_p(scale_list.data_ptr()),
+                ctypes.c_void_p(out.data_ptr()),
+                ctypes.c_int32(n),
+                ctypes.c_int32(m),
+            ],
+        )
+        return out
+
+
+if __name__ == "__main__":
+    weight = torch.randn(4, 32).to(torch.int8).cuda()
+    scale = torch.ones(weight.size(0)).to(torch.half).cuda()
+
+    print(weight)
+    b = compress_int4_weight(weight)
+    print(b)
+
+    a = extract_weight_to_half(b, scale, source_bit_width=4)
+    print(a)
diff --git a/PyTorch/contrib/nlp/GLM-130B/kernels/quantization.fatbin b/PyTorch/contrib/nlp/GLM-130B/kernels/quantization.fatbin
new file mode 100644
index 0000000000000000000000000000000000000000..03f4431928aa1814e4ffcd91d61ee72b88157b68
GIT binary patch
literal 70696
[binary payload omitted]